| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.9986530690783146, |
| "eval_steps": 500, |
| "global_step": 10392, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0019241870309794113, |
| "grad_norm": 1.963148361362126, |
| "learning_rate": 2.403846153846154e-07, |
| "loss": 0.5398, |
| "mean_token_accuracy": 0.8655876636505127, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.0038483740619588225, |
| "grad_norm": 1.9157009818195054, |
| "learning_rate": 4.807692307692308e-07, |
| "loss": 0.5307, |
| "mean_token_accuracy": 0.8676515460014343, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.005772561092938233, |
| "grad_norm": 1.5111307334029855, |
| "learning_rate": 7.211538461538462e-07, |
| "loss": 0.5196, |
| "mean_token_accuracy": 0.8695880591869354, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.007696748123917645, |
| "grad_norm": 0.853529638874443, |
| "learning_rate": 9.615384615384617e-07, |
| "loss": 0.5095, |
| "mean_token_accuracy": 0.8688256680965424, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.009620935154897056, |
| "grad_norm": 0.7077984713233373, |
| "learning_rate": 1.201923076923077e-06, |
| "loss": 0.4858, |
| "mean_token_accuracy": 0.8708780348300934, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.011545122185876467, |
| "grad_norm": 0.7200684137615945, |
| "learning_rate": 1.4423076923076924e-06, |
| "loss": 0.4602, |
| "mean_token_accuracy": 0.8745088517665863, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.01346930921685588, |
| "grad_norm": 0.4688698486503959, |
| "learning_rate": 1.6826923076923077e-06, |
| "loss": 0.4473, |
| "mean_token_accuracy": 0.8767399728298187, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.01539349624783529, |
| "grad_norm": 0.45791546768368196, |
| "learning_rate": 1.9230769230769234e-06, |
| "loss": 0.4325, |
| "mean_token_accuracy": 0.8795706152915954, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.0173176832788147, |
| "grad_norm": 0.2883581646319124, |
| "learning_rate": 2.1634615384615387e-06, |
| "loss": 0.4035, |
| "mean_token_accuracy": 0.8868738651275635, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.01924187030979411, |
| "grad_norm": 0.2975988257840724, |
| "learning_rate": 2.403846153846154e-06, |
| "loss": 0.4, |
| "mean_token_accuracy": 0.8870362162590026, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.021166057340773523, |
| "grad_norm": 0.21745403228236987, |
| "learning_rate": 2.644230769230769e-06, |
| "loss": 0.3981, |
| "mean_token_accuracy": 0.8870003700256348, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.023090244371752933, |
| "grad_norm": 0.23081938415720538, |
| "learning_rate": 2.884615384615385e-06, |
| "loss": 0.3841, |
| "mean_token_accuracy": 0.8902194917201995, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.025014431402732344, |
| "grad_norm": 0.1954993628169396, |
| "learning_rate": 3.125e-06, |
| "loss": 0.3801, |
| "mean_token_accuracy": 0.8912450432777405, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.02693861843371176, |
| "grad_norm": 0.2003159839657926, |
| "learning_rate": 3.3653846153846154e-06, |
| "loss": 0.3768, |
| "mean_token_accuracy": 0.8914297997951508, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.02886280546469117, |
| "grad_norm": 0.19367313769698136, |
| "learning_rate": 3.6057692307692307e-06, |
| "loss": 0.3718, |
| "mean_token_accuracy": 0.892903745174408, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.03078699249567058, |
| "grad_norm": 0.17732974119710987, |
| "learning_rate": 3.846153846153847e-06, |
| "loss": 0.3634, |
| "mean_token_accuracy": 0.8940442979335785, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.03271117952664999, |
| "grad_norm": 0.18202600444863165, |
| "learning_rate": 4.086538461538462e-06, |
| "loss": 0.3618, |
| "mean_token_accuracy": 0.8950167894363403, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.0346353665576294, |
| "grad_norm": 0.1814077955452155, |
| "learning_rate": 4.326923076923077e-06, |
| "loss": 0.3608, |
| "mean_token_accuracy": 0.894530737400055, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.036559553588608816, |
| "grad_norm": 0.17982732704049031, |
| "learning_rate": 4.567307692307692e-06, |
| "loss": 0.3546, |
| "mean_token_accuracy": 0.8964052140712738, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.03848374061958822, |
| "grad_norm": 0.18760114918066836, |
| "learning_rate": 4.807692307692308e-06, |
| "loss": 0.3527, |
| "mean_token_accuracy": 0.896798574924469, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.04040792765056764, |
| "grad_norm": 0.18792678975884775, |
| "learning_rate": 5.0480769230769235e-06, |
| "loss": 0.3552, |
| "mean_token_accuracy": 0.8957571089267731, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.042332114681547045, |
| "grad_norm": 0.18458960773369418, |
| "learning_rate": 5.288461538461538e-06, |
| "loss": 0.35, |
| "mean_token_accuracy": 0.8970022380352021, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.04425630171252646, |
| "grad_norm": 0.20512394864726513, |
| "learning_rate": 5.528846153846154e-06, |
| "loss": 0.3494, |
| "mean_token_accuracy": 0.8968957602977753, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.04618048874350587, |
| "grad_norm": 0.19060088598183494, |
| "learning_rate": 5.76923076923077e-06, |
| "loss": 0.3435, |
| "mean_token_accuracy": 0.8983101010322571, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.04810467577448528, |
| "grad_norm": 0.1837400972185552, |
| "learning_rate": 6.0096153846153855e-06, |
| "loss": 0.3423, |
| "mean_token_accuracy": 0.8987049281597137, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.05002886280546469, |
| "grad_norm": 0.19128226203549084, |
| "learning_rate": 6.25e-06, |
| "loss": 0.34, |
| "mean_token_accuracy": 0.8992797672748566, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.0519530498364441, |
| "grad_norm": 0.2012016913255968, |
| "learning_rate": 6.490384615384616e-06, |
| "loss": 0.3415, |
| "mean_token_accuracy": 0.8987887859344482, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.05387723686742352, |
| "grad_norm": 0.20162833105748967, |
| "learning_rate": 6.730769230769231e-06, |
| "loss": 0.339, |
| "mean_token_accuracy": 0.8988729476928711, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.055801423898402924, |
| "grad_norm": 0.1975004512012202, |
| "learning_rate": 6.9711538461538465e-06, |
| "loss": 0.3363, |
| "mean_token_accuracy": 0.8998090863227844, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.05772561092938234, |
| "grad_norm": 0.19122157725014663, |
| "learning_rate": 7.211538461538461e-06, |
| "loss": 0.335, |
| "mean_token_accuracy": 0.8997473120689392, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.059649797960361746, |
| "grad_norm": 0.19833364273809126, |
| "learning_rate": 7.451923076923077e-06, |
| "loss": 0.3358, |
| "mean_token_accuracy": 0.8996488392353058, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.06157398499134116, |
| "grad_norm": 0.19478486853041554, |
| "learning_rate": 7.692307692307694e-06, |
| "loss": 0.338, |
| "mean_token_accuracy": 0.8991158306598663, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.06349817202232057, |
| "grad_norm": 0.2237604537377637, |
| "learning_rate": 7.932692307692308e-06, |
| "loss": 0.3324, |
| "mean_token_accuracy": 0.9004191577434539, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.06542235905329997, |
| "grad_norm": 0.22412298555295504, |
| "learning_rate": 8.173076923076923e-06, |
| "loss": 0.3316, |
| "mean_token_accuracy": 0.9007892608642578, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.0673465460842794, |
| "grad_norm": 0.22506299457001125, |
| "learning_rate": 8.41346153846154e-06, |
| "loss": 0.3331, |
| "mean_token_accuracy": 0.9003400206565857, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.0692707331152588, |
| "grad_norm": 0.20552697674020787, |
| "learning_rate": 8.653846153846155e-06, |
| "loss": 0.3277, |
| "mean_token_accuracy": 0.9019653260707855, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.07119492014623821, |
| "grad_norm": 0.20325569047208658, |
| "learning_rate": 8.89423076923077e-06, |
| "loss": 0.3285, |
| "mean_token_accuracy": 0.9014288187026978, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.07311910717721763, |
| "grad_norm": 0.20125377934796918, |
| "learning_rate": 9.134615384615384e-06, |
| "loss": 0.3308, |
| "mean_token_accuracy": 0.9006717920303344, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.07504329420819704, |
| "grad_norm": 0.2030873071758768, |
| "learning_rate": 9.375000000000001e-06, |
| "loss": 0.3216, |
| "mean_token_accuracy": 0.9034212172031403, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.07696748123917645, |
| "grad_norm": 0.20006678924701887, |
| "learning_rate": 9.615384615384616e-06, |
| "loss": 0.323, |
| "mean_token_accuracy": 0.9028126895427704, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.07889166827015585, |
| "grad_norm": 0.21243281284856008, |
| "learning_rate": 9.85576923076923e-06, |
| "loss": 0.3248, |
| "mean_token_accuracy": 0.9024546027183533, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.08081585530113528, |
| "grad_norm": 0.21408796709828543, |
| "learning_rate": 1.0096153846153847e-05, |
| "loss": 0.3207, |
| "mean_token_accuracy": 0.903283417224884, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.08274004233211468, |
| "grad_norm": 0.2172138301632152, |
| "learning_rate": 1.0336538461538462e-05, |
| "loss": 0.3237, |
| "mean_token_accuracy": 0.9019180119037629, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.08466422936309409, |
| "grad_norm": 0.20230661485907633, |
| "learning_rate": 1.0576923076923077e-05, |
| "loss": 0.3201, |
| "mean_token_accuracy": 0.9030288457870483, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.0865884163940735, |
| "grad_norm": 0.20311664902409643, |
| "learning_rate": 1.0817307692307693e-05, |
| "loss": 0.3182, |
| "mean_token_accuracy": 0.9034170091152192, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.08851260342505292, |
| "grad_norm": 0.21362908226608346, |
| "learning_rate": 1.1057692307692308e-05, |
| "loss": 0.3187, |
| "mean_token_accuracy": 0.9034193456172943, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.09043679045603233, |
| "grad_norm": 0.2169502521795548, |
| "learning_rate": 1.1298076923076923e-05, |
| "loss": 0.3154, |
| "mean_token_accuracy": 0.9046095728874206, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.09236097748701173, |
| "grad_norm": 0.20379925548365357, |
| "learning_rate": 1.153846153846154e-05, |
| "loss": 0.3164, |
| "mean_token_accuracy": 0.9037660837173462, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.09428516451799115, |
| "grad_norm": 0.21148715632559753, |
| "learning_rate": 1.1778846153846154e-05, |
| "loss": 0.3131, |
| "mean_token_accuracy": 0.9047864377498627, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.09620935154897056, |
| "grad_norm": 0.21702583427227537, |
| "learning_rate": 1.2019230769230771e-05, |
| "loss": 0.312, |
| "mean_token_accuracy": 0.9051642954349518, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.09813353857994997, |
| "grad_norm": 0.22069801101779785, |
| "learning_rate": 1.2259615384615384e-05, |
| "loss": 0.3134, |
| "mean_token_accuracy": 0.9046817421913147, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.10005772561092938, |
| "grad_norm": 0.22529773024079458, |
| "learning_rate": 1.25e-05, |
| "loss": 0.3102, |
| "mean_token_accuracy": 0.9054987668991089, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.1019819126419088, |
| "grad_norm": 0.23939421584415665, |
| "learning_rate": 1.2740384615384615e-05, |
| "loss": 0.3129, |
| "mean_token_accuracy": 0.9048707842826843, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.1039060996728882, |
| "grad_norm": 0.2163800175850561, |
| "learning_rate": 1.2980769230769232e-05, |
| "loss": 0.309, |
| "mean_token_accuracy": 0.905825936794281, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.10583028670386761, |
| "grad_norm": 0.20997253552830777, |
| "learning_rate": 1.3221153846153847e-05, |
| "loss": 0.306, |
| "mean_token_accuracy": 0.9066311120986938, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.10775447373484703, |
| "grad_norm": 0.21975272965592113, |
| "learning_rate": 1.3461538461538462e-05, |
| "loss": 0.3059, |
| "mean_token_accuracy": 0.9064570367336273, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.10967866076582644, |
| "grad_norm": 0.2268669947079344, |
| "learning_rate": 1.3701923076923078e-05, |
| "loss": 0.3027, |
| "mean_token_accuracy": 0.9075237393379212, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.11160284779680585, |
| "grad_norm": 0.2253035741790312, |
| "learning_rate": 1.3942307692307693e-05, |
| "loss": 0.3041, |
| "mean_token_accuracy": 0.9066643595695496, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.11352703482778526, |
| "grad_norm": 0.22634929034520535, |
| "learning_rate": 1.4182692307692308e-05, |
| "loss": 0.3019, |
| "mean_token_accuracy": 0.9074173867702484, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.11545122185876468, |
| "grad_norm": 0.22715884246265147, |
| "learning_rate": 1.4423076923076923e-05, |
| "loss": 0.301, |
| "mean_token_accuracy": 0.9073862195014953, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.11737540888974408, |
| "grad_norm": 0.26148249937262613, |
| "learning_rate": 1.466346153846154e-05, |
| "loss": 0.2997, |
| "mean_token_accuracy": 0.9080133736133575, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.11929959592072349, |
| "grad_norm": 0.22681858980447167, |
| "learning_rate": 1.4903846153846154e-05, |
| "loss": 0.2975, |
| "mean_token_accuracy": 0.908364349603653, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.1212237829517029, |
| "grad_norm": 0.22867695868438945, |
| "learning_rate": 1.5144230769230769e-05, |
| "loss": 0.2952, |
| "mean_token_accuracy": 0.90918750166893, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.12314796998268232, |
| "grad_norm": 0.2332682731441986, |
| "learning_rate": 1.5384615384615387e-05, |
| "loss": 0.2955, |
| "mean_token_accuracy": 0.9092335760593414, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.12507215701366173, |
| "grad_norm": 0.2459196384955729, |
| "learning_rate": 1.5625e-05, |
| "loss": 0.2939, |
| "mean_token_accuracy": 0.9092588603496552, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.12699634404464114, |
| "grad_norm": 0.22828931788344325, |
| "learning_rate": 1.5865384615384617e-05, |
| "loss": 0.2873, |
| "mean_token_accuracy": 0.911347258090973, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.12892053107562054, |
| "grad_norm": 0.21946639421360295, |
| "learning_rate": 1.6105769230769233e-05, |
| "loss": 0.2866, |
| "mean_token_accuracy": 0.9112029016017914, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.13084471810659995, |
| "grad_norm": 0.22530996089046024, |
| "learning_rate": 1.6346153846153847e-05, |
| "loss": 0.2875, |
| "mean_token_accuracy": 0.9108709812164306, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.13276890513757938, |
| "grad_norm": 0.2551096047012644, |
| "learning_rate": 1.6586538461538463e-05, |
| "loss": 0.2839, |
| "mean_token_accuracy": 0.9118252098560333, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.1346930921685588, |
| "grad_norm": 0.23107070145783154, |
| "learning_rate": 1.682692307692308e-05, |
| "loss": 0.2862, |
| "mean_token_accuracy": 0.911124873161316, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.1366172791995382, |
| "grad_norm": 0.24198231450428304, |
| "learning_rate": 1.7067307692307693e-05, |
| "loss": 0.2795, |
| "mean_token_accuracy": 0.9127156972885132, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.1385414662305176, |
| "grad_norm": 0.23288959582422472, |
| "learning_rate": 1.730769230769231e-05, |
| "loss": 0.2795, |
| "mean_token_accuracy": 0.9133687376976013, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.14046565326149701, |
| "grad_norm": 0.25473076353716345, |
| "learning_rate": 1.7548076923076922e-05, |
| "loss": 0.2833, |
| "mean_token_accuracy": 0.9122391879558563, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.14238984029247642, |
| "grad_norm": 0.2401352594878419, |
| "learning_rate": 1.778846153846154e-05, |
| "loss": 0.2772, |
| "mean_token_accuracy": 0.9137230932712554, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.14431402732345583, |
| "grad_norm": 0.2277816706017706, |
| "learning_rate": 1.8028846153846156e-05, |
| "loss": 0.2748, |
| "mean_token_accuracy": 0.9141271114349365, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.14623821435443526, |
| "grad_norm": 0.30055025836902316, |
| "learning_rate": 1.826923076923077e-05, |
| "loss": 0.2757, |
| "mean_token_accuracy": 0.9140791058540344, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.14816240138541467, |
| "grad_norm": 0.2978323202801783, |
| "learning_rate": 1.8509615384615385e-05, |
| "loss": 0.2712, |
| "mean_token_accuracy": 0.9153619170188904, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.15008658841639408, |
| "grad_norm": 0.26309054507870117, |
| "learning_rate": 1.8750000000000002e-05, |
| "loss": 0.2733, |
| "mean_token_accuracy": 0.9149505078792572, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.1520107754473735, |
| "grad_norm": 0.23686532210659178, |
| "learning_rate": 1.8990384615384615e-05, |
| "loss": 0.273, |
| "mean_token_accuracy": 0.9151660382747651, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.1539349624783529, |
| "grad_norm": 0.2512431182310415, |
| "learning_rate": 1.923076923076923e-05, |
| "loss": 0.267, |
| "mean_token_accuracy": 0.9165635347366333, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.1558591495093323, |
| "grad_norm": 0.23304477086405798, |
| "learning_rate": 1.9471153846153848e-05, |
| "loss": 0.2658, |
| "mean_token_accuracy": 0.9168694078922272, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.1577833365403117, |
| "grad_norm": 0.2676902091848038, |
| "learning_rate": 1.971153846153846e-05, |
| "loss": 0.266, |
| "mean_token_accuracy": 0.9166438400745391, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.15970752357129112, |
| "grad_norm": 0.26789992728688017, |
| "learning_rate": 1.9951923076923078e-05, |
| "loss": 0.2664, |
| "mean_token_accuracy": 0.916456151008606, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.16163171060227055, |
| "grad_norm": 0.2979302921999793, |
| "learning_rate": 2.0192307692307694e-05, |
| "loss": 0.2631, |
| "mean_token_accuracy": 0.9175773561000824, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.16355589763324996, |
| "grad_norm": 0.2831262323507047, |
| "learning_rate": 2.0432692307692307e-05, |
| "loss": 0.262, |
| "mean_token_accuracy": 0.917683893442154, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.16548008466422937, |
| "grad_norm": 0.28630264276949907, |
| "learning_rate": 2.0673076923076924e-05, |
| "loss": 0.2622, |
| "mean_token_accuracy": 0.9177160143852234, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.16740427169520877, |
| "grad_norm": 0.25697007177807973, |
| "learning_rate": 2.091346153846154e-05, |
| "loss": 0.259, |
| "mean_token_accuracy": 0.9182245790958404, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.16932845872618818, |
| "grad_norm": 0.23508881366270015, |
| "learning_rate": 2.1153846153846154e-05, |
| "loss": 0.2564, |
| "mean_token_accuracy": 0.9188485264778137, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.1712526457571676, |
| "grad_norm": 0.25717377003632436, |
| "learning_rate": 2.139423076923077e-05, |
| "loss": 0.2543, |
| "mean_token_accuracy": 0.9198683559894562, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.173176832788147, |
| "grad_norm": 0.24779708644039147, |
| "learning_rate": 2.1634615384615387e-05, |
| "loss": 0.2497, |
| "mean_token_accuracy": 0.9209720194339752, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.17510101981912643, |
| "grad_norm": 0.25956326734408053, |
| "learning_rate": 2.1875e-05, |
| "loss": 0.2559, |
| "mean_token_accuracy": 0.9193503677845001, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.17702520685010584, |
| "grad_norm": 0.24756279829978345, |
| "learning_rate": 2.2115384615384616e-05, |
| "loss": 0.2542, |
| "mean_token_accuracy": 0.9198038160800934, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.17894939388108524, |
| "grad_norm": 0.29512233740298904, |
| "learning_rate": 2.2355769230769233e-05, |
| "loss": 0.25, |
| "mean_token_accuracy": 0.9208518505096436, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.18087358091206465, |
| "grad_norm": 0.25467507539452205, |
| "learning_rate": 2.2596153846153846e-05, |
| "loss": 0.2471, |
| "mean_token_accuracy": 0.9220957338809967, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.18279776794304406, |
| "grad_norm": 0.24226828627780678, |
| "learning_rate": 2.2836538461538463e-05, |
| "loss": 0.2473, |
| "mean_token_accuracy": 0.9218408524990082, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.18472195497402347, |
| "grad_norm": 0.2613108563091361, |
| "learning_rate": 2.307692307692308e-05, |
| "loss": 0.2472, |
| "mean_token_accuracy": 0.9217159509658813, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.18664614200500287, |
| "grad_norm": 0.23328958429498128, |
| "learning_rate": 2.3317307692307692e-05, |
| "loss": 0.2486, |
| "mean_token_accuracy": 0.9216979265213012, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.1885703290359823, |
| "grad_norm": 0.23884925516205582, |
| "learning_rate": 2.355769230769231e-05, |
| "loss": 0.2452, |
| "mean_token_accuracy": 0.9223480701446534, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.19049451606696172, |
| "grad_norm": 0.23034686602700213, |
| "learning_rate": 2.3798076923076922e-05, |
| "loss": 0.245, |
| "mean_token_accuracy": 0.9224259674549102, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.19241870309794112, |
| "grad_norm": 0.23791074060824063, |
| "learning_rate": 2.4038461538461542e-05, |
| "loss": 0.2447, |
| "mean_token_accuracy": 0.9224281191825867, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.19434289012892053, |
| "grad_norm": 0.23743899436610488, |
| "learning_rate": 2.4278846153846155e-05, |
| "loss": 0.2386, |
| "mean_token_accuracy": 0.9246494829654693, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.19626707715989994, |
| "grad_norm": 0.25251125640157557, |
| "learning_rate": 2.4519230769230768e-05, |
| "loss": 0.2414, |
| "mean_token_accuracy": 0.9234608709812164, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.19819126419087935, |
| "grad_norm": 0.24599247169778568, |
| "learning_rate": 2.4759615384615388e-05, |
| "loss": 0.2374, |
| "mean_token_accuracy": 0.9246467411518097, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.20011545122185875, |
| "grad_norm": 0.2618092608426384, |
| "learning_rate": 2.5e-05, |
| "loss": 0.2382, |
| "mean_token_accuracy": 0.92443066239357, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.2020396382528382, |
| "grad_norm": 0.22703499557895365, |
| "learning_rate": 2.5240384615384614e-05, |
| "loss": 0.2368, |
| "mean_token_accuracy": 0.924897426366806, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.2039638252838176, |
| "grad_norm": 0.2607950217469307, |
| "learning_rate": 2.548076923076923e-05, |
| "loss": 0.2367, |
| "mean_token_accuracy": 0.9249145805835723, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.205888012314797, |
| "grad_norm": 0.24761827539633852, |
| "learning_rate": 2.5721153846153844e-05, |
| "loss": 0.2359, |
| "mean_token_accuracy": 0.924891984462738, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.2078121993457764, |
| "grad_norm": 0.23623344107823752, |
| "learning_rate": 2.5961538461538464e-05, |
| "loss": 0.2335, |
| "mean_token_accuracy": 0.9254171848297119, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.20973638637675582, |
| "grad_norm": 0.24983941011324626, |
| "learning_rate": 2.620192307692308e-05, |
| "loss": 0.2343, |
| "mean_token_accuracy": 0.9255397915840149, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.21166057340773523, |
| "grad_norm": 0.2292795574738372, |
| "learning_rate": 2.6442307692307694e-05, |
| "loss": 0.2345, |
| "mean_token_accuracy": 0.9254584074020386, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.21358476043871463, |
| "grad_norm": 0.2333800602809108, |
| "learning_rate": 2.668269230769231e-05, |
| "loss": 0.2292, |
| "mean_token_accuracy": 0.9271303296089173, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.21550894746969407, |
| "grad_norm": 0.26776777652632827, |
| "learning_rate": 2.6923076923076923e-05, |
| "loss": 0.228, |
| "mean_token_accuracy": 0.9271810591220856, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.21743313450067348, |
| "grad_norm": 0.22449836229224876, |
| "learning_rate": 2.7163461538461536e-05, |
| "loss": 0.2309, |
| "mean_token_accuracy": 0.9264620125293732, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.21935732153165288, |
| "grad_norm": 0.2478440965545411, |
| "learning_rate": 2.7403846153846156e-05, |
| "loss": 0.2279, |
| "mean_token_accuracy": 0.9271829903125763, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.2212815085626323, |
| "grad_norm": 0.2515161851481189, |
| "learning_rate": 2.7644230769230773e-05, |
| "loss": 0.2287, |
| "mean_token_accuracy": 0.9269922792911529, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.2232056955936117, |
| "grad_norm": 0.2372790976207239, |
| "learning_rate": 2.7884615384615386e-05, |
| "loss": 0.2238, |
| "mean_token_accuracy": 0.9283542573451996, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.2251298826245911, |
| "grad_norm": 0.2308583498822988, |
| "learning_rate": 2.8125000000000003e-05, |
| "loss": 0.2245, |
| "mean_token_accuracy": 0.9282214522361756, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.2270540696555705, |
| "grad_norm": 0.23936964659736945, |
| "learning_rate": 2.8365384615384616e-05, |
| "loss": 0.2256, |
| "mean_token_accuracy": 0.9280098855495453, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.22897825668654992, |
| "grad_norm": 0.23065855672205646, |
| "learning_rate": 2.860576923076923e-05, |
| "loss": 0.2264, |
| "mean_token_accuracy": 0.9275277733802796, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.23090244371752935, |
| "grad_norm": 0.23666991823944167, |
| "learning_rate": 2.8846153846153845e-05, |
| "loss": 0.2279, |
| "mean_token_accuracy": 0.927252185344696, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.23282663074850876, |
| "grad_norm": 0.2362324188657187, |
| "learning_rate": 2.9086538461538465e-05, |
| "loss": 0.2213, |
| "mean_token_accuracy": 0.9293029963970184, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.23475081777948817, |
| "grad_norm": 0.23060183587573338, |
| "learning_rate": 2.932692307692308e-05, |
| "loss": 0.2242, |
| "mean_token_accuracy": 0.9282744646072387, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.23667500481046758, |
| "grad_norm": 0.21245991724386762, |
| "learning_rate": 2.9567307692307695e-05, |
| "loss": 0.2233, |
| "mean_token_accuracy": 0.9289264142513275, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.23859919184144698, |
| "grad_norm": 0.23324459763738273, |
| "learning_rate": 2.9807692307692308e-05, |
| "loss": 0.2206, |
| "mean_token_accuracy": 0.9290121555328369, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.2405233788724264, |
| "grad_norm": 0.22128479385792282, |
| "learning_rate": 3.0048076923076925e-05, |
| "loss": 0.2208, |
| "mean_token_accuracy": 0.9296835780143737, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.2424475659034058, |
| "grad_norm": 0.22415876113873653, |
| "learning_rate": 3.0288461538461538e-05, |
| "loss": 0.219, |
| "mean_token_accuracy": 0.929944384098053, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.24437175293438523, |
| "grad_norm": 0.23998099576004747, |
| "learning_rate": 3.052884615384616e-05, |
| "loss": 0.2186, |
| "mean_token_accuracy": 0.9297797441482544, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.24629593996536464, |
| "grad_norm": 0.23903304478335358, |
| "learning_rate": 3.0769230769230774e-05, |
| "loss": 0.2191, |
| "mean_token_accuracy": 0.9300007164478302, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.24822012699634405, |
| "grad_norm": 0.21012421644185675, |
| "learning_rate": 3.1009615384615384e-05, |
| "loss": 0.2179, |
| "mean_token_accuracy": 0.9301288604736329, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.25014431402732346, |
| "grad_norm": 0.21025751764366593, |
| "learning_rate": 3.125e-05, |
| "loss": 0.2204, |
| "mean_token_accuracy": 0.9291583478450776, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.25206850105830286, |
| "grad_norm": 0.20781743259005545, |
| "learning_rate": 3.149038461538462e-05, |
| "loss": 0.2122, |
| "mean_token_accuracy": 0.9316769897937774, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.25399268808928227, |
| "grad_norm": 0.23423569885618364, |
| "learning_rate": 3.1730769230769234e-05, |
| "loss": 0.2161, |
| "mean_token_accuracy": 0.9302766799926758, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.2559168751202617, |
| "grad_norm": 0.2618434429763953, |
| "learning_rate": 3.1971153846153843e-05, |
| "loss": 0.2172, |
| "mean_token_accuracy": 0.9301712214946747, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.2578410621512411, |
| "grad_norm": 0.21344866448076094, |
| "learning_rate": 3.221153846153847e-05, |
| "loss": 0.2143, |
| "mean_token_accuracy": 0.9310815393924713, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.2597652491822205, |
| "grad_norm": 0.19740495346391507, |
| "learning_rate": 3.2451923076923077e-05, |
| "loss": 0.2149, |
| "mean_token_accuracy": 0.930985963344574, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.2616894362131999, |
| "grad_norm": 0.21087424388155931, |
| "learning_rate": 3.269230769230769e-05, |
| "loss": 0.2155, |
| "mean_token_accuracy": 0.9308305561542511, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.2636136232441793, |
| "grad_norm": 0.22783958767619705, |
| "learning_rate": 3.293269230769231e-05, |
| "loss": 0.2188, |
| "mean_token_accuracy": 0.9300263047218322, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.26553781027515877, |
| "grad_norm": 0.20761301453893283, |
| "learning_rate": 3.3173076923076926e-05, |
| "loss": 0.2165, |
| "mean_token_accuracy": 0.9306348443031311, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.2674619973061382, |
| "grad_norm": 0.21496205447404498, |
| "learning_rate": 3.3413461538461536e-05, |
| "loss": 0.2129, |
| "mean_token_accuracy": 0.9316029012203216, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.2693861843371176, |
| "grad_norm": 0.206245562580476, |
| "learning_rate": 3.365384615384616e-05, |
| "loss": 0.2134, |
| "mean_token_accuracy": 0.9316189765930176, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.271310371368097, |
| "grad_norm": 0.20802188731768906, |
| "learning_rate": 3.3894230769230776e-05, |
| "loss": 0.2117, |
| "mean_token_accuracy": 0.9319843292236328, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.2732345583990764, |
| "grad_norm": 0.2009145459195632, |
| "learning_rate": 3.4134615384615386e-05, |
| "loss": 0.2139, |
| "mean_token_accuracy": 0.9317736685276031, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.2751587454300558, |
| "grad_norm": 0.20069344484919666, |
| "learning_rate": 3.4375e-05, |
| "loss": 0.2116, |
| "mean_token_accuracy": 0.9322500884532928, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.2770829324610352, |
| "grad_norm": 0.20952627537217056, |
| "learning_rate": 3.461538461538462e-05, |
| "loss": 0.2124, |
| "mean_token_accuracy": 0.9313783466815948, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.2790071194920146, |
| "grad_norm": 0.20442808485359995, |
| "learning_rate": 3.485576923076923e-05, |
| "loss": 0.2092, |
| "mean_token_accuracy": 0.9327110588550568, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.28093130652299403, |
| "grad_norm": 0.19161824037544672, |
| "learning_rate": 3.5096153846153845e-05, |
| "loss": 0.2103, |
| "mean_token_accuracy": 0.9320014178752899, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.28285549355397344, |
| "grad_norm": 0.20837842647969745, |
| "learning_rate": 3.533653846153847e-05, |
| "loss": 0.2117, |
| "mean_token_accuracy": 0.9322972476482392, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.28477968058495284, |
| "grad_norm": 0.21822156699986803, |
| "learning_rate": 3.557692307692308e-05, |
| "loss": 0.2146, |
| "mean_token_accuracy": 0.9314618885517121, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.28670386761593225, |
| "grad_norm": 0.22156113384687548, |
| "learning_rate": 3.5817307692307695e-05, |
| "loss": 0.2101, |
| "mean_token_accuracy": 0.9324331998825073, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.28862805464691166, |
| "grad_norm": 0.20641479681027783, |
| "learning_rate": 3.605769230769231e-05, |
| "loss": 0.2138, |
| "mean_token_accuracy": 0.9312689363956451, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.29055224167789107, |
| "grad_norm": 0.23309235000114525, |
| "learning_rate": 3.629807692307692e-05, |
| "loss": 0.2096, |
| "mean_token_accuracy": 0.9329419672489166, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.29247642870887053, |
| "grad_norm": 0.21847662656225478, |
| "learning_rate": 3.653846153846154e-05, |
| "loss": 0.2131, |
| "mean_token_accuracy": 0.9315805315971375, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.29440061573984994, |
| "grad_norm": 0.18839750574033276, |
| "learning_rate": 3.677884615384616e-05, |
| "loss": 0.2105, |
| "mean_token_accuracy": 0.9323232233524322, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.29632480277082934, |
| "grad_norm": 0.21122122983313732, |
| "learning_rate": 3.701923076923077e-05, |
| "loss": 0.2078, |
| "mean_token_accuracy": 0.9330488741397858, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.29824898980180875, |
| "grad_norm": 0.21721244243557053, |
| "learning_rate": 3.725961538461539e-05, |
| "loss": 0.207, |
| "mean_token_accuracy": 0.9333645045757294, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.30017317683278816, |
| "grad_norm": 0.20222043253832608, |
| "learning_rate": 3.7500000000000003e-05, |
| "loss": 0.2078, |
| "mean_token_accuracy": 0.9331040740013122, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.30209736386376757, |
| "grad_norm": 0.19877908114652662, |
| "learning_rate": 3.774038461538461e-05, |
| "loss": 0.2099, |
| "mean_token_accuracy": 0.9326650083065033, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.304021550894747, |
| "grad_norm": 0.19472736523917117, |
| "learning_rate": 3.798076923076923e-05, |
| "loss": 0.2092, |
| "mean_token_accuracy": 0.9328258693218231, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.3059457379257264, |
| "grad_norm": 0.19321786695920545, |
| "learning_rate": 3.8221153846153846e-05, |
| "loss": 0.206, |
| "mean_token_accuracy": 0.9335503697395324, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.3078699249567058, |
| "grad_norm": 0.18972433432584948, |
| "learning_rate": 3.846153846153846e-05, |
| "loss": 0.2088, |
| "mean_token_accuracy": 0.9328764796257019, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.3097941119876852, |
| "grad_norm": 0.1923307709214479, |
| "learning_rate": 3.870192307692308e-05, |
| "loss": 0.2099, |
| "mean_token_accuracy": 0.9327602207660675, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.3117182990186646, |
| "grad_norm": 0.18143925889012769, |
| "learning_rate": 3.8942307692307696e-05, |
| "loss": 0.2076, |
| "mean_token_accuracy": 0.9332870185375214, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.313642486049644, |
| "grad_norm": 0.18503423559580173, |
| "learning_rate": 3.918269230769231e-05, |
| "loss": 0.2058, |
| "mean_token_accuracy": 0.9337727010250092, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.3155666730806234, |
| "grad_norm": 0.19520954027014065, |
| "learning_rate": 3.942307692307692e-05, |
| "loss": 0.2058, |
| "mean_token_accuracy": 0.9339201748371124, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.3174908601116028, |
| "grad_norm": 0.19288588146770536, |
| "learning_rate": 3.966346153846154e-05, |
| "loss": 0.2045, |
| "mean_token_accuracy": 0.9341778755187988, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.31941504714258223, |
| "grad_norm": 0.1978389444065355, |
| "learning_rate": 3.9903846153846155e-05, |
| "loss": 0.2067, |
| "mean_token_accuracy": 0.9330973982810974, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.3213392341735617, |
| "grad_norm": 0.19378571981651038, |
| "learning_rate": 4.014423076923077e-05, |
| "loss": 0.2063, |
| "mean_token_accuracy": 0.9334518015384674, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.3232634212045411, |
| "grad_norm": 0.19124108346964624, |
| "learning_rate": 4.038461538461539e-05, |
| "loss": 0.2064, |
| "mean_token_accuracy": 0.9336638748645782, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.3251876082355205, |
| "grad_norm": 0.18622997253186024, |
| "learning_rate": 4.0625000000000005e-05, |
| "loss": 0.2052, |
| "mean_token_accuracy": 0.933771300315857, |
| "step": 845 |
| }, |
| { |
| "epoch": 0.3271117952664999, |
| "grad_norm": 0.18047500059188085, |
| "learning_rate": 4.0865384615384615e-05, |
| "loss": 0.2036, |
| "mean_token_accuracy": 0.9343971490859986, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.3290359822974793, |
| "grad_norm": 0.17877914026319977, |
| "learning_rate": 4.110576923076923e-05, |
| "loss": 0.2048, |
| "mean_token_accuracy": 0.9344187080860138, |
| "step": 855 |
| }, |
| { |
| "epoch": 0.33096016932845873, |
| "grad_norm": 0.18446937805529498, |
| "learning_rate": 4.134615384615385e-05, |
| "loss": 0.202, |
| "mean_token_accuracy": 0.9348318338394165, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.33288435635943814, |
| "grad_norm": 0.1780966137250119, |
| "learning_rate": 4.1586538461538464e-05, |
| "loss": 0.2032, |
| "mean_token_accuracy": 0.9346541225910187, |
| "step": 865 |
| }, |
| { |
| "epoch": 0.33480854339041755, |
| "grad_norm": 0.18714951188569887, |
| "learning_rate": 4.182692307692308e-05, |
| "loss": 0.2018, |
| "mean_token_accuracy": 0.9349911212921143, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.33673273042139695, |
| "grad_norm": 0.18414606574133185, |
| "learning_rate": 4.20673076923077e-05, |
| "loss": 0.2068, |
| "mean_token_accuracy": 0.9334247648715973, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.33865691745237636, |
| "grad_norm": 0.1890699417514011, |
| "learning_rate": 4.230769230769231e-05, |
| "loss": 0.2022, |
| "mean_token_accuracy": 0.9347915649414062, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.34058110448335577, |
| "grad_norm": 0.1952117855015522, |
| "learning_rate": 4.2548076923076924e-05, |
| "loss": 0.2044, |
| "mean_token_accuracy": 0.9339782655239105, |
| "step": 885 |
| }, |
| { |
| "epoch": 0.3425052915143352, |
| "grad_norm": 0.17922877325513092, |
| "learning_rate": 4.278846153846154e-05, |
| "loss": 0.2036, |
| "mean_token_accuracy": 0.93400137424469, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.3444294785453146, |
| "grad_norm": 0.19127546209858898, |
| "learning_rate": 4.302884615384616e-05, |
| "loss": 0.2023, |
| "mean_token_accuracy": 0.9349508583545685, |
| "step": 895 |
| }, |
| { |
| "epoch": 0.346353665576294, |
| "grad_norm": 0.18030007018769648, |
| "learning_rate": 4.326923076923077e-05, |
| "loss": 0.2025, |
| "mean_token_accuracy": 0.9346420764923096, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.34827785260727345, |
| "grad_norm": 0.18714905196078213, |
| "learning_rate": 4.350961538461539e-05, |
| "loss": 0.2019, |
| "mean_token_accuracy": 0.9350101053714752, |
| "step": 905 |
| }, |
| { |
| "epoch": 0.35020203963825286, |
| "grad_norm": 0.18031917297695335, |
| "learning_rate": 4.375e-05, |
| "loss": 0.2038, |
| "mean_token_accuracy": 0.9343986690044404, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.35212622666923227, |
| "grad_norm": 0.17658313934751513, |
| "learning_rate": 4.3990384615384616e-05, |
| "loss": 0.2048, |
| "mean_token_accuracy": 0.934125417470932, |
| "step": 915 |
| }, |
| { |
| "epoch": 0.3540504137002117, |
| "grad_norm": 0.1853217964006783, |
| "learning_rate": 4.423076923076923e-05, |
| "loss": 0.1997, |
| "mean_token_accuracy": 0.9355320036411285, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.3559746007311911, |
| "grad_norm": 0.18337184641002502, |
| "learning_rate": 4.447115384615384e-05, |
| "loss": 0.2011, |
| "mean_token_accuracy": 0.9351063668727875, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.3578987877621705, |
| "grad_norm": 0.18647822815016663, |
| "learning_rate": 4.4711538461538466e-05, |
| "loss": 0.2079, |
| "mean_token_accuracy": 0.9331667721271515, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.3598229747931499, |
| "grad_norm": 0.1786427996544572, |
| "learning_rate": 4.495192307692308e-05, |
| "loss": 0.2063, |
| "mean_token_accuracy": 0.933509111404419, |
| "step": 935 |
| }, |
| { |
| "epoch": 0.3617471618241293, |
| "grad_norm": 0.17995728376539782, |
| "learning_rate": 4.519230769230769e-05, |
| "loss": 0.2056, |
| "mean_token_accuracy": 0.9337765216827393, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.3636713488551087, |
| "grad_norm": 0.18565188071732694, |
| "learning_rate": 4.543269230769231e-05, |
| "loss": 0.2033, |
| "mean_token_accuracy": 0.9345883190631866, |
| "step": 945 |
| }, |
| { |
| "epoch": 0.3655955358860881, |
| "grad_norm": 0.20682212909883085, |
| "learning_rate": 4.5673076923076925e-05, |
| "loss": 0.201, |
| "mean_token_accuracy": 0.935194593667984, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.3675197229170675, |
| "grad_norm": 0.17087885109099807, |
| "learning_rate": 4.591346153846154e-05, |
| "loss": 0.2023, |
| "mean_token_accuracy": 0.9346142172813415, |
| "step": 955 |
| }, |
| { |
| "epoch": 0.36944390994804693, |
| "grad_norm": 0.17283361779419767, |
| "learning_rate": 4.615384615384616e-05, |
| "loss": 0.2014, |
| "mean_token_accuracy": 0.934814327955246, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.37136809697902634, |
| "grad_norm": 0.17663705053214648, |
| "learning_rate": 4.6394230769230775e-05, |
| "loss": 0.2004, |
| "mean_token_accuracy": 0.9352030277252197, |
| "step": 965 |
| }, |
| { |
| "epoch": 0.37329228401000575, |
| "grad_norm": 0.17188964913509067, |
| "learning_rate": 4.6634615384615384e-05, |
| "loss": 0.2035, |
| "mean_token_accuracy": 0.9344177782535553, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.3752164710409852, |
| "grad_norm": 0.17299384159996276, |
| "learning_rate": 4.6875e-05, |
| "loss": 0.1993, |
| "mean_token_accuracy": 0.935628992319107, |
| "step": 975 |
| }, |
| { |
| "epoch": 0.3771406580719646, |
| "grad_norm": 0.1787796731368517, |
| "learning_rate": 4.711538461538462e-05, |
| "loss": 0.1996, |
| "mean_token_accuracy": 0.9355967044830322, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.379064845102944, |
| "grad_norm": 0.17902949352868802, |
| "learning_rate": 4.7355769230769234e-05, |
| "loss": 0.1998, |
| "mean_token_accuracy": 0.9354398012161255, |
| "step": 985 |
| }, |
| { |
| "epoch": 0.38098903213392343, |
| "grad_norm": 0.18172066960666447, |
| "learning_rate": 4.7596153846153844e-05, |
| "loss": 0.2011, |
| "mean_token_accuracy": 0.9351277530193329, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.38291321916490284, |
| "grad_norm": 0.18052681549380922, |
| "learning_rate": 4.783653846153847e-05, |
| "loss": 0.1991, |
| "mean_token_accuracy": 0.9355564057826996, |
| "step": 995 |
| }, |
| { |
| "epoch": 0.38483740619588225, |
| "grad_norm": 0.17224435083974163, |
| "learning_rate": 4.8076923076923084e-05, |
| "loss": 0.2009, |
| "mean_token_accuracy": 0.9353624939918518, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.38676159322686166, |
| "grad_norm": 0.18548492282198598, |
| "learning_rate": 4.8317307692307693e-05, |
| "loss": 0.2008, |
| "mean_token_accuracy": 0.9351412355899811, |
| "step": 1005 |
| }, |
| { |
| "epoch": 0.38868578025784106, |
| "grad_norm": 0.17377556409194556, |
| "learning_rate": 4.855769230769231e-05, |
| "loss": 0.2007, |
| "mean_token_accuracy": 0.9352908134460449, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.39060996728882047, |
| "grad_norm": 0.1773864121321377, |
| "learning_rate": 4.8798076923076926e-05, |
| "loss": 0.2005, |
| "mean_token_accuracy": 0.9353309094905853, |
| "step": 1015 |
| }, |
| { |
| "epoch": 0.3925341543197999, |
| "grad_norm": 0.1731017926113393, |
| "learning_rate": 4.9038461538461536e-05, |
| "loss": 0.2025, |
| "mean_token_accuracy": 0.934537136554718, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.3944583413507793, |
| "grad_norm": 0.16644220626271233, |
| "learning_rate": 4.927884615384616e-05, |
| "loss": 0.1976, |
| "mean_token_accuracy": 0.9359455347061157, |
| "step": 1025 |
| }, |
| { |
| "epoch": 0.3963825283817587, |
| "grad_norm": 0.17009248714497768, |
| "learning_rate": 4.9519230769230776e-05, |
| "loss": 0.2023, |
| "mean_token_accuracy": 0.934699285030365, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.3983067154127381, |
| "grad_norm": 0.17340750114798312, |
| "learning_rate": 4.9759615384615386e-05, |
| "loss": 0.1972, |
| "mean_token_accuracy": 0.9362557351589202, |
| "step": 1035 |
| }, |
| { |
| "epoch": 0.4002309024437175, |
| "grad_norm": 0.17122966366041767, |
| "learning_rate": 5e-05, |
| "loss": 0.1985, |
| "mean_token_accuracy": 0.935871708393097, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.4021550894746969, |
| "grad_norm": 0.17659027782667022, |
| "learning_rate": 4.999996826173528e-05, |
| "loss": 0.1995, |
| "mean_token_accuracy": 0.9355424761772155, |
| "step": 1045 |
| }, |
| { |
| "epoch": 0.4040792765056764, |
| "grad_norm": 0.18105261227658284, |
| "learning_rate": 4.999987304703068e-05, |
| "loss": 0.2029, |
| "mean_token_accuracy": 0.9345368921756745, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.4060034635366558, |
| "grad_norm": 0.17884616958670743, |
| "learning_rate": 4.999971435615479e-05, |
| "loss": 0.2037, |
| "mean_token_accuracy": 0.9342390179634095, |
| "step": 1055 |
| }, |
| { |
| "epoch": 0.4079276505676352, |
| "grad_norm": 0.17233022523598526, |
| "learning_rate": 4.999949218955533e-05, |
| "loss": 0.2001, |
| "mean_token_accuracy": 0.9354979634284973, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.4098518375986146, |
| "grad_norm": 0.16780066324475737, |
| "learning_rate": 4.999920654785905e-05, |
| "loss": 0.1982, |
| "mean_token_accuracy": 0.9359723389148712, |
| "step": 1065 |
| }, |
| { |
| "epoch": 0.411776024629594, |
| "grad_norm": 0.16408063918538596, |
| "learning_rate": 4.999885743187181e-05, |
| "loss": 0.1969, |
| "mean_token_accuracy": 0.9364181160926819, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.4137002116605734, |
| "grad_norm": 0.16436131857729716, |
| "learning_rate": 4.9998444842578536e-05, |
| "loss": 0.197, |
| "mean_token_accuracy": 0.9364462256431579, |
| "step": 1075 |
| }, |
| { |
| "epoch": 0.4156243986915528, |
| "grad_norm": 0.1649527103012933, |
| "learning_rate": 4.999796878114321e-05, |
| "loss": 0.2003, |
| "mean_token_accuracy": 0.9352196276187896, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.41754858572253223, |
| "grad_norm": 0.1753592212006608, |
| "learning_rate": 4.9997429248908874e-05, |
| "loss": 0.2027, |
| "mean_token_accuracy": 0.9347833514213562, |
| "step": 1085 |
| }, |
| { |
| "epoch": 0.41947277275351164, |
| "grad_norm": 0.1721618475449869, |
| "learning_rate": 4.999682624739765e-05, |
| "loss": 0.1957, |
| "mean_token_accuracy": 0.9364085972309113, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.42139695978449104, |
| "grad_norm": 0.15782787279679403, |
| "learning_rate": 4.9996159778310734e-05, |
| "loss": 0.2024, |
| "mean_token_accuracy": 0.9344262480735779, |
| "step": 1095 |
| }, |
| { |
| "epoch": 0.42332114681547045, |
| "grad_norm": 0.16736454601981463, |
| "learning_rate": 4.9995429843528316e-05, |
| "loss": 0.2025, |
| "mean_token_accuracy": 0.9348289132118225, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.42524533384644986, |
| "grad_norm": 0.16222474872691323, |
| "learning_rate": 4.999463644510971e-05, |
| "loss": 0.2023, |
| "mean_token_accuracy": 0.9346409559249877, |
| "step": 1105 |
| }, |
| { |
| "epoch": 0.42716952087742927, |
| "grad_norm": 0.15986614324171702, |
| "learning_rate": 4.999377958529322e-05, |
| "loss": 0.2, |
| "mean_token_accuracy": 0.9356332302093506, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.4290937079084087, |
| "grad_norm": 0.16333128122988397, |
| "learning_rate": 4.99928592664962e-05, |
| "loss": 0.1979, |
| "mean_token_accuracy": 0.9361936330795289, |
| "step": 1115 |
| }, |
| { |
| "epoch": 0.43101789493938814, |
| "grad_norm": 0.16227308161461754, |
| "learning_rate": 4.9991875491315034e-05, |
| "loss": 0.1972, |
| "mean_token_accuracy": 0.9363140523433685, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.43294208197036754, |
| "grad_norm": 0.1713591306571476, |
| "learning_rate": 4.999082826252513e-05, |
| "loss": 0.197, |
| "mean_token_accuracy": 0.9363386750221252, |
| "step": 1125 |
| }, |
| { |
| "epoch": 0.43486626900134695, |
| "grad_norm": 0.15990954791736506, |
| "learning_rate": 4.9989717583080906e-05, |
| "loss": 0.1972, |
| "mean_token_accuracy": 0.9362068951129914, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.43679045603232636, |
| "grad_norm": 0.1737279224848079, |
| "learning_rate": 4.998854345611579e-05, |
| "loss": 0.1956, |
| "mean_token_accuracy": 0.9366493225097656, |
| "step": 1135 |
| }, |
| { |
| "epoch": 0.43871464306330576, |
| "grad_norm": 0.1562327893766701, |
| "learning_rate": 4.998730588494221e-05, |
| "loss": 0.1971, |
| "mean_token_accuracy": 0.9364685654640198, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.44063883009428517, |
| "grad_norm": 0.1588473310407234, |
| "learning_rate": 4.998600487305156e-05, |
| "loss": 0.1948, |
| "mean_token_accuracy": 0.9368900418281555, |
| "step": 1145 |
| }, |
| { |
| "epoch": 0.4425630171252646, |
| "grad_norm": 0.17995519422150538, |
| "learning_rate": 4.998464042411424e-05, |
| "loss": 0.1991, |
| "mean_token_accuracy": 0.9357859253883362, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.444487204156244, |
| "grad_norm": 0.16926915405066503, |
| "learning_rate": 4.9983212541979594e-05, |
| "loss": 0.1971, |
| "mean_token_accuracy": 0.9361749947071075, |
| "step": 1155 |
| }, |
| { |
| "epoch": 0.4464113911872234, |
| "grad_norm": 0.16163021262186142, |
| "learning_rate": 4.998172123067595e-05, |
| "loss": 0.1956, |
| "mean_token_accuracy": 0.936591511964798, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.4483355782182028, |
| "grad_norm": 0.15952963787166335, |
| "learning_rate": 4.9980166494410556e-05, |
| "loss": 0.1929, |
| "mean_token_accuracy": 0.937582665681839, |
| "step": 1165 |
| }, |
| { |
| "epoch": 0.4502597652491822, |
| "grad_norm": 0.16284192484187074, |
| "learning_rate": 4.99785483375696e-05, |
| "loss": 0.1971, |
| "mean_token_accuracy": 0.9360791742801666, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.4521839522801616, |
| "grad_norm": 0.15441856261654946, |
| "learning_rate": 4.99768667647182e-05, |
| "loss": 0.1922, |
| "mean_token_accuracy": 0.9375222563743592, |
| "step": 1175 |
| }, |
| { |
| "epoch": 0.454108139311141, |
| "grad_norm": 0.16405561020180287, |
| "learning_rate": 4.9975121780600356e-05, |
| "loss": 0.1965, |
| "mean_token_accuracy": 0.9364106237888337, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.45603232634212043, |
| "grad_norm": 0.14887241164649884, |
| "learning_rate": 4.9973313390138985e-05, |
| "loss": 0.1947, |
| "mean_token_accuracy": 0.9369799256324768, |
| "step": 1185 |
| }, |
| { |
| "epoch": 0.45795651337309984, |
| "grad_norm": 0.15910575747869182, |
| "learning_rate": 4.9971441598435905e-05, |
| "loss": 0.1997, |
| "mean_token_accuracy": 0.9359139621257782, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.4598807004040793, |
| "grad_norm": 0.1596570148547143, |
| "learning_rate": 4.996950641077174e-05, |
| "loss": 0.1938, |
| "mean_token_accuracy": 0.9371317565441132, |
| "step": 1195 |
| }, |
| { |
| "epoch": 0.4618048874350587, |
| "grad_norm": 0.1542919586208301, |
| "learning_rate": 4.996750783260602e-05, |
| "loss": 0.1972, |
| "mean_token_accuracy": 0.9365085899829865, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.4637290744660381, |
| "grad_norm": 0.1554002350742973, |
| "learning_rate": 4.9965445869577106e-05, |
| "loss": 0.193, |
| "mean_token_accuracy": 0.9373429834842681, |
| "step": 1205 |
| }, |
| { |
| "epoch": 0.4656532614970175, |
| "grad_norm": 0.15006275457030324, |
| "learning_rate": 4.996332052750214e-05, |
| "loss": 0.1931, |
| "mean_token_accuracy": 0.9375267207622529, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.46757744852799693, |
| "grad_norm": 0.15502590460532487, |
| "learning_rate": 4.99611318123771e-05, |
| "loss": 0.1941, |
| "mean_token_accuracy": 0.9373384654521942, |
| "step": 1215 |
| }, |
| { |
| "epoch": 0.46950163555897634, |
| "grad_norm": 0.14854596494688704, |
| "learning_rate": 4.9958879730376754e-05, |
| "loss": 0.1926, |
| "mean_token_accuracy": 0.9373452723026275, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.47142582258995575, |
| "grad_norm": 0.14988651130185893, |
| "learning_rate": 4.995656428785461e-05, |
| "loss": 0.198, |
| "mean_token_accuracy": 0.936055588722229, |
| "step": 1225 |
| }, |
| { |
| "epoch": 0.47335000962093515, |
| "grad_norm": 0.15044586479128982, |
| "learning_rate": 4.995418549134296e-05, |
| "loss": 0.194, |
| "mean_token_accuracy": 0.937198007106781, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.47527419665191456, |
| "grad_norm": 0.14304018627938955, |
| "learning_rate": 4.995174334755281e-05, |
| "loss": 0.1913, |
| "mean_token_accuracy": 0.9380493998527527, |
| "step": 1235 |
| }, |
| { |
| "epoch": 0.47719838368289397, |
| "grad_norm": 0.15847240603024573, |
| "learning_rate": 4.994923786337389e-05, |
| "loss": 0.1941, |
| "mean_token_accuracy": 0.9369276583194732, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.4791225707138734, |
| "grad_norm": 0.15050361963809947, |
| "learning_rate": 4.9946669045874616e-05, |
| "loss": 0.1965, |
| "mean_token_accuracy": 0.9365364372730255, |
| "step": 1245 |
| }, |
| { |
| "epoch": 0.4810467577448528, |
| "grad_norm": 0.15143997168073883, |
| "learning_rate": 4.994403690230208e-05, |
| "loss": 0.1952, |
| "mean_token_accuracy": 0.9370022714138031, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.4829709447758322, |
| "grad_norm": 0.14721494174882732, |
| "learning_rate": 4.994134144008203e-05, |
| "loss": 0.1924, |
| "mean_token_accuracy": 0.9377739369869232, |
| "step": 1255 |
| }, |
| { |
| "epoch": 0.4848951318068116, |
| "grad_norm": 0.15299254663295053, |
| "learning_rate": 4.993858266681885e-05, |
| "loss": 0.194, |
| "mean_token_accuracy": 0.9371655225753784, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.48681931883779106, |
| "grad_norm": 0.15807599202373432, |
| "learning_rate": 4.9935760590295534e-05, |
| "loss": 0.1923, |
| "mean_token_accuracy": 0.9380616784095764, |
| "step": 1265 |
| }, |
| { |
| "epoch": 0.48874350586877047, |
| "grad_norm": 0.14790047243010532, |
| "learning_rate": 4.9932875218473666e-05, |
| "loss": 0.1921, |
| "mean_token_accuracy": 0.9376394093036652, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.4906676928997499, |
| "grad_norm": 0.15177558035884425, |
| "learning_rate": 4.992992655949339e-05, |
| "loss": 0.193, |
| "mean_token_accuracy": 0.9373527705669403, |
| "step": 1275 |
| }, |
| { |
| "epoch": 0.4925918799307293, |
| "grad_norm": 0.15030494230476282, |
| "learning_rate": 4.992691462167342e-05, |
| "loss": 0.1952, |
| "mean_token_accuracy": 0.93678218126297, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.4945160669617087, |
| "grad_norm": 0.14934065421375978, |
| "learning_rate": 4.992383941351094e-05, |
| "loss": 0.1924, |
| "mean_token_accuracy": 0.9374918937683105, |
| "step": 1285 |
| }, |
| { |
| "epoch": 0.4964402539926881, |
| "grad_norm": 0.14795547731798583, |
| "learning_rate": 4.9920700943681695e-05, |
| "loss": 0.1918, |
| "mean_token_accuracy": 0.937732708454132, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.4983644410236675, |
| "grad_norm": 0.14706674754198268, |
| "learning_rate": 4.991749922103984e-05, |
| "loss": 0.1937, |
| "mean_token_accuracy": 0.9371985256671905, |
| "step": 1295 |
| }, |
| { |
| "epoch": 0.5002886280546469, |
| "grad_norm": 0.18209623021883456, |
| "learning_rate": 4.991423425461804e-05, |
| "loss": 0.1925, |
| "mean_token_accuracy": 0.9376034200191498, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.5022128150856263, |
| "grad_norm": 0.14998754818873625, |
| "learning_rate": 4.991090605362733e-05, |
| "loss": 0.1952, |
| "mean_token_accuracy": 0.9369374454021454, |
| "step": 1305 |
| }, |
| { |
| "epoch": 0.5041370021166057, |
| "grad_norm": 0.14623146138818685, |
| "learning_rate": 4.990751462745717e-05, |
| "loss": 0.1925, |
| "mean_token_accuracy": 0.9375016152858734, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.5060611891475851, |
| "grad_norm": 0.14877961023165806, |
| "learning_rate": 4.990405998567537e-05, |
| "loss": 0.1942, |
| "mean_token_accuracy": 0.9372852742671967, |
| "step": 1315 |
| }, |
| { |
| "epoch": 0.5079853761785645, |
| "grad_norm": 0.14508285936489035, |
| "learning_rate": 4.99005421380281e-05, |
| "loss": 0.1942, |
| "mean_token_accuracy": 0.9370539069175721, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.509909563209544, |
| "grad_norm": 0.14621668211521224, |
| "learning_rate": 4.9896961094439844e-05, |
| "loss": 0.1921, |
| "mean_token_accuracy": 0.9378675162792206, |
| "step": 1325 |
| }, |
| { |
| "epoch": 0.5118337502405234, |
| "grad_norm": 0.14679732692209072, |
| "learning_rate": 4.989331686501335e-05, |
| "loss": 0.1878, |
| "mean_token_accuracy": 0.9390625476837158, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.5137579372715028, |
| "grad_norm": 0.14745685811150153, |
| "learning_rate": 4.9889609460029654e-05, |
| "loss": 0.1905, |
| "mean_token_accuracy": 0.9380582571029663, |
| "step": 1335 |
| }, |
| { |
| "epoch": 0.5156821243024822, |
| "grad_norm": 0.14611296052225958, |
| "learning_rate": 4.988583888994802e-05, |
| "loss": 0.1899, |
| "mean_token_accuracy": 0.938438081741333, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.5176063113334616, |
| "grad_norm": 0.15273461896563223, |
| "learning_rate": 4.9882005165405885e-05, |
| "loss": 0.1932, |
| "mean_token_accuracy": 0.9374414086341858, |
| "step": 1345 |
| }, |
| { |
| "epoch": 0.519530498364441, |
| "grad_norm": 0.1440988164125214, |
| "learning_rate": 4.987810829721887e-05, |
| "loss": 0.192, |
| "mean_token_accuracy": 0.9377279877662659, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.5214546853954204, |
| "grad_norm": 0.14678789036749104, |
| "learning_rate": 4.9874148296380754e-05, |
| "loss": 0.1936, |
| "mean_token_accuracy": 0.9375370383262634, |
| "step": 1355 |
| }, |
| { |
| "epoch": 0.5233788724263998, |
| "grad_norm": 0.14293580047102425, |
| "learning_rate": 4.9870125174063384e-05, |
| "loss": 0.1901, |
| "mean_token_accuracy": 0.9380953311920166, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.5253030594573792, |
| "grad_norm": 0.14829079991813543, |
| "learning_rate": 4.9866038941616736e-05, |
| "loss": 0.1891, |
| "mean_token_accuracy": 0.9382195234298706, |
| "step": 1365 |
| }, |
| { |
| "epoch": 0.5272272464883586, |
| "grad_norm": 0.1454512033844362, |
| "learning_rate": 4.986188961056879e-05, |
| "loss": 0.1937, |
| "mean_token_accuracy": 0.9372387111186982, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.5291514335193381, |
| "grad_norm": 0.138895169464844, |
| "learning_rate": 4.9857677192625564e-05, |
| "loss": 0.1906, |
| "mean_token_accuracy": 0.9381425142288208, |
| "step": 1375 |
| }, |
| { |
| "epoch": 0.5310756205503175, |
| "grad_norm": 0.14740272656815195, |
| "learning_rate": 4.9853401699671016e-05, |
| "loss": 0.1896, |
| "mean_token_accuracy": 0.9385837614536285, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.532999807581297, |
| "grad_norm": 0.14804617939261644, |
| "learning_rate": 4.98490631437671e-05, |
| "loss": 0.1924, |
| "mean_token_accuracy": 0.9376095533370972, |
| "step": 1385 |
| }, |
| { |
| "epoch": 0.5349239946122764, |
| "grad_norm": 0.13962868701210818, |
| "learning_rate": 4.9844661537153656e-05, |
| "loss": 0.1894, |
| "mean_token_accuracy": 0.9385484099388123, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.5368481816432558, |
| "grad_norm": 0.13562255479313326, |
| "learning_rate": 4.98401968922484e-05, |
| "loss": 0.1924, |
| "mean_token_accuracy": 0.9374591946601868, |
| "step": 1395 |
| }, |
| { |
| "epoch": 0.5387723686742352, |
| "grad_norm": 0.13681927313682837, |
| "learning_rate": 4.9835669221646896e-05, |
| "loss": 0.1907, |
| "mean_token_accuracy": 0.9382172763347626, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.5406965557052146, |
| "grad_norm": 0.1416451055009483, |
| "learning_rate": 4.983107853812252e-05, |
| "loss": 0.1899, |
| "mean_token_accuracy": 0.9381133437156677, |
| "step": 1405 |
| }, |
| { |
| "epoch": 0.542620742736194, |
| "grad_norm": 0.1398213343702262, |
| "learning_rate": 4.98264248546264e-05, |
| "loss": 0.19, |
| "mean_token_accuracy": 0.9381514072418213, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.5445449297671734, |
| "grad_norm": 0.14336297674820447, |
| "learning_rate": 4.982170818428742e-05, |
| "loss": 0.1924, |
| "mean_token_accuracy": 0.9378505229949952, |
| "step": 1415 |
| }, |
| { |
| "epoch": 0.5464691167981528, |
| "grad_norm": 0.1378930710651164, |
| "learning_rate": 4.981692854041215e-05, |
| "loss": 0.1878, |
| "mean_token_accuracy": 0.938957291841507, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.5483933038291322, |
| "grad_norm": 0.1543001875922398, |
| "learning_rate": 4.981208593648482e-05, |
| "loss": 0.1922, |
| "mean_token_accuracy": 0.937881326675415, |
| "step": 1425 |
| }, |
| { |
| "epoch": 0.5503174908601116, |
| "grad_norm": 0.14381566153232886, |
| "learning_rate": 4.980718038616728e-05, |
| "loss": 0.1897, |
| "mean_token_accuracy": 0.9386523187160491, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.552241677891091, |
| "grad_norm": 0.1414593617076599, |
| "learning_rate": 4.980221190329898e-05, |
| "loss": 0.1897, |
| "mean_token_accuracy": 0.938573569059372, |
| "step": 1435 |
| }, |
| { |
| "epoch": 0.5541658649220704, |
| "grad_norm": 0.13802418684524773, |
| "learning_rate": 4.979718050189688e-05, |
| "loss": 0.1891, |
| "mean_token_accuracy": 0.9385579884052276, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.5560900519530498, |
| "grad_norm": 0.13125752086195527, |
| "learning_rate": 4.979208619615547e-05, |
| "loss": 0.191, |
| "mean_token_accuracy": 0.9383366286754609, |
| "step": 1445 |
| }, |
| { |
| "epoch": 0.5580142389840292, |
| "grad_norm": 0.14209482273294116, |
| "learning_rate": 4.978692900044671e-05, |
| "loss": 0.1882, |
| "mean_token_accuracy": 0.9388597071170807, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.5599384260150087, |
| "grad_norm": 0.13255400542137394, |
| "learning_rate": 4.978170892931996e-05, |
| "loss": 0.1882, |
| "mean_token_accuracy": 0.9389723241329193, |
| "step": 1455 |
| }, |
| { |
| "epoch": 0.5618626130459881, |
| "grad_norm": 0.13459691303653742, |
| "learning_rate": 4.977642599750198e-05, |
| "loss": 0.1883, |
| "mean_token_accuracy": 0.9389943897724151, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.5637868000769675, |
| "grad_norm": 0.13445586406317245, |
| "learning_rate": 4.9771080219896875e-05, |
| "loss": 0.1897, |
| "mean_token_accuracy": 0.9383779525756836, |
| "step": 1465 |
| }, |
| { |
| "epoch": 0.5657109871079469, |
| "grad_norm": 0.14319192722592644, |
| "learning_rate": 4.976567161158603e-05, |
| "loss": 0.1877, |
| "mean_token_accuracy": 0.9391680121421814, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.5676351741389263, |
| "grad_norm": 0.14311727857762083, |
| "learning_rate": 4.9760200187828104e-05, |
| "loss": 0.19, |
| "mean_token_accuracy": 0.9383435606956482, |
| "step": 1475 |
| }, |
| { |
| "epoch": 0.5695593611699057, |
| "grad_norm": 0.1433601280447413, |
| "learning_rate": 4.9754665964058956e-05, |
| "loss": 0.1917, |
| "mean_token_accuracy": 0.9377669870853425, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.5714835482008851, |
| "grad_norm": 0.1372126840203153, |
| "learning_rate": 4.974906895589162e-05, |
| "loss": 0.1897, |
| "mean_token_accuracy": 0.9382838129997253, |
| "step": 1485 |
| }, |
| { |
| "epoch": 0.5734077352318645, |
| "grad_norm": 0.14135144914463524, |
| "learning_rate": 4.974340917911628e-05, |
| "loss": 0.1889, |
| "mean_token_accuracy": 0.9386630952358246, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.5753319222628439, |
| "grad_norm": 0.1431861008729524, |
| "learning_rate": 4.9737686649700154e-05, |
| "loss": 0.1886, |
| "mean_token_accuracy": 0.9389110207557678, |
| "step": 1495 |
| }, |
| { |
| "epoch": 0.5772561092938233, |
| "grad_norm": 0.134683506562364, |
| "learning_rate": 4.973190138378754e-05, |
| "loss": 0.19, |
| "mean_token_accuracy": 0.9384437501430511, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.5791802963248027, |
| "grad_norm": 0.13612786893218118, |
| "learning_rate": 4.97260533976997e-05, |
| "loss": 0.1915, |
| "mean_token_accuracy": 0.9379990577697754, |
| "step": 1505 |
| }, |
| { |
| "epoch": 0.5811044833557821, |
| "grad_norm": 0.14289619967069558, |
| "learning_rate": 4.972014270793485e-05, |
| "loss": 0.1897, |
| "mean_token_accuracy": 0.9383374452590942, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.5830286703867615, |
| "grad_norm": 0.13842544154199743, |
| "learning_rate": 4.9714169331168104e-05, |
| "loss": 0.1923, |
| "mean_token_accuracy": 0.9376774847507476, |
| "step": 1515 |
| }, |
| { |
| "epoch": 0.5849528574177411, |
| "grad_norm": 0.13006096264789718, |
| "learning_rate": 4.970813328425143e-05, |
| "loss": 0.1871, |
| "mean_token_accuracy": 0.9393136322498321, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.5868770444487205, |
| "grad_norm": 0.13174047857207602, |
| "learning_rate": 4.9702034584213605e-05, |
| "loss": 0.1905, |
| "mean_token_accuracy": 0.9378688871860504, |
| "step": 1525 |
| }, |
| { |
| "epoch": 0.5888012314796999, |
| "grad_norm": 0.1300374585549842, |
| "learning_rate": 4.9695873248260145e-05, |
| "loss": 0.1879, |
| "mean_token_accuracy": 0.9391686737537384, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.5907254185106793, |
| "grad_norm": 0.13007064298516266, |
| "learning_rate": 4.968964929377328e-05, |
| "loss": 0.1885, |
| "mean_token_accuracy": 0.9387658298015594, |
| "step": 1535 |
| }, |
| { |
| "epoch": 0.5926496055416587, |
| "grad_norm": 0.13360649826667276, |
| "learning_rate": 4.9683362738311913e-05, |
| "loss": 0.1875, |
| "mean_token_accuracy": 0.9391666889190674, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.5945737925726381, |
| "grad_norm": 0.13060804250585295, |
| "learning_rate": 4.967701359961152e-05, |
| "loss": 0.1889, |
| "mean_token_accuracy": 0.9385679483413696, |
| "step": 1545 |
| }, |
| { |
| "epoch": 0.5964979796036175, |
| "grad_norm": 0.14032541180172345, |
| "learning_rate": 4.9670601895584186e-05, |
| "loss": 0.1842, |
| "mean_token_accuracy": 0.9397392094135284, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.5984221666345969, |
| "grad_norm": 0.13607270266800195, |
| "learning_rate": 4.966412764431845e-05, |
| "loss": 0.1892, |
| "mean_token_accuracy": 0.9384610235691071, |
| "step": 1555 |
| }, |
| { |
| "epoch": 0.6003463536655763, |
| "grad_norm": 0.1312707683845405, |
| "learning_rate": 4.965759086407936e-05, |
| "loss": 0.1882, |
| "mean_token_accuracy": 0.9386238515377044, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.6022705406965557, |
| "grad_norm": 0.13457563564564892, |
| "learning_rate": 4.965099157330832e-05, |
| "loss": 0.185, |
| "mean_token_accuracy": 0.939483368396759, |
| "step": 1565 |
| }, |
| { |
| "epoch": 0.6041947277275351, |
| "grad_norm": 0.13439942039177172, |
| "learning_rate": 4.964432979062313e-05, |
| "loss": 0.1842, |
| "mean_token_accuracy": 0.9405307114124298, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.6061189147585145, |
| "grad_norm": 0.1277496982429766, |
| "learning_rate": 4.963760553481786e-05, |
| "loss": 0.1899, |
| "mean_token_accuracy": 0.9384656131267548, |
| "step": 1575 |
| }, |
| { |
| "epoch": 0.608043101789494, |
| "grad_norm": 0.1330158934788981, |
| "learning_rate": 4.963081882486284e-05, |
| "loss": 0.1834, |
| "mean_token_accuracy": 0.9401716351509094, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.6099672888204734, |
| "grad_norm": 0.13754542628006322, |
| "learning_rate": 4.96239696799046e-05, |
| "loss": 0.1919, |
| "mean_token_accuracy": 0.9377372145652771, |
| "step": 1585 |
| }, |
| { |
| "epoch": 0.6118914758514528, |
| "grad_norm": 0.14098492971953913, |
| "learning_rate": 4.9617058119265805e-05, |
| "loss": 0.1894, |
| "mean_token_accuracy": 0.9386878430843353, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.6138156628824322, |
| "grad_norm": 0.12664632216663757, |
| "learning_rate": 4.961008416244519e-05, |
| "loss": 0.185, |
| "mean_token_accuracy": 0.9396873474121094, |
| "step": 1595 |
| }, |
| { |
| "epoch": 0.6157398499134116, |
| "grad_norm": 0.13549624226878912, |
| "learning_rate": 4.960304782911756e-05, |
| "loss": 0.1863, |
| "mean_token_accuracy": 0.9393212020397186, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.617664036944391, |
| "grad_norm": 0.1276823140599733, |
| "learning_rate": 4.959594913913366e-05, |
| "loss": 0.1868, |
| "mean_token_accuracy": 0.9388953983783722, |
| "step": 1605 |
| }, |
| { |
| "epoch": 0.6195882239753704, |
| "grad_norm": 0.1336813264156418, |
| "learning_rate": 4.9588788112520164e-05, |
| "loss": 0.1845, |
| "mean_token_accuracy": 0.9399426996707916, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.6215124110063498, |
| "grad_norm": 0.13481862710535897, |
| "learning_rate": 4.958156476947961e-05, |
| "loss": 0.1831, |
| "mean_token_accuracy": 0.940336000919342, |
| "step": 1615 |
| }, |
| { |
| "epoch": 0.6234365980373292, |
| "grad_norm": 0.13324160235017124, |
| "learning_rate": 4.957427913039034e-05, |
| "loss": 0.1893, |
| "mean_token_accuracy": 0.9386727988719941, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.6253607850683086, |
| "grad_norm": 0.13030229625983306, |
| "learning_rate": 4.9566931215806464e-05, |
| "loss": 0.1883, |
| "mean_token_accuracy": 0.938975191116333, |
| "step": 1625 |
| }, |
| { |
| "epoch": 0.627284972099288, |
| "grad_norm": 0.12493763582679271, |
| "learning_rate": 4.955952104645775e-05, |
| "loss": 0.1846, |
| "mean_token_accuracy": 0.9397555112838745, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.6292091591302674, |
| "grad_norm": 0.12723126191253217, |
| "learning_rate": 4.955204864324961e-05, |
| "loss": 0.183, |
| "mean_token_accuracy": 0.9404920816421509, |
| "step": 1635 |
| }, |
| { |
| "epoch": 0.6311333461612468, |
| "grad_norm": 0.1293781315765133, |
| "learning_rate": 4.9544514027263034e-05, |
| "loss": 0.1879, |
| "mean_token_accuracy": 0.9391253471374512, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.6330575331922262, |
| "grad_norm": 0.13138978342222302, |
| "learning_rate": 4.953691721975453e-05, |
| "loss": 0.1867, |
| "mean_token_accuracy": 0.9391219913959503, |
| "step": 1645 |
| }, |
| { |
| "epoch": 0.6349817202232056, |
| "grad_norm": 0.12270187391728252, |
| "learning_rate": 4.9529258242156046e-05, |
| "loss": 0.1851, |
| "mean_token_accuracy": 0.9395655930042267, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.636905907254185, |
| "grad_norm": 0.12918156512326553, |
| "learning_rate": 4.9521537116074926e-05, |
| "loss": 0.1857, |
| "mean_token_accuracy": 0.9395742118358612, |
| "step": 1655 |
| }, |
| { |
| "epoch": 0.6388300942851645, |
| "grad_norm": 0.13972950651489827, |
| "learning_rate": 4.951375386329387e-05, |
| "loss": 0.1859, |
| "mean_token_accuracy": 0.9398290574550628, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.640754281316144, |
| "grad_norm": 0.12809225460490406, |
| "learning_rate": 4.95059085057708e-05, |
| "loss": 0.1873, |
| "mean_token_accuracy": 0.9390866160392761, |
| "step": 1665 |
| }, |
| { |
| "epoch": 0.6426784683471234, |
| "grad_norm": 0.13667531914360745, |
| "learning_rate": 4.949800106563889e-05, |
| "loss": 0.1849, |
| "mean_token_accuracy": 0.9396887421607971, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.6446026553781028, |
| "grad_norm": 0.13602807879743783, |
| "learning_rate": 4.9490031565206445e-05, |
| "loss": 0.1863, |
| "mean_token_accuracy": 0.939508056640625, |
| "step": 1675 |
| }, |
| { |
| "epoch": 0.6465268424090822, |
| "grad_norm": 0.13096626380887844, |
| "learning_rate": 4.948200002695685e-05, |
| "loss": 0.1842, |
| "mean_token_accuracy": 0.9400151014328003, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.6484510294400616, |
| "grad_norm": 0.12291861404114918, |
| "learning_rate": 4.947390647354851e-05, |
| "loss": 0.1873, |
| "mean_token_accuracy": 0.9389609873294831, |
| "step": 1685 |
| }, |
| { |
| "epoch": 0.650375216471041, |
| "grad_norm": 0.13409913540340435, |
| "learning_rate": 4.94657509278148e-05, |
| "loss": 0.1859, |
| "mean_token_accuracy": 0.9396482467651367, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.6522994035020204, |
| "grad_norm": 0.12281251011094463, |
| "learning_rate": 4.945753341276395e-05, |
| "loss": 0.1848, |
| "mean_token_accuracy": 0.939803171157837, |
| "step": 1695 |
| }, |
| { |
| "epoch": 0.6542235905329998, |
| "grad_norm": 0.1228749169492863, |
| "learning_rate": 4.944925395157907e-05, |
| "loss": 0.1875, |
| "mean_token_accuracy": 0.9391197860240936, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.6561477775639792, |
| "grad_norm": 0.12664527861803945, |
| "learning_rate": 4.944091256761798e-05, |
| "loss": 0.1892, |
| "mean_token_accuracy": 0.9383731603622436, |
| "step": 1705 |
| }, |
| { |
| "epoch": 0.6580719645949586, |
| "grad_norm": 0.12403803816854818, |
| "learning_rate": 4.943250928441324e-05, |
| "loss": 0.1836, |
| "mean_token_accuracy": 0.9401000320911408, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.659996151625938, |
| "grad_norm": 0.1342330704907052, |
| "learning_rate": 4.942404412567201e-05, |
| "loss": 0.1887, |
| "mean_token_accuracy": 0.9387575447559356, |
| "step": 1715 |
| }, |
| { |
| "epoch": 0.6619203386569175, |
| "grad_norm": 0.12352416672457492, |
| "learning_rate": 4.941551711527601e-05, |
| "loss": 0.1873, |
| "mean_token_accuracy": 0.9390809714794159, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.6638445256878969, |
| "grad_norm": 0.12952860042778544, |
| "learning_rate": 4.940692827728146e-05, |
| "loss": 0.1868, |
| "mean_token_accuracy": 0.9390901625156403, |
| "step": 1725 |
| }, |
| { |
| "epoch": 0.6657687127188763, |
| "grad_norm": 0.12949620744531398, |
| "learning_rate": 4.939827763591902e-05, |
| "loss": 0.1873, |
| "mean_token_accuracy": 0.9391276597976684, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.6676928997498557, |
| "grad_norm": 0.1326361157966624, |
| "learning_rate": 4.93895652155937e-05, |
| "loss": 0.1873, |
| "mean_token_accuracy": 0.9392795920372009, |
| "step": 1735 |
| }, |
| { |
| "epoch": 0.6696170867808351, |
| "grad_norm": 0.1259436738522025, |
| "learning_rate": 4.93807910408848e-05, |
| "loss": 0.1869, |
| "mean_token_accuracy": 0.9394049286842346, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.6715412738118145, |
| "grad_norm": 0.12833385292087515, |
| "learning_rate": 4.937195513654582e-05, |
| "loss": 0.1847, |
| "mean_token_accuracy": 0.9400054693222046, |
| "step": 1745 |
| }, |
| { |
| "epoch": 0.6734654608427939, |
| "grad_norm": 0.1305902666993928, |
| "learning_rate": 4.936305752750445e-05, |
| "loss": 0.1862, |
| "mean_token_accuracy": 0.9397811412811279, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.6753896478737733, |
| "grad_norm": 0.12503772393114956, |
| "learning_rate": 4.9354098238862434e-05, |
| "loss": 0.1866, |
| "mean_token_accuracy": 0.9391416549682617, |
| "step": 1755 |
| }, |
| { |
| "epoch": 0.6773138349047527, |
| "grad_norm": 0.12907005180260395, |
| "learning_rate": 4.934507729589552e-05, |
| "loss": 0.1837, |
| "mean_token_accuracy": 0.9401533901691437, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.6792380219357321, |
| "grad_norm": 0.12952056395716516, |
| "learning_rate": 4.93359947240534e-05, |
| "loss": 0.1857, |
| "mean_token_accuracy": 0.9394142091274261, |
| "step": 1765 |
| }, |
| { |
| "epoch": 0.6811622089667115, |
| "grad_norm": 0.12964032848148677, |
| "learning_rate": 4.9326850548959655e-05, |
| "loss": 0.1872, |
| "mean_token_accuracy": 0.9389999568462372, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.6830863959976909, |
| "grad_norm": 0.12307908590279568, |
| "learning_rate": 4.9317644796411626e-05, |
| "loss": 0.1869, |
| "mean_token_accuracy": 0.9391985893249511, |
| "step": 1775 |
| }, |
| { |
| "epoch": 0.6850105830286704, |
| "grad_norm": 0.12409231912708199, |
| "learning_rate": 4.9308377492380395e-05, |
| "loss": 0.1859, |
| "mean_token_accuracy": 0.939410537481308, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.6869347700596498, |
| "grad_norm": 0.1284220476802047, |
| "learning_rate": 4.929904866301069e-05, |
| "loss": 0.1853, |
| "mean_token_accuracy": 0.9397732496261597, |
| "step": 1785 |
| }, |
| { |
| "epoch": 0.6888589570906292, |
| "grad_norm": 0.1283258198594263, |
| "learning_rate": 4.9289658334620795e-05, |
| "loss": 0.187, |
| "mean_token_accuracy": 0.939195990562439, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.6907831441216086, |
| "grad_norm": 0.12635478080816434, |
| "learning_rate": 4.928020653370253e-05, |
| "loss": 0.1874, |
| "mean_token_accuracy": 0.9393328726291656, |
| "step": 1795 |
| }, |
| { |
| "epoch": 0.692707331152588, |
| "grad_norm": 0.12484761872443023, |
| "learning_rate": 4.92706932869211e-05, |
| "loss": 0.1856, |
| "mean_token_accuracy": 0.9395947933197022, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.6946315181835675, |
| "grad_norm": 0.13346046560029717, |
| "learning_rate": 4.9261118621115075e-05, |
| "loss": 0.1857, |
| "mean_token_accuracy": 0.9395689606666565, |
| "step": 1805 |
| }, |
| { |
| "epoch": 0.6965557052145469, |
| "grad_norm": 0.12461615072971878, |
| "learning_rate": 4.925148256329632e-05, |
| "loss": 0.1873, |
| "mean_token_accuracy": 0.938995772600174, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.6984798922455263, |
| "grad_norm": 0.1320523043841543, |
| "learning_rate": 4.9241785140649874e-05, |
| "loss": 0.1851, |
| "mean_token_accuracy": 0.9397156715393067, |
| "step": 1815 |
| }, |
| { |
| "epoch": 0.7004040792765057, |
| "grad_norm": 0.12176312501257469, |
| "learning_rate": 4.92320263805339e-05, |
| "loss": 0.1853, |
| "mean_token_accuracy": 0.9398873805999756, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.7023282663074851, |
| "grad_norm": 0.12276787751903055, |
| "learning_rate": 4.922220631047959e-05, |
| "loss": 0.1838, |
| "mean_token_accuracy": 0.9400618195533752, |
| "step": 1825 |
| }, |
| { |
| "epoch": 0.7042524533384645, |
| "grad_norm": 0.12945726528749396, |
| "learning_rate": 4.921232495819115e-05, |
| "loss": 0.1883, |
| "mean_token_accuracy": 0.9388338088989258, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.7061766403694439, |
| "grad_norm": 0.13077148163199273, |
| "learning_rate": 4.9202382351545635e-05, |
| "loss": 0.182, |
| "mean_token_accuracy": 0.9405941784381866, |
| "step": 1835 |
| }, |
| { |
| "epoch": 0.7081008274004233, |
| "grad_norm": 0.13399997817922676, |
| "learning_rate": 4.91923785185929e-05, |
| "loss": 0.1858, |
| "mean_token_accuracy": 0.939314740896225, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.7100250144314028, |
| "grad_norm": 0.13410360541184926, |
| "learning_rate": 4.918231348755558e-05, |
| "loss": 0.1868, |
| "mean_token_accuracy": 0.9394115328788757, |
| "step": 1845 |
| }, |
| { |
| "epoch": 0.7119492014623822, |
| "grad_norm": 0.12784687831405284, |
| "learning_rate": 4.917218728682891e-05, |
| "loss": 0.1832, |
| "mean_token_accuracy": 0.9403221547603607, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.7138733884933616, |
| "grad_norm": 0.12638458550832857, |
| "learning_rate": 4.916199994498073e-05, |
| "loss": 0.186, |
| "mean_token_accuracy": 0.9395315229892731, |
| "step": 1855 |
| }, |
| { |
| "epoch": 0.715797575524341, |
| "grad_norm": 0.13064255797982968, |
| "learning_rate": 4.915175149075134e-05, |
| "loss": 0.1876, |
| "mean_token_accuracy": 0.9386875092983246, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.7177217625553204, |
| "grad_norm": 0.13292186717008103, |
| "learning_rate": 4.914144195305346e-05, |
| "loss": 0.185, |
| "mean_token_accuracy": 0.9393829703330994, |
| "step": 1865 |
| }, |
| { |
| "epoch": 0.7196459495862998, |
| "grad_norm": 0.12405323655496656, |
| "learning_rate": 4.9131071360972166e-05, |
| "loss": 0.1851, |
| "mean_token_accuracy": 0.939610755443573, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.7215701366172792, |
| "grad_norm": 0.11979493582212139, |
| "learning_rate": 4.912063974376475e-05, |
| "loss": 0.1821, |
| "mean_token_accuracy": 0.9403452455997467, |
| "step": 1875 |
| }, |
| { |
| "epoch": 0.7234943236482586, |
| "grad_norm": 0.12773120330873297, |
| "learning_rate": 4.9110147130860645e-05, |
| "loss": 0.1872, |
| "mean_token_accuracy": 0.9392232894897461, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.725418510679238, |
| "grad_norm": 0.1274813879213902, |
| "learning_rate": 4.909959355186143e-05, |
| "loss": 0.1859, |
| "mean_token_accuracy": 0.9395413219928741, |
| "step": 1885 |
| }, |
| { |
| "epoch": 0.7273426977102174, |
| "grad_norm": 0.11820515206801051, |
| "learning_rate": 4.908897903654061e-05, |
| "loss": 0.1857, |
| "mean_token_accuracy": 0.9395869731903076, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.7292668847411968, |
| "grad_norm": 0.13102076278056418, |
| "learning_rate": 4.907830361484365e-05, |
| "loss": 0.183, |
| "mean_token_accuracy": 0.9400752663612366, |
| "step": 1895 |
| }, |
| { |
| "epoch": 0.7311910717721762, |
| "grad_norm": 0.1278752224588257, |
| "learning_rate": 4.9067567316887827e-05, |
| "loss": 0.1833, |
| "mean_token_accuracy": 0.9402811288833618, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.7331152588031556, |
| "grad_norm": 0.12646029457624683, |
| "learning_rate": 4.905677017296215e-05, |
| "loss": 0.1842, |
| "mean_token_accuracy": 0.9400483667850494, |
| "step": 1905 |
| }, |
| { |
| "epoch": 0.735039445834135, |
| "grad_norm": 0.12189597684353602, |
| "learning_rate": 4.90459122135273e-05, |
| "loss": 0.1853, |
| "mean_token_accuracy": 0.939655190706253, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.7369636328651145, |
| "grad_norm": 0.11912995970116359, |
| "learning_rate": 4.9034993469215536e-05, |
| "loss": 0.1855, |
| "mean_token_accuracy": 0.9393987059593201, |
| "step": 1915 |
| }, |
| { |
| "epoch": 0.7388878198960939, |
| "grad_norm": 0.12473059168347295, |
| "learning_rate": 4.902401397083057e-05, |
| "loss": 0.1827, |
| "mean_token_accuracy": 0.9405106902122498, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.7408120069270733, |
| "grad_norm": 0.12178119702571959, |
| "learning_rate": 4.901297374934756e-05, |
| "loss": 0.182, |
| "mean_token_accuracy": 0.940519267320633, |
| "step": 1925 |
| }, |
| { |
| "epoch": 0.7427361939580527, |
| "grad_norm": 0.12298998058807746, |
| "learning_rate": 4.900187283591292e-05, |
| "loss": 0.1855, |
| "mean_token_accuracy": 0.9396823763847351, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.7446603809890321, |
| "grad_norm": 0.1649138029052641, |
| "learning_rate": 4.899071126184433e-05, |
| "loss": 0.1869, |
| "mean_token_accuracy": 0.9390595555305481, |
| "step": 1935 |
| }, |
| { |
| "epoch": 0.7465845680200115, |
| "grad_norm": 0.12302266948272966, |
| "learning_rate": 4.897948905863059e-05, |
| "loss": 0.1865, |
| "mean_token_accuracy": 0.939179515838623, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.7485087550509909, |
| "grad_norm": 0.13051190459136533, |
| "learning_rate": 4.896820625793154e-05, |
| "loss": 0.1833, |
| "mean_token_accuracy": 0.9401630163192749, |
| "step": 1945 |
| }, |
| { |
| "epoch": 0.7504329420819704, |
| "grad_norm": 0.12589310940355214, |
| "learning_rate": 4.8956862891577985e-05, |
| "loss": 0.1838, |
| "mean_token_accuracy": 0.9398818373680115, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.7523571291129498, |
| "grad_norm": 0.13251423292021208, |
| "learning_rate": 4.89454589915716e-05, |
| "loss": 0.1846, |
| "mean_token_accuracy": 0.9395718336105346, |
| "step": 1955 |
| }, |
| { |
| "epoch": 0.7542813161439292, |
| "grad_norm": 0.12609386176353532, |
| "learning_rate": 4.893399459008481e-05, |
| "loss": 0.188, |
| "mean_token_accuracy": 0.9388734877109528, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.7562055031749086, |
| "grad_norm": 0.13969598978136358, |
| "learning_rate": 4.892246971946075e-05, |
| "loss": 0.1867, |
| "mean_token_accuracy": 0.9394755423069, |
| "step": 1965 |
| }, |
| { |
| "epoch": 0.758129690205888, |
| "grad_norm": 0.11916269700290026, |
| "learning_rate": 4.891088441221316e-05, |
| "loss": 0.1834, |
| "mean_token_accuracy": 0.9400242328643799, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.7600538772368675, |
| "grad_norm": 0.11842170357576598, |
| "learning_rate": 4.889923870102625e-05, |
| "loss": 0.1828, |
| "mean_token_accuracy": 0.9405191838741302, |
| "step": 1975 |
| }, |
| { |
| "epoch": 0.7619780642678469, |
| "grad_norm": 0.12122423863343659, |
| "learning_rate": 4.888753261875467e-05, |
| "loss": 0.1847, |
| "mean_token_accuracy": 0.9396079897880554, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.7639022512988263, |
| "grad_norm": 0.1319218553876708, |
| "learning_rate": 4.887576619842336e-05, |
| "loss": 0.1851, |
| "mean_token_accuracy": 0.9395697891712189, |
| "step": 1985 |
| }, |
| { |
| "epoch": 0.7658264383298057, |
| "grad_norm": 0.12946605027962763, |
| "learning_rate": 4.886393947322751e-05, |
| "loss": 0.1836, |
| "mean_token_accuracy": 0.9401982545852661, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.7677506253607851, |
| "grad_norm": 0.12078596356425471, |
| "learning_rate": 4.885205247653242e-05, |
| "loss": 0.1838, |
| "mean_token_accuracy": 0.9403159558773041, |
| "step": 1995 |
| }, |
| { |
| "epoch": 0.7696748123917645, |
| "grad_norm": 0.12064298731271933, |
| "learning_rate": 4.884010524187345e-05, |
| "loss": 0.1815, |
| "mean_token_accuracy": 0.9408186912536621, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.7715989994227439, |
| "grad_norm": 0.12556269641903606, |
| "learning_rate": 4.882809780295587e-05, |
| "loss": 0.1815, |
| "mean_token_accuracy": 0.9409188389778137, |
| "step": 2005 |
| }, |
| { |
| "epoch": 0.7735231864537233, |
| "grad_norm": 0.13254601751365308, |
| "learning_rate": 4.8816030193654836e-05, |
| "loss": 0.1858, |
| "mean_token_accuracy": 0.9394202411174775, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.7754473734847027, |
| "grad_norm": 0.11416378254386586, |
| "learning_rate": 4.880390244801523e-05, |
| "loss": 0.1835, |
| "mean_token_accuracy": 0.9398796260356903, |
| "step": 2015 |
| }, |
| { |
| "epoch": 0.7773715605156821, |
| "grad_norm": 0.12378280260912523, |
| "learning_rate": 4.879171460025157e-05, |
| "loss": 0.1859, |
| "mean_token_accuracy": 0.9396220564842224, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.7792957475466615, |
| "grad_norm": 0.11753365831717989, |
| "learning_rate": 4.8779466684748004e-05, |
| "loss": 0.1869, |
| "mean_token_accuracy": 0.9389756441116333, |
| "step": 2025 |
| }, |
| { |
| "epoch": 0.7812199345776409, |
| "grad_norm": 0.11907092470176393, |
| "learning_rate": 4.8767158736058046e-05, |
| "loss": 0.1842, |
| "mean_token_accuracy": 0.9402213454246521, |
| "step": 2030 |
| }, |
| { |
| "epoch": 0.7831441216086203, |
| "grad_norm": 0.1201609963783468, |
| "learning_rate": 4.8754790788904656e-05, |
| "loss": 0.1846, |
| "mean_token_accuracy": 0.9401138067245484, |
| "step": 2035 |
| }, |
| { |
| "epoch": 0.7850683086395998, |
| "grad_norm": 0.12241701349986983, |
| "learning_rate": 4.874236287818002e-05, |
| "loss": 0.185, |
| "mean_token_accuracy": 0.9396168053150177, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.7869924956705792, |
| "grad_norm": 0.11890858606428757, |
| "learning_rate": 4.872987503894549e-05, |
| "loss": 0.1824, |
| "mean_token_accuracy": 0.9403604686260223, |
| "step": 2045 |
| }, |
| { |
| "epoch": 0.7889166827015586, |
| "grad_norm": 0.12085338845765432, |
| "learning_rate": 4.8717327306431505e-05, |
| "loss": 0.1857, |
| "mean_token_accuracy": 0.9393939733505249, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.790840869732538, |
| "grad_norm": 0.12386143683480247, |
| "learning_rate": 4.870471971603746e-05, |
| "loss": 0.1841, |
| "mean_token_accuracy": 0.9399244070053101, |
| "step": 2055 |
| }, |
| { |
| "epoch": 0.7927650567635174, |
| "grad_norm": 0.11380417602513576, |
| "learning_rate": 4.8692052303331636e-05, |
| "loss": 0.1838, |
| "mean_token_accuracy": 0.9398261904716492, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.7946892437944968, |
| "grad_norm": 0.12935731088043464, |
| "learning_rate": 4.8679325104051074e-05, |
| "loss": 0.1837, |
| "mean_token_accuracy": 0.9402774631977081, |
| "step": 2065 |
| }, |
| { |
| "epoch": 0.7966134308254762, |
| "grad_norm": 0.1250137586530349, |
| "learning_rate": 4.866653815410146e-05, |
| "loss": 0.1838, |
| "mean_token_accuracy": 0.9400751292705536, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.7985376178564556, |
| "grad_norm": 0.11926647844819718, |
| "learning_rate": 4.86536914895571e-05, |
| "loss": 0.1852, |
| "mean_token_accuracy": 0.9398459553718567, |
| "step": 2075 |
| }, |
| { |
| "epoch": 0.800461804887435, |
| "grad_norm": 0.11842733754417097, |
| "learning_rate": 4.86407851466607e-05, |
| "loss": 0.1814, |
| "mean_token_accuracy": 0.9405053138732911, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.8023859919184144, |
| "grad_norm": 0.11684244820743363, |
| "learning_rate": 4.86278191618234e-05, |
| "loss": 0.1851, |
| "mean_token_accuracy": 0.9398811280727386, |
| "step": 2085 |
| }, |
| { |
| "epoch": 0.8043101789493938, |
| "grad_norm": 0.11974151405660274, |
| "learning_rate": 4.861479357162455e-05, |
| "loss": 0.1843, |
| "mean_token_accuracy": 0.9397909820079804, |
| "step": 2090 |
| }, |
| { |
| "epoch": 0.8062343659803733, |
| "grad_norm": 0.11691695951953154, |
| "learning_rate": 4.8601708412811666e-05, |
| "loss": 0.1841, |
| "mean_token_accuracy": 0.9401267170906067, |
| "step": 2095 |
| }, |
| { |
| "epoch": 0.8081585530113528, |
| "grad_norm": 0.1230162234683747, |
| "learning_rate": 4.8588563722300335e-05, |
| "loss": 0.1811, |
| "mean_token_accuracy": 0.9409735321998596, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.8100827400423322, |
| "grad_norm": 0.12199443457355415, |
| "learning_rate": 4.857535953717408e-05, |
| "loss": 0.1836, |
| "mean_token_accuracy": 0.940091347694397, |
| "step": 2105 |
| }, |
| { |
| "epoch": 0.8120069270733116, |
| "grad_norm": 0.12805212639654023, |
| "learning_rate": 4.856209589468427e-05, |
| "loss": 0.1818, |
| "mean_token_accuracy": 0.940629106760025, |
| "step": 2110 |
| }, |
| { |
| "epoch": 0.813931114104291, |
| "grad_norm": 0.12938937862607275, |
| "learning_rate": 4.8548772832250015e-05, |
| "loss": 0.182, |
| "mean_token_accuracy": 0.9406083405017853, |
| "step": 2115 |
| }, |
| { |
| "epoch": 0.8158553011352704, |
| "grad_norm": 0.12552376482220515, |
| "learning_rate": 4.8535390387458066e-05, |
| "loss": 0.1872, |
| "mean_token_accuracy": 0.9390264034271241, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.8177794881662498, |
| "grad_norm": 0.1170373209408773, |
| "learning_rate": 4.852194859806269e-05, |
| "loss": 0.1808, |
| "mean_token_accuracy": 0.9408780634403229, |
| "step": 2125 |
| }, |
| { |
| "epoch": 0.8197036751972292, |
| "grad_norm": 0.11615734062516449, |
| "learning_rate": 4.8508447501985585e-05, |
| "loss": 0.1873, |
| "mean_token_accuracy": 0.9388710737228394, |
| "step": 2130 |
| }, |
| { |
| "epoch": 0.8216278622282086, |
| "grad_norm": 0.11616601365701719, |
| "learning_rate": 4.849488713731576e-05, |
| "loss": 0.1824, |
| "mean_token_accuracy": 0.9404361367225647, |
| "step": 2135 |
| }, |
| { |
| "epoch": 0.823552049259188, |
| "grad_norm": 0.1200473574748539, |
| "learning_rate": 4.8481267542309425e-05, |
| "loss": 0.1814, |
| "mean_token_accuracy": 0.9407158613204956, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.8254762362901674, |
| "grad_norm": 0.11416470941265607, |
| "learning_rate": 4.8467588755389915e-05, |
| "loss": 0.1827, |
| "mean_token_accuracy": 0.940354073047638, |
| "step": 2145 |
| }, |
| { |
| "epoch": 0.8274004233211468, |
| "grad_norm": 0.1165530375543256, |
| "learning_rate": 4.845385081514752e-05, |
| "loss": 0.1805, |
| "mean_token_accuracy": 0.9412187516689301, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.8293246103521262, |
| "grad_norm": 0.11754202343664585, |
| "learning_rate": 4.8440053760339446e-05, |
| "loss": 0.1834, |
| "mean_token_accuracy": 0.9401270806789398, |
| "step": 2155 |
| }, |
| { |
| "epoch": 0.8312487973831056, |
| "grad_norm": 0.1176517652797786, |
| "learning_rate": 4.842619762988963e-05, |
| "loss": 0.1826, |
| "mean_token_accuracy": 0.9402092635631562, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.833172984414085, |
| "grad_norm": 0.12133212991641226, |
| "learning_rate": 4.841228246288873e-05, |
| "loss": 0.1815, |
| "mean_token_accuracy": 0.9408377289772034, |
| "step": 2165 |
| }, |
| { |
| "epoch": 0.8350971714450645, |
| "grad_norm": 0.12044128214619812, |
| "learning_rate": 4.83983082985939e-05, |
| "loss": 0.1836, |
| "mean_token_accuracy": 0.940137755870819, |
| "step": 2170 |
| }, |
| { |
| "epoch": 0.8370213584760439, |
| "grad_norm": 0.11885648796006264, |
| "learning_rate": 4.838427517642877e-05, |
| "loss": 0.1801, |
| "mean_token_accuracy": 0.9411330699920655, |
| "step": 2175 |
| }, |
| { |
| "epoch": 0.8389455455070233, |
| "grad_norm": 0.1106034905535314, |
| "learning_rate": 4.837018313598328e-05, |
| "loss": 0.1845, |
| "mean_token_accuracy": 0.9397697389125824, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.8408697325380027, |
| "grad_norm": 0.11987613929957971, |
| "learning_rate": 4.835603221701362e-05, |
| "loss": 0.1803, |
| "mean_token_accuracy": 0.9408568203449249, |
| "step": 2185 |
| }, |
| { |
| "epoch": 0.8427939195689821, |
| "grad_norm": 0.11966460776022199, |
| "learning_rate": 4.834182245944205e-05, |
| "loss": 0.1785, |
| "mean_token_accuracy": 0.9416488707065582, |
| "step": 2190 |
| }, |
| { |
| "epoch": 0.8447181065999615, |
| "grad_norm": 0.1182565560206525, |
| "learning_rate": 4.8327553903356836e-05, |
| "loss": 0.1824, |
| "mean_token_accuracy": 0.9405614793300628, |
| "step": 2195 |
| }, |
| { |
| "epoch": 0.8466422936309409, |
| "grad_norm": 0.11535883667619559, |
| "learning_rate": 4.831322658901215e-05, |
| "loss": 0.1817, |
| "mean_token_accuracy": 0.9406129837036132, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.8485664806619203, |
| "grad_norm": 0.11788009293521055, |
| "learning_rate": 4.82988405568279e-05, |
| "loss": 0.1836, |
| "mean_token_accuracy": 0.9399451076984405, |
| "step": 2205 |
| }, |
| { |
| "epoch": 0.8504906676928997, |
| "grad_norm": 0.11917536508755569, |
| "learning_rate": 4.828439584738966e-05, |
| "loss": 0.184, |
| "mean_token_accuracy": 0.9401583254337311, |
| "step": 2210 |
| }, |
| { |
| "epoch": 0.8524148547238791, |
| "grad_norm": 0.1146235510311429, |
| "learning_rate": 4.826989250144854e-05, |
| "loss": 0.1819, |
| "mean_token_accuracy": 0.9403225839138031, |
| "step": 2215 |
| }, |
| { |
| "epoch": 0.8543390417548585, |
| "grad_norm": 0.12469548548238404, |
| "learning_rate": 4.8255330559921074e-05, |
| "loss": 0.1827, |
| "mean_token_accuracy": 0.9404394030570984, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.8562632287858379, |
| "grad_norm": 0.11613789081146532, |
| "learning_rate": 4.824071006388912e-05, |
| "loss": 0.1829, |
| "mean_token_accuracy": 0.9401649594306946, |
| "step": 2225 |
| }, |
| { |
| "epoch": 0.8581874158168173, |
| "grad_norm": 0.12267984426574025, |
| "learning_rate": 4.82260310545997e-05, |
| "loss": 0.1825, |
| "mean_token_accuracy": 0.940345722436905, |
| "step": 2230 |
| }, |
| { |
| "epoch": 0.8601116028477968, |
| "grad_norm": 0.12100207158363169, |
| "learning_rate": 4.8211293573464946e-05, |
| "loss": 0.1842, |
| "mean_token_accuracy": 0.9399094760417939, |
| "step": 2235 |
| }, |
| { |
| "epoch": 0.8620357898787763, |
| "grad_norm": 0.10970579702066553, |
| "learning_rate": 4.8196497662061914e-05, |
| "loss": 0.1838, |
| "mean_token_accuracy": 0.9400064706802368, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.8639599769097557, |
| "grad_norm": 0.11453745047830523, |
| "learning_rate": 4.818164336213252e-05, |
| "loss": 0.1855, |
| "mean_token_accuracy": 0.9396546185016632, |
| "step": 2245 |
| }, |
| { |
| "epoch": 0.8658841639407351, |
| "grad_norm": 0.11513182404489364, |
| "learning_rate": 4.8166730715583425e-05, |
| "loss": 0.1818, |
| "mean_token_accuracy": 0.940559697151184, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.8678083509717145, |
| "grad_norm": 0.1476666402315563, |
| "learning_rate": 4.8151759764485856e-05, |
| "loss": 0.1824, |
| "mean_token_accuracy": 0.9404184460639954, |
| "step": 2255 |
| }, |
| { |
| "epoch": 0.8697325380026939, |
| "grad_norm": 0.11948879120992552, |
| "learning_rate": 4.813673055107555e-05, |
| "loss": 0.1819, |
| "mean_token_accuracy": 0.9407192707061768, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.8716567250336733, |
| "grad_norm": 0.1262081227430968, |
| "learning_rate": 4.812164311775261e-05, |
| "loss": 0.1821, |
| "mean_token_accuracy": 0.9405526936054229, |
| "step": 2265 |
| }, |
| { |
| "epoch": 0.8735809120646527, |
| "grad_norm": 0.11634634554542392, |
| "learning_rate": 4.810649750708139e-05, |
| "loss": 0.1831, |
| "mean_token_accuracy": 0.9402897179126739, |
| "step": 2270 |
| }, |
| { |
| "epoch": 0.8755050990956321, |
| "grad_norm": 0.11942845642892773, |
| "learning_rate": 4.8091293761790376e-05, |
| "loss": 0.1842, |
| "mean_token_accuracy": 0.9398006618022918, |
| "step": 2275 |
| }, |
| { |
| "epoch": 0.8774292861266115, |
| "grad_norm": 0.12249680244184764, |
| "learning_rate": 4.807603192477204e-05, |
| "loss": 0.1815, |
| "mean_token_accuracy": 0.9406807065010071, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.8793534731575909, |
| "grad_norm": 0.11271491646058761, |
| "learning_rate": 4.8060712039082776e-05, |
| "loss": 0.181, |
| "mean_token_accuracy": 0.9405488193035125, |
| "step": 2285 |
| }, |
| { |
| "epoch": 0.8812776601885703, |
| "grad_norm": 0.11533467431113814, |
| "learning_rate": 4.804533414794272e-05, |
| "loss": 0.1853, |
| "mean_token_accuracy": 0.9396409273147583, |
| "step": 2290 |
| }, |
| { |
| "epoch": 0.8832018472195498, |
| "grad_norm": 0.1285187047296205, |
| "learning_rate": 4.8029898294735645e-05, |
| "loss": 0.182, |
| "mean_token_accuracy": 0.9406699240207672, |
| "step": 2295 |
| }, |
| { |
| "epoch": 0.8851260342505292, |
| "grad_norm": 0.1411300877607505, |
| "learning_rate": 4.801440452300886e-05, |
| "loss": 0.1848, |
| "mean_token_accuracy": 0.9397783398628234, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.8870502212815086, |
| "grad_norm": 0.12830296379755649, |
| "learning_rate": 4.799885287647308e-05, |
| "loss": 0.1836, |
| "mean_token_accuracy": 0.9400253415107727, |
| "step": 2305 |
| }, |
| { |
| "epoch": 0.888974408312488, |
| "grad_norm": 0.12269949468159491, |
| "learning_rate": 4.798324339900228e-05, |
| "loss": 0.1832, |
| "mean_token_accuracy": 0.9401925921440124, |
| "step": 2310 |
| }, |
| { |
| "epoch": 0.8908985953434674, |
| "grad_norm": 0.11756618845811823, |
| "learning_rate": 4.7967576134633596e-05, |
| "loss": 0.1804, |
| "mean_token_accuracy": 0.9412013351917267, |
| "step": 2315 |
| }, |
| { |
| "epoch": 0.8928227823744468, |
| "grad_norm": 0.16572054412065182, |
| "learning_rate": 4.7951851127567184e-05, |
| "loss": 0.1807, |
| "mean_token_accuracy": 0.9412114441394805, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.8947469694054262, |
| "grad_norm": 0.12116911140290487, |
| "learning_rate": 4.793606842216609e-05, |
| "loss": 0.1795, |
| "mean_token_accuracy": 0.941188383102417, |
| "step": 2325 |
| }, |
| { |
| "epoch": 0.8966711564364056, |
| "grad_norm": 0.1260181768965728, |
| "learning_rate": 4.792022806295618e-05, |
| "loss": 0.1854, |
| "mean_token_accuracy": 0.9395481050014496, |
| "step": 2330 |
| }, |
| { |
| "epoch": 0.898595343467385, |
| "grad_norm": 0.12955287274675212, |
| "learning_rate": 4.790433009462592e-05, |
| "loss": 0.1813, |
| "mean_token_accuracy": 0.9407733678817749, |
| "step": 2335 |
| }, |
| { |
| "epoch": 0.9005195304983644, |
| "grad_norm": 0.21585557593260274, |
| "learning_rate": 4.788837456202634e-05, |
| "loss": 0.1912, |
| "mean_token_accuracy": 0.9396725356578827, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.9024437175293438, |
| "grad_norm": 0.7997740442778281, |
| "learning_rate": 4.787236151017085e-05, |
| "loss": 0.3293, |
| "mean_token_accuracy": 0.9140214204788208, |
| "step": 2345 |
| }, |
| { |
| "epoch": 0.9043679045603232, |
| "grad_norm": 0.2971714473559009, |
| "learning_rate": 4.785629098423513e-05, |
| "loss": 0.2142, |
| "mean_token_accuracy": 0.931775027513504, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.9062920915913026, |
| "grad_norm": 0.27663796986222866, |
| "learning_rate": 4.7840163029557034e-05, |
| "loss": 0.1949, |
| "mean_token_accuracy": 0.9374295115470886, |
| "step": 2355 |
| }, |
| { |
| "epoch": 0.908216278622282, |
| "grad_norm": 0.14943846174583816, |
| "learning_rate": 4.782397769163638e-05, |
| "loss": 0.1926, |
| "mean_token_accuracy": 0.9374601423740387, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.9101404656532615, |
| "grad_norm": 0.2191158094100316, |
| "learning_rate": 4.780773501613493e-05, |
| "loss": 0.1902, |
| "mean_token_accuracy": 0.9381971001625061, |
| "step": 2365 |
| }, |
| { |
| "epoch": 0.9120646526842409, |
| "grad_norm": 0.1368075879535361, |
| "learning_rate": 4.7791435048876166e-05, |
| "loss": 0.1883, |
| "mean_token_accuracy": 0.9388419568538666, |
| "step": 2370 |
| }, |
| { |
| "epoch": 0.9139888397152203, |
| "grad_norm": 0.14956559770526698, |
| "learning_rate": 4.777507783584522e-05, |
| "loss": 0.1877, |
| "mean_token_accuracy": 0.9390267848968505, |
| "step": 2375 |
| }, |
| { |
| "epoch": 0.9159130267461997, |
| "grad_norm": 0.1320819312114568, |
| "learning_rate": 4.775866342318871e-05, |
| "loss": 0.1857, |
| "mean_token_accuracy": 0.9396218538284302, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.9178372137771792, |
| "grad_norm": 0.12840423074414092, |
| "learning_rate": 4.774219185721466e-05, |
| "loss": 0.1868, |
| "mean_token_accuracy": 0.9395570278167724, |
| "step": 2385 |
| }, |
| { |
| "epoch": 0.9197614008081586, |
| "grad_norm": 0.12215985631548372, |
| "learning_rate": 4.7725663184392284e-05, |
| "loss": 0.1845, |
| "mean_token_accuracy": 0.9402369081974029, |
| "step": 2390 |
| }, |
| { |
| "epoch": 0.921685587839138, |
| "grad_norm": 0.23556308545025137, |
| "learning_rate": 4.770907745135194e-05, |
| "loss": 0.1814, |
| "mean_token_accuracy": 0.9406795680522919, |
| "step": 2395 |
| }, |
| { |
| "epoch": 0.9236097748701174, |
| "grad_norm": 0.13504443301156466, |
| "learning_rate": 4.769243470488493e-05, |
| "loss": 0.1848, |
| "mean_token_accuracy": 0.9396166443824768, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.9255339619010968, |
| "grad_norm": 0.25044759571321284, |
| "learning_rate": 4.767573499194344e-05, |
| "loss": 0.1869, |
| "mean_token_accuracy": 0.939172875881195, |
| "step": 2405 |
| }, |
| { |
| "epoch": 0.9274581489320762, |
| "grad_norm": 0.12385091018646273, |
| "learning_rate": 4.765897835964035e-05, |
| "loss": 0.1852, |
| "mean_token_accuracy": 0.9395855963230133, |
| "step": 2410 |
| }, |
| { |
| "epoch": 0.9293823359630556, |
| "grad_norm": 0.11439544922055143, |
| "learning_rate": 4.7642164855249124e-05, |
| "loss": 0.1852, |
| "mean_token_accuracy": 0.9394470155239105, |
| "step": 2415 |
| }, |
| { |
| "epoch": 0.931306522994035, |
| "grad_norm": 0.1235397306154651, |
| "learning_rate": 4.7625294526203657e-05, |
| "loss": 0.1847, |
| "mean_token_accuracy": 0.9398016691207886, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.9332307100250145, |
| "grad_norm": 1.6728295055753946, |
| "learning_rate": 4.760836742009818e-05, |
| "loss": 0.1851, |
| "mean_token_accuracy": 0.9397288858890533, |
| "step": 2425 |
| }, |
| { |
| "epoch": 0.9351548970559939, |
| "grad_norm": 0.12316305160239477, |
| "learning_rate": 4.759138358468709e-05, |
| "loss": 0.1835, |
| "mean_token_accuracy": 0.9399518132209778, |
| "step": 2430 |
| }, |
| { |
| "epoch": 0.9370790840869733, |
| "grad_norm": 0.13793514276935762, |
| "learning_rate": 4.757434306788482e-05, |
| "loss": 0.1861, |
| "mean_token_accuracy": 0.9393800020217895, |
| "step": 2435 |
| }, |
| { |
| "epoch": 0.9390032711179527, |
| "grad_norm": 0.12149826036690749, |
| "learning_rate": 4.755724591776572e-05, |
| "loss": 0.1846, |
| "mean_token_accuracy": 0.9398538827896118, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.9409274581489321, |
| "grad_norm": 0.11764487668178286, |
| "learning_rate": 4.754009218256392e-05, |
| "loss": 0.1828, |
| "mean_token_accuracy": 0.9401457965373993, |
| "step": 2445 |
| }, |
| { |
| "epoch": 0.9428516451799115, |
| "grad_norm": 0.11557655011443249, |
| "learning_rate": 4.752288191067317e-05, |
| "loss": 0.1833, |
| "mean_token_accuracy": 0.9400545418262481, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.9447758322108909, |
| "grad_norm": 0.11671398919818406, |
| "learning_rate": 4.7505615150646737e-05, |
| "loss": 0.1815, |
| "mean_token_accuracy": 0.9409000337123871, |
| "step": 2455 |
| }, |
| { |
| "epoch": 0.9467000192418703, |
| "grad_norm": 0.11369930035927728, |
| "learning_rate": 4.748829195119724e-05, |
| "loss": 0.1811, |
| "mean_token_accuracy": 0.9409979999065399, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.9486242062728497, |
| "grad_norm": 0.14297427532144838, |
| "learning_rate": 4.747091236119653e-05, |
| "loss": 0.1848, |
| "mean_token_accuracy": 0.9399132430553436, |
| "step": 2465 |
| }, |
| { |
| "epoch": 0.9505483933038291, |
| "grad_norm": 0.11870533079661626, |
| "learning_rate": 4.7453476429675545e-05, |
| "loss": 0.1824, |
| "mean_token_accuracy": 0.9405813694000245, |
| "step": 2470 |
| }, |
| { |
| "epoch": 0.9524725803348085, |
| "grad_norm": 0.11984706759405574, |
| "learning_rate": 4.7435984205824155e-05, |
| "loss": 0.185, |
| "mean_token_accuracy": 0.9398678004741668, |
| "step": 2475 |
| }, |
| { |
| "epoch": 0.9543967673657879, |
| "grad_norm": 0.12387680072322939, |
| "learning_rate": 4.741843573899107e-05, |
| "loss": 0.1835, |
| "mean_token_accuracy": 0.939969539642334, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.9563209543967673, |
| "grad_norm": 0.12033084141979045, |
| "learning_rate": 4.7400831078683655e-05, |
| "loss": 0.1861, |
| "mean_token_accuracy": 0.9396663069725036, |
| "step": 2485 |
| }, |
| { |
| "epoch": 0.9582451414277467, |
| "grad_norm": 0.12538542915432893, |
| "learning_rate": 4.738317027456782e-05, |
| "loss": 0.1854, |
| "mean_token_accuracy": 0.9394169688224793, |
| "step": 2490 |
| }, |
| { |
| "epoch": 0.9601693284587262, |
| "grad_norm": 0.11316308781679958, |
| "learning_rate": 4.7365453376467836e-05, |
| "loss": 0.1824, |
| "mean_token_accuracy": 0.940411388874054, |
| "step": 2495 |
| }, |
| { |
| "epoch": 0.9620935154897056, |
| "grad_norm": 0.12556823145271817, |
| "learning_rate": 4.734768043436625e-05, |
| "loss": 0.1819, |
| "mean_token_accuracy": 0.9407488465309143, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.964017702520685, |
| "grad_norm": 0.12484667142634479, |
| "learning_rate": 4.732985149840373e-05, |
| "loss": 0.183, |
| "mean_token_accuracy": 0.9400827348232269, |
| "step": 2505 |
| }, |
| { |
| "epoch": 0.9659418895516644, |
| "grad_norm": 0.11562585712771245, |
| "learning_rate": 4.7311966618878874e-05, |
| "loss": 0.1816, |
| "mean_token_accuracy": 0.940721720457077, |
| "step": 2510 |
| }, |
| { |
| "epoch": 0.9678660765826438, |
| "grad_norm": 0.11245306612355185, |
| "learning_rate": 4.729402584624815e-05, |
| "loss": 0.1803, |
| "mean_token_accuracy": 0.9411805689334869, |
| "step": 2515 |
| }, |
| { |
| "epoch": 0.9697902636136232, |
| "grad_norm": 0.11271273272429061, |
| "learning_rate": 4.727602923112568e-05, |
| "loss": 0.1829, |
| "mean_token_accuracy": 0.9400192379951477, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.9717144506446026, |
| "grad_norm": 0.11323226408965702, |
| "learning_rate": 4.725797682428314e-05, |
| "loss": 0.1831, |
| "mean_token_accuracy": 0.9404727399349213, |
| "step": 2525 |
| }, |
| { |
| "epoch": 0.9736386376755821, |
| "grad_norm": 0.11663415399743987, |
| "learning_rate": 4.72398686766496e-05, |
| "loss": 0.1809, |
| "mean_token_accuracy": 0.9406877100467682, |
| "step": 2530 |
| }, |
| { |
| "epoch": 0.9755628247065615, |
| "grad_norm": 0.11252169021412911, |
| "learning_rate": 4.72217048393114e-05, |
| "loss": 0.1802, |
| "mean_token_accuracy": 0.9408616960048676, |
| "step": 2535 |
| }, |
| { |
| "epoch": 0.9774870117375409, |
| "grad_norm": 0.11696017091375428, |
| "learning_rate": 4.720348536351197e-05, |
| "loss": 0.1836, |
| "mean_token_accuracy": 0.9399718284606934, |
| "step": 2540 |
| }, |
| { |
| "epoch": 0.9794111987685203, |
| "grad_norm": 0.10962144925513925, |
| "learning_rate": 4.718521030065171e-05, |
| "loss": 0.1789, |
| "mean_token_accuracy": 0.9413145005702972, |
| "step": 2545 |
| }, |
| { |
| "epoch": 0.9813353857994997, |
| "grad_norm": 0.10656545904669233, |
| "learning_rate": 4.7166879702287844e-05, |
| "loss": 0.1793, |
| "mean_token_accuracy": 0.9414481461048126, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.9832595728304792, |
| "grad_norm": 0.1365718930459726, |
| "learning_rate": 4.714849362013428e-05, |
| "loss": 0.1802, |
| "mean_token_accuracy": 0.9410699248313904, |
| "step": 2555 |
| }, |
| { |
| "epoch": 0.9851837598614586, |
| "grad_norm": 0.10982982510596055, |
| "learning_rate": 4.7130052106061454e-05, |
| "loss": 0.1808, |
| "mean_token_accuracy": 0.9409809350967407, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.987107946892438, |
| "grad_norm": 0.11421754495270237, |
| "learning_rate": 4.711155521209616e-05, |
| "loss": 0.1809, |
| "mean_token_accuracy": 0.9407776176929474, |
| "step": 2565 |
| }, |
| { |
| "epoch": 0.9890321339234174, |
| "grad_norm": 0.1274611305378617, |
| "learning_rate": 4.7093002990421466e-05, |
| "loss": 0.1777, |
| "mean_token_accuracy": 0.9420558035373687, |
| "step": 2570 |
| }, |
| { |
| "epoch": 0.9909563209543968, |
| "grad_norm": 0.11024541409362704, |
| "learning_rate": 4.70743954933765e-05, |
| "loss": 0.1787, |
| "mean_token_accuracy": 0.941502434015274, |
| "step": 2575 |
| }, |
| { |
| "epoch": 0.9928805079853762, |
| "grad_norm": 0.16449134699837414, |
| "learning_rate": 4.705573277345635e-05, |
| "loss": 0.181, |
| "mean_token_accuracy": 0.9409343540668488, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.9948046950163556, |
| "grad_norm": 0.11524065323328825, |
| "learning_rate": 4.70370148833119e-05, |
| "loss": 0.1807, |
| "mean_token_accuracy": 0.9407903850078583, |
| "step": 2585 |
| }, |
| { |
| "epoch": 0.996728882047335, |
| "grad_norm": 0.10713501522499569, |
| "learning_rate": 4.701824187574965e-05, |
| "loss": 0.1791, |
| "mean_token_accuracy": 0.9414562463760376, |
| "step": 2590 |
| }, |
| { |
| "epoch": 0.9986530690783144, |
| "grad_norm": 0.11504187540918072, |
| "learning_rate": 4.699941380373163e-05, |
| "loss": 0.1802, |
| "mean_token_accuracy": 0.940996652841568, |
| "step": 2595 |
| }, |
| { |
| "epoch": 1.0003848374061959, |
| "grad_norm": 0.17475084050053288, |
| "learning_rate": 4.69805307203752e-05, |
| "loss": 0.1764, |
| "mean_token_accuracy": 0.9420868423249986, |
| "step": 2600 |
| }, |
| { |
| "epoch": 1.0023090244371753, |
| "grad_norm": 0.11291991760873382, |
| "learning_rate": 4.696159267895291e-05, |
| "loss": 0.169, |
| "mean_token_accuracy": 0.943702882528305, |
| "step": 2605 |
| }, |
| { |
| "epoch": 1.0042332114681547, |
| "grad_norm": 0.11662597339258252, |
| "learning_rate": 4.694259973289239e-05, |
| "loss": 0.1667, |
| "mean_token_accuracy": 0.9445142686367035, |
| "step": 2610 |
| }, |
| { |
| "epoch": 1.006157398499134, |
| "grad_norm": 0.11242395335687048, |
| "learning_rate": 4.692355193577612e-05, |
| "loss": 0.1636, |
| "mean_token_accuracy": 0.9452146112918853, |
| "step": 2615 |
| }, |
| { |
| "epoch": 1.0080815855301135, |
| "grad_norm": 0.11836124984243517, |
| "learning_rate": 4.690444934134136e-05, |
| "loss": 0.1671, |
| "mean_token_accuracy": 0.9443571090698242, |
| "step": 2620 |
| }, |
| { |
| "epoch": 1.010005772561093, |
| "grad_norm": 0.11951371179098023, |
| "learning_rate": 4.6885292003479945e-05, |
| "loss": 0.1646, |
| "mean_token_accuracy": 0.9452442586421966, |
| "step": 2625 |
| }, |
| { |
| "epoch": 1.0119299595920723, |
| "grad_norm": 0.14562315224241446, |
| "learning_rate": 4.686607997623816e-05, |
| "loss": 0.1641, |
| "mean_token_accuracy": 0.9449779331684113, |
| "step": 2630 |
| }, |
| { |
| "epoch": 1.0138541466230517, |
| "grad_norm": 0.1209319195563948, |
| "learning_rate": 4.6846813313816555e-05, |
| "loss": 0.1634, |
| "mean_token_accuracy": 0.9453990161418915, |
| "step": 2635 |
| }, |
| { |
| "epoch": 1.0157783336540311, |
| "grad_norm": 0.12074689017528342, |
| "learning_rate": 4.682749207056986e-05, |
| "loss": 0.1643, |
| "mean_token_accuracy": 0.9453830003738404, |
| "step": 2640 |
| }, |
| { |
| "epoch": 1.0177025206850105, |
| "grad_norm": 0.11368217084841102, |
| "learning_rate": 4.680811630100675e-05, |
| "loss": 0.1649, |
| "mean_token_accuracy": 0.9448183536529541, |
| "step": 2645 |
| }, |
| { |
| "epoch": 1.01962670771599, |
| "grad_norm": 0.11343676694591881, |
| "learning_rate": 4.678868605978975e-05, |
| "loss": 0.1605, |
| "mean_token_accuracy": 0.9464600622653961, |
| "step": 2650 |
| }, |
| { |
| "epoch": 1.0215508947469694, |
| "grad_norm": 0.11741034993722078, |
| "learning_rate": 4.676920140173504e-05, |
| "loss": 0.1658, |
| "mean_token_accuracy": 0.9449496328830719, |
| "step": 2655 |
| }, |
| { |
| "epoch": 1.0234750817779488, |
| "grad_norm": 0.11961210804780995, |
| "learning_rate": 4.674966238181234e-05, |
| "loss": 0.1642, |
| "mean_token_accuracy": 0.9452983498573303, |
| "step": 2660 |
| }, |
| { |
| "epoch": 1.0253992688089282, |
| "grad_norm": 0.7073890270630259, |
| "learning_rate": 4.67300690551447e-05, |
| "loss": 0.1662, |
| "mean_token_accuracy": 0.9446015000343323, |
| "step": 2665 |
| }, |
| { |
| "epoch": 1.0273234558399076, |
| "grad_norm": 0.11899414259953112, |
| "learning_rate": 4.671042147700844e-05, |
| "loss": 0.1668, |
| "mean_token_accuracy": 0.9445567905902863, |
| "step": 2670 |
| }, |
| { |
| "epoch": 1.029247642870887, |
| "grad_norm": 0.11154402650406485, |
| "learning_rate": 4.669071970283287e-05, |
| "loss": 0.1648, |
| "mean_token_accuracy": 0.9451187491416931, |
| "step": 2675 |
| }, |
| { |
| "epoch": 1.0311718299018664, |
| "grad_norm": 0.11674403117276437, |
| "learning_rate": 4.667096378820023e-05, |
| "loss": 0.1658, |
| "mean_token_accuracy": 0.9446981310844421, |
| "step": 2680 |
| }, |
| { |
| "epoch": 1.0330960169328458, |
| "grad_norm": 0.11233220894814873, |
| "learning_rate": 4.665115378884549e-05, |
| "loss": 0.1641, |
| "mean_token_accuracy": 0.9454202771186828, |
| "step": 2685 |
| }, |
| { |
| "epoch": 1.0350202039638252, |
| "grad_norm": 0.11490664025644873, |
| "learning_rate": 4.663128976065622e-05, |
| "loss": 0.1646, |
| "mean_token_accuracy": 0.9450455904006958, |
| "step": 2690 |
| }, |
| { |
| "epoch": 1.0369443909948046, |
| "grad_norm": 0.11352501568114064, |
| "learning_rate": 4.661137175967239e-05, |
| "loss": 0.1673, |
| "mean_token_accuracy": 0.9439740359783173, |
| "step": 2695 |
| }, |
| { |
| "epoch": 1.038868578025784, |
| "grad_norm": 0.11140206907733896, |
| "learning_rate": 4.659139984208624e-05, |
| "loss": 0.1633, |
| "mean_token_accuracy": 0.945688658952713, |
| "step": 2700 |
| }, |
| { |
| "epoch": 1.0407927650567634, |
| "grad_norm": 0.11939481459237158, |
| "learning_rate": 4.657137406424214e-05, |
| "loss": 0.1658, |
| "mean_token_accuracy": 0.9446808695793152, |
| "step": 2705 |
| }, |
| { |
| "epoch": 1.0427169520877428, |
| "grad_norm": 0.1135153559788723, |
| "learning_rate": 4.655129448263639e-05, |
| "loss": 0.1666, |
| "mean_token_accuracy": 0.944467556476593, |
| "step": 2710 |
| }, |
| { |
| "epoch": 1.0446411391187223, |
| "grad_norm": 0.11014146292873342, |
| "learning_rate": 4.6531161153917094e-05, |
| "loss": 0.1654, |
| "mean_token_accuracy": 0.944754683971405, |
| "step": 2715 |
| }, |
| { |
| "epoch": 1.0465653261497017, |
| "grad_norm": 0.1159482186501732, |
| "learning_rate": 4.6510974134883964e-05, |
| "loss": 0.165, |
| "mean_token_accuracy": 0.9449528694152832, |
| "step": 2720 |
| }, |
| { |
| "epoch": 1.048489513180681, |
| "grad_norm": 0.11779456258476576, |
| "learning_rate": 4.649073348248821e-05, |
| "loss": 0.1642, |
| "mean_token_accuracy": 0.9451721370220184, |
| "step": 2725 |
| }, |
| { |
| "epoch": 1.0504137002116605, |
| "grad_norm": 0.11851875590633976, |
| "learning_rate": 4.6470439253832316e-05, |
| "loss": 0.1629, |
| "mean_token_accuracy": 0.9451947033405304, |
| "step": 2730 |
| }, |
| { |
| "epoch": 1.0523378872426399, |
| "grad_norm": 0.11416263688350742, |
| "learning_rate": 4.645009150616995e-05, |
| "loss": 0.164, |
| "mean_token_accuracy": 0.9453210353851318, |
| "step": 2735 |
| }, |
| { |
| "epoch": 1.0542620742736193, |
| "grad_norm": 0.11153611396218852, |
| "learning_rate": 4.6429690296905756e-05, |
| "loss": 0.1652, |
| "mean_token_accuracy": 0.9449005126953125, |
| "step": 2740 |
| }, |
| { |
| "epoch": 1.0561862613045987, |
| "grad_norm": 0.11896875234146256, |
| "learning_rate": 4.64092356835952e-05, |
| "loss": 0.1659, |
| "mean_token_accuracy": 0.9447975337505341, |
| "step": 2745 |
| }, |
| { |
| "epoch": 1.0581104483355783, |
| "grad_norm": 0.11935152821930041, |
| "learning_rate": 4.6388727723944395e-05, |
| "loss": 0.1659, |
| "mean_token_accuracy": 0.9447598576545715, |
| "step": 2750 |
| }, |
| { |
| "epoch": 1.0600346353665577, |
| "grad_norm": 0.1163982177154289, |
| "learning_rate": 4.636816647580998e-05, |
| "loss": 0.1601, |
| "mean_token_accuracy": 0.9461501777172089, |
| "step": 2755 |
| }, |
| { |
| "epoch": 1.0619588223975371, |
| "grad_norm": 0.21829881993472672, |
| "learning_rate": 4.6347551997198915e-05, |
| "loss": 0.1661, |
| "mean_token_accuracy": 0.9448219954967498, |
| "step": 2760 |
| }, |
| { |
| "epoch": 1.0638830094285165, |
| "grad_norm": 0.11191678688827768, |
| "learning_rate": 4.632688434626833e-05, |
| "loss": 0.1682, |
| "mean_token_accuracy": 0.9440916359424592, |
| "step": 2765 |
| }, |
| { |
| "epoch": 1.065807196459496, |
| "grad_norm": 0.10912851202680238, |
| "learning_rate": 4.630616358132538e-05, |
| "loss": 0.1671, |
| "mean_token_accuracy": 0.944388085603714, |
| "step": 2770 |
| }, |
| { |
| "epoch": 1.0677313834904754, |
| "grad_norm": 0.11325247994043755, |
| "learning_rate": 4.6285389760827035e-05, |
| "loss": 0.1603, |
| "mean_token_accuracy": 0.9464464247226715, |
| "step": 2775 |
| }, |
| { |
| "epoch": 1.0696555705214548, |
| "grad_norm": 0.10949410815060585, |
| "learning_rate": 4.626456294337999e-05, |
| "loss": 0.1606, |
| "mean_token_accuracy": 0.9462958097457885, |
| "step": 2780 |
| }, |
| { |
| "epoch": 1.0715797575524342, |
| "grad_norm": 0.11071407586025536, |
| "learning_rate": 4.6243683187740414e-05, |
| "loss": 0.1617, |
| "mean_token_accuracy": 0.9457931101322175, |
| "step": 2785 |
| }, |
| { |
| "epoch": 1.0735039445834136, |
| "grad_norm": 0.1158591760502722, |
| "learning_rate": 4.6222750552813834e-05, |
| "loss": 0.1663, |
| "mean_token_accuracy": 0.944324654340744, |
| "step": 2790 |
| }, |
| { |
| "epoch": 1.075428131614393, |
| "grad_norm": 0.10880174773529794, |
| "learning_rate": 4.620176509765496e-05, |
| "loss": 0.1618, |
| "mean_token_accuracy": 0.9458514332771302, |
| "step": 2795 |
| }, |
| { |
| "epoch": 1.0773523186453724, |
| "grad_norm": 0.11470522185752449, |
| "learning_rate": 4.618072688146752e-05, |
| "loss": 0.1667, |
| "mean_token_accuracy": 0.9443574488162995, |
| "step": 2800 |
| }, |
| { |
| "epoch": 1.0792765056763518, |
| "grad_norm": 0.1169552366814147, |
| "learning_rate": 4.615963596360411e-05, |
| "loss": 0.1656, |
| "mean_token_accuracy": 0.9445386469364166, |
| "step": 2805 |
| }, |
| { |
| "epoch": 1.0812006927073312, |
| "grad_norm": 0.1146473820257585, |
| "learning_rate": 4.613849240356595e-05, |
| "loss": 0.1632, |
| "mean_token_accuracy": 0.9454758405685425, |
| "step": 2810 |
| }, |
| { |
| "epoch": 1.0831248797383106, |
| "grad_norm": 0.1180821382249639, |
| "learning_rate": 4.611729626100284e-05, |
| "loss": 0.1624, |
| "mean_token_accuracy": 0.9456037998199462, |
| "step": 2815 |
| }, |
| { |
| "epoch": 1.08504906676929, |
| "grad_norm": 0.10806023645742613, |
| "learning_rate": 4.6096047595712874e-05, |
| "loss": 0.1664, |
| "mean_token_accuracy": 0.9445085465908051, |
| "step": 2820 |
| }, |
| { |
| "epoch": 1.0869732538002694, |
| "grad_norm": 0.10893231386169591, |
| "learning_rate": 4.607474646764236e-05, |
| "loss": 0.1661, |
| "mean_token_accuracy": 0.9448854207992554, |
| "step": 2825 |
| }, |
| { |
| "epoch": 1.0888974408312488, |
| "grad_norm": 0.12081177073304383, |
| "learning_rate": 4.605339293688558e-05, |
| "loss": 0.1648, |
| "mean_token_accuracy": 0.944972711801529, |
| "step": 2830 |
| }, |
| { |
| "epoch": 1.0908216278622282, |
| "grad_norm": 0.10938210180068166, |
| "learning_rate": 4.603198706368468e-05, |
| "loss": 0.1616, |
| "mean_token_accuracy": 0.946021032333374, |
| "step": 2835 |
| }, |
| { |
| "epoch": 1.0927458148932077, |
| "grad_norm": 0.10519629631556646, |
| "learning_rate": 4.6010528908429445e-05, |
| "loss": 0.1675, |
| "mean_token_accuracy": 0.9442543506622314, |
| "step": 2840 |
| }, |
| { |
| "epoch": 1.094670001924187, |
| "grad_norm": 0.10324109258642244, |
| "learning_rate": 4.598901853165719e-05, |
| "loss": 0.1658, |
| "mean_token_accuracy": 0.9447111248970032, |
| "step": 2845 |
| }, |
| { |
| "epoch": 1.0965941889551665, |
| "grad_norm": 0.10728210589019434, |
| "learning_rate": 4.596745599405254e-05, |
| "loss": 0.1662, |
| "mean_token_accuracy": 0.9445396065711975, |
| "step": 2850 |
| }, |
| { |
| "epoch": 1.0985183759861459, |
| "grad_norm": 0.11493544596778879, |
| "learning_rate": 4.5945841356447255e-05, |
| "loss": 0.1636, |
| "mean_token_accuracy": 0.9453412830829621, |
| "step": 2855 |
| }, |
| { |
| "epoch": 1.1004425630171253, |
| "grad_norm": 0.10825846359143984, |
| "learning_rate": 4.5924174679820124e-05, |
| "loss": 0.1619, |
| "mean_token_accuracy": 0.945780211687088, |
| "step": 2860 |
| }, |
| { |
| "epoch": 1.1023667500481047, |
| "grad_norm": 0.11159733375533923, |
| "learning_rate": 4.5902456025296716e-05, |
| "loss": 0.1658, |
| "mean_token_accuracy": 0.94489386677742, |
| "step": 2865 |
| }, |
| { |
| "epoch": 1.104290937079084, |
| "grad_norm": 0.12082066111535134, |
| "learning_rate": 4.588068545414924e-05, |
| "loss": 0.1695, |
| "mean_token_accuracy": 0.9434131681919098, |
| "step": 2870 |
| }, |
| { |
| "epoch": 1.1062151241100635, |
| "grad_norm": 0.10837959440293671, |
| "learning_rate": 4.585886302779637e-05, |
| "loss": 0.1623, |
| "mean_token_accuracy": 0.9456695020198822, |
| "step": 2875 |
| }, |
| { |
| "epoch": 1.108139311141043, |
| "grad_norm": 0.10937977096751192, |
| "learning_rate": 4.5836988807803086e-05, |
| "loss": 0.1665, |
| "mean_token_accuracy": 0.9442308902740478, |
| "step": 2880 |
| }, |
| { |
| "epoch": 1.1100634981720223, |
| "grad_norm": 0.1091698135418322, |
| "learning_rate": 4.581506285588049e-05, |
| "loss": 0.1636, |
| "mean_token_accuracy": 0.9454576671123505, |
| "step": 2885 |
| }, |
| { |
| "epoch": 1.1119876852030017, |
| "grad_norm": 0.10725811081270654, |
| "learning_rate": 4.579308523388559e-05, |
| "loss": 0.1639, |
| "mean_token_accuracy": 0.9453332245349884, |
| "step": 2890 |
| }, |
| { |
| "epoch": 1.1139118722339811, |
| "grad_norm": 0.11351347695275504, |
| "learning_rate": 4.577105600382122e-05, |
| "loss": 0.1636, |
| "mean_token_accuracy": 0.9453843176364899, |
| "step": 2895 |
| }, |
| { |
| "epoch": 1.1158360592649605, |
| "grad_norm": 0.1138607424511027, |
| "learning_rate": 4.574897522783578e-05, |
| "loss": 0.167, |
| "mean_token_accuracy": 0.9444127380847931, |
| "step": 2900 |
| }, |
| { |
| "epoch": 1.11776024629594, |
| "grad_norm": 0.1106852790197566, |
| "learning_rate": 4.572684296822308e-05, |
| "loss": 0.1691, |
| "mean_token_accuracy": 0.9439824044704437, |
| "step": 2905 |
| }, |
| { |
| "epoch": 1.1196844333269194, |
| "grad_norm": 0.10667739581669644, |
| "learning_rate": 4.5704659287422203e-05, |
| "loss": 0.1664, |
| "mean_token_accuracy": 0.944707703590393, |
| "step": 2910 |
| }, |
| { |
| "epoch": 1.1216086203578988, |
| "grad_norm": 0.11007892960055661, |
| "learning_rate": 4.568242424801727e-05, |
| "loss": 0.1625, |
| "mean_token_accuracy": 0.9453161656856537, |
| "step": 2915 |
| }, |
| { |
| "epoch": 1.1235328073888782, |
| "grad_norm": 0.1195397398056014, |
| "learning_rate": 4.566013791273733e-05, |
| "loss": 0.1648, |
| "mean_token_accuracy": 0.9449052453041077, |
| "step": 2920 |
| }, |
| { |
| "epoch": 1.1254569944198576, |
| "grad_norm": 0.11078554270766434, |
| "learning_rate": 4.56378003444561e-05, |
| "loss": 0.1662, |
| "mean_token_accuracy": 0.9444914698600769, |
| "step": 2925 |
| }, |
| { |
| "epoch": 1.127381181450837, |
| "grad_norm": 0.1152580424421967, |
| "learning_rate": 4.561541160619188e-05, |
| "loss": 0.1636, |
| "mean_token_accuracy": 0.9451714098453522, |
| "step": 2930 |
| }, |
| { |
| "epoch": 1.1293053684818164, |
| "grad_norm": 0.11601480600361394, |
| "learning_rate": 4.5592971761107305e-05, |
| "loss": 0.1688, |
| "mean_token_accuracy": 0.9436035096645355, |
| "step": 2935 |
| }, |
| { |
| "epoch": 1.1312295555127958, |
| "grad_norm": 0.11043130745236737, |
| "learning_rate": 4.557048087250919e-05, |
| "loss": 0.1705, |
| "mean_token_accuracy": 0.943270879983902, |
| "step": 2940 |
| }, |
| { |
| "epoch": 1.1331537425437752, |
| "grad_norm": 0.11410012718160904, |
| "learning_rate": 4.5547939003848374e-05, |
| "loss": 0.1622, |
| "mean_token_accuracy": 0.9457004249095917, |
| "step": 2945 |
| }, |
| { |
| "epoch": 1.1350779295747546, |
| "grad_norm": 0.14452950245071855, |
| "learning_rate": 4.5525346218719494e-05, |
| "loss": 0.1682, |
| "mean_token_accuracy": 0.944180291891098, |
| "step": 2950 |
| }, |
| { |
| "epoch": 1.137002116605734, |
| "grad_norm": 0.11593464274125287, |
| "learning_rate": 4.550270258086085e-05, |
| "loss": 0.1654, |
| "mean_token_accuracy": 0.9445393085479736, |
| "step": 2955 |
| }, |
| { |
| "epoch": 1.1389263036367134, |
| "grad_norm": 0.10914297851497087, |
| "learning_rate": 4.548000815415419e-05, |
| "loss": 0.1655, |
| "mean_token_accuracy": 0.9448778092861175, |
| "step": 2960 |
| }, |
| { |
| "epoch": 1.1408504906676928, |
| "grad_norm": 0.11799163038725619, |
| "learning_rate": 4.5457263002624564e-05, |
| "loss": 0.1652, |
| "mean_token_accuracy": 0.9448582708835602, |
| "step": 2965 |
| }, |
| { |
| "epoch": 1.1427746776986722, |
| "grad_norm": 0.10917437430822485, |
| "learning_rate": 4.543446719044011e-05, |
| "loss": 0.1676, |
| "mean_token_accuracy": 0.9441717028617859, |
| "step": 2970 |
| }, |
| { |
| "epoch": 1.1446988647296517, |
| "grad_norm": 0.10999293274127507, |
| "learning_rate": 4.541162078191191e-05, |
| "loss": 0.1662, |
| "mean_token_accuracy": 0.9447192370891571, |
| "step": 2975 |
| }, |
| { |
| "epoch": 1.146623051760631, |
| "grad_norm": 0.1213298439573089, |
| "learning_rate": 4.5388723841493756e-05, |
| "loss": 0.1642, |
| "mean_token_accuracy": 0.9453912734985351, |
| "step": 2980 |
| }, |
| { |
| "epoch": 1.1485472387916105, |
| "grad_norm": 0.1120433593367859, |
| "learning_rate": 4.536577643378203e-05, |
| "loss": 0.1658, |
| "mean_token_accuracy": 0.9446643471717835, |
| "step": 2985 |
| }, |
| { |
| "epoch": 1.1504714258225899, |
| "grad_norm": 0.1195181116215837, |
| "learning_rate": 4.534277862351548e-05, |
| "loss": 0.1689, |
| "mean_token_accuracy": 0.9437766313552857, |
| "step": 2990 |
| }, |
| { |
| "epoch": 1.1523956128535693, |
| "grad_norm": 0.10566122117237882, |
| "learning_rate": 4.531973047557504e-05, |
| "loss": 0.1651, |
| "mean_token_accuracy": 0.9449531376361847, |
| "step": 2995 |
| }, |
| { |
| "epoch": 1.1543197998845487, |
| "grad_norm": 0.11045165779517872, |
| "learning_rate": 4.529663205498367e-05, |
| "loss": 0.1646, |
| "mean_token_accuracy": 0.9451044261455536, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.156243986915528, |
| "grad_norm": 0.10897028680389657, |
| "learning_rate": 4.5273483426906136e-05, |
| "loss": 0.1636, |
| "mean_token_accuracy": 0.9457724034786225, |
| "step": 3005 |
| }, |
| { |
| "epoch": 1.1581681739465075, |
| "grad_norm": 0.11229669750924309, |
| "learning_rate": 4.525028465664888e-05, |
| "loss": 0.1667, |
| "mean_token_accuracy": 0.9444893836975098, |
| "step": 3010 |
| }, |
| { |
| "epoch": 1.1600923609774871, |
| "grad_norm": 0.11598130083724927, |
| "learning_rate": 4.522703580965979e-05, |
| "loss": 0.1649, |
| "mean_token_accuracy": 0.9450352728366852, |
| "step": 3015 |
| }, |
| { |
| "epoch": 1.1620165480084665, |
| "grad_norm": 0.11264247770065546, |
| "learning_rate": 4.5203736951528015e-05, |
| "loss": 0.1643, |
| "mean_token_accuracy": 0.945190292596817, |
| "step": 3020 |
| }, |
| { |
| "epoch": 1.163940735039446, |
| "grad_norm": 0.11105533940678633, |
| "learning_rate": 4.5180388147983804e-05, |
| "loss": 0.1667, |
| "mean_token_accuracy": 0.9444464981555939, |
| "step": 3025 |
| }, |
| { |
| "epoch": 1.1658649220704254, |
| "grad_norm": 0.10819037808707227, |
| "learning_rate": 4.515698946489833e-05, |
| "loss": 0.1617, |
| "mean_token_accuracy": 0.9460033416748047, |
| "step": 3030 |
| }, |
| { |
| "epoch": 1.1677891091014048, |
| "grad_norm": 0.106476221373719, |
| "learning_rate": 4.513354096828345e-05, |
| "loss": 0.1642, |
| "mean_token_accuracy": 0.9452900588512421, |
| "step": 3035 |
| }, |
| { |
| "epoch": 1.1697132961323842, |
| "grad_norm": 0.10565816473934767, |
| "learning_rate": 4.511004272429158e-05, |
| "loss": 0.1685, |
| "mean_token_accuracy": 0.9438742876052857, |
| "step": 3040 |
| }, |
| { |
| "epoch": 1.1716374831633636, |
| "grad_norm": 0.10537825559045179, |
| "learning_rate": 4.508649479921547e-05, |
| "loss": 0.1663, |
| "mean_token_accuracy": 0.9447853684425354, |
| "step": 3045 |
| }, |
| { |
| "epoch": 1.173561670194343, |
| "grad_norm": 0.1458321817709821, |
| "learning_rate": 4.506289725948805e-05, |
| "loss": 0.165, |
| "mean_token_accuracy": 0.945037055015564, |
| "step": 3050 |
| }, |
| { |
| "epoch": 1.1754858572253224, |
| "grad_norm": 0.11225857644127477, |
| "learning_rate": 4.503925017168219e-05, |
| "loss": 0.1649, |
| "mean_token_accuracy": 0.9449495792388916, |
| "step": 3055 |
| }, |
| { |
| "epoch": 1.1774100442563018, |
| "grad_norm": 0.15497296316426867, |
| "learning_rate": 4.501555360251056e-05, |
| "loss": 0.1648, |
| "mean_token_accuracy": 0.9451259493827819, |
| "step": 3060 |
| }, |
| { |
| "epoch": 1.1793342312872812, |
| "grad_norm": 0.10779990682132905, |
| "learning_rate": 4.499180761882543e-05, |
| "loss": 0.1644, |
| "mean_token_accuracy": 0.9450840950012207, |
| "step": 3065 |
| }, |
| { |
| "epoch": 1.1812584183182606, |
| "grad_norm": 0.11208595212873654, |
| "learning_rate": 4.4968012287618474e-05, |
| "loss": 0.1672, |
| "mean_token_accuracy": 0.9442859888076782, |
| "step": 3070 |
| }, |
| { |
| "epoch": 1.18318260534924, |
| "grad_norm": 0.10790877379409972, |
| "learning_rate": 4.494416767602058e-05, |
| "loss": 0.1667, |
| "mean_token_accuracy": 0.9444129526615143, |
| "step": 3075 |
| }, |
| { |
| "epoch": 1.1851067923802194, |
| "grad_norm": 0.13362309151311375, |
| "learning_rate": 4.492027385130166e-05, |
| "loss": 0.1654, |
| "mean_token_accuracy": 0.9450437903404236, |
| "step": 3080 |
| }, |
| { |
| "epoch": 1.1870309794111988, |
| "grad_norm": 0.10548405243633255, |
| "learning_rate": 4.489633088087049e-05, |
| "loss": 0.1642, |
| "mean_token_accuracy": 0.9451897084712982, |
| "step": 3085 |
| }, |
| { |
| "epoch": 1.1889551664421782, |
| "grad_norm": 0.1094571190971919, |
| "learning_rate": 4.487233883227446e-05, |
| "loss": 0.1654, |
| "mean_token_accuracy": 0.9450724065303803, |
| "step": 3090 |
| }, |
| { |
| "epoch": 1.1908793534731577, |
| "grad_norm": 0.11368729549868266, |
| "learning_rate": 4.4848297773199444e-05, |
| "loss": 0.1648, |
| "mean_token_accuracy": 0.9449832141399384, |
| "step": 3095 |
| }, |
| { |
| "epoch": 1.192803540504137, |
| "grad_norm": 0.10633275002345796, |
| "learning_rate": 4.482420777146958e-05, |
| "loss": 0.166, |
| "mean_token_accuracy": 0.9445993602275848, |
| "step": 3100 |
| }, |
| { |
| "epoch": 1.1947277275351165, |
| "grad_norm": 0.10890863944556516, |
| "learning_rate": 4.480006889504707e-05, |
| "loss": 0.1652, |
| "mean_token_accuracy": 0.9447604954242707, |
| "step": 3105 |
| }, |
| { |
| "epoch": 1.1966519145660959, |
| "grad_norm": 0.10605220946366134, |
| "learning_rate": 4.477588121203201e-05, |
| "loss": 0.166, |
| "mean_token_accuracy": 0.9447374463081359, |
| "step": 3110 |
| }, |
| { |
| "epoch": 1.1985761015970753, |
| "grad_norm": 0.1091304337833763, |
| "learning_rate": 4.475164479066218e-05, |
| "loss": 0.1655, |
| "mean_token_accuracy": 0.9448967158794404, |
| "step": 3115 |
| }, |
| { |
| "epoch": 1.2005002886280547, |
| "grad_norm": 0.10964839732163159, |
| "learning_rate": 4.472735969931287e-05, |
| "loss": 0.1615, |
| "mean_token_accuracy": 0.9460000276565552, |
| "step": 3120 |
| }, |
| { |
| "epoch": 1.202424475659034, |
| "grad_norm": 0.1078550524683339, |
| "learning_rate": 4.470302600649667e-05, |
| "loss": 0.1632, |
| "mean_token_accuracy": 0.9453963935375214, |
| "step": 3125 |
| }, |
| { |
| "epoch": 1.2043486626900135, |
| "grad_norm": 0.11060898474209213, |
| "learning_rate": 4.467864378086329e-05, |
| "loss": 0.1657, |
| "mean_token_accuracy": 0.944653457403183, |
| "step": 3130 |
| }, |
| { |
| "epoch": 1.206272849720993, |
| "grad_norm": 0.11874980904115744, |
| "learning_rate": 4.4654213091199345e-05, |
| "loss": 0.1663, |
| "mean_token_accuracy": 0.94460169672966, |
| "step": 3135 |
| }, |
| { |
| "epoch": 1.2081970367519723, |
| "grad_norm": 0.11565952367311678, |
| "learning_rate": 4.46297340064282e-05, |
| "loss": 0.1642, |
| "mean_token_accuracy": 0.9452078878879547, |
| "step": 3140 |
| }, |
| { |
| "epoch": 1.2101212237829517, |
| "grad_norm": 0.1113501248031809, |
| "learning_rate": 4.460520659560973e-05, |
| "loss": 0.1652, |
| "mean_token_accuracy": 0.9448917925357818, |
| "step": 3145 |
| }, |
| { |
| "epoch": 1.2120454108139311, |
| "grad_norm": 0.11584440119707458, |
| "learning_rate": 4.4580630927940145e-05, |
| "loss": 0.1618, |
| "mean_token_accuracy": 0.9459242165088654, |
| "step": 3150 |
| }, |
| { |
| "epoch": 1.2139695978449105, |
| "grad_norm": 0.10654586713838474, |
| "learning_rate": 4.455600707275181e-05, |
| "loss": 0.163, |
| "mean_token_accuracy": 0.9455973863601684, |
| "step": 3155 |
| }, |
| { |
| "epoch": 1.21589378487589, |
| "grad_norm": 0.11309187543030096, |
| "learning_rate": 4.453133509951304e-05, |
| "loss": 0.1656, |
| "mean_token_accuracy": 0.9448260486125946, |
| "step": 3160 |
| }, |
| { |
| "epoch": 1.2178179719068694, |
| "grad_norm": 0.10795367642944186, |
| "learning_rate": 4.450661507782788e-05, |
| "loss": 0.1667, |
| "mean_token_accuracy": 0.9442971289157868, |
| "step": 3165 |
| }, |
| { |
| "epoch": 1.2197421589378488, |
| "grad_norm": 0.11325608172844308, |
| "learning_rate": 4.448184707743594e-05, |
| "loss": 0.1641, |
| "mean_token_accuracy": 0.9452082395553589, |
| "step": 3170 |
| }, |
| { |
| "epoch": 1.2216663459688282, |
| "grad_norm": 0.10770156568934536, |
| "learning_rate": 4.4457031168212195e-05, |
| "loss": 0.1665, |
| "mean_token_accuracy": 0.9443559110164642, |
| "step": 3175 |
| }, |
| { |
| "epoch": 1.2235905329998076, |
| "grad_norm": 0.10345448099059522, |
| "learning_rate": 4.443216742016675e-05, |
| "loss": 0.1643, |
| "mean_token_accuracy": 0.9450966417789459, |
| "step": 3180 |
| }, |
| { |
| "epoch": 1.225514720030787, |
| "grad_norm": 0.10362673775355276, |
| "learning_rate": 4.440725590344469e-05, |
| "loss": 0.1655, |
| "mean_token_accuracy": 0.9448781192302704, |
| "step": 3185 |
| }, |
| { |
| "epoch": 1.2274389070617664, |
| "grad_norm": 0.10715949067340574, |
| "learning_rate": 4.4382296688325894e-05, |
| "loss": 0.1628, |
| "mean_token_accuracy": 0.9453873097896576, |
| "step": 3190 |
| }, |
| { |
| "epoch": 1.2293630940927458, |
| "grad_norm": 0.11161907208207356, |
| "learning_rate": 4.4357289845224755e-05, |
| "loss": 0.1625, |
| "mean_token_accuracy": 0.9458705008029937, |
| "step": 3195 |
| }, |
| { |
| "epoch": 1.2312872811237252, |
| "grad_norm": 0.10746793786362824, |
| "learning_rate": 4.433223544469006e-05, |
| "loss": 0.1643, |
| "mean_token_accuracy": 0.9446219742298126, |
| "step": 3200 |
| }, |
| { |
| "epoch": 1.2332114681547046, |
| "grad_norm": 0.2789190819561189, |
| "learning_rate": 4.4307133557404754e-05, |
| "loss": 0.1635, |
| "mean_token_accuracy": 0.9454685270786285, |
| "step": 3205 |
| }, |
| { |
| "epoch": 1.235135655185684, |
| "grad_norm": 0.11125241310201558, |
| "learning_rate": 4.428198425418576e-05, |
| "loss": 0.1638, |
| "mean_token_accuracy": 0.9451810359954834, |
| "step": 3210 |
| }, |
| { |
| "epoch": 1.2370598422166634, |
| "grad_norm": 0.10590925043833456, |
| "learning_rate": 4.425678760598377e-05, |
| "loss": 0.1658, |
| "mean_token_accuracy": 0.9448261022567749, |
| "step": 3215 |
| }, |
| { |
| "epoch": 1.2389840292476428, |
| "grad_norm": 0.10703980730393392, |
| "learning_rate": 4.423154368388304e-05, |
| "loss": 0.1654, |
| "mean_token_accuracy": 0.9447467207908631, |
| "step": 3220 |
| }, |
| { |
| "epoch": 1.2409082162786222, |
| "grad_norm": 0.11244571794665767, |
| "learning_rate": 4.42062525591012e-05, |
| "loss": 0.1651, |
| "mean_token_accuracy": 0.9450976312160492, |
| "step": 3225 |
| }, |
| { |
| "epoch": 1.2428324033096017, |
| "grad_norm": 0.11057302698238773, |
| "learning_rate": 4.418091430298903e-05, |
| "loss": 0.1667, |
| "mean_token_accuracy": 0.944273978471756, |
| "step": 3230 |
| }, |
| { |
| "epoch": 1.244756590340581, |
| "grad_norm": 0.109162505178306, |
| "learning_rate": 4.41555289870303e-05, |
| "loss": 0.1671, |
| "mean_token_accuracy": 0.9443233251571655, |
| "step": 3235 |
| }, |
| { |
| "epoch": 1.2466807773715605, |
| "grad_norm": 0.11127896226989155, |
| "learning_rate": 4.413009668284153e-05, |
| "loss": 0.1643, |
| "mean_token_accuracy": 0.945149028301239, |
| "step": 3240 |
| }, |
| { |
| "epoch": 1.2486049644025399, |
| "grad_norm": 0.10557729092136117, |
| "learning_rate": 4.410461746217179e-05, |
| "loss": 0.1622, |
| "mean_token_accuracy": 0.9456705510616302, |
| "step": 3245 |
| }, |
| { |
| "epoch": 1.2505291514335193, |
| "grad_norm": 0.11207584287790179, |
| "learning_rate": 4.407909139690255e-05, |
| "loss": 0.1656, |
| "mean_token_accuracy": 0.9445265352725982, |
| "step": 3250 |
| }, |
| { |
| "epoch": 1.2524533384644987, |
| "grad_norm": 0.10876050990252382, |
| "learning_rate": 4.405351855904739e-05, |
| "loss": 0.1667, |
| "mean_token_accuracy": 0.9442014992237091, |
| "step": 3255 |
| }, |
| { |
| "epoch": 1.254377525495478, |
| "grad_norm": 0.1058929093267954, |
| "learning_rate": 4.402789902075187e-05, |
| "loss": 0.1668, |
| "mean_token_accuracy": 0.9443489611148834, |
| "step": 3260 |
| }, |
| { |
| "epoch": 1.2563017125264575, |
| "grad_norm": 0.12172411780115582, |
| "learning_rate": 4.4002232854293305e-05, |
| "loss": 0.1661, |
| "mean_token_accuracy": 0.9444850265979767, |
| "step": 3265 |
| }, |
| { |
| "epoch": 1.258225899557437, |
| "grad_norm": 0.1117116436027642, |
| "learning_rate": 4.397652013208054e-05, |
| "loss": 0.1674, |
| "mean_token_accuracy": 0.9442491412162781, |
| "step": 3270 |
| }, |
| { |
| "epoch": 1.2601500865884163, |
| "grad_norm": 0.11305742675214413, |
| "learning_rate": 4.395076092665377e-05, |
| "loss": 0.1663, |
| "mean_token_accuracy": 0.9447296023368835, |
| "step": 3275 |
| }, |
| { |
| "epoch": 1.2620742736193957, |
| "grad_norm": 0.10524076252473272, |
| "learning_rate": 4.392495531068433e-05, |
| "loss": 0.1629, |
| "mean_token_accuracy": 0.9456575155258179, |
| "step": 3280 |
| }, |
| { |
| "epoch": 1.2639984606503751, |
| "grad_norm": 0.10565547204693512, |
| "learning_rate": 4.389910335697447e-05, |
| "loss": 0.1684, |
| "mean_token_accuracy": 0.9437812447547913, |
| "step": 3285 |
| }, |
| { |
| "epoch": 1.2659226476813545, |
| "grad_norm": 0.5886669314222224, |
| "learning_rate": 4.3873205138457204e-05, |
| "loss": 0.1662, |
| "mean_token_accuracy": 0.9450515389442444, |
| "step": 3290 |
| }, |
| { |
| "epoch": 1.267846834712334, |
| "grad_norm": 0.11366376959578357, |
| "learning_rate": 4.384726072819602e-05, |
| "loss": 0.1668, |
| "mean_token_accuracy": 0.9443962574005127, |
| "step": 3295 |
| }, |
| { |
| "epoch": 1.2697710217433134, |
| "grad_norm": 0.1127714483239427, |
| "learning_rate": 4.382127019938477e-05, |
| "loss": 0.1653, |
| "mean_token_accuracy": 0.9446232855319977, |
| "step": 3300 |
| }, |
| { |
| "epoch": 1.2716952087742928, |
| "grad_norm": 0.10977940299951658, |
| "learning_rate": 4.379523362534736e-05, |
| "loss": 0.1638, |
| "mean_token_accuracy": 0.9452247381210327, |
| "step": 3305 |
| }, |
| { |
| "epoch": 1.2736193958052722, |
| "grad_norm": 0.1043758131726259, |
| "learning_rate": 4.376915107953767e-05, |
| "loss": 0.1658, |
| "mean_token_accuracy": 0.9448330402374268, |
| "step": 3310 |
| }, |
| { |
| "epoch": 1.2755435828362516, |
| "grad_norm": 0.11335117990899052, |
| "learning_rate": 4.37430226355392e-05, |
| "loss": 0.1661, |
| "mean_token_accuracy": 0.9443521559238434, |
| "step": 3315 |
| }, |
| { |
| "epoch": 1.277467769867231, |
| "grad_norm": 0.11887980255705852, |
| "learning_rate": 4.371684836706497e-05, |
| "loss": 0.1693, |
| "mean_token_accuracy": 0.9434103488922119, |
| "step": 3320 |
| }, |
| { |
| "epoch": 1.2793919568982104, |
| "grad_norm": 0.10264830047952361, |
| "learning_rate": 4.3690628347957294e-05, |
| "loss": 0.1632, |
| "mean_token_accuracy": 0.9455609321594238, |
| "step": 3325 |
| }, |
| { |
| "epoch": 1.2813161439291898, |
| "grad_norm": 0.10656476625453781, |
| "learning_rate": 4.3664362652187544e-05, |
| "loss": 0.1642, |
| "mean_token_accuracy": 0.9452636063098907, |
| "step": 3330 |
| }, |
| { |
| "epoch": 1.2832403309601692, |
| "grad_norm": 0.11373328764330298, |
| "learning_rate": 4.363805135385593e-05, |
| "loss": 0.1685, |
| "mean_token_accuracy": 0.9438173949718476, |
| "step": 3335 |
| }, |
| { |
| "epoch": 1.2851645179911486, |
| "grad_norm": 0.1110834865814892, |
| "learning_rate": 4.361169452719136e-05, |
| "loss": 0.1661, |
| "mean_token_accuracy": 0.9445045590400696, |
| "step": 3340 |
| }, |
| { |
| "epoch": 1.287088705022128, |
| "grad_norm": 0.10499389790518351, |
| "learning_rate": 4.358529224655115e-05, |
| "loss": 0.1668, |
| "mean_token_accuracy": 0.9443840861320496, |
| "step": 3345 |
| }, |
| { |
| "epoch": 1.2890128920531074, |
| "grad_norm": 0.10943161523408348, |
| "learning_rate": 4.355884458642085e-05, |
| "loss": 0.1657, |
| "mean_token_accuracy": 0.9448311269283295, |
| "step": 3350 |
| }, |
| { |
| "epoch": 1.2909370790840868, |
| "grad_norm": 0.11241992154282385, |
| "learning_rate": 4.3532351621414076e-05, |
| "loss": 0.1656, |
| "mean_token_accuracy": 0.944890421628952, |
| "step": 3355 |
| }, |
| { |
| "epoch": 1.2928612661150665, |
| "grad_norm": 0.12204214677028828, |
| "learning_rate": 4.3505813426272206e-05, |
| "loss": 0.1644, |
| "mean_token_accuracy": 0.9451867461204528, |
| "step": 3360 |
| }, |
| { |
| "epoch": 1.2947854531460459, |
| "grad_norm": 0.11051458573708776, |
| "learning_rate": 4.347923007586424e-05, |
| "loss": 0.1604, |
| "mean_token_accuracy": 0.9464590072631835, |
| "step": 3365 |
| }, |
| { |
| "epoch": 1.2967096401770253, |
| "grad_norm": 0.10588729835633466, |
| "learning_rate": 4.3452601645186576e-05, |
| "loss": 0.1617, |
| "mean_token_accuracy": 0.9458398222923279, |
| "step": 3370 |
| }, |
| { |
| "epoch": 1.2986338272080047, |
| "grad_norm": 0.1129626094027341, |
| "learning_rate": 4.3425928209362784e-05, |
| "loss": 0.1676, |
| "mean_token_accuracy": 0.9443058788776397, |
| "step": 3375 |
| }, |
| { |
| "epoch": 1.300558014238984, |
| "grad_norm": 0.11026036715010902, |
| "learning_rate": 4.339920984364341e-05, |
| "loss": 0.1656, |
| "mean_token_accuracy": 0.9446721136569977, |
| "step": 3380 |
| }, |
| { |
| "epoch": 1.3024822012699635, |
| "grad_norm": 0.10431970395231525, |
| "learning_rate": 4.337244662340574e-05, |
| "loss": 0.1671, |
| "mean_token_accuracy": 0.9446083605289459, |
| "step": 3385 |
| }, |
| { |
| "epoch": 1.304406388300943, |
| "grad_norm": 0.10863163610480198, |
| "learning_rate": 4.334563862415361e-05, |
| "loss": 0.1644, |
| "mean_token_accuracy": 0.9451094448566437, |
| "step": 3390 |
| }, |
| { |
| "epoch": 1.3063305753319223, |
| "grad_norm": 0.10344587307267852, |
| "learning_rate": 4.33187859215172e-05, |
| "loss": 0.1616, |
| "mean_token_accuracy": 0.9459316253662109, |
| "step": 3395 |
| }, |
| { |
| "epoch": 1.3082547623629017, |
| "grad_norm": 0.10329769224219561, |
| "learning_rate": 4.3291888591252774e-05, |
| "loss": 0.1681, |
| "mean_token_accuracy": 0.9443010270595551, |
| "step": 3400 |
| }, |
| { |
| "epoch": 1.3101789493938811, |
| "grad_norm": 0.10573120313146973, |
| "learning_rate": 4.326494670924254e-05, |
| "loss": 0.1622, |
| "mean_token_accuracy": 0.9456990361213684, |
| "step": 3405 |
| }, |
| { |
| "epoch": 1.3121031364248605, |
| "grad_norm": 0.10999181599566926, |
| "learning_rate": 4.323796035149435e-05, |
| "loss": 0.1634, |
| "mean_token_accuracy": 0.9456778645515442, |
| "step": 3410 |
| }, |
| { |
| "epoch": 1.31402732345584, |
| "grad_norm": 0.11732350200322827, |
| "learning_rate": 4.321092959414157e-05, |
| "loss": 0.1664, |
| "mean_token_accuracy": 0.9445131242275238, |
| "step": 3415 |
| }, |
| { |
| "epoch": 1.3159515104868194, |
| "grad_norm": 0.11232867292678717, |
| "learning_rate": 4.318385451344278e-05, |
| "loss": 0.164, |
| "mean_token_accuracy": 0.945280921459198, |
| "step": 3420 |
| }, |
| { |
| "epoch": 1.3178756975177988, |
| "grad_norm": 0.10617374846493216, |
| "learning_rate": 4.315673518578167e-05, |
| "loss": 0.1615, |
| "mean_token_accuracy": 0.9459102272987365, |
| "step": 3425 |
| }, |
| { |
| "epoch": 1.3197998845487782, |
| "grad_norm": 0.10896252285505933, |
| "learning_rate": 4.312957168766669e-05, |
| "loss": 0.1621, |
| "mean_token_accuracy": 0.9459147036075592, |
| "step": 3430 |
| }, |
| { |
| "epoch": 1.3217240715797576, |
| "grad_norm": 0.11137213099006038, |
| "learning_rate": 4.310236409573095e-05, |
| "loss": 0.1653, |
| "mean_token_accuracy": 0.9451059341430664, |
| "step": 3435 |
| }, |
| { |
| "epoch": 1.323648258610737, |
| "grad_norm": 0.10832084844435444, |
| "learning_rate": 4.307511248673193e-05, |
| "loss": 0.1661, |
| "mean_token_accuracy": 0.9447243928909301, |
| "step": 3440 |
| }, |
| { |
| "epoch": 1.3255724456417164, |
| "grad_norm": 0.11469692641777185, |
| "learning_rate": 4.30478169375513e-05, |
| "loss": 0.1681, |
| "mean_token_accuracy": 0.9439410865306854, |
| "step": 3445 |
| }, |
| { |
| "epoch": 1.3274966326726958, |
| "grad_norm": 0.10691215801933739, |
| "learning_rate": 4.30204775251947e-05, |
| "loss": 0.165, |
| "mean_token_accuracy": 0.9449956476688385, |
| "step": 3450 |
| }, |
| { |
| "epoch": 1.3294208197036752, |
| "grad_norm": 0.11825050155634374, |
| "learning_rate": 4.2993094326791495e-05, |
| "loss": 0.1691, |
| "mean_token_accuracy": 0.9436154067516327, |
| "step": 3455 |
| }, |
| { |
| "epoch": 1.3313450067346546, |
| "grad_norm": 0.09976872054833154, |
| "learning_rate": 4.296566741959461e-05, |
| "loss": 0.1637, |
| "mean_token_accuracy": 0.9454048812389374, |
| "step": 3460 |
| }, |
| { |
| "epoch": 1.333269193765634, |
| "grad_norm": 0.10977843829733427, |
| "learning_rate": 4.293819688098024e-05, |
| "loss": 0.163, |
| "mean_token_accuracy": 0.94544877409935, |
| "step": 3465 |
| }, |
| { |
| "epoch": 1.3351933807966134, |
| "grad_norm": 0.10388477951978371, |
| "learning_rate": 4.291068278844771e-05, |
| "loss": 0.1654, |
| "mean_token_accuracy": 0.9446780025959015, |
| "step": 3470 |
| }, |
| { |
| "epoch": 1.3371175678275928, |
| "grad_norm": 0.11193340370564675, |
| "learning_rate": 4.288312521961919e-05, |
| "loss": 0.1656, |
| "mean_token_accuracy": 0.9452359676361084, |
| "step": 3475 |
| }, |
| { |
| "epoch": 1.3390417548585722, |
| "grad_norm": 0.10573875704379705, |
| "learning_rate": 4.285552425223955e-05, |
| "loss": 0.1639, |
| "mean_token_accuracy": 0.9454302430152893, |
| "step": 3480 |
| }, |
| { |
| "epoch": 1.3409659418895516, |
| "grad_norm": 0.10442435507308796, |
| "learning_rate": 4.282787996417601e-05, |
| "loss": 0.1703, |
| "mean_token_accuracy": 0.9433972775936127, |
| "step": 3485 |
| }, |
| { |
| "epoch": 1.342890128920531, |
| "grad_norm": 0.10178493746278647, |
| "learning_rate": 4.2800192433418094e-05, |
| "loss": 0.1632, |
| "mean_token_accuracy": 0.9455658197402954, |
| "step": 3490 |
| }, |
| { |
| "epoch": 1.3448143159515105, |
| "grad_norm": 0.10658339710742912, |
| "learning_rate": 4.2772461738077274e-05, |
| "loss": 0.166, |
| "mean_token_accuracy": 0.9446894705295563, |
| "step": 3495 |
| }, |
| { |
| "epoch": 1.3467385029824899, |
| "grad_norm": 0.11283900491670172, |
| "learning_rate": 4.274468795638681e-05, |
| "loss": 0.1633, |
| "mean_token_accuracy": 0.9456519961357117, |
| "step": 3500 |
| }, |
| { |
| "epoch": 1.3486626900134693, |
| "grad_norm": 0.10214896741095081, |
| "learning_rate": 4.271687116670151e-05, |
| "loss": 0.1651, |
| "mean_token_accuracy": 0.9451316177845002, |
| "step": 3505 |
| }, |
| { |
| "epoch": 1.3505868770444487, |
| "grad_norm": 0.10559833937537223, |
| "learning_rate": 4.268901144749753e-05, |
| "loss": 0.165, |
| "mean_token_accuracy": 0.9450106203556061, |
| "step": 3510 |
| }, |
| { |
| "epoch": 1.352511064075428, |
| "grad_norm": 0.10894574541369677, |
| "learning_rate": 4.26611088773721e-05, |
| "loss": 0.1664, |
| "mean_token_accuracy": 0.9443780064582825, |
| "step": 3515 |
| }, |
| { |
| "epoch": 1.3544352511064075, |
| "grad_norm": 0.1152130055560103, |
| "learning_rate": 4.263316353504341e-05, |
| "loss": 0.1647, |
| "mean_token_accuracy": 0.9447641968727112, |
| "step": 3520 |
| }, |
| { |
| "epoch": 1.356359438137387, |
| "grad_norm": 0.11157843122921446, |
| "learning_rate": 4.260517549935024e-05, |
| "loss": 0.1656, |
| "mean_token_accuracy": 0.9448266983032226, |
| "step": 3525 |
| }, |
| { |
| "epoch": 1.3582836251683663, |
| "grad_norm": 0.10694050714172938, |
| "learning_rate": 4.257714484925185e-05, |
| "loss": 0.1672, |
| "mean_token_accuracy": 0.9442776262760162, |
| "step": 3530 |
| }, |
| { |
| "epoch": 1.3602078121993457, |
| "grad_norm": 0.10508952012523275, |
| "learning_rate": 4.254907166382775e-05, |
| "loss": 0.1658, |
| "mean_token_accuracy": 0.9448453724384308, |
| "step": 3535 |
| }, |
| { |
| "epoch": 1.3621319992303251, |
| "grad_norm": 0.10804026722792796, |
| "learning_rate": 4.2520956022277415e-05, |
| "loss": 0.166, |
| "mean_token_accuracy": 0.9450315535068512, |
| "step": 3540 |
| }, |
| { |
| "epoch": 1.3640561862613045, |
| "grad_norm": 0.10584159448655887, |
| "learning_rate": 4.249279800392009e-05, |
| "loss": 0.1617, |
| "mean_token_accuracy": 0.946070384979248, |
| "step": 3545 |
| }, |
| { |
| "epoch": 1.365980373292284, |
| "grad_norm": 0.10247054811830368, |
| "learning_rate": 4.24645976881946e-05, |
| "loss": 0.1632, |
| "mean_token_accuracy": 0.9454725086688995, |
| "step": 3550 |
| }, |
| { |
| "epoch": 1.3679045603232634, |
| "grad_norm": 0.10303603825443779, |
| "learning_rate": 4.2436355154659085e-05, |
| "loss": 0.1628, |
| "mean_token_accuracy": 0.9457809746265411, |
| "step": 3555 |
| }, |
| { |
| "epoch": 1.3698287473542428, |
| "grad_norm": 0.10231812479939655, |
| "learning_rate": 4.240807048299079e-05, |
| "loss": 0.166, |
| "mean_token_accuracy": 0.9446156203746796, |
| "step": 3560 |
| }, |
| { |
| "epoch": 1.3717529343852222, |
| "grad_norm": 0.09923793110107657, |
| "learning_rate": 4.237974375298584e-05, |
| "loss": 0.1643, |
| "mean_token_accuracy": 0.94522123336792, |
| "step": 3565 |
| }, |
| { |
| "epoch": 1.3736771214162016, |
| "grad_norm": 0.10193644008290698, |
| "learning_rate": 4.2351375044558996e-05, |
| "loss": 0.1645, |
| "mean_token_accuracy": 0.9449735343456268, |
| "step": 3570 |
| }, |
| { |
| "epoch": 1.3756013084471812, |
| "grad_norm": 0.10993227566346095, |
| "learning_rate": 4.232296443774349e-05, |
| "loss": 0.1659, |
| "mean_token_accuracy": 0.9448629319667816, |
| "step": 3575 |
| }, |
| { |
| "epoch": 1.3775254954781606, |
| "grad_norm": 0.10305085529258905, |
| "learning_rate": 4.229451201269072e-05, |
| "loss": 0.1641, |
| "mean_token_accuracy": 0.9453402578830719, |
| "step": 3580 |
| }, |
| { |
| "epoch": 1.37944968250914, |
| "grad_norm": 0.10694140393689279, |
| "learning_rate": 4.226601784967008e-05, |
| "loss": 0.1656, |
| "mean_token_accuracy": 0.944640851020813, |
| "step": 3585 |
| }, |
| { |
| "epoch": 1.3813738695401194, |
| "grad_norm": 0.10315714341865034, |
| "learning_rate": 4.223748202906869e-05, |
| "loss": 0.1644, |
| "mean_token_accuracy": 0.9451101958751679, |
| "step": 3590 |
| }, |
| { |
| "epoch": 1.3832980565710988, |
| "grad_norm": 0.10596272263919713, |
| "learning_rate": 4.220890463139122e-05, |
| "loss": 0.1647, |
| "mean_token_accuracy": 0.9449066698551178, |
| "step": 3595 |
| }, |
| { |
| "epoch": 1.3852222436020782, |
| "grad_norm": 0.11068312840643807, |
| "learning_rate": 4.218028573725963e-05, |
| "loss": 0.1643, |
| "mean_token_accuracy": 0.9452288806438446, |
| "step": 3600 |
| }, |
| { |
| "epoch": 1.3871464306330576, |
| "grad_norm": 0.11399898827725882, |
| "learning_rate": 4.215162542741295e-05, |
| "loss": 0.1629, |
| "mean_token_accuracy": 0.9458402037620545, |
| "step": 3605 |
| }, |
| { |
| "epoch": 1.389070617664037, |
| "grad_norm": 0.10629897967961681, |
| "learning_rate": 4.2122923782707035e-05, |
| "loss": 0.1647, |
| "mean_token_accuracy": 0.9450836300849914, |
| "step": 3610 |
| }, |
| { |
| "epoch": 1.3909948046950165, |
| "grad_norm": 0.11877802521782936, |
| "learning_rate": 4.2094180884114375e-05, |
| "loss": 0.1654, |
| "mean_token_accuracy": 0.9447278261184693, |
| "step": 3615 |
| }, |
| { |
| "epoch": 1.3929189917259959, |
| "grad_norm": 0.10567173738423105, |
| "learning_rate": 4.206539681272382e-05, |
| "loss": 0.165, |
| "mean_token_accuracy": 0.9448735773563385, |
| "step": 3620 |
| }, |
| { |
| "epoch": 1.3948431787569753, |
| "grad_norm": 0.10352666517341502, |
| "learning_rate": 4.2036571649740404e-05, |
| "loss": 0.1648, |
| "mean_token_accuracy": 0.9451052606105804, |
| "step": 3625 |
| }, |
| { |
| "epoch": 1.3967673657879547, |
| "grad_norm": 0.11357510290183502, |
| "learning_rate": 4.2007705476485064e-05, |
| "loss": 0.1654, |
| "mean_token_accuracy": 0.9447233974933624, |
| "step": 3630 |
| }, |
| { |
| "epoch": 1.398691552818934, |
| "grad_norm": 0.11195536430362973, |
| "learning_rate": 4.197879837439446e-05, |
| "loss": 0.162, |
| "mean_token_accuracy": 0.9459154546260834, |
| "step": 3635 |
| }, |
| { |
| "epoch": 1.4006157398499135, |
| "grad_norm": 0.10415607753076003, |
| "learning_rate": 4.194985042502069e-05, |
| "loss": 0.1625, |
| "mean_token_accuracy": 0.9454761385917664, |
| "step": 3640 |
| }, |
| { |
| "epoch": 1.402539926880893, |
| "grad_norm": 0.10766512506284352, |
| "learning_rate": 4.1920861710031094e-05, |
| "loss": 0.1638, |
| "mean_token_accuracy": 0.9452585756778717, |
| "step": 3645 |
| }, |
| { |
| "epoch": 1.4044641139118723, |
| "grad_norm": 0.10856833595067805, |
| "learning_rate": 4.1891832311208055e-05, |
| "loss": 0.1681, |
| "mean_token_accuracy": 0.9444329977035523, |
| "step": 3650 |
| }, |
| { |
| "epoch": 1.4063883009428517, |
| "grad_norm": 0.10904305554437421, |
| "learning_rate": 4.1862762310448686e-05, |
| "loss": 0.1657, |
| "mean_token_accuracy": 0.9446165263652802, |
| "step": 3655 |
| }, |
| { |
| "epoch": 1.4083124879738311, |
| "grad_norm": 0.10282925284209865, |
| "learning_rate": 4.1833651789764675e-05, |
| "loss": 0.1631, |
| "mean_token_accuracy": 0.9455777108669281, |
| "step": 3660 |
| }, |
| { |
| "epoch": 1.4102366750048105, |
| "grad_norm": 0.10235844426693394, |
| "learning_rate": 4.1804500831282006e-05, |
| "loss": 0.1605, |
| "mean_token_accuracy": 0.9464917063713074, |
| "step": 3665 |
| }, |
| { |
| "epoch": 1.41216086203579, |
| "grad_norm": 0.10763382432644855, |
| "learning_rate": 4.177530951724076e-05, |
| "loss": 0.1637, |
| "mean_token_accuracy": 0.9452625811100006, |
| "step": 3670 |
| }, |
| { |
| "epoch": 1.4140850490667694, |
| "grad_norm": 0.10239862565132175, |
| "learning_rate": 4.1746077929994865e-05, |
| "loss": 0.1644, |
| "mean_token_accuracy": 0.9451430201530456, |
| "step": 3675 |
| }, |
| { |
| "epoch": 1.4160092360977488, |
| "grad_norm": 0.1019056980175764, |
| "learning_rate": 4.171680615201185e-05, |
| "loss": 0.1645, |
| "mean_token_accuracy": 0.9452406644821167, |
| "step": 3680 |
| }, |
| { |
| "epoch": 1.4179334231287282, |
| "grad_norm": 0.10649424349147088, |
| "learning_rate": 4.168749426587265e-05, |
| "loss": 0.1636, |
| "mean_token_accuracy": 0.9451565325260163, |
| "step": 3685 |
| }, |
| { |
| "epoch": 1.4198576101597076, |
| "grad_norm": 0.10680893725233677, |
| "learning_rate": 4.165814235427135e-05, |
| "loss": 0.1604, |
| "mean_token_accuracy": 0.9464034974575043, |
| "step": 3690 |
| }, |
| { |
| "epoch": 1.421781797190687, |
| "grad_norm": 0.10238376427445661, |
| "learning_rate": 4.1628750500014947e-05, |
| "loss": 0.1636, |
| "mean_token_accuracy": 0.94556645154953, |
| "step": 3695 |
| }, |
| { |
| "epoch": 1.4237059842216664, |
| "grad_norm": 0.10632577681067389, |
| "learning_rate": 4.159931878602312e-05, |
| "loss": 0.1602, |
| "mean_token_accuracy": 0.9463468253612518, |
| "step": 3700 |
| }, |
| { |
| "epoch": 1.4256301712526458, |
| "grad_norm": 0.1066470110654814, |
| "learning_rate": 4.1569847295328e-05, |
| "loss": 0.1623, |
| "mean_token_accuracy": 0.9456010282039642, |
| "step": 3705 |
| }, |
| { |
| "epoch": 1.4275543582836252, |
| "grad_norm": 0.10638056425969808, |
| "learning_rate": 4.1540336111073956e-05, |
| "loss": 0.1628, |
| "mean_token_accuracy": 0.9455960631370545, |
| "step": 3710 |
| }, |
| { |
| "epoch": 1.4294785453146046, |
| "grad_norm": 0.10984777816492315, |
| "learning_rate": 4.151078531651733e-05, |
| "loss": 0.1647, |
| "mean_token_accuracy": 0.9452995538711548, |
| "step": 3715 |
| }, |
| { |
| "epoch": 1.431402732345584, |
| "grad_norm": 0.10653537135718445, |
| "learning_rate": 4.148119499502617e-05, |
| "loss": 0.1657, |
| "mean_token_accuracy": 0.9448102474212646, |
| "step": 3720 |
| }, |
| { |
| "epoch": 1.4333269193765634, |
| "grad_norm": 0.10140762916952242, |
| "learning_rate": 4.1451565230080114e-05, |
| "loss": 0.1641, |
| "mean_token_accuracy": 0.9449801802635193, |
| "step": 3725 |
| }, |
| { |
| "epoch": 1.4352511064075428, |
| "grad_norm": 0.10708663553306287, |
| "learning_rate": 4.142189610527e-05, |
| "loss": 0.1632, |
| "mean_token_accuracy": 0.9456826210021972, |
| "step": 3730 |
| }, |
| { |
| "epoch": 1.4371752934385222, |
| "grad_norm": 0.10793078077394648, |
| "learning_rate": 4.139218770429776e-05, |
| "loss": 0.1667, |
| "mean_token_accuracy": 0.9444868803024292, |
| "step": 3735 |
| }, |
| { |
| "epoch": 1.4390994804695016, |
| "grad_norm": 0.10309565815294483, |
| "learning_rate": 4.136244011097612e-05, |
| "loss": 0.1626, |
| "mean_token_accuracy": 0.9456857860088348, |
| "step": 3740 |
| }, |
| { |
| "epoch": 1.441023667500481, |
| "grad_norm": 0.10410978514797928, |
| "learning_rate": 4.133265340922836e-05, |
| "loss": 0.1612, |
| "mean_token_accuracy": 0.9461513102054596, |
| "step": 3745 |
| }, |
| { |
| "epoch": 1.4429478545314605, |
| "grad_norm": 0.12668018886073118, |
| "learning_rate": 4.130282768308809e-05, |
| "loss": 0.1662, |
| "mean_token_accuracy": 0.9444233894348144, |
| "step": 3750 |
| }, |
| { |
| "epoch": 1.4448720415624399, |
| "grad_norm": 0.1052856451043635, |
| "learning_rate": 4.127296301669903e-05, |
| "loss": 0.1645, |
| "mean_token_accuracy": 0.9449902892112731, |
| "step": 3755 |
| }, |
| { |
| "epoch": 1.4467962285934193, |
| "grad_norm": 0.10299103372921697, |
| "learning_rate": 4.124305949431477e-05, |
| "loss": 0.1643, |
| "mean_token_accuracy": 0.9454239845275879, |
| "step": 3760 |
| }, |
| { |
| "epoch": 1.4487204156243987, |
| "grad_norm": 0.10222015678259523, |
| "learning_rate": 4.121311720029848e-05, |
| "loss": 0.1629, |
| "mean_token_accuracy": 0.9456360638141632, |
| "step": 3765 |
| }, |
| { |
| "epoch": 1.450644602655378, |
| "grad_norm": 0.10718052740518302, |
| "learning_rate": 4.118313621912275e-05, |
| "loss": 0.165, |
| "mean_token_accuracy": 0.9451362669467926, |
| "step": 3770 |
| }, |
| { |
| "epoch": 1.4525687896863575, |
| "grad_norm": 0.10677535074707435, |
| "learning_rate": 4.115311663536928e-05, |
| "loss": 0.1648, |
| "mean_token_accuracy": 0.9452996253967285, |
| "step": 3775 |
| }, |
| { |
| "epoch": 1.454492976717337, |
| "grad_norm": 0.11089059502709091, |
| "learning_rate": 4.112305853372871e-05, |
| "loss": 0.1658, |
| "mean_token_accuracy": 0.9444291234016419, |
| "step": 3780 |
| }, |
| { |
| "epoch": 1.4564171637483163, |
| "grad_norm": 0.11249418555034482, |
| "learning_rate": 4.109296199900031e-05, |
| "loss": 0.163, |
| "mean_token_accuracy": 0.9453545689582825, |
| "step": 3785 |
| }, |
| { |
| "epoch": 1.4583413507792957, |
| "grad_norm": 0.10298839043242691, |
| "learning_rate": 4.1062827116091805e-05, |
| "loss": 0.1638, |
| "mean_token_accuracy": 0.9453439474105835, |
| "step": 3790 |
| }, |
| { |
| "epoch": 1.4602655378102751, |
| "grad_norm": 0.10368991422762244, |
| "learning_rate": 4.1032653970019105e-05, |
| "loss": 0.1645, |
| "mean_token_accuracy": 0.9450232863426209, |
| "step": 3795 |
| }, |
| { |
| "epoch": 1.4621897248412545, |
| "grad_norm": 0.1013222632737639, |
| "learning_rate": 4.100244264590604e-05, |
| "loss": 0.1629, |
| "mean_token_accuracy": 0.945792269706726, |
| "step": 3800 |
| }, |
| { |
| "epoch": 1.464113911872234, |
| "grad_norm": 0.11295443317373963, |
| "learning_rate": 4.097219322898417e-05, |
| "loss": 0.1605, |
| "mean_token_accuracy": 0.9462812006473541, |
| "step": 3805 |
| }, |
| { |
| "epoch": 1.4660380989032133, |
| "grad_norm": 0.10843179532916145, |
| "learning_rate": 4.0941905804592526e-05, |
| "loss": 0.1645, |
| "mean_token_accuracy": 0.9450453817844391, |
| "step": 3810 |
| }, |
| { |
| "epoch": 1.4679622859341928, |
| "grad_norm": 0.10545222962017423, |
| "learning_rate": 4.091158045817735e-05, |
| "loss": 0.1654, |
| "mean_token_accuracy": 0.9447169959545135, |
| "step": 3815 |
| }, |
| { |
| "epoch": 1.4698864729651722, |
| "grad_norm": 0.10689442875428007, |
| "learning_rate": 4.088121727529188e-05, |
| "loss": 0.1631, |
| "mean_token_accuracy": 0.9454691410064697, |
| "step": 3820 |
| }, |
| { |
| "epoch": 1.4718106599961516, |
| "grad_norm": 0.10976362275608818, |
| "learning_rate": 4.0850816341596084e-05, |
| "loss": 0.165, |
| "mean_token_accuracy": 0.9450465559959411, |
| "step": 3825 |
| }, |
| { |
| "epoch": 1.473734847027131, |
| "grad_norm": 0.10317032616560282, |
| "learning_rate": 4.082037774285645e-05, |
| "loss": 0.1609, |
| "mean_token_accuracy": 0.9461302757263184, |
| "step": 3830 |
| }, |
| { |
| "epoch": 1.4756590340581104, |
| "grad_norm": 0.10571152503983529, |
| "learning_rate": 4.0789901564945704e-05, |
| "loss": 0.1635, |
| "mean_token_accuracy": 0.945213770866394, |
| "step": 3835 |
| }, |
| { |
| "epoch": 1.4775832210890898, |
| "grad_norm": 0.10519775368401885, |
| "learning_rate": 4.075938789384262e-05, |
| "loss": 0.1649, |
| "mean_token_accuracy": 0.9450217008590698, |
| "step": 3840 |
| }, |
| { |
| "epoch": 1.4795074081200692, |
| "grad_norm": 0.10585444591489851, |
| "learning_rate": 4.072883681563171e-05, |
| "loss": 0.1624, |
| "mean_token_accuracy": 0.9457007706165313, |
| "step": 3845 |
| }, |
| { |
| "epoch": 1.4814315951510486, |
| "grad_norm": 0.11925588297419709, |
| "learning_rate": 4.069824841650304e-05, |
| "loss": 0.1645, |
| "mean_token_accuracy": 0.9450950980186462, |
| "step": 3850 |
| }, |
| { |
| "epoch": 1.483355782182028, |
| "grad_norm": 0.10791072692381821, |
| "learning_rate": 4.0667622782751986e-05, |
| "loss": 0.1631, |
| "mean_token_accuracy": 0.9458039939403534, |
| "step": 3855 |
| }, |
| { |
| "epoch": 1.4852799692130074, |
| "grad_norm": 0.1032554985834212, |
| "learning_rate": 4.0636960000778906e-05, |
| "loss": 0.162, |
| "mean_token_accuracy": 0.945751404762268, |
| "step": 3860 |
| }, |
| { |
| "epoch": 1.4872041562439868, |
| "grad_norm": 0.10170527823886044, |
| "learning_rate": 4.060626015708903e-05, |
| "loss": 0.1659, |
| "mean_token_accuracy": 0.9446871399879455, |
| "step": 3865 |
| }, |
| { |
| "epoch": 1.4891283432749662, |
| "grad_norm": 0.10902366153161942, |
| "learning_rate": 4.057552333829211e-05, |
| "loss": 0.1641, |
| "mean_token_accuracy": 0.945387089252472, |
| "step": 3870 |
| }, |
| { |
| "epoch": 1.4910525303059456, |
| "grad_norm": 0.10938961670814928, |
| "learning_rate": 4.0544749631102205e-05, |
| "loss": 0.1653, |
| "mean_token_accuracy": 0.944713294506073, |
| "step": 3875 |
| }, |
| { |
| "epoch": 1.492976717336925, |
| "grad_norm": 0.10089328025560834, |
| "learning_rate": 4.0513939122337455e-05, |
| "loss": 0.1631, |
| "mean_token_accuracy": 0.9452538788318634, |
| "step": 3880 |
| }, |
| { |
| "epoch": 1.4949009043679045, |
| "grad_norm": 0.10384666680800574, |
| "learning_rate": 4.048309189891984e-05, |
| "loss": 0.1626, |
| "mean_token_accuracy": 0.9458374917507172, |
| "step": 3885 |
| }, |
| { |
| "epoch": 1.4968250913988839, |
| "grad_norm": 0.10536014803015964, |
| "learning_rate": 4.045220804787487e-05, |
| "loss": 0.1616, |
| "mean_token_accuracy": 0.9458865523338318, |
| "step": 3890 |
| }, |
| { |
| "epoch": 1.4987492784298633, |
| "grad_norm": 0.1086882198235937, |
| "learning_rate": 4.042128765633146e-05, |
| "loss": 0.1657, |
| "mean_token_accuracy": 0.9448476135730743, |
| "step": 3895 |
| }, |
| { |
| "epoch": 1.5006734654608427, |
| "grad_norm": 0.10352972906796397, |
| "learning_rate": 4.0390330811521546e-05, |
| "loss": 0.1627, |
| "mean_token_accuracy": 0.9454161047935485, |
| "step": 3900 |
| }, |
| { |
| "epoch": 1.502597652491822, |
| "grad_norm": 0.10540351615679729, |
| "learning_rate": 4.035933760077992e-05, |
| "loss": 0.1659, |
| "mean_token_accuracy": 0.944770210981369, |
| "step": 3905 |
| }, |
| { |
| "epoch": 1.5045218395228015, |
| "grad_norm": 0.10305963532964829, |
| "learning_rate": 4.0328308111544014e-05, |
| "loss": 0.1623, |
| "mean_token_accuracy": 0.9457070767879486, |
| "step": 3910 |
| }, |
| { |
| "epoch": 1.506446026553781, |
| "grad_norm": 0.11035442755861583, |
| "learning_rate": 4.029724243135355e-05, |
| "loss": 0.1647, |
| "mean_token_accuracy": 0.9449433743953705, |
| "step": 3915 |
| }, |
| { |
| "epoch": 1.5083702135847603, |
| "grad_norm": 0.11050723665714139, |
| "learning_rate": 4.026614064785038e-05, |
| "loss": 0.1633, |
| "mean_token_accuracy": 0.9456405460834503, |
| "step": 3920 |
| }, |
| { |
| "epoch": 1.5102944006157397, |
| "grad_norm": 0.11786552783526551, |
| "learning_rate": 4.023500284877822e-05, |
| "loss": 0.1622, |
| "mean_token_accuracy": 0.9459376811981202, |
| "step": 3925 |
| }, |
| { |
| "epoch": 1.5122185876467191, |
| "grad_norm": 0.10870625983906573, |
| "learning_rate": 4.020382912198235e-05, |
| "loss": 0.1619, |
| "mean_token_accuracy": 0.945826131105423, |
| "step": 3930 |
| }, |
| { |
| "epoch": 1.5141427746776985, |
| "grad_norm": 0.10948135264308353, |
| "learning_rate": 4.017261955540945e-05, |
| "loss": 0.1599, |
| "mean_token_accuracy": 0.9464793860912323, |
| "step": 3935 |
| }, |
| { |
| "epoch": 1.516066961708678, |
| "grad_norm": 0.10162679882740341, |
| "learning_rate": 4.01413742371073e-05, |
| "loss": 0.1634, |
| "mean_token_accuracy": 0.945406460762024, |
| "step": 3940 |
| }, |
| { |
| "epoch": 1.5179911487396573, |
| "grad_norm": 0.10521806512733063, |
| "learning_rate": 4.0110093255224534e-05, |
| "loss": 0.1646, |
| "mean_token_accuracy": 0.9450480759143829, |
| "step": 3945 |
| }, |
| { |
| "epoch": 1.5199153357706368, |
| "grad_norm": 0.10833725994973822, |
| "learning_rate": 4.00787766980104e-05, |
| "loss": 0.1644, |
| "mean_token_accuracy": 0.9450052797794342, |
| "step": 3950 |
| }, |
| { |
| "epoch": 1.5218395228016162, |
| "grad_norm": 0.109814598470446, |
| "learning_rate": 4.004742465381454e-05, |
| "loss": 0.1594, |
| "mean_token_accuracy": 0.9465232491493225, |
| "step": 3955 |
| }, |
| { |
| "epoch": 1.5237637098325956, |
| "grad_norm": 0.10694664255943216, |
| "learning_rate": 4.001603721108665e-05, |
| "loss": 0.1602, |
| "mean_token_accuracy": 0.9463098287582398, |
| "step": 3960 |
| }, |
| { |
| "epoch": 1.525687896863575, |
| "grad_norm": 0.10263512740022389, |
| "learning_rate": 3.998461445837634e-05, |
| "loss": 0.1649, |
| "mean_token_accuracy": 0.9454042196273804, |
| "step": 3965 |
| }, |
| { |
| "epoch": 1.5276120838945544, |
| "grad_norm": 0.10032118708257971, |
| "learning_rate": 3.995315648433283e-05, |
| "loss": 0.1622, |
| "mean_token_accuracy": 0.9457547128200531, |
| "step": 3970 |
| }, |
| { |
| "epoch": 1.529536270925534, |
| "grad_norm": 0.10523384809000864, |
| "learning_rate": 3.992166337770469e-05, |
| "loss": 0.166, |
| "mean_token_accuracy": 0.9447933554649353, |
| "step": 3975 |
| }, |
| { |
| "epoch": 1.5314604579565134, |
| "grad_norm": 0.10296416037816417, |
| "learning_rate": 3.989013522733961e-05, |
| "loss": 0.1622, |
| "mean_token_accuracy": 0.9456773638725281, |
| "step": 3980 |
| }, |
| { |
| "epoch": 1.5333846449874928, |
| "grad_norm": 0.10534602947819298, |
| "learning_rate": 3.9858572122184165e-05, |
| "loss": 0.1633, |
| "mean_token_accuracy": 0.945252388715744, |
| "step": 3985 |
| }, |
| { |
| "epoch": 1.5353088320184722, |
| "grad_norm": 0.10465920572337489, |
| "learning_rate": 3.982697415128352e-05, |
| "loss": 0.1635, |
| "mean_token_accuracy": 0.945354574918747, |
| "step": 3990 |
| }, |
| { |
| "epoch": 1.5372330190494516, |
| "grad_norm": 0.098211960325333, |
| "learning_rate": 3.97953414037812e-05, |
| "loss": 0.1619, |
| "mean_token_accuracy": 0.9457767367362976, |
| "step": 3995 |
| }, |
| { |
| "epoch": 1.539157206080431, |
| "grad_norm": 0.10124489861736853, |
| "learning_rate": 3.976367396891887e-05, |
| "loss": 0.1616, |
| "mean_token_accuracy": 0.9456769168376923, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.5410813931114105, |
| "grad_norm": 0.10314687402688161, |
| "learning_rate": 3.9731971936036004e-05, |
| "loss": 0.1614, |
| "mean_token_accuracy": 0.9460195481777192, |
| "step": 4005 |
| }, |
| { |
| "epoch": 1.5430055801423899, |
| "grad_norm": 0.10177769628276215, |
| "learning_rate": 3.970023539456974e-05, |
| "loss": 0.1623, |
| "mean_token_accuracy": 0.9456782817840577, |
| "step": 4010 |
| }, |
| { |
| "epoch": 1.5449297671733693, |
| "grad_norm": 0.09796056102911006, |
| "learning_rate": 3.966846443405455e-05, |
| "loss": 0.1656, |
| "mean_token_accuracy": 0.9447887420654297, |
| "step": 4015 |
| }, |
| { |
| "epoch": 1.5468539542043487, |
| "grad_norm": 0.10469377145655043, |
| "learning_rate": 3.963665914412197e-05, |
| "loss": 0.1585, |
| "mean_token_accuracy": 0.9467072010040283, |
| "step": 4020 |
| }, |
| { |
| "epoch": 1.548778141235328, |
| "grad_norm": 0.1006826335320941, |
| "learning_rate": 3.960481961450045e-05, |
| "loss": 0.1612, |
| "mean_token_accuracy": 0.9459984958171844, |
| "step": 4025 |
| }, |
| { |
| "epoch": 1.5507023282663075, |
| "grad_norm": 0.10467059451473193, |
| "learning_rate": 3.9572945935014996e-05, |
| "loss": 0.1635, |
| "mean_token_accuracy": 0.9454215824604034, |
| "step": 4030 |
| }, |
| { |
| "epoch": 1.552626515297287, |
| "grad_norm": 0.10041987447318289, |
| "learning_rate": 3.954103819558697e-05, |
| "loss": 0.1624, |
| "mean_token_accuracy": 0.9457914710044861, |
| "step": 4035 |
| }, |
| { |
| "epoch": 1.5545507023282663, |
| "grad_norm": 0.10089989409325976, |
| "learning_rate": 3.95090964862338e-05, |
| "loss": 0.1624, |
| "mean_token_accuracy": 0.9456892788410187, |
| "step": 4040 |
| }, |
| { |
| "epoch": 1.5564748893592457, |
| "grad_norm": 0.10159003859464948, |
| "learning_rate": 3.947712089706879e-05, |
| "loss": 0.1634, |
| "mean_token_accuracy": 0.9456854522228241, |
| "step": 4045 |
| }, |
| { |
| "epoch": 1.5583990763902251, |
| "grad_norm": 0.10090677861345802, |
| "learning_rate": 3.9445111518300805e-05, |
| "loss": 0.1634, |
| "mean_token_accuracy": 0.9454401433467865, |
| "step": 4050 |
| }, |
| { |
| "epoch": 1.5603232634212045, |
| "grad_norm": 0.09959007302395402, |
| "learning_rate": 3.941306844023402e-05, |
| "loss": 0.1617, |
| "mean_token_accuracy": 0.9458894848823547, |
| "step": 4055 |
| }, |
| { |
| "epoch": 1.562247450452184, |
| "grad_norm": 0.10522278792323339, |
| "learning_rate": 3.9380991753267704e-05, |
| "loss": 0.1611, |
| "mean_token_accuracy": 0.9460979759693146, |
| "step": 4060 |
| }, |
| { |
| "epoch": 1.5641716374831633, |
| "grad_norm": 0.10308921430670417, |
| "learning_rate": 3.934888154789593e-05, |
| "loss": 0.1644, |
| "mean_token_accuracy": 0.9448984205722809, |
| "step": 4065 |
| }, |
| { |
| "epoch": 1.5660958245141428, |
| "grad_norm": 0.0990674715690409, |
| "learning_rate": 3.931673791470734e-05, |
| "loss": 0.1627, |
| "mean_token_accuracy": 0.9455141544342041, |
| "step": 4070 |
| }, |
| { |
| "epoch": 1.5680200115451222, |
| "grad_norm": 0.099198857552859, |
| "learning_rate": 3.928456094438489e-05, |
| "loss": 0.1635, |
| "mean_token_accuracy": 0.9452341854572296, |
| "step": 4075 |
| }, |
| { |
| "epoch": 1.5699441985761016, |
| "grad_norm": 0.10447787722426305, |
| "learning_rate": 3.9252350727705555e-05, |
| "loss": 0.1629, |
| "mean_token_accuracy": 0.9459849536418915, |
| "step": 4080 |
| }, |
| { |
| "epoch": 1.571868385607081, |
| "grad_norm": 0.10013162065383374, |
| "learning_rate": 3.922010735554014e-05, |
| "loss": 0.1601, |
| "mean_token_accuracy": 0.9463956356048584, |
| "step": 4085 |
| }, |
| { |
| "epoch": 1.5737925726380604, |
| "grad_norm": 0.10170511979504664, |
| "learning_rate": 3.918783091885297e-05, |
| "loss": 0.1656, |
| "mean_token_accuracy": 0.944677346944809, |
| "step": 4090 |
| }, |
| { |
| "epoch": 1.5757167596690398, |
| "grad_norm": 0.10131083936715862, |
| "learning_rate": 3.915552150870166e-05, |
| "loss": 0.1618, |
| "mean_token_accuracy": 0.9461105525493622, |
| "step": 4095 |
| }, |
| { |
| "epoch": 1.5776409467000192, |
| "grad_norm": 0.1083208948877487, |
| "learning_rate": 3.9123179216236826e-05, |
| "loss": 0.1581, |
| "mean_token_accuracy": 0.946831899881363, |
| "step": 4100 |
| }, |
| { |
| "epoch": 1.5795651337309986, |
| "grad_norm": 0.10235631166571117, |
| "learning_rate": 3.9090804132701887e-05, |
| "loss": 0.1599, |
| "mean_token_accuracy": 0.9463982284069061, |
| "step": 4105 |
| }, |
| { |
| "epoch": 1.581489320761978, |
| "grad_norm": 0.09790623152102618, |
| "learning_rate": 3.905839634943273e-05, |
| "loss": 0.16, |
| "mean_token_accuracy": 0.9465533256530761, |
| "step": 4110 |
| }, |
| { |
| "epoch": 1.5834135077929576, |
| "grad_norm": 0.1028509160004831, |
| "learning_rate": 3.9025955957857524e-05, |
| "loss": 0.1617, |
| "mean_token_accuracy": 0.94595587849617, |
| "step": 4115 |
| }, |
| { |
| "epoch": 1.585337694823937, |
| "grad_norm": 0.10797185942056255, |
| "learning_rate": 3.899348304949642e-05, |
| "loss": 0.1636, |
| "mean_token_accuracy": 0.9453164756298065, |
| "step": 4120 |
| }, |
| { |
| "epoch": 1.5872618818549165, |
| "grad_norm": 0.09812477671946834, |
| "learning_rate": 3.896097771596133e-05, |
| "loss": 0.1626, |
| "mean_token_accuracy": 0.9459312915802002, |
| "step": 4125 |
| }, |
| { |
| "epoch": 1.5891860688858959, |
| "grad_norm": 0.10397428273694205, |
| "learning_rate": 3.892844004895559e-05, |
| "loss": 0.1621, |
| "mean_token_accuracy": 0.9457064151763916, |
| "step": 4130 |
| }, |
| { |
| "epoch": 1.5911102559168753, |
| "grad_norm": 0.10265428157169595, |
| "learning_rate": 3.889587014027381e-05, |
| "loss": 0.1625, |
| "mean_token_accuracy": 0.9455591917037964, |
| "step": 4135 |
| }, |
| { |
| "epoch": 1.5930344429478547, |
| "grad_norm": 0.09954216473225751, |
| "learning_rate": 3.886326808180152e-05, |
| "loss": 0.1599, |
| "mean_token_accuracy": 0.9466226279735566, |
| "step": 4140 |
| }, |
| { |
| "epoch": 1.594958629978834, |
| "grad_norm": 0.0948999453129512, |
| "learning_rate": 3.8830633965514965e-05, |
| "loss": 0.1597, |
| "mean_token_accuracy": 0.9466159641742706, |
| "step": 4145 |
| }, |
| { |
| "epoch": 1.5968828170098135, |
| "grad_norm": 0.10150749898077574, |
| "learning_rate": 3.879796788348084e-05, |
| "loss": 0.1668, |
| "mean_token_accuracy": 0.9443914890289307, |
| "step": 4150 |
| }, |
| { |
| "epoch": 1.598807004040793, |
| "grad_norm": 0.10002527883833515, |
| "learning_rate": 3.876526992785602e-05, |
| "loss": 0.1615, |
| "mean_token_accuracy": 0.9461474359035492, |
| "step": 4155 |
| }, |
| { |
| "epoch": 1.6007311910717723, |
| "grad_norm": 0.1049235476768015, |
| "learning_rate": 3.873254019088727e-05, |
| "loss": 0.1648, |
| "mean_token_accuracy": 0.945132714509964, |
| "step": 4160 |
| }, |
| { |
| "epoch": 1.6026553781027517, |
| "grad_norm": 0.09716463237940702, |
| "learning_rate": 3.869977876491105e-05, |
| "loss": 0.1621, |
| "mean_token_accuracy": 0.9458979725837707, |
| "step": 4165 |
| }, |
| { |
| "epoch": 1.6045795651337311, |
| "grad_norm": 0.09527653419455659, |
| "learning_rate": 3.8666985742353214e-05, |
| "loss": 0.1621, |
| "mean_token_accuracy": 0.9457536578178406, |
| "step": 4170 |
| }, |
| { |
| "epoch": 1.6065037521647105, |
| "grad_norm": 0.09974264857216013, |
| "learning_rate": 3.863416121572875e-05, |
| "loss": 0.1619, |
| "mean_token_accuracy": 0.945774495601654, |
| "step": 4175 |
| }, |
| { |
| "epoch": 1.60842793919569, |
| "grad_norm": 0.1016351993204537, |
| "learning_rate": 3.860130527764153e-05, |
| "loss": 0.1608, |
| "mean_token_accuracy": 0.9462750256061554, |
| "step": 4180 |
| }, |
| { |
| "epoch": 1.6103521262266693, |
| "grad_norm": 0.09782946120729334, |
| "learning_rate": 3.856841802078403e-05, |
| "loss": 0.1652, |
| "mean_token_accuracy": 0.9449570298194885, |
| "step": 4185 |
| }, |
| { |
| "epoch": 1.6122763132576488, |
| "grad_norm": 0.09900853294736975, |
| "learning_rate": 3.85354995379371e-05, |
| "loss": 0.1597, |
| "mean_token_accuracy": 0.9465481758117675, |
| "step": 4190 |
| }, |
| { |
| "epoch": 1.6142005002886282, |
| "grad_norm": 0.09657776840856795, |
| "learning_rate": 3.850254992196967e-05, |
| "loss": 0.1625, |
| "mean_token_accuracy": 0.9456781804561615, |
| "step": 4195 |
| }, |
| { |
| "epoch": 1.6161246873196076, |
| "grad_norm": 0.10028799032204168, |
| "learning_rate": 3.84695692658385e-05, |
| "loss": 0.1596, |
| "mean_token_accuracy": 0.9467596590518952, |
| "step": 4200 |
| }, |
| { |
| "epoch": 1.618048874350587, |
| "grad_norm": 0.09996049503329667, |
| "learning_rate": 3.8436557662587945e-05, |
| "loss": 0.1652, |
| "mean_token_accuracy": 0.944662743806839, |
| "step": 4205 |
| }, |
| { |
| "epoch": 1.6199730613815664, |
| "grad_norm": 0.09724019229821691, |
| "learning_rate": 3.840351520534964e-05, |
| "loss": 0.1615, |
| "mean_token_accuracy": 0.9463082849979401, |
| "step": 4210 |
| }, |
| { |
| "epoch": 1.6218972484125458, |
| "grad_norm": 0.09820808813755214, |
| "learning_rate": 3.8370441987342274e-05, |
| "loss": 0.1612, |
| "mean_token_accuracy": 0.9463602125644683, |
| "step": 4215 |
| }, |
| { |
| "epoch": 1.6238214354435252, |
| "grad_norm": 0.09953918320915445, |
| "learning_rate": 3.833733810187131e-05, |
| "loss": 0.165, |
| "mean_token_accuracy": 0.9448364675045013, |
| "step": 4220 |
| }, |
| { |
| "epoch": 1.6257456224745046, |
| "grad_norm": 0.10152002216245509, |
| "learning_rate": 3.830420364232876e-05, |
| "loss": 0.1619, |
| "mean_token_accuracy": 0.9457645535469055, |
| "step": 4225 |
| }, |
| { |
| "epoch": 1.627669809505484, |
| "grad_norm": 0.10335735116416461, |
| "learning_rate": 3.827103870219285e-05, |
| "loss": 0.1629, |
| "mean_token_accuracy": 0.9453236222267151, |
| "step": 4230 |
| }, |
| { |
| "epoch": 1.6295939965364634, |
| "grad_norm": 0.10024442307017409, |
| "learning_rate": 3.823784337502782e-05, |
| "loss": 0.1629, |
| "mean_token_accuracy": 0.9455093562602996, |
| "step": 4235 |
| }, |
| { |
| "epoch": 1.6315181835674428, |
| "grad_norm": 0.10809315246772307, |
| "learning_rate": 3.820461775448364e-05, |
| "loss": 0.1644, |
| "mean_token_accuracy": 0.9452807545661926, |
| "step": 4240 |
| }, |
| { |
| "epoch": 1.6334423705984222, |
| "grad_norm": 0.10929760682984689, |
| "learning_rate": 3.817136193429571e-05, |
| "loss": 0.1643, |
| "mean_token_accuracy": 0.9450959205627442, |
| "step": 4245 |
| }, |
| { |
| "epoch": 1.6353665576294016, |
| "grad_norm": 0.09951760251430594, |
| "learning_rate": 3.813807600828468e-05, |
| "loss": 0.1602, |
| "mean_token_accuracy": 0.9465416312217713, |
| "step": 4250 |
| }, |
| { |
| "epoch": 1.637290744660381, |
| "grad_norm": 0.09218652520491599, |
| "learning_rate": 3.810476007035611e-05, |
| "loss": 0.1585, |
| "mean_token_accuracy": 0.9469627916812897, |
| "step": 4255 |
| }, |
| { |
| "epoch": 1.6392149316913605, |
| "grad_norm": 0.10040311322983182, |
| "learning_rate": 3.807141421450021e-05, |
| "loss": 0.1594, |
| "mean_token_accuracy": 0.9466736435890197, |
| "step": 4260 |
| }, |
| { |
| "epoch": 1.6411391187223399, |
| "grad_norm": 0.10402262287219591, |
| "learning_rate": 3.803803853479163e-05, |
| "loss": 0.1634, |
| "mean_token_accuracy": 0.945269650220871, |
| "step": 4265 |
| }, |
| { |
| "epoch": 1.6430633057533193, |
| "grad_norm": 0.09907131803218525, |
| "learning_rate": 3.8004633125389115e-05, |
| "loss": 0.1583, |
| "mean_token_accuracy": 0.9469216406345368, |
| "step": 4270 |
| }, |
| { |
| "epoch": 1.6449874927842987, |
| "grad_norm": 0.10596990721340989, |
| "learning_rate": 3.797119808053533e-05, |
| "loss": 0.1619, |
| "mean_token_accuracy": 0.9460356116294861, |
| "step": 4275 |
| }, |
| { |
| "epoch": 1.646911679815278, |
| "grad_norm": 0.09841311054889437, |
| "learning_rate": 3.793773349455652e-05, |
| "loss": 0.1614, |
| "mean_token_accuracy": 0.9460277736186982, |
| "step": 4280 |
| }, |
| { |
| "epoch": 1.6488358668462575, |
| "grad_norm": 0.09806735698422851, |
| "learning_rate": 3.790423946186226e-05, |
| "loss": 0.1629, |
| "mean_token_accuracy": 0.9456752181053162, |
| "step": 4285 |
| }, |
| { |
| "epoch": 1.650760053877237, |
| "grad_norm": 0.10126137877354573, |
| "learning_rate": 3.787071607694523e-05, |
| "loss": 0.1637, |
| "mean_token_accuracy": 0.9453158974647522, |
| "step": 4290 |
| }, |
| { |
| "epoch": 1.6526842409082163, |
| "grad_norm": 0.09767249513375194, |
| "learning_rate": 3.783716343438091e-05, |
| "loss": 0.1601, |
| "mean_token_accuracy": 0.9464418232440949, |
| "step": 4295 |
| }, |
| { |
| "epoch": 1.6546084279391957, |
| "grad_norm": 0.1002960076707505, |
| "learning_rate": 3.7803581628827285e-05, |
| "loss": 0.1592, |
| "mean_token_accuracy": 0.9464677512645722, |
| "step": 4300 |
| }, |
| { |
| "epoch": 1.6565326149701751, |
| "grad_norm": 0.1083689472471439, |
| "learning_rate": 3.776997075502466e-05, |
| "loss": 0.1597, |
| "mean_token_accuracy": 0.9464883804321289, |
| "step": 4305 |
| }, |
| { |
| "epoch": 1.6584568020011545, |
| "grad_norm": 0.09991261064628595, |
| "learning_rate": 3.773633090779534e-05, |
| "loss": 0.1598, |
| "mean_token_accuracy": 0.9466773569583893, |
| "step": 4310 |
| }, |
| { |
| "epoch": 1.660380989032134, |
| "grad_norm": 0.09813598977680858, |
| "learning_rate": 3.770266218204334e-05, |
| "loss": 0.1635, |
| "mean_token_accuracy": 0.9454030215740203, |
| "step": 4315 |
| }, |
| { |
| "epoch": 1.6623051760631133, |
| "grad_norm": 0.09964594545739816, |
| "learning_rate": 3.766896467275417e-05, |
| "loss": 0.1632, |
| "mean_token_accuracy": 0.9455722093582153, |
| "step": 4320 |
| }, |
| { |
| "epoch": 1.6642293630940928, |
| "grad_norm": 0.10360473288926222, |
| "learning_rate": 3.763523847499454e-05, |
| "loss": 0.1643, |
| "mean_token_accuracy": 0.945340096950531, |
| "step": 4325 |
| }, |
| { |
| "epoch": 1.6661535501250722, |
| "grad_norm": 0.09450293752635669, |
| "learning_rate": 3.76014836839121e-05, |
| "loss": 0.1616, |
| "mean_token_accuracy": 0.9461982131004334, |
| "step": 4330 |
| }, |
| { |
| "epoch": 1.6680777371560516, |
| "grad_norm": 0.10472592004396242, |
| "learning_rate": 3.7567700394735144e-05, |
| "loss": 0.1633, |
| "mean_token_accuracy": 0.9452694237232209, |
| "step": 4335 |
| }, |
| { |
| "epoch": 1.670001924187031, |
| "grad_norm": 0.10278164728725776, |
| "learning_rate": 3.75338887027724e-05, |
| "loss": 0.1632, |
| "mean_token_accuracy": 0.9455021142959594, |
| "step": 4340 |
| }, |
| { |
| "epoch": 1.6719261112180104, |
| "grad_norm": 0.09960828921624268, |
| "learning_rate": 3.750004870341269e-05, |
| "loss": 0.1621, |
| "mean_token_accuracy": 0.9457294404506683, |
| "step": 4345 |
| }, |
| { |
| "epoch": 1.6738502982489898, |
| "grad_norm": 0.09986603053296246, |
| "learning_rate": 3.746618049212473e-05, |
| "loss": 0.1588, |
| "mean_token_accuracy": 0.9466879844665528, |
| "step": 4350 |
| }, |
| { |
| "epoch": 1.6757744852799692, |
| "grad_norm": 0.10259521262476153, |
| "learning_rate": 3.7432284164456793e-05, |
| "loss": 0.1619, |
| "mean_token_accuracy": 0.945849597454071, |
| "step": 4355 |
| }, |
| { |
| "epoch": 1.6776986723109486, |
| "grad_norm": 0.10462616055841983, |
| "learning_rate": 3.73983598160365e-05, |
| "loss": 0.1604, |
| "mean_token_accuracy": 0.946190345287323, |
| "step": 4360 |
| }, |
| { |
| "epoch": 1.679622859341928, |
| "grad_norm": 0.11107610378497039, |
| "learning_rate": 3.736440754257051e-05, |
| "loss": 0.158, |
| "mean_token_accuracy": 0.9471249282360077, |
| "step": 4365 |
| }, |
| { |
| "epoch": 1.6815470463729074, |
| "grad_norm": 0.09987899542924915, |
| "learning_rate": 3.733042743984425e-05, |
| "loss": 0.1621, |
| "mean_token_accuracy": 0.9459578812122345, |
| "step": 4370 |
| }, |
| { |
| "epoch": 1.6834712334038868, |
| "grad_norm": 0.10117763930369007, |
| "learning_rate": 3.7296419603721706e-05, |
| "loss": 0.1654, |
| "mean_token_accuracy": 0.9449024319648742, |
| "step": 4375 |
| }, |
| { |
| "epoch": 1.6853954204348662, |
| "grad_norm": 0.09923596483325249, |
| "learning_rate": 3.7262384130145054e-05, |
| "loss": 0.1651, |
| "mean_token_accuracy": 0.9448891043663025, |
| "step": 4380 |
| }, |
| { |
| "epoch": 1.6873196074658456, |
| "grad_norm": 0.10668055665280804, |
| "learning_rate": 3.722832111513447e-05, |
| "loss": 0.1629, |
| "mean_token_accuracy": 0.9455032289028168, |
| "step": 4385 |
| }, |
| { |
| "epoch": 1.689243794496825, |
| "grad_norm": 0.10363929813286556, |
| "learning_rate": 3.719423065478782e-05, |
| "loss": 0.1646, |
| "mean_token_accuracy": 0.9447316706180573, |
| "step": 4390 |
| }, |
| { |
| "epoch": 1.6911679815278045, |
| "grad_norm": 0.10047421943729479, |
| "learning_rate": 3.7160112845280385e-05, |
| "loss": 0.1631, |
| "mean_token_accuracy": 0.9456027746200562, |
| "step": 4395 |
| }, |
| { |
| "epoch": 1.6930921685587839, |
| "grad_norm": 0.09742967063763901, |
| "learning_rate": 3.7125967782864624e-05, |
| "loss": 0.1618, |
| "mean_token_accuracy": 0.9459131598472595, |
| "step": 4400 |
| }, |
| { |
| "epoch": 1.6950163555897633, |
| "grad_norm": 0.10819151467635159, |
| "learning_rate": 3.7091795563869876e-05, |
| "loss": 0.1642, |
| "mean_token_accuracy": 0.945240992307663, |
| "step": 4405 |
| }, |
| { |
| "epoch": 1.6969405426207427, |
| "grad_norm": 0.10434588415019977, |
| "learning_rate": 3.705759628470208e-05, |
| "loss": 0.1641, |
| "mean_token_accuracy": 0.9450987458229065, |
| "step": 4410 |
| }, |
| { |
| "epoch": 1.698864729651722, |
| "grad_norm": 0.10145849516304649, |
| "learning_rate": 3.702337004184354e-05, |
| "loss": 0.1606, |
| "mean_token_accuracy": 0.9462997257709503, |
| "step": 4415 |
| }, |
| { |
| "epoch": 1.7007889166827015, |
| "grad_norm": 0.10005157393993798, |
| "learning_rate": 3.6989116931852616e-05, |
| "loss": 0.1609, |
| "mean_token_accuracy": 0.946276193857193, |
| "step": 4420 |
| }, |
| { |
| "epoch": 1.702713103713681, |
| "grad_norm": 0.11138468161786759, |
| "learning_rate": 3.695483705136345e-05, |
| "loss": 0.1651, |
| "mean_token_accuracy": 0.9449223577976227, |
| "step": 4425 |
| }, |
| { |
| "epoch": 1.7046372907446603, |
| "grad_norm": 0.1004360772894267, |
| "learning_rate": 3.692053049708574e-05, |
| "loss": 0.1602, |
| "mean_token_accuracy": 0.9464481592178344, |
| "step": 4430 |
| }, |
| { |
| "epoch": 1.7065614777756397, |
| "grad_norm": 0.09901623918063937, |
| "learning_rate": 3.688619736580441e-05, |
| "loss": 0.1611, |
| "mean_token_accuracy": 0.9458293974399566, |
| "step": 4435 |
| }, |
| { |
| "epoch": 1.7084856648066191, |
| "grad_norm": 0.09797231983138321, |
| "learning_rate": 3.685183775437938e-05, |
| "loss": 0.1627, |
| "mean_token_accuracy": 0.9456776857376099, |
| "step": 4440 |
| }, |
| { |
| "epoch": 1.7104098518375985, |
| "grad_norm": 0.1004592463191462, |
| "learning_rate": 3.681745175974525e-05, |
| "loss": 0.1599, |
| "mean_token_accuracy": 0.9465253472328186, |
| "step": 4445 |
| }, |
| { |
| "epoch": 1.712334038868578, |
| "grad_norm": 0.09635015865058442, |
| "learning_rate": 3.6783039478911104e-05, |
| "loss": 0.1605, |
| "mean_token_accuracy": 0.9462304711341858, |
| "step": 4450 |
| }, |
| { |
| "epoch": 1.7142582258995573, |
| "grad_norm": 0.10150560455863913, |
| "learning_rate": 3.674860100896011e-05, |
| "loss": 0.1609, |
| "mean_token_accuracy": 0.9462295591831207, |
| "step": 4455 |
| }, |
| { |
| "epoch": 1.7161824129305367, |
| "grad_norm": 0.09964573607398473, |
| "learning_rate": 3.671413644704938e-05, |
| "loss": 0.1603, |
| "mean_token_accuracy": 0.9462384462356568, |
| "step": 4460 |
| }, |
| { |
| "epoch": 1.7181065999615162, |
| "grad_norm": 0.0989426450473668, |
| "learning_rate": 3.667964589040961e-05, |
| "loss": 0.1602, |
| "mean_token_accuracy": 0.9463894844055176, |
| "step": 4465 |
| }, |
| { |
| "epoch": 1.7200307869924956, |
| "grad_norm": 0.0947431826482723, |
| "learning_rate": 3.664512943634485e-05, |
| "loss": 0.1589, |
| "mean_token_accuracy": 0.9468457162380218, |
| "step": 4470 |
| }, |
| { |
| "epoch": 1.721954974023475, |
| "grad_norm": 0.10292528415128935, |
| "learning_rate": 3.661058718223216e-05, |
| "loss": 0.1585, |
| "mean_token_accuracy": 0.946750795841217, |
| "step": 4475 |
| }, |
| { |
| "epoch": 1.7238791610544544, |
| "grad_norm": 0.09812618533590778, |
| "learning_rate": 3.6576019225521474e-05, |
| "loss": 0.1608, |
| "mean_token_accuracy": 0.9462311148643494, |
| "step": 4480 |
| }, |
| { |
| "epoch": 1.7258033480854338, |
| "grad_norm": 0.09992020088492978, |
| "learning_rate": 3.654142566373516e-05, |
| "loss": 0.1614, |
| "mean_token_accuracy": 0.9459415197372436, |
| "step": 4485 |
| }, |
| { |
| "epoch": 1.7277275351164132, |
| "grad_norm": 0.10029244391981966, |
| "learning_rate": 3.6506806594467845e-05, |
| "loss": 0.1607, |
| "mean_token_accuracy": 0.9462564051151275, |
| "step": 4490 |
| }, |
| { |
| "epoch": 1.7296517221473926, |
| "grad_norm": 0.10382887562209585, |
| "learning_rate": 3.647216211538615e-05, |
| "loss": 0.1607, |
| "mean_token_accuracy": 0.9461699724197388, |
| "step": 4495 |
| }, |
| { |
| "epoch": 1.731575909178372, |
| "grad_norm": 0.10869185316485384, |
| "learning_rate": 3.643749232422833e-05, |
| "loss": 0.162, |
| "mean_token_accuracy": 0.9458631217479706, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.7335000962093514, |
| "grad_norm": 0.10790114677302129, |
| "learning_rate": 3.64027973188041e-05, |
| "loss": 0.1592, |
| "mean_token_accuracy": 0.9466256439685822, |
| "step": 4505 |
| }, |
| { |
| "epoch": 1.7354242832403308, |
| "grad_norm": 0.10633086609839314, |
| "learning_rate": 3.6368077196994255e-05, |
| "loss": 0.1622, |
| "mean_token_accuracy": 0.9459817409515381, |
| "step": 4510 |
| }, |
| { |
| "epoch": 1.7373484702713102, |
| "grad_norm": 0.10885187496202672, |
| "learning_rate": 3.633333205675049e-05, |
| "loss": 0.1629, |
| "mean_token_accuracy": 0.9459581017494202, |
| "step": 4515 |
| }, |
| { |
| "epoch": 1.7392726573022896, |
| "grad_norm": 0.10176958875864693, |
| "learning_rate": 3.629856199609507e-05, |
| "loss": 0.1611, |
| "mean_token_accuracy": 0.9459785044193267, |
| "step": 4520 |
| }, |
| { |
| "epoch": 1.741196844333269, |
| "grad_norm": 0.09916618375083491, |
| "learning_rate": 3.626376711312056e-05, |
| "loss": 0.1592, |
| "mean_token_accuracy": 0.9466202616691589, |
| "step": 4525 |
| }, |
| { |
| "epoch": 1.7431210313642485, |
| "grad_norm": 0.10060420158449908, |
| "learning_rate": 3.622894750598956e-05, |
| "loss": 0.1605, |
| "mean_token_accuracy": 0.9465162098407746, |
| "step": 4530 |
| }, |
| { |
| "epoch": 1.7450452183952279, |
| "grad_norm": 0.09887169050050851, |
| "learning_rate": 3.6194103272934407e-05, |
| "loss": 0.1623, |
| "mean_token_accuracy": 0.9455604314804077, |
| "step": 4535 |
| }, |
| { |
| "epoch": 1.7469694054262073, |
| "grad_norm": 0.10871757318822675, |
| "learning_rate": 3.615923451225694e-05, |
| "loss": 0.1612, |
| "mean_token_accuracy": 0.9459405720233918, |
| "step": 4540 |
| }, |
| { |
| "epoch": 1.7488935924571867, |
| "grad_norm": 0.09759522454802382, |
| "learning_rate": 3.6124341322328164e-05, |
| "loss": 0.1601, |
| "mean_token_accuracy": 0.9464699804782868, |
| "step": 4545 |
| }, |
| { |
| "epoch": 1.7508177794881663, |
| "grad_norm": 0.10405359692751093, |
| "learning_rate": 3.608942380158802e-05, |
| "loss": 0.1597, |
| "mean_token_accuracy": 0.9467595756053925, |
| "step": 4550 |
| }, |
| { |
| "epoch": 1.7527419665191457, |
| "grad_norm": 0.10730189765746177, |
| "learning_rate": 3.605448204854508e-05, |
| "loss": 0.1634, |
| "mean_token_accuracy": 0.9453517079353333, |
| "step": 4555 |
| }, |
| { |
| "epoch": 1.7546661535501251, |
| "grad_norm": 0.10031455080369805, |
| "learning_rate": 3.60195161617763e-05, |
| "loss": 0.1624, |
| "mean_token_accuracy": 0.9458699703216553, |
| "step": 4560 |
| }, |
| { |
| "epoch": 1.7565903405811045, |
| "grad_norm": 0.10551108903780186, |
| "learning_rate": 3.59845262399267e-05, |
| "loss": 0.1609, |
| "mean_token_accuracy": 0.9461313605308532, |
| "step": 4565 |
| }, |
| { |
| "epoch": 1.758514527612084, |
| "grad_norm": 0.10398560325781105, |
| "learning_rate": 3.594951238170912e-05, |
| "loss": 0.1619, |
| "mean_token_accuracy": 0.9458823204040527, |
| "step": 4570 |
| }, |
| { |
| "epoch": 1.7604387146430633, |
| "grad_norm": 0.1007130950595616, |
| "learning_rate": 3.591447468590392e-05, |
| "loss": 0.1629, |
| "mean_token_accuracy": 0.9455589532852173, |
| "step": 4575 |
| }, |
| { |
| "epoch": 1.7623629016740427, |
| "grad_norm": 0.0955238435134818, |
| "learning_rate": 3.5879413251358724e-05, |
| "loss": 0.1583, |
| "mean_token_accuracy": 0.9468055486679077, |
| "step": 4580 |
| }, |
| { |
| "epoch": 1.7642870887050222, |
| "grad_norm": 0.094174218073043, |
| "learning_rate": 3.5844328176988105e-05, |
| "loss": 0.1584, |
| "mean_token_accuracy": 0.9468781888484955, |
| "step": 4585 |
| }, |
| { |
| "epoch": 1.7662112757360016, |
| "grad_norm": 0.0997841174583319, |
| "learning_rate": 3.5809219561773346e-05, |
| "loss": 0.1611, |
| "mean_token_accuracy": 0.9462185323238372, |
| "step": 4590 |
| }, |
| { |
| "epoch": 1.768135462766981, |
| "grad_norm": 0.09884549747941763, |
| "learning_rate": 3.5774087504762144e-05, |
| "loss": 0.1586, |
| "mean_token_accuracy": 0.9469363689422607, |
| "step": 4595 |
| }, |
| { |
| "epoch": 1.7700596497979604, |
| "grad_norm": 0.09899189814693576, |
| "learning_rate": 3.573893210506832e-05, |
| "loss": 0.1586, |
| "mean_token_accuracy": 0.9469284415245056, |
| "step": 4600 |
| }, |
| { |
| "epoch": 1.7719838368289398, |
| "grad_norm": 0.0993621353004576, |
| "learning_rate": 3.570375346187155e-05, |
| "loss": 0.1605, |
| "mean_token_accuracy": 0.9463739633560181, |
| "step": 4605 |
| }, |
| { |
| "epoch": 1.7739080238599192, |
| "grad_norm": 0.10547214561067843, |
| "learning_rate": 3.5668551674417084e-05, |
| "loss": 0.1618, |
| "mean_token_accuracy": 0.945833295583725, |
| "step": 4610 |
| }, |
| { |
| "epoch": 1.7758322108908986, |
| "grad_norm": 0.09370909630735182, |
| "learning_rate": 3.563332684201548e-05, |
| "loss": 0.1619, |
| "mean_token_accuracy": 0.9458651065826416, |
| "step": 4615 |
| }, |
| { |
| "epoch": 1.777756397921878, |
| "grad_norm": 0.09452531437014097, |
| "learning_rate": 3.559807906404228e-05, |
| "loss": 0.1606, |
| "mean_token_accuracy": 0.9462214589118958, |
| "step": 4620 |
| }, |
| { |
| "epoch": 1.7796805849528574, |
| "grad_norm": 0.09829866705995945, |
| "learning_rate": 3.556280843993779e-05, |
| "loss": 0.1586, |
| "mean_token_accuracy": 0.9468035578727723, |
| "step": 4625 |
| }, |
| { |
| "epoch": 1.7816047719838368, |
| "grad_norm": 0.09935570637840965, |
| "learning_rate": 3.552751506920676e-05, |
| "loss": 0.1601, |
| "mean_token_accuracy": 0.9466434597969056, |
| "step": 4630 |
| }, |
| { |
| "epoch": 1.7835289590148162, |
| "grad_norm": 0.09686737707137055, |
| "learning_rate": 3.54921990514181e-05, |
| "loss": 0.1584, |
| "mean_token_accuracy": 0.946895694732666, |
| "step": 4635 |
| }, |
| { |
| "epoch": 1.7854531460457956, |
| "grad_norm": 0.1036327803378326, |
| "learning_rate": 3.5456860486204637e-05, |
| "loss": 0.1605, |
| "mean_token_accuracy": 0.9463905453681946, |
| "step": 4640 |
| }, |
| { |
| "epoch": 1.787377333076775, |
| "grad_norm": 0.11585854547485858, |
| "learning_rate": 3.5421499473262776e-05, |
| "loss": 0.161, |
| "mean_token_accuracy": 0.9461309313774109, |
| "step": 4645 |
| }, |
| { |
| "epoch": 1.7893015201077545, |
| "grad_norm": 0.09913408037481335, |
| "learning_rate": 3.538611611235226e-05, |
| "loss": 0.1605, |
| "mean_token_accuracy": 0.9463733434677124, |
| "step": 4650 |
| }, |
| { |
| "epoch": 1.7912257071387339, |
| "grad_norm": 0.09816208195520347, |
| "learning_rate": 3.535071050329591e-05, |
| "loss": 0.16, |
| "mean_token_accuracy": 0.9465721905231476, |
| "step": 4655 |
| }, |
| { |
| "epoch": 1.7931498941697133, |
| "grad_norm": 0.0954450905445559, |
| "learning_rate": 3.5315282745979275e-05, |
| "loss": 0.1611, |
| "mean_token_accuracy": 0.9462323606014251, |
| "step": 4660 |
| }, |
| { |
| "epoch": 1.7950740812006927, |
| "grad_norm": 0.09685202748056504, |
| "learning_rate": 3.527983294035041e-05, |
| "loss": 0.1604, |
| "mean_token_accuracy": 0.9465530514717102, |
| "step": 4665 |
| }, |
| { |
| "epoch": 1.796998268231672, |
| "grad_norm": 0.09595396274966222, |
| "learning_rate": 3.524436118641956e-05, |
| "loss": 0.1613, |
| "mean_token_accuracy": 0.9458921074867248, |
| "step": 4670 |
| }, |
| { |
| "epoch": 1.7989224552626515, |
| "grad_norm": 0.1004200825903867, |
| "learning_rate": 3.52088675842589e-05, |
| "loss": 0.1591, |
| "mean_token_accuracy": 0.946587735414505, |
| "step": 4675 |
| }, |
| { |
| "epoch": 1.800846642293631, |
| "grad_norm": 0.09727212472084619, |
| "learning_rate": 3.517335223400223e-05, |
| "loss": 0.1633, |
| "mean_token_accuracy": 0.9454812586307526, |
| "step": 4680 |
| }, |
| { |
| "epoch": 1.8027708293246103, |
| "grad_norm": 0.10781734060683949, |
| "learning_rate": 3.513781523584473e-05, |
| "loss": 0.1608, |
| "mean_token_accuracy": 0.946499890089035, |
| "step": 4685 |
| }, |
| { |
| "epoch": 1.8046950163555897, |
| "grad_norm": 0.09765609473673183, |
| "learning_rate": 3.510225669004262e-05, |
| "loss": 0.1602, |
| "mean_token_accuracy": 0.9462354481220245, |
| "step": 4690 |
| }, |
| { |
| "epoch": 1.8066192033865693, |
| "grad_norm": 0.09593958412025588, |
| "learning_rate": 3.506667669691292e-05, |
| "loss": 0.1625, |
| "mean_token_accuracy": 0.9458974361419678, |
| "step": 4695 |
| }, |
| { |
| "epoch": 1.8085433904175487, |
| "grad_norm": 0.09690005881713605, |
| "learning_rate": 3.5031075356833184e-05, |
| "loss": 0.1615, |
| "mean_token_accuracy": 0.9459888219833374, |
| "step": 4700 |
| }, |
| { |
| "epoch": 1.8104675774485282, |
| "grad_norm": 0.10049690019327372, |
| "learning_rate": 3.4995452770241146e-05, |
| "loss": 0.1587, |
| "mean_token_accuracy": 0.946663624048233, |
| "step": 4705 |
| }, |
| { |
| "epoch": 1.8123917644795076, |
| "grad_norm": 0.10080261659131069, |
| "learning_rate": 3.495980903763453e-05, |
| "loss": 0.1616, |
| "mean_token_accuracy": 0.9462504684925079, |
| "step": 4710 |
| }, |
| { |
| "epoch": 1.814315951510487, |
| "grad_norm": 0.097535424302901, |
| "learning_rate": 3.4924144259570665e-05, |
| "loss": 0.1589, |
| "mean_token_accuracy": 0.946724146604538, |
| "step": 4715 |
| }, |
| { |
| "epoch": 1.8162401385414664, |
| "grad_norm": 0.10097992438397164, |
| "learning_rate": 3.488845853666628e-05, |
| "loss": 0.1606, |
| "mean_token_accuracy": 0.9463124930858612, |
| "step": 4720 |
| }, |
| { |
| "epoch": 1.8181643255724458, |
| "grad_norm": 0.09896929588763823, |
| "learning_rate": 3.485275196959719e-05, |
| "loss": 0.159, |
| "mean_token_accuracy": 0.9467292606830597, |
| "step": 4725 |
| }, |
| { |
| "epoch": 1.8200885126034252, |
| "grad_norm": 0.10281217785566646, |
| "learning_rate": 3.481702465909803e-05, |
| "loss": 0.1588, |
| "mean_token_accuracy": 0.94689239859581, |
| "step": 4730 |
| }, |
| { |
| "epoch": 1.8220126996344046, |
| "grad_norm": 0.09840610317271821, |
| "learning_rate": 3.478127670596193e-05, |
| "loss": 0.1615, |
| "mean_token_accuracy": 0.9461028575897217, |
| "step": 4735 |
| }, |
| { |
| "epoch": 1.823936886665384, |
| "grad_norm": 0.09885915505162275, |
| "learning_rate": 3.474550821104026e-05, |
| "loss": 0.1615, |
| "mean_token_accuracy": 0.9459745645523071, |
| "step": 4740 |
| }, |
| { |
| "epoch": 1.8258610736963634, |
| "grad_norm": 0.10031987785936006, |
| "learning_rate": 3.470971927524236e-05, |
| "loss": 0.1604, |
| "mean_token_accuracy": 0.9460942864418029, |
| "step": 4745 |
| }, |
| { |
| "epoch": 1.8277852607273428, |
| "grad_norm": 0.10261317824465392, |
| "learning_rate": 3.467390999953524e-05, |
| "loss": 0.1625, |
| "mean_token_accuracy": 0.9455275893211365, |
| "step": 4750 |
| }, |
| { |
| "epoch": 1.8297094477583222, |
| "grad_norm": 0.09844431473152106, |
| "learning_rate": 3.463808048494325e-05, |
| "loss": 0.1616, |
| "mean_token_accuracy": 0.9458800673484802, |
| "step": 4755 |
| }, |
| { |
| "epoch": 1.8316336347893016, |
| "grad_norm": 0.096300292573579, |
| "learning_rate": 3.4602230832547885e-05, |
| "loss": 0.1605, |
| "mean_token_accuracy": 0.9462751030921936, |
| "step": 4760 |
| }, |
| { |
| "epoch": 1.833557821820281, |
| "grad_norm": 0.09643759538772625, |
| "learning_rate": 3.456636114348744e-05, |
| "loss": 0.1593, |
| "mean_token_accuracy": 0.9467679262161255, |
| "step": 4765 |
| }, |
| { |
| "epoch": 1.8354820088512604, |
| "grad_norm": 0.09947537690648842, |
| "learning_rate": 3.4530471518956715e-05, |
| "loss": 0.16, |
| "mean_token_accuracy": 0.9465085625648498, |
| "step": 4770 |
| }, |
| { |
| "epoch": 1.8374061958822399, |
| "grad_norm": 0.0961494745039196, |
| "learning_rate": 3.449456206020677e-05, |
| "loss": 0.1616, |
| "mean_token_accuracy": 0.9457358002662659, |
| "step": 4775 |
| }, |
| { |
| "epoch": 1.8393303829132193, |
| "grad_norm": 0.09980075650621577, |
| "learning_rate": 3.4458632868544647e-05, |
| "loss": 0.1607, |
| "mean_token_accuracy": 0.9465402901172638, |
| "step": 4780 |
| }, |
| { |
| "epoch": 1.8412545699441987, |
| "grad_norm": 0.10689945107107311, |
| "learning_rate": 3.4422684045332994e-05, |
| "loss": 0.1609, |
| "mean_token_accuracy": 0.9462505578994751, |
| "step": 4785 |
| }, |
| { |
| "epoch": 1.843178756975178, |
| "grad_norm": 0.09496817919075264, |
| "learning_rate": 3.43867156919899e-05, |
| "loss": 0.1601, |
| "mean_token_accuracy": 0.9463814616203308, |
| "step": 4790 |
| }, |
| { |
| "epoch": 1.8451029440061575, |
| "grad_norm": 0.1018698324932633, |
| "learning_rate": 3.435072790998852e-05, |
| "loss": 0.1633, |
| "mean_token_accuracy": 0.9452820897102356, |
| "step": 4795 |
| }, |
| { |
| "epoch": 1.847027131037137, |
| "grad_norm": 0.10232179591168951, |
| "learning_rate": 3.431472080085684e-05, |
| "loss": 0.1632, |
| "mean_token_accuracy": 0.9454320549964905, |
| "step": 4800 |
| }, |
| { |
| "epoch": 1.8489513180681163, |
| "grad_norm": 0.10072763548455216, |
| "learning_rate": 3.427869446617736e-05, |
| "loss": 0.1587, |
| "mean_token_accuracy": 0.9469704747200012, |
| "step": 4805 |
| }, |
| { |
| "epoch": 1.8508755050990957, |
| "grad_norm": 0.09896154806898143, |
| "learning_rate": 3.424264900758682e-05, |
| "loss": 0.1621, |
| "mean_token_accuracy": 0.9457802414894104, |
| "step": 4810 |
| }, |
| { |
| "epoch": 1.8527996921300751, |
| "grad_norm": 0.09851390977832954, |
| "learning_rate": 3.420658452677592e-05, |
| "loss": 0.1616, |
| "mean_token_accuracy": 0.946274584531784, |
| "step": 4815 |
| }, |
| { |
| "epoch": 1.8547238791610545, |
| "grad_norm": 0.10141452729511008, |
| "learning_rate": 3.4170501125489005e-05, |
| "loss": 0.1625, |
| "mean_token_accuracy": 0.94554203748703, |
| "step": 4820 |
| }, |
| { |
| "epoch": 1.856648066192034, |
| "grad_norm": 0.09465743780992764, |
| "learning_rate": 3.413439890552384e-05, |
| "loss": 0.159, |
| "mean_token_accuracy": 0.9467312037944794, |
| "step": 4825 |
| }, |
| { |
| "epoch": 1.8585722532230133, |
| "grad_norm": 0.09693986948890045, |
| "learning_rate": 3.409827796873122e-05, |
| "loss": 0.1602, |
| "mean_token_accuracy": 0.9463321626186371, |
| "step": 4830 |
| }, |
| { |
| "epoch": 1.8604964402539927, |
| "grad_norm": 0.09809270772137343, |
| "learning_rate": 3.40621384170148e-05, |
| "loss": 0.1611, |
| "mean_token_accuracy": 0.9462391793727875, |
| "step": 4835 |
| }, |
| { |
| "epoch": 1.8624206272849722, |
| "grad_norm": 0.09646790690485375, |
| "learning_rate": 3.402598035233072e-05, |
| "loss": 0.162, |
| "mean_token_accuracy": 0.9458190202713013, |
| "step": 4840 |
| }, |
| { |
| "epoch": 1.8643448143159516, |
| "grad_norm": 0.3675774609735443, |
| "learning_rate": 3.398980387668735e-05, |
| "loss": 0.1621, |
| "mean_token_accuracy": 0.9459438502788544, |
| "step": 4845 |
| }, |
| { |
| "epoch": 1.866269001346931, |
| "grad_norm": 0.10581255129356738, |
| "learning_rate": 3.395360909214502e-05, |
| "loss": 0.1591, |
| "mean_token_accuracy": 0.9468293070793152, |
| "step": 4850 |
| }, |
| { |
| "epoch": 1.8681931883779104, |
| "grad_norm": 0.09716761917946114, |
| "learning_rate": 3.391739610081568e-05, |
| "loss": 0.1611, |
| "mean_token_accuracy": 0.946347177028656, |
| "step": 4855 |
| }, |
| { |
| "epoch": 1.8701173754088898, |
| "grad_norm": 0.10643801155332201, |
| "learning_rate": 3.388116500486268e-05, |
| "loss": 0.1594, |
| "mean_token_accuracy": 0.9463170647621155, |
| "step": 4860 |
| }, |
| { |
| "epoch": 1.8720415624398692, |
| "grad_norm": 0.09878880152730199, |
| "learning_rate": 3.3844915906500426e-05, |
| "loss": 0.1588, |
| "mean_token_accuracy": 0.9468757033348083, |
| "step": 4865 |
| }, |
| { |
| "epoch": 1.8739657494708486, |
| "grad_norm": 0.10644131887426493, |
| "learning_rate": 3.380864890799411e-05, |
| "loss": 0.1605, |
| "mean_token_accuracy": 0.9463419139385223, |
| "step": 4870 |
| }, |
| { |
| "epoch": 1.875889936501828, |
| "grad_norm": 0.10976722626462512, |
| "learning_rate": 3.3772364111659444e-05, |
| "loss": 0.1581, |
| "mean_token_accuracy": 0.9467686593532563, |
| "step": 4875 |
| }, |
| { |
| "epoch": 1.8778141235328074, |
| "grad_norm": 0.09598048657221817, |
| "learning_rate": 3.373606161986231e-05, |
| "loss": 0.159, |
| "mean_token_accuracy": 0.9466426372528076, |
| "step": 4880 |
| }, |
| { |
| "epoch": 1.8797383105637868, |
| "grad_norm": 0.09662806316650543, |
| "learning_rate": 3.369974153501857e-05, |
| "loss": 0.1608, |
| "mean_token_accuracy": 0.9462577164173126, |
| "step": 4885 |
| }, |
| { |
| "epoch": 1.8816624975947662, |
| "grad_norm": 0.09303829159925717, |
| "learning_rate": 3.3663403959593673e-05, |
| "loss": 0.1593, |
| "mean_token_accuracy": 0.9464956879615783, |
| "step": 4890 |
| }, |
| { |
| "epoch": 1.8835866846257456, |
| "grad_norm": 0.10133191319579814, |
| "learning_rate": 3.362704899610242e-05, |
| "loss": 0.1578, |
| "mean_token_accuracy": 0.9470635533332825, |
| "step": 4895 |
| }, |
| { |
| "epoch": 1.885510871656725, |
| "grad_norm": 0.10158166791887556, |
| "learning_rate": 3.3590676747108685e-05, |
| "loss": 0.1602, |
| "mean_token_accuracy": 0.9463416457176208, |
| "step": 4900 |
| }, |
| { |
| "epoch": 1.8874350586877044, |
| "grad_norm": 0.0977662785830903, |
| "learning_rate": 3.355428731522509e-05, |
| "loss": 0.1625, |
| "mean_token_accuracy": 0.9458343744277954, |
| "step": 4905 |
| }, |
| { |
| "epoch": 1.8893592457186839, |
| "grad_norm": 0.09788446984575432, |
| "learning_rate": 3.351788080311275e-05, |
| "loss": 0.1607, |
| "mean_token_accuracy": 0.9464067101478577, |
| "step": 4910 |
| }, |
| { |
| "epoch": 1.8912834327496633, |
| "grad_norm": 0.10214906792155744, |
| "learning_rate": 3.3481457313480934e-05, |
| "loss": 0.1604, |
| "mean_token_accuracy": 0.9464616537094116, |
| "step": 4915 |
| }, |
| { |
| "epoch": 1.8932076197806427, |
| "grad_norm": 0.10060544611170799, |
| "learning_rate": 3.344501694908686e-05, |
| "loss": 0.1596, |
| "mean_token_accuracy": 0.9465152502059937, |
| "step": 4920 |
| }, |
| { |
| "epoch": 1.895131806811622, |
| "grad_norm": 0.2794861014508182, |
| "learning_rate": 3.340855981273528e-05, |
| "loss": 0.1621, |
| "mean_token_accuracy": 0.9458632647991181, |
| "step": 4925 |
| }, |
| { |
| "epoch": 1.8970559938426015, |
| "grad_norm": 0.10406982278890324, |
| "learning_rate": 3.3372086007278344e-05, |
| "loss": 0.1603, |
| "mean_token_accuracy": 0.9462267875671386, |
| "step": 4930 |
| }, |
| { |
| "epoch": 1.898980180873581, |
| "grad_norm": 0.09761129864564562, |
| "learning_rate": 3.333559563561517e-05, |
| "loss": 0.1594, |
| "mean_token_accuracy": 0.9467118740081787, |
| "step": 4935 |
| }, |
| { |
| "epoch": 1.9009043679045603, |
| "grad_norm": 0.10085599183569154, |
| "learning_rate": 3.329908880069163e-05, |
| "loss": 0.1616, |
| "mean_token_accuracy": 0.9460000395774841, |
| "step": 4940 |
| }, |
| { |
| "epoch": 1.9028285549355397, |
| "grad_norm": 0.09683653725025748, |
| "learning_rate": 3.326256560550006e-05, |
| "loss": 0.1589, |
| "mean_token_accuracy": 0.9466755270957947, |
| "step": 4945 |
| }, |
| { |
| "epoch": 1.9047527419665191, |
| "grad_norm": 0.09342326037959806, |
| "learning_rate": 3.322602615307891e-05, |
| "loss": 0.1593, |
| "mean_token_accuracy": 0.9468986928462982, |
| "step": 4950 |
| }, |
| { |
| "epoch": 1.9066769289974985, |
| "grad_norm": 0.10555195157867583, |
| "learning_rate": 3.318947054651254e-05, |
| "loss": 0.1581, |
| "mean_token_accuracy": 0.9468642890453338, |
| "step": 4955 |
| }, |
| { |
| "epoch": 1.908601116028478, |
| "grad_norm": 0.09511827856569342, |
| "learning_rate": 3.315289888893085e-05, |
| "loss": 0.1607, |
| "mean_token_accuracy": 0.9460481464862823, |
| "step": 4960 |
| }, |
| { |
| "epoch": 1.9105253030594573, |
| "grad_norm": 0.0961044811313041, |
| "learning_rate": 3.3116311283509046e-05, |
| "loss": 0.1615, |
| "mean_token_accuracy": 0.945965725183487, |
| "step": 4965 |
| }, |
| { |
| "epoch": 1.9124494900904367, |
| "grad_norm": 0.09474214908651583, |
| "learning_rate": 3.30797078334673e-05, |
| "loss": 0.1602, |
| "mean_token_accuracy": 0.9464991927146912, |
| "step": 4970 |
| }, |
| { |
| "epoch": 1.9143736771214162, |
| "grad_norm": 0.09640519175928383, |
| "learning_rate": 3.304308864207052e-05, |
| "loss": 0.158, |
| "mean_token_accuracy": 0.9469054996967315, |
| "step": 4975 |
| }, |
| { |
| "epoch": 1.9162978641523956, |
| "grad_norm": 0.09736860178647218, |
| "learning_rate": 3.300645381262798e-05, |
| "loss": 0.1595, |
| "mean_token_accuracy": 0.9467026233673096, |
| "step": 4980 |
| }, |
| { |
| "epoch": 1.918222051183375, |
| "grad_norm": 0.10031238266022684, |
| "learning_rate": 3.2969803448493116e-05, |
| "loss": 0.1627, |
| "mean_token_accuracy": 0.9458101212978363, |
| "step": 4985 |
| }, |
| { |
| "epoch": 1.9201462382143544, |
| "grad_norm": 0.09654538040286698, |
| "learning_rate": 3.2933137653063154e-05, |
| "loss": 0.1602, |
| "mean_token_accuracy": 0.9466418564319611, |
| "step": 4990 |
| }, |
| { |
| "epoch": 1.9220704252453338, |
| "grad_norm": 0.09793397852312749, |
| "learning_rate": 3.289645652977888e-05, |
| "loss": 0.1592, |
| "mean_token_accuracy": 0.9470449328422547, |
| "step": 4995 |
| }, |
| { |
| "epoch": 1.9239946122763132, |
| "grad_norm": 0.09742505251785624, |
| "learning_rate": 3.285976018212429e-05, |
| "loss": 0.1587, |
| "mean_token_accuracy": 0.9468475997447967, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.9259187993072926, |
| "grad_norm": 0.10083861652857352, |
| "learning_rate": 3.2823048713626395e-05, |
| "loss": 0.161, |
| "mean_token_accuracy": 0.945935195684433, |
| "step": 5005 |
| }, |
| { |
| "epoch": 1.927842986338272, |
| "grad_norm": 0.09563911338044254, |
| "learning_rate": 3.278632222785478e-05, |
| "loss": 0.1598, |
| "mean_token_accuracy": 0.9465391278266907, |
| "step": 5010 |
| }, |
| { |
| "epoch": 1.9297671733692514, |
| "grad_norm": 0.09947135052042433, |
| "learning_rate": 3.274958082842145e-05, |
| "loss": 0.1594, |
| "mean_token_accuracy": 0.9466078162193299, |
| "step": 5015 |
| }, |
| { |
| "epoch": 1.9316913604002308, |
| "grad_norm": 0.09939040105234527, |
| "learning_rate": 3.271282461898049e-05, |
| "loss": 0.158, |
| "mean_token_accuracy": 0.9472302615642547, |
| "step": 5020 |
| }, |
| { |
| "epoch": 1.9336155474312102, |
| "grad_norm": 0.10048369508357508, |
| "learning_rate": 3.267605370322773e-05, |
| "loss": 0.161, |
| "mean_token_accuracy": 0.9461906492710114, |
| "step": 5025 |
| }, |
| { |
| "epoch": 1.9355397344621896, |
| "grad_norm": 0.09790767485398483, |
| "learning_rate": 3.2639268184900506e-05, |
| "loss": 0.1574, |
| "mean_token_accuracy": 0.9472224354743958, |
| "step": 5030 |
| }, |
| { |
| "epoch": 1.937463921493169, |
| "grad_norm": 0.09849370603158782, |
| "learning_rate": 3.260246816777737e-05, |
| "loss": 0.1563, |
| "mean_token_accuracy": 0.9475186169147491, |
| "step": 5035 |
| }, |
| { |
| "epoch": 1.9393881085241484, |
| "grad_norm": 0.0980768828044851, |
| "learning_rate": 3.256565375567776e-05, |
| "loss": 0.1584, |
| "mean_token_accuracy": 0.9468068718910218, |
| "step": 5040 |
| }, |
| { |
| "epoch": 1.9413122955551279, |
| "grad_norm": 0.10480453762445419, |
| "learning_rate": 3.252882505246171e-05, |
| "loss": 0.1624, |
| "mean_token_accuracy": 0.945637184381485, |
| "step": 5045 |
| }, |
| { |
| "epoch": 1.9432364825861073, |
| "grad_norm": 0.10616535211196816, |
| "learning_rate": 3.24919821620296e-05, |
| "loss": 0.1605, |
| "mean_token_accuracy": 0.9462700366973877, |
| "step": 5050 |
| }, |
| { |
| "epoch": 1.9451606696170867, |
| "grad_norm": 0.09894956259060722, |
| "learning_rate": 3.2455125188321806e-05, |
| "loss": 0.1599, |
| "mean_token_accuracy": 0.9463880658149719, |
| "step": 5055 |
| }, |
| { |
| "epoch": 1.947084856648066, |
| "grad_norm": 0.09224677772865261, |
| "learning_rate": 3.2418254235318474e-05, |
| "loss": 0.1578, |
| "mean_token_accuracy": 0.947035801410675, |
| "step": 5060 |
| }, |
| { |
| "epoch": 1.9490090436790455, |
| "grad_norm": 0.09838850162687025, |
| "learning_rate": 3.238136940703915e-05, |
| "loss": 0.1578, |
| "mean_token_accuracy": 0.9472411751747132, |
| "step": 5065 |
| }, |
| { |
| "epoch": 1.950933230710025, |
| "grad_norm": 0.09555416511064461, |
| "learning_rate": 3.234447080754255e-05, |
| "loss": 0.1574, |
| "mean_token_accuracy": 0.9474334597587586, |
| "step": 5070 |
| }, |
| { |
| "epoch": 1.9528574177410043, |
| "grad_norm": 0.09801635064811252, |
| "learning_rate": 3.230755854092622e-05, |
| "loss": 0.1597, |
| "mean_token_accuracy": 0.9464401781558991, |
| "step": 5075 |
| }, |
| { |
| "epoch": 1.9547816047719837, |
| "grad_norm": 0.09373235655008649, |
| "learning_rate": 3.2270632711326285e-05, |
| "loss": 0.1619, |
| "mean_token_accuracy": 0.9457164108753204, |
| "step": 5080 |
| }, |
| { |
| "epoch": 1.9567057918029631, |
| "grad_norm": 0.09348989261398567, |
| "learning_rate": 3.223369342291711e-05, |
| "loss": 0.1572, |
| "mean_token_accuracy": 0.9472990691661834, |
| "step": 5085 |
| }, |
| { |
| "epoch": 1.9586299788339425, |
| "grad_norm": 0.10418420549716748, |
| "learning_rate": 3.2196740779911054e-05, |
| "loss": 0.1596, |
| "mean_token_accuracy": 0.9464613854885101, |
| "step": 5090 |
| }, |
| { |
| "epoch": 1.960554165864922, |
| "grad_norm": 0.09745844772641116, |
| "learning_rate": 3.215977488655814e-05, |
| "loss": 0.1581, |
| "mean_token_accuracy": 0.9471061944961547, |
| "step": 5095 |
| }, |
| { |
| "epoch": 1.9624783528959013, |
| "grad_norm": 0.0992049012870913, |
| "learning_rate": 3.212279584714577e-05, |
| "loss": 0.1618, |
| "mean_token_accuracy": 0.9458459138870239, |
| "step": 5100 |
| }, |
| { |
| "epoch": 1.9644025399268807, |
| "grad_norm": 0.105459848856873, |
| "learning_rate": 3.2085803765998435e-05, |
| "loss": 0.1593, |
| "mean_token_accuracy": 0.9468812644481659, |
| "step": 5105 |
| }, |
| { |
| "epoch": 1.9663267269578601, |
| "grad_norm": 0.09767862155081931, |
| "learning_rate": 3.204879874747743e-05, |
| "loss": 0.1618, |
| "mean_token_accuracy": 0.9461267232894898, |
| "step": 5110 |
| }, |
| { |
| "epoch": 1.9682509139888396, |
| "grad_norm": 0.09507934172125736, |
| "learning_rate": 3.201178089598053e-05, |
| "loss": 0.1617, |
| "mean_token_accuracy": 0.9458625495433808, |
| "step": 5115 |
| }, |
| { |
| "epoch": 1.970175101019819, |
| "grad_norm": 0.0974695929044688, |
| "learning_rate": 3.1974750315941725e-05, |
| "loss": 0.1552, |
| "mean_token_accuracy": 0.9478743553161622, |
| "step": 5120 |
| }, |
| { |
| "epoch": 1.9720992880507984, |
| "grad_norm": 0.10126002440399436, |
| "learning_rate": 3.193770711183092e-05, |
| "loss": 0.1598, |
| "mean_token_accuracy": 0.9465251445770264, |
| "step": 5125 |
| }, |
| { |
| "epoch": 1.974023475081778, |
| "grad_norm": 0.09827018057592911, |
| "learning_rate": 3.1900651388153604e-05, |
| "loss": 0.1602, |
| "mean_token_accuracy": 0.9464419603347778, |
| "step": 5130 |
| }, |
| { |
| "epoch": 1.9759476621127574, |
| "grad_norm": 0.09528586355201614, |
| "learning_rate": 3.1863583249450645e-05, |
| "loss": 0.161, |
| "mean_token_accuracy": 0.9459393203258515, |
| "step": 5135 |
| }, |
| { |
| "epoch": 1.9778718491437368, |
| "grad_norm": 0.09611110064857509, |
| "learning_rate": 3.182650280029788e-05, |
| "loss": 0.1613, |
| "mean_token_accuracy": 0.9462891340255737, |
| "step": 5140 |
| }, |
| { |
| "epoch": 1.9797960361747162, |
| "grad_norm": 0.09725057007008812, |
| "learning_rate": 3.17894101453059e-05, |
| "loss": 0.1607, |
| "mean_token_accuracy": 0.9462293803691864, |
| "step": 5145 |
| }, |
| { |
| "epoch": 1.9817202232056956, |
| "grad_norm": 0.09214072277376484, |
| "learning_rate": 3.175230538911972e-05, |
| "loss": 0.1599, |
| "mean_token_accuracy": 0.946392560005188, |
| "step": 5150 |
| }, |
| { |
| "epoch": 1.983644410236675, |
| "grad_norm": 0.09804047929466903, |
| "learning_rate": 3.171518863641852e-05, |
| "loss": 0.16, |
| "mean_token_accuracy": 0.9464940249919891, |
| "step": 5155 |
| }, |
| { |
| "epoch": 1.9855685972676544, |
| "grad_norm": 0.09701178740625536, |
| "learning_rate": 3.167805999191528e-05, |
| "loss": 0.1586, |
| "mean_token_accuracy": 0.9466553747653961, |
| "step": 5160 |
| }, |
| { |
| "epoch": 1.9874927842986339, |
| "grad_norm": 0.09811194237130026, |
| "learning_rate": 3.164091956035659e-05, |
| "loss": 0.16, |
| "mean_token_accuracy": 0.9464266657829284, |
| "step": 5165 |
| }, |
| { |
| "epoch": 1.9894169713296133, |
| "grad_norm": 0.09316358619416497, |
| "learning_rate": 3.1603767446522234e-05, |
| "loss": 0.156, |
| "mean_token_accuracy": 0.9476395964622497, |
| "step": 5170 |
| }, |
| { |
| "epoch": 1.9913411583605927, |
| "grad_norm": 0.10022414149683, |
| "learning_rate": 3.1566603755224976e-05, |
| "loss": 0.1593, |
| "mean_token_accuracy": 0.9465124249458313, |
| "step": 5175 |
| }, |
| { |
| "epoch": 1.993265345391572, |
| "grad_norm": 0.09865479422492948, |
| "learning_rate": 3.152942859131026e-05, |
| "loss": 0.1599, |
| "mean_token_accuracy": 0.9463797211647034, |
| "step": 5180 |
| }, |
| { |
| "epoch": 1.9951895324225515, |
| "grad_norm": 0.10300294525826155, |
| "learning_rate": 3.149224205965587e-05, |
| "loss": 0.161, |
| "mean_token_accuracy": 0.9461157321929932, |
| "step": 5185 |
| }, |
| { |
| "epoch": 1.9971137194535309, |
| "grad_norm": 0.09814417031680028, |
| "learning_rate": 3.145504426517168e-05, |
| "loss": 0.1586, |
| "mean_token_accuracy": 0.9469424188137054, |
| "step": 5190 |
| }, |
| { |
| "epoch": 1.9990379064845103, |
| "grad_norm": 0.09283170373290625, |
| "learning_rate": 3.141783531279932e-05, |
| "loss": 0.1589, |
| "mean_token_accuracy": 0.9467655956745148, |
| "step": 5195 |
| }, |
| { |
| "epoch": 2.0007696748123918, |
| "grad_norm": 0.0979217349593047, |
| "learning_rate": 3.138061530751193e-05, |
| "loss": 0.1461, |
| "mean_token_accuracy": 0.950280037191179, |
| "step": 5200 |
| }, |
| { |
| "epoch": 2.002693861843371, |
| "grad_norm": 0.10634331349231106, |
| "learning_rate": 3.134338435431377e-05, |
| "loss": 0.1306, |
| "mean_token_accuracy": 0.954954308271408, |
| "step": 5205 |
| }, |
| { |
| "epoch": 2.0046180488743506, |
| "grad_norm": 0.10679220723686542, |
| "learning_rate": 3.130614255824006e-05, |
| "loss": 0.1318, |
| "mean_token_accuracy": 0.9545697510242462, |
| "step": 5210 |
| }, |
| { |
| "epoch": 2.00654223590533, |
| "grad_norm": 0.1056994101202361, |
| "learning_rate": 3.1268890024356575e-05, |
| "loss": 0.1316, |
| "mean_token_accuracy": 0.9546972990036011, |
| "step": 5215 |
| }, |
| { |
| "epoch": 2.0084664229363094, |
| "grad_norm": 0.1074679315159629, |
| "learning_rate": 3.1231626857759365e-05, |
| "loss": 0.1279, |
| "mean_token_accuracy": 0.9559260845184326, |
| "step": 5220 |
| }, |
| { |
| "epoch": 2.010390609967289, |
| "grad_norm": 0.3393926179404455, |
| "learning_rate": 3.119435316357451e-05, |
| "loss": 0.1315, |
| "mean_token_accuracy": 0.9549328625202179, |
| "step": 5225 |
| }, |
| { |
| "epoch": 2.012314796998268, |
| "grad_norm": 0.10947564562909612, |
| "learning_rate": 3.115706904695778e-05, |
| "loss": 0.1298, |
| "mean_token_accuracy": 0.955171936750412, |
| "step": 5230 |
| }, |
| { |
| "epoch": 2.0142389840292476, |
| "grad_norm": 0.10517939325961964, |
| "learning_rate": 3.1119774613094335e-05, |
| "loss": 0.1314, |
| "mean_token_accuracy": 0.9554017126560211, |
| "step": 5235 |
| }, |
| { |
| "epoch": 2.016163171060227, |
| "grad_norm": 0.116200055131795, |
| "learning_rate": 3.1082469967198457e-05, |
| "loss": 0.1261, |
| "mean_token_accuracy": 0.9565369427204132, |
| "step": 5240 |
| }, |
| { |
| "epoch": 2.0180873580912064, |
| "grad_norm": 0.10526597846445619, |
| "learning_rate": 3.104515521451323e-05, |
| "loss": 0.1295, |
| "mean_token_accuracy": 0.955550628900528, |
| "step": 5245 |
| }, |
| { |
| "epoch": 2.020011545122186, |
| "grad_norm": 0.11016870301187903, |
| "learning_rate": 3.1007830460310264e-05, |
| "loss": 0.1301, |
| "mean_token_accuracy": 0.9552641570568084, |
| "step": 5250 |
| }, |
| { |
| "epoch": 2.0219357321531652, |
| "grad_norm": 0.10511615738632486, |
| "learning_rate": 3.097049580988935e-05, |
| "loss": 0.129, |
| "mean_token_accuracy": 0.9556998193264008, |
| "step": 5255 |
| }, |
| { |
| "epoch": 2.0238599191841447, |
| "grad_norm": 0.10691090435399411, |
| "learning_rate": 3.093315136857825e-05, |
| "loss": 0.1291, |
| "mean_token_accuracy": 0.9555999100208282, |
| "step": 5260 |
| }, |
| { |
| "epoch": 2.025784106215124, |
| "grad_norm": 0.15292956488636977, |
| "learning_rate": 3.089579724173231e-05, |
| "loss": 0.1279, |
| "mean_token_accuracy": 0.9560263872146606, |
| "step": 5265 |
| }, |
| { |
| "epoch": 2.0277082932461035, |
| "grad_norm": 0.10684680779913877, |
| "learning_rate": 3.085843353473421e-05, |
| "loss": 0.1298, |
| "mean_token_accuracy": 0.9552083313465118, |
| "step": 5270 |
| }, |
| { |
| "epoch": 2.029632480277083, |
| "grad_norm": 0.58474037537279, |
| "learning_rate": 3.082106035299366e-05, |
| "loss": 0.1292, |
| "mean_token_accuracy": 0.9555357456207275, |
| "step": 5275 |
| }, |
| { |
| "epoch": 2.0315566673080623, |
| "grad_norm": 0.1065973145069723, |
| "learning_rate": 3.07836778019471e-05, |
| "loss": 0.1318, |
| "mean_token_accuracy": 0.9546404302120208, |
| "step": 5280 |
| }, |
| { |
| "epoch": 2.0334808543390417, |
| "grad_norm": 0.10442064704720966, |
| "learning_rate": 3.07462859870574e-05, |
| "loss": 0.1315, |
| "mean_token_accuracy": 0.9547493577003479, |
| "step": 5285 |
| }, |
| { |
| "epoch": 2.035405041370021, |
| "grad_norm": 0.10873826596804126, |
| "learning_rate": 3.070888501381357e-05, |
| "loss": 0.1298, |
| "mean_token_accuracy": 0.9552703380584717, |
| "step": 5290 |
| }, |
| { |
| "epoch": 2.0373292284010005, |
| "grad_norm": 0.10357965628977037, |
| "learning_rate": 3.067147498773045e-05, |
| "loss": 0.1304, |
| "mean_token_accuracy": 0.9551702678203583, |
| "step": 5295 |
| }, |
| { |
| "epoch": 2.03925341543198, |
| "grad_norm": 0.1074588597649682, |
| "learning_rate": 3.063405601434841e-05, |
| "loss": 0.1274, |
| "mean_token_accuracy": 0.9560691356658936, |
| "step": 5300 |
| }, |
| { |
| "epoch": 2.0411776024629593, |
| "grad_norm": 0.10911825659535652, |
| "learning_rate": 3.059662819923311e-05, |
| "loss": 0.1283, |
| "mean_token_accuracy": 0.9555826544761657, |
| "step": 5305 |
| }, |
| { |
| "epoch": 2.0431017894939387, |
| "grad_norm": 0.10108948948238519, |
| "learning_rate": 3.0559191647975074e-05, |
| "loss": 0.1313, |
| "mean_token_accuracy": 0.9549266755580902, |
| "step": 5310 |
| }, |
| { |
| "epoch": 2.045025976524918, |
| "grad_norm": 0.1071244864350153, |
| "learning_rate": 3.052174646618956e-05, |
| "loss": 0.1306, |
| "mean_token_accuracy": 0.9551376879215241, |
| "step": 5315 |
| }, |
| { |
| "epoch": 2.0469501635558975, |
| "grad_norm": 0.11039730911750427, |
| "learning_rate": 3.0484292759516104e-05, |
| "loss": 0.1309, |
| "mean_token_accuracy": 0.954924464225769, |
| "step": 5320 |
| }, |
| { |
| "epoch": 2.048874350586877, |
| "grad_norm": 0.10763586909319262, |
| "learning_rate": 3.0446830633618334e-05, |
| "loss": 0.1293, |
| "mean_token_accuracy": 0.9557511568069458, |
| "step": 5325 |
| }, |
| { |
| "epoch": 2.0507985376178564, |
| "grad_norm": 0.11038708625650738, |
| "learning_rate": 3.0409360194183605e-05, |
| "loss": 0.1325, |
| "mean_token_accuracy": 0.9542287409305572, |
| "step": 5330 |
| }, |
| { |
| "epoch": 2.0527227246488358, |
| "grad_norm": 0.10981688615939526, |
| "learning_rate": 3.0371881546922748e-05, |
| "loss": 0.1309, |
| "mean_token_accuracy": 0.9547199666500091, |
| "step": 5335 |
| }, |
| { |
| "epoch": 2.054646911679815, |
| "grad_norm": 0.10796682056039919, |
| "learning_rate": 3.0334394797569725e-05, |
| "loss": 0.1316, |
| "mean_token_accuracy": 0.954637098312378, |
| "step": 5340 |
| }, |
| { |
| "epoch": 2.0565710987107946, |
| "grad_norm": 0.11915701134432091, |
| "learning_rate": 3.029690005188139e-05, |
| "loss": 0.1296, |
| "mean_token_accuracy": 0.9551791489124298, |
| "step": 5345 |
| }, |
| { |
| "epoch": 2.058495285741774, |
| "grad_norm": 0.11569250091945521, |
| "learning_rate": 3.0259397415637114e-05, |
| "loss": 0.128, |
| "mean_token_accuracy": 0.9559932589530945, |
| "step": 5350 |
| }, |
| { |
| "epoch": 2.0604194727727534, |
| "grad_norm": 0.10436449803415335, |
| "learning_rate": 3.0221886994638567e-05, |
| "loss": 0.1279, |
| "mean_token_accuracy": 0.9558859884738922, |
| "step": 5355 |
| }, |
| { |
| "epoch": 2.062343659803733, |
| "grad_norm": 0.10517908575815679, |
| "learning_rate": 3.0184368894709343e-05, |
| "loss": 0.1314, |
| "mean_token_accuracy": 0.9548024773597718, |
| "step": 5360 |
| }, |
| { |
| "epoch": 2.064267846834712, |
| "grad_norm": 0.13334566474512122, |
| "learning_rate": 3.014684322169474e-05, |
| "loss": 0.13, |
| "mean_token_accuracy": 0.9551829278469086, |
| "step": 5365 |
| }, |
| { |
| "epoch": 2.0661920338656916, |
| "grad_norm": 0.291710679915826, |
| "learning_rate": 3.0109310081461405e-05, |
| "loss": 0.1302, |
| "mean_token_accuracy": 0.9552090466022491, |
| "step": 5370 |
| }, |
| { |
| "epoch": 2.068116220896671, |
| "grad_norm": 0.11518704261619796, |
| "learning_rate": 3.007176957989703e-05, |
| "loss": 0.1281, |
| "mean_token_accuracy": 0.9560563921928406, |
| "step": 5375 |
| }, |
| { |
| "epoch": 2.0700404079276504, |
| "grad_norm": 0.10943743689817875, |
| "learning_rate": 3.0034221822910108e-05, |
| "loss": 0.1296, |
| "mean_token_accuracy": 0.9554878771305084, |
| "step": 5380 |
| }, |
| { |
| "epoch": 2.07196459495863, |
| "grad_norm": 0.10570008629104946, |
| "learning_rate": 2.9996666916429578e-05, |
| "loss": 0.1318, |
| "mean_token_accuracy": 0.954410445690155, |
| "step": 5385 |
| }, |
| { |
| "epoch": 2.0738887819896092, |
| "grad_norm": 0.11138483865068619, |
| "learning_rate": 2.9959104966404562e-05, |
| "loss": 0.1285, |
| "mean_token_accuracy": 0.9557713150978089, |
| "step": 5390 |
| }, |
| { |
| "epoch": 2.0758129690205886, |
| "grad_norm": 0.11771707950321599, |
| "learning_rate": 2.9921536078804042e-05, |
| "loss": 0.1267, |
| "mean_token_accuracy": 0.9561442255973815, |
| "step": 5395 |
| }, |
| { |
| "epoch": 2.077737156051568, |
| "grad_norm": 0.12294598274945921, |
| "learning_rate": 2.9883960359616587e-05, |
| "loss": 0.1301, |
| "mean_token_accuracy": 0.9550639927387238, |
| "step": 5400 |
| }, |
| { |
| "epoch": 2.0796613430825475, |
| "grad_norm": 0.11316747482591458, |
| "learning_rate": 2.984637791485001e-05, |
| "loss": 0.1308, |
| "mean_token_accuracy": 0.9548614978790283, |
| "step": 5405 |
| }, |
| { |
| "epoch": 2.081585530113527, |
| "grad_norm": 0.10864043023159166, |
| "learning_rate": 2.9808788850531145e-05, |
| "loss": 0.1318, |
| "mean_token_accuracy": 0.9546567440032959, |
| "step": 5410 |
| }, |
| { |
| "epoch": 2.0835097171445063, |
| "grad_norm": 0.10886707239628111, |
| "learning_rate": 2.9771193272705454e-05, |
| "loss": 0.1308, |
| "mean_token_accuracy": 0.9549291670322418, |
| "step": 5415 |
| }, |
| { |
| "epoch": 2.0854339041754857, |
| "grad_norm": 0.10824866644582333, |
| "learning_rate": 2.9733591287436807e-05, |
| "loss": 0.1286, |
| "mean_token_accuracy": 0.9558004319667817, |
| "step": 5420 |
| }, |
| { |
| "epoch": 2.087358091206465, |
| "grad_norm": 0.10670212406836654, |
| "learning_rate": 2.9695983000807133e-05, |
| "loss": 0.1281, |
| "mean_token_accuracy": 0.9559083044528961, |
| "step": 5425 |
| }, |
| { |
| "epoch": 2.0892822782374445, |
| "grad_norm": 0.10772997897188348, |
| "learning_rate": 2.965836851891614e-05, |
| "loss": 0.1316, |
| "mean_token_accuracy": 0.9547219812870026, |
| "step": 5430 |
| }, |
| { |
| "epoch": 2.091206465268424, |
| "grad_norm": 0.10726583715210378, |
| "learning_rate": 2.9620747947881016e-05, |
| "loss": 0.1302, |
| "mean_token_accuracy": 0.9550222814083099, |
| "step": 5435 |
| }, |
| { |
| "epoch": 2.0931306522994033, |
| "grad_norm": 0.10923784314271137, |
| "learning_rate": 2.958312139383615e-05, |
| "loss": 0.1305, |
| "mean_token_accuracy": 0.9548932433128356, |
| "step": 5440 |
| }, |
| { |
| "epoch": 2.0950548393303827, |
| "grad_norm": 0.1149543715362021, |
| "learning_rate": 2.9545488962932764e-05, |
| "loss": 0.1299, |
| "mean_token_accuracy": 0.9552010416984558, |
| "step": 5445 |
| }, |
| { |
| "epoch": 2.096979026361362, |
| "grad_norm": 0.10556163043636207, |
| "learning_rate": 2.9507850761338694e-05, |
| "loss": 0.1285, |
| "mean_token_accuracy": 0.9556468963623047, |
| "step": 5450 |
| }, |
| { |
| "epoch": 2.0989032133923415, |
| "grad_norm": 0.1105578089091931, |
| "learning_rate": 2.947020689523806e-05, |
| "loss": 0.1304, |
| "mean_token_accuracy": 0.9551441192626953, |
| "step": 5455 |
| }, |
| { |
| "epoch": 2.100827400423321, |
| "grad_norm": 0.10875601486403272, |
| "learning_rate": 2.943255747083093e-05, |
| "loss": 0.1311, |
| "mean_token_accuracy": 0.9547959864139557, |
| "step": 5460 |
| }, |
| { |
| "epoch": 2.1027515874543004, |
| "grad_norm": 0.11107473151673773, |
| "learning_rate": 2.939490259433308e-05, |
| "loss": 0.1295, |
| "mean_token_accuracy": 0.955434012413025, |
| "step": 5465 |
| }, |
| { |
| "epoch": 2.1046757744852798, |
| "grad_norm": 0.10500369274811092, |
| "learning_rate": 2.9357242371975663e-05, |
| "loss": 0.1289, |
| "mean_token_accuracy": 0.9555002927780152, |
| "step": 5470 |
| }, |
| { |
| "epoch": 2.106599961516259, |
| "grad_norm": 0.11034845372006379, |
| "learning_rate": 2.9319576910004908e-05, |
| "loss": 0.1297, |
| "mean_token_accuracy": 0.955095624923706, |
| "step": 5475 |
| }, |
| { |
| "epoch": 2.1085241485472386, |
| "grad_norm": 0.1062967363159288, |
| "learning_rate": 2.9281906314681828e-05, |
| "loss": 0.1282, |
| "mean_token_accuracy": 0.9557537794113159, |
| "step": 5480 |
| }, |
| { |
| "epoch": 2.1104483355782184, |
| "grad_norm": 0.10818146503538062, |
| "learning_rate": 2.9244230692281928e-05, |
| "loss": 0.1315, |
| "mean_token_accuracy": 0.9547770500183106, |
| "step": 5485 |
| }, |
| { |
| "epoch": 2.1123725226091974, |
| "grad_norm": 0.15693716675982816, |
| "learning_rate": 2.920655014909487e-05, |
| "loss": 0.1281, |
| "mean_token_accuracy": 0.9557695984840393, |
| "step": 5490 |
| }, |
| { |
| "epoch": 2.1142967096401772, |
| "grad_norm": 0.10530890895897269, |
| "learning_rate": 2.916886479142422e-05, |
| "loss": 0.1299, |
| "mean_token_accuracy": 0.9554035186767578, |
| "step": 5495 |
| }, |
| { |
| "epoch": 2.1162208966711566, |
| "grad_norm": 0.11209369119736379, |
| "learning_rate": 2.9131174725587134e-05, |
| "loss": 0.1304, |
| "mean_token_accuracy": 0.9551724195480347, |
| "step": 5500 |
| }, |
| { |
| "epoch": 2.118145083702136, |
| "grad_norm": 0.10735945610865963, |
| "learning_rate": 2.9093480057914018e-05, |
| "loss": 0.1287, |
| "mean_token_accuracy": 0.9553767502307892, |
| "step": 5505 |
| }, |
| { |
| "epoch": 2.1200692707331155, |
| "grad_norm": 0.10427220007610355, |
| "learning_rate": 2.9055780894748284e-05, |
| "loss": 0.1297, |
| "mean_token_accuracy": 0.9552501380443573, |
| "step": 5510 |
| }, |
| { |
| "epoch": 2.121993457764095, |
| "grad_norm": 0.10587862731328973, |
| "learning_rate": 2.9018077342446042e-05, |
| "loss": 0.1296, |
| "mean_token_accuracy": 0.9554009735584259, |
| "step": 5515 |
| }, |
| { |
| "epoch": 2.1239176447950743, |
| "grad_norm": 0.09896574916048452, |
| "learning_rate": 2.8980369507375744e-05, |
| "loss": 0.1292, |
| "mean_token_accuracy": 0.9556243360042572, |
| "step": 5520 |
| }, |
| { |
| "epoch": 2.1258418318260537, |
| "grad_norm": 0.11066609082111911, |
| "learning_rate": 2.8942657495917945e-05, |
| "loss": 0.1301, |
| "mean_token_accuracy": 0.9550888121128083, |
| "step": 5525 |
| }, |
| { |
| "epoch": 2.127766018857033, |
| "grad_norm": 0.10443162080898229, |
| "learning_rate": 2.8904941414465002e-05, |
| "loss": 0.1303, |
| "mean_token_accuracy": 0.9551991522312164, |
| "step": 5530 |
| }, |
| { |
| "epoch": 2.1296902058880125, |
| "grad_norm": 0.10637238952626588, |
| "learning_rate": 2.8867221369420722e-05, |
| "loss": 0.1313, |
| "mean_token_accuracy": 0.9547699511051178, |
| "step": 5535 |
| }, |
| { |
| "epoch": 2.131614392918992, |
| "grad_norm": 0.10864780408653604, |
| "learning_rate": 2.8829497467200105e-05, |
| "loss": 0.1332, |
| "mean_token_accuracy": 0.9541658878326416, |
| "step": 5540 |
| }, |
| { |
| "epoch": 2.1335385799499713, |
| "grad_norm": 0.10480536707422429, |
| "learning_rate": 2.879176981422904e-05, |
| "loss": 0.1315, |
| "mean_token_accuracy": 0.9548263072967529, |
| "step": 5545 |
| }, |
| { |
| "epoch": 2.1354627669809507, |
| "grad_norm": 0.10840316140924505, |
| "learning_rate": 2.8754038516943988e-05, |
| "loss": 0.1299, |
| "mean_token_accuracy": 0.9551911413669586, |
| "step": 5550 |
| }, |
| { |
| "epoch": 2.13738695401193, |
| "grad_norm": 0.10511882631547212, |
| "learning_rate": 2.87163036817917e-05, |
| "loss": 0.1307, |
| "mean_token_accuracy": 0.9550678730010986, |
| "step": 5555 |
| }, |
| { |
| "epoch": 2.1393111410429095, |
| "grad_norm": 0.11303406026455419, |
| "learning_rate": 2.8678565415228915e-05, |
| "loss": 0.1295, |
| "mean_token_accuracy": 0.9555539965629578, |
| "step": 5560 |
| }, |
| { |
| "epoch": 2.141235328073889, |
| "grad_norm": 0.10462909368723101, |
| "learning_rate": 2.8640823823722013e-05, |
| "loss": 0.1317, |
| "mean_token_accuracy": 0.9547511637210846, |
| "step": 5565 |
| }, |
| { |
| "epoch": 2.1431595151048684, |
| "grad_norm": 0.10309809452782778, |
| "learning_rate": 2.8603079013746802e-05, |
| "loss": 0.1287, |
| "mean_token_accuracy": 0.9556194484233856, |
| "step": 5570 |
| }, |
| { |
| "epoch": 2.1450837021358478, |
| "grad_norm": 0.10589336408568385, |
| "learning_rate": 2.856533109178815e-05, |
| "loss": 0.1305, |
| "mean_token_accuracy": 0.9550436675548554, |
| "step": 5575 |
| }, |
| { |
| "epoch": 2.147007889166827, |
| "grad_norm": 0.10548353857019672, |
| "learning_rate": 2.8527580164339706e-05, |
| "loss": 0.1321, |
| "mean_token_accuracy": 0.9544054627418518, |
| "step": 5580 |
| }, |
| { |
| "epoch": 2.1489320761978066, |
| "grad_norm": 0.10686359661962178, |
| "learning_rate": 2.8489826337903585e-05, |
| "loss": 0.1313, |
| "mean_token_accuracy": 0.9546919703483582, |
| "step": 5585 |
| }, |
| { |
| "epoch": 2.150856263228786, |
| "grad_norm": 0.10848484445107079, |
| "learning_rate": 2.845206971899011e-05, |
| "loss": 0.1307, |
| "mean_token_accuracy": 0.9550032496452332, |
| "step": 5590 |
| }, |
| { |
| "epoch": 2.1527804502597654, |
| "grad_norm": 0.11059986768054421, |
| "learning_rate": 2.841431041411745e-05, |
| "loss": 0.1302, |
| "mean_token_accuracy": 0.9552673876285553, |
| "step": 5595 |
| }, |
| { |
| "epoch": 2.154704637290745, |
| "grad_norm": 0.1153169713302373, |
| "learning_rate": 2.8376548529811377e-05, |
| "loss": 0.1306, |
| "mean_token_accuracy": 0.9550434827804566, |
| "step": 5600 |
| }, |
| { |
| "epoch": 2.156628824321724, |
| "grad_norm": 0.10497406788532454, |
| "learning_rate": 2.8338784172604915e-05, |
| "loss": 0.1324, |
| "mean_token_accuracy": 0.9546684205532074, |
| "step": 5605 |
| }, |
| { |
| "epoch": 2.1585530113527036, |
| "grad_norm": 0.1105251857493249, |
| "learning_rate": 2.8301017449038087e-05, |
| "loss": 0.1327, |
| "mean_token_accuracy": 0.9545048475265503, |
| "step": 5610 |
| }, |
| { |
| "epoch": 2.160477198383683, |
| "grad_norm": 0.1080109687561676, |
| "learning_rate": 2.8263248465657595e-05, |
| "loss": 0.1286, |
| "mean_token_accuracy": 0.9556637406349182, |
| "step": 5615 |
| }, |
| { |
| "epoch": 2.1624013854146624, |
| "grad_norm": 0.10582265706415724, |
| "learning_rate": 2.8225477329016487e-05, |
| "loss": 0.1336, |
| "mean_token_accuracy": 0.9541392922401428, |
| "step": 5620 |
| }, |
| { |
| "epoch": 2.164325572445642, |
| "grad_norm": 0.10822890534554394, |
| "learning_rate": 2.8187704145673914e-05, |
| "loss": 0.1327, |
| "mean_token_accuracy": 0.9543117165565491, |
| "step": 5625 |
| }, |
| { |
| "epoch": 2.1662497594766212, |
| "grad_norm": 0.1062661018079552, |
| "learning_rate": 2.8149929022194783e-05, |
| "loss": 0.1294, |
| "mean_token_accuracy": 0.9552195489406585, |
| "step": 5630 |
| }, |
| { |
| "epoch": 2.1681739465076006, |
| "grad_norm": 0.10273705135295484, |
| "learning_rate": 2.81121520651495e-05, |
| "loss": 0.1303, |
| "mean_token_accuracy": 0.9550776362419129, |
| "step": 5635 |
| }, |
| { |
| "epoch": 2.17009813353858, |
| "grad_norm": 0.10648913106036861, |
| "learning_rate": 2.80743733811136e-05, |
| "loss": 0.1318, |
| "mean_token_accuracy": 0.9548194766044616, |
| "step": 5640 |
| }, |
| { |
| "epoch": 2.1720223205695595, |
| "grad_norm": 0.10344426917331391, |
| "learning_rate": 2.8036593076667533e-05, |
| "loss": 0.1295, |
| "mean_token_accuracy": 0.9556067049503326, |
| "step": 5645 |
| }, |
| { |
| "epoch": 2.173946507600539, |
| "grad_norm": 0.10956564255431846, |
| "learning_rate": 2.79988112583963e-05, |
| "loss": 0.1298, |
| "mean_token_accuracy": 0.9553233087062836, |
| "step": 5650 |
| }, |
| { |
| "epoch": 2.1758706946315183, |
| "grad_norm": 0.10202792432984834, |
| "learning_rate": 2.796102803288918e-05, |
| "loss": 0.1311, |
| "mean_token_accuracy": 0.9548551559448242, |
| "step": 5655 |
| }, |
| { |
| "epoch": 2.1777948816624977, |
| "grad_norm": 0.10763154625024812, |
| "learning_rate": 2.792324350673941e-05, |
| "loss": 0.1296, |
| "mean_token_accuracy": 0.9553223311901092, |
| "step": 5660 |
| }, |
| { |
| "epoch": 2.179719068693477, |
| "grad_norm": 0.10845788005478905, |
| "learning_rate": 2.7885457786543924e-05, |
| "loss": 0.1291, |
| "mean_token_accuracy": 0.9553823292255401, |
| "step": 5665 |
| }, |
| { |
| "epoch": 2.1816432557244565, |
| "grad_norm": 0.10586949851671838, |
| "learning_rate": 2.784767097890298e-05, |
| "loss": 0.1297, |
| "mean_token_accuracy": 0.955373477935791, |
| "step": 5670 |
| }, |
| { |
| "epoch": 2.183567442755436, |
| "grad_norm": 0.1072073809147426, |
| "learning_rate": 2.7809883190419945e-05, |
| "loss": 0.1315, |
| "mean_token_accuracy": 0.9549127817153931, |
| "step": 5675 |
| }, |
| { |
| "epoch": 2.1854916297864153, |
| "grad_norm": 0.10760418876402711, |
| "learning_rate": 2.777209452770093e-05, |
| "loss": 0.1323, |
| "mean_token_accuracy": 0.9545275568962097, |
| "step": 5680 |
| }, |
| { |
| "epoch": 2.1874158168173947, |
| "grad_norm": 0.12750165015325696, |
| "learning_rate": 2.7734305097354528e-05, |
| "loss": 0.1313, |
| "mean_token_accuracy": 0.9550306141376496, |
| "step": 5685 |
| }, |
| { |
| "epoch": 2.189340003848374, |
| "grad_norm": 0.10549599237818093, |
| "learning_rate": 2.7696515005991487e-05, |
| "loss": 0.1287, |
| "mean_token_accuracy": 0.9555880784988403, |
| "step": 5690 |
| }, |
| { |
| "epoch": 2.1912641908793535, |
| "grad_norm": 0.10416146043638136, |
| "learning_rate": 2.765872436022442e-05, |
| "loss": 0.1281, |
| "mean_token_accuracy": 0.9558050215244294, |
| "step": 5695 |
| }, |
| { |
| "epoch": 2.193188377910333, |
| "grad_norm": 0.11222982332531294, |
| "learning_rate": 2.7620933266667502e-05, |
| "loss": 0.1305, |
| "mean_token_accuracy": 0.954997855424881, |
| "step": 5700 |
| }, |
| { |
| "epoch": 2.1951125649413123, |
| "grad_norm": 0.10838434196926705, |
| "learning_rate": 2.7583141831936187e-05, |
| "loss": 0.1296, |
| "mean_token_accuracy": 0.9555193424224854, |
| "step": 5705 |
| }, |
| { |
| "epoch": 2.1970367519722918, |
| "grad_norm": 0.10364017631139727, |
| "learning_rate": 2.7545350162646882e-05, |
| "loss": 0.1274, |
| "mean_token_accuracy": 0.9559944987297058, |
| "step": 5710 |
| }, |
| { |
| "epoch": 2.198960939003271, |
| "grad_norm": 0.11070015056421295, |
| "learning_rate": 2.750755836541663e-05, |
| "loss": 0.1311, |
| "mean_token_accuracy": 0.9547731518745423, |
| "step": 5715 |
| }, |
| { |
| "epoch": 2.2008851260342506, |
| "grad_norm": 0.10907380677894396, |
| "learning_rate": 2.7469766546862878e-05, |
| "loss": 0.1305, |
| "mean_token_accuracy": 0.9551173925399781, |
| "step": 5720 |
| }, |
| { |
| "epoch": 2.20280931306523, |
| "grad_norm": 0.10698591041494747, |
| "learning_rate": 2.7431974813603128e-05, |
| "loss": 0.1276, |
| "mean_token_accuracy": 0.9559794843196869, |
| "step": 5725 |
| }, |
| { |
| "epoch": 2.2047335000962094, |
| "grad_norm": 0.10973257299820491, |
| "learning_rate": 2.7394183272254603e-05, |
| "loss": 0.1297, |
| "mean_token_accuracy": 0.9552995502948761, |
| "step": 5730 |
| }, |
| { |
| "epoch": 2.206657687127189, |
| "grad_norm": 0.10609738612776014, |
| "learning_rate": 2.7356392029434008e-05, |
| "loss": 0.1269, |
| "mean_token_accuracy": 0.9562586367130279, |
| "step": 5735 |
| }, |
| { |
| "epoch": 2.208581874158168, |
| "grad_norm": 0.10594911969855815, |
| "learning_rate": 2.7318601191757227e-05, |
| "loss": 0.1301, |
| "mean_token_accuracy": 0.9548856914043427, |
| "step": 5740 |
| }, |
| { |
| "epoch": 2.2105060611891476, |
| "grad_norm": 0.11002601673144265, |
| "learning_rate": 2.7280810865838968e-05, |
| "loss": 0.1313, |
| "mean_token_accuracy": 0.9547413468360901, |
| "step": 5745 |
| }, |
| { |
| "epoch": 2.212430248220127, |
| "grad_norm": 0.10815764932630612, |
| "learning_rate": 2.7243021158292508e-05, |
| "loss": 0.1315, |
| "mean_token_accuracy": 0.9547680258750916, |
| "step": 5750 |
| }, |
| { |
| "epoch": 2.2143544352511064, |
| "grad_norm": 0.10325292557809732, |
| "learning_rate": 2.7205232175729385e-05, |
| "loss": 0.1285, |
| "mean_token_accuracy": 0.9557075679302216, |
| "step": 5755 |
| }, |
| { |
| "epoch": 2.216278622282086, |
| "grad_norm": 0.10368790762039573, |
| "learning_rate": 2.7167444024759072e-05, |
| "loss": 0.1274, |
| "mean_token_accuracy": 0.9563073456287384, |
| "step": 5760 |
| }, |
| { |
| "epoch": 2.2182028093130652, |
| "grad_norm": 0.10585655474534729, |
| "learning_rate": 2.712965681198873e-05, |
| "loss": 0.1295, |
| "mean_token_accuracy": 0.9553906381130218, |
| "step": 5765 |
| }, |
| { |
| "epoch": 2.2201269963440446, |
| "grad_norm": 0.10760885515979707, |
| "learning_rate": 2.7091870644022848e-05, |
| "loss": 0.1313, |
| "mean_token_accuracy": 0.954918497800827, |
| "step": 5770 |
| }, |
| { |
| "epoch": 2.222051183375024, |
| "grad_norm": 0.10509001085051531, |
| "learning_rate": 2.7054085627462967e-05, |
| "loss": 0.1313, |
| "mean_token_accuracy": 0.954748409986496, |
| "step": 5775 |
| }, |
| { |
| "epoch": 2.2239753704060035, |
| "grad_norm": 0.10709947966640206, |
| "learning_rate": 2.701630186890738e-05, |
| "loss": 0.1302, |
| "mean_token_accuracy": 0.9552448928356171, |
| "step": 5780 |
| }, |
| { |
| "epoch": 2.225899557436983, |
| "grad_norm": 0.11355608152711724, |
| "learning_rate": 2.6978519474950853e-05, |
| "loss": 0.13, |
| "mean_token_accuracy": 0.9551321148872376, |
| "step": 5785 |
| }, |
| { |
| "epoch": 2.2278237444679623, |
| "grad_norm": 0.10715051929012634, |
| "learning_rate": 2.694073855218428e-05, |
| "loss": 0.1297, |
| "mean_token_accuracy": 0.9552355110645294, |
| "step": 5790 |
| }, |
| { |
| "epoch": 2.2297479314989417, |
| "grad_norm": 0.10240754149347893, |
| "learning_rate": 2.6902959207194394e-05, |
| "loss": 0.1306, |
| "mean_token_accuracy": 0.9551454603672027, |
| "step": 5795 |
| }, |
| { |
| "epoch": 2.231672118529921, |
| "grad_norm": 0.10684337470859812, |
| "learning_rate": 2.6865181546563516e-05, |
| "loss": 0.1323, |
| "mean_token_accuracy": 0.9545264899730682, |
| "step": 5800 |
| }, |
| { |
| "epoch": 2.2335963055609005, |
| "grad_norm": 0.11288874992338749, |
| "learning_rate": 2.682740567686918e-05, |
| "loss": 0.1291, |
| "mean_token_accuracy": 0.9554365873336792, |
| "step": 5805 |
| }, |
| { |
| "epoch": 2.23552049259188, |
| "grad_norm": 0.10965823833426662, |
| "learning_rate": 2.6789631704683887e-05, |
| "loss": 0.1277, |
| "mean_token_accuracy": 0.9561128377914428, |
| "step": 5810 |
| }, |
| { |
| "epoch": 2.2374446796228593, |
| "grad_norm": 0.10668910390319478, |
| "learning_rate": 2.6751859736574764e-05, |
| "loss": 0.1307, |
| "mean_token_accuracy": 0.9552412092685699, |
| "step": 5815 |
| }, |
| { |
| "epoch": 2.2393688666538387, |
| "grad_norm": 0.10517025682012639, |
| "learning_rate": 2.6714089879103304e-05, |
| "loss": 0.1315, |
| "mean_token_accuracy": 0.9549421489238739, |
| "step": 5820 |
| }, |
| { |
| "epoch": 2.241293053684818, |
| "grad_norm": 0.1086958023252, |
| "learning_rate": 2.667632223882504e-05, |
| "loss": 0.1314, |
| "mean_token_accuracy": 0.9544699192047119, |
| "step": 5825 |
| }, |
| { |
| "epoch": 2.2432172407157975, |
| "grad_norm": 0.10850082865160174, |
| "learning_rate": 2.6638556922289266e-05, |
| "loss": 0.1278, |
| "mean_token_accuracy": 0.9557575345039367, |
| "step": 5830 |
| }, |
| { |
| "epoch": 2.245141427746777, |
| "grad_norm": 0.11397149685095452, |
| "learning_rate": 2.660079403603867e-05, |
| "loss": 0.1298, |
| "mean_token_accuracy": 0.9550568044185639, |
| "step": 5835 |
| }, |
| { |
| "epoch": 2.2470656147777563, |
| "grad_norm": 0.10692811819225884, |
| "learning_rate": 2.6563033686609135e-05, |
| "loss": 0.1288, |
| "mean_token_accuracy": 0.95555659532547, |
| "step": 5840 |
| }, |
| { |
| "epoch": 2.2489898018087358, |
| "grad_norm": 0.10865015750464559, |
| "learning_rate": 2.6525275980529375e-05, |
| "loss": 0.1294, |
| "mean_token_accuracy": 0.9553133845329285, |
| "step": 5845 |
| }, |
| { |
| "epoch": 2.250913988839715, |
| "grad_norm": 0.10811587288138687, |
| "learning_rate": 2.648752102432062e-05, |
| "loss": 0.1308, |
| "mean_token_accuracy": 0.9549175322055816, |
| "step": 5850 |
| }, |
| { |
| "epoch": 2.2528381758706946, |
| "grad_norm": 0.11089053169125833, |
| "learning_rate": 2.6449768924496392e-05, |
| "loss": 0.1315, |
| "mean_token_accuracy": 0.9547737777233124, |
| "step": 5855 |
| }, |
| { |
| "epoch": 2.254762362901674, |
| "grad_norm": 0.11025011291342199, |
| "learning_rate": 2.6412019787562103e-05, |
| "loss": 0.1259, |
| "mean_token_accuracy": 0.9564937889575958, |
| "step": 5860 |
| }, |
| { |
| "epoch": 2.2566865499326534, |
| "grad_norm": 0.10938026792029695, |
| "learning_rate": 2.6374273720014836e-05, |
| "loss": 0.1322, |
| "mean_token_accuracy": 0.9544373095035553, |
| "step": 5865 |
| }, |
| { |
| "epoch": 2.258610736963633, |
| "grad_norm": 0.10214032067674882, |
| "learning_rate": 2.6336530828343e-05, |
| "loss": 0.1288, |
| "mean_token_accuracy": 0.9556487739086151, |
| "step": 5870 |
| }, |
| { |
| "epoch": 2.260534923994612, |
| "grad_norm": 0.10768313363292305, |
| "learning_rate": 2.629879121902607e-05, |
| "loss": 0.1291, |
| "mean_token_accuracy": 0.9554717481136322, |
| "step": 5875 |
| }, |
| { |
| "epoch": 2.2624591110255916, |
| "grad_norm": 0.1064398396196536, |
| "learning_rate": 2.6261054998534225e-05, |
| "loss": 0.1295, |
| "mean_token_accuracy": 0.9554791688919068, |
| "step": 5880 |
| }, |
| { |
| "epoch": 2.264383298056571, |
| "grad_norm": 0.10476910163839293, |
| "learning_rate": 2.62233222733281e-05, |
| "loss": 0.1295, |
| "mean_token_accuracy": 0.9552315890789032, |
| "step": 5885 |
| }, |
| { |
| "epoch": 2.2663074850875504, |
| "grad_norm": 0.10681177262629739, |
| "learning_rate": 2.6185593149858485e-05, |
| "loss": 0.1312, |
| "mean_token_accuracy": 0.9546931982040405, |
| "step": 5890 |
| }, |
| { |
| "epoch": 2.26823167211853, |
| "grad_norm": 0.11073979536465368, |
| "learning_rate": 2.614786773456599e-05, |
| "loss": 0.1279, |
| "mean_token_accuracy": 0.9559510767459869, |
| "step": 5895 |
| }, |
| { |
| "epoch": 2.2701558591495092, |
| "grad_norm": 0.10405240104670332, |
| "learning_rate": 2.611014613388075e-05, |
| "loss": 0.1288, |
| "mean_token_accuracy": 0.9558224618434906, |
| "step": 5900 |
| }, |
| { |
| "epoch": 2.2720800461804886, |
| "grad_norm": 0.1063203663396225, |
| "learning_rate": 2.6072428454222175e-05, |
| "loss": 0.1301, |
| "mean_token_accuracy": 0.9550588607788086, |
| "step": 5905 |
| }, |
| { |
| "epoch": 2.274004233211468, |
| "grad_norm": 0.10756387316528157, |
| "learning_rate": 2.603471480199859e-05, |
| "loss": 0.1313, |
| "mean_token_accuracy": 0.954735153913498, |
| "step": 5910 |
| }, |
| { |
| "epoch": 2.2759284202424475, |
| "grad_norm": 0.1036959392954559, |
| "learning_rate": 2.599700528360697e-05, |
| "loss": 0.13, |
| "mean_token_accuracy": 0.9552125930786133, |
| "step": 5915 |
| }, |
| { |
| "epoch": 2.277852607273427, |
| "grad_norm": 0.10582831579947122, |
| "learning_rate": 2.5959300005432596e-05, |
| "loss": 0.1316, |
| "mean_token_accuracy": 0.9549217462539673, |
| "step": 5920 |
| }, |
| { |
| "epoch": 2.2797767943044063, |
| "grad_norm": 0.10638865399029085, |
| "learning_rate": 2.5921599073848828e-05, |
| "loss": 0.129, |
| "mean_token_accuracy": 0.9555549681186676, |
| "step": 5925 |
| }, |
| { |
| "epoch": 2.2817009813353857, |
| "grad_norm": 0.1063432249936701, |
| "learning_rate": 2.5883902595216737e-05, |
| "loss": 0.1299, |
| "mean_token_accuracy": 0.9553889214992524, |
| "step": 5930 |
| }, |
| { |
| "epoch": 2.283625168366365, |
| "grad_norm": 0.11415249587430622, |
| "learning_rate": 2.584621067588486e-05, |
| "loss": 0.13, |
| "mean_token_accuracy": 0.9552380859851837, |
| "step": 5935 |
| }, |
| { |
| "epoch": 2.2855493553973445, |
| "grad_norm": 0.10559109236104772, |
| "learning_rate": 2.580852342218883e-05, |
| "loss": 0.1312, |
| "mean_token_accuracy": 0.9549641907215118, |
| "step": 5940 |
| }, |
| { |
| "epoch": 2.287473542428324, |
| "grad_norm": 0.11142315713399763, |
| "learning_rate": 2.5770840940451134e-05, |
| "loss": 0.1274, |
| "mean_token_accuracy": 0.9562214732170105, |
| "step": 5945 |
| }, |
| { |
| "epoch": 2.2893977294593033, |
| "grad_norm": 0.10625800042077924, |
| "learning_rate": 2.5733163336980825e-05, |
| "loss": 0.1282, |
| "mean_token_accuracy": 0.9559169769287109, |
| "step": 5950 |
| }, |
| { |
| "epoch": 2.2913219164902827, |
| "grad_norm": 0.10950350463698343, |
| "learning_rate": 2.5695490718073158e-05, |
| "loss": 0.1341, |
| "mean_token_accuracy": 0.9538970530033112, |
| "step": 5955 |
| }, |
| { |
| "epoch": 2.293246103521262, |
| "grad_norm": 0.10484188965016204, |
| "learning_rate": 2.5657823190009338e-05, |
| "loss": 0.1309, |
| "mean_token_accuracy": 0.9549141764640808, |
| "step": 5960 |
| }, |
| { |
| "epoch": 2.2951702905522415, |
| "grad_norm": 0.11126703936208204, |
| "learning_rate": 2.5620160859056204e-05, |
| "loss": 0.1296, |
| "mean_token_accuracy": 0.955399614572525, |
| "step": 5965 |
| }, |
| { |
| "epoch": 2.297094477583221, |
| "grad_norm": 0.11122131191415677, |
| "learning_rate": 2.5582503831465938e-05, |
| "loss": 0.1321, |
| "mean_token_accuracy": 0.9543533623218536, |
| "step": 5970 |
| }, |
| { |
| "epoch": 2.2990186646142003, |
| "grad_norm": 0.1049382676038808, |
| "learning_rate": 2.554485221347575e-05, |
| "loss": 0.1325, |
| "mean_token_accuracy": 0.9545806467533111, |
| "step": 5975 |
| }, |
| { |
| "epoch": 2.3009428516451798, |
| "grad_norm": 0.10943907755115642, |
| "learning_rate": 2.5507206111307626e-05, |
| "loss": 0.1307, |
| "mean_token_accuracy": 0.9549081921577454, |
| "step": 5980 |
| }, |
| { |
| "epoch": 2.302867038676159, |
| "grad_norm": 0.10775764576510022, |
| "learning_rate": 2.5469565631167934e-05, |
| "loss": 0.1296, |
| "mean_token_accuracy": 0.9553714513778686, |
| "step": 5985 |
| }, |
| { |
| "epoch": 2.3047912257071386, |
| "grad_norm": 0.11461333255761326, |
| "learning_rate": 2.5431930879247218e-05, |
| "loss": 0.1301, |
| "mean_token_accuracy": 0.9552597403526306, |
| "step": 5990 |
| }, |
| { |
| "epoch": 2.306715412738118, |
| "grad_norm": 0.11188764073802483, |
| "learning_rate": 2.5394301961719858e-05, |
| "loss": 0.1293, |
| "mean_token_accuracy": 0.9555404365062714, |
| "step": 5995 |
| }, |
| { |
| "epoch": 2.3086395997690974, |
| "grad_norm": 0.10376185181583203, |
| "learning_rate": 2.535667898474377e-05, |
| "loss": 0.1277, |
| "mean_token_accuracy": 0.955972284078598, |
| "step": 6000 |
| }, |
| { |
| "epoch": 2.310563786800077, |
| "grad_norm": 0.10982484930848939, |
| "learning_rate": 2.531906205446009e-05, |
| "loss": 0.1319, |
| "mean_token_accuracy": 0.9546591579914093, |
| "step": 6005 |
| }, |
| { |
| "epoch": 2.312487973831056, |
| "grad_norm": 0.10442132804146616, |
| "learning_rate": 2.528145127699294e-05, |
| "loss": 0.1279, |
| "mean_token_accuracy": 0.9557735204696656, |
| "step": 6010 |
| }, |
| { |
| "epoch": 2.3144121608620356, |
| "grad_norm": 0.10481317479938844, |
| "learning_rate": 2.5243846758449042e-05, |
| "loss": 0.1285, |
| "mean_token_accuracy": 0.9558473527431488, |
| "step": 6015 |
| }, |
| { |
| "epoch": 2.316336347893015, |
| "grad_norm": 0.10672119468012388, |
| "learning_rate": 2.520624860491748e-05, |
| "loss": 0.1286, |
| "mean_token_accuracy": 0.9557987153530121, |
| "step": 6020 |
| }, |
| { |
| "epoch": 2.3182605349239944, |
| "grad_norm": 0.10977936393100184, |
| "learning_rate": 2.5168656922469398e-05, |
| "loss": 0.1301, |
| "mean_token_accuracy": 0.9552335500717163, |
| "step": 6025 |
| }, |
| { |
| "epoch": 2.3201847219549743, |
| "grad_norm": 0.10657246279734564, |
| "learning_rate": 2.5131071817157636e-05, |
| "loss": 0.1294, |
| "mean_token_accuracy": 0.9553155183792115, |
| "step": 6030 |
| }, |
| { |
| "epoch": 2.3221089089859532, |
| "grad_norm": 0.11667544810089918, |
| "learning_rate": 2.509349339501652e-05, |
| "loss": 0.128, |
| "mean_token_accuracy": 0.9557375490665436, |
| "step": 6035 |
| }, |
| { |
| "epoch": 2.324033096016933, |
| "grad_norm": 0.10684484275625068, |
| "learning_rate": 2.505592176206151e-05, |
| "loss": 0.1307, |
| "mean_token_accuracy": 0.9550171077251435, |
| "step": 6040 |
| }, |
| { |
| "epoch": 2.325957283047912, |
| "grad_norm": 0.11365053047555458, |
| "learning_rate": 2.5018357024288917e-05, |
| "loss": 0.128, |
| "mean_token_accuracy": 0.9558144569396972, |
| "step": 6045 |
| }, |
| { |
| "epoch": 2.327881470078892, |
| "grad_norm": 0.1057871075763028, |
| "learning_rate": 2.4980799287675578e-05, |
| "loss": 0.1281, |
| "mean_token_accuracy": 0.9556811392307282, |
| "step": 6050 |
| }, |
| { |
| "epoch": 2.329805657109871, |
| "grad_norm": 0.11063592984299654, |
| "learning_rate": 2.4943248658178603e-05, |
| "loss": 0.1304, |
| "mean_token_accuracy": 0.9552521586418152, |
| "step": 6055 |
| }, |
| { |
| "epoch": 2.3317298441408507, |
| "grad_norm": 0.11130860025823555, |
| "learning_rate": 2.4905705241735032e-05, |
| "loss": 0.1312, |
| "mean_token_accuracy": 0.9547226846218109, |
| "step": 6060 |
| }, |
| { |
| "epoch": 2.3336540311718297, |
| "grad_norm": 0.10343549418767663, |
| "learning_rate": 2.4868169144261595e-05, |
| "loss": 0.1276, |
| "mean_token_accuracy": 0.9562317252159118, |
| "step": 6065 |
| }, |
| { |
| "epoch": 2.3355782182028095, |
| "grad_norm": 0.11017111425706975, |
| "learning_rate": 2.4830640471654317e-05, |
| "loss": 0.1279, |
| "mean_token_accuracy": 0.9559598863124847, |
| "step": 6070 |
| }, |
| { |
| "epoch": 2.3375024052337885, |
| "grad_norm": 0.10762762532836595, |
| "learning_rate": 2.4793119329788307e-05, |
| "loss": 0.129, |
| "mean_token_accuracy": 0.9555034756660461, |
| "step": 6075 |
| }, |
| { |
| "epoch": 2.3394265922647683, |
| "grad_norm": 0.1047860968850429, |
| "learning_rate": 2.475560582451743e-05, |
| "loss": 0.1323, |
| "mean_token_accuracy": 0.9545365333557129, |
| "step": 6080 |
| }, |
| { |
| "epoch": 2.3413507792957473, |
| "grad_norm": 0.10233494138880919, |
| "learning_rate": 2.471810006167401e-05, |
| "loss": 0.1296, |
| "mean_token_accuracy": 0.9555085003376007, |
| "step": 6085 |
| }, |
| { |
| "epoch": 2.343274966326727, |
| "grad_norm": 0.10617127970482486, |
| "learning_rate": 2.4680602147068526e-05, |
| "loss": 0.1289, |
| "mean_token_accuracy": 0.9554856240749359, |
| "step": 6090 |
| }, |
| { |
| "epoch": 2.345199153357706, |
| "grad_norm": 0.10203977332496438, |
| "learning_rate": 2.464311218648928e-05, |
| "loss": 0.1295, |
| "mean_token_accuracy": 0.9553018271923065, |
| "step": 6095 |
| }, |
| { |
| "epoch": 2.347123340388686, |
| "grad_norm": 0.10411939683124619, |
| "learning_rate": 2.4605630285702196e-05, |
| "loss": 0.1302, |
| "mean_token_accuracy": 0.9553432047367096, |
| "step": 6100 |
| }, |
| { |
| "epoch": 2.3490475274196654, |
| "grad_norm": 0.11351176055257131, |
| "learning_rate": 2.456815655045041e-05, |
| "loss": 0.1324, |
| "mean_token_accuracy": 0.9544381856918335, |
| "step": 6105 |
| }, |
| { |
| "epoch": 2.350971714450645, |
| "grad_norm": 0.11034240481943162, |
| "learning_rate": 2.4530691086454055e-05, |
| "loss": 0.1301, |
| "mean_token_accuracy": 0.9551987648010254, |
| "step": 6110 |
| }, |
| { |
| "epoch": 2.352895901481624, |
| "grad_norm": 0.11259140096829676, |
| "learning_rate": 2.4493233999409904e-05, |
| "loss": 0.1325, |
| "mean_token_accuracy": 0.9543862223625184, |
| "step": 6115 |
| }, |
| { |
| "epoch": 2.3548200885126036, |
| "grad_norm": 0.10479962693609542, |
| "learning_rate": 2.4455785394991104e-05, |
| "loss": 0.1303, |
| "mean_token_accuracy": 0.9551300168037414, |
| "step": 6120 |
| }, |
| { |
| "epoch": 2.356744275543583, |
| "grad_norm": 0.10724118026152689, |
| "learning_rate": 2.441834537884688e-05, |
| "loss": 0.1302, |
| "mean_token_accuracy": 0.9552154541015625, |
| "step": 6125 |
| }, |
| { |
| "epoch": 2.3586684625745624, |
| "grad_norm": 0.11255024870502185, |
| "learning_rate": 2.438091405660224e-05, |
| "loss": 0.1292, |
| "mean_token_accuracy": 0.955460250377655, |
| "step": 6130 |
| }, |
| { |
| "epoch": 2.360592649605542, |
| "grad_norm": 0.10969541941331791, |
| "learning_rate": 2.4343491533857616e-05, |
| "loss": 0.1292, |
| "mean_token_accuracy": 0.9556892514228821, |
| "step": 6135 |
| }, |
| { |
| "epoch": 2.3625168366365212, |
| "grad_norm": 0.111322111822571, |
| "learning_rate": 2.4306077916188662e-05, |
| "loss": 0.134, |
| "mean_token_accuracy": 0.953878390789032, |
| "step": 6140 |
| }, |
| { |
| "epoch": 2.3644410236675006, |
| "grad_norm": 0.10714069318727834, |
| "learning_rate": 2.4268673309145894e-05, |
| "loss": 0.1304, |
| "mean_token_accuracy": 0.9550508975982666, |
| "step": 6145 |
| }, |
| { |
| "epoch": 2.36636521069848, |
| "grad_norm": 0.1053211683904264, |
| "learning_rate": 2.423127781825441e-05, |
| "loss": 0.1299, |
| "mean_token_accuracy": 0.9550895631313324, |
| "step": 6150 |
| }, |
| { |
| "epoch": 2.3682893977294595, |
| "grad_norm": 0.10556315886193816, |
| "learning_rate": 2.419389154901358e-05, |
| "loss": 0.1276, |
| "mean_token_accuracy": 0.9559677720069886, |
| "step": 6155 |
| }, |
| { |
| "epoch": 2.370213584760439, |
| "grad_norm": 0.10627216599036601, |
| "learning_rate": 2.415651460689677e-05, |
| "loss": 0.1285, |
| "mean_token_accuracy": 0.955669516324997, |
| "step": 6160 |
| }, |
| { |
| "epoch": 2.3721377717914183, |
| "grad_norm": 0.10979734924510835, |
| "learning_rate": 2.4119147097351014e-05, |
| "loss": 0.1275, |
| "mean_token_accuracy": 0.9557546257972718, |
| "step": 6165 |
| }, |
| { |
| "epoch": 2.3740619588223977, |
| "grad_norm": 0.10970260677661284, |
| "learning_rate": 2.4081789125796766e-05, |
| "loss": 0.1294, |
| "mean_token_accuracy": 0.9555782079696655, |
| "step": 6170 |
| }, |
| { |
| "epoch": 2.375986145853377, |
| "grad_norm": 0.10998606034881979, |
| "learning_rate": 2.404444079762756e-05, |
| "loss": 0.1311, |
| "mean_token_accuracy": 0.9547999441623688, |
| "step": 6175 |
| }, |
| { |
| "epoch": 2.3779103328843565, |
| "grad_norm": 0.10907461607280992, |
| "learning_rate": 2.400710221820969e-05, |
| "loss": 0.1308, |
| "mean_token_accuracy": 0.9551190435886383, |
| "step": 6180 |
| }, |
| { |
| "epoch": 2.379834519915336, |
| "grad_norm": 0.11295246820088488, |
| "learning_rate": 2.3969773492881992e-05, |
| "loss": 0.1273, |
| "mean_token_accuracy": 0.9561882793903351, |
| "step": 6185 |
| }, |
| { |
| "epoch": 2.3817587069463153, |
| "grad_norm": 0.10798424508112144, |
| "learning_rate": 2.393245472695549e-05, |
| "loss": 0.1297, |
| "mean_token_accuracy": 0.9552112817764282, |
| "step": 6190 |
| }, |
| { |
| "epoch": 2.3836828939772947, |
| "grad_norm": 0.10748192867953561, |
| "learning_rate": 2.38951460257131e-05, |
| "loss": 0.1306, |
| "mean_token_accuracy": 0.9548419535160064, |
| "step": 6195 |
| }, |
| { |
| "epoch": 2.385607081008274, |
| "grad_norm": 0.10813811798830832, |
| "learning_rate": 2.3857847494409346e-05, |
| "loss": 0.1296, |
| "mean_token_accuracy": 0.9553369998931884, |
| "step": 6200 |
| }, |
| { |
| "epoch": 2.3875312680392535, |
| "grad_norm": 0.10923338235212277, |
| "learning_rate": 2.3820559238270075e-05, |
| "loss": 0.1283, |
| "mean_token_accuracy": 0.9558035612106324, |
| "step": 6205 |
| }, |
| { |
| "epoch": 2.389455455070233, |
| "grad_norm": 0.10504288528798039, |
| "learning_rate": 2.378328136249212e-05, |
| "loss": 0.1276, |
| "mean_token_accuracy": 0.9560654640197754, |
| "step": 6210 |
| }, |
| { |
| "epoch": 2.3913796421012123, |
| "grad_norm": 0.10898082105141034, |
| "learning_rate": 2.3746013972243063e-05, |
| "loss": 0.1316, |
| "mean_token_accuracy": 0.9543659031391144, |
| "step": 6215 |
| }, |
| { |
| "epoch": 2.3933038291321918, |
| "grad_norm": 0.1106138395232921, |
| "learning_rate": 2.370875717266087e-05, |
| "loss": 0.1308, |
| "mean_token_accuracy": 0.9551527023315429, |
| "step": 6220 |
| }, |
| { |
| "epoch": 2.395228016163171, |
| "grad_norm": 0.11050211677827432, |
| "learning_rate": 2.3671511068853654e-05, |
| "loss": 0.1321, |
| "mean_token_accuracy": 0.9545704066753388, |
| "step": 6225 |
| }, |
| { |
| "epoch": 2.3971522031941506, |
| "grad_norm": 0.10582372989514112, |
| "learning_rate": 2.3634275765899334e-05, |
| "loss": 0.1311, |
| "mean_token_accuracy": 0.9553862631320953, |
| "step": 6230 |
| }, |
| { |
| "epoch": 2.39907639022513, |
| "grad_norm": 0.10502224579162833, |
| "learning_rate": 2.3597051368845387e-05, |
| "loss": 0.1283, |
| "mean_token_accuracy": 0.955961138010025, |
| "step": 6235 |
| }, |
| { |
| "epoch": 2.4010005772561094, |
| "grad_norm": 0.10486880517915072, |
| "learning_rate": 2.355983798270848e-05, |
| "loss": 0.1257, |
| "mean_token_accuracy": 0.9567702889442444, |
| "step": 6240 |
| }, |
| { |
| "epoch": 2.402924764287089, |
| "grad_norm": 0.10885534350453244, |
| "learning_rate": 2.3522635712474255e-05, |
| "loss": 0.1299, |
| "mean_token_accuracy": 0.9552649140357972, |
| "step": 6245 |
| }, |
| { |
| "epoch": 2.404848951318068, |
| "grad_norm": 0.10640605673443858, |
| "learning_rate": 2.348544466309698e-05, |
| "loss": 0.1268, |
| "mean_token_accuracy": 0.9563517928123474, |
| "step": 6250 |
| }, |
| { |
| "epoch": 2.4067731383490476, |
| "grad_norm": 0.10772345727706822, |
| "learning_rate": 2.3448264939499254e-05, |
| "loss": 0.1317, |
| "mean_token_accuracy": 0.9546447098255157, |
| "step": 6255 |
| }, |
| { |
| "epoch": 2.408697325380027, |
| "grad_norm": 0.10469289530051222, |
| "learning_rate": 2.341109664657175e-05, |
| "loss": 0.13, |
| "mean_token_accuracy": 0.9551982462406159, |
| "step": 6260 |
| }, |
| { |
| "epoch": 2.4106215124110064, |
| "grad_norm": 0.10016045196030172, |
| "learning_rate": 2.337393988917287e-05, |
| "loss": 0.1278, |
| "mean_token_accuracy": 0.9561413705348969, |
| "step": 6265 |
| }, |
| { |
| "epoch": 2.412545699441986, |
| "grad_norm": 0.10461285052219418, |
| "learning_rate": 2.3336794772128472e-05, |
| "loss": 0.1287, |
| "mean_token_accuracy": 0.9557947039604187, |
| "step": 6270 |
| }, |
| { |
| "epoch": 2.4144698864729652, |
| "grad_norm": 0.10948702841750811, |
| "learning_rate": 2.3299661400231592e-05, |
| "loss": 0.128, |
| "mean_token_accuracy": 0.9558991074562073, |
| "step": 6275 |
| }, |
| { |
| "epoch": 2.4163940735039446, |
| "grad_norm": 0.1056211764656455, |
| "learning_rate": 2.326253987824214e-05, |
| "loss": 0.1289, |
| "mean_token_accuracy": 0.9556284606456756, |
| "step": 6280 |
| }, |
| { |
| "epoch": 2.418318260534924, |
| "grad_norm": 0.10727635952931647, |
| "learning_rate": 2.322543031088655e-05, |
| "loss": 0.1273, |
| "mean_token_accuracy": 0.9564097642898559, |
| "step": 6285 |
| }, |
| { |
| "epoch": 2.4202424475659035, |
| "grad_norm": 0.10332341197029843, |
| "learning_rate": 2.3188332802857564e-05, |
| "loss": 0.1289, |
| "mean_token_accuracy": 0.9555360496044158, |
| "step": 6290 |
| }, |
| { |
| "epoch": 2.422166634596883, |
| "grad_norm": 0.10742295084290715, |
| "learning_rate": 2.3151247458813907e-05, |
| "loss": 0.1307, |
| "mean_token_accuracy": 0.954986971616745, |
| "step": 6295 |
| }, |
| { |
| "epoch": 2.4240908216278623, |
| "grad_norm": 0.10873199498010518, |
| "learning_rate": 2.3114174383379972e-05, |
| "loss": 0.1255, |
| "mean_token_accuracy": 0.9566889405250549, |
| "step": 6300 |
| }, |
| { |
| "epoch": 2.4260150086588417, |
| "grad_norm": 0.11212883852617865, |
| "learning_rate": 2.3077113681145534e-05, |
| "loss": 0.1301, |
| "mean_token_accuracy": 0.9550743222236633, |
| "step": 6305 |
| }, |
| { |
| "epoch": 2.427939195689821, |
| "grad_norm": 0.10950238580710399, |
| "learning_rate": 2.304006545666548e-05, |
| "loss": 0.1299, |
| "mean_token_accuracy": 0.9552758276462555, |
| "step": 6310 |
| }, |
| { |
| "epoch": 2.4298633827208005, |
| "grad_norm": 0.10426204545899302, |
| "learning_rate": 2.300302981445948e-05, |
| "loss": 0.1291, |
| "mean_token_accuracy": 0.9555193781852722, |
| "step": 6315 |
| }, |
| { |
| "epoch": 2.43178756975178, |
| "grad_norm": 0.11268725403596633, |
| "learning_rate": 2.296600685901173e-05, |
| "loss": 0.1286, |
| "mean_token_accuracy": 0.9557891607284545, |
| "step": 6320 |
| }, |
| { |
| "epoch": 2.4337117567827593, |
| "grad_norm": 0.10299250442381977, |
| "learning_rate": 2.2928996694770595e-05, |
| "loss": 0.1297, |
| "mean_token_accuracy": 0.955392187833786, |
| "step": 6325 |
| }, |
| { |
| "epoch": 2.4356359438137387, |
| "grad_norm": 0.11412695538997933, |
| "learning_rate": 2.2891999426148386e-05, |
| "loss": 0.1285, |
| "mean_token_accuracy": 0.9556662082672119, |
| "step": 6330 |
| }, |
| { |
| "epoch": 2.437560130844718, |
| "grad_norm": 0.1072278675480645, |
| "learning_rate": 2.285501515752102e-05, |
| "loss": 0.1294, |
| "mean_token_accuracy": 0.9552786529064179, |
| "step": 6335 |
| }, |
| { |
| "epoch": 2.4394843178756975, |
| "grad_norm": 0.10597779313195058, |
| "learning_rate": 2.281804399322775e-05, |
| "loss": 0.1288, |
| "mean_token_accuracy": 0.9556537508964539, |
| "step": 6340 |
| }, |
| { |
| "epoch": 2.441408504906677, |
| "grad_norm": 0.10529809189878536, |
| "learning_rate": 2.2781086037570857e-05, |
| "loss": 0.1286, |
| "mean_token_accuracy": 0.9557879388332366, |
| "step": 6345 |
| }, |
| { |
| "epoch": 2.4433326919376563, |
| "grad_norm": 0.10633758212276818, |
| "learning_rate": 2.274414139481533e-05, |
| "loss": 0.129, |
| "mean_token_accuracy": 0.955594515800476, |
| "step": 6350 |
| }, |
| { |
| "epoch": 2.4452568789686357, |
| "grad_norm": 0.10585477214924503, |
| "learning_rate": 2.2707210169188644e-05, |
| "loss": 0.1282, |
| "mean_token_accuracy": 0.9559841394424439, |
| "step": 6355 |
| }, |
| { |
| "epoch": 2.447181065999615, |
| "grad_norm": 0.10316097711805097, |
| "learning_rate": 2.2670292464880383e-05, |
| "loss": 0.1263, |
| "mean_token_accuracy": 0.956607323884964, |
| "step": 6360 |
| }, |
| { |
| "epoch": 2.4491052530305946, |
| "grad_norm": 0.1073703759899156, |
| "learning_rate": 2.263338838604202e-05, |
| "loss": 0.1253, |
| "mean_token_accuracy": 0.956674462556839, |
| "step": 6365 |
| }, |
| { |
| "epoch": 2.451029440061574, |
| "grad_norm": 0.10776694295962279, |
| "learning_rate": 2.259649803678656e-05, |
| "loss": 0.1304, |
| "mean_token_accuracy": 0.9550556719303132, |
| "step": 6370 |
| }, |
| { |
| "epoch": 2.4529536270925534, |
| "grad_norm": 0.10483844103984719, |
| "learning_rate": 2.2559621521188277e-05, |
| "loss": 0.1279, |
| "mean_token_accuracy": 0.9559533834457398, |
| "step": 6375 |
| }, |
| { |
| "epoch": 2.454877814123533, |
| "grad_norm": 0.1060013694021598, |
| "learning_rate": 2.2522758943282442e-05, |
| "loss": 0.1289, |
| "mean_token_accuracy": 0.9556844234466553, |
| "step": 6380 |
| }, |
| { |
| "epoch": 2.456802001154512, |
| "grad_norm": 0.10954260145685858, |
| "learning_rate": 2.2485910407064985e-05, |
| "loss": 0.1295, |
| "mean_token_accuracy": 0.9554052412509918, |
| "step": 6385 |
| }, |
| { |
| "epoch": 2.4587261881854916, |
| "grad_norm": 0.10584320212310534, |
| "learning_rate": 2.2449076016492222e-05, |
| "loss": 0.1291, |
| "mean_token_accuracy": 0.9558072209358215, |
| "step": 6390 |
| }, |
| { |
| "epoch": 2.460650375216471, |
| "grad_norm": 0.11050640193114457, |
| "learning_rate": 2.2412255875480558e-05, |
| "loss": 0.1287, |
| "mean_token_accuracy": 0.9557153820991516, |
| "step": 6395 |
| }, |
| { |
| "epoch": 2.4625745622474504, |
| "grad_norm": 0.10813729433867338, |
| "learning_rate": 2.2375450087906214e-05, |
| "loss": 0.128, |
| "mean_token_accuracy": 0.9559723854064941, |
| "step": 6400 |
| }, |
| { |
| "epoch": 2.46449874927843, |
| "grad_norm": 0.10682656562324673, |
| "learning_rate": 2.2338658757604908e-05, |
| "loss": 0.129, |
| "mean_token_accuracy": 0.9554776012897491, |
| "step": 6405 |
| }, |
| { |
| "epoch": 2.4664229363094092, |
| "grad_norm": 0.10658960516187169, |
| "learning_rate": 2.2301881988371574e-05, |
| "loss": 0.1273, |
| "mean_token_accuracy": 0.9560759603977204, |
| "step": 6410 |
| }, |
| { |
| "epoch": 2.4683471233403886, |
| "grad_norm": 0.10616178994876622, |
| "learning_rate": 2.226511988396006e-05, |
| "loss": 0.13, |
| "mean_token_accuracy": 0.9551859855651855, |
| "step": 6415 |
| }, |
| { |
| "epoch": 2.470271310371368, |
| "grad_norm": 0.1172111283269971, |
| "learning_rate": 2.2228372548082842e-05, |
| "loss": 0.1298, |
| "mean_token_accuracy": 0.9552642643451691, |
| "step": 6420 |
| }, |
| { |
| "epoch": 2.4721954974023475, |
| "grad_norm": 0.10774671208770267, |
| "learning_rate": 2.2191640084410735e-05, |
| "loss": 0.1271, |
| "mean_token_accuracy": 0.9561912536621093, |
| "step": 6425 |
| }, |
| { |
| "epoch": 2.474119684433327, |
| "grad_norm": 0.10511961603577678, |
| "learning_rate": 2.215492259657262e-05, |
| "loss": 0.128, |
| "mean_token_accuracy": 0.9561578452587127, |
| "step": 6430 |
| }, |
| { |
| "epoch": 2.4760438714643063, |
| "grad_norm": 0.10594521909254916, |
| "learning_rate": 2.2118220188155077e-05, |
| "loss": 0.1286, |
| "mean_token_accuracy": 0.9555766880512238, |
| "step": 6435 |
| }, |
| { |
| "epoch": 2.4779680584952857, |
| "grad_norm": 0.11163384256017442, |
| "learning_rate": 2.2081532962702177e-05, |
| "loss": 0.1283, |
| "mean_token_accuracy": 0.9559310555458069, |
| "step": 6440 |
| }, |
| { |
| "epoch": 2.479892245526265, |
| "grad_norm": 0.1045737292888624, |
| "learning_rate": 2.2044861023715174e-05, |
| "loss": 0.1255, |
| "mean_token_accuracy": 0.9565778374671936, |
| "step": 6445 |
| }, |
| { |
| "epoch": 2.4818164325572445, |
| "grad_norm": 0.1060088421688071, |
| "learning_rate": 2.2008204474652162e-05, |
| "loss": 0.1252, |
| "mean_token_accuracy": 0.9568614602088928, |
| "step": 6450 |
| }, |
| { |
| "epoch": 2.483740619588224, |
| "grad_norm": 0.10825864694038108, |
| "learning_rate": 2.1971563418927822e-05, |
| "loss": 0.1295, |
| "mean_token_accuracy": 0.9552516937255859, |
| "step": 6455 |
| }, |
| { |
| "epoch": 2.4856648066192033, |
| "grad_norm": 0.1070769288062764, |
| "learning_rate": 2.1934937959913142e-05, |
| "loss": 0.1293, |
| "mean_token_accuracy": 0.9554482281208039, |
| "step": 6460 |
| }, |
| { |
| "epoch": 2.4875889936501827, |
| "grad_norm": 0.10621601500927427, |
| "learning_rate": 2.1898328200935097e-05, |
| "loss": 0.13, |
| "mean_token_accuracy": 0.9551669120788574, |
| "step": 6465 |
| }, |
| { |
| "epoch": 2.489513180681162, |
| "grad_norm": 0.10455563675305558, |
| "learning_rate": 2.186173424527639e-05, |
| "loss": 0.1284, |
| "mean_token_accuracy": 0.9556362867355347, |
| "step": 6470 |
| }, |
| { |
| "epoch": 2.4914373677121415, |
| "grad_norm": 0.10818889779174191, |
| "learning_rate": 2.1825156196175106e-05, |
| "loss": 0.1291, |
| "mean_token_accuracy": 0.9556038022041321, |
| "step": 6475 |
| }, |
| { |
| "epoch": 2.493361554743121, |
| "grad_norm": 0.1054164214678568, |
| "learning_rate": 2.178859415682447e-05, |
| "loss": 0.1307, |
| "mean_token_accuracy": 0.9552402257919311, |
| "step": 6480 |
| }, |
| { |
| "epoch": 2.4952857417741003, |
| "grad_norm": 0.10836527087770427, |
| "learning_rate": 2.175204823037255e-05, |
| "loss": 0.1285, |
| "mean_token_accuracy": 0.9558619618415832, |
| "step": 6485 |
| }, |
| { |
| "epoch": 2.4972099288050797, |
| "grad_norm": 0.10985437809136969, |
| "learning_rate": 2.1715518519921957e-05, |
| "loss": 0.1297, |
| "mean_token_accuracy": 0.9555224239826202, |
| "step": 6490 |
| }, |
| { |
| "epoch": 2.499134115836059, |
| "grad_norm": 0.10726465444554566, |
| "learning_rate": 2.1679005128529546e-05, |
| "loss": 0.1302, |
| "mean_token_accuracy": 0.9549250960350036, |
| "step": 6495 |
| }, |
| { |
| "epoch": 2.5010583028670386, |
| "grad_norm": 0.10841250150364372, |
| "learning_rate": 2.164250815920611e-05, |
| "loss": 0.1275, |
| "mean_token_accuracy": 0.9560544431209564, |
| "step": 6500 |
| }, |
| { |
| "epoch": 2.502982489898018, |
| "grad_norm": 0.10814091803405554, |
| "learning_rate": 2.1606027714916157e-05, |
| "loss": 0.1284, |
| "mean_token_accuracy": 0.9557877242565155, |
| "step": 6505 |
| }, |
| { |
| "epoch": 2.5049066769289974, |
| "grad_norm": 0.10796065674626645, |
| "learning_rate": 2.1569563898577545e-05, |
| "loss": 0.1283, |
| "mean_token_accuracy": 0.9558773934841156, |
| "step": 6510 |
| }, |
| { |
| "epoch": 2.506830863959977, |
| "grad_norm": 0.10792165801758698, |
| "learning_rate": 2.1533116813061237e-05, |
| "loss": 0.1286, |
| "mean_token_accuracy": 0.955658346414566, |
| "step": 6515 |
| }, |
| { |
| "epoch": 2.508755050990956, |
| "grad_norm": 0.10946774449000102, |
| "learning_rate": 2.149668656119099e-05, |
| "loss": 0.1309, |
| "mean_token_accuracy": 0.9549214005470276, |
| "step": 6520 |
| }, |
| { |
| "epoch": 2.5106792380219356, |
| "grad_norm": 0.10364437855149361, |
| "learning_rate": 2.1460273245743047e-05, |
| "loss": 0.1295, |
| "mean_token_accuracy": 0.9554930865764618, |
| "step": 6525 |
| }, |
| { |
| "epoch": 2.512603425052915, |
| "grad_norm": 0.1026199533452469, |
| "learning_rate": 2.142387696944591e-05, |
| "loss": 0.128, |
| "mean_token_accuracy": 0.9559862554073334, |
| "step": 6530 |
| }, |
| { |
| "epoch": 2.5145276120838944, |
| "grad_norm": 0.1069750821084188, |
| "learning_rate": 2.1387497834979997e-05, |
| "loss": 0.1301, |
| "mean_token_accuracy": 0.9553844213485718, |
| "step": 6535 |
| }, |
| { |
| "epoch": 2.516451799114874, |
| "grad_norm": 0.10262110699029317, |
| "learning_rate": 2.1351135944977358e-05, |
| "loss": 0.1277, |
| "mean_token_accuracy": 0.956086528301239, |
| "step": 6540 |
| }, |
| { |
| "epoch": 2.5183759861458532, |
| "grad_norm": 0.10776952420084977, |
| "learning_rate": 2.131479140202138e-05, |
| "loss": 0.1277, |
| "mean_token_accuracy": 0.9561761379241943, |
| "step": 6545 |
| }, |
| { |
| "epoch": 2.5203001731768326, |
| "grad_norm": 0.10971225695518153, |
| "learning_rate": 2.1278464308646552e-05, |
| "loss": 0.1302, |
| "mean_token_accuracy": 0.9550739526748657, |
| "step": 6550 |
| }, |
| { |
| "epoch": 2.522224360207812, |
| "grad_norm": 0.11199782190688244, |
| "learning_rate": 2.1242154767338096e-05, |
| "loss": 0.1286, |
| "mean_token_accuracy": 0.9557594180107116, |
| "step": 6555 |
| }, |
| { |
| "epoch": 2.5241485472387915, |
| "grad_norm": 0.10862608502738286, |
| "learning_rate": 2.120586288053173e-05, |
| "loss": 0.1265, |
| "mean_token_accuracy": 0.9562830090522766, |
| "step": 6560 |
| }, |
| { |
| "epoch": 2.526072734269771, |
| "grad_norm": 0.10796891648445665, |
| "learning_rate": 2.1169588750613373e-05, |
| "loss": 0.1283, |
| "mean_token_accuracy": 0.9559341192245483, |
| "step": 6565 |
| }, |
| { |
| "epoch": 2.5279969213007503, |
| "grad_norm": 0.10522758943486674, |
| "learning_rate": 2.1133332479918833e-05, |
| "loss": 0.1269, |
| "mean_token_accuracy": 0.9564804494380951, |
| "step": 6570 |
| }, |
| { |
| "epoch": 2.52992110833173, |
| "grad_norm": 0.10780286762109235, |
| "learning_rate": 2.109709417073355e-05, |
| "loss": 0.1278, |
| "mean_token_accuracy": 0.9557962656021118, |
| "step": 6575 |
| }, |
| { |
| "epoch": 2.531845295362709, |
| "grad_norm": 0.10552125099465876, |
| "learning_rate": 2.1060873925292286e-05, |
| "loss": 0.1277, |
| "mean_token_accuracy": 0.9557866632938385, |
| "step": 6580 |
| }, |
| { |
| "epoch": 2.533769482393689, |
| "grad_norm": 0.10706134126626986, |
| "learning_rate": 2.1024671845778826e-05, |
| "loss": 0.1273, |
| "mean_token_accuracy": 0.9560283482074737, |
| "step": 6585 |
| }, |
| { |
| "epoch": 2.535693669424668, |
| "grad_norm": 0.10633516501224893, |
| "learning_rate": 2.098848803432573e-05, |
| "loss": 0.1285, |
| "mean_token_accuracy": 0.9558331072330475, |
| "step": 6590 |
| }, |
| { |
| "epoch": 2.5376178564556477, |
| "grad_norm": 0.1113097029767163, |
| "learning_rate": 2.0952322593014017e-05, |
| "loss": 0.1267, |
| "mean_token_accuracy": 0.9564849495887756, |
| "step": 6595 |
| }, |
| { |
| "epoch": 2.5395420434866267, |
| "grad_norm": 0.10324399717862949, |
| "learning_rate": 2.0916175623872867e-05, |
| "loss": 0.1278, |
| "mean_token_accuracy": 0.9562363088130951, |
| "step": 6600 |
| }, |
| { |
| "epoch": 2.5414662305176066, |
| "grad_norm": 0.11009476615467657, |
| "learning_rate": 2.088004722887934e-05, |
| "loss": 0.1295, |
| "mean_token_accuracy": 0.9554395854473114, |
| "step": 6605 |
| }, |
| { |
| "epoch": 2.5433904175485855, |
| "grad_norm": 0.10771448514039726, |
| "learning_rate": 2.0843937509958134e-05, |
| "loss": 0.1306, |
| "mean_token_accuracy": 0.955112874507904, |
| "step": 6610 |
| }, |
| { |
| "epoch": 2.5453146045795654, |
| "grad_norm": 0.10636329319110463, |
| "learning_rate": 2.0807846568981203e-05, |
| "loss": 0.1275, |
| "mean_token_accuracy": 0.9561453282833099, |
| "step": 6615 |
| }, |
| { |
| "epoch": 2.5472387916105443, |
| "grad_norm": 0.10508686260356949, |
| "learning_rate": 2.0771774507767587e-05, |
| "loss": 0.1297, |
| "mean_token_accuracy": 0.9554025113582612, |
| "step": 6620 |
| }, |
| { |
| "epoch": 2.549162978641524, |
| "grad_norm": 0.10820991587473601, |
| "learning_rate": 2.0735721428083017e-05, |
| "loss": 0.1271, |
| "mean_token_accuracy": 0.9562888860702514, |
| "step": 6625 |
| }, |
| { |
| "epoch": 2.551087165672503, |
| "grad_norm": 0.10943406239997874, |
| "learning_rate": 2.069968743163967e-05, |
| "loss": 0.1292, |
| "mean_token_accuracy": 0.955457329750061, |
| "step": 6630 |
| }, |
| { |
| "epoch": 2.553011352703483, |
| "grad_norm": 0.10546854728372138, |
| "learning_rate": 2.066367262009592e-05, |
| "loss": 0.1273, |
| "mean_token_accuracy": 0.9560437321662902, |
| "step": 6635 |
| }, |
| { |
| "epoch": 2.554935539734462, |
| "grad_norm": 0.10297408910326183, |
| "learning_rate": 2.0627677095056015e-05, |
| "loss": 0.13, |
| "mean_token_accuracy": 0.9552169620990754, |
| "step": 6640 |
| }, |
| { |
| "epoch": 2.556859726765442, |
| "grad_norm": 0.10714476408363253, |
| "learning_rate": 2.059170095806975e-05, |
| "loss": 0.1289, |
| "mean_token_accuracy": 0.9556257486343384, |
| "step": 6645 |
| }, |
| { |
| "epoch": 2.558783913796421, |
| "grad_norm": 0.10349217261873211, |
| "learning_rate": 2.0555744310632258e-05, |
| "loss": 0.1279, |
| "mean_token_accuracy": 0.9557923078536987, |
| "step": 6650 |
| }, |
| { |
| "epoch": 2.5607081008274006, |
| "grad_norm": 0.11009263883011676, |
| "learning_rate": 2.0519807254183687e-05, |
| "loss": 0.1294, |
| "mean_token_accuracy": 0.955490505695343, |
| "step": 6655 |
| }, |
| { |
| "epoch": 2.5626322878583796, |
| "grad_norm": 0.10526751847837303, |
| "learning_rate": 2.0483889890108898e-05, |
| "loss": 0.1296, |
| "mean_token_accuracy": 0.9555670380592346, |
| "step": 6660 |
| }, |
| { |
| "epoch": 2.5645564748893594, |
| "grad_norm": 0.10815576645410302, |
| "learning_rate": 2.044799231973723e-05, |
| "loss": 0.1283, |
| "mean_token_accuracy": 0.9559342682361602, |
| "step": 6665 |
| }, |
| { |
| "epoch": 2.5664806619203384, |
| "grad_norm": 0.10041254582476256, |
| "learning_rate": 2.041211464434214e-05, |
| "loss": 0.1285, |
| "mean_token_accuracy": 0.955743670463562, |
| "step": 6670 |
| }, |
| { |
| "epoch": 2.5684048489513183, |
| "grad_norm": 0.11004558434797768, |
| "learning_rate": 2.037625696514097e-05, |
| "loss": 0.1274, |
| "mean_token_accuracy": 0.9560822963714599, |
| "step": 6675 |
| }, |
| { |
| "epoch": 2.5703290359822972, |
| "grad_norm": 0.10609185866996279, |
| "learning_rate": 2.034041938329466e-05, |
| "loss": 0.1285, |
| "mean_token_accuracy": 0.9558017492294312, |
| "step": 6680 |
| }, |
| { |
| "epoch": 2.572253223013277, |
| "grad_norm": 0.1053426050483543, |
| "learning_rate": 2.0304601999907468e-05, |
| "loss": 0.1253, |
| "mean_token_accuracy": 0.9568973779678345, |
| "step": 6685 |
| }, |
| { |
| "epoch": 2.574177410044256, |
| "grad_norm": 0.1106416909992716, |
| "learning_rate": 2.026880491602662e-05, |
| "loss": 0.1268, |
| "mean_token_accuracy": 0.9561525583267212, |
| "step": 6690 |
| }, |
| { |
| "epoch": 2.576101597075236, |
| "grad_norm": 0.1066555488483838, |
| "learning_rate": 2.0233028232642103e-05, |
| "loss": 0.1274, |
| "mean_token_accuracy": 0.955971074104309, |
| "step": 6695 |
| }, |
| { |
| "epoch": 2.578025784106215, |
| "grad_norm": 0.10551400270459926, |
| "learning_rate": 2.019727205068636e-05, |
| "loss": 0.1269, |
| "mean_token_accuracy": 0.9562014818191529, |
| "step": 6700 |
| }, |
| { |
| "epoch": 2.5799499711371947, |
| "grad_norm": 0.10119569323207261, |
| "learning_rate": 2.016153647103398e-05, |
| "loss": 0.1257, |
| "mean_token_accuracy": 0.9566797375679016, |
| "step": 6705 |
| }, |
| { |
| "epoch": 2.5818741581681737, |
| "grad_norm": 0.10273703318326284, |
| "learning_rate": 2.0125821594501425e-05, |
| "loss": 0.1274, |
| "mean_token_accuracy": 0.9561544001102448, |
| "step": 6710 |
| }, |
| { |
| "epoch": 2.5837983451991535, |
| "grad_norm": 0.11088570545697085, |
| "learning_rate": 2.0090127521846763e-05, |
| "loss": 0.1283, |
| "mean_token_accuracy": 0.9558983266353607, |
| "step": 6715 |
| }, |
| { |
| "epoch": 2.585722532230133, |
| "grad_norm": 0.10191231115830651, |
| "learning_rate": 2.0054454353769365e-05, |
| "loss": 0.1249, |
| "mean_token_accuracy": 0.9568327248096467, |
| "step": 6720 |
| }, |
| { |
| "epoch": 2.5876467192611123, |
| "grad_norm": 0.10667919841751378, |
| "learning_rate": 2.001880219090963e-05, |
| "loss": 0.1283, |
| "mean_token_accuracy": 0.9559061825275421, |
| "step": 6725 |
| }, |
| { |
| "epoch": 2.5895709062920917, |
| "grad_norm": 0.11740208475938707, |
| "learning_rate": 1.9983171133848695e-05, |
| "loss": 0.1271, |
| "mean_token_accuracy": 0.9562270224094391, |
| "step": 6730 |
| }, |
| { |
| "epoch": 2.591495093323071, |
| "grad_norm": 0.10727495132744076, |
| "learning_rate": 1.994756128310814e-05, |
| "loss": 0.1253, |
| "mean_token_accuracy": 0.9566376507282257, |
| "step": 6735 |
| }, |
| { |
| "epoch": 2.5934192803540506, |
| "grad_norm": 0.1042013004686997, |
| "learning_rate": 1.9911972739149744e-05, |
| "loss": 0.1268, |
| "mean_token_accuracy": 0.9562085866928101, |
| "step": 6740 |
| }, |
| { |
| "epoch": 2.59534346738503, |
| "grad_norm": 0.10940813111634506, |
| "learning_rate": 1.9876405602375163e-05, |
| "loss": 0.1284, |
| "mean_token_accuracy": 0.9557159006595611, |
| "step": 6745 |
| }, |
| { |
| "epoch": 2.5972676544160094, |
| "grad_norm": 0.10570441947989995, |
| "learning_rate": 1.984085997312566e-05, |
| "loss": 0.1278, |
| "mean_token_accuracy": 0.9561220288276673, |
| "step": 6750 |
| }, |
| { |
| "epoch": 2.599191841446989, |
| "grad_norm": 0.10513289877971002, |
| "learning_rate": 1.980533595168181e-05, |
| "loss": 0.1267, |
| "mean_token_accuracy": 0.9563930928707123, |
| "step": 6755 |
| }, |
| { |
| "epoch": 2.601116028477968, |
| "grad_norm": 0.1053488036160959, |
| "learning_rate": 1.9769833638263248e-05, |
| "loss": 0.127, |
| "mean_token_accuracy": 0.9562098443508148, |
| "step": 6760 |
| }, |
| { |
| "epoch": 2.6030402155089476, |
| "grad_norm": 0.10389067066050878, |
| "learning_rate": 1.973435313302835e-05, |
| "loss": 0.1273, |
| "mean_token_accuracy": 0.9562519192695618, |
| "step": 6765 |
| }, |
| { |
| "epoch": 2.604964402539927, |
| "grad_norm": 0.10461893885646582, |
| "learning_rate": 1.9698894536073992e-05, |
| "loss": 0.1259, |
| "mean_token_accuracy": 0.9565521538257599, |
| "step": 6770 |
| }, |
| { |
| "epoch": 2.6068885895709064, |
| "grad_norm": 0.10336641235090573, |
| "learning_rate": 1.966345794743521e-05, |
| "loss": 0.1278, |
| "mean_token_accuracy": 0.9559028029441834, |
| "step": 6775 |
| }, |
| { |
| "epoch": 2.608812776601886, |
| "grad_norm": 0.1101183754720948, |
| "learning_rate": 1.9628043467084972e-05, |
| "loss": 0.1275, |
| "mean_token_accuracy": 0.9560283601284028, |
| "step": 6780 |
| }, |
| { |
| "epoch": 2.6107369636328652, |
| "grad_norm": 0.1112827673424888, |
| "learning_rate": 1.9592651194933864e-05, |
| "loss": 0.1263, |
| "mean_token_accuracy": 0.9564993560314179, |
| "step": 6785 |
| }, |
| { |
| "epoch": 2.6126611506638446, |
| "grad_norm": 0.11076839280052687, |
| "learning_rate": 1.9557281230829842e-05, |
| "loss": 0.1283, |
| "mean_token_accuracy": 0.9558199286460877, |
| "step": 6790 |
| }, |
| { |
| "epoch": 2.614585337694824, |
| "grad_norm": 0.10513416421018357, |
| "learning_rate": 1.952193367455789e-05, |
| "loss": 0.1267, |
| "mean_token_accuracy": 0.9565446972846985, |
| "step": 6795 |
| }, |
| { |
| "epoch": 2.6165095247258034, |
| "grad_norm": 0.10570491472709956, |
| "learning_rate": 1.9486608625839796e-05, |
| "loss": 0.1255, |
| "mean_token_accuracy": 0.9567613065242767, |
| "step": 6800 |
| }, |
| { |
| "epoch": 2.618433711756783, |
| "grad_norm": 0.10932952586639395, |
| "learning_rate": 1.9451306184333866e-05, |
| "loss": 0.1268, |
| "mean_token_accuracy": 0.9563915014266968, |
| "step": 6805 |
| }, |
| { |
| "epoch": 2.6203578987877623, |
| "grad_norm": 0.10423882458759645, |
| "learning_rate": 1.941602644963459e-05, |
| "loss": 0.1241, |
| "mean_token_accuracy": 0.9574026703834534, |
| "step": 6810 |
| }, |
| { |
| "epoch": 2.6222820858187417, |
| "grad_norm": 0.11217501853304582, |
| "learning_rate": 1.938076952127243e-05, |
| "loss": 0.1257, |
| "mean_token_accuracy": 0.9566045343875885, |
| "step": 6815 |
| }, |
| { |
| "epoch": 2.624206272849721, |
| "grad_norm": 0.10654173637076583, |
| "learning_rate": 1.934553549871349e-05, |
| "loss": 0.1264, |
| "mean_token_accuracy": 0.9563957929611206, |
| "step": 6820 |
| }, |
| { |
| "epoch": 2.6261304598807005, |
| "grad_norm": 0.10847404127449174, |
| "learning_rate": 1.931032448135925e-05, |
| "loss": 0.1256, |
| "mean_token_accuracy": 0.9567358374595643, |
| "step": 6825 |
| }, |
| { |
| "epoch": 2.62805464691168, |
| "grad_norm": 0.10767105156625693, |
| "learning_rate": 1.9275136568546308e-05, |
| "loss": 0.1269, |
| "mean_token_accuracy": 0.9563683986663818, |
| "step": 6830 |
| }, |
| { |
| "epoch": 2.6299788339426593, |
| "grad_norm": 0.11103186074935593, |
| "learning_rate": 1.923997185954607e-05, |
| "loss": 0.1305, |
| "mean_token_accuracy": 0.9552824318408966, |
| "step": 6835 |
| }, |
| { |
| "epoch": 2.6319030209736387, |
| "grad_norm": 0.11115348877053942, |
| "learning_rate": 1.920483045356446e-05, |
| "loss": 0.1256, |
| "mean_token_accuracy": 0.9568055391311645, |
| "step": 6840 |
| }, |
| { |
| "epoch": 2.633827208004618, |
| "grad_norm": 0.11276780856069071, |
| "learning_rate": 1.9169712449741688e-05, |
| "loss": 0.1279, |
| "mean_token_accuracy": 0.9558255076408386, |
| "step": 6845 |
| }, |
| { |
| "epoch": 2.6357513950355975, |
| "grad_norm": 0.11060901419193486, |
| "learning_rate": 1.9134617947151938e-05, |
| "loss": 0.1276, |
| "mean_token_accuracy": 0.9561234533786773, |
| "step": 6850 |
| }, |
| { |
| "epoch": 2.637675582066577, |
| "grad_norm": 0.10861450011475303, |
| "learning_rate": 1.9099547044803084e-05, |
| "loss": 0.1275, |
| "mean_token_accuracy": 0.9562245905399323, |
| "step": 6855 |
| }, |
| { |
| "epoch": 2.6395997690975563, |
| "grad_norm": 0.10678156519802066, |
| "learning_rate": 1.9064499841636413e-05, |
| "loss": 0.1272, |
| "mean_token_accuracy": 0.9562100887298584, |
| "step": 6860 |
| }, |
| { |
| "epoch": 2.6415239561285357, |
| "grad_norm": 0.10875833039075991, |
| "learning_rate": 1.9029476436526374e-05, |
| "loss": 0.1261, |
| "mean_token_accuracy": 0.956655991077423, |
| "step": 6865 |
| }, |
| { |
| "epoch": 2.643448143159515, |
| "grad_norm": 0.11511155356237994, |
| "learning_rate": 1.8994476928280246e-05, |
| "loss": 0.1272, |
| "mean_token_accuracy": 0.9563781440258026, |
| "step": 6870 |
| }, |
| { |
| "epoch": 2.6453723301904946, |
| "grad_norm": 0.10591073491980244, |
| "learning_rate": 1.8959501415637935e-05, |
| "loss": 0.1267, |
| "mean_token_accuracy": 0.9563282132148743, |
| "step": 6875 |
| }, |
| { |
| "epoch": 2.647296517221474, |
| "grad_norm": 0.10860092720760628, |
| "learning_rate": 1.8924549997271614e-05, |
| "loss": 0.1289, |
| "mean_token_accuracy": 0.9557060241699219, |
| "step": 6880 |
| }, |
| { |
| "epoch": 2.6492207042524534, |
| "grad_norm": 0.10503352307327395, |
| "learning_rate": 1.888962277178548e-05, |
| "loss": 0.1271, |
| "mean_token_accuracy": 0.9563852548599243, |
| "step": 6885 |
| }, |
| { |
| "epoch": 2.651144891283433, |
| "grad_norm": 0.11013776759663174, |
| "learning_rate": 1.8854719837715513e-05, |
| "loss": 0.1266, |
| "mean_token_accuracy": 0.9563451290130616, |
| "step": 6890 |
| }, |
| { |
| "epoch": 2.653069078314412, |
| "grad_norm": 0.10670392442749238, |
| "learning_rate": 1.8819841293529135e-05, |
| "loss": 0.1293, |
| "mean_token_accuracy": 0.9553812265396118, |
| "step": 6895 |
| }, |
| { |
| "epoch": 2.6549932653453916, |
| "grad_norm": 0.10979631553488628, |
| "learning_rate": 1.8784987237624958e-05, |
| "loss": 0.13, |
| "mean_token_accuracy": 0.9554021894931793, |
| "step": 6900 |
| }, |
| { |
| "epoch": 2.656917452376371, |
| "grad_norm": 0.10650971195174591, |
| "learning_rate": 1.8750157768332515e-05, |
| "loss": 0.1255, |
| "mean_token_accuracy": 0.9567661046981811, |
| "step": 6905 |
| }, |
| { |
| "epoch": 2.6588416394073504, |
| "grad_norm": 0.10569859297963535, |
| "learning_rate": 1.8715352983911987e-05, |
| "loss": 0.1279, |
| "mean_token_accuracy": 0.9560502111911774, |
| "step": 6910 |
| }, |
| { |
| "epoch": 2.66076582643833, |
| "grad_norm": 0.10535601083717823, |
| "learning_rate": 1.868057298255389e-05, |
| "loss": 0.1269, |
| "mean_token_accuracy": 0.9561728775501251, |
| "step": 6915 |
| }, |
| { |
| "epoch": 2.6626900134693092, |
| "grad_norm": 0.10643100279970104, |
| "learning_rate": 1.8645817862378857e-05, |
| "loss": 0.1264, |
| "mean_token_accuracy": 0.9563452661037445, |
| "step": 6920 |
| }, |
| { |
| "epoch": 2.6646142005002886, |
| "grad_norm": 0.10962944725677765, |
| "learning_rate": 1.8611087721437287e-05, |
| "loss": 0.1282, |
| "mean_token_accuracy": 0.9559251666069031, |
| "step": 6925 |
| }, |
| { |
| "epoch": 2.666538387531268, |
| "grad_norm": 0.10546649694289567, |
| "learning_rate": 1.8576382657709128e-05, |
| "loss": 0.1248, |
| "mean_token_accuracy": 0.9570260524749756, |
| "step": 6930 |
| }, |
| { |
| "epoch": 2.6684625745622474, |
| "grad_norm": 0.10294778802052634, |
| "learning_rate": 1.8541702769103586e-05, |
| "loss": 0.1284, |
| "mean_token_accuracy": 0.9561637580394745, |
| "step": 6935 |
| }, |
| { |
| "epoch": 2.670386761593227, |
| "grad_norm": 0.10709037535879702, |
| "learning_rate": 1.8507048153458852e-05, |
| "loss": 0.1284, |
| "mean_token_accuracy": 0.9558945298194885, |
| "step": 6940 |
| }, |
| { |
| "epoch": 2.6723109486242063, |
| "grad_norm": 0.11244918165012138, |
| "learning_rate": 1.8472418908541778e-05, |
| "loss": 0.125, |
| "mean_token_accuracy": 0.9568565487861633, |
| "step": 6945 |
| }, |
| { |
| "epoch": 2.6742351356551857, |
| "grad_norm": 0.10630468922918597, |
| "learning_rate": 1.843781513204767e-05, |
| "loss": 0.127, |
| "mean_token_accuracy": 0.956307715177536, |
| "step": 6950 |
| }, |
| { |
| "epoch": 2.676159322686165, |
| "grad_norm": 0.10419951897148877, |
| "learning_rate": 1.8403236921599987e-05, |
| "loss": 0.1293, |
| "mean_token_accuracy": 0.9554457426071167, |
| "step": 6955 |
| }, |
| { |
| "epoch": 2.6780835097171445, |
| "grad_norm": 0.11218125498578436, |
| "learning_rate": 1.836868437475006e-05, |
| "loss": 0.1277, |
| "mean_token_accuracy": 0.9557630300521851, |
| "step": 6960 |
| }, |
| { |
| "epoch": 2.680007696748124, |
| "grad_norm": 0.10948606028667217, |
| "learning_rate": 1.8334157588976784e-05, |
| "loss": 0.1266, |
| "mean_token_accuracy": 0.9562645971775054, |
| "step": 6965 |
| }, |
| { |
| "epoch": 2.6819318837791033, |
| "grad_norm": 0.10772262444167868, |
| "learning_rate": 1.8299656661686438e-05, |
| "loss": 0.1248, |
| "mean_token_accuracy": 0.9570554375648499, |
| "step": 6970 |
| }, |
| { |
| "epoch": 2.6838560708100827, |
| "grad_norm": 0.10960968242808095, |
| "learning_rate": 1.8265181690212292e-05, |
| "loss": 0.1267, |
| "mean_token_accuracy": 0.9564591765403747, |
| "step": 6975 |
| }, |
| { |
| "epoch": 2.685780257841062, |
| "grad_norm": 0.1030152420845438, |
| "learning_rate": 1.8230732771814425e-05, |
| "loss": 0.1293, |
| "mean_token_accuracy": 0.9554629743099212, |
| "step": 6980 |
| }, |
| { |
| "epoch": 2.6877044448720415, |
| "grad_norm": 0.1147631862622607, |
| "learning_rate": 1.819631000367941e-05, |
| "loss": 0.1289, |
| "mean_token_accuracy": 0.9556432604789734, |
| "step": 6985 |
| }, |
| { |
| "epoch": 2.689628631903021, |
| "grad_norm": 0.10355133931423409, |
| "learning_rate": 1.8161913482920028e-05, |
| "loss": 0.1248, |
| "mean_token_accuracy": 0.9568826794624329, |
| "step": 6990 |
| }, |
| { |
| "epoch": 2.6915528189340003, |
| "grad_norm": 0.10616830597210221, |
| "learning_rate": 1.8127543306575034e-05, |
| "loss": 0.1282, |
| "mean_token_accuracy": 0.9558695256710052, |
| "step": 6995 |
| }, |
| { |
| "epoch": 2.6934770059649797, |
| "grad_norm": 0.11136236222785284, |
| "learning_rate": 1.8093199571608836e-05, |
| "loss": 0.1261, |
| "mean_token_accuracy": 0.9567158997058869, |
| "step": 7000 |
| }, |
| { |
| "epoch": 2.695401192995959, |
| "grad_norm": 0.11542457168063223, |
| "learning_rate": 1.805888237491129e-05, |
| "loss": 0.1255, |
| "mean_token_accuracy": 0.9568669140338898, |
| "step": 7005 |
| }, |
| { |
| "epoch": 2.6973253800269386, |
| "grad_norm": 0.10964388423684117, |
| "learning_rate": 1.802459181329732e-05, |
| "loss": 0.1232, |
| "mean_token_accuracy": 0.9577533662319183, |
| "step": 7010 |
| }, |
| { |
| "epoch": 2.699249567057918, |
| "grad_norm": 0.11697708785883097, |
| "learning_rate": 1.799032798350676e-05, |
| "loss": 0.1252, |
| "mean_token_accuracy": 0.9567424178123474, |
| "step": 7015 |
| }, |
| { |
| "epoch": 2.7011737540888974, |
| "grad_norm": 0.10669513491757858, |
| "learning_rate": 1.7956090982204015e-05, |
| "loss": 0.1266, |
| "mean_token_accuracy": 0.95643470287323, |
| "step": 7020 |
| }, |
| { |
| "epoch": 2.703097941119877, |
| "grad_norm": 0.10777973722640331, |
| "learning_rate": 1.7921880905977788e-05, |
| "loss": 0.1271, |
| "mean_token_accuracy": 0.9562375128269196, |
| "step": 7025 |
| }, |
| { |
| "epoch": 2.705022128150856, |
| "grad_norm": 0.10701533180688344, |
| "learning_rate": 1.7887697851340825e-05, |
| "loss": 0.1285, |
| "mean_token_accuracy": 0.9558044135570526, |
| "step": 7030 |
| }, |
| { |
| "epoch": 2.7069463151818356, |
| "grad_norm": 0.10331099074883404, |
| "learning_rate": 1.785354191472965e-05, |
| "loss": 0.1272, |
| "mean_token_accuracy": 0.956113600730896, |
| "step": 7035 |
| }, |
| { |
| "epoch": 2.708870502212815, |
| "grad_norm": 0.11193788287349637, |
| "learning_rate": 1.7819413192504276e-05, |
| "loss": 0.1255, |
| "mean_token_accuracy": 0.9568765163421631, |
| "step": 7040 |
| }, |
| { |
| "epoch": 2.7107946892437944, |
| "grad_norm": 0.10723666773916213, |
| "learning_rate": 1.778531178094795e-05, |
| "loss": 0.1255, |
| "mean_token_accuracy": 0.9566305696964263, |
| "step": 7045 |
| }, |
| { |
| "epoch": 2.712718876274774, |
| "grad_norm": 0.10400150310051162, |
| "learning_rate": 1.775123777626685e-05, |
| "loss": 0.1271, |
| "mean_token_accuracy": 0.9561739981174469, |
| "step": 7050 |
| }, |
| { |
| "epoch": 2.7146430633057532, |
| "grad_norm": 0.10733826918036667, |
| "learning_rate": 1.771719127458985e-05, |
| "loss": 0.1271, |
| "mean_token_accuracy": 0.9562620878219604, |
| "step": 7055 |
| }, |
| { |
| "epoch": 2.7165672503367326, |
| "grad_norm": 0.12644344138008767, |
| "learning_rate": 1.7683172371968227e-05, |
| "loss": 0.1279, |
| "mean_token_accuracy": 0.9559725463390351, |
| "step": 7060 |
| }, |
| { |
| "epoch": 2.718491437367712, |
| "grad_norm": 0.10536033765819473, |
| "learning_rate": 1.7649181164375395e-05, |
| "loss": 0.1272, |
| "mean_token_accuracy": 0.9562171399593353, |
| "step": 7065 |
| }, |
| { |
| "epoch": 2.7204156243986914, |
| "grad_norm": 0.10884864080619998, |
| "learning_rate": 1.761521774770665e-05, |
| "loss": 0.1294, |
| "mean_token_accuracy": 0.955652940273285, |
| "step": 7070 |
| }, |
| { |
| "epoch": 2.722339811429671, |
| "grad_norm": 0.10723787813049393, |
| "learning_rate": 1.758128221777885e-05, |
| "loss": 0.1241, |
| "mean_token_accuracy": 0.9573346912860871, |
| "step": 7075 |
| }, |
| { |
| "epoch": 2.7242639984606503, |
| "grad_norm": 0.10767449085985573, |
| "learning_rate": 1.754737467033023e-05, |
| "loss": 0.1292, |
| "mean_token_accuracy": 0.9555230498313904, |
| "step": 7080 |
| }, |
| { |
| "epoch": 2.7261881854916297, |
| "grad_norm": 0.10895307917278084, |
| "learning_rate": 1.751349520102003e-05, |
| "loss": 0.1249, |
| "mean_token_accuracy": 0.9571972370147706, |
| "step": 7085 |
| }, |
| { |
| "epoch": 2.728112372522609, |
| "grad_norm": 0.10650045123221602, |
| "learning_rate": 1.7479643905428316e-05, |
| "loss": 0.1253, |
| "mean_token_accuracy": 0.9568059027194977, |
| "step": 7090 |
| }, |
| { |
| "epoch": 2.7300365595535885, |
| "grad_norm": 0.10928759118436902, |
| "learning_rate": 1.7445820879055647e-05, |
| "loss": 0.128, |
| "mean_token_accuracy": 0.955977874994278, |
| "step": 7095 |
| }, |
| { |
| "epoch": 2.731960746584568, |
| "grad_norm": 0.1073383269536675, |
| "learning_rate": 1.7412026217322836e-05, |
| "loss": 0.1259, |
| "mean_token_accuracy": 0.9569141745567322, |
| "step": 7100 |
| }, |
| { |
| "epoch": 2.7338849336155473, |
| "grad_norm": 0.10966952979678361, |
| "learning_rate": 1.737826001557068e-05, |
| "loss": 0.1271, |
| "mean_token_accuracy": 0.9563021123409271, |
| "step": 7105 |
| }, |
| { |
| "epoch": 2.7358091206465267, |
| "grad_norm": 0.10526562498309563, |
| "learning_rate": 1.7344522369059692e-05, |
| "loss": 0.1265, |
| "mean_token_accuracy": 0.9564592182636261, |
| "step": 7110 |
| }, |
| { |
| "epoch": 2.737733307677506, |
| "grad_norm": 0.11031935892748677, |
| "learning_rate": 1.73108133729698e-05, |
| "loss": 0.1262, |
| "mean_token_accuracy": 0.956552791595459, |
| "step": 7115 |
| }, |
| { |
| "epoch": 2.7396574947084855, |
| "grad_norm": 0.10418193840163821, |
| "learning_rate": 1.7277133122400114e-05, |
| "loss": 0.1277, |
| "mean_token_accuracy": 0.9562981486320495, |
| "step": 7120 |
| }, |
| { |
| "epoch": 2.741581681739465, |
| "grad_norm": 0.11099596444843954, |
| "learning_rate": 1.724348171236866e-05, |
| "loss": 0.1268, |
| "mean_token_accuracy": 0.9564831733703614, |
| "step": 7125 |
| }, |
| { |
| "epoch": 2.7435058687704443, |
| "grad_norm": 0.10204613720474413, |
| "learning_rate": 1.7209859237812097e-05, |
| "loss": 0.1267, |
| "mean_token_accuracy": 0.9565585315227508, |
| "step": 7130 |
| }, |
| { |
| "epoch": 2.7454300558014237, |
| "grad_norm": 0.10686056931951905, |
| "learning_rate": 1.717626579358545e-05, |
| "loss": 0.1277, |
| "mean_token_accuracy": 0.9559927165508271, |
| "step": 7135 |
| }, |
| { |
| "epoch": 2.747354242832403, |
| "grad_norm": 0.10665318681343526, |
| "learning_rate": 1.7142701474461826e-05, |
| "loss": 0.1269, |
| "mean_token_accuracy": 0.9562374651432037, |
| "step": 7140 |
| }, |
| { |
| "epoch": 2.7492784298633826, |
| "grad_norm": 0.10557591127652506, |
| "learning_rate": 1.7109166375132196e-05, |
| "loss": 0.1272, |
| "mean_token_accuracy": 0.956173038482666, |
| "step": 7145 |
| }, |
| { |
| "epoch": 2.7512026168943624, |
| "grad_norm": 0.11052732032634138, |
| "learning_rate": 1.7075660590205067e-05, |
| "loss": 0.1274, |
| "mean_token_accuracy": 0.9560621798038482, |
| "step": 7150 |
| }, |
| { |
| "epoch": 2.7531268039253414, |
| "grad_norm": 0.1087942108070962, |
| "learning_rate": 1.704218421420627e-05, |
| "loss": 0.1272, |
| "mean_token_accuracy": 0.9559900820255279, |
| "step": 7155 |
| }, |
| { |
| "epoch": 2.755050990956321, |
| "grad_norm": 0.10447864519398838, |
| "learning_rate": 1.7008737341578646e-05, |
| "loss": 0.1263, |
| "mean_token_accuracy": 0.9565044999122619, |
| "step": 7160 |
| }, |
| { |
| "epoch": 2.7569751779873, |
| "grad_norm": 0.10552092323762612, |
| "learning_rate": 1.697532006668182e-05, |
| "loss": 0.1275, |
| "mean_token_accuracy": 0.9559858977794647, |
| "step": 7165 |
| }, |
| { |
| "epoch": 2.75889936501828, |
| "grad_norm": 0.11015301455853907, |
| "learning_rate": 1.6941932483791913e-05, |
| "loss": 0.1274, |
| "mean_token_accuracy": 0.9562235772609711, |
| "step": 7170 |
| }, |
| { |
| "epoch": 2.760823552049259, |
| "grad_norm": 0.10941521089776884, |
| "learning_rate": 1.6908574687101273e-05, |
| "loss": 0.1227, |
| "mean_token_accuracy": 0.9576989531517028, |
| "step": 7175 |
| }, |
| { |
| "epoch": 2.762747739080239, |
| "grad_norm": 0.10782441194041115, |
| "learning_rate": 1.6875246770718202e-05, |
| "loss": 0.1256, |
| "mean_token_accuracy": 0.9567393243312836, |
| "step": 7180 |
| }, |
| { |
| "epoch": 2.764671926111218, |
| "grad_norm": 0.10542593214023584, |
| "learning_rate": 1.6841948828666742e-05, |
| "loss": 0.1233, |
| "mean_token_accuracy": 0.9576320827007294, |
| "step": 7185 |
| }, |
| { |
| "epoch": 2.7665961131421977, |
| "grad_norm": 0.10659113484500546, |
| "learning_rate": 1.680868095488634e-05, |
| "loss": 0.1258, |
| "mean_token_accuracy": 0.9567969083786011, |
| "step": 7190 |
| }, |
| { |
| "epoch": 2.7685203001731766, |
| "grad_norm": 0.11071007338136814, |
| "learning_rate": 1.6775443243231636e-05, |
| "loss": 0.1274, |
| "mean_token_accuracy": 0.9560856699943543, |
| "step": 7195 |
| }, |
| { |
| "epoch": 2.7704444872041565, |
| "grad_norm": 0.10800226795369741, |
| "learning_rate": 1.6742235787472148e-05, |
| "loss": 0.1259, |
| "mean_token_accuracy": 0.9565568685531616, |
| "step": 7200 |
| }, |
| { |
| "epoch": 2.7723686742351354, |
| "grad_norm": 0.10574854506306638, |
| "learning_rate": 1.670905868129208e-05, |
| "loss": 0.1247, |
| "mean_token_accuracy": 0.9571392416954041, |
| "step": 7205 |
| }, |
| { |
| "epoch": 2.7742928612661153, |
| "grad_norm": 0.10691098685310134, |
| "learning_rate": 1.667591201828997e-05, |
| "loss": 0.1268, |
| "mean_token_accuracy": 0.95638587474823, |
| "step": 7210 |
| }, |
| { |
| "epoch": 2.7762170482970943, |
| "grad_norm": 0.10540967489714154, |
| "learning_rate": 1.6642795891978496e-05, |
| "loss": 0.126, |
| "mean_token_accuracy": 0.9566452860832214, |
| "step": 7215 |
| }, |
| { |
| "epoch": 2.778141235328074, |
| "grad_norm": 0.10674759101707822, |
| "learning_rate": 1.6609710395784193e-05, |
| "loss": 0.1257, |
| "mean_token_accuracy": 0.9566307127475738, |
| "step": 7220 |
| }, |
| { |
| "epoch": 2.780065422359053, |
| "grad_norm": 0.11002060431089043, |
| "learning_rate": 1.657665562304715e-05, |
| "loss": 0.123, |
| "mean_token_accuracy": 0.9575728833675384, |
| "step": 7225 |
| }, |
| { |
| "epoch": 2.781989609390033, |
| "grad_norm": 0.10576961272518313, |
| "learning_rate": 1.654363166702082e-05, |
| "loss": 0.1246, |
| "mean_token_accuracy": 0.9570701956748963, |
| "step": 7230 |
| }, |
| { |
| "epoch": 2.783913796421012, |
| "grad_norm": 0.11262475939757777, |
| "learning_rate": 1.6510638620871682e-05, |
| "loss": 0.1275, |
| "mean_token_accuracy": 0.9560605108737945, |
| "step": 7235 |
| }, |
| { |
| "epoch": 2.7858379834519917, |
| "grad_norm": 0.11127630384550656, |
| "learning_rate": 1.6477676577679042e-05, |
| "loss": 0.1262, |
| "mean_token_accuracy": 0.956292325258255, |
| "step": 7240 |
| }, |
| { |
| "epoch": 2.7877621704829707, |
| "grad_norm": 0.10658040995481745, |
| "learning_rate": 1.6444745630434705e-05, |
| "loss": 0.1254, |
| "mean_token_accuracy": 0.9569096446037293, |
| "step": 7245 |
| }, |
| { |
| "epoch": 2.7896863575139506, |
| "grad_norm": 0.11552542983328662, |
| "learning_rate": 1.6411845872042792e-05, |
| "loss": 0.1254, |
| "mean_token_accuracy": 0.9569571316242218, |
| "step": 7250 |
| }, |
| { |
| "epoch": 2.7916105445449295, |
| "grad_norm": 0.10708675669840825, |
| "learning_rate": 1.63789773953194e-05, |
| "loss": 0.1247, |
| "mean_token_accuracy": 0.957468980550766, |
| "step": 7255 |
| }, |
| { |
| "epoch": 2.7935347315759094, |
| "grad_norm": 0.10449931858438566, |
| "learning_rate": 1.6346140292992405e-05, |
| "loss": 0.1272, |
| "mean_token_accuracy": 0.9563921928405762, |
| "step": 7260 |
| }, |
| { |
| "epoch": 2.7954589186068883, |
| "grad_norm": 0.10809986665414069, |
| "learning_rate": 1.6313334657701146e-05, |
| "loss": 0.1285, |
| "mean_token_accuracy": 0.9558048605918884, |
| "step": 7265 |
| }, |
| { |
| "epoch": 2.797383105637868, |
| "grad_norm": 0.10739087950506812, |
| "learning_rate": 1.628056058199618e-05, |
| "loss": 0.125, |
| "mean_token_accuracy": 0.9568708956241607, |
| "step": 7270 |
| }, |
| { |
| "epoch": 2.799307292668847, |
| "grad_norm": 0.10402472359624435, |
| "learning_rate": 1.6247818158339053e-05, |
| "loss": 0.1263, |
| "mean_token_accuracy": 0.9565973937511444, |
| "step": 7275 |
| }, |
| { |
| "epoch": 2.801231479699827, |
| "grad_norm": 0.10522619304351352, |
| "learning_rate": 1.621510747910202e-05, |
| "loss": 0.1256, |
| "mean_token_accuracy": 0.9567424237728119, |
| "step": 7280 |
| }, |
| { |
| "epoch": 2.803155666730806, |
| "grad_norm": 0.1031627194210059, |
| "learning_rate": 1.6182428636567746e-05, |
| "loss": 0.126, |
| "mean_token_accuracy": 0.9567280828952789, |
| "step": 7285 |
| }, |
| { |
| "epoch": 2.805079853761786, |
| "grad_norm": 0.10885829619538895, |
| "learning_rate": 1.6149781722929112e-05, |
| "loss": 0.1272, |
| "mean_token_accuracy": 0.9562729597091675, |
| "step": 7290 |
| }, |
| { |
| "epoch": 2.807004040792765, |
| "grad_norm": 0.11357013093667029, |
| "learning_rate": 1.6117166830288894e-05, |
| "loss": 0.1238, |
| "mean_token_accuracy": 0.9574274599552155, |
| "step": 7295 |
| }, |
| { |
| "epoch": 2.8089282278237446, |
| "grad_norm": 0.10447156358180851, |
| "learning_rate": 1.608458405065955e-05, |
| "loss": 0.1259, |
| "mean_token_accuracy": 0.9568834602832794, |
| "step": 7300 |
| }, |
| { |
| "epoch": 2.810852414854724, |
| "grad_norm": 0.102388962981238, |
| "learning_rate": 1.6052033475962953e-05, |
| "loss": 0.1273, |
| "mean_token_accuracy": 0.9563150763511657, |
| "step": 7305 |
| }, |
| { |
| "epoch": 2.8127766018857034, |
| "grad_norm": 0.10645925025513615, |
| "learning_rate": 1.6019515198030078e-05, |
| "loss": 0.1268, |
| "mean_token_accuracy": 0.9563553273677826, |
| "step": 7310 |
| }, |
| { |
| "epoch": 2.814700788916683, |
| "grad_norm": 0.11653208782578775, |
| "learning_rate": 1.5987029308600822e-05, |
| "loss": 0.1268, |
| "mean_token_accuracy": 0.956266438961029, |
| "step": 7315 |
| }, |
| { |
| "epoch": 2.8166249759476623, |
| "grad_norm": 0.10715577285113691, |
| "learning_rate": 1.5954575899323703e-05, |
| "loss": 0.1249, |
| "mean_token_accuracy": 0.9570776879787445, |
| "step": 7320 |
| }, |
| { |
| "epoch": 2.8185491629786417, |
| "grad_norm": 0.10736007734155249, |
| "learning_rate": 1.5922155061755602e-05, |
| "loss": 0.1283, |
| "mean_token_accuracy": 0.9558275938034058, |
| "step": 7325 |
| }, |
| { |
| "epoch": 2.820473350009621, |
| "grad_norm": 0.11065163771448285, |
| "learning_rate": 1.5889766887361485e-05, |
| "loss": 0.1266, |
| "mean_token_accuracy": 0.9565343916416168, |
| "step": 7330 |
| }, |
| { |
| "epoch": 2.8223975370406005, |
| "grad_norm": 0.11471891290397622, |
| "learning_rate": 1.585741146751421e-05, |
| "loss": 0.1232, |
| "mean_token_accuracy": 0.9576056897640228, |
| "step": 7335 |
| }, |
| { |
| "epoch": 2.82432172407158, |
| "grad_norm": 0.11104147050631989, |
| "learning_rate": 1.5825088893494208e-05, |
| "loss": 0.1275, |
| "mean_token_accuracy": 0.9562883615493775, |
| "step": 7340 |
| }, |
| { |
| "epoch": 2.8262459111025593, |
| "grad_norm": 0.11255149427553987, |
| "learning_rate": 1.579279925648926e-05, |
| "loss": 0.1203, |
| "mean_token_accuracy": 0.9585840284824372, |
| "step": 7345 |
| }, |
| { |
| "epoch": 2.8281700981335387, |
| "grad_norm": 0.10481594755119765, |
| "learning_rate": 1.5760542647594192e-05, |
| "loss": 0.1246, |
| "mean_token_accuracy": 0.9568839311599732, |
| "step": 7350 |
| }, |
| { |
| "epoch": 2.830094285164518, |
| "grad_norm": 0.11130912788500391, |
| "learning_rate": 1.5728319157810693e-05, |
| "loss": 0.1251, |
| "mean_token_accuracy": 0.9572184145450592, |
| "step": 7355 |
| }, |
| { |
| "epoch": 2.8320184721954975, |
| "grad_norm": 0.109805966713199, |
| "learning_rate": 1.569612887804699e-05, |
| "loss": 0.126, |
| "mean_token_accuracy": 0.9567272245883942, |
| "step": 7360 |
| }, |
| { |
| "epoch": 2.833942659226477, |
| "grad_norm": 0.10635917964214982, |
| "learning_rate": 1.5663971899117635e-05, |
| "loss": 0.1249, |
| "mean_token_accuracy": 0.9571217775344849, |
| "step": 7365 |
| }, |
| { |
| "epoch": 2.8358668462574563, |
| "grad_norm": 0.10700521407852201, |
| "learning_rate": 1.563184831174321e-05, |
| "loss": 0.124, |
| "mean_token_accuracy": 0.9574501037597656, |
| "step": 7370 |
| }, |
| { |
| "epoch": 2.8377910332884357, |
| "grad_norm": 0.10854668460509331, |
| "learning_rate": 1.5599758206550114e-05, |
| "loss": 0.125, |
| "mean_token_accuracy": 0.9568875133991241, |
| "step": 7375 |
| }, |
| { |
| "epoch": 2.839715220319415, |
| "grad_norm": 0.10566738417028762, |
| "learning_rate": 1.5567701674070293e-05, |
| "loss": 0.126, |
| "mean_token_accuracy": 0.956588226556778, |
| "step": 7380 |
| }, |
| { |
| "epoch": 2.8416394073503946, |
| "grad_norm": 0.10208895046477207, |
| "learning_rate": 1.553567880474095e-05, |
| "loss": 0.124, |
| "mean_token_accuracy": 0.9574152231216431, |
| "step": 7385 |
| }, |
| { |
| "epoch": 2.843563594381374, |
| "grad_norm": 0.10397090537389687, |
| "learning_rate": 1.5503689688904343e-05, |
| "loss": 0.1227, |
| "mean_token_accuracy": 0.9577645778656005, |
| "step": 7390 |
| }, |
| { |
| "epoch": 2.8454877814123534, |
| "grad_norm": 0.10873053971174768, |
| "learning_rate": 1.5471734416807487e-05, |
| "loss": 0.1243, |
| "mean_token_accuracy": 0.957385802268982, |
| "step": 7395 |
| }, |
| { |
| "epoch": 2.8474119684433328, |
| "grad_norm": 0.10472998947209755, |
| "learning_rate": 1.5439813078601933e-05, |
| "loss": 0.1259, |
| "mean_token_accuracy": 0.9567414045333862, |
| "step": 7400 |
| }, |
| { |
| "epoch": 2.849336155474312, |
| "grad_norm": 0.10409222392640442, |
| "learning_rate": 1.5407925764343494e-05, |
| "loss": 0.1259, |
| "mean_token_accuracy": 0.9568287134170532, |
| "step": 7405 |
| }, |
| { |
| "epoch": 2.8512603425052916, |
| "grad_norm": 0.10556065666725871, |
| "learning_rate": 1.5376072563992006e-05, |
| "loss": 0.1271, |
| "mean_token_accuracy": 0.9562435626983643, |
| "step": 7410 |
| }, |
| { |
| "epoch": 2.853184529536271, |
| "grad_norm": 0.10452247547749552, |
| "learning_rate": 1.5344253567411033e-05, |
| "loss": 0.1248, |
| "mean_token_accuracy": 0.9570737600326538, |
| "step": 7415 |
| }, |
| { |
| "epoch": 2.8551087165672504, |
| "grad_norm": 0.1087680963204854, |
| "learning_rate": 1.5312468864367668e-05, |
| "loss": 0.1256, |
| "mean_token_accuracy": 0.9568704783916473, |
| "step": 7420 |
| }, |
| { |
| "epoch": 2.85703290359823, |
| "grad_norm": 0.10578472470541685, |
| "learning_rate": 1.5280718544532247e-05, |
| "loss": 0.1242, |
| "mean_token_accuracy": 0.9572508156299591, |
| "step": 7425 |
| }, |
| { |
| "epoch": 2.858957090629209, |
| "grad_norm": 0.1058352653218143, |
| "learning_rate": 1.5249002697478121e-05, |
| "loss": 0.1242, |
| "mean_token_accuracy": 0.9571494102478028, |
| "step": 7430 |
| }, |
| { |
| "epoch": 2.8608812776601886, |
| "grad_norm": 0.11115611407235973, |
| "learning_rate": 1.5217321412681357e-05, |
| "loss": 0.1239, |
| "mean_token_accuracy": 0.9573991417884826, |
| "step": 7435 |
| }, |
| { |
| "epoch": 2.862805464691168, |
| "grad_norm": 0.1029632486773793, |
| "learning_rate": 1.5185674779520554e-05, |
| "loss": 0.1243, |
| "mean_token_accuracy": 0.9571222126483917, |
| "step": 7440 |
| }, |
| { |
| "epoch": 2.8647296517221474, |
| "grad_norm": 0.10800954899956418, |
| "learning_rate": 1.515406288727651e-05, |
| "loss": 0.1271, |
| "mean_token_accuracy": 0.9563605070114136, |
| "step": 7445 |
| }, |
| { |
| "epoch": 2.866653838753127, |
| "grad_norm": 0.10476947230916332, |
| "learning_rate": 1.5122485825132043e-05, |
| "loss": 0.1251, |
| "mean_token_accuracy": 0.9568899631500244, |
| "step": 7450 |
| }, |
| { |
| "epoch": 2.8685780257841063, |
| "grad_norm": 0.11056280552061581, |
| "learning_rate": 1.509094368217171e-05, |
| "loss": 0.1244, |
| "mean_token_accuracy": 0.9571887671947479, |
| "step": 7455 |
| }, |
| { |
| "epoch": 2.8705022128150857, |
| "grad_norm": 0.10978622017949022, |
| "learning_rate": 1.5059436547381527e-05, |
| "loss": 0.1252, |
| "mean_token_accuracy": 0.9570884883403779, |
| "step": 7460 |
| }, |
| { |
| "epoch": 2.872426399846065, |
| "grad_norm": 0.11231449620491728, |
| "learning_rate": 1.5027964509648776e-05, |
| "loss": 0.1264, |
| "mean_token_accuracy": 0.9565370500087738, |
| "step": 7465 |
| }, |
| { |
| "epoch": 2.8743505868770445, |
| "grad_norm": 0.10649846229397224, |
| "learning_rate": 1.4996527657761719e-05, |
| "loss": 0.1266, |
| "mean_token_accuracy": 0.9563987612724304, |
| "step": 7470 |
| }, |
| { |
| "epoch": 2.876274773908024, |
| "grad_norm": 0.10580010864087648, |
| "learning_rate": 1.496512608040933e-05, |
| "loss": 0.1242, |
| "mean_token_accuracy": 0.9571030080318451, |
| "step": 7475 |
| }, |
| { |
| "epoch": 2.8781989609390033, |
| "grad_norm": 0.10399350282844987, |
| "learning_rate": 1.4933759866181085e-05, |
| "loss": 0.1228, |
| "mean_token_accuracy": 0.9577360272407531, |
| "step": 7480 |
| }, |
| { |
| "epoch": 2.8801231479699827, |
| "grad_norm": 0.10649576662983058, |
| "learning_rate": 1.4902429103566687e-05, |
| "loss": 0.1257, |
| "mean_token_accuracy": 0.9567151606082916, |
| "step": 7485 |
| }, |
| { |
| "epoch": 2.882047335000962, |
| "grad_norm": 0.10651968169816911, |
| "learning_rate": 1.487113388095584e-05, |
| "loss": 0.1255, |
| "mean_token_accuracy": 0.9568205177783966, |
| "step": 7490 |
| }, |
| { |
| "epoch": 2.8839715220319415, |
| "grad_norm": 0.10638941530991032, |
| "learning_rate": 1.4839874286637973e-05, |
| "loss": 0.1241, |
| "mean_token_accuracy": 0.9573801636695862, |
| "step": 7495 |
| }, |
| { |
| "epoch": 2.885895709062921, |
| "grad_norm": 0.1077741344290852, |
| "learning_rate": 1.4808650408801983e-05, |
| "loss": 0.1253, |
| "mean_token_accuracy": 0.957089239358902, |
| "step": 7500 |
| }, |
| { |
| "epoch": 2.8878198960939003, |
| "grad_norm": 0.10345254233037338, |
| "learning_rate": 1.4777462335536044e-05, |
| "loss": 0.1217, |
| "mean_token_accuracy": 0.9581014931201934, |
| "step": 7505 |
| }, |
| { |
| "epoch": 2.8897440831248797, |
| "grad_norm": 0.10837312897375773, |
| "learning_rate": 1.4746310154827269e-05, |
| "loss": 0.1256, |
| "mean_token_accuracy": 0.956810462474823, |
| "step": 7510 |
| }, |
| { |
| "epoch": 2.891668270155859, |
| "grad_norm": 0.10776077164463539, |
| "learning_rate": 1.4715193954561568e-05, |
| "loss": 0.1281, |
| "mean_token_accuracy": 0.956069827079773, |
| "step": 7515 |
| }, |
| { |
| "epoch": 2.8935924571868386, |
| "grad_norm": 0.1035556713444127, |
| "learning_rate": 1.4684113822523291e-05, |
| "loss": 0.1271, |
| "mean_token_accuracy": 0.9563699722290039, |
| "step": 7520 |
| }, |
| { |
| "epoch": 2.895516644217818, |
| "grad_norm": 0.11074344644349653, |
| "learning_rate": 1.4653069846395063e-05, |
| "loss": 0.1267, |
| "mean_token_accuracy": 0.956612741947174, |
| "step": 7525 |
| }, |
| { |
| "epoch": 2.8974408312487974, |
| "grad_norm": 0.10477033499360329, |
| "learning_rate": 1.4622062113757509e-05, |
| "loss": 0.1245, |
| "mean_token_accuracy": 0.957367604970932, |
| "step": 7530 |
| }, |
| { |
| "epoch": 2.8993650182797768, |
| "grad_norm": 0.10958436618782283, |
| "learning_rate": 1.4591090712088983e-05, |
| "loss": 0.1248, |
| "mean_token_accuracy": 0.9569049835205078, |
| "step": 7535 |
| }, |
| { |
| "epoch": 2.901289205310756, |
| "grad_norm": 0.10499998417791684, |
| "learning_rate": 1.4560155728765367e-05, |
| "loss": 0.1216, |
| "mean_token_accuracy": 0.9581144154071808, |
| "step": 7540 |
| }, |
| { |
| "epoch": 2.9032133923417356, |
| "grad_norm": 0.10655053344073778, |
| "learning_rate": 1.452925725105978e-05, |
| "loss": 0.1256, |
| "mean_token_accuracy": 0.9569212675094605, |
| "step": 7545 |
| }, |
| { |
| "epoch": 2.905137579372715, |
| "grad_norm": 0.10977747494446383, |
| "learning_rate": 1.4498395366142361e-05, |
| "loss": 0.1231, |
| "mean_token_accuracy": 0.9578100383281708, |
| "step": 7550 |
| }, |
| { |
| "epoch": 2.9070617664036944, |
| "grad_norm": 0.10651052736899481, |
| "learning_rate": 1.4467570161080018e-05, |
| "loss": 0.124, |
| "mean_token_accuracy": 0.9572772800922393, |
| "step": 7555 |
| }, |
| { |
| "epoch": 2.908985953434674, |
| "grad_norm": 0.10587084876989453, |
| "learning_rate": 1.4436781722836191e-05, |
| "loss": 0.1268, |
| "mean_token_accuracy": 0.9564314723014832, |
| "step": 7560 |
| }, |
| { |
| "epoch": 2.910910140465653, |
| "grad_norm": 0.10787590169557064, |
| "learning_rate": 1.4406030138270554e-05, |
| "loss": 0.1253, |
| "mean_token_accuracy": 0.9569768607616425, |
| "step": 7565 |
| }, |
| { |
| "epoch": 2.9128343274966326, |
| "grad_norm": 0.11193040860381288, |
| "learning_rate": 1.4375315494138835e-05, |
| "loss": 0.1248, |
| "mean_token_accuracy": 0.957205992937088, |
| "step": 7570 |
| }, |
| { |
| "epoch": 2.914758514527612, |
| "grad_norm": 0.10614147252868583, |
| "learning_rate": 1.4344637877092554e-05, |
| "loss": 0.1219, |
| "mean_token_accuracy": 0.9580237150192261, |
| "step": 7575 |
| }, |
| { |
| "epoch": 2.9166827015585914, |
| "grad_norm": 0.12226018340910985, |
| "learning_rate": 1.431399737367877e-05, |
| "loss": 0.1257, |
| "mean_token_accuracy": 0.9568642377853394, |
| "step": 7580 |
| }, |
| { |
| "epoch": 2.918606888589571, |
| "grad_norm": 0.1101874805916549, |
| "learning_rate": 1.4283394070339811e-05, |
| "loss": 0.1205, |
| "mean_token_accuracy": 0.9583792507648468, |
| "step": 7585 |
| }, |
| { |
| "epoch": 2.9205310756205503, |
| "grad_norm": 0.1058993738730307, |
| "learning_rate": 1.4252828053413098e-05, |
| "loss": 0.1237, |
| "mean_token_accuracy": 0.9573873460292817, |
| "step": 7590 |
| }, |
| { |
| "epoch": 2.9224552626515297, |
| "grad_norm": 0.11056817436133677, |
| "learning_rate": 1.4222299409130822e-05, |
| "loss": 0.1229, |
| "mean_token_accuracy": 0.9575633466243744, |
| "step": 7595 |
| }, |
| { |
| "epoch": 2.924379449682509, |
| "grad_norm": 0.10362373496270215, |
| "learning_rate": 1.4191808223619768e-05, |
| "loss": 0.1253, |
| "mean_token_accuracy": 0.9568525791168213, |
| "step": 7600 |
| }, |
| { |
| "epoch": 2.9263036367134885, |
| "grad_norm": 0.10916660290464916, |
| "learning_rate": 1.4161354582901015e-05, |
| "loss": 0.1249, |
| "mean_token_accuracy": 0.9571748375892639, |
| "step": 7605 |
| }, |
| { |
| "epoch": 2.928227823744468, |
| "grad_norm": 0.10764565709057493, |
| "learning_rate": 1.4130938572889746e-05, |
| "loss": 0.1265, |
| "mean_token_accuracy": 0.9566021263599396, |
| "step": 7610 |
| }, |
| { |
| "epoch": 2.9301520107754473, |
| "grad_norm": 0.1059320053361084, |
| "learning_rate": 1.4100560279394975e-05, |
| "loss": 0.1228, |
| "mean_token_accuracy": 0.9579496622085572, |
| "step": 7615 |
| }, |
| { |
| "epoch": 2.9320761978064267, |
| "grad_norm": 0.11360266392266798, |
| "learning_rate": 1.4070219788119315e-05, |
| "loss": 0.1223, |
| "mean_token_accuracy": 0.957982987165451, |
| "step": 7620 |
| }, |
| { |
| "epoch": 2.934000384837406, |
| "grad_norm": 0.10610798650914667, |
| "learning_rate": 1.4039917184658718e-05, |
| "loss": 0.1257, |
| "mean_token_accuracy": 0.9566937685012817, |
| "step": 7625 |
| }, |
| { |
| "epoch": 2.9359245718683855, |
| "grad_norm": 0.11071740328007927, |
| "learning_rate": 1.4009652554502243e-05, |
| "loss": 0.1232, |
| "mean_token_accuracy": 0.9576710939407349, |
| "step": 7630 |
| }, |
| { |
| "epoch": 2.937848758899365, |
| "grad_norm": 0.1072534333814872, |
| "learning_rate": 1.3979425983031841e-05, |
| "loss": 0.1247, |
| "mean_token_accuracy": 0.9570759296417236, |
| "step": 7635 |
| }, |
| { |
| "epoch": 2.9397729459303443, |
| "grad_norm": 0.10321128452437721, |
| "learning_rate": 1.3949237555522091e-05, |
| "loss": 0.1214, |
| "mean_token_accuracy": 0.9581948697566987, |
| "step": 7640 |
| }, |
| { |
| "epoch": 2.9416971329613237, |
| "grad_norm": 0.10854912685893148, |
| "learning_rate": 1.3919087357139954e-05, |
| "loss": 0.1259, |
| "mean_token_accuracy": 0.9564633071422577, |
| "step": 7645 |
| }, |
| { |
| "epoch": 2.943621319992303, |
| "grad_norm": 0.11080075613686713, |
| "learning_rate": 1.388897547294452e-05, |
| "loss": 0.1263, |
| "mean_token_accuracy": 0.956570053100586, |
| "step": 7650 |
| }, |
| { |
| "epoch": 2.9455455070232825, |
| "grad_norm": 0.11283077280520182, |
| "learning_rate": 1.385890198788683e-05, |
| "loss": 0.123, |
| "mean_token_accuracy": 0.9576871275901795, |
| "step": 7655 |
| }, |
| { |
| "epoch": 2.947469694054262, |
| "grad_norm": 0.1055597743138611, |
| "learning_rate": 1.3828866986809553e-05, |
| "loss": 0.1256, |
| "mean_token_accuracy": 0.9570915341377259, |
| "step": 7660 |
| }, |
| { |
| "epoch": 2.9493938810852414, |
| "grad_norm": 0.11716483869134532, |
| "learning_rate": 1.3798870554446819e-05, |
| "loss": 0.1238, |
| "mean_token_accuracy": 0.9574430525302887, |
| "step": 7665 |
| }, |
| { |
| "epoch": 2.9513180681162208, |
| "grad_norm": 0.11219262966435255, |
| "learning_rate": 1.3768912775423922e-05, |
| "loss": 0.1242, |
| "mean_token_accuracy": 0.9571912288665771, |
| "step": 7670 |
| }, |
| { |
| "epoch": 2.9532422551472, |
| "grad_norm": 0.10756629871932641, |
| "learning_rate": 1.3738993734257133e-05, |
| "loss": 0.1255, |
| "mean_token_accuracy": 0.956557160615921, |
| "step": 7675 |
| }, |
| { |
| "epoch": 2.9551664421781796, |
| "grad_norm": 0.10819322200524713, |
| "learning_rate": 1.3709113515353428e-05, |
| "loss": 0.1241, |
| "mean_token_accuracy": 0.9571969687938691, |
| "step": 7680 |
| }, |
| { |
| "epoch": 2.957090629209159, |
| "grad_norm": 0.10522933962660358, |
| "learning_rate": 1.367927220301025e-05, |
| "loss": 0.1244, |
| "mean_token_accuracy": 0.9574010789394378, |
| "step": 7685 |
| }, |
| { |
| "epoch": 2.9590148162401384, |
| "grad_norm": 0.10942898684989587, |
| "learning_rate": 1.3649469881415276e-05, |
| "loss": 0.1254, |
| "mean_token_accuracy": 0.9567907691001892, |
| "step": 7690 |
| }, |
| { |
| "epoch": 2.960939003271118, |
| "grad_norm": 0.10772067005720781, |
| "learning_rate": 1.3619706634646201e-05, |
| "loss": 0.1256, |
| "mean_token_accuracy": 0.9567744076251984, |
| "step": 7695 |
| }, |
| { |
| "epoch": 2.962863190302097, |
| "grad_norm": 0.1087985352173851, |
| "learning_rate": 1.3589982546670474e-05, |
| "loss": 0.1231, |
| "mean_token_accuracy": 0.9575774610042572, |
| "step": 7700 |
| }, |
| { |
| "epoch": 2.9647873773330766, |
| "grad_norm": 0.1067799950600145, |
| "learning_rate": 1.356029770134507e-05, |
| "loss": 0.1218, |
| "mean_token_accuracy": 0.9579530179500579, |
| "step": 7705 |
| }, |
| { |
| "epoch": 2.966711564364056, |
| "grad_norm": 0.10819695429679835, |
| "learning_rate": 1.3530652182416264e-05, |
| "loss": 0.1233, |
| "mean_token_accuracy": 0.9575179398059845, |
| "step": 7710 |
| }, |
| { |
| "epoch": 2.9686357513950354, |
| "grad_norm": 0.10453719331335365, |
| "learning_rate": 1.3501046073519364e-05, |
| "loss": 0.124, |
| "mean_token_accuracy": 0.9573317110538483, |
| "step": 7715 |
| }, |
| { |
| "epoch": 2.970559938426015, |
| "grad_norm": 0.10607372196854979, |
| "learning_rate": 1.3471479458178499e-05, |
| "loss": 0.1228, |
| "mean_token_accuracy": 0.9579567670822143, |
| "step": 7720 |
| }, |
| { |
| "epoch": 2.9724841254569947, |
| "grad_norm": 0.10440932629580602, |
| "learning_rate": 1.3441952419806391e-05, |
| "loss": 0.1242, |
| "mean_token_accuracy": 0.9573928415775299, |
| "step": 7725 |
| }, |
| { |
| "epoch": 2.9744083124879737, |
| "grad_norm": 0.11011335803374803, |
| "learning_rate": 1.3412465041704114e-05, |
| "loss": 0.1252, |
| "mean_token_accuracy": 0.9569807887077332, |
| "step": 7730 |
| }, |
| { |
| "epoch": 2.9763324995189535, |
| "grad_norm": 0.10803320079657108, |
| "learning_rate": 1.3383017407060824e-05, |
| "loss": 0.121, |
| "mean_token_accuracy": 0.9582719445228577, |
| "step": 7735 |
| }, |
| { |
| "epoch": 2.9782566865499325, |
| "grad_norm": 0.11326957663750642, |
| "learning_rate": 1.3353609598953587e-05, |
| "loss": 0.1231, |
| "mean_token_accuracy": 0.9574480533599854, |
| "step": 7740 |
| }, |
| { |
| "epoch": 2.9801808735809123, |
| "grad_norm": 0.1060972691963728, |
| "learning_rate": 1.3324241700347084e-05, |
| "loss": 0.1256, |
| "mean_token_accuracy": 0.956799840927124, |
| "step": 7745 |
| }, |
| { |
| "epoch": 2.9821050606118913, |
| "grad_norm": 0.10793607800115752, |
| "learning_rate": 1.3294913794093433e-05, |
| "loss": 0.1226, |
| "mean_token_accuracy": 0.9577231049537659, |
| "step": 7750 |
| }, |
| { |
| "epoch": 2.984029247642871, |
| "grad_norm": 0.10803527524344377, |
| "learning_rate": 1.326562596293189e-05, |
| "loss": 0.1236, |
| "mean_token_accuracy": 0.9576082110404969, |
| "step": 7755 |
| }, |
| { |
| "epoch": 2.98595343467385, |
| "grad_norm": 0.11256389464645765, |
| "learning_rate": 1.3236378289488682e-05, |
| "loss": 0.1245, |
| "mean_token_accuracy": 0.9571884274482727, |
| "step": 7760 |
| }, |
| { |
| "epoch": 2.98787762170483, |
| "grad_norm": 0.10757240512768781, |
| "learning_rate": 1.3207170856276736e-05, |
| "loss": 0.126, |
| "mean_token_accuracy": 0.9567049860954284, |
| "step": 7765 |
| }, |
| { |
| "epoch": 2.989801808735809, |
| "grad_norm": 0.11149173015234722, |
| "learning_rate": 1.317800374569545e-05, |
| "loss": 0.123, |
| "mean_token_accuracy": 0.9575415909290313, |
| "step": 7770 |
| }, |
| { |
| "epoch": 2.9917259957667888, |
| "grad_norm": 0.10859179660363494, |
| "learning_rate": 1.3148877040030466e-05, |
| "loss": 0.1255, |
| "mean_token_accuracy": 0.9568469166755676, |
| "step": 7775 |
| }, |
| { |
| "epoch": 2.9936501827977677, |
| "grad_norm": 0.10845196418753182, |
| "learning_rate": 1.3119790821453432e-05, |
| "loss": 0.1242, |
| "mean_token_accuracy": 0.9572749853134155, |
| "step": 7780 |
| }, |
| { |
| "epoch": 2.9955743698287476, |
| "grad_norm": 0.1043788293341144, |
| "learning_rate": 1.309074517202178e-05, |
| "loss": 0.1222, |
| "mean_token_accuracy": 0.957735562324524, |
| "step": 7785 |
| }, |
| { |
| "epoch": 2.9974985568597265, |
| "grad_norm": 0.1068433999214701, |
| "learning_rate": 1.3061740173678492e-05, |
| "loss": 0.1235, |
| "mean_token_accuracy": 0.9574783861637115, |
| "step": 7790 |
| }, |
| { |
| "epoch": 2.9994227438907064, |
| "grad_norm": 0.11024594434820396, |
| "learning_rate": 1.303277590825187e-05, |
| "loss": 0.1222, |
| "mean_token_accuracy": 0.958063292503357, |
| "step": 7795 |
| }, |
| { |
| "epoch": 3.0011545122185876, |
| "grad_norm": 0.10806090102622518, |
| "learning_rate": 1.3003852457455288e-05, |
| "loss": 0.1021, |
| "mean_token_accuracy": 0.9649597538842095, |
| "step": 7800 |
| }, |
| { |
| "epoch": 3.003078699249567, |
| "grad_norm": 0.1278029168675794, |
| "learning_rate": 1.297496990288697e-05, |
| "loss": 0.0891, |
| "mean_token_accuracy": 0.9692692935466767, |
| "step": 7805 |
| }, |
| { |
| "epoch": 3.0050028862805465, |
| "grad_norm": 0.12892837513477004, |
| "learning_rate": 1.2946128326029786e-05, |
| "loss": 0.0874, |
| "mean_token_accuracy": 0.9698507249355316, |
| "step": 7810 |
| }, |
| { |
| "epoch": 3.006927073311526, |
| "grad_norm": 0.12022972716434455, |
| "learning_rate": 1.2917327808250993e-05, |
| "loss": 0.0884, |
| "mean_token_accuracy": 0.9694180428981781, |
| "step": 7815 |
| }, |
| { |
| "epoch": 3.0088512603425053, |
| "grad_norm": 0.12531548139344598, |
| "learning_rate": 1.2888568430801995e-05, |
| "loss": 0.0869, |
| "mean_token_accuracy": 0.9698590219020844, |
| "step": 7820 |
| }, |
| { |
| "epoch": 3.0107754473734847, |
| "grad_norm": 0.11762844071368292, |
| "learning_rate": 1.2859850274818158e-05, |
| "loss": 0.0865, |
| "mean_token_accuracy": 0.9699797809123993, |
| "step": 7825 |
| }, |
| { |
| "epoch": 3.012699634404464, |
| "grad_norm": 0.11506289173555091, |
| "learning_rate": 1.2831173421318548e-05, |
| "loss": 0.0872, |
| "mean_token_accuracy": 0.9697744309902191, |
| "step": 7830 |
| }, |
| { |
| "epoch": 3.0146238214354435, |
| "grad_norm": 0.12532324926552826, |
| "learning_rate": 1.2802537951205695e-05, |
| "loss": 0.0876, |
| "mean_token_accuracy": 0.9696701884269714, |
| "step": 7835 |
| }, |
| { |
| "epoch": 3.016548008466423, |
| "grad_norm": 0.12198962421807008, |
| "learning_rate": 1.2773943945265382e-05, |
| "loss": 0.0875, |
| "mean_token_accuracy": 0.9696745097637176, |
| "step": 7840 |
| }, |
| { |
| "epoch": 3.0184721954974023, |
| "grad_norm": 0.12526480360585637, |
| "learning_rate": 1.2745391484166427e-05, |
| "loss": 0.0885, |
| "mean_token_accuracy": 0.9692001581192017, |
| "step": 7845 |
| }, |
| { |
| "epoch": 3.0203963825283817, |
| "grad_norm": 0.12298338848117586, |
| "learning_rate": 1.271688064846044e-05, |
| "loss": 0.0874, |
| "mean_token_accuracy": 0.9697564482688904, |
| "step": 7850 |
| }, |
| { |
| "epoch": 3.022320569559361, |
| "grad_norm": 0.12363669211046695, |
| "learning_rate": 1.2688411518581589e-05, |
| "loss": 0.0862, |
| "mean_token_accuracy": 0.9699727296829224, |
| "step": 7855 |
| }, |
| { |
| "epoch": 3.0242447565903405, |
| "grad_norm": 0.12592966137030076, |
| "learning_rate": 1.2659984174846403e-05, |
| "loss": 0.0886, |
| "mean_token_accuracy": 0.9691447734832763, |
| "step": 7860 |
| }, |
| { |
| "epoch": 3.02616894362132, |
| "grad_norm": 0.12315689968587146, |
| "learning_rate": 1.2631598697453496e-05, |
| "loss": 0.0869, |
| "mean_token_accuracy": 0.9698756873607636, |
| "step": 7865 |
| }, |
| { |
| "epoch": 3.0280931306522993, |
| "grad_norm": 0.12179550081200788, |
| "learning_rate": 1.2603255166483374e-05, |
| "loss": 0.0857, |
| "mean_token_accuracy": 0.9701310276985169, |
| "step": 7870 |
| }, |
| { |
| "epoch": 3.0300173176832788, |
| "grad_norm": 0.12307347849136704, |
| "learning_rate": 1.2574953661898222e-05, |
| "loss": 0.0895, |
| "mean_token_accuracy": 0.968977826833725, |
| "step": 7875 |
| }, |
| { |
| "epoch": 3.031941504714258, |
| "grad_norm": 0.12251435917553875, |
| "learning_rate": 1.2546694263541667e-05, |
| "loss": 0.0861, |
| "mean_token_accuracy": 0.9701171934604644, |
| "step": 7880 |
| }, |
| { |
| "epoch": 3.0338656917452376, |
| "grad_norm": 0.1188143957854138, |
| "learning_rate": 1.251847705113851e-05, |
| "loss": 0.0843, |
| "mean_token_accuracy": 0.9706710934638977, |
| "step": 7885 |
| }, |
| { |
| "epoch": 3.035789878776217, |
| "grad_norm": 0.12475703694562157, |
| "learning_rate": 1.2490302104294586e-05, |
| "loss": 0.0866, |
| "mean_token_accuracy": 0.9699966847896576, |
| "step": 7890 |
| }, |
| { |
| "epoch": 3.0377140658071964, |
| "grad_norm": 0.12093351966846219, |
| "learning_rate": 1.2462169502496435e-05, |
| "loss": 0.0866, |
| "mean_token_accuracy": 0.9701157450675965, |
| "step": 7895 |
| }, |
| { |
| "epoch": 3.039638252838176, |
| "grad_norm": 0.12017604333104909, |
| "learning_rate": 1.2434079325111192e-05, |
| "loss": 0.0869, |
| "mean_token_accuracy": 0.9696642577648162, |
| "step": 7900 |
| }, |
| { |
| "epoch": 3.041562439869155, |
| "grad_norm": 0.12085886994243633, |
| "learning_rate": 1.240603165138626e-05, |
| "loss": 0.0869, |
| "mean_token_accuracy": 0.9699452579021454, |
| "step": 7905 |
| }, |
| { |
| "epoch": 3.0434866269001346, |
| "grad_norm": 0.12387593946204745, |
| "learning_rate": 1.2378026560449155e-05, |
| "loss": 0.0867, |
| "mean_token_accuracy": 0.9697232723236084, |
| "step": 7910 |
| }, |
| { |
| "epoch": 3.045410813931114, |
| "grad_norm": 0.12983147755354021, |
| "learning_rate": 1.2350064131307253e-05, |
| "loss": 0.0858, |
| "mean_token_accuracy": 0.9702217698097229, |
| "step": 7915 |
| }, |
| { |
| "epoch": 3.0473350009620934, |
| "grad_norm": 0.124590059629808, |
| "learning_rate": 1.2322144442847587e-05, |
| "loss": 0.0848, |
| "mean_token_accuracy": 0.9704817235469818, |
| "step": 7920 |
| }, |
| { |
| "epoch": 3.049259187993073, |
| "grad_norm": 0.12318032429282658, |
| "learning_rate": 1.2294267573836587e-05, |
| "loss": 0.0893, |
| "mean_token_accuracy": 0.9690053462982178, |
| "step": 7925 |
| }, |
| { |
| "epoch": 3.0511833750240522, |
| "grad_norm": 0.12644916618282354, |
| "learning_rate": 1.2266433602919883e-05, |
| "loss": 0.0864, |
| "mean_token_accuracy": 0.969806456565857, |
| "step": 7930 |
| }, |
| { |
| "epoch": 3.0531075620550316, |
| "grad_norm": 0.12502760515043446, |
| "learning_rate": 1.2238642608622105e-05, |
| "loss": 0.0872, |
| "mean_token_accuracy": 0.9696021556854248, |
| "step": 7935 |
| }, |
| { |
| "epoch": 3.055031749086011, |
| "grad_norm": 0.12383987780357592, |
| "learning_rate": 1.2210894669346623e-05, |
| "loss": 0.0876, |
| "mean_token_accuracy": 0.9695601046085358, |
| "step": 7940 |
| }, |
| { |
| "epoch": 3.0569559361169905, |
| "grad_norm": 0.12804783151969265, |
| "learning_rate": 1.2183189863375347e-05, |
| "loss": 0.0869, |
| "mean_token_accuracy": 0.9699361026287079, |
| "step": 7945 |
| }, |
| { |
| "epoch": 3.05888012314797, |
| "grad_norm": 0.11848934621495531, |
| "learning_rate": 1.2155528268868492e-05, |
| "loss": 0.0856, |
| "mean_token_accuracy": 0.9703084409236908, |
| "step": 7950 |
| }, |
| { |
| "epoch": 3.0608043101789493, |
| "grad_norm": 0.12423792043438635, |
| "learning_rate": 1.212790996386436e-05, |
| "loss": 0.0866, |
| "mean_token_accuracy": 0.9698608636856079, |
| "step": 7955 |
| }, |
| { |
| "epoch": 3.0627284972099287, |
| "grad_norm": 0.1296803573215681, |
| "learning_rate": 1.2100335026279145e-05, |
| "loss": 0.0879, |
| "mean_token_accuracy": 0.9694078743457795, |
| "step": 7960 |
| }, |
| { |
| "epoch": 3.064652684240908, |
| "grad_norm": 0.12414203646830227, |
| "learning_rate": 1.2072803533906694e-05, |
| "loss": 0.0863, |
| "mean_token_accuracy": 0.9698703289031982, |
| "step": 7965 |
| }, |
| { |
| "epoch": 3.0665768712718875, |
| "grad_norm": 0.12216839952104394, |
| "learning_rate": 1.2045315564418255e-05, |
| "loss": 0.0852, |
| "mean_token_accuracy": 0.9703336238861084, |
| "step": 7970 |
| }, |
| { |
| "epoch": 3.068501058302867, |
| "grad_norm": 0.12430462906305857, |
| "learning_rate": 1.201787119536233e-05, |
| "loss": 0.0869, |
| "mean_token_accuracy": 0.969856345653534, |
| "step": 7975 |
| }, |
| { |
| "epoch": 3.0704252453338463, |
| "grad_norm": 0.1182551093883623, |
| "learning_rate": 1.1990470504164394e-05, |
| "loss": 0.0867, |
| "mean_token_accuracy": 0.9697697043418885, |
| "step": 7980 |
| }, |
| { |
| "epoch": 3.0723494323648257, |
| "grad_norm": 0.12572587657675405, |
| "learning_rate": 1.1963113568126708e-05, |
| "loss": 0.0866, |
| "mean_token_accuracy": 0.9698267221450806, |
| "step": 7985 |
| }, |
| { |
| "epoch": 3.074273619395805, |
| "grad_norm": 0.127462469416906, |
| "learning_rate": 1.1935800464428079e-05, |
| "loss": 0.0872, |
| "mean_token_accuracy": 0.9696932435035706, |
| "step": 7990 |
| }, |
| { |
| "epoch": 3.0761978064267845, |
| "grad_norm": 0.13134160058581362, |
| "learning_rate": 1.1908531270123665e-05, |
| "loss": 0.0878, |
| "mean_token_accuracy": 0.9693050265312195, |
| "step": 7995 |
| }, |
| { |
| "epoch": 3.078121993457764, |
| "grad_norm": 0.1238281796372475, |
| "learning_rate": 1.188130606214475e-05, |
| "loss": 0.0863, |
| "mean_token_accuracy": 0.970119446516037, |
| "step": 8000 |
| }, |
| { |
| "epoch": 3.0800461804887433, |
| "grad_norm": 0.13433419867660185, |
| "learning_rate": 1.185412491729853e-05, |
| "loss": 0.0842, |
| "mean_token_accuracy": 0.9708240568637848, |
| "step": 8005 |
| }, |
| { |
| "epoch": 3.0819703675197228, |
| "grad_norm": 0.12396118665861203, |
| "learning_rate": 1.1826987912267864e-05, |
| "loss": 0.0887, |
| "mean_token_accuracy": 0.9691758751869202, |
| "step": 8010 |
| }, |
| { |
| "epoch": 3.083894554550702, |
| "grad_norm": 0.11889626164987209, |
| "learning_rate": 1.1799895123611125e-05, |
| "loss": 0.087, |
| "mean_token_accuracy": 0.9697548627853394, |
| "step": 8015 |
| }, |
| { |
| "epoch": 3.0858187415816816, |
| "grad_norm": 0.131945297023811, |
| "learning_rate": 1.1772846627761899e-05, |
| "loss": 0.0879, |
| "mean_token_accuracy": 0.96954505443573, |
| "step": 8020 |
| }, |
| { |
| "epoch": 3.087742928612661, |
| "grad_norm": 0.12209211115052003, |
| "learning_rate": 1.1745842501028847e-05, |
| "loss": 0.0857, |
| "mean_token_accuracy": 0.9703159630298615, |
| "step": 8025 |
| }, |
| { |
| "epoch": 3.0896671156436404, |
| "grad_norm": 0.12460601320367481, |
| "learning_rate": 1.1718882819595454e-05, |
| "loss": 0.0861, |
| "mean_token_accuracy": 0.970074987411499, |
| "step": 8030 |
| }, |
| { |
| "epoch": 3.09159130267462, |
| "grad_norm": 0.11829881911452493, |
| "learning_rate": 1.1691967659519796e-05, |
| "loss": 0.086, |
| "mean_token_accuracy": 0.9700419008731842, |
| "step": 8035 |
| }, |
| { |
| "epoch": 3.093515489705599, |
| "grad_norm": 0.13005744915575315, |
| "learning_rate": 1.1665097096734372e-05, |
| "loss": 0.0875, |
| "mean_token_accuracy": 0.96951624751091, |
| "step": 8040 |
| }, |
| { |
| "epoch": 3.0954396767365786, |
| "grad_norm": 0.13567902609403767, |
| "learning_rate": 1.1638271207045841e-05, |
| "loss": 0.0852, |
| "mean_token_accuracy": 0.9704005897045136, |
| "step": 8045 |
| }, |
| { |
| "epoch": 3.097363863767558, |
| "grad_norm": 0.12804817918149772, |
| "learning_rate": 1.1611490066134853e-05, |
| "loss": 0.0861, |
| "mean_token_accuracy": 0.9701052308082581, |
| "step": 8050 |
| }, |
| { |
| "epoch": 3.0992880507985374, |
| "grad_norm": 0.11891623802480417, |
| "learning_rate": 1.1584753749555789e-05, |
| "loss": 0.0864, |
| "mean_token_accuracy": 0.96992067694664, |
| "step": 8055 |
| }, |
| { |
| "epoch": 3.101212237829517, |
| "grad_norm": 0.13000095226206448, |
| "learning_rate": 1.1558062332736595e-05, |
| "loss": 0.0854, |
| "mean_token_accuracy": 0.9702975928783417, |
| "step": 8060 |
| }, |
| { |
| "epoch": 3.1031364248604962, |
| "grad_norm": 0.12310517172219068, |
| "learning_rate": 1.1531415890978535e-05, |
| "loss": 0.09, |
| "mean_token_accuracy": 0.9687209069728852, |
| "step": 8065 |
| }, |
| { |
| "epoch": 3.1050606118914756, |
| "grad_norm": 0.12478784300253068, |
| "learning_rate": 1.1504814499456003e-05, |
| "loss": 0.0845, |
| "mean_token_accuracy": 0.970802241563797, |
| "step": 8070 |
| }, |
| { |
| "epoch": 3.106984798922455, |
| "grad_norm": 0.12995901203556426, |
| "learning_rate": 1.147825823321628e-05, |
| "loss": 0.087, |
| "mean_token_accuracy": 0.9696940183639526, |
| "step": 8075 |
| }, |
| { |
| "epoch": 3.108908985953435, |
| "grad_norm": 0.12465076291615731, |
| "learning_rate": 1.1451747167179341e-05, |
| "loss": 0.0865, |
| "mean_token_accuracy": 0.9700176894664765, |
| "step": 8080 |
| }, |
| { |
| "epoch": 3.110833172984414, |
| "grad_norm": 0.1250070740769649, |
| "learning_rate": 1.1425281376137659e-05, |
| "loss": 0.0855, |
| "mean_token_accuracy": 0.9701965272426605, |
| "step": 8085 |
| }, |
| { |
| "epoch": 3.1127573600153937, |
| "grad_norm": 0.12747187171365093, |
| "learning_rate": 1.139886093475597e-05, |
| "loss": 0.0867, |
| "mean_token_accuracy": 0.969889110326767, |
| "step": 8090 |
| }, |
| { |
| "epoch": 3.114681547046373, |
| "grad_norm": 0.12403764057755394, |
| "learning_rate": 1.137248591757108e-05, |
| "loss": 0.0877, |
| "mean_token_accuracy": 0.9695557832717896, |
| "step": 8095 |
| }, |
| { |
| "epoch": 3.1166057340773525, |
| "grad_norm": 0.12776305701838625, |
| "learning_rate": 1.1346156398991623e-05, |
| "loss": 0.0846, |
| "mean_token_accuracy": 0.9706474542617798, |
| "step": 8100 |
| }, |
| { |
| "epoch": 3.118529921108332, |
| "grad_norm": 0.12188474517126945, |
| "learning_rate": 1.1319872453297888e-05, |
| "loss": 0.0861, |
| "mean_token_accuracy": 0.9699768602848053, |
| "step": 8105 |
| }, |
| { |
| "epoch": 3.1204541081393113, |
| "grad_norm": 0.1255535944742883, |
| "learning_rate": 1.1293634154641593e-05, |
| "loss": 0.0875, |
| "mean_token_accuracy": 0.9696572244167327, |
| "step": 8110 |
| }, |
| { |
| "epoch": 3.1223782951702908, |
| "grad_norm": 0.12576509472395267, |
| "learning_rate": 1.1267441577045685e-05, |
| "loss": 0.085, |
| "mean_token_accuracy": 0.9705277800559997, |
| "step": 8115 |
| }, |
| { |
| "epoch": 3.12430248220127, |
| "grad_norm": 0.12282131609490717, |
| "learning_rate": 1.1241294794404102e-05, |
| "loss": 0.0883, |
| "mean_token_accuracy": 0.9693425118923187, |
| "step": 8120 |
| }, |
| { |
| "epoch": 3.1262266692322496, |
| "grad_norm": 0.12559358366060971, |
| "learning_rate": 1.121519388048161e-05, |
| "loss": 0.0865, |
| "mean_token_accuracy": 0.9699931085109711, |
| "step": 8125 |
| }, |
| { |
| "epoch": 3.128150856263229, |
| "grad_norm": 0.12785731056478644, |
| "learning_rate": 1.1189138908913568e-05, |
| "loss": 0.0864, |
| "mean_token_accuracy": 0.9700387597084046, |
| "step": 8130 |
| }, |
| { |
| "epoch": 3.1300750432942084, |
| "grad_norm": 0.1263134280990649, |
| "learning_rate": 1.116312995320571e-05, |
| "loss": 0.0862, |
| "mean_token_accuracy": 0.9700113117694855, |
| "step": 8135 |
| }, |
| { |
| "epoch": 3.131999230325188, |
| "grad_norm": 0.12950712286712593, |
| "learning_rate": 1.1137167086733948e-05, |
| "loss": 0.0867, |
| "mean_token_accuracy": 0.9699228644371033, |
| "step": 8140 |
| }, |
| { |
| "epoch": 3.133923417356167, |
| "grad_norm": 0.12609052574189428, |
| "learning_rate": 1.1111250382744187e-05, |
| "loss": 0.087, |
| "mean_token_accuracy": 0.9696399331092834, |
| "step": 8145 |
| }, |
| { |
| "epoch": 3.1358476043871466, |
| "grad_norm": 0.1243791146405868, |
| "learning_rate": 1.1085379914352093e-05, |
| "loss": 0.0868, |
| "mean_token_accuracy": 0.9699157655239106, |
| "step": 8150 |
| }, |
| { |
| "epoch": 3.137771791418126, |
| "grad_norm": 0.12478666533557523, |
| "learning_rate": 1.105955575454289e-05, |
| "loss": 0.0855, |
| "mean_token_accuracy": 0.9703081667423248, |
| "step": 8155 |
| }, |
| { |
| "epoch": 3.1396959784491054, |
| "grad_norm": 0.12723248786466004, |
| "learning_rate": 1.1033777976171153e-05, |
| "loss": 0.0864, |
| "mean_token_accuracy": 0.9698303639888763, |
| "step": 8160 |
| }, |
| { |
| "epoch": 3.141620165480085, |
| "grad_norm": 0.1224982823706387, |
| "learning_rate": 1.1008046651960615e-05, |
| "loss": 0.0847, |
| "mean_token_accuracy": 0.9705148994922638, |
| "step": 8165 |
| }, |
| { |
| "epoch": 3.1435443525110642, |
| "grad_norm": 0.13209837606796643, |
| "learning_rate": 1.0982361854503944e-05, |
| "loss": 0.0874, |
| "mean_token_accuracy": 0.969810402393341, |
| "step": 8170 |
| }, |
| { |
| "epoch": 3.1454685395420436, |
| "grad_norm": 0.12487860197240644, |
| "learning_rate": 1.095672365626256e-05, |
| "loss": 0.086, |
| "mean_token_accuracy": 0.9701269149780274, |
| "step": 8175 |
| }, |
| { |
| "epoch": 3.147392726573023, |
| "grad_norm": 0.1287032592687375, |
| "learning_rate": 1.0931132129566412e-05, |
| "loss": 0.0845, |
| "mean_token_accuracy": 0.9705935418605804, |
| "step": 8180 |
| }, |
| { |
| "epoch": 3.1493169136040025, |
| "grad_norm": 0.12612765501750817, |
| "learning_rate": 1.0905587346613772e-05, |
| "loss": 0.0879, |
| "mean_token_accuracy": 0.9693886756896972, |
| "step": 8185 |
| }, |
| { |
| "epoch": 3.151241100634982, |
| "grad_norm": 0.12710517193889465, |
| "learning_rate": 1.0880089379471056e-05, |
| "loss": 0.0869, |
| "mean_token_accuracy": 0.9695225954055786, |
| "step": 8190 |
| }, |
| { |
| "epoch": 3.1531652876659613, |
| "grad_norm": 0.1277696516634007, |
| "learning_rate": 1.0854638300072589e-05, |
| "loss": 0.0853, |
| "mean_token_accuracy": 0.9702028095722198, |
| "step": 8195 |
| }, |
| { |
| "epoch": 3.1550894746969407, |
| "grad_norm": 0.12538158801161664, |
| "learning_rate": 1.0829234180220433e-05, |
| "loss": 0.0883, |
| "mean_token_accuracy": 0.9693264961242676, |
| "step": 8200 |
| }, |
| { |
| "epoch": 3.15701366172792, |
| "grad_norm": 0.12482280763356315, |
| "learning_rate": 1.0803877091584147e-05, |
| "loss": 0.0865, |
| "mean_token_accuracy": 0.9699821174144745, |
| "step": 8205 |
| }, |
| { |
| "epoch": 3.1589378487588995, |
| "grad_norm": 0.12184134340839646, |
| "learning_rate": 1.0778567105700627e-05, |
| "loss": 0.086, |
| "mean_token_accuracy": 0.9702258408069611, |
| "step": 8210 |
| }, |
| { |
| "epoch": 3.160862035789879, |
| "grad_norm": 0.12622632430749628, |
| "learning_rate": 1.075330429397387e-05, |
| "loss": 0.0864, |
| "mean_token_accuracy": 0.969819289445877, |
| "step": 8215 |
| }, |
| { |
| "epoch": 3.1627862228208583, |
| "grad_norm": 0.12128537119264614, |
| "learning_rate": 1.072808872767481e-05, |
| "loss": 0.0876, |
| "mean_token_accuracy": 0.9694657444953918, |
| "step": 8220 |
| }, |
| { |
| "epoch": 3.1647104098518377, |
| "grad_norm": 0.1341024695293841, |
| "learning_rate": 1.0702920477941055e-05, |
| "loss": 0.0879, |
| "mean_token_accuracy": 0.9694705188274384, |
| "step": 8225 |
| }, |
| { |
| "epoch": 3.166634596882817, |
| "grad_norm": 0.12446827777112705, |
| "learning_rate": 1.067779961577675e-05, |
| "loss": 0.0866, |
| "mean_token_accuracy": 0.9698066115379333, |
| "step": 8230 |
| }, |
| { |
| "epoch": 3.1685587839137965, |
| "grad_norm": 0.12635166599806483, |
| "learning_rate": 1.0652726212052344e-05, |
| "loss": 0.087, |
| "mean_token_accuracy": 0.969752311706543, |
| "step": 8235 |
| }, |
| { |
| "epoch": 3.170482970944776, |
| "grad_norm": 0.12415362694650094, |
| "learning_rate": 1.0627700337504411e-05, |
| "loss": 0.088, |
| "mean_token_accuracy": 0.9694055020809174, |
| "step": 8240 |
| }, |
| { |
| "epoch": 3.1724071579757553, |
| "grad_norm": 0.12308317166189457, |
| "learning_rate": 1.0602722062735412e-05, |
| "loss": 0.0839, |
| "mean_token_accuracy": 0.9707213938236237, |
| "step": 8245 |
| }, |
| { |
| "epoch": 3.1743313450067348, |
| "grad_norm": 0.12975850218368418, |
| "learning_rate": 1.0577791458213543e-05, |
| "loss": 0.087, |
| "mean_token_accuracy": 0.9694936692714691, |
| "step": 8250 |
| }, |
| { |
| "epoch": 3.176255532037714, |
| "grad_norm": 0.1342180776755028, |
| "learning_rate": 1.0552908594272492e-05, |
| "loss": 0.0871, |
| "mean_token_accuracy": 0.9696606636047364, |
| "step": 8255 |
| }, |
| { |
| "epoch": 3.1781797190686936, |
| "grad_norm": 0.12421484751480402, |
| "learning_rate": 1.0528073541111284e-05, |
| "loss": 0.0878, |
| "mean_token_accuracy": 0.9692697167396546, |
| "step": 8260 |
| }, |
| { |
| "epoch": 3.180103906099673, |
| "grad_norm": 0.12604153150022193, |
| "learning_rate": 1.0503286368794054e-05, |
| "loss": 0.0859, |
| "mean_token_accuracy": 0.9701263308525085, |
| "step": 8265 |
| }, |
| { |
| "epoch": 3.1820280931306524, |
| "grad_norm": 0.12564266714889988, |
| "learning_rate": 1.0478547147249842e-05, |
| "loss": 0.0857, |
| "mean_token_accuracy": 0.9703177154064179, |
| "step": 8270 |
| }, |
| { |
| "epoch": 3.183952280161632, |
| "grad_norm": 0.1257561651173637, |
| "learning_rate": 1.045385594627243e-05, |
| "loss": 0.0878, |
| "mean_token_accuracy": 0.9695186614990234, |
| "step": 8275 |
| }, |
| { |
| "epoch": 3.185876467192611, |
| "grad_norm": 0.12159795962859275, |
| "learning_rate": 1.0429212835520127e-05, |
| "loss": 0.0864, |
| "mean_token_accuracy": 0.97016139626503, |
| "step": 8280 |
| }, |
| { |
| "epoch": 3.1878006542235906, |
| "grad_norm": 0.1315051252009715, |
| "learning_rate": 1.0404617884515546e-05, |
| "loss": 0.0871, |
| "mean_token_accuracy": 0.9696720123291016, |
| "step": 8285 |
| }, |
| { |
| "epoch": 3.18972484125457, |
| "grad_norm": 0.12045718399538607, |
| "learning_rate": 1.0380071162645454e-05, |
| "loss": 0.086, |
| "mean_token_accuracy": 0.969960606098175, |
| "step": 8290 |
| }, |
| { |
| "epoch": 3.1916490282855494, |
| "grad_norm": 0.1263931601493202, |
| "learning_rate": 1.0355572739160548e-05, |
| "loss": 0.0854, |
| "mean_token_accuracy": 0.9703317403793335, |
| "step": 8295 |
| }, |
| { |
| "epoch": 3.193573215316529, |
| "grad_norm": 0.12819071683709377, |
| "learning_rate": 1.0331122683175271e-05, |
| "loss": 0.0874, |
| "mean_token_accuracy": 0.969643896818161, |
| "step": 8300 |
| }, |
| { |
| "epoch": 3.1954974023475082, |
| "grad_norm": 0.12759630896085855, |
| "learning_rate": 1.0306721063667613e-05, |
| "loss": 0.0876, |
| "mean_token_accuracy": 0.969584834575653, |
| "step": 8305 |
| }, |
| { |
| "epoch": 3.1974215893784876, |
| "grad_norm": 0.13322178423454648, |
| "learning_rate": 1.0282367949478899e-05, |
| "loss": 0.0881, |
| "mean_token_accuracy": 0.9695081174373626, |
| "step": 8310 |
| }, |
| { |
| "epoch": 3.199345776409467, |
| "grad_norm": 0.12499356791319982, |
| "learning_rate": 1.0258063409313648e-05, |
| "loss": 0.0861, |
| "mean_token_accuracy": 0.969955325126648, |
| "step": 8315 |
| }, |
| { |
| "epoch": 3.2012699634404465, |
| "grad_norm": 0.12456975891769033, |
| "learning_rate": 1.0233807511739293e-05, |
| "loss": 0.0872, |
| "mean_token_accuracy": 0.9698077619075776, |
| "step": 8320 |
| }, |
| { |
| "epoch": 3.203194150471426, |
| "grad_norm": 0.1294086133597835, |
| "learning_rate": 1.020960032518609e-05, |
| "loss": 0.0888, |
| "mean_token_accuracy": 0.9690967381000519, |
| "step": 8325 |
| }, |
| { |
| "epoch": 3.2051183375024053, |
| "grad_norm": 0.12508669985528187, |
| "learning_rate": 1.0185441917946831e-05, |
| "loss": 0.0869, |
| "mean_token_accuracy": 0.9697443544864655, |
| "step": 8330 |
| }, |
| { |
| "epoch": 3.2070425245333847, |
| "grad_norm": 0.12710477111832502, |
| "learning_rate": 1.0161332358176713e-05, |
| "loss": 0.0872, |
| "mean_token_accuracy": 0.9696725368499756, |
| "step": 8335 |
| }, |
| { |
| "epoch": 3.208966711564364, |
| "grad_norm": 0.1343357489271712, |
| "learning_rate": 1.0137271713893128e-05, |
| "loss": 0.0889, |
| "mean_token_accuracy": 0.9690210878849029, |
| "step": 8340 |
| }, |
| { |
| "epoch": 3.2108908985953435, |
| "grad_norm": 0.12538716805540998, |
| "learning_rate": 1.0113260052975452e-05, |
| "loss": 0.0876, |
| "mean_token_accuracy": 0.9695513010025024, |
| "step": 8345 |
| }, |
| { |
| "epoch": 3.212815085626323, |
| "grad_norm": 0.1288275528367617, |
| "learning_rate": 1.0089297443164894e-05, |
| "loss": 0.0868, |
| "mean_token_accuracy": 0.9696984708309173, |
| "step": 8350 |
| }, |
| { |
| "epoch": 3.2147392726573023, |
| "grad_norm": 0.1269432876964765, |
| "learning_rate": 1.0065383952064254e-05, |
| "loss": 0.0856, |
| "mean_token_accuracy": 0.9701885461807251, |
| "step": 8355 |
| }, |
| { |
| "epoch": 3.2166634596882817, |
| "grad_norm": 0.12251520794355583, |
| "learning_rate": 1.0041519647137779e-05, |
| "loss": 0.0861, |
| "mean_token_accuracy": 0.9700977683067322, |
| "step": 8360 |
| }, |
| { |
| "epoch": 3.218587646719261, |
| "grad_norm": 0.1265888644195425, |
| "learning_rate": 1.0017704595710947e-05, |
| "loss": 0.0865, |
| "mean_token_accuracy": 0.9698656022548675, |
| "step": 8365 |
| }, |
| { |
| "epoch": 3.2205118337502405, |
| "grad_norm": 0.12734965041860674, |
| "learning_rate": 9.993938864970288e-06, |
| "loss": 0.0883, |
| "mean_token_accuracy": 0.9695118367671967, |
| "step": 8370 |
| }, |
| { |
| "epoch": 3.22243602078122, |
| "grad_norm": 0.12474460914321397, |
| "learning_rate": 9.97022252196318e-06, |
| "loss": 0.0862, |
| "mean_token_accuracy": 0.9699595808982849, |
| "step": 8375 |
| }, |
| { |
| "epoch": 3.2243602078121993, |
| "grad_norm": 0.1306528695087571, |
| "learning_rate": 9.946555633597666e-06, |
| "loss": 0.0875, |
| "mean_token_accuracy": 0.9696216762065888, |
| "step": 8380 |
| }, |
| { |
| "epoch": 3.2262843948431787, |
| "grad_norm": 0.1266524631503943, |
| "learning_rate": 9.922938266642284e-06, |
| "loss": 0.0865, |
| "mean_token_accuracy": 0.9699699997901916, |
| "step": 8385 |
| }, |
| { |
| "epoch": 3.228208581874158, |
| "grad_norm": 0.1259564595531912, |
| "learning_rate": 9.899370487725866e-06, |
| "loss": 0.0875, |
| "mean_token_accuracy": 0.9694744288921356, |
| "step": 8390 |
| }, |
| { |
| "epoch": 3.2301327689051376, |
| "grad_norm": 0.12096085083215267, |
| "learning_rate": 9.875852363337315e-06, |
| "loss": 0.0841, |
| "mean_token_accuracy": 0.9708819687366486, |
| "step": 8395 |
| }, |
| { |
| "epoch": 3.232056955936117, |
| "grad_norm": 0.1331944475195223, |
| "learning_rate": 9.852383959825492e-06, |
| "loss": 0.0837, |
| "mean_token_accuracy": 0.9708536863327026, |
| "step": 8400 |
| }, |
| { |
| "epoch": 3.2339811429670964, |
| "grad_norm": 0.12740850552589505, |
| "learning_rate": 9.828965343398952e-06, |
| "loss": 0.0869, |
| "mean_token_accuracy": 0.969858956336975, |
| "step": 8405 |
| }, |
| { |
| "epoch": 3.235905329998076, |
| "grad_norm": 0.1267014162237489, |
| "learning_rate": 9.805596580125809e-06, |
| "loss": 0.0861, |
| "mean_token_accuracy": 0.9700336575508117, |
| "step": 8410 |
| }, |
| { |
| "epoch": 3.237829517029055, |
| "grad_norm": 0.12934145585469362, |
| "learning_rate": 9.78227773593354e-06, |
| "loss": 0.0862, |
| "mean_token_accuracy": 0.9699920177459717, |
| "step": 8415 |
| }, |
| { |
| "epoch": 3.2397537040600346, |
| "grad_norm": 0.124341432322928, |
| "learning_rate": 9.759008876608766e-06, |
| "loss": 0.0875, |
| "mean_token_accuracy": 0.9696942687034606, |
| "step": 8420 |
| }, |
| { |
| "epoch": 3.241677891091014, |
| "grad_norm": 0.1305683160263325, |
| "learning_rate": 9.73579006779711e-06, |
| "loss": 0.0864, |
| "mean_token_accuracy": 0.9699869990348816, |
| "step": 8425 |
| }, |
| { |
| "epoch": 3.2436020781219934, |
| "grad_norm": 0.12006655217324837, |
| "learning_rate": 9.712621375002999e-06, |
| "loss": 0.0846, |
| "mean_token_accuracy": 0.9707320332527161, |
| "step": 8430 |
| }, |
| { |
| "epoch": 3.245526265152973, |
| "grad_norm": 0.12418308091527765, |
| "learning_rate": 9.689502863589458e-06, |
| "loss": 0.0875, |
| "mean_token_accuracy": 0.9693997085094452, |
| "step": 8435 |
| }, |
| { |
| "epoch": 3.2474504521839522, |
| "grad_norm": 0.1273627357393596, |
| "learning_rate": 9.666434598777944e-06, |
| "loss": 0.0844, |
| "mean_token_accuracy": 0.9706789910793304, |
| "step": 8440 |
| }, |
| { |
| "epoch": 3.2493746392149316, |
| "grad_norm": 0.12610851458563718, |
| "learning_rate": 9.643416645648162e-06, |
| "loss": 0.0854, |
| "mean_token_accuracy": 0.9701983332633972, |
| "step": 8445 |
| }, |
| { |
| "epoch": 3.251298826245911, |
| "grad_norm": 0.12462572164326881, |
| "learning_rate": 9.620449069137891e-06, |
| "loss": 0.0852, |
| "mean_token_accuracy": 0.970365023612976, |
| "step": 8450 |
| }, |
| { |
| "epoch": 3.2532230132768905, |
| "grad_norm": 0.12695478875600175, |
| "learning_rate": 9.597531934042773e-06, |
| "loss": 0.0853, |
| "mean_token_accuracy": 0.9702978789806366, |
| "step": 8455 |
| }, |
| { |
| "epoch": 3.25514720030787, |
| "grad_norm": 0.12596431800857663, |
| "learning_rate": 9.574665305016148e-06, |
| "loss": 0.0874, |
| "mean_token_accuracy": 0.9696771502494812, |
| "step": 8460 |
| }, |
| { |
| "epoch": 3.2570713873388493, |
| "grad_norm": 0.1263908940111466, |
| "learning_rate": 9.551849246568866e-06, |
| "loss": 0.0849, |
| "mean_token_accuracy": 0.9705479264259338, |
| "step": 8465 |
| }, |
| { |
| "epoch": 3.2589955743698287, |
| "grad_norm": 0.12796106826662196, |
| "learning_rate": 9.529083823069123e-06, |
| "loss": 0.0859, |
| "mean_token_accuracy": 0.9702809691429138, |
| "step": 8470 |
| }, |
| { |
| "epoch": 3.260919761400808, |
| "grad_norm": 0.12584104905433993, |
| "learning_rate": 9.506369098742257e-06, |
| "loss": 0.0838, |
| "mean_token_accuracy": 0.9710902273654938, |
| "step": 8475 |
| }, |
| { |
| "epoch": 3.2628439484317875, |
| "grad_norm": 0.128886611194648, |
| "learning_rate": 9.483705137670563e-06, |
| "loss": 0.0841, |
| "mean_token_accuracy": 0.9707912802696228, |
| "step": 8480 |
| }, |
| { |
| "epoch": 3.264768135462767, |
| "grad_norm": 0.124633168632203, |
| "learning_rate": 9.46109200379314e-06, |
| "loss": 0.0862, |
| "mean_token_accuracy": 0.9700573444366455, |
| "step": 8485 |
| }, |
| { |
| "epoch": 3.2666923224937463, |
| "grad_norm": 0.13217726055423035, |
| "learning_rate": 9.438529760905694e-06, |
| "loss": 0.0862, |
| "mean_token_accuracy": 0.9699700951576233, |
| "step": 8490 |
| }, |
| { |
| "epoch": 3.2686165095247257, |
| "grad_norm": 0.1252762176406864, |
| "learning_rate": 9.41601847266034e-06, |
| "loss": 0.087, |
| "mean_token_accuracy": 0.9699073255062103, |
| "step": 8495 |
| }, |
| { |
| "epoch": 3.270540696555705, |
| "grad_norm": 0.1332115069886139, |
| "learning_rate": 9.393558202565469e-06, |
| "loss": 0.0869, |
| "mean_token_accuracy": 0.9696744382381439, |
| "step": 8500 |
| }, |
| { |
| "epoch": 3.2724648835866845, |
| "grad_norm": 0.13767971258879022, |
| "learning_rate": 9.37114901398551e-06, |
| "loss": 0.0867, |
| "mean_token_accuracy": 0.9698637843132019, |
| "step": 8505 |
| }, |
| { |
| "epoch": 3.274389070617664, |
| "grad_norm": 0.12634734486679827, |
| "learning_rate": 9.348790970140803e-06, |
| "loss": 0.0877, |
| "mean_token_accuracy": 0.9696608185768127, |
| "step": 8510 |
| }, |
| { |
| "epoch": 3.2763132576486433, |
| "grad_norm": 0.12463310795171538, |
| "learning_rate": 9.326484134107397e-06, |
| "loss": 0.0856, |
| "mean_token_accuracy": 0.9704243242740631, |
| "step": 8515 |
| }, |
| { |
| "epoch": 3.2782374446796227, |
| "grad_norm": 0.13231686223559833, |
| "learning_rate": 9.304228568816873e-06, |
| "loss": 0.0843, |
| "mean_token_accuracy": 0.9707625210285187, |
| "step": 8520 |
| }, |
| { |
| "epoch": 3.280161631710602, |
| "grad_norm": 0.12724142915743591, |
| "learning_rate": 9.282024337056164e-06, |
| "loss": 0.0852, |
| "mean_token_accuracy": 0.9705350875854493, |
| "step": 8525 |
| }, |
| { |
| "epoch": 3.2820858187415816, |
| "grad_norm": 0.12371333151127234, |
| "learning_rate": 9.259871501467374e-06, |
| "loss": 0.0846, |
| "mean_token_accuracy": 0.9705982744693756, |
| "step": 8530 |
| }, |
| { |
| "epoch": 3.284010005772561, |
| "grad_norm": 0.12490000522762193, |
| "learning_rate": 9.23777012454763e-06, |
| "loss": 0.0853, |
| "mean_token_accuracy": 0.9704672932624817, |
| "step": 8535 |
| }, |
| { |
| "epoch": 3.2859341928035404, |
| "grad_norm": 0.1289941739689611, |
| "learning_rate": 9.215720268648878e-06, |
| "loss": 0.0857, |
| "mean_token_accuracy": 0.970098227262497, |
| "step": 8540 |
| }, |
| { |
| "epoch": 3.28785837983452, |
| "grad_norm": 0.12967348920086358, |
| "learning_rate": 9.193721995977698e-06, |
| "loss": 0.0861, |
| "mean_token_accuracy": 0.9699119985103607, |
| "step": 8545 |
| }, |
| { |
| "epoch": 3.289782566865499, |
| "grad_norm": 0.12851709397206285, |
| "learning_rate": 9.17177536859517e-06, |
| "loss": 0.0864, |
| "mean_token_accuracy": 0.9700691282749176, |
| "step": 8550 |
| }, |
| { |
| "epoch": 3.2917067538964786, |
| "grad_norm": 0.1231932806947136, |
| "learning_rate": 9.149880448416648e-06, |
| "loss": 0.0857, |
| "mean_token_accuracy": 0.9702551066875458, |
| "step": 8555 |
| }, |
| { |
| "epoch": 3.293630940927458, |
| "grad_norm": 0.12495074882295382, |
| "learning_rate": 9.128037297211634e-06, |
| "loss": 0.0876, |
| "mean_token_accuracy": 0.9694799661636353, |
| "step": 8560 |
| }, |
| { |
| "epoch": 3.2955551279584374, |
| "grad_norm": 0.1288804330900242, |
| "learning_rate": 9.10624597660356e-06, |
| "loss": 0.0864, |
| "mean_token_accuracy": 0.9698720872402191, |
| "step": 8565 |
| }, |
| { |
| "epoch": 3.297479314989417, |
| "grad_norm": 0.12501816949094877, |
| "learning_rate": 9.084506548069645e-06, |
| "loss": 0.0858, |
| "mean_token_accuracy": 0.9702178537845612, |
| "step": 8570 |
| }, |
| { |
| "epoch": 3.2994035020203962, |
| "grad_norm": 0.12837424725335872, |
| "learning_rate": 9.062819072940715e-06, |
| "loss": 0.086, |
| "mean_token_accuracy": 0.9699204087257385, |
| "step": 8575 |
| }, |
| { |
| "epoch": 3.3013276890513756, |
| "grad_norm": 0.12569337071515446, |
| "learning_rate": 9.041183612401025e-06, |
| "loss": 0.0844, |
| "mean_token_accuracy": 0.9705540895462036, |
| "step": 8580 |
| }, |
| { |
| "epoch": 3.303251876082355, |
| "grad_norm": 0.1295374658358602, |
| "learning_rate": 9.019600227488077e-06, |
| "loss": 0.0851, |
| "mean_token_accuracy": 0.9705880224704743, |
| "step": 8585 |
| }, |
| { |
| "epoch": 3.3051760631133345, |
| "grad_norm": 0.12715742158265253, |
| "learning_rate": 8.998068979092458e-06, |
| "loss": 0.0853, |
| "mean_token_accuracy": 0.9702557623386383, |
| "step": 8590 |
| }, |
| { |
| "epoch": 3.307100250144314, |
| "grad_norm": 0.13623753283325926, |
| "learning_rate": 8.976589927957687e-06, |
| "loss": 0.0863, |
| "mean_token_accuracy": 0.9700308620929718, |
| "step": 8595 |
| }, |
| { |
| "epoch": 3.3090244371752933, |
| "grad_norm": 0.1281690334105656, |
| "learning_rate": 8.955163134680011e-06, |
| "loss": 0.0862, |
| "mean_token_accuracy": 0.9699817836284638, |
| "step": 8600 |
| }, |
| { |
| "epoch": 3.3109486242062727, |
| "grad_norm": 0.13038861977362848, |
| "learning_rate": 8.933788659708256e-06, |
| "loss": 0.0845, |
| "mean_token_accuracy": 0.9706872582435608, |
| "step": 8605 |
| }, |
| { |
| "epoch": 3.312872811237252, |
| "grad_norm": 0.12433591222697461, |
| "learning_rate": 8.912466563343638e-06, |
| "loss": 0.084, |
| "mean_token_accuracy": 0.9707286357879639, |
| "step": 8610 |
| }, |
| { |
| "epoch": 3.3147969982682315, |
| "grad_norm": 0.12254842004963368, |
| "learning_rate": 8.891196905739604e-06, |
| "loss": 0.0841, |
| "mean_token_accuracy": 0.9708254933357239, |
| "step": 8615 |
| }, |
| { |
| "epoch": 3.316721185299211, |
| "grad_norm": 0.12548382355964321, |
| "learning_rate": 8.869979746901677e-06, |
| "loss": 0.0865, |
| "mean_token_accuracy": 0.9700875878334045, |
| "step": 8620 |
| }, |
| { |
| "epoch": 3.3186453723301903, |
| "grad_norm": 0.13264124283049286, |
| "learning_rate": 8.848815146687257e-06, |
| "loss": 0.0842, |
| "mean_token_accuracy": 0.970658129453659, |
| "step": 8625 |
| }, |
| { |
| "epoch": 3.3205695593611697, |
| "grad_norm": 0.12652569781174539, |
| "learning_rate": 8.827703164805471e-06, |
| "loss": 0.0862, |
| "mean_token_accuracy": 0.9701962292194366, |
| "step": 8630 |
| }, |
| { |
| "epoch": 3.3224937463921496, |
| "grad_norm": 0.12819513526318943, |
| "learning_rate": 8.806643860816998e-06, |
| "loss": 0.0847, |
| "mean_token_accuracy": 0.9706246614456177, |
| "step": 8635 |
| }, |
| { |
| "epoch": 3.3244179334231285, |
| "grad_norm": 0.1298989340360717, |
| "learning_rate": 8.78563729413392e-06, |
| "loss": 0.0872, |
| "mean_token_accuracy": 0.9696630299091339, |
| "step": 8640 |
| }, |
| { |
| "epoch": 3.3263421204541084, |
| "grad_norm": 0.1182656253664839, |
| "learning_rate": 8.764683524019512e-06, |
| "loss": 0.082, |
| "mean_token_accuracy": 0.9715340971946717, |
| "step": 8645 |
| }, |
| { |
| "epoch": 3.3282663074850873, |
| "grad_norm": 0.12708389586633778, |
| "learning_rate": 8.743782609588108e-06, |
| "loss": 0.0846, |
| "mean_token_accuracy": 0.9707981884479523, |
| "step": 8650 |
| }, |
| { |
| "epoch": 3.330190494516067, |
| "grad_norm": 0.1285361679643166, |
| "learning_rate": 8.722934609804937e-06, |
| "loss": 0.0856, |
| "mean_token_accuracy": 0.9702385127544403, |
| "step": 8655 |
| }, |
| { |
| "epoch": 3.332114681547046, |
| "grad_norm": 0.12981013898874733, |
| "learning_rate": 8.702139583485938e-06, |
| "loss": 0.0849, |
| "mean_token_accuracy": 0.9703791618347168, |
| "step": 8660 |
| }, |
| { |
| "epoch": 3.334038868578026, |
| "grad_norm": 0.13135679062104932, |
| "learning_rate": 8.681397589297604e-06, |
| "loss": 0.0854, |
| "mean_token_accuracy": 0.97044517993927, |
| "step": 8665 |
| }, |
| { |
| "epoch": 3.335963055609005, |
| "grad_norm": 0.12858230101996082, |
| "learning_rate": 8.660708685756826e-06, |
| "loss": 0.0868, |
| "mean_token_accuracy": 0.9698947310447693, |
| "step": 8670 |
| }, |
| { |
| "epoch": 3.337887242639985, |
| "grad_norm": 0.13140167849466097, |
| "learning_rate": 8.640072931230696e-06, |
| "loss": 0.0864, |
| "mean_token_accuracy": 0.9699412226676941, |
| "step": 8675 |
| }, |
| { |
| "epoch": 3.339811429670964, |
| "grad_norm": 0.13994509560367993, |
| "learning_rate": 8.619490383936366e-06, |
| "loss": 0.0861, |
| "mean_token_accuracy": 0.9701304972171784, |
| "step": 8680 |
| }, |
| { |
| "epoch": 3.3417356167019436, |
| "grad_norm": 0.1313381474793011, |
| "learning_rate": 8.598961101940898e-06, |
| "loss": 0.0868, |
| "mean_token_accuracy": 0.9697572290897369, |
| "step": 8685 |
| }, |
| { |
| "epoch": 3.3436598037329226, |
| "grad_norm": 0.12773458439501786, |
| "learning_rate": 8.578485143161077e-06, |
| "loss": 0.0869, |
| "mean_token_accuracy": 0.9697576522827148, |
| "step": 8690 |
| }, |
| { |
| "epoch": 3.3455839907639024, |
| "grad_norm": 0.12908337054513808, |
| "learning_rate": 8.558062565363236e-06, |
| "loss": 0.0843, |
| "mean_token_accuracy": 0.9707521021366119, |
| "step": 8695 |
| }, |
| { |
| "epoch": 3.347508177794882, |
| "grad_norm": 0.14050298858857763, |
| "learning_rate": 8.537693426163137e-06, |
| "loss": 0.0871, |
| "mean_token_accuracy": 0.9696966946125031, |
| "step": 8700 |
| }, |
| { |
| "epoch": 3.3494323648258613, |
| "grad_norm": 0.13014009624854814, |
| "learning_rate": 8.517377783025762e-06, |
| "loss": 0.0882, |
| "mean_token_accuracy": 0.9692828714847564, |
| "step": 8705 |
| }, |
| { |
| "epoch": 3.3513565518568407, |
| "grad_norm": 0.13106790772771806, |
| "learning_rate": 8.497115693265184e-06, |
| "loss": 0.0846, |
| "mean_token_accuracy": 0.9706206440925598, |
| "step": 8710 |
| }, |
| { |
| "epoch": 3.35328073888782, |
| "grad_norm": 0.12709539158034155, |
| "learning_rate": 8.476907214044378e-06, |
| "loss": 0.0862, |
| "mean_token_accuracy": 0.9701223254203797, |
| "step": 8715 |
| }, |
| { |
| "epoch": 3.3552049259187995, |
| "grad_norm": 0.12594987993398446, |
| "learning_rate": 8.45675240237509e-06, |
| "loss": 0.0836, |
| "mean_token_accuracy": 0.971149480342865, |
| "step": 8720 |
| }, |
| { |
| "epoch": 3.357129112949779, |
| "grad_norm": 0.12738524473365023, |
| "learning_rate": 8.436651315117652e-06, |
| "loss": 0.0843, |
| "mean_token_accuracy": 0.9705275237560272, |
| "step": 8725 |
| }, |
| { |
| "epoch": 3.3590532999807583, |
| "grad_norm": 0.1282987237539586, |
| "learning_rate": 8.416604008980836e-06, |
| "loss": 0.0863, |
| "mean_token_accuracy": 0.9699268102645874, |
| "step": 8730 |
| }, |
| { |
| "epoch": 3.3609774870117377, |
| "grad_norm": 0.1260384071569397, |
| "learning_rate": 8.396610540521679e-06, |
| "loss": 0.0849, |
| "mean_token_accuracy": 0.9706168353557587, |
| "step": 8735 |
| }, |
| { |
| "epoch": 3.362901674042717, |
| "grad_norm": 0.13272296976352874, |
| "learning_rate": 8.376670966145328e-06, |
| "loss": 0.0848, |
| "mean_token_accuracy": 0.9704327166080475, |
| "step": 8740 |
| }, |
| { |
| "epoch": 3.3648258610736965, |
| "grad_norm": 0.13256551590203566, |
| "learning_rate": 8.3567853421049e-06, |
| "loss": 0.0866, |
| "mean_token_accuracy": 0.9698093652725219, |
| "step": 8745 |
| }, |
| { |
| "epoch": 3.366750048104676, |
| "grad_norm": 0.1296375896640046, |
| "learning_rate": 8.336953724501301e-06, |
| "loss": 0.0847, |
| "mean_token_accuracy": 0.9705244064331054, |
| "step": 8750 |
| }, |
| { |
| "epoch": 3.3686742351356553, |
| "grad_norm": 0.12871414388401403, |
| "learning_rate": 8.317176169283084e-06, |
| "loss": 0.0869, |
| "mean_token_accuracy": 0.9697363972663879, |
| "step": 8755 |
| }, |
| { |
| "epoch": 3.3705984221666347, |
| "grad_norm": 0.12867392118207444, |
| "learning_rate": 8.297452732246263e-06, |
| "loss": 0.0854, |
| "mean_token_accuracy": 0.9703603863716126, |
| "step": 8760 |
| }, |
| { |
| "epoch": 3.372522609197614, |
| "grad_norm": 0.13251967572536488, |
| "learning_rate": 8.277783469034189e-06, |
| "loss": 0.0853, |
| "mean_token_accuracy": 0.9703691661357879, |
| "step": 8765 |
| }, |
| { |
| "epoch": 3.3744467962285936, |
| "grad_norm": 0.12963231003298034, |
| "learning_rate": 8.258168435137373e-06, |
| "loss": 0.0841, |
| "mean_token_accuracy": 0.9707355380058289, |
| "step": 8770 |
| }, |
| { |
| "epoch": 3.376370983259573, |
| "grad_norm": 0.13040555109562293, |
| "learning_rate": 8.238607685893345e-06, |
| "loss": 0.0844, |
| "mean_token_accuracy": 0.9706344664096832, |
| "step": 8775 |
| }, |
| { |
| "epoch": 3.3782951702905524, |
| "grad_norm": 0.1269499257113466, |
| "learning_rate": 8.219101276486479e-06, |
| "loss": 0.0839, |
| "mean_token_accuracy": 0.9708466231822968, |
| "step": 8780 |
| }, |
| { |
| "epoch": 3.380219357321532, |
| "grad_norm": 0.12505828556642629, |
| "learning_rate": 8.199649261947847e-06, |
| "loss": 0.0852, |
| "mean_token_accuracy": 0.9703607201576233, |
| "step": 8785 |
| }, |
| { |
| "epoch": 3.382143544352511, |
| "grad_norm": 0.12435969457559475, |
| "learning_rate": 8.180251697155073e-06, |
| "loss": 0.0864, |
| "mean_token_accuracy": 0.9701003551483154, |
| "step": 8790 |
| }, |
| { |
| "epoch": 3.3840677313834906, |
| "grad_norm": 0.1313400660491146, |
| "learning_rate": 8.160908636832154e-06, |
| "loss": 0.0862, |
| "mean_token_accuracy": 0.9699647605419159, |
| "step": 8795 |
| }, |
| { |
| "epoch": 3.38599191841447, |
| "grad_norm": 0.12681342418139757, |
| "learning_rate": 8.141620135549327e-06, |
| "loss": 0.0864, |
| "mean_token_accuracy": 0.97001833319664, |
| "step": 8800 |
| }, |
| { |
| "epoch": 3.3879161054454494, |
| "grad_norm": 0.1267393721622519, |
| "learning_rate": 8.122386247722907e-06, |
| "loss": 0.0842, |
| "mean_token_accuracy": 0.9708464503288269, |
| "step": 8805 |
| }, |
| { |
| "epoch": 3.389840292476429, |
| "grad_norm": 0.13209641998243066, |
| "learning_rate": 8.103207027615142e-06, |
| "loss": 0.0877, |
| "mean_token_accuracy": 0.9694723188877106, |
| "step": 8810 |
| }, |
| { |
| "epoch": 3.3917644795074082, |
| "grad_norm": 0.12532336727924384, |
| "learning_rate": 8.084082529334043e-06, |
| "loss": 0.0848, |
| "mean_token_accuracy": 0.9706017315387726, |
| "step": 8815 |
| }, |
| { |
| "epoch": 3.3936886665383876, |
| "grad_norm": 0.12709702174008902, |
| "learning_rate": 8.065012806833249e-06, |
| "loss": 0.0849, |
| "mean_token_accuracy": 0.9706872940063477, |
| "step": 8820 |
| }, |
| { |
| "epoch": 3.395612853569367, |
| "grad_norm": 0.12864795259469106, |
| "learning_rate": 8.045997913911862e-06, |
| "loss": 0.0872, |
| "mean_token_accuracy": 0.9698288381099701, |
| "step": 8825 |
| }, |
| { |
| "epoch": 3.3975370406003464, |
| "grad_norm": 0.12646239900475942, |
| "learning_rate": 8.027037904214292e-06, |
| "loss": 0.0854, |
| "mean_token_accuracy": 0.9703909814357757, |
| "step": 8830 |
| }, |
| { |
| "epoch": 3.399461227631326, |
| "grad_norm": 0.12565513142414456, |
| "learning_rate": 8.008132831230126e-06, |
| "loss": 0.0842, |
| "mean_token_accuracy": 0.9707223236560821, |
| "step": 8835 |
| }, |
| { |
| "epoch": 3.4013854146623053, |
| "grad_norm": 0.12760894959438268, |
| "learning_rate": 7.989282748293967e-06, |
| "loss": 0.0858, |
| "mean_token_accuracy": 0.9702325403690338, |
| "step": 8840 |
| }, |
| { |
| "epoch": 3.4033096016932847, |
| "grad_norm": 0.12872157096414913, |
| "learning_rate": 7.970487708585263e-06, |
| "loss": 0.0849, |
| "mean_token_accuracy": 0.9705387651920319, |
| "step": 8845 |
| }, |
| { |
| "epoch": 3.405233788724264, |
| "grad_norm": 0.13074364076123604, |
| "learning_rate": 7.951747765128198e-06, |
| "loss": 0.0852, |
| "mean_token_accuracy": 0.9703906655311585, |
| "step": 8850 |
| }, |
| { |
| "epoch": 3.4071579757552435, |
| "grad_norm": 0.12860828576759153, |
| "learning_rate": 7.933062970791503e-06, |
| "loss": 0.0825, |
| "mean_token_accuracy": 0.9713438928127289, |
| "step": 8855 |
| }, |
| { |
| "epoch": 3.409082162786223, |
| "grad_norm": 0.12596652807591652, |
| "learning_rate": 7.91443337828833e-06, |
| "loss": 0.0851, |
| "mean_token_accuracy": 0.970458322763443, |
| "step": 8860 |
| }, |
| { |
| "epoch": 3.4110063498172023, |
| "grad_norm": 0.12706147071281007, |
| "learning_rate": 7.895859040176094e-06, |
| "loss": 0.0854, |
| "mean_token_accuracy": 0.9701674222946167, |
| "step": 8865 |
| }, |
| { |
| "epoch": 3.4129305368481817, |
| "grad_norm": 0.12809440511811315, |
| "learning_rate": 7.877340008856327e-06, |
| "loss": 0.086, |
| "mean_token_accuracy": 0.9701098024845123, |
| "step": 8870 |
| }, |
| { |
| "epoch": 3.414854723879161, |
| "grad_norm": 0.1246733969536944, |
| "learning_rate": 7.858876336574538e-06, |
| "loss": 0.0858, |
| "mean_token_accuracy": 0.9701932430267334, |
| "step": 8875 |
| }, |
| { |
| "epoch": 3.4167789109101405, |
| "grad_norm": 0.13569869596719736, |
| "learning_rate": 7.840468075420057e-06, |
| "loss": 0.084, |
| "mean_token_accuracy": 0.9707090973854064, |
| "step": 8880 |
| }, |
| { |
| "epoch": 3.41870309794112, |
| "grad_norm": 0.12870972415192447, |
| "learning_rate": 7.822115277325882e-06, |
| "loss": 0.0864, |
| "mean_token_accuracy": 0.9700087785720826, |
| "step": 8885 |
| }, |
| { |
| "epoch": 3.4206272849720993, |
| "grad_norm": 0.1233483285767941, |
| "learning_rate": 7.803817994068534e-06, |
| "loss": 0.0864, |
| "mean_token_accuracy": 0.9699972093105316, |
| "step": 8890 |
| }, |
| { |
| "epoch": 3.4225514720030787, |
| "grad_norm": 0.1248853456521094, |
| "learning_rate": 7.785576277267934e-06, |
| "loss": 0.0855, |
| "mean_token_accuracy": 0.9701894283294678, |
| "step": 8895 |
| }, |
| { |
| "epoch": 3.424475659034058, |
| "grad_norm": 0.13033547431633313, |
| "learning_rate": 7.767390178387231e-06, |
| "loss": 0.087, |
| "mean_token_accuracy": 0.9697039365768433, |
| "step": 8900 |
| }, |
| { |
| "epoch": 3.4263998460650376, |
| "grad_norm": 0.1282445236894112, |
| "learning_rate": 7.749259748732671e-06, |
| "loss": 0.0865, |
| "mean_token_accuracy": 0.9699876427650451, |
| "step": 8905 |
| }, |
| { |
| "epoch": 3.428324033096017, |
| "grad_norm": 0.13185925812837074, |
| "learning_rate": 7.731185039453438e-06, |
| "loss": 0.0852, |
| "mean_token_accuracy": 0.9702568829059601, |
| "step": 8910 |
| }, |
| { |
| "epoch": 3.4302482201269964, |
| "grad_norm": 0.12881109452347636, |
| "learning_rate": 7.713166101541522e-06, |
| "loss": 0.0842, |
| "mean_token_accuracy": 0.970571768283844, |
| "step": 8915 |
| }, |
| { |
| "epoch": 3.432172407157976, |
| "grad_norm": 0.12574808543435886, |
| "learning_rate": 7.695202985831577e-06, |
| "loss": 0.0844, |
| "mean_token_accuracy": 0.9705647766590119, |
| "step": 8920 |
| }, |
| { |
| "epoch": 3.434096594188955, |
| "grad_norm": 0.12738407701511564, |
| "learning_rate": 7.677295743000772e-06, |
| "loss": 0.0829, |
| "mean_token_accuracy": 0.9711636543273926, |
| "step": 8925 |
| }, |
| { |
| "epoch": 3.4360207812199346, |
| "grad_norm": 0.12629537602643848, |
| "learning_rate": 7.659444423568638e-06, |
| "loss": 0.0864, |
| "mean_token_accuracy": 0.97016361951828, |
| "step": 8930 |
| }, |
| { |
| "epoch": 3.437944968250914, |
| "grad_norm": 0.13288314080760535, |
| "learning_rate": 7.641649077896947e-06, |
| "loss": 0.0849, |
| "mean_token_accuracy": 0.9705747485160827, |
| "step": 8935 |
| }, |
| { |
| "epoch": 3.4398691552818934, |
| "grad_norm": 0.12750592143154055, |
| "learning_rate": 7.623909756189565e-06, |
| "loss": 0.0848, |
| "mean_token_accuracy": 0.970649367570877, |
| "step": 8940 |
| }, |
| { |
| "epoch": 3.441793342312873, |
| "grad_norm": 0.1261252863356084, |
| "learning_rate": 7.606226508492286e-06, |
| "loss": 0.0838, |
| "mean_token_accuracy": 0.9709120571613312, |
| "step": 8945 |
| }, |
| { |
| "epoch": 3.4437175293438522, |
| "grad_norm": 0.12844346524328273, |
| "learning_rate": 7.588599384692719e-06, |
| "loss": 0.0874, |
| "mean_token_accuracy": 0.9696136832237243, |
| "step": 8950 |
| }, |
| { |
| "epoch": 3.4456417163748316, |
| "grad_norm": 0.128895448462382, |
| "learning_rate": 7.571028434520136e-06, |
| "loss": 0.0856, |
| "mean_token_accuracy": 0.9703655481338501, |
| "step": 8955 |
| }, |
| { |
| "epoch": 3.447565903405811, |
| "grad_norm": 0.12565867696603822, |
| "learning_rate": 7.553513707545339e-06, |
| "loss": 0.0837, |
| "mean_token_accuracy": 0.971028083562851, |
| "step": 8960 |
| }, |
| { |
| "epoch": 3.4494900904367904, |
| "grad_norm": 0.1294464609680728, |
| "learning_rate": 7.536055253180511e-06, |
| "loss": 0.084, |
| "mean_token_accuracy": 0.9708369553089142, |
| "step": 8965 |
| }, |
| { |
| "epoch": 3.45141427746777, |
| "grad_norm": 0.12779041889602735, |
| "learning_rate": 7.518653120679074e-06, |
| "loss": 0.085, |
| "mean_token_accuracy": 0.9704262435436248, |
| "step": 8970 |
| }, |
| { |
| "epoch": 3.4533384644987493, |
| "grad_norm": 0.12443532467597591, |
| "learning_rate": 7.501307359135556e-06, |
| "loss": 0.0822, |
| "mean_token_accuracy": 0.9715291023254394, |
| "step": 8975 |
| }, |
| { |
| "epoch": 3.4552626515297287, |
| "grad_norm": 0.12187677311471684, |
| "learning_rate": 7.484018017485463e-06, |
| "loss": 0.0826, |
| "mean_token_accuracy": 0.9712438225746155, |
| "step": 8980 |
| }, |
| { |
| "epoch": 3.457186838560708, |
| "grad_norm": 0.12822302730085042, |
| "learning_rate": 7.466785144505123e-06, |
| "loss": 0.0859, |
| "mean_token_accuracy": 0.9702449262142181, |
| "step": 8985 |
| }, |
| { |
| "epoch": 3.4591110255916875, |
| "grad_norm": 0.13088413276881763, |
| "learning_rate": 7.44960878881156e-06, |
| "loss": 0.0835, |
| "mean_token_accuracy": 0.9710516095161438, |
| "step": 8990 |
| }, |
| { |
| "epoch": 3.461035212622667, |
| "grad_norm": 0.1225224566435389, |
| "learning_rate": 7.432488998862341e-06, |
| "loss": 0.0849, |
| "mean_token_accuracy": 0.9705376029014587, |
| "step": 8995 |
| }, |
| { |
| "epoch": 3.4629593996536463, |
| "grad_norm": 0.1384362152617989, |
| "learning_rate": 7.415425822955471e-06, |
| "loss": 0.0868, |
| "mean_token_accuracy": 0.9698244988918304, |
| "step": 9000 |
| }, |
| { |
| "epoch": 3.4648835866846257, |
| "grad_norm": 0.14183049964477246, |
| "learning_rate": 7.398419309229211e-06, |
| "loss": 0.086, |
| "mean_token_accuracy": 0.9700653553009033, |
| "step": 9005 |
| }, |
| { |
| "epoch": 3.466807773715605, |
| "grad_norm": 0.12297008925890439, |
| "learning_rate": 7.3814695056619946e-06, |
| "loss": 0.0853, |
| "mean_token_accuracy": 0.9703586101531982, |
| "step": 9010 |
| }, |
| { |
| "epoch": 3.4687319607465845, |
| "grad_norm": 0.12722759866701883, |
| "learning_rate": 7.364576460072245e-06, |
| "loss": 0.0859, |
| "mean_token_accuracy": 0.9700540244579315, |
| "step": 9015 |
| }, |
| { |
| "epoch": 3.470656147777564, |
| "grad_norm": 0.1332776354167734, |
| "learning_rate": 7.347740220118271e-06, |
| "loss": 0.0831, |
| "mean_token_accuracy": 0.9711086690425873, |
| "step": 9020 |
| }, |
| { |
| "epoch": 3.4725803348085433, |
| "grad_norm": 0.13357779167266728, |
| "learning_rate": 7.330960833298123e-06, |
| "loss": 0.0848, |
| "mean_token_accuracy": 0.9706657350063324, |
| "step": 9025 |
| }, |
| { |
| "epoch": 3.4745045218395227, |
| "grad_norm": 0.21084750779345962, |
| "learning_rate": 7.314238346949456e-06, |
| "loss": 0.0852, |
| "mean_token_accuracy": 0.970378315448761, |
| "step": 9030 |
| }, |
| { |
| "epoch": 3.476428708870502, |
| "grad_norm": 0.12639746006093333, |
| "learning_rate": 7.297572808249399e-06, |
| "loss": 0.0841, |
| "mean_token_accuracy": 0.9707970798015595, |
| "step": 9035 |
| }, |
| { |
| "epoch": 3.4783528959014816, |
| "grad_norm": 0.12444375526245546, |
| "learning_rate": 7.280964264214416e-06, |
| "loss": 0.0839, |
| "mean_token_accuracy": 0.9709853947162628, |
| "step": 9040 |
| }, |
| { |
| "epoch": 3.480277082932461, |
| "grad_norm": 0.12516347291350133, |
| "learning_rate": 7.264412761700186e-06, |
| "loss": 0.0839, |
| "mean_token_accuracy": 0.9707713544368743, |
| "step": 9045 |
| }, |
| { |
| "epoch": 3.4822012699634404, |
| "grad_norm": 0.1275743992676517, |
| "learning_rate": 7.247918347401464e-06, |
| "loss": 0.0842, |
| "mean_token_accuracy": 0.9707460880279541, |
| "step": 9050 |
| }, |
| { |
| "epoch": 3.48412545699442, |
| "grad_norm": 0.1264298342167565, |
| "learning_rate": 7.23148106785195e-06, |
| "loss": 0.0839, |
| "mean_token_accuracy": 0.9708225727081299, |
| "step": 9055 |
| }, |
| { |
| "epoch": 3.486049644025399, |
| "grad_norm": 0.1396312586335439, |
| "learning_rate": 7.21510096942415e-06, |
| "loss": 0.0853, |
| "mean_token_accuracy": 0.9702761232852936, |
| "step": 9060 |
| }, |
| { |
| "epoch": 3.4879738310563786, |
| "grad_norm": 0.12600088149391286, |
| "learning_rate": 7.1987780983292506e-06, |
| "loss": 0.0835, |
| "mean_token_accuracy": 0.9709597945213317, |
| "step": 9065 |
| }, |
| { |
| "epoch": 3.489898018087358, |
| "grad_norm": 0.12802814534644144, |
| "learning_rate": 7.1825125006169986e-06, |
| "loss": 0.0856, |
| "mean_token_accuracy": 0.9703998446464539, |
| "step": 9070 |
| }, |
| { |
| "epoch": 3.4918222051183374, |
| "grad_norm": 0.1280563216277743, |
| "learning_rate": 7.16630422217556e-06, |
| "loss": 0.0868, |
| "mean_token_accuracy": 0.9698512375354766, |
| "step": 9075 |
| }, |
| { |
| "epoch": 3.493746392149317, |
| "grad_norm": 0.13260569969650457, |
| "learning_rate": 7.150153308731388e-06, |
| "loss": 0.0835, |
| "mean_token_accuracy": 0.9711039125919342, |
| "step": 9080 |
| }, |
| { |
| "epoch": 3.4956705791802962, |
| "grad_norm": 0.1261891301050121, |
| "learning_rate": 7.1340598058490995e-06, |
| "loss": 0.0834, |
| "mean_token_accuracy": 0.9710277199745179, |
| "step": 9085 |
| }, |
| { |
| "epoch": 3.4975947662112756, |
| "grad_norm": 0.12707472253510566, |
| "learning_rate": 7.118023758931357e-06, |
| "loss": 0.0865, |
| "mean_token_accuracy": 0.9699563682079315, |
| "step": 9090 |
| }, |
| { |
| "epoch": 3.499518953242255, |
| "grad_norm": 0.12499224818434655, |
| "learning_rate": 7.102045213218714e-06, |
| "loss": 0.0822, |
| "mean_token_accuracy": 0.9716532945632934, |
| "step": 9095 |
| }, |
| { |
| "epoch": 3.5014431402732344, |
| "grad_norm": 0.131033648481688, |
| "learning_rate": 7.086124213789506e-06, |
| "loss": 0.0866, |
| "mean_token_accuracy": 0.9699096500873565, |
| "step": 9100 |
| }, |
| { |
| "epoch": 3.503367327304214, |
| "grad_norm": 0.13685556027766135, |
| "learning_rate": 7.070260805559728e-06, |
| "loss": 0.0856, |
| "mean_token_accuracy": 0.9701919317245483, |
| "step": 9105 |
| }, |
| { |
| "epoch": 3.5052915143351933, |
| "grad_norm": 0.12763545572226542, |
| "learning_rate": 7.054455033282899e-06, |
| "loss": 0.0805, |
| "mean_token_accuracy": 0.9719826638698578, |
| "step": 9110 |
| }, |
| { |
| "epoch": 3.5072157013661727, |
| "grad_norm": 0.12934854751913, |
| "learning_rate": 7.03870694154993e-06, |
| "loss": 0.0832, |
| "mean_token_accuracy": 0.9709598600864411, |
| "step": 9115 |
| }, |
| { |
| "epoch": 3.509139888397152, |
| "grad_norm": 0.13067551146740328, |
| "learning_rate": 7.023016574789008e-06, |
| "loss": 0.0817, |
| "mean_token_accuracy": 0.9717234134674072, |
| "step": 9120 |
| }, |
| { |
| "epoch": 3.5110640754281315, |
| "grad_norm": 0.126533600046491, |
| "learning_rate": 7.007383977265465e-06, |
| "loss": 0.083, |
| "mean_token_accuracy": 0.9712643921375275, |
| "step": 9125 |
| }, |
| { |
| "epoch": 3.512988262459111, |
| "grad_norm": 0.1266039435538283, |
| "learning_rate": 6.991809193081661e-06, |
| "loss": 0.0867, |
| "mean_token_accuracy": 0.969794648885727, |
| "step": 9130 |
| }, |
| { |
| "epoch": 3.5149124494900903, |
| "grad_norm": 0.12863707875363184, |
| "learning_rate": 6.976292266176848e-06, |
| "loss": 0.0836, |
| "mean_token_accuracy": 0.9711201667785645, |
| "step": 9135 |
| }, |
| { |
| "epoch": 3.5168366365210697, |
| "grad_norm": 0.1284053461949965, |
| "learning_rate": 6.9608332403270655e-06, |
| "loss": 0.0828, |
| "mean_token_accuracy": 0.9714163899421692, |
| "step": 9140 |
| }, |
| { |
| "epoch": 3.518760823552049, |
| "grad_norm": 0.12531953200720355, |
| "learning_rate": 6.945432159144982e-06, |
| "loss": 0.0858, |
| "mean_token_accuracy": 0.9702307283878326, |
| "step": 9145 |
| }, |
| { |
| "epoch": 3.5206850105830285, |
| "grad_norm": 0.12688678414941892, |
| "learning_rate": 6.930089066079816e-06, |
| "loss": 0.0871, |
| "mean_token_accuracy": 0.9698631703853607, |
| "step": 9150 |
| }, |
| { |
| "epoch": 3.522609197614008, |
| "grad_norm": 0.13018679852235768, |
| "learning_rate": 6.9148040044171705e-06, |
| "loss": 0.0836, |
| "mean_token_accuracy": 0.9709725737571716, |
| "step": 9155 |
| }, |
| { |
| "epoch": 3.5245333846449873, |
| "grad_norm": 0.1391861731141968, |
| "learning_rate": 6.899577017278952e-06, |
| "loss": 0.085, |
| "mean_token_accuracy": 0.9702840864658355, |
| "step": 9160 |
| }, |
| { |
| "epoch": 3.5264575716759667, |
| "grad_norm": 0.12922872655559622, |
| "learning_rate": 6.8844081476232076e-06, |
| "loss": 0.0845, |
| "mean_token_accuracy": 0.9706377267837525, |
| "step": 9165 |
| }, |
| { |
| "epoch": 3.5283817587069466, |
| "grad_norm": 0.13085069031984367, |
| "learning_rate": 6.869297438244039e-06, |
| "loss": 0.0835, |
| "mean_token_accuracy": 0.9710263013839722, |
| "step": 9170 |
| }, |
| { |
| "epoch": 3.5303059457379256, |
| "grad_norm": 0.15929457260546562, |
| "learning_rate": 6.854244931771467e-06, |
| "loss": 0.086, |
| "mean_token_accuracy": 0.970440661907196, |
| "step": 9175 |
| }, |
| { |
| "epoch": 3.5322301327689054, |
| "grad_norm": 0.13113039818982128, |
| "learning_rate": 6.839250670671308e-06, |
| "loss": 0.0842, |
| "mean_token_accuracy": 0.9708048164844513, |
| "step": 9180 |
| }, |
| { |
| "epoch": 3.5341543197998844, |
| "grad_norm": 0.12947898341430028, |
| "learning_rate": 6.824314697245057e-06, |
| "loss": 0.0834, |
| "mean_token_accuracy": 0.9711683690547943, |
| "step": 9185 |
| }, |
| { |
| "epoch": 3.536078506830864, |
| "grad_norm": 0.12805154763824103, |
| "learning_rate": 6.8094370536297665e-06, |
| "loss": 0.0853, |
| "mean_token_accuracy": 0.970403516292572, |
| "step": 9190 |
| }, |
| { |
| "epoch": 3.538002693861843, |
| "grad_norm": 0.12863808995173834, |
| "learning_rate": 6.794617781797934e-06, |
| "loss": 0.084, |
| "mean_token_accuracy": 0.970603746175766, |
| "step": 9195 |
| }, |
| { |
| "epoch": 3.539926880892823, |
| "grad_norm": 0.126568470390018, |
| "learning_rate": 6.779856923557385e-06, |
| "loss": 0.086, |
| "mean_token_accuracy": 0.9701663970947265, |
| "step": 9200 |
| }, |
| { |
| "epoch": 3.541851067923802, |
| "grad_norm": 0.12205274777553857, |
| "learning_rate": 6.765154520551134e-06, |
| "loss": 0.0848, |
| "mean_token_accuracy": 0.9704317271709442, |
| "step": 9205 |
| }, |
| { |
| "epoch": 3.543775254954782, |
| "grad_norm": 0.13594787640050665, |
| "learning_rate": 6.750510614257306e-06, |
| "loss": 0.0833, |
| "mean_token_accuracy": 0.9710303544998169, |
| "step": 9210 |
| }, |
| { |
| "epoch": 3.545699441985761, |
| "grad_norm": 0.125655355880411, |
| "learning_rate": 6.735925245988972e-06, |
| "loss": 0.0845, |
| "mean_token_accuracy": 0.970561021566391, |
| "step": 9215 |
| }, |
| { |
| "epoch": 3.5476236290167407, |
| "grad_norm": 0.12923608243530355, |
| "learning_rate": 6.72139845689407e-06, |
| "loss": 0.0829, |
| "mean_token_accuracy": 0.9711350262165069, |
| "step": 9220 |
| }, |
| { |
| "epoch": 3.5495478160477196, |
| "grad_norm": 0.13742684679218364, |
| "learning_rate": 6.706930287955278e-06, |
| "loss": 0.0866, |
| "mean_token_accuracy": 0.9699180245399475, |
| "step": 9225 |
| }, |
| { |
| "epoch": 3.5514720030786995, |
| "grad_norm": 0.12627623911720126, |
| "learning_rate": 6.692520779989888e-06, |
| "loss": 0.0852, |
| "mean_token_accuracy": 0.9704681694507599, |
| "step": 9230 |
| }, |
| { |
| "epoch": 3.5533961901096784, |
| "grad_norm": 0.12790034643230236, |
| "learning_rate": 6.678169973649703e-06, |
| "loss": 0.0869, |
| "mean_token_accuracy": 0.9698730170726776, |
| "step": 9235 |
| }, |
| { |
| "epoch": 3.5553203771406583, |
| "grad_norm": 0.12746343526254894, |
| "learning_rate": 6.663877909420924e-06, |
| "loss": 0.0849, |
| "mean_token_accuracy": 0.9704817056655883, |
| "step": 9240 |
| }, |
| { |
| "epoch": 3.5572445641716373, |
| "grad_norm": 0.12942624345380135, |
| "learning_rate": 6.649644627624014e-06, |
| "loss": 0.0827, |
| "mean_token_accuracy": 0.9712142050266266, |
| "step": 9245 |
| }, |
| { |
| "epoch": 3.559168751202617, |
| "grad_norm": 0.1294626315055323, |
| "learning_rate": 6.635470168413616e-06, |
| "loss": 0.0859, |
| "mean_token_accuracy": 0.970136284828186, |
| "step": 9250 |
| }, |
| { |
| "epoch": 3.561092938233596, |
| "grad_norm": 0.1287659587086771, |
| "learning_rate": 6.62135457177841e-06, |
| "loss": 0.0835, |
| "mean_token_accuracy": 0.97107794880867, |
| "step": 9255 |
| }, |
| { |
| "epoch": 3.563017125264576, |
| "grad_norm": 0.13124629790306566, |
| "learning_rate": 6.60729787754103e-06, |
| "loss": 0.0855, |
| "mean_token_accuracy": 0.9702930569648742, |
| "step": 9260 |
| }, |
| { |
| "epoch": 3.564941312295555, |
| "grad_norm": 0.13150538121829, |
| "learning_rate": 6.593300125357932e-06, |
| "loss": 0.0833, |
| "mean_token_accuracy": 0.9710995376110076, |
| "step": 9265 |
| }, |
| { |
| "epoch": 3.5668654993265347, |
| "grad_norm": 0.12911652795640852, |
| "learning_rate": 6.579361354719271e-06, |
| "loss": 0.0853, |
| "mean_token_accuracy": 0.9702031135559082, |
| "step": 9270 |
| }, |
| { |
| "epoch": 3.5687896863575137, |
| "grad_norm": 0.1261550636731243, |
| "learning_rate": 6.565481604948817e-06, |
| "loss": 0.0823, |
| "mean_token_accuracy": 0.9713754057884216, |
| "step": 9275 |
| }, |
| { |
| "epoch": 3.5707138733884936, |
| "grad_norm": 0.13217408438340994, |
| "learning_rate": 6.551660915203834e-06, |
| "loss": 0.0844, |
| "mean_token_accuracy": 0.9706252694129944, |
| "step": 9280 |
| }, |
| { |
| "epoch": 3.5726380604194725, |
| "grad_norm": 0.12628151081566255, |
| "learning_rate": 6.5378993244749655e-06, |
| "loss": 0.085, |
| "mean_token_accuracy": 0.9705230236053467, |
| "step": 9285 |
| }, |
| { |
| "epoch": 3.5745622474504524, |
| "grad_norm": 0.12658169932670083, |
| "learning_rate": 6.524196871586113e-06, |
| "loss": 0.0849, |
| "mean_token_accuracy": 0.9705257952213288, |
| "step": 9290 |
| }, |
| { |
| "epoch": 3.5764864344814313, |
| "grad_norm": 0.12600332071058087, |
| "learning_rate": 6.510553595194359e-06, |
| "loss": 0.0823, |
| "mean_token_accuracy": 0.971400386095047, |
| "step": 9295 |
| }, |
| { |
| "epoch": 3.578410621512411, |
| "grad_norm": 0.12864461247839032, |
| "learning_rate": 6.496969533789829e-06, |
| "loss": 0.085, |
| "mean_token_accuracy": 0.9705588459968567, |
| "step": 9300 |
| }, |
| { |
| "epoch": 3.58033480854339, |
| "grad_norm": 0.1288969574385522, |
| "learning_rate": 6.483444725695587e-06, |
| "loss": 0.0851, |
| "mean_token_accuracy": 0.9704531967639923, |
| "step": 9305 |
| }, |
| { |
| "epoch": 3.58225899557437, |
| "grad_norm": 0.1258000907397356, |
| "learning_rate": 6.469979209067545e-06, |
| "loss": 0.0848, |
| "mean_token_accuracy": 0.9705624639987945, |
| "step": 9310 |
| }, |
| { |
| "epoch": 3.5841831826053494, |
| "grad_norm": 0.12880633256077986, |
| "learning_rate": 6.456573021894331e-06, |
| "loss": 0.0819, |
| "mean_token_accuracy": 0.97150416970253, |
| "step": 9315 |
| }, |
| { |
| "epoch": 3.586107369636329, |
| "grad_norm": 0.1289644166825434, |
| "learning_rate": 6.443226201997204e-06, |
| "loss": 0.0827, |
| "mean_token_accuracy": 0.9712906897068023, |
| "step": 9320 |
| }, |
| { |
| "epoch": 3.588031556667308, |
| "grad_norm": 0.12788660079345335, |
| "learning_rate": 6.4299387870299335e-06, |
| "loss": 0.0831, |
| "mean_token_accuracy": 0.9710439383983612, |
| "step": 9325 |
| }, |
| { |
| "epoch": 3.5899557436982876, |
| "grad_norm": 0.12775612967950467, |
| "learning_rate": 6.416710814478696e-06, |
| "loss": 0.0832, |
| "mean_token_accuracy": 0.9710244417190552, |
| "step": 9330 |
| }, |
| { |
| "epoch": 3.591879930729267, |
| "grad_norm": 0.12825385865965377, |
| "learning_rate": 6.403542321661972e-06, |
| "loss": 0.0847, |
| "mean_token_accuracy": 0.9705589354038239, |
| "step": 9335 |
| }, |
| { |
| "epoch": 3.5938041177602464, |
| "grad_norm": 0.12991215195891673, |
| "learning_rate": 6.390433345730433e-06, |
| "loss": 0.0851, |
| "mean_token_accuracy": 0.9702601253986358, |
| "step": 9340 |
| }, |
| { |
| "epoch": 3.595728304791226, |
| "grad_norm": 0.1267840160300084, |
| "learning_rate": 6.377383923666852e-06, |
| "loss": 0.0848, |
| "mean_token_accuracy": 0.9707222580909729, |
| "step": 9345 |
| }, |
| { |
| "epoch": 3.5976524918222053, |
| "grad_norm": 0.12276379955317407, |
| "learning_rate": 6.3643940922859886e-06, |
| "loss": 0.0823, |
| "mean_token_accuracy": 0.9711625814437866, |
| "step": 9350 |
| }, |
| { |
| "epoch": 3.5995766788531847, |
| "grad_norm": 0.13249444718998268, |
| "learning_rate": 6.351463888234477e-06, |
| "loss": 0.086, |
| "mean_token_accuracy": 0.9701505243778229, |
| "step": 9355 |
| }, |
| { |
| "epoch": 3.601500865884164, |
| "grad_norm": 0.13043491753878234, |
| "learning_rate": 6.338593347990742e-06, |
| "loss": 0.0841, |
| "mean_token_accuracy": 0.9708169162273407, |
| "step": 9360 |
| }, |
| { |
| "epoch": 3.6034250529151435, |
| "grad_norm": 0.12976735823843522, |
| "learning_rate": 6.325782507864881e-06, |
| "loss": 0.0836, |
| "mean_token_accuracy": 0.9709677278995514, |
| "step": 9365 |
| }, |
| { |
| "epoch": 3.605349239946123, |
| "grad_norm": 0.12815702589482678, |
| "learning_rate": 6.313031403998566e-06, |
| "loss": 0.0834, |
| "mean_token_accuracy": 0.9709154903888703, |
| "step": 9370 |
| }, |
| { |
| "epoch": 3.6072734269771023, |
| "grad_norm": 0.12979207167662427, |
| "learning_rate": 6.300340072364952e-06, |
| "loss": 0.0849, |
| "mean_token_accuracy": 0.9704569756984711, |
| "step": 9375 |
| }, |
| { |
| "epoch": 3.6091976140080817, |
| "grad_norm": 0.126402904194075, |
| "learning_rate": 6.287708548768552e-06, |
| "loss": 0.0834, |
| "mean_token_accuracy": 0.9711087584495545, |
| "step": 9380 |
| }, |
| { |
| "epoch": 3.611121801039061, |
| "grad_norm": 0.12843425985952908, |
| "learning_rate": 6.275136868845155e-06, |
| "loss": 0.0823, |
| "mean_token_accuracy": 0.9714385747909546, |
| "step": 9385 |
| }, |
| { |
| "epoch": 3.6130459880700405, |
| "grad_norm": 0.13333106158008526, |
| "learning_rate": 6.26262506806173e-06, |
| "loss": 0.0829, |
| "mean_token_accuracy": 0.9713011264801026, |
| "step": 9390 |
| }, |
| { |
| "epoch": 3.61497017510102, |
| "grad_norm": 0.14663434827906371, |
| "learning_rate": 6.250173181716304e-06, |
| "loss": 0.0835, |
| "mean_token_accuracy": 0.9708237528800965, |
| "step": 9395 |
| }, |
| { |
| "epoch": 3.6168943621319993, |
| "grad_norm": 0.12692314981959749, |
| "learning_rate": 6.23778124493787e-06, |
| "loss": 0.0843, |
| "mean_token_accuracy": 0.9708507835865021, |
| "step": 9400 |
| }, |
| { |
| "epoch": 3.6188185491629787, |
| "grad_norm": 0.12203236079210081, |
| "learning_rate": 6.2254492926863095e-06, |
| "loss": 0.0845, |
| "mean_token_accuracy": 0.9706545650959015, |
| "step": 9405 |
| }, |
| { |
| "epoch": 3.620742736193958, |
| "grad_norm": 0.12760412027520115, |
| "learning_rate": 6.213177359752266e-06, |
| "loss": 0.0842, |
| "mean_token_accuracy": 0.9708652913570404, |
| "step": 9410 |
| }, |
| { |
| "epoch": 3.6226669232249376, |
| "grad_norm": 0.13813682121731513, |
| "learning_rate": 6.200965480757063e-06, |
| "loss": 0.085, |
| "mean_token_accuracy": 0.9703601837158203, |
| "step": 9415 |
| }, |
| { |
| "epoch": 3.624591110255917, |
| "grad_norm": 0.12734847847373157, |
| "learning_rate": 6.188813690152597e-06, |
| "loss": 0.0847, |
| "mean_token_accuracy": 0.9707621932029724, |
| "step": 9420 |
| }, |
| { |
| "epoch": 3.6265152972868964, |
| "grad_norm": 0.14059425449796248, |
| "learning_rate": 6.176722022221239e-06, |
| "loss": 0.082, |
| "mean_token_accuracy": 0.9715953230857849, |
| "step": 9425 |
| }, |
| { |
| "epoch": 3.6284394843178758, |
| "grad_norm": 0.12913843992546312, |
| "learning_rate": 6.164690511075756e-06, |
| "loss": 0.0837, |
| "mean_token_accuracy": 0.9708456873893738, |
| "step": 9430 |
| }, |
| { |
| "epoch": 3.630363671348855, |
| "grad_norm": 0.12870779437650948, |
| "learning_rate": 6.152719190659195e-06, |
| "loss": 0.0847, |
| "mean_token_accuracy": 0.9706524848937989, |
| "step": 9435 |
| }, |
| { |
| "epoch": 3.6322878583798346, |
| "grad_norm": 0.13145419132449554, |
| "learning_rate": 6.1408080947447915e-06, |
| "loss": 0.0852, |
| "mean_token_accuracy": 0.9703650951385498, |
| "step": 9440 |
| }, |
| { |
| "epoch": 3.634212045410814, |
| "grad_norm": 0.13406184413384944, |
| "learning_rate": 6.128957256935885e-06, |
| "loss": 0.0839, |
| "mean_token_accuracy": 0.9707401990890503, |
| "step": 9445 |
| }, |
| { |
| "epoch": 3.6361362324417934, |
| "grad_norm": 0.12749027604201754, |
| "learning_rate": 6.117166710665809e-06, |
| "loss": 0.0842, |
| "mean_token_accuracy": 0.9707375228404999, |
| "step": 9450 |
| }, |
| { |
| "epoch": 3.638060419472773, |
| "grad_norm": 0.12517043425750166, |
| "learning_rate": 6.105436489197804e-06, |
| "loss": 0.0858, |
| "mean_token_accuracy": 0.9704214990139007, |
| "step": 9455 |
| }, |
| { |
| "epoch": 3.639984606503752, |
| "grad_norm": 0.13340903779008972, |
| "learning_rate": 6.093766625624931e-06, |
| "loss": 0.0826, |
| "mean_token_accuracy": 0.9715105950832367, |
| "step": 9460 |
| }, |
| { |
| "epoch": 3.6419087935347316, |
| "grad_norm": 0.1264258374381348, |
| "learning_rate": 6.082157152869959e-06, |
| "loss": 0.083, |
| "mean_token_accuracy": 0.97146155834198, |
| "step": 9465 |
| }, |
| { |
| "epoch": 3.643832980565711, |
| "grad_norm": 0.12776191850725374, |
| "learning_rate": 6.070608103685293e-06, |
| "loss": 0.0847, |
| "mean_token_accuracy": 0.9705254197120666, |
| "step": 9470 |
| }, |
| { |
| "epoch": 3.6457571675966904, |
| "grad_norm": 0.12786128161907748, |
| "learning_rate": 6.059119510652865e-06, |
| "loss": 0.0851, |
| "mean_token_accuracy": 0.9703995406627655, |
| "step": 9475 |
| }, |
| { |
| "epoch": 3.64768135462767, |
| "grad_norm": 0.1354404763943873, |
| "learning_rate": 6.047691406184063e-06, |
| "loss": 0.0818, |
| "mean_token_accuracy": 0.9714069902896881, |
| "step": 9480 |
| }, |
| { |
| "epoch": 3.6496055416586493, |
| "grad_norm": 0.12580053609381317, |
| "learning_rate": 6.036323822519609e-06, |
| "loss": 0.0853, |
| "mean_token_accuracy": 0.9704692304134369, |
| "step": 9485 |
| }, |
| { |
| "epoch": 3.6515297286896287, |
| "grad_norm": 0.13263506232122327, |
| "learning_rate": 6.0250167917294906e-06, |
| "loss": 0.0859, |
| "mean_token_accuracy": 0.9703659892082215, |
| "step": 9490 |
| }, |
| { |
| "epoch": 3.653453915720608, |
| "grad_norm": 0.12324365526429366, |
| "learning_rate": 6.013770345712869e-06, |
| "loss": 0.0839, |
| "mean_token_accuracy": 0.9709503173828125, |
| "step": 9495 |
| }, |
| { |
| "epoch": 3.6553781027515875, |
| "grad_norm": 0.12844079414476534, |
| "learning_rate": 6.0025845161979856e-06, |
| "loss": 0.0846, |
| "mean_token_accuracy": 0.9706700563430786, |
| "step": 9500 |
| }, |
| { |
| "epoch": 3.657302289782567, |
| "grad_norm": 0.12068984137989774, |
| "learning_rate": 5.991459334742063e-06, |
| "loss": 0.0824, |
| "mean_token_accuracy": 0.9714205920696258, |
| "step": 9505 |
| }, |
| { |
| "epoch": 3.6592264768135463, |
| "grad_norm": 0.12813199601373393, |
| "learning_rate": 5.980394832731235e-06, |
| "loss": 0.0845, |
| "mean_token_accuracy": 0.9706978559494018, |
| "step": 9510 |
| }, |
| { |
| "epoch": 3.6611506638445257, |
| "grad_norm": 0.13301041854928702, |
| "learning_rate": 5.96939104138044e-06, |
| "loss": 0.0848, |
| "mean_token_accuracy": 0.9705958902835846, |
| "step": 9515 |
| }, |
| { |
| "epoch": 3.663074850875505, |
| "grad_norm": 0.12989984590258896, |
| "learning_rate": 5.958447991733349e-06, |
| "loss": 0.0849, |
| "mean_token_accuracy": 0.9704363226890564, |
| "step": 9520 |
| }, |
| { |
| "epoch": 3.6649990379064845, |
| "grad_norm": 0.13131257145653005, |
| "learning_rate": 5.947565714662257e-06, |
| "loss": 0.084, |
| "mean_token_accuracy": 0.9708589434623718, |
| "step": 9525 |
| }, |
| { |
| "epoch": 3.666923224937464, |
| "grad_norm": 0.12632135625413002, |
| "learning_rate": 5.936744240868021e-06, |
| "loss": 0.0846, |
| "mean_token_accuracy": 0.9707492649555206, |
| "step": 9530 |
| }, |
| { |
| "epoch": 3.6688474119684433, |
| "grad_norm": 0.12895466794249552, |
| "learning_rate": 5.9259836008799574e-06, |
| "loss": 0.0849, |
| "mean_token_accuracy": 0.9705276906490325, |
| "step": 9535 |
| }, |
| { |
| "epoch": 3.6707715989994227, |
| "grad_norm": 0.13088433706746005, |
| "learning_rate": 5.91528382505576e-06, |
| "loss": 0.0849, |
| "mean_token_accuracy": 0.9706717550754547, |
| "step": 9540 |
| }, |
| { |
| "epoch": 3.672695786030402, |
| "grad_norm": 0.12617769608353183, |
| "learning_rate": 5.9046449435814105e-06, |
| "loss": 0.0837, |
| "mean_token_accuracy": 0.9710495352745057, |
| "step": 9545 |
| }, |
| { |
| "epoch": 3.6746199730613816, |
| "grad_norm": 0.12695856992703744, |
| "learning_rate": 5.894066986471097e-06, |
| "loss": 0.0815, |
| "mean_token_accuracy": 0.9717949390411377, |
| "step": 9550 |
| }, |
| { |
| "epoch": 3.676544160092361, |
| "grad_norm": 0.1446820744341158, |
| "learning_rate": 5.883549983567131e-06, |
| "loss": 0.0849, |
| "mean_token_accuracy": 0.9703502655029297, |
| "step": 9555 |
| }, |
| { |
| "epoch": 3.6784683471233404, |
| "grad_norm": 0.13044982400324565, |
| "learning_rate": 5.8730939645398635e-06, |
| "loss": 0.0861, |
| "mean_token_accuracy": 0.9701001346111298, |
| "step": 9560 |
| }, |
| { |
| "epoch": 3.6803925341543198, |
| "grad_norm": 0.12958478222913503, |
| "learning_rate": 5.862698958887599e-06, |
| "loss": 0.0846, |
| "mean_token_accuracy": 0.9706281781196594, |
| "step": 9565 |
| }, |
| { |
| "epoch": 3.682316721185299, |
| "grad_norm": 0.12732133750741922, |
| "learning_rate": 5.852364995936504e-06, |
| "loss": 0.0845, |
| "mean_token_accuracy": 0.9705536603927613, |
| "step": 9570 |
| }, |
| { |
| "epoch": 3.6842409082162786, |
| "grad_norm": 0.13493229587392408, |
| "learning_rate": 5.842092104840541e-06, |
| "loss": 0.0841, |
| "mean_token_accuracy": 0.970888888835907, |
| "step": 9575 |
| }, |
| { |
| "epoch": 3.686165095247258, |
| "grad_norm": 0.1280209800426124, |
| "learning_rate": 5.831880314581377e-06, |
| "loss": 0.0834, |
| "mean_token_accuracy": 0.9710353314876556, |
| "step": 9580 |
| }, |
| { |
| "epoch": 3.6880892822782374, |
| "grad_norm": 0.13102973089755615, |
| "learning_rate": 5.821729653968301e-06, |
| "loss": 0.0828, |
| "mean_token_accuracy": 0.9712984323501587, |
| "step": 9585 |
| }, |
| { |
| "epoch": 3.690013469309217, |
| "grad_norm": 0.1324189617244035, |
| "learning_rate": 5.811640151638141e-06, |
| "loss": 0.0832, |
| "mean_token_accuracy": 0.9712521076202393, |
| "step": 9590 |
| }, |
| { |
| "epoch": 3.691937656340196, |
| "grad_norm": 0.12926347222955453, |
| "learning_rate": 5.8016118360551925e-06, |
| "loss": 0.0839, |
| "mean_token_accuracy": 0.970823872089386, |
| "step": 9595 |
| }, |
| { |
| "epoch": 3.6938618433711756, |
| "grad_norm": 0.1266507464981616, |
| "learning_rate": 5.7916447355111335e-06, |
| "loss": 0.0833, |
| "mean_token_accuracy": 0.9709834456443787, |
| "step": 9600 |
| }, |
| { |
| "epoch": 3.695786030402155, |
| "grad_norm": 0.12877563320490412, |
| "learning_rate": 5.781738878124935e-06, |
| "loss": 0.0827, |
| "mean_token_accuracy": 0.9714086413383484, |
| "step": 9605 |
| }, |
| { |
| "epoch": 3.6977102174331344, |
| "grad_norm": 0.12918938997673993, |
| "learning_rate": 5.771894291842795e-06, |
| "loss": 0.0849, |
| "mean_token_accuracy": 0.9705465793609619, |
| "step": 9610 |
| }, |
| { |
| "epoch": 3.699634404464114, |
| "grad_norm": 0.12659622004044696, |
| "learning_rate": 5.762111004438051e-06, |
| "loss": 0.0824, |
| "mean_token_accuracy": 0.9713815689086914, |
| "step": 9615 |
| }, |
| { |
| "epoch": 3.7015585914950933, |
| "grad_norm": 0.1340281643074414, |
| "learning_rate": 5.752389043511115e-06, |
| "loss": 0.0852, |
| "mean_token_accuracy": 0.9705156087875366, |
| "step": 9620 |
| }, |
| { |
| "epoch": 3.7034827785260727, |
| "grad_norm": 0.13195473895165047, |
| "learning_rate": 5.74272843648938e-06, |
| "loss": 0.0843, |
| "mean_token_accuracy": 0.9707110404968262, |
| "step": 9625 |
| }, |
| { |
| "epoch": 3.705406965557052, |
| "grad_norm": 0.13339870846954704, |
| "learning_rate": 5.733129210627147e-06, |
| "loss": 0.0829, |
| "mean_token_accuracy": 0.971345865726471, |
| "step": 9630 |
| }, |
| { |
| "epoch": 3.7073311525880315, |
| "grad_norm": 0.1303095363345547, |
| "learning_rate": 5.723591393005545e-06, |
| "loss": 0.0836, |
| "mean_token_accuracy": 0.9709326565265656, |
| "step": 9635 |
| }, |
| { |
| "epoch": 3.709255339619011, |
| "grad_norm": 0.13122225618803973, |
| "learning_rate": 5.714115010532475e-06, |
| "loss": 0.0829, |
| "mean_token_accuracy": 0.9712490439414978, |
| "step": 9640 |
| }, |
| { |
| "epoch": 3.7111795266499903, |
| "grad_norm": 0.13520625638718706, |
| "learning_rate": 5.704700089942502e-06, |
| "loss": 0.0847, |
| "mean_token_accuracy": 0.9707024335861206, |
| "step": 9645 |
| }, |
| { |
| "epoch": 3.7131037136809697, |
| "grad_norm": 0.1234826292184828, |
| "learning_rate": 5.695346657796808e-06, |
| "loss": 0.0837, |
| "mean_token_accuracy": 0.9709530830383301, |
| "step": 9650 |
| }, |
| { |
| "epoch": 3.715027900711949, |
| "grad_norm": 0.12783892191660337, |
| "learning_rate": 5.686054740483098e-06, |
| "loss": 0.0856, |
| "mean_token_accuracy": 0.970134836435318, |
| "step": 9655 |
| }, |
| { |
| "epoch": 3.7169520877429285, |
| "grad_norm": 0.13140494485561072, |
| "learning_rate": 5.6768243642155355e-06, |
| "loss": 0.0846, |
| "mean_token_accuracy": 0.9708115994930268, |
| "step": 9660 |
| }, |
| { |
| "epoch": 3.718876274773908, |
| "grad_norm": 0.1279226683859733, |
| "learning_rate": 5.667655555034663e-06, |
| "loss": 0.0843, |
| "mean_token_accuracy": 0.9708460986614227, |
| "step": 9665 |
| }, |
| { |
| "epoch": 3.7208004618048873, |
| "grad_norm": 0.1280860588688012, |
| "learning_rate": 5.6585483388073375e-06, |
| "loss": 0.0847, |
| "mean_token_accuracy": 0.9706709027290344, |
| "step": 9670 |
| }, |
| { |
| "epoch": 3.7227246488358667, |
| "grad_norm": 0.12544249073631844, |
| "learning_rate": 5.649502741226642e-06, |
| "loss": 0.0835, |
| "mean_token_accuracy": 0.9709541261196136, |
| "step": 9675 |
| }, |
| { |
| "epoch": 3.724648835866846, |
| "grad_norm": 0.13036270589362559, |
| "learning_rate": 5.640518787811829e-06, |
| "loss": 0.083, |
| "mean_token_accuracy": 0.9713950634002686, |
| "step": 9680 |
| }, |
| { |
| "epoch": 3.7265730228978255, |
| "grad_norm": 0.12191015362164194, |
| "learning_rate": 5.631596503908238e-06, |
| "loss": 0.0837, |
| "mean_token_accuracy": 0.9709760129451752, |
| "step": 9685 |
| }, |
| { |
| "epoch": 3.728497209928805, |
| "grad_norm": 0.13167359609413348, |
| "learning_rate": 5.622735914687238e-06, |
| "loss": 0.0842, |
| "mean_token_accuracy": 0.9707652926445007, |
| "step": 9690 |
| }, |
| { |
| "epoch": 3.7304213969597844, |
| "grad_norm": 0.14381840216378108, |
| "learning_rate": 5.613937045146129e-06, |
| "loss": 0.0849, |
| "mean_token_accuracy": 0.9705403208732605, |
| "step": 9695 |
| }, |
| { |
| "epoch": 3.7323455839907638, |
| "grad_norm": 0.128441979982536, |
| "learning_rate": 5.605199920108101e-06, |
| "loss": 0.0828, |
| "mean_token_accuracy": 0.9714809536933899, |
| "step": 9700 |
| }, |
| { |
| "epoch": 3.734269771021743, |
| "grad_norm": 0.1293392279553799, |
| "learning_rate": 5.596524564222146e-06, |
| "loss": 0.0839, |
| "mean_token_accuracy": 0.9709720969200134, |
| "step": 9705 |
| }, |
| { |
| "epoch": 3.7361939580527226, |
| "grad_norm": 0.12555112644927274, |
| "learning_rate": 5.587911001963e-06, |
| "loss": 0.0832, |
| "mean_token_accuracy": 0.9711412131786347, |
| "step": 9710 |
| }, |
| { |
| "epoch": 3.738118145083702, |
| "grad_norm": 0.12577695886277074, |
| "learning_rate": 5.579359257631066e-06, |
| "loss": 0.0829, |
| "mean_token_accuracy": 0.9712973713874817, |
| "step": 9715 |
| }, |
| { |
| "epoch": 3.7400423321146814, |
| "grad_norm": 0.1257272476723164, |
| "learning_rate": 5.570869355352341e-06, |
| "loss": 0.0828, |
| "mean_token_accuracy": 0.971248596906662, |
| "step": 9720 |
| }, |
| { |
| "epoch": 3.741966519145661, |
| "grad_norm": 0.1273361782268896, |
| "learning_rate": 5.562441319078364e-06, |
| "loss": 0.0823, |
| "mean_token_accuracy": 0.9714790344238281, |
| "step": 9725 |
| }, |
| { |
| "epoch": 3.74389070617664, |
| "grad_norm": 0.1315909980513443, |
| "learning_rate": 5.554075172586131e-06, |
| "loss": 0.0832, |
| "mean_token_accuracy": 0.9712474584579468, |
| "step": 9730 |
| }, |
| { |
| "epoch": 3.7458148932076196, |
| "grad_norm": 0.1338115526857461, |
| "learning_rate": 5.545770939478045e-06, |
| "loss": 0.0859, |
| "mean_token_accuracy": 0.9701749801635742, |
| "step": 9735 |
| }, |
| { |
| "epoch": 3.747739080238599, |
| "grad_norm": 0.1254777546079142, |
| "learning_rate": 5.537528643181829e-06, |
| "loss": 0.0841, |
| "mean_token_accuracy": 0.9708792209625244, |
| "step": 9740 |
| }, |
| { |
| "epoch": 3.7496632672695784, |
| "grad_norm": 0.13495019415206566, |
| "learning_rate": 5.5293483069504805e-06, |
| "loss": 0.0834, |
| "mean_token_accuracy": 0.9711350321769714, |
| "step": 9745 |
| }, |
| { |
| "epoch": 3.751587454300558, |
| "grad_norm": 0.1325753258034756, |
| "learning_rate": 5.5212299538621935e-06, |
| "loss": 0.0824, |
| "mean_token_accuracy": 0.9715189516544342, |
| "step": 9750 |
| }, |
| { |
| "epoch": 3.7535116413315377, |
| "grad_norm": 0.12845638484747274, |
| "learning_rate": 5.513173606820293e-06, |
| "loss": 0.0837, |
| "mean_token_accuracy": 0.9710196614265442, |
| "step": 9755 |
| }, |
| { |
| "epoch": 3.7554358283625167, |
| "grad_norm": 0.13421034634217963, |
| "learning_rate": 5.505179288553175e-06, |
| "loss": 0.0859, |
| "mean_token_accuracy": 0.9702002882957459, |
| "step": 9760 |
| }, |
| { |
| "epoch": 3.7573600153934965, |
| "grad_norm": 0.12980935458034462, |
| "learning_rate": 5.497247021614248e-06, |
| "loss": 0.0816, |
| "mean_token_accuracy": 0.9716308295726777, |
| "step": 9765 |
| }, |
| { |
| "epoch": 3.7592842024244755, |
| "grad_norm": 0.1292440837300365, |
| "learning_rate": 5.489376828381857e-06, |
| "loss": 0.0826, |
| "mean_token_accuracy": 0.9712635397911071, |
| "step": 9770 |
| }, |
| { |
| "epoch": 3.7612083894554553, |
| "grad_norm": 0.13267514667464766, |
| "learning_rate": 5.481568731059224e-06, |
| "loss": 0.0851, |
| "mean_token_accuracy": 0.970600175857544, |
| "step": 9775 |
| }, |
| { |
| "epoch": 3.7631325764864343, |
| "grad_norm": 0.1328581360184632, |
| "learning_rate": 5.473822751674394e-06, |
| "loss": 0.0829, |
| "mean_token_accuracy": 0.9713373780250549, |
| "step": 9780 |
| }, |
| { |
| "epoch": 3.765056763517414, |
| "grad_norm": 0.12474286120297946, |
| "learning_rate": 5.466138912080157e-06, |
| "loss": 0.0846, |
| "mean_token_accuracy": 0.970719039440155, |
| "step": 9785 |
| }, |
| { |
| "epoch": 3.766980950548393, |
| "grad_norm": 0.12321370554616629, |
| "learning_rate": 5.458517233954e-06, |
| "loss": 0.0808, |
| "mean_token_accuracy": 0.9720763206481934, |
| "step": 9790 |
| }, |
| { |
| "epoch": 3.768905137579373, |
| "grad_norm": 0.12586128975028832, |
| "learning_rate": 5.450957738798047e-06, |
| "loss": 0.0828, |
| "mean_token_accuracy": 0.9712636828422546, |
| "step": 9795 |
| }, |
| { |
| "epoch": 3.770829324610352, |
| "grad_norm": 0.13099616747028167, |
| "learning_rate": 5.443460447938987e-06, |
| "loss": 0.0826, |
| "mean_token_accuracy": 0.9713502109050751, |
| "step": 9800 |
| }, |
| { |
| "epoch": 3.7727535116413318, |
| "grad_norm": 0.13315282807199783, |
| "learning_rate": 5.436025382528017e-06, |
| "loss": 0.0847, |
| "mean_token_accuracy": 0.9707408905029297, |
| "step": 9805 |
| }, |
| { |
| "epoch": 3.7746776986723107, |
| "grad_norm": 0.13012296461998368, |
| "learning_rate": 5.42865256354079e-06, |
| "loss": 0.086, |
| "mean_token_accuracy": 0.9702576816082, |
| "step": 9810 |
| }, |
| { |
| "epoch": 3.7766018857032906, |
| "grad_norm": 0.1301751974531095, |
| "learning_rate": 5.421342011777347e-06, |
| "loss": 0.0834, |
| "mean_token_accuracy": 0.9711966514587402, |
| "step": 9815 |
| }, |
| { |
| "epoch": 3.7785260727342695, |
| "grad_norm": 0.130077690349645, |
| "learning_rate": 5.414093747862066e-06, |
| "loss": 0.0831, |
| "mean_token_accuracy": 0.9712831854820252, |
| "step": 9820 |
| }, |
| { |
| "epoch": 3.7804502597652494, |
| "grad_norm": 0.1267481640529036, |
| "learning_rate": 5.406907792243597e-06, |
| "loss": 0.0845, |
| "mean_token_accuracy": 0.9705725729465484, |
| "step": 9825 |
| }, |
| { |
| "epoch": 3.7823744467962284, |
| "grad_norm": 0.1253012354373941, |
| "learning_rate": 5.3997841651948045e-06, |
| "loss": 0.0829, |
| "mean_token_accuracy": 0.9713540017604828, |
| "step": 9830 |
| }, |
| { |
| "epoch": 3.784298633827208, |
| "grad_norm": 0.132347676217188, |
| "learning_rate": 5.392722886812721e-06, |
| "loss": 0.0844, |
| "mean_token_accuracy": 0.9705925464630127, |
| "step": 9835 |
| }, |
| { |
| "epoch": 3.786222820858187, |
| "grad_norm": 0.1296479996379853, |
| "learning_rate": 5.3857239770184755e-06, |
| "loss": 0.0817, |
| "mean_token_accuracy": 0.9715558767318726, |
| "step": 9840 |
| }, |
| { |
| "epoch": 3.788147007889167, |
| "grad_norm": 0.1260459335050519, |
| "learning_rate": 5.378787455557247e-06, |
| "loss": 0.0835, |
| "mean_token_accuracy": 0.9711081922054291, |
| "step": 9845 |
| }, |
| { |
| "epoch": 3.790071194920146, |
| "grad_norm": 0.12971339976913693, |
| "learning_rate": 5.3719133419982e-06, |
| "loss": 0.0842, |
| "mean_token_accuracy": 0.9707649409770965, |
| "step": 9850 |
| }, |
| { |
| "epoch": 3.791995381951126, |
| "grad_norm": 0.13658938924903805, |
| "learning_rate": 5.365101655734444e-06, |
| "loss": 0.0838, |
| "mean_token_accuracy": 0.970706433057785, |
| "step": 9855 |
| }, |
| { |
| "epoch": 3.793919568982105, |
| "grad_norm": 0.12935536651610716, |
| "learning_rate": 5.358352415982966e-06, |
| "loss": 0.0839, |
| "mean_token_accuracy": 0.9708448588848114, |
| "step": 9860 |
| }, |
| { |
| "epoch": 3.7958437560130847, |
| "grad_norm": 0.12471233681763558, |
| "learning_rate": 5.351665641784581e-06, |
| "loss": 0.0833, |
| "mean_token_accuracy": 0.9710375130176544, |
| "step": 9865 |
| }, |
| { |
| "epoch": 3.7977679430440636, |
| "grad_norm": 0.1254634815523459, |
| "learning_rate": 5.345041352003874e-06, |
| "loss": 0.0844, |
| "mean_token_accuracy": 0.9707024693489075, |
| "step": 9870 |
| }, |
| { |
| "epoch": 3.7996921300750435, |
| "grad_norm": 0.1331609128556762, |
| "learning_rate": 5.338479565329152e-06, |
| "loss": 0.0836, |
| "mean_token_accuracy": 0.9710483849048615, |
| "step": 9875 |
| }, |
| { |
| "epoch": 3.8016163171060224, |
| "grad_norm": 0.12741349365771215, |
| "learning_rate": 5.331980300272393e-06, |
| "loss": 0.0831, |
| "mean_token_accuracy": 0.9710205376148224, |
| "step": 9880 |
| }, |
| { |
| "epoch": 3.8035405041370023, |
| "grad_norm": 0.13611099575339342, |
| "learning_rate": 5.3255435751691875e-06, |
| "loss": 0.0831, |
| "mean_token_accuracy": 0.9711984157562256, |
| "step": 9885 |
| }, |
| { |
| "epoch": 3.8054646911679813, |
| "grad_norm": 0.12980319201233825, |
| "learning_rate": 5.3191694081786865e-06, |
| "loss": 0.0841, |
| "mean_token_accuracy": 0.9707724273204803, |
| "step": 9890 |
| }, |
| { |
| "epoch": 3.807388878198961, |
| "grad_norm": 0.1426123333389128, |
| "learning_rate": 5.31285781728356e-06, |
| "loss": 0.0821, |
| "mean_token_accuracy": 0.9714478254318237, |
| "step": 9895 |
| }, |
| { |
| "epoch": 3.8093130652299405, |
| "grad_norm": 0.1282591232708518, |
| "learning_rate": 5.306608820289936e-06, |
| "loss": 0.0829, |
| "mean_token_accuracy": 0.9712033033370971, |
| "step": 9900 |
| }, |
| { |
| "epoch": 3.81123725226092, |
| "grad_norm": 0.12957055752538663, |
| "learning_rate": 5.300422434827353e-06, |
| "loss": 0.0837, |
| "mean_token_accuracy": 0.9710727035999298, |
| "step": 9905 |
| }, |
| { |
| "epoch": 3.8131614392918993, |
| "grad_norm": 0.1291094611425077, |
| "learning_rate": 5.2942986783487115e-06, |
| "loss": 0.0843, |
| "mean_token_accuracy": 0.9708347141742706, |
| "step": 9910 |
| }, |
| { |
| "epoch": 3.8150856263228787, |
| "grad_norm": 0.126529513057066, |
| "learning_rate": 5.288237568130227e-06, |
| "loss": 0.0813, |
| "mean_token_accuracy": 0.9718260645866394, |
| "step": 9915 |
| }, |
| { |
| "epoch": 3.817009813353858, |
| "grad_norm": 0.13568601561666901, |
| "learning_rate": 5.282239121271376e-06, |
| "loss": 0.0833, |
| "mean_token_accuracy": 0.9709510028362274, |
| "step": 9920 |
| }, |
| { |
| "epoch": 3.8189340003848375, |
| "grad_norm": 0.132814773260242, |
| "learning_rate": 5.2763033546948515e-06, |
| "loss": 0.0838, |
| "mean_token_accuracy": 0.9708267450332642, |
| "step": 9925 |
| }, |
| { |
| "epoch": 3.820858187415817, |
| "grad_norm": 0.13132014977440104, |
| "learning_rate": 5.270430285146514e-06, |
| "loss": 0.085, |
| "mean_token_accuracy": 0.970295125246048, |
| "step": 9930 |
| }, |
| { |
| "epoch": 3.8227823744467964, |
| "grad_norm": 0.133290690508834, |
| "learning_rate": 5.264619929195344e-06, |
| "loss": 0.0834, |
| "mean_token_accuracy": 0.971205323934555, |
| "step": 9935 |
| }, |
| { |
| "epoch": 3.8247065614777758, |
| "grad_norm": 0.1314362482054655, |
| "learning_rate": 5.258872303233397e-06, |
| "loss": 0.0843, |
| "mean_token_accuracy": 0.9707643210887908, |
| "step": 9940 |
| }, |
| { |
| "epoch": 3.826630748508755, |
| "grad_norm": 0.12533084082719861, |
| "learning_rate": 5.253187423475754e-06, |
| "loss": 0.0838, |
| "mean_token_accuracy": 0.9710265934467316, |
| "step": 9945 |
| }, |
| { |
| "epoch": 3.8285549355397346, |
| "grad_norm": 0.12911160533507593, |
| "learning_rate": 5.247565305960483e-06, |
| "loss": 0.0839, |
| "mean_token_accuracy": 0.9709489643573761, |
| "step": 9950 |
| }, |
| { |
| "epoch": 3.830479122570714, |
| "grad_norm": 0.12233415417364961, |
| "learning_rate": 5.242005966548577e-06, |
| "loss": 0.0832, |
| "mean_token_accuracy": 0.9711369574069977, |
| "step": 9955 |
| }, |
| { |
| "epoch": 3.8324033096016934, |
| "grad_norm": 0.13001856988459168, |
| "learning_rate": 5.236509420923935e-06, |
| "loss": 0.0842, |
| "mean_token_accuracy": 0.9707648396492005, |
| "step": 9960 |
| }, |
| { |
| "epoch": 3.834327496632673, |
| "grad_norm": 0.1289324823675169, |
| "learning_rate": 5.23107568459329e-06, |
| "loss": 0.0864, |
| "mean_token_accuracy": 0.9699880659580231, |
| "step": 9965 |
| }, |
| { |
| "epoch": 3.836251683663652, |
| "grad_norm": 0.12593253880586072, |
| "learning_rate": 5.225704772886192e-06, |
| "loss": 0.0826, |
| "mean_token_accuracy": 0.9715697944164277, |
| "step": 9970 |
| }, |
| { |
| "epoch": 3.8381758706946316, |
| "grad_norm": 0.13267526238451943, |
| "learning_rate": 5.220396700954941e-06, |
| "loss": 0.0837, |
| "mean_token_accuracy": 0.9709756135940552, |
| "step": 9975 |
| }, |
| { |
| "epoch": 3.840100057725611, |
| "grad_norm": 0.12821941199142406, |
| "learning_rate": 5.215151483774559e-06, |
| "loss": 0.0854, |
| "mean_token_accuracy": 0.9702714800834655, |
| "step": 9980 |
| }, |
| { |
| "epoch": 3.8420242447565904, |
| "grad_norm": 0.13010407450206193, |
| "learning_rate": 5.209969136142742e-06, |
| "loss": 0.0841, |
| "mean_token_accuracy": 0.9708271741867065, |
| "step": 9985 |
| }, |
| { |
| "epoch": 3.84394843178757, |
| "grad_norm": 0.12906298041128167, |
| "learning_rate": 5.204849672679825e-06, |
| "loss": 0.0834, |
| "mean_token_accuracy": 0.9711850464344025, |
| "step": 9990 |
| }, |
| { |
| "epoch": 3.8458726188185492, |
| "grad_norm": 0.1357119125248045, |
| "learning_rate": 5.199793107828726e-06, |
| "loss": 0.0822, |
| "mean_token_accuracy": 0.9714503288269043, |
| "step": 9995 |
| }, |
| { |
| "epoch": 3.8477968058495287, |
| "grad_norm": 0.12898214151698506, |
| "learning_rate": 5.194799455854923e-06, |
| "loss": 0.0843, |
| "mean_token_accuracy": 0.9707751035690307, |
| "step": 10000 |
| }, |
| { |
| "epoch": 3.849720992880508, |
| "grad_norm": 0.12936479325355857, |
| "learning_rate": 5.189868730846402e-06, |
| "loss": 0.0835, |
| "mean_token_accuracy": 0.971122008562088, |
| "step": 10005 |
| }, |
| { |
| "epoch": 3.8516451799114875, |
| "grad_norm": 0.12638559348310224, |
| "learning_rate": 5.185000946713621e-06, |
| "loss": 0.0821, |
| "mean_token_accuracy": 0.9716223776340485, |
| "step": 10010 |
| }, |
| { |
| "epoch": 3.853569366942467, |
| "grad_norm": 0.1274584727626304, |
| "learning_rate": 5.180196117189471e-06, |
| "loss": 0.083, |
| "mean_token_accuracy": 0.9712328910827637, |
| "step": 10015 |
| }, |
| { |
| "epoch": 3.8554935539734463, |
| "grad_norm": 0.12934902608342166, |
| "learning_rate": 5.175454255829237e-06, |
| "loss": 0.0822, |
| "mean_token_accuracy": 0.9715910375118255, |
| "step": 10020 |
| }, |
| { |
| "epoch": 3.8574177410044257, |
| "grad_norm": 0.12649848202572228, |
| "learning_rate": 5.170775376010558e-06, |
| "loss": 0.0825, |
| "mean_token_accuracy": 0.9712820768356323, |
| "step": 10025 |
| }, |
| { |
| "epoch": 3.859341928035405, |
| "grad_norm": 0.12602422849900147, |
| "learning_rate": 5.166159490933391e-06, |
| "loss": 0.0824, |
| "mean_token_accuracy": 0.9714111924171448, |
| "step": 10030 |
| }, |
| { |
| "epoch": 3.8612661150663845, |
| "grad_norm": 0.12771046568125663, |
| "learning_rate": 5.161606613619979e-06, |
| "loss": 0.0853, |
| "mean_token_accuracy": 0.9702588737010955, |
| "step": 10035 |
| }, |
| { |
| "epoch": 3.863190302097364, |
| "grad_norm": 0.12819251323149022, |
| "learning_rate": 5.157116756914799e-06, |
| "loss": 0.0855, |
| "mean_token_accuracy": 0.9703081130981446, |
| "step": 10040 |
| }, |
| { |
| "epoch": 3.8651144891283433, |
| "grad_norm": 0.12439719751765296, |
| "learning_rate": 5.152689933484543e-06, |
| "loss": 0.0829, |
| "mean_token_accuracy": 0.9711783528327942, |
| "step": 10045 |
| }, |
| { |
| "epoch": 3.8670386761593227, |
| "grad_norm": 0.1314106629943747, |
| "learning_rate": 5.148326155818074e-06, |
| "loss": 0.0834, |
| "mean_token_accuracy": 0.9711025774478912, |
| "step": 10050 |
| }, |
| { |
| "epoch": 3.868962863190302, |
| "grad_norm": 0.1250518592899649, |
| "learning_rate": 5.144025436226387e-06, |
| "loss": 0.0827, |
| "mean_token_accuracy": 0.9713931560516358, |
| "step": 10055 |
| }, |
| { |
| "epoch": 3.8708870502212815, |
| "grad_norm": 0.13078642303132598, |
| "learning_rate": 5.139787786842584e-06, |
| "loss": 0.0864, |
| "mean_token_accuracy": 0.9698314964771271, |
| "step": 10060 |
| }, |
| { |
| "epoch": 3.872811237252261, |
| "grad_norm": 0.12923350352929674, |
| "learning_rate": 5.135613219621834e-06, |
| "loss": 0.0836, |
| "mean_token_accuracy": 0.9711137771606445, |
| "step": 10065 |
| }, |
| { |
| "epoch": 3.8747354242832404, |
| "grad_norm": 0.12563235433177386, |
| "learning_rate": 5.131501746341337e-06, |
| "loss": 0.0826, |
| "mean_token_accuracy": 0.971451610326767, |
| "step": 10070 |
| }, |
| { |
| "epoch": 3.8766596113142198, |
| "grad_norm": 0.13197299075249083, |
| "learning_rate": 5.127453378600299e-06, |
| "loss": 0.0851, |
| "mean_token_accuracy": 0.9704861402511596, |
| "step": 10075 |
| }, |
| { |
| "epoch": 3.878583798345199, |
| "grad_norm": 0.12846446070815468, |
| "learning_rate": 5.123468127819885e-06, |
| "loss": 0.0839, |
| "mean_token_accuracy": 0.970909696817398, |
| "step": 10080 |
| }, |
| { |
| "epoch": 3.8805079853761786, |
| "grad_norm": 0.13823440456106123, |
| "learning_rate": 5.1195460052432016e-06, |
| "loss": 0.0837, |
| "mean_token_accuracy": 0.970853990316391, |
| "step": 10085 |
| }, |
| { |
| "epoch": 3.882432172407158, |
| "grad_norm": 0.12688446312090473, |
| "learning_rate": 5.1156870219352635e-06, |
| "loss": 0.0861, |
| "mean_token_accuracy": 0.9702435910701752, |
| "step": 10090 |
| }, |
| { |
| "epoch": 3.8843563594381374, |
| "grad_norm": 0.12591304427956893, |
| "learning_rate": 5.111891188782951e-06, |
| "loss": 0.0821, |
| "mean_token_accuracy": 0.9713928639888764, |
| "step": 10095 |
| }, |
| { |
| "epoch": 3.886280546469117, |
| "grad_norm": 0.13381926417216353, |
| "learning_rate": 5.108158516494989e-06, |
| "loss": 0.0844, |
| "mean_token_accuracy": 0.970725280046463, |
| "step": 10100 |
| }, |
| { |
| "epoch": 3.888204733500096, |
| "grad_norm": 0.13227264592543717, |
| "learning_rate": 5.104489015601915e-06, |
| "loss": 0.0819, |
| "mean_token_accuracy": 0.9715412676334381, |
| "step": 10105 |
| }, |
| { |
| "epoch": 3.8901289205310756, |
| "grad_norm": 0.1332023257435153, |
| "learning_rate": 5.100882696456047e-06, |
| "loss": 0.0835, |
| "mean_token_accuracy": 0.9710934042930603, |
| "step": 10110 |
| }, |
| { |
| "epoch": 3.892053107562055, |
| "grad_norm": 0.1328629268332128, |
| "learning_rate": 5.09733956923146e-06, |
| "loss": 0.0832, |
| "mean_token_accuracy": 0.9711413741111755, |
| "step": 10115 |
| }, |
| { |
| "epoch": 3.8939772945930344, |
| "grad_norm": 0.12670490122460273, |
| "learning_rate": 5.093859643923948e-06, |
| "loss": 0.0824, |
| "mean_token_accuracy": 0.9714822411537171, |
| "step": 10120 |
| }, |
| { |
| "epoch": 3.895901481624014, |
| "grad_norm": 0.12499373344855456, |
| "learning_rate": 5.090442930351005e-06, |
| "loss": 0.0844, |
| "mean_token_accuracy": 0.9706906914710999, |
| "step": 10125 |
| }, |
| { |
| "epoch": 3.8978256686549932, |
| "grad_norm": 0.1245399094578281, |
| "learning_rate": 5.087089438151788e-06, |
| "loss": 0.0831, |
| "mean_token_accuracy": 0.9712965786457062, |
| "step": 10130 |
| }, |
| { |
| "epoch": 3.8997498556859727, |
| "grad_norm": 0.1301572958641481, |
| "learning_rate": 5.083799176787104e-06, |
| "loss": 0.0844, |
| "mean_token_accuracy": 0.9707218647003174, |
| "step": 10135 |
| }, |
| { |
| "epoch": 3.901674042716952, |
| "grad_norm": 0.13022481008797088, |
| "learning_rate": 5.080572155539369e-06, |
| "loss": 0.0862, |
| "mean_token_accuracy": 0.9702335953712463, |
| "step": 10140 |
| }, |
| { |
| "epoch": 3.9035982297479315, |
| "grad_norm": 0.12837698874608652, |
| "learning_rate": 5.077408383512584e-06, |
| "loss": 0.0805, |
| "mean_token_accuracy": 0.9720855534076691, |
| "step": 10145 |
| }, |
| { |
| "epoch": 3.905522416778911, |
| "grad_norm": 0.12753857050900713, |
| "learning_rate": 5.074307869632321e-06, |
| "loss": 0.0834, |
| "mean_token_accuracy": 0.9710967183113098, |
| "step": 10150 |
| }, |
| { |
| "epoch": 3.9074466038098903, |
| "grad_norm": 0.12859493835044503, |
| "learning_rate": 5.0712706226456835e-06, |
| "loss": 0.0824, |
| "mean_token_accuracy": 0.971541553735733, |
| "step": 10155 |
| }, |
| { |
| "epoch": 3.9093707908408697, |
| "grad_norm": 0.12584170646455595, |
| "learning_rate": 5.068296651121286e-06, |
| "loss": 0.082, |
| "mean_token_accuracy": 0.9715514481067657, |
| "step": 10160 |
| }, |
| { |
| "epoch": 3.911294977871849, |
| "grad_norm": 0.1269583447627004, |
| "learning_rate": 5.06538596344924e-06, |
| "loss": 0.0821, |
| "mean_token_accuracy": 0.9715434312820435, |
| "step": 10165 |
| }, |
| { |
| "epoch": 3.9132191649028285, |
| "grad_norm": 0.13090408555934774, |
| "learning_rate": 5.062538567841114e-06, |
| "loss": 0.0814, |
| "mean_token_accuracy": 0.9717414677143097, |
| "step": 10170 |
| }, |
| { |
| "epoch": 3.915143351933808, |
| "grad_norm": 0.1301397066057007, |
| "learning_rate": 5.059754472329919e-06, |
| "loss": 0.0822, |
| "mean_token_accuracy": 0.9714883327484131, |
| "step": 10175 |
| }, |
| { |
| "epoch": 3.9170675389647873, |
| "grad_norm": 0.1265616885928072, |
| "learning_rate": 5.0570336847700875e-06, |
| "loss": 0.0836, |
| "mean_token_accuracy": 0.9710824906826019, |
| "step": 10180 |
| }, |
| { |
| "epoch": 3.9189917259957667, |
| "grad_norm": 0.1292654952944248, |
| "learning_rate": 5.054376212837453e-06, |
| "loss": 0.0799, |
| "mean_token_accuracy": 0.9723989367485046, |
| "step": 10185 |
| }, |
| { |
| "epoch": 3.920915913026746, |
| "grad_norm": 0.12506948699138926, |
| "learning_rate": 5.051782064029214e-06, |
| "loss": 0.0843, |
| "mean_token_accuracy": 0.9707196116447449, |
| "step": 10190 |
| }, |
| { |
| "epoch": 3.9228401000577255, |
| "grad_norm": 0.12625564064244693, |
| "learning_rate": 5.0492512456639325e-06, |
| "loss": 0.0839, |
| "mean_token_accuracy": 0.9709169268608093, |
| "step": 10195 |
| }, |
| { |
| "epoch": 3.924764287088705, |
| "grad_norm": 0.13116148538043676, |
| "learning_rate": 5.046783764881503e-06, |
| "loss": 0.0838, |
| "mean_token_accuracy": 0.9707896530628204, |
| "step": 10200 |
| }, |
| { |
| "epoch": 3.9266884741196844, |
| "grad_norm": 0.1284618138441698, |
| "learning_rate": 5.044379628643123e-06, |
| "loss": 0.0821, |
| "mean_token_accuracy": 0.9715276062488556, |
| "step": 10205 |
| }, |
| { |
| "epoch": 3.9286126611506638, |
| "grad_norm": 0.13214132106477244, |
| "learning_rate": 5.0420388437312975e-06, |
| "loss": 0.0851, |
| "mean_token_accuracy": 0.9706056773662567, |
| "step": 10210 |
| }, |
| { |
| "epoch": 3.930536848181643, |
| "grad_norm": 0.1348864793969741, |
| "learning_rate": 5.039761416749797e-06, |
| "loss": 0.0829, |
| "mean_token_accuracy": 0.9712868213653565, |
| "step": 10215 |
| }, |
| { |
| "epoch": 3.9324610352126226, |
| "grad_norm": 0.1289711702781738, |
| "learning_rate": 5.037547354123652e-06, |
| "loss": 0.0787, |
| "mean_token_accuracy": 0.9727490663528442, |
| "step": 10220 |
| }, |
| { |
| "epoch": 3.934385222243602, |
| "grad_norm": 0.1271638377917533, |
| "learning_rate": 5.035396662099127e-06, |
| "loss": 0.0819, |
| "mean_token_accuracy": 0.9715701401233673, |
| "step": 10225 |
| }, |
| { |
| "epoch": 3.9363094092745814, |
| "grad_norm": 0.13920813700768522, |
| "learning_rate": 5.033309346743716e-06, |
| "loss": 0.0814, |
| "mean_token_accuracy": 0.9715500593185424, |
| "step": 10230 |
| }, |
| { |
| "epoch": 3.938233596305561, |
| "grad_norm": 0.1270314032330905, |
| "learning_rate": 5.031285413946101e-06, |
| "loss": 0.083, |
| "mean_token_accuracy": 0.9712082087993622, |
| "step": 10235 |
| }, |
| { |
| "epoch": 3.94015778333654, |
| "grad_norm": 0.13403626861564544, |
| "learning_rate": 5.0293248694161665e-06, |
| "loss": 0.0856, |
| "mean_token_accuracy": 0.9704956352710724, |
| "step": 10240 |
| }, |
| { |
| "epoch": 3.9420819703675196, |
| "grad_norm": 0.1309227117524348, |
| "learning_rate": 5.027427718684955e-06, |
| "loss": 0.0833, |
| "mean_token_accuracy": 0.9710226118564605, |
| "step": 10245 |
| }, |
| { |
| "epoch": 3.944006157398499, |
| "grad_norm": 0.13231010952518213, |
| "learning_rate": 5.02559396710467e-06, |
| "loss": 0.081, |
| "mean_token_accuracy": 0.9719463467597962, |
| "step": 10250 |
| }, |
| { |
| "epoch": 3.9459303444294784, |
| "grad_norm": 0.13128683536722258, |
| "learning_rate": 5.023823619848651e-06, |
| "loss": 0.0828, |
| "mean_token_accuracy": 0.9711349666118622, |
| "step": 10255 |
| }, |
| { |
| "epoch": 3.947854531460458, |
| "grad_norm": 0.12653554607775047, |
| "learning_rate": 5.02211668191137e-06, |
| "loss": 0.0823, |
| "mean_token_accuracy": 0.9714332520961761, |
| "step": 10260 |
| }, |
| { |
| "epoch": 3.9497787184914372, |
| "grad_norm": 0.13474648427718314, |
| "learning_rate": 5.020473158108398e-06, |
| "loss": 0.0829, |
| "mean_token_accuracy": 0.9714989304542542, |
| "step": 10265 |
| }, |
| { |
| "epoch": 3.9517029055224167, |
| "grad_norm": 0.13044370419525142, |
| "learning_rate": 5.0188930530764155e-06, |
| "loss": 0.0824, |
| "mean_token_accuracy": 0.9714354455471039, |
| "step": 10270 |
| }, |
| { |
| "epoch": 3.953627092553396, |
| "grad_norm": 0.12661398006963123, |
| "learning_rate": 5.017376371273177e-06, |
| "loss": 0.0834, |
| "mean_token_accuracy": 0.9711092829704284, |
| "step": 10275 |
| }, |
| { |
| "epoch": 3.9555512795843755, |
| "grad_norm": 0.1387673625001325, |
| "learning_rate": 5.015923116977517e-06, |
| "loss": 0.0818, |
| "mean_token_accuracy": 0.9715732455253601, |
| "step": 10280 |
| }, |
| { |
| "epoch": 3.957475466615355, |
| "grad_norm": 0.12749658417248053, |
| "learning_rate": 5.014533294289326e-06, |
| "loss": 0.0822, |
| "mean_token_accuracy": 0.9713805615901947, |
| "step": 10285 |
| }, |
| { |
| "epoch": 3.9593996536463343, |
| "grad_norm": 0.14480554531550074, |
| "learning_rate": 5.013206907129543e-06, |
| "loss": 0.0839, |
| "mean_token_accuracy": 0.9709558188915253, |
| "step": 10290 |
| }, |
| { |
| "epoch": 3.9613238406773137, |
| "grad_norm": 0.1340795746063367, |
| "learning_rate": 5.011943959240139e-06, |
| "loss": 0.0814, |
| "mean_token_accuracy": 0.9717664599418641, |
| "step": 10295 |
| }, |
| { |
| "epoch": 3.963248027708293, |
| "grad_norm": 0.13577146914254368, |
| "learning_rate": 5.010744454184122e-06, |
| "loss": 0.0808, |
| "mean_token_accuracy": 0.9720539331436158, |
| "step": 10300 |
| }, |
| { |
| "epoch": 3.9651722147392725, |
| "grad_norm": 0.13278406905491066, |
| "learning_rate": 5.009608395345504e-06, |
| "loss": 0.0817, |
| "mean_token_accuracy": 0.9717506885528564, |
| "step": 10305 |
| }, |
| { |
| "epoch": 3.967096401770252, |
| "grad_norm": 0.12880093192507883, |
| "learning_rate": 5.008535785929318e-06, |
| "loss": 0.0834, |
| "mean_token_accuracy": 0.9709747076034546, |
| "step": 10310 |
| }, |
| { |
| "epoch": 3.9690205888012313, |
| "grad_norm": 0.12894048623160223, |
| "learning_rate": 5.007526628961585e-06, |
| "loss": 0.0823, |
| "mean_token_accuracy": 0.9715463817119598, |
| "step": 10315 |
| }, |
| { |
| "epoch": 3.9709447758322107, |
| "grad_norm": 0.13374200890172133, |
| "learning_rate": 5.0065809272893155e-06, |
| "loss": 0.0841, |
| "mean_token_accuracy": 0.970946192741394, |
| "step": 10320 |
| }, |
| { |
| "epoch": 3.97286896286319, |
| "grad_norm": 0.1347420834773057, |
| "learning_rate": 5.0056986835805045e-06, |
| "loss": 0.0831, |
| "mean_token_accuracy": 0.9711974918842315, |
| "step": 10325 |
| }, |
| { |
| "epoch": 3.97479314989417, |
| "grad_norm": 0.129123768389722, |
| "learning_rate": 5.004879900324117e-06, |
| "loss": 0.083, |
| "mean_token_accuracy": 0.9712783515453338, |
| "step": 10330 |
| }, |
| { |
| "epoch": 3.976717336925149, |
| "grad_norm": 0.12496839255961688, |
| "learning_rate": 5.004124579830091e-06, |
| "loss": 0.0828, |
| "mean_token_accuracy": 0.9713409304618835, |
| "step": 10335 |
| }, |
| { |
| "epoch": 3.978641523956129, |
| "grad_norm": 0.12993674670818847, |
| "learning_rate": 5.003432724229319e-06, |
| "loss": 0.0842, |
| "mean_token_accuracy": 0.970902556180954, |
| "step": 10340 |
| }, |
| { |
| "epoch": 3.9805657109871078, |
| "grad_norm": 0.1328995851373331, |
| "learning_rate": 5.002804335473649e-06, |
| "loss": 0.0855, |
| "mean_token_accuracy": 0.9702074944972991, |
| "step": 10345 |
| }, |
| { |
| "epoch": 3.9824898980180876, |
| "grad_norm": 0.1283922514424663, |
| "learning_rate": 5.0022394153358796e-06, |
| "loss": 0.0838, |
| "mean_token_accuracy": 0.9709455788135528, |
| "step": 10350 |
| }, |
| { |
| "epoch": 3.9844140850490666, |
| "grad_norm": 0.12695338540041073, |
| "learning_rate": 5.001737965409753e-06, |
| "loss": 0.0812, |
| "mean_token_accuracy": 0.9720363140106201, |
| "step": 10355 |
| }, |
| { |
| "epoch": 3.9863382720800464, |
| "grad_norm": 0.12901800004736622, |
| "learning_rate": 5.001299987109945e-06, |
| "loss": 0.0819, |
| "mean_token_accuracy": 0.9715746283531189, |
| "step": 10360 |
| }, |
| { |
| "epoch": 3.9882624591110254, |
| "grad_norm": 0.13476223421856068, |
| "learning_rate": 5.0009254816720735e-06, |
| "loss": 0.0824, |
| "mean_token_accuracy": 0.9714550971984863, |
| "step": 10365 |
| }, |
| { |
| "epoch": 3.9901866461420052, |
| "grad_norm": 0.13027831801951967, |
| "learning_rate": 5.000614450152687e-06, |
| "loss": 0.0833, |
| "mean_token_accuracy": 0.9711871147155762, |
| "step": 10370 |
| }, |
| { |
| "epoch": 3.992110833172984, |
| "grad_norm": 0.13494495469316733, |
| "learning_rate": 5.000366893429256e-06, |
| "loss": 0.0819, |
| "mean_token_accuracy": 0.971628975868225, |
| "step": 10375 |
| }, |
| { |
| "epoch": 3.994035020203964, |
| "grad_norm": 0.12309697899654617, |
| "learning_rate": 5.000182812200186e-06, |
| "loss": 0.084, |
| "mean_token_accuracy": 0.9710172712802887, |
| "step": 10380 |
| }, |
| { |
| "epoch": 3.995959207234943, |
| "grad_norm": 0.13140302440475887, |
| "learning_rate": 5.000062206984804e-06, |
| "loss": 0.0838, |
| "mean_token_accuracy": 0.9709397912025451, |
| "step": 10385 |
| }, |
| { |
| "epoch": 3.997883394265923, |
| "grad_norm": 0.13152660776564254, |
| "learning_rate": 5.000005078123357e-06, |
| "loss": 0.0796, |
| "mean_token_accuracy": 0.972462397813797, |
| "step": 10390 |
| }, |
| { |
| "epoch": 3.9986530690783146, |
| "mean_token_accuracy": 0.9705722630023956, |
| "step": 10392, |
| "total_flos": 5409408689373184.0, |
| "train_loss": 0.14862096939577196, |
| "train_runtime": 59690.9668, |
| "train_samples_per_second": 2.786, |
| "train_steps_per_second": 0.174 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 10392, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 4, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5409408689373184.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |