diff --git "a/checkpoint-2000/trainer_state.json" "b/checkpoint-2000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-2000/trainer_state.json" @@ -0,0 +1,18034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.5197568389057752, + "eval_steps": 500, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007598784194528875, + "grad_norm": 11.767926216125488, + "learning_rate": 0.0, + "loss": 0.7937269806861877, + "mean_token_accuracy": 0.7822731137275696, + "num_tokens": 10507.0, + "step": 1 + }, + { + "epoch": 0.001519756838905775, + "grad_norm": 14.9199800491333, + "learning_rate": 2.5252525252525256e-08, + "loss": 0.7665389776229858, + "mean_token_accuracy": 0.8342233300209045, + "num_tokens": 14806.0, + "step": 2 + }, + { + "epoch": 0.0022796352583586625, + "grad_norm": 11.991217613220215, + "learning_rate": 5.050505050505051e-08, + "loss": 0.9597002267837524, + "mean_token_accuracy": 0.7054992318153381, + "num_tokens": 27170.0, + "step": 3 + }, + { + "epoch": 0.00303951367781155, + "grad_norm": 12.958333015441895, + "learning_rate": 7.575757575757576e-08, + "loss": 0.9971482753753662, + "mean_token_accuracy": 0.7261134386062622, + "num_tokens": 33729.0, + "step": 4 + }, + { + "epoch": 0.003799392097264438, + "grad_norm": 13.5665283203125, + "learning_rate": 1.0101010101010103e-07, + "loss": 0.9504883885383606, + "mean_token_accuracy": 0.745307445526123, + "num_tokens": 41174.0, + "step": 5 + }, + { + "epoch": 0.004559270516717325, + "grad_norm": 10.09444808959961, + "learning_rate": 1.2626262626262626e-07, + "loss": 0.759548008441925, + "mean_token_accuracy": 0.7842121124267578, + "num_tokens": 47943.0, + "step": 6 + }, + { + "epoch": 0.005319148936170213, + "grad_norm": 10.741650581359863, + "learning_rate": 1.5151515151515152e-07, + "loss": 0.8231598138809204, + "mean_token_accuracy": 0.7550969123840332, + "num_tokens": 56665.0, + "step": 7 + }, + { + "epoch": 0.0060790273556231, + "grad_norm": 12.250170707702637, + "learning_rate": 1.767676767676768e-07, + "loss": 0.8576581478118896, + "mean_token_accuracy": 0.7568671703338623, + "num_tokens": 67606.0, + "step": 8 + }, + { + "epoch": 0.006838905775075988, + "grad_norm": 12.828629493713379, + "learning_rate": 2.0202020202020205e-07, + "loss": 0.9886435866355896, + "mean_token_accuracy": 0.733400285243988, + "num_tokens": 74272.0, + "step": 9 + }, + { + "epoch": 0.007598784194528876, + "grad_norm": 15.966923713684082, + "learning_rate": 2.2727272727272729e-07, + "loss": 1.064985990524292, + "mean_token_accuracy": 0.7101132869720459, + "num_tokens": 80524.0, + "step": 10 + }, + { + "epoch": 0.008358662613981762, + "grad_norm": 10.864850044250488, + "learning_rate": 2.525252525252525e-07, + "loss": 0.8311550617218018, + "mean_token_accuracy": 0.7431639432907104, + "num_tokens": 96292.0, + "step": 11 + }, + { + "epoch": 0.00911854103343465, + "grad_norm": 16.438785552978516, + "learning_rate": 2.7777777777777776e-07, + "loss": 1.0579866170883179, + "mean_token_accuracy": 0.7222976684570312, + "num_tokens": 102992.0, + "step": 12 + }, + { + "epoch": 0.009878419452887538, + "grad_norm": 11.179214477539062, + "learning_rate": 3.0303030303030305e-07, + "loss": 0.9816144704818726, + "mean_token_accuracy": 0.7206371426582336, + "num_tokens": 113571.0, + "step": 13 + }, + { + "epoch": 0.010638297872340425, + "grad_norm": 12.780299186706543, + "learning_rate": 3.2828282828282834e-07, + "loss": 0.847449004650116, + "mean_token_accuracy": 0.7826199531555176, + "num_tokens": 119568.0, + "step": 14 + }, + { + "epoch": 0.011398176291793313, + "grad_norm": 14.800421714782715, + "learning_rate": 3.535353535353536e-07, + "loss": 0.9275516271591187, + "mean_token_accuracy": 0.7655045986175537, + "num_tokens": 126258.0, + "step": 15 + }, + { + "epoch": 0.0121580547112462, + "grad_norm": 11.267602920532227, + "learning_rate": 3.787878787878788e-07, + "loss": 0.8464037179946899, + "mean_token_accuracy": 0.7606508731842041, + "num_tokens": 136831.0, + "step": 16 + }, + { + "epoch": 0.012917933130699088, + "grad_norm": 12.891013145446777, + "learning_rate": 4.040404040404041e-07, + "loss": 0.9903074502944946, + "mean_token_accuracy": 0.7247487306594849, + "num_tokens": 150434.0, + "step": 17 + }, + { + "epoch": 0.013677811550151976, + "grad_norm": 11.13957691192627, + "learning_rate": 4.2929292929292934e-07, + "loss": 0.8287211656570435, + "mean_token_accuracy": 0.7621913552284241, + "num_tokens": 158516.0, + "step": 18 + }, + { + "epoch": 0.014437689969604863, + "grad_norm": 18.39569664001465, + "learning_rate": 4.5454545454545457e-07, + "loss": 1.150015115737915, + "mean_token_accuracy": 0.7349498271942139, + "num_tokens": 162214.0, + "step": 19 + }, + { + "epoch": 0.015197568389057751, + "grad_norm": 9.353750228881836, + "learning_rate": 4.797979797979798e-07, + "loss": 0.7228299379348755, + "mean_token_accuracy": 0.7969573736190796, + "num_tokens": 173035.0, + "step": 20 + }, + { + "epoch": 0.015957446808510637, + "grad_norm": 8.267163276672363, + "learning_rate": 5.05050505050505e-07, + "loss": 0.7358136177062988, + "mean_token_accuracy": 0.7903937101364136, + "num_tokens": 183568.0, + "step": 21 + }, + { + "epoch": 0.016717325227963525, + "grad_norm": 11.137128829956055, + "learning_rate": 5.303030303030304e-07, + "loss": 1.0075397491455078, + "mean_token_accuracy": 0.702807605266571, + "num_tokens": 192759.0, + "step": 22 + }, + { + "epoch": 0.017477203647416412, + "grad_norm": 10.734103202819824, + "learning_rate": 5.555555555555555e-07, + "loss": 0.8925919532775879, + "mean_token_accuracy": 0.7475671768188477, + "num_tokens": 201280.0, + "step": 23 + }, + { + "epoch": 0.0182370820668693, + "grad_norm": 11.945566177368164, + "learning_rate": 5.808080808080809e-07, + "loss": 0.7260514497756958, + "mean_token_accuracy": 0.7859152555465698, + "num_tokens": 218053.0, + "step": 24 + }, + { + "epoch": 0.018996960486322188, + "grad_norm": 18.610652923583984, + "learning_rate": 6.060606060606061e-07, + "loss": 0.8995465636253357, + "mean_token_accuracy": 0.7931990623474121, + "num_tokens": 220953.0, + "step": 25 + }, + { + "epoch": 0.019756838905775075, + "grad_norm": 10.51898193359375, + "learning_rate": 6.313131313131314e-07, + "loss": 0.9532671570777893, + "mean_token_accuracy": 0.7257645726203918, + "num_tokens": 231200.0, + "step": 26 + }, + { + "epoch": 0.020516717325227963, + "grad_norm": 9.581812858581543, + "learning_rate": 6.565656565656567e-07, + "loss": 0.9038010239601135, + "mean_token_accuracy": 0.7390379905700684, + "num_tokens": 237711.0, + "step": 27 + }, + { + "epoch": 0.02127659574468085, + "grad_norm": 12.297484397888184, + "learning_rate": 6.818181818181818e-07, + "loss": 1.048936367034912, + "mean_token_accuracy": 0.7175670862197876, + "num_tokens": 242503.0, + "step": 28 + }, + { + "epoch": 0.022036474164133738, + "grad_norm": 7.437953472137451, + "learning_rate": 7.070707070707071e-07, + "loss": 0.8308826684951782, + "mean_token_accuracy": 0.7415335774421692, + "num_tokens": 250842.0, + "step": 29 + }, + { + "epoch": 0.022796352583586626, + "grad_norm": 6.134475231170654, + "learning_rate": 7.323232323232324e-07, + "loss": 0.647913932800293, + "mean_token_accuracy": 0.8124054670333862, + "num_tokens": 267453.0, + "step": 30 + }, + { + "epoch": 0.023556231003039513, + "grad_norm": 6.678966045379639, + "learning_rate": 7.575757575757576e-07, + "loss": 0.7052810192108154, + "mean_token_accuracy": 0.7908754348754883, + "num_tokens": 284416.0, + "step": 31 + }, + { + "epoch": 0.0243161094224924, + "grad_norm": 7.42232084274292, + "learning_rate": 7.82828282828283e-07, + "loss": 1.022383213043213, + "mean_token_accuracy": 0.7053230404853821, + "num_tokens": 292073.0, + "step": 32 + }, + { + "epoch": 0.02507598784194529, + "grad_norm": 6.463219165802002, + "learning_rate": 8.080808080808082e-07, + "loss": 0.7603012323379517, + "mean_token_accuracy": 0.7728140354156494, + "num_tokens": 298550.0, + "step": 33 + }, + { + "epoch": 0.025835866261398176, + "grad_norm": 5.668411731719971, + "learning_rate": 8.333333333333333e-07, + "loss": 0.7707852721214294, + "mean_token_accuracy": 0.7827773094177246, + "num_tokens": 306683.0, + "step": 34 + }, + { + "epoch": 0.026595744680851064, + "grad_norm": 4.984964847564697, + "learning_rate": 8.585858585858587e-07, + "loss": 0.6317349672317505, + "mean_token_accuracy": 0.8106861114501953, + "num_tokens": 318842.0, + "step": 35 + }, + { + "epoch": 0.02735562310030395, + "grad_norm": 4.421732425689697, + "learning_rate": 8.838383838383839e-07, + "loss": 0.6228617429733276, + "mean_token_accuracy": 0.8023355603218079, + "num_tokens": 329850.0, + "step": 36 + }, + { + "epoch": 0.02811550151975684, + "grad_norm": 5.970808029174805, + "learning_rate": 9.090909090909091e-07, + "loss": 0.8443238139152527, + "mean_token_accuracy": 0.7462409734725952, + "num_tokens": 335844.0, + "step": 37 + }, + { + "epoch": 0.028875379939209727, + "grad_norm": 4.5389084815979, + "learning_rate": 9.343434343434345e-07, + "loss": 0.6976436376571655, + "mean_token_accuracy": 0.790410041809082, + "num_tokens": 348768.0, + "step": 38 + }, + { + "epoch": 0.029635258358662615, + "grad_norm": 4.116631507873535, + "learning_rate": 9.595959595959596e-07, + "loss": 0.6698519587516785, + "mean_token_accuracy": 0.7818127870559692, + "num_tokens": 355460.0, + "step": 39 + }, + { + "epoch": 0.030395136778115502, + "grad_norm": 3.3714773654937744, + "learning_rate": 9.84848484848485e-07, + "loss": 0.5723201036453247, + "mean_token_accuracy": 0.8100086450576782, + "num_tokens": 368507.0, + "step": 40 + }, + { + "epoch": 0.03115501519756839, + "grad_norm": 4.4438347816467285, + "learning_rate": 1.01010101010101e-06, + "loss": 0.7508786916732788, + "mean_token_accuracy": 0.7711942791938782, + "num_tokens": 376467.0, + "step": 41 + }, + { + "epoch": 0.031914893617021274, + "grad_norm": 5.609974384307861, + "learning_rate": 1.0353535353535354e-06, + "loss": 0.566256046295166, + "mean_token_accuracy": 0.8319284319877625, + "num_tokens": 381399.0, + "step": 42 + }, + { + "epoch": 0.03267477203647416, + "grad_norm": 5.124386787414551, + "learning_rate": 1.0606060606060608e-06, + "loss": 0.8151067495346069, + "mean_token_accuracy": 0.7537785768508911, + "num_tokens": 387389.0, + "step": 43 + }, + { + "epoch": 0.03343465045592705, + "grad_norm": 3.6318116188049316, + "learning_rate": 1.085858585858586e-06, + "loss": 0.5989949107170105, + "mean_token_accuracy": 0.8129256963729858, + "num_tokens": 395302.0, + "step": 44 + }, + { + "epoch": 0.03419452887537994, + "grad_norm": 2.694424629211426, + "learning_rate": 1.111111111111111e-06, + "loss": 0.5831396579742432, + "mean_token_accuracy": 0.8056820631027222, + "num_tokens": 409920.0, + "step": 45 + }, + { + "epoch": 0.034954407294832825, + "grad_norm": 2.2949178218841553, + "learning_rate": 1.1363636363636364e-06, + "loss": 0.472550630569458, + "mean_token_accuracy": 0.8343006372451782, + "num_tokens": 428323.0, + "step": 46 + }, + { + "epoch": 0.03571428571428571, + "grad_norm": 3.3930575847625732, + "learning_rate": 1.1616161616161617e-06, + "loss": 0.6246505379676819, + "mean_token_accuracy": 0.783149003982544, + "num_tokens": 435889.0, + "step": 47 + }, + { + "epoch": 0.0364741641337386, + "grad_norm": 3.692598819732666, + "learning_rate": 1.186868686868687e-06, + "loss": 0.46132946014404297, + "mean_token_accuracy": 0.8583089113235474, + "num_tokens": 441192.0, + "step": 48 + }, + { + "epoch": 0.03723404255319149, + "grad_norm": 6.571533203125, + "learning_rate": 1.2121212121212122e-06, + "loss": 0.9351121783256531, + "mean_token_accuracy": 0.7580878734588623, + "num_tokens": 444277.0, + "step": 49 + }, + { + "epoch": 0.037993920972644375, + "grad_norm": 5.029570579528809, + "learning_rate": 1.2373737373737375e-06, + "loss": 0.6921554803848267, + "mean_token_accuracy": 0.8131166100502014, + "num_tokens": 447646.0, + "step": 50 + }, + { + "epoch": 0.03875379939209726, + "grad_norm": 2.9174208641052246, + "learning_rate": 1.2626262626262629e-06, + "loss": 0.591706395149231, + "mean_token_accuracy": 0.8108617067337036, + "num_tokens": 461397.0, + "step": 51 + }, + { + "epoch": 0.03951367781155015, + "grad_norm": 4.315536022186279, + "learning_rate": 1.287878787878788e-06, + "loss": 0.6986310482025146, + "mean_token_accuracy": 0.7710754871368408, + "num_tokens": 472047.0, + "step": 52 + }, + { + "epoch": 0.04027355623100304, + "grad_norm": 2.6216275691986084, + "learning_rate": 1.3131313131313134e-06, + "loss": 0.5553690791130066, + "mean_token_accuracy": 0.8167896866798401, + "num_tokens": 482795.0, + "step": 53 + }, + { + "epoch": 0.041033434650455926, + "grad_norm": 3.0562477111816406, + "learning_rate": 1.3383838383838385e-06, + "loss": 0.6909202337265015, + "mean_token_accuracy": 0.7859863638877869, + "num_tokens": 494818.0, + "step": 54 + }, + { + "epoch": 0.04179331306990881, + "grad_norm": 2.1420412063598633, + "learning_rate": 1.3636363636363636e-06, + "loss": 0.5415265560150146, + "mean_token_accuracy": 0.818886399269104, + "num_tokens": 513695.0, + "step": 55 + }, + { + "epoch": 0.0425531914893617, + "grad_norm": 2.9610488414764404, + "learning_rate": 1.3888888888888892e-06, + "loss": 0.6602212190628052, + "mean_token_accuracy": 0.7830734252929688, + "num_tokens": 523784.0, + "step": 56 + }, + { + "epoch": 0.04331306990881459, + "grad_norm": 2.511972665786743, + "learning_rate": 1.4141414141414143e-06, + "loss": 0.5717809796333313, + "mean_token_accuracy": 0.8053616285324097, + "num_tokens": 546308.0, + "step": 57 + }, + { + "epoch": 0.044072948328267476, + "grad_norm": 3.52642822265625, + "learning_rate": 1.4393939393939396e-06, + "loss": 0.6242594718933105, + "mean_token_accuracy": 0.8162082433700562, + "num_tokens": 552019.0, + "step": 58 + }, + { + "epoch": 0.044832826747720364, + "grad_norm": 3.02362322807312, + "learning_rate": 1.4646464646464648e-06, + "loss": 0.6634255647659302, + "mean_token_accuracy": 0.7682032585144043, + "num_tokens": 560009.0, + "step": 59 + }, + { + "epoch": 0.04559270516717325, + "grad_norm": 2.3910107612609863, + "learning_rate": 1.48989898989899e-06, + "loss": 0.5519146919250488, + "mean_token_accuracy": 0.8270269632339478, + "num_tokens": 571005.0, + "step": 60 + }, + { + "epoch": 0.04635258358662614, + "grad_norm": 4.28154993057251, + "learning_rate": 1.5151515151515152e-06, + "loss": 0.7437789440155029, + "mean_token_accuracy": 0.7782418131828308, + "num_tokens": 574950.0, + "step": 61 + }, + { + "epoch": 0.04711246200607903, + "grad_norm": 3.4078686237335205, + "learning_rate": 1.5404040404040404e-06, + "loss": 0.6345915198326111, + "mean_token_accuracy": 0.7903392314910889, + "num_tokens": 581657.0, + "step": 62 + }, + { + "epoch": 0.047872340425531915, + "grad_norm": 2.6834158897399902, + "learning_rate": 1.565656565656566e-06, + "loss": 0.5981127023696899, + "mean_token_accuracy": 0.7911489605903625, + "num_tokens": 591267.0, + "step": 63 + }, + { + "epoch": 0.0486322188449848, + "grad_norm": 2.1054461002349854, + "learning_rate": 1.590909090909091e-06, + "loss": 0.5523523688316345, + "mean_token_accuracy": 0.8194501399993896, + "num_tokens": 606787.0, + "step": 64 + }, + { + "epoch": 0.04939209726443769, + "grad_norm": 3.322596788406372, + "learning_rate": 1.6161616161616164e-06, + "loss": 0.48417025804519653, + "mean_token_accuracy": 0.8293706178665161, + "num_tokens": 611068.0, + "step": 65 + }, + { + "epoch": 0.05015197568389058, + "grad_norm": 2.302450180053711, + "learning_rate": 1.6414141414141415e-06, + "loss": 0.6498389840126038, + "mean_token_accuracy": 0.7728497385978699, + "num_tokens": 624452.0, + "step": 66 + }, + { + "epoch": 0.050911854103343465, + "grad_norm": 2.680191993713379, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.6347037553787231, + "mean_token_accuracy": 0.8108306527137756, + "num_tokens": 638049.0, + "step": 67 + }, + { + "epoch": 0.05167173252279635, + "grad_norm": 3.0297021865844727, + "learning_rate": 1.6919191919191922e-06, + "loss": 0.5344363451004028, + "mean_token_accuracy": 0.8113535046577454, + "num_tokens": 643892.0, + "step": 68 + }, + { + "epoch": 0.05243161094224924, + "grad_norm": 2.9283676147460938, + "learning_rate": 1.7171717171717173e-06, + "loss": 0.6999260187149048, + "mean_token_accuracy": 0.7782022356987, + "num_tokens": 654418.0, + "step": 69 + }, + { + "epoch": 0.05319148936170213, + "grad_norm": 3.4098572731018066, + "learning_rate": 1.7424242424242427e-06, + "loss": 0.6508946418762207, + "mean_token_accuracy": 0.7942900657653809, + "num_tokens": 659837.0, + "step": 70 + }, + { + "epoch": 0.053951367781155016, + "grad_norm": 2.6756019592285156, + "learning_rate": 1.7676767676767678e-06, + "loss": 0.603486180305481, + "mean_token_accuracy": 0.8015457391738892, + "num_tokens": 668361.0, + "step": 71 + }, + { + "epoch": 0.0547112462006079, + "grad_norm": 2.2630293369293213, + "learning_rate": 1.792929292929293e-06, + "loss": 0.6608274579048157, + "mean_token_accuracy": 0.7753809690475464, + "num_tokens": 679025.0, + "step": 72 + }, + { + "epoch": 0.05547112462006079, + "grad_norm": 2.123962879180908, + "learning_rate": 1.8181818181818183e-06, + "loss": 0.4525482654571533, + "mean_token_accuracy": 0.8425612449645996, + "num_tokens": 688574.0, + "step": 73 + }, + { + "epoch": 0.05623100303951368, + "grad_norm": 7.90519905090332, + "learning_rate": 1.8434343434343434e-06, + "loss": 0.6507195830345154, + "mean_token_accuracy": 0.7714964151382446, + "num_tokens": 694534.0, + "step": 74 + }, + { + "epoch": 0.056990881458966566, + "grad_norm": 2.372203826904297, + "learning_rate": 1.868686868686869e-06, + "loss": 0.4458143413066864, + "mean_token_accuracy": 0.7991449236869812, + "num_tokens": 703114.0, + "step": 75 + }, + { + "epoch": 0.057750759878419454, + "grad_norm": 2.918677568435669, + "learning_rate": 1.8939393939393941e-06, + "loss": 0.5614339113235474, + "mean_token_accuracy": 0.8211464881896973, + "num_tokens": 709038.0, + "step": 76 + }, + { + "epoch": 0.05851063829787234, + "grad_norm": 1.6106709241867065, + "learning_rate": 1.9191919191919192e-06, + "loss": 0.5802098512649536, + "mean_token_accuracy": 0.8055065870285034, + "num_tokens": 730482.0, + "step": 77 + }, + { + "epoch": 0.05927051671732523, + "grad_norm": 2.8069989681243896, + "learning_rate": 1.944444444444445e-06, + "loss": 0.5709059238433838, + "mean_token_accuracy": 0.8024872541427612, + "num_tokens": 751817.0, + "step": 78 + }, + { + "epoch": 0.06003039513677812, + "grad_norm": 2.641667127609253, + "learning_rate": 1.96969696969697e-06, + "loss": 0.6480152606964111, + "mean_token_accuracy": 0.7912271618843079, + "num_tokens": 759236.0, + "step": 79 + }, + { + "epoch": 0.060790273556231005, + "grad_norm": 2.6034350395202637, + "learning_rate": 1.994949494949495e-06, + "loss": 0.5535176396369934, + "mean_token_accuracy": 0.7980542778968811, + "num_tokens": 766496.0, + "step": 80 + }, + { + "epoch": 0.06155015197568389, + "grad_norm": 1.7095069885253906, + "learning_rate": 2.02020202020202e-06, + "loss": 0.4545496106147766, + "mean_token_accuracy": 0.8229660391807556, + "num_tokens": 780124.0, + "step": 81 + }, + { + "epoch": 0.06231003039513678, + "grad_norm": 3.788830518722534, + "learning_rate": 2.0454545454545457e-06, + "loss": 0.6679391264915466, + "mean_token_accuracy": 0.7942397594451904, + "num_tokens": 784555.0, + "step": 82 + }, + { + "epoch": 0.06306990881458967, + "grad_norm": 2.009831666946411, + "learning_rate": 2.070707070707071e-06, + "loss": 0.5067101120948792, + "mean_token_accuracy": 0.8276634216308594, + "num_tokens": 797459.0, + "step": 83 + }, + { + "epoch": 0.06382978723404255, + "grad_norm": 2.201627731323242, + "learning_rate": 2.095959595959596e-06, + "loss": 0.5012127161026001, + "mean_token_accuracy": 0.8432504534721375, + "num_tokens": 810817.0, + "step": 84 + }, + { + "epoch": 0.06458966565349544, + "grad_norm": 2.492568016052246, + "learning_rate": 2.1212121212121216e-06, + "loss": 0.6142797470092773, + "mean_token_accuracy": 0.8338661193847656, + "num_tokens": 818191.0, + "step": 85 + }, + { + "epoch": 0.06534954407294832, + "grad_norm": 2.8360862731933594, + "learning_rate": 2.1464646464646467e-06, + "loss": 0.5569300651550293, + "mean_token_accuracy": 0.8121030330657959, + "num_tokens": 825325.0, + "step": 86 + }, + { + "epoch": 0.06610942249240122, + "grad_norm": 2.407548427581787, + "learning_rate": 2.171717171717172e-06, + "loss": 0.6442930102348328, + "mean_token_accuracy": 0.792514443397522, + "num_tokens": 834439.0, + "step": 87 + }, + { + "epoch": 0.0668693009118541, + "grad_norm": 2.340728759765625, + "learning_rate": 2.196969696969697e-06, + "loss": 0.6494365930557251, + "mean_token_accuracy": 0.7746615409851074, + "num_tokens": 843078.0, + "step": 88 + }, + { + "epoch": 0.067629179331307, + "grad_norm": 1.7703697681427002, + "learning_rate": 2.222222222222222e-06, + "loss": 0.598991870880127, + "mean_token_accuracy": 0.7992157340049744, + "num_tokens": 860171.0, + "step": 89 + }, + { + "epoch": 0.06838905775075987, + "grad_norm": 2.5779271125793457, + "learning_rate": 2.2474747474747476e-06, + "loss": 0.5693082809448242, + "mean_token_accuracy": 0.8093700408935547, + "num_tokens": 866669.0, + "step": 90 + }, + { + "epoch": 0.06914893617021277, + "grad_norm": 2.014092206954956, + "learning_rate": 2.2727272727272728e-06, + "loss": 0.5346695780754089, + "mean_token_accuracy": 0.8165590763092041, + "num_tokens": 876698.0, + "step": 91 + }, + { + "epoch": 0.06990881458966565, + "grad_norm": 1.7555919885635376, + "learning_rate": 2.2979797979797983e-06, + "loss": 0.5321458578109741, + "mean_token_accuracy": 0.8166656494140625, + "num_tokens": 889488.0, + "step": 92 + }, + { + "epoch": 0.07066869300911854, + "grad_norm": 1.8631824254989624, + "learning_rate": 2.3232323232323234e-06, + "loss": 0.5246532559394836, + "mean_token_accuracy": 0.8088107705116272, + "num_tokens": 901322.0, + "step": 93 + }, + { + "epoch": 0.07142857142857142, + "grad_norm": 3.2332139015197754, + "learning_rate": 2.348484848484849e-06, + "loss": 0.5141711235046387, + "mean_token_accuracy": 0.8382217884063721, + "num_tokens": 905792.0, + "step": 94 + }, + { + "epoch": 0.07218844984802432, + "grad_norm": 1.7806555032730103, + "learning_rate": 2.373737373737374e-06, + "loss": 0.5233149528503418, + "mean_token_accuracy": 0.8101529479026794, + "num_tokens": 917320.0, + "step": 95 + }, + { + "epoch": 0.0729483282674772, + "grad_norm": 1.8169859647750854, + "learning_rate": 2.3989898989898993e-06, + "loss": 0.578881561756134, + "mean_token_accuracy": 0.8044873476028442, + "num_tokens": 931062.0, + "step": 96 + }, + { + "epoch": 0.0737082066869301, + "grad_norm": 4.677402496337891, + "learning_rate": 2.4242424242424244e-06, + "loss": 0.7842556238174438, + "mean_token_accuracy": 0.7579764127731323, + "num_tokens": 934712.0, + "step": 97 + }, + { + "epoch": 0.07446808510638298, + "grad_norm": 2.6987264156341553, + "learning_rate": 2.4494949494949495e-06, + "loss": 0.5669287443161011, + "mean_token_accuracy": 0.8186933994293213, + "num_tokens": 941058.0, + "step": 98 + }, + { + "epoch": 0.07522796352583587, + "grad_norm": 1.6906023025512695, + "learning_rate": 2.474747474747475e-06, + "loss": 0.4976363778114319, + "mean_token_accuracy": 0.8198553323745728, + "num_tokens": 956509.0, + "step": 99 + }, + { + "epoch": 0.07598784194528875, + "grad_norm": 2.7256152629852295, + "learning_rate": 2.5e-06, + "loss": 0.7138420343399048, + "mean_token_accuracy": 0.7752805948257446, + "num_tokens": 963920.0, + "step": 100 + }, + { + "epoch": 0.07674772036474165, + "grad_norm": 2.174870491027832, + "learning_rate": 2.5252525252525258e-06, + "loss": 0.6733541488647461, + "mean_token_accuracy": 0.7745175361633301, + "num_tokens": 975268.0, + "step": 101 + }, + { + "epoch": 0.07750759878419453, + "grad_norm": 1.5587213039398193, + "learning_rate": 2.5505050505050505e-06, + "loss": 0.44223445653915405, + "mean_token_accuracy": 0.8278359174728394, + "num_tokens": 991837.0, + "step": 102 + }, + { + "epoch": 0.07826747720364742, + "grad_norm": 2.181840658187866, + "learning_rate": 2.575757575757576e-06, + "loss": 0.625128448009491, + "mean_token_accuracy": 0.7941786050796509, + "num_tokens": 1004325.0, + "step": 103 + }, + { + "epoch": 0.0790273556231003, + "grad_norm": 1.4986687898635864, + "learning_rate": 2.601010101010101e-06, + "loss": 0.39262527227401733, + "mean_token_accuracy": 0.8412648439407349, + "num_tokens": 1018331.0, + "step": 104 + }, + { + "epoch": 0.0797872340425532, + "grad_norm": 2.3416061401367188, + "learning_rate": 2.6262626262626267e-06, + "loss": 0.5495132803916931, + "mean_token_accuracy": 0.8193322420120239, + "num_tokens": 1026090.0, + "step": 105 + }, + { + "epoch": 0.08054711246200608, + "grad_norm": 3.8168859481811523, + "learning_rate": 2.6515151515151514e-06, + "loss": 0.4898706376552582, + "mean_token_accuracy": 0.8467956185340881, + "num_tokens": 1029955.0, + "step": 106 + }, + { + "epoch": 0.08130699088145897, + "grad_norm": 4.113908767700195, + "learning_rate": 2.676767676767677e-06, + "loss": 0.6189584732055664, + "mean_token_accuracy": 0.8019394278526306, + "num_tokens": 1033598.0, + "step": 107 + }, + { + "epoch": 0.08206686930091185, + "grad_norm": 2.50003981590271, + "learning_rate": 2.7020202020202025e-06, + "loss": 0.6479471921920776, + "mean_token_accuracy": 0.7790026664733887, + "num_tokens": 1042533.0, + "step": 108 + }, + { + "epoch": 0.08282674772036475, + "grad_norm": 1.408934473991394, + "learning_rate": 2.7272727272727272e-06, + "loss": 0.3909248113632202, + "mean_token_accuracy": 0.8477586507797241, + "num_tokens": 1061755.0, + "step": 109 + }, + { + "epoch": 0.08358662613981763, + "grad_norm": 3.360633611679077, + "learning_rate": 2.7525252525252528e-06, + "loss": 0.6952459812164307, + "mean_token_accuracy": 0.777535080909729, + "num_tokens": 1067316.0, + "step": 110 + }, + { + "epoch": 0.08434650455927052, + "grad_norm": 1.8631696701049805, + "learning_rate": 2.7777777777777783e-06, + "loss": 0.5420593023300171, + "mean_token_accuracy": 0.8157662749290466, + "num_tokens": 1079930.0, + "step": 111 + }, + { + "epoch": 0.0851063829787234, + "grad_norm": 2.4308314323425293, + "learning_rate": 2.803030303030303e-06, + "loss": 0.5863882303237915, + "mean_token_accuracy": 0.8206346035003662, + "num_tokens": 1088069.0, + "step": 112 + }, + { + "epoch": 0.0858662613981763, + "grad_norm": 2.922808885574341, + "learning_rate": 2.8282828282828286e-06, + "loss": 0.5217319130897522, + "mean_token_accuracy": 0.8253234028816223, + "num_tokens": 1093607.0, + "step": 113 + }, + { + "epoch": 0.08662613981762918, + "grad_norm": 2.3596107959747314, + "learning_rate": 2.8535353535353537e-06, + "loss": 0.5070714950561523, + "mean_token_accuracy": 0.8258323669433594, + "num_tokens": 1100405.0, + "step": 114 + }, + { + "epoch": 0.08738601823708207, + "grad_norm": 3.0853066444396973, + "learning_rate": 2.8787878787878793e-06, + "loss": 0.591964840888977, + "mean_token_accuracy": 0.8047322630882263, + "num_tokens": 1107535.0, + "step": 115 + }, + { + "epoch": 0.08814589665653495, + "grad_norm": 1.9251092672348022, + "learning_rate": 2.904040404040404e-06, + "loss": 0.5226191878318787, + "mean_token_accuracy": 0.8022720217704773, + "num_tokens": 1118716.0, + "step": 116 + }, + { + "epoch": 0.08890577507598785, + "grad_norm": 1.9692988395690918, + "learning_rate": 2.9292929292929295e-06, + "loss": 0.5462069511413574, + "mean_token_accuracy": 0.8157015442848206, + "num_tokens": 1131917.0, + "step": 117 + }, + { + "epoch": 0.08966565349544073, + "grad_norm": 1.4738909006118774, + "learning_rate": 2.954545454545455e-06, + "loss": 0.4564219117164612, + "mean_token_accuracy": 0.849632978439331, + "num_tokens": 1148534.0, + "step": 118 + }, + { + "epoch": 0.09042553191489362, + "grad_norm": 2.72646164894104, + "learning_rate": 2.97979797979798e-06, + "loss": 0.6654808521270752, + "mean_token_accuracy": 0.7752684354782104, + "num_tokens": 1155438.0, + "step": 119 + }, + { + "epoch": 0.0911854103343465, + "grad_norm": 2.7843852043151855, + "learning_rate": 3.0050505050505054e-06, + "loss": 0.5354680418968201, + "mean_token_accuracy": 0.8196378946304321, + "num_tokens": 1161815.0, + "step": 120 + }, + { + "epoch": 0.0919452887537994, + "grad_norm": 2.8052573204040527, + "learning_rate": 3.0303030303030305e-06, + "loss": 0.6366757154464722, + "mean_token_accuracy": 0.7967483997344971, + "num_tokens": 1168295.0, + "step": 121 + }, + { + "epoch": 0.09270516717325228, + "grad_norm": 2.7462735176086426, + "learning_rate": 3.055555555555556e-06, + "loss": 0.59470534324646, + "mean_token_accuracy": 0.8023771047592163, + "num_tokens": 1174502.0, + "step": 122 + }, + { + "epoch": 0.09346504559270517, + "grad_norm": 2.2743821144104004, + "learning_rate": 3.0808080808080807e-06, + "loss": 0.5720560550689697, + "mean_token_accuracy": 0.8162771463394165, + "num_tokens": 1183615.0, + "step": 123 + }, + { + "epoch": 0.09422492401215805, + "grad_norm": 1.8669533729553223, + "learning_rate": 3.1060606060606063e-06, + "loss": 0.4655378758907318, + "mean_token_accuracy": 0.8360732793807983, + "num_tokens": 1193761.0, + "step": 124 + }, + { + "epoch": 0.09498480243161095, + "grad_norm": 1.7666901350021362, + "learning_rate": 3.131313131313132e-06, + "loss": 0.5524153709411621, + "mean_token_accuracy": 0.8252713680267334, + "num_tokens": 1207870.0, + "step": 125 + }, + { + "epoch": 0.09574468085106383, + "grad_norm": 2.4720070362091064, + "learning_rate": 3.1565656565656566e-06, + "loss": 0.5003011226654053, + "mean_token_accuracy": 0.8491042852401733, + "num_tokens": 1214603.0, + "step": 126 + }, + { + "epoch": 0.09650455927051672, + "grad_norm": 1.6500422954559326, + "learning_rate": 3.181818181818182e-06, + "loss": 0.5137069225311279, + "mean_token_accuracy": 0.8273531198501587, + "num_tokens": 1228717.0, + "step": 127 + }, + { + "epoch": 0.0972644376899696, + "grad_norm": 3.402543067932129, + "learning_rate": 3.2070707070707072e-06, + "loss": 0.708167552947998, + "mean_token_accuracy": 0.7705385684967041, + "num_tokens": 1234361.0, + "step": 128 + }, + { + "epoch": 0.0980243161094225, + "grad_norm": 2.547285795211792, + "learning_rate": 3.232323232323233e-06, + "loss": 0.6020137071609497, + "mean_token_accuracy": 0.7981340289115906, + "num_tokens": 1244169.0, + "step": 129 + }, + { + "epoch": 0.09878419452887538, + "grad_norm": 2.0578792095184326, + "learning_rate": 3.257575757575758e-06, + "loss": 0.4425000250339508, + "mean_token_accuracy": 0.8567807674407959, + "num_tokens": 1252709.0, + "step": 130 + }, + { + "epoch": 0.09954407294832827, + "grad_norm": 1.672614336013794, + "learning_rate": 3.282828282828283e-06, + "loss": 0.4860966205596924, + "mean_token_accuracy": 0.8393139243125916, + "num_tokens": 1265766.0, + "step": 131 + }, + { + "epoch": 0.10030395136778116, + "grad_norm": 3.2560198307037354, + "learning_rate": 3.3080808080808086e-06, + "loss": 0.624736487865448, + "mean_token_accuracy": 0.7875322699546814, + "num_tokens": 1270779.0, + "step": 132 + }, + { + "epoch": 0.10106382978723404, + "grad_norm": 2.4468185901641846, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.5062227249145508, + "mean_token_accuracy": 0.8217229843139648, + "num_tokens": 1277113.0, + "step": 133 + }, + { + "epoch": 0.10182370820668693, + "grad_norm": 2.6371328830718994, + "learning_rate": 3.358585858585859e-06, + "loss": 0.477113276720047, + "mean_token_accuracy": 0.8605583906173706, + "num_tokens": 1282514.0, + "step": 134 + }, + { + "epoch": 0.10258358662613981, + "grad_norm": 2.48421311378479, + "learning_rate": 3.3838383838383844e-06, + "loss": 0.40855684876441956, + "mean_token_accuracy": 0.864548921585083, + "num_tokens": 1287859.0, + "step": 135 + }, + { + "epoch": 0.1033434650455927, + "grad_norm": 1.993099331855774, + "learning_rate": 3.409090909090909e-06, + "loss": 0.5913145542144775, + "mean_token_accuracy": 0.8248485922813416, + "num_tokens": 1301074.0, + "step": 136 + }, + { + "epoch": 0.10410334346504559, + "grad_norm": 3.5947680473327637, + "learning_rate": 3.4343434343434347e-06, + "loss": 0.5028599500656128, + "mean_token_accuracy": 0.8367215394973755, + "num_tokens": 1305219.0, + "step": 137 + }, + { + "epoch": 0.10486322188449848, + "grad_norm": 2.5778582096099854, + "learning_rate": 3.45959595959596e-06, + "loss": 0.5297672748565674, + "mean_token_accuracy": 0.8232187032699585, + "num_tokens": 1312482.0, + "step": 138 + }, + { + "epoch": 0.10562310030395136, + "grad_norm": 1.8961588144302368, + "learning_rate": 3.4848484848484854e-06, + "loss": 0.39954107999801636, + "mean_token_accuracy": 0.8605833053588867, + "num_tokens": 1323404.0, + "step": 139 + }, + { + "epoch": 0.10638297872340426, + "grad_norm": 1.9687960147857666, + "learning_rate": 3.51010101010101e-06, + "loss": 0.48791587352752686, + "mean_token_accuracy": 0.8200347423553467, + "num_tokens": 1333027.0, + "step": 140 + }, + { + "epoch": 0.10714285714285714, + "grad_norm": 2.520242691040039, + "learning_rate": 3.5353535353535356e-06, + "loss": 0.6106002330780029, + "mean_token_accuracy": 0.790692150592804, + "num_tokens": 1340999.0, + "step": 141 + }, + { + "epoch": 0.10790273556231003, + "grad_norm": 3.751617431640625, + "learning_rate": 3.560606060606061e-06, + "loss": 0.48141729831695557, + "mean_token_accuracy": 0.8421382904052734, + "num_tokens": 1344687.0, + "step": 142 + }, + { + "epoch": 0.10866261398176291, + "grad_norm": 2.7101709842681885, + "learning_rate": 3.585858585858586e-06, + "loss": 0.5375241637229919, + "mean_token_accuracy": 0.8061438202857971, + "num_tokens": 1350192.0, + "step": 143 + }, + { + "epoch": 0.1094224924012158, + "grad_norm": 2.583484411239624, + "learning_rate": 3.6111111111111115e-06, + "loss": 0.6492470502853394, + "mean_token_accuracy": 0.7863001823425293, + "num_tokens": 1358148.0, + "step": 144 + }, + { + "epoch": 0.11018237082066869, + "grad_norm": 1.792561650276184, + "learning_rate": 3.6363636363636366e-06, + "loss": 0.48480600118637085, + "mean_token_accuracy": 0.8358709812164307, + "num_tokens": 1369519.0, + "step": 145 + }, + { + "epoch": 0.11094224924012158, + "grad_norm": 2.6480472087860107, + "learning_rate": 3.661616161616162e-06, + "loss": 0.5268933176994324, + "mean_token_accuracy": 0.8214013576507568, + "num_tokens": 1375862.0, + "step": 146 + }, + { + "epoch": 0.11170212765957446, + "grad_norm": 2.3174469470977783, + "learning_rate": 3.686868686868687e-06, + "loss": 0.42517897486686707, + "mean_token_accuracy": 0.8523461222648621, + "num_tokens": 1381546.0, + "step": 147 + }, + { + "epoch": 0.11246200607902736, + "grad_norm": 3.0090949535369873, + "learning_rate": 3.7121212121212124e-06, + "loss": 0.4042336940765381, + "mean_token_accuracy": 0.8670448064804077, + "num_tokens": 1385896.0, + "step": 148 + }, + { + "epoch": 0.11322188449848024, + "grad_norm": 2.4928104877471924, + "learning_rate": 3.737373737373738e-06, + "loss": 0.6498878598213196, + "mean_token_accuracy": 0.7967068552970886, + "num_tokens": 1394169.0, + "step": 149 + }, + { + "epoch": 0.11398176291793313, + "grad_norm": 1.5984913110733032, + "learning_rate": 3.7626262626262627e-06, + "loss": 0.546096920967102, + "mean_token_accuracy": 0.8035850524902344, + "num_tokens": 1408785.0, + "step": 150 + }, + { + "epoch": 0.11474164133738601, + "grad_norm": 2.3663532733917236, + "learning_rate": 3.7878787878787882e-06, + "loss": 0.6111721992492676, + "mean_token_accuracy": 0.8015355467796326, + "num_tokens": 1417510.0, + "step": 151 + }, + { + "epoch": 0.11550151975683891, + "grad_norm": 2.518932819366455, + "learning_rate": 3.8131313131313138e-06, + "loss": 0.5274964570999146, + "mean_token_accuracy": 0.8155480623245239, + "num_tokens": 1424186.0, + "step": 152 + }, + { + "epoch": 0.11626139817629179, + "grad_norm": 2.14353609085083, + "learning_rate": 3.8383838383838385e-06, + "loss": 0.5283297896385193, + "mean_token_accuracy": 0.8275758028030396, + "num_tokens": 1432630.0, + "step": 153 + }, + { + "epoch": 0.11702127659574468, + "grad_norm": 1.8243604898452759, + "learning_rate": 3.863636363636364e-06, + "loss": 0.41854870319366455, + "mean_token_accuracy": 0.8222295045852661, + "num_tokens": 1442691.0, + "step": 154 + }, + { + "epoch": 0.11778115501519756, + "grad_norm": 2.088212251663208, + "learning_rate": 3.88888888888889e-06, + "loss": 0.6062943339347839, + "mean_token_accuracy": 0.8009427785873413, + "num_tokens": 1456890.0, + "step": 155 + }, + { + "epoch": 0.11854103343465046, + "grad_norm": 1.3469511270523071, + "learning_rate": 3.914141414141415e-06, + "loss": 0.4390433728694916, + "mean_token_accuracy": 0.8436295986175537, + "num_tokens": 1475349.0, + "step": 156 + }, + { + "epoch": 0.11930091185410334, + "grad_norm": 3.247023105621338, + "learning_rate": 3.93939393939394e-06, + "loss": 0.6490433216094971, + "mean_token_accuracy": 0.8037861585617065, + "num_tokens": 1479952.0, + "step": 157 + }, + { + "epoch": 0.12006079027355623, + "grad_norm": 2.6610445976257324, + "learning_rate": 3.964646464646465e-06, + "loss": 0.6221826076507568, + "mean_token_accuracy": 0.7848749160766602, + "num_tokens": 1487306.0, + "step": 158 + }, + { + "epoch": 0.12082066869300911, + "grad_norm": 2.3060810565948486, + "learning_rate": 3.98989898989899e-06, + "loss": 0.5052388310432434, + "mean_token_accuracy": 0.8281195759773254, + "num_tokens": 1495367.0, + "step": 159 + }, + { + "epoch": 0.12158054711246201, + "grad_norm": 2.504448652267456, + "learning_rate": 4.015151515151515e-06, + "loss": 0.5005477666854858, + "mean_token_accuracy": 0.8408058881759644, + "num_tokens": 1502069.0, + "step": 160 + }, + { + "epoch": 0.12234042553191489, + "grad_norm": 3.993938446044922, + "learning_rate": 4.04040404040404e-06, + "loss": 0.5569638013839722, + "mean_token_accuracy": 0.8095242977142334, + "num_tokens": 1510224.0, + "step": 161 + }, + { + "epoch": 0.12310030395136778, + "grad_norm": 2.2287683486938477, + "learning_rate": 4.065656565656566e-06, + "loss": 0.524042546749115, + "mean_token_accuracy": 0.8102203607559204, + "num_tokens": 1518364.0, + "step": 162 + }, + { + "epoch": 0.12386018237082067, + "grad_norm": 1.9531738758087158, + "learning_rate": 4.0909090909090915e-06, + "loss": 0.45794573426246643, + "mean_token_accuracy": 0.8560376167297363, + "num_tokens": 1528097.0, + "step": 163 + }, + { + "epoch": 0.12462006079027356, + "grad_norm": 1.5841206312179565, + "learning_rate": 4.116161616161617e-06, + "loss": 0.5420972108840942, + "mean_token_accuracy": 0.8092726469039917, + "num_tokens": 1544119.0, + "step": 164 + }, + { + "epoch": 0.12537993920972645, + "grad_norm": 1.7536218166351318, + "learning_rate": 4.141414141414142e-06, + "loss": 0.554668664932251, + "mean_token_accuracy": 0.8193825483322144, + "num_tokens": 1559140.0, + "step": 165 + }, + { + "epoch": 0.12613981762917933, + "grad_norm": 3.545454740524292, + "learning_rate": 4.166666666666667e-06, + "loss": 0.580947995185852, + "mean_token_accuracy": 0.8286383152008057, + "num_tokens": 1563625.0, + "step": 166 + }, + { + "epoch": 0.12689969604863222, + "grad_norm": 1.6608915328979492, + "learning_rate": 4.191919191919192e-06, + "loss": 0.5523324012756348, + "mean_token_accuracy": 0.8155215978622437, + "num_tokens": 1574945.0, + "step": 167 + }, + { + "epoch": 0.1276595744680851, + "grad_norm": 1.4832708835601807, + "learning_rate": 4.217171717171717e-06, + "loss": 0.5133191347122192, + "mean_token_accuracy": 0.8367571830749512, + "num_tokens": 1595865.0, + "step": 168 + }, + { + "epoch": 0.128419452887538, + "grad_norm": 1.7807520627975464, + "learning_rate": 4.242424242424243e-06, + "loss": 0.5131410360336304, + "mean_token_accuracy": 0.8129367232322693, + "num_tokens": 1608723.0, + "step": 169 + }, + { + "epoch": 0.12917933130699089, + "grad_norm": 2.707569122314453, + "learning_rate": 4.267676767676767e-06, + "loss": 0.6129013299942017, + "mean_token_accuracy": 0.7926048040390015, + "num_tokens": 1616136.0, + "step": 170 + }, + { + "epoch": 0.12993920972644377, + "grad_norm": 2.5831644535064697, + "learning_rate": 4.292929292929293e-06, + "loss": 0.6264227628707886, + "mean_token_accuracy": 0.8074911236763, + "num_tokens": 1624228.0, + "step": 171 + }, + { + "epoch": 0.13069908814589665, + "grad_norm": 3.1124250888824463, + "learning_rate": 4.3181818181818185e-06, + "loss": 0.41763827204704285, + "mean_token_accuracy": 0.8565453290939331, + "num_tokens": 1628098.0, + "step": 172 + }, + { + "epoch": 0.13145896656534956, + "grad_norm": 2.3214211463928223, + "learning_rate": 4.343434343434344e-06, + "loss": 0.421974778175354, + "mean_token_accuracy": 0.8391546010971069, + "num_tokens": 1634950.0, + "step": 173 + }, + { + "epoch": 0.13221884498480244, + "grad_norm": 2.1010327339172363, + "learning_rate": 4.368686868686869e-06, + "loss": 0.5307331681251526, + "mean_token_accuracy": 0.8139588236808777, + "num_tokens": 1644132.0, + "step": 174 + }, + { + "epoch": 0.13297872340425532, + "grad_norm": 2.533612012863159, + "learning_rate": 4.393939393939394e-06, + "loss": 0.5626664161682129, + "mean_token_accuracy": 0.8029808402061462, + "num_tokens": 1651637.0, + "step": 175 + }, + { + "epoch": 0.1337386018237082, + "grad_norm": 1.669508457183838, + "learning_rate": 4.41919191919192e-06, + "loss": 0.5351508259773254, + "mean_token_accuracy": 0.8281655311584473, + "num_tokens": 1666776.0, + "step": 176 + }, + { + "epoch": 0.1344984802431611, + "grad_norm": 1.7579659223556519, + "learning_rate": 4.444444444444444e-06, + "loss": 0.5235031247138977, + "mean_token_accuracy": 0.8143284320831299, + "num_tokens": 1679241.0, + "step": 177 + }, + { + "epoch": 0.135258358662614, + "grad_norm": 3.123563528060913, + "learning_rate": 4.46969696969697e-06, + "loss": 0.43051332235336304, + "mean_token_accuracy": 0.8518186211585999, + "num_tokens": 1683317.0, + "step": 178 + }, + { + "epoch": 0.13601823708206687, + "grad_norm": 2.2411575317382812, + "learning_rate": 4.494949494949495e-06, + "loss": 0.5471380949020386, + "mean_token_accuracy": 0.8267596960067749, + "num_tokens": 1691366.0, + "step": 179 + }, + { + "epoch": 0.13677811550151975, + "grad_norm": 2.621973991394043, + "learning_rate": 4.520202020202021e-06, + "loss": 0.5685839653015137, + "mean_token_accuracy": 0.8260642290115356, + "num_tokens": 1698148.0, + "step": 180 + }, + { + "epoch": 0.13753799392097266, + "grad_norm": 2.1553852558135986, + "learning_rate": 4.5454545454545455e-06, + "loss": 0.5703883171081543, + "mean_token_accuracy": 0.8219090700149536, + "num_tokens": 1707225.0, + "step": 181 + }, + { + "epoch": 0.13829787234042554, + "grad_norm": 5.1767897605896, + "learning_rate": 4.5707070707070715e-06, + "loss": 0.32704639434814453, + "mean_token_accuracy": 0.8754568099975586, + "num_tokens": 1712748.0, + "step": 182 + }, + { + "epoch": 0.13905775075987842, + "grad_norm": 2.609168291091919, + "learning_rate": 4.595959595959597e-06, + "loss": 0.5939987301826477, + "mean_token_accuracy": 0.8034975528717041, + "num_tokens": 1719932.0, + "step": 183 + }, + { + "epoch": 0.1398176291793313, + "grad_norm": 2.2059099674224854, + "learning_rate": 4.621212121212122e-06, + "loss": 0.5310720205307007, + "mean_token_accuracy": 0.8177368640899658, + "num_tokens": 1727640.0, + "step": 184 + }, + { + "epoch": 0.1405775075987842, + "grad_norm": 2.6367759704589844, + "learning_rate": 4.646464646464647e-06, + "loss": 0.522086501121521, + "mean_token_accuracy": 0.826233983039856, + "num_tokens": 1733609.0, + "step": 185 + }, + { + "epoch": 0.1413373860182371, + "grad_norm": 3.326732873916626, + "learning_rate": 4.671717171717172e-06, + "loss": 0.4127829074859619, + "mean_token_accuracy": 0.8551101684570312, + "num_tokens": 1737256.0, + "step": 186 + }, + { + "epoch": 0.14209726443768997, + "grad_norm": 1.828412413597107, + "learning_rate": 4.696969696969698e-06, + "loss": 0.5444269180297852, + "mean_token_accuracy": 0.8350818157196045, + "num_tokens": 1750196.0, + "step": 187 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 3.209203004837036, + "learning_rate": 4.722222222222222e-06, + "loss": 0.5087994933128357, + "mean_token_accuracy": 0.8349015712738037, + "num_tokens": 1754836.0, + "step": 188 + }, + { + "epoch": 0.14361702127659576, + "grad_norm": 1.7339166402816772, + "learning_rate": 4.747474747474748e-06, + "loss": 0.5151352286338806, + "mean_token_accuracy": 0.8321266174316406, + "num_tokens": 1766015.0, + "step": 189 + }, + { + "epoch": 0.14437689969604864, + "grad_norm": 2.699068069458008, + "learning_rate": 4.772727272727273e-06, + "loss": 0.4406203031539917, + "mean_token_accuracy": 0.8425000905990601, + "num_tokens": 1771684.0, + "step": 190 + }, + { + "epoch": 0.14513677811550152, + "grad_norm": 2.8117282390594482, + "learning_rate": 4.7979797979797985e-06, + "loss": 0.40428489446640015, + "mean_token_accuracy": 0.8654326796531677, + "num_tokens": 1776301.0, + "step": 191 + }, + { + "epoch": 0.1458966565349544, + "grad_norm": 2.9204647541046143, + "learning_rate": 4.823232323232324e-06, + "loss": 0.4191770553588867, + "mean_token_accuracy": 0.8574687242507935, + "num_tokens": 1781678.0, + "step": 192 + }, + { + "epoch": 0.1466565349544073, + "grad_norm": 2.1648988723754883, + "learning_rate": 4.848484848484849e-06, + "loss": 0.5839012861251831, + "mean_token_accuracy": 0.8053664565086365, + "num_tokens": 1792516.0, + "step": 193 + }, + { + "epoch": 0.1474164133738602, + "grad_norm": 2.3221631050109863, + "learning_rate": 4.873737373737374e-06, + "loss": 0.5037894248962402, + "mean_token_accuracy": 0.8427227139472961, + "num_tokens": 1800192.0, + "step": 194 + }, + { + "epoch": 0.14817629179331307, + "grad_norm": 2.4536430835723877, + "learning_rate": 4.898989898989899e-06, + "loss": 0.42326074838638306, + "mean_token_accuracy": 0.8510633111000061, + "num_tokens": 1806159.0, + "step": 195 + }, + { + "epoch": 0.14893617021276595, + "grad_norm": 2.4875805377960205, + "learning_rate": 4.924242424242425e-06, + "loss": 0.539531409740448, + "mean_token_accuracy": 0.8060250282287598, + "num_tokens": 1813392.0, + "step": 196 + }, + { + "epoch": 0.14969604863221886, + "grad_norm": 2.1664798259735107, + "learning_rate": 4.94949494949495e-06, + "loss": 0.42502015829086304, + "mean_token_accuracy": 0.8503251075744629, + "num_tokens": 1821424.0, + "step": 197 + }, + { + "epoch": 0.15045592705167174, + "grad_norm": 2.568808078765869, + "learning_rate": 4.974747474747475e-06, + "loss": 0.5025098323822021, + "mean_token_accuracy": 0.8182311058044434, + "num_tokens": 1827225.0, + "step": 198 + }, + { + "epoch": 0.15121580547112462, + "grad_norm": 1.9116802215576172, + "learning_rate": 5e-06, + "loss": 0.4907258450984955, + "mean_token_accuracy": 0.8310189843177795, + "num_tokens": 1836297.0, + "step": 199 + }, + { + "epoch": 0.1519756838905775, + "grad_norm": 3.150765895843506, + "learning_rate": 4.999999122701883e-06, + "loss": 0.390616774559021, + "mean_token_accuracy": 0.8626647591590881, + "num_tokens": 1839984.0, + "step": 200 + }, + { + "epoch": 0.15273556231003038, + "grad_norm": 3.2229044437408447, + "learning_rate": 4.999996490808146e-06, + "loss": 0.48009657859802246, + "mean_token_accuracy": 0.825214147567749, + "num_tokens": 1844610.0, + "step": 201 + }, + { + "epoch": 0.1534954407294833, + "grad_norm": 1.4473289251327515, + "learning_rate": 4.9999921043206356e-06, + "loss": 0.40135183930397034, + "mean_token_accuracy": 0.8537827730178833, + "num_tokens": 1859573.0, + "step": 202 + }, + { + "epoch": 0.15425531914893617, + "grad_norm": 4.072319507598877, + "learning_rate": 4.999985963242432e-06, + "loss": 0.6158689260482788, + "mean_token_accuracy": 0.8075432777404785, + "num_tokens": 1863147.0, + "step": 203 + }, + { + "epoch": 0.15501519756838905, + "grad_norm": 3.15741229057312, + "learning_rate": 4.999978067577844e-06, + "loss": 0.4603108763694763, + "mean_token_accuracy": 0.8418779373168945, + "num_tokens": 1867201.0, + "step": 204 + }, + { + "epoch": 0.15577507598784193, + "grad_norm": 2.1925418376922607, + "learning_rate": 4.999968417332415e-06, + "loss": 0.5552488565444946, + "mean_token_accuracy": 0.8216016292572021, + "num_tokens": 1874837.0, + "step": 205 + }, + { + "epoch": 0.15653495440729484, + "grad_norm": 2.2518117427825928, + "learning_rate": 4.999957012512916e-06, + "loss": 0.4912569522857666, + "mean_token_accuracy": 0.8284667730331421, + "num_tokens": 1881842.0, + "step": 206 + }, + { + "epoch": 0.15729483282674772, + "grad_norm": 1.8223762512207031, + "learning_rate": 4.999943853127351e-06, + "loss": 0.47709137201309204, + "mean_token_accuracy": 0.8311659097671509, + "num_tokens": 1890805.0, + "step": 207 + }, + { + "epoch": 0.1580547112462006, + "grad_norm": 2.066499948501587, + "learning_rate": 4.999928939184958e-06, + "loss": 0.44794657826423645, + "mean_token_accuracy": 0.8513424396514893, + "num_tokens": 1898264.0, + "step": 208 + }, + { + "epoch": 0.15881458966565348, + "grad_norm": 3.53865909576416, + "learning_rate": 4.999912270696202e-06, + "loss": 0.5978270769119263, + "mean_token_accuracy": 0.8080137968063354, + "num_tokens": 1902435.0, + "step": 209 + }, + { + "epoch": 0.1595744680851064, + "grad_norm": 2.0760679244995117, + "learning_rate": 4.999893847672783e-06, + "loss": 0.5930601358413696, + "mean_token_accuracy": 0.8028650283813477, + "num_tokens": 1912252.0, + "step": 210 + }, + { + "epoch": 0.16033434650455927, + "grad_norm": 2.21551513671875, + "learning_rate": 4.99987367012763e-06, + "loss": 0.6336753964424133, + "mean_token_accuracy": 0.7902286648750305, + "num_tokens": 1922095.0, + "step": 211 + }, + { + "epoch": 0.16109422492401215, + "grad_norm": 1.7654480934143066, + "learning_rate": 4.999851738074904e-06, + "loss": 0.6373403668403625, + "mean_token_accuracy": 0.7802424430847168, + "num_tokens": 1938962.0, + "step": 212 + }, + { + "epoch": 0.16185410334346503, + "grad_norm": 2.852834701538086, + "learning_rate": 4.9998280515300006e-06, + "loss": 0.6418683528900146, + "mean_token_accuracy": 0.7895716428756714, + "num_tokens": 1944668.0, + "step": 213 + }, + { + "epoch": 0.16261398176291794, + "grad_norm": 3.4737212657928467, + "learning_rate": 4.999802610509541e-06, + "loss": 0.6323273181915283, + "mean_token_accuracy": 0.7982614636421204, + "num_tokens": 1949142.0, + "step": 214 + }, + { + "epoch": 0.16337386018237082, + "grad_norm": 3.0802664756774902, + "learning_rate": 4.999775415031381e-06, + "loss": 0.5929068326950073, + "mean_token_accuracy": 0.8112219572067261, + "num_tokens": 1954141.0, + "step": 215 + }, + { + "epoch": 0.1641337386018237, + "grad_norm": 2.9808855056762695, + "learning_rate": 4.999746465114609e-06, + "loss": 0.5556406378746033, + "mean_token_accuracy": 0.8117628693580627, + "num_tokens": 1959406.0, + "step": 216 + }, + { + "epoch": 0.16489361702127658, + "grad_norm": 1.7346166372299194, + "learning_rate": 4.999715760779541e-06, + "loss": 0.5122925043106079, + "mean_token_accuracy": 0.8040724992752075, + "num_tokens": 1971921.0, + "step": 217 + }, + { + "epoch": 0.1656534954407295, + "grad_norm": 1.4183907508850098, + "learning_rate": 4.999683302047729e-06, + "loss": 0.46471893787384033, + "mean_token_accuracy": 0.8381330966949463, + "num_tokens": 1988863.0, + "step": 218 + }, + { + "epoch": 0.16641337386018237, + "grad_norm": 1.6797802448272705, + "learning_rate": 4.999649088941951e-06, + "loss": 0.38348832726478577, + "mean_token_accuracy": 0.8344278931617737, + "num_tokens": 2000003.0, + "step": 219 + }, + { + "epoch": 0.16717325227963525, + "grad_norm": 3.036963939666748, + "learning_rate": 4.999613121486222e-06, + "loss": 0.6062780618667603, + "mean_token_accuracy": 0.8217900991439819, + "num_tokens": 2004813.0, + "step": 220 + }, + { + "epoch": 0.16793313069908813, + "grad_norm": 2.0343217849731445, + "learning_rate": 4.999575399705782e-06, + "loss": 0.5052450895309448, + "mean_token_accuracy": 0.8368623852729797, + "num_tokens": 2013565.0, + "step": 221 + }, + { + "epoch": 0.16869300911854104, + "grad_norm": 2.1162009239196777, + "learning_rate": 4.9995359236271094e-06, + "loss": 0.5169756412506104, + "mean_token_accuracy": 0.8339958190917969, + "num_tokens": 2025763.0, + "step": 222 + }, + { + "epoch": 0.16945288753799392, + "grad_norm": 2.055333375930786, + "learning_rate": 4.9994946932779076e-06, + "loss": 0.6327048540115356, + "mean_token_accuracy": 0.8078711032867432, + "num_tokens": 2037005.0, + "step": 223 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 3.334620475769043, + "learning_rate": 4.999451708687114e-06, + "loss": 0.5688358545303345, + "mean_token_accuracy": 0.8015589714050293, + "num_tokens": 2041473.0, + "step": 224 + }, + { + "epoch": 0.17097264437689969, + "grad_norm": 2.3734676837921143, + "learning_rate": 4.999406969884897e-06, + "loss": 0.5673821568489075, + "mean_token_accuracy": 0.8054057359695435, + "num_tokens": 2049397.0, + "step": 225 + }, + { + "epoch": 0.1717325227963526, + "grad_norm": 1.807358980178833, + "learning_rate": 4.999360476902656e-06, + "loss": 0.4376158118247986, + "mean_token_accuracy": 0.8456039428710938, + "num_tokens": 2058721.0, + "step": 226 + }, + { + "epoch": 0.17249240121580547, + "grad_norm": 3.231638193130493, + "learning_rate": 4.999312229773022e-06, + "loss": 0.5592809915542603, + "mean_token_accuracy": 0.8170154094696045, + "num_tokens": 2063455.0, + "step": 227 + }, + { + "epoch": 0.17325227963525835, + "grad_norm": 2.2717151641845703, + "learning_rate": 4.999262228529855e-06, + "loss": 0.6144396066665649, + "mean_token_accuracy": 0.7948470115661621, + "num_tokens": 2071686.0, + "step": 228 + }, + { + "epoch": 0.17401215805471124, + "grad_norm": 1.4171342849731445, + "learning_rate": 4.99921047320825e-06, + "loss": 0.43680912256240845, + "mean_token_accuracy": 0.84850013256073, + "num_tokens": 2086999.0, + "step": 229 + }, + { + "epoch": 0.17477203647416414, + "grad_norm": 3.162736654281616, + "learning_rate": 4.99915696384453e-06, + "loss": 0.6025407910346985, + "mean_token_accuracy": 0.8042335510253906, + "num_tokens": 2092001.0, + "step": 230 + }, + { + "epoch": 0.17553191489361702, + "grad_norm": 1.8672804832458496, + "learning_rate": 4.99910170047625e-06, + "loss": 0.5843087434768677, + "mean_token_accuracy": 0.8016980886459351, + "num_tokens": 2103372.0, + "step": 231 + }, + { + "epoch": 0.1762917933130699, + "grad_norm": 2.967587471008301, + "learning_rate": 4.999044683142196e-06, + "loss": 0.5123642086982727, + "mean_token_accuracy": 0.8216149806976318, + "num_tokens": 2108008.0, + "step": 232 + }, + { + "epoch": 0.1770516717325228, + "grad_norm": 1.9651981592178345, + "learning_rate": 4.998985911882383e-06, + "loss": 0.5868178606033325, + "mean_token_accuracy": 0.7904198169708252, + "num_tokens": 2119009.0, + "step": 233 + }, + { + "epoch": 0.1778115501519757, + "grad_norm": 2.7785449028015137, + "learning_rate": 4.998925386738063e-06, + "loss": 0.5075510144233704, + "mean_token_accuracy": 0.8280210494995117, + "num_tokens": 2124915.0, + "step": 234 + }, + { + "epoch": 0.17857142857142858, + "grad_norm": 2.957470417022705, + "learning_rate": 4.998863107751711e-06, + "loss": 0.5351958274841309, + "mean_token_accuracy": 0.846825122833252, + "num_tokens": 2129905.0, + "step": 235 + }, + { + "epoch": 0.17933130699088146, + "grad_norm": 3.207671880722046, + "learning_rate": 4.99879907496704e-06, + "loss": 0.6209091544151306, + "mean_token_accuracy": 0.789960503578186, + "num_tokens": 2135027.0, + "step": 236 + }, + { + "epoch": 0.18009118541033434, + "grad_norm": 2.018953800201416, + "learning_rate": 4.998733288428987e-06, + "loss": 0.601510763168335, + "mean_token_accuracy": 0.8136930465698242, + "num_tokens": 2147016.0, + "step": 237 + }, + { + "epoch": 0.18085106382978725, + "grad_norm": 2.437281847000122, + "learning_rate": 4.998665748183727e-06, + "loss": 0.5813639163970947, + "mean_token_accuracy": 0.8116716146469116, + "num_tokens": 2155386.0, + "step": 238 + }, + { + "epoch": 0.18161094224924013, + "grad_norm": 1.5708180665969849, + "learning_rate": 4.998596454278661e-06, + "loss": 0.5252395272254944, + "mean_token_accuracy": 0.8193864822387695, + "num_tokens": 2170295.0, + "step": 239 + }, + { + "epoch": 0.182370820668693, + "grad_norm": 1.9921495914459229, + "learning_rate": 4.998525406762422e-06, + "loss": 0.5335029363632202, + "mean_token_accuracy": 0.8120872974395752, + "num_tokens": 2180012.0, + "step": 240 + }, + { + "epoch": 0.1831306990881459, + "grad_norm": 2.6562681198120117, + "learning_rate": 4.998452605684874e-06, + "loss": 0.48021435737609863, + "mean_token_accuracy": 0.8388714790344238, + "num_tokens": 2185607.0, + "step": 241 + }, + { + "epoch": 0.1838905775075988, + "grad_norm": 2.2535853385925293, + "learning_rate": 4.998378051097111e-06, + "loss": 0.5747300386428833, + "mean_token_accuracy": 0.8004639148712158, + "num_tokens": 2194105.0, + "step": 242 + }, + { + "epoch": 0.18465045592705168, + "grad_norm": 1.6151788234710693, + "learning_rate": 4.998301743051459e-06, + "loss": 0.6190565824508667, + "mean_token_accuracy": 0.7816627621650696, + "num_tokens": 2210629.0, + "step": 243 + }, + { + "epoch": 0.18541033434650456, + "grad_norm": 2.1088173389434814, + "learning_rate": 4.9982236816014735e-06, + "loss": 0.4715560972690582, + "mean_token_accuracy": 0.8485721349716187, + "num_tokens": 2218958.0, + "step": 244 + }, + { + "epoch": 0.18617021276595744, + "grad_norm": 2.6168735027313232, + "learning_rate": 4.998143866801941e-06, + "loss": 0.6077103018760681, + "mean_token_accuracy": 0.8057924509048462, + "num_tokens": 2226368.0, + "step": 245 + }, + { + "epoch": 0.18693009118541035, + "grad_norm": 2.5988616943359375, + "learning_rate": 4.99806229870888e-06, + "loss": 0.5021637678146362, + "mean_token_accuracy": 0.8361666202545166, + "num_tokens": 2232485.0, + "step": 246 + }, + { + "epoch": 0.18768996960486323, + "grad_norm": 2.015887498855591, + "learning_rate": 4.9979789773795365e-06, + "loss": 0.4309737980365753, + "mean_token_accuracy": 0.8508044481277466, + "num_tokens": 2240819.0, + "step": 247 + }, + { + "epoch": 0.1884498480243161, + "grad_norm": 2.3115265369415283, + "learning_rate": 4.997893902872389e-06, + "loss": 0.5776500701904297, + "mean_token_accuracy": 0.8079549074172974, + "num_tokens": 2249460.0, + "step": 248 + }, + { + "epoch": 0.189209726443769, + "grad_norm": 1.7387021780014038, + "learning_rate": 4.997807075247147e-06, + "loss": 0.430944561958313, + "mean_token_accuracy": 0.8483544588088989, + "num_tokens": 2259124.0, + "step": 249 + }, + { + "epoch": 0.1899696048632219, + "grad_norm": 1.6378381252288818, + "learning_rate": 4.997718494564747e-06, + "loss": 0.4123363792896271, + "mean_token_accuracy": 0.8557409644126892, + "num_tokens": 2269899.0, + "step": 250 + }, + { + "epoch": 0.19072948328267478, + "grad_norm": 1.336282730102539, + "learning_rate": 4.997628160887361e-06, + "loss": 0.502329409122467, + "mean_token_accuracy": 0.8186938166618347, + "num_tokens": 2292821.0, + "step": 251 + }, + { + "epoch": 0.19148936170212766, + "grad_norm": 3.3335583209991455, + "learning_rate": 4.997536074278388e-06, + "loss": 0.584446907043457, + "mean_token_accuracy": 0.8062717318534851, + "num_tokens": 2297175.0, + "step": 252 + }, + { + "epoch": 0.19224924012158054, + "grad_norm": 2.246727228164673, + "learning_rate": 4.9974422348024565e-06, + "loss": 0.5683060884475708, + "mean_token_accuracy": 0.8193703293800354, + "num_tokens": 2305456.0, + "step": 253 + }, + { + "epoch": 0.19300911854103345, + "grad_norm": 2.3520865440368652, + "learning_rate": 4.997346642525429e-06, + "loss": 0.4724946618080139, + "mean_token_accuracy": 0.8426719307899475, + "num_tokens": 2312241.0, + "step": 254 + }, + { + "epoch": 0.19376899696048633, + "grad_norm": 2.7115702629089355, + "learning_rate": 4.9972492975143936e-06, + "loss": 0.5019032955169678, + "mean_token_accuracy": 0.8253573179244995, + "num_tokens": 2318094.0, + "step": 255 + }, + { + "epoch": 0.1945288753799392, + "grad_norm": 1.705528974533081, + "learning_rate": 4.997150199837671e-06, + "loss": 0.45588475465774536, + "mean_token_accuracy": 0.836666464805603, + "num_tokens": 2329025.0, + "step": 256 + }, + { + "epoch": 0.1952887537993921, + "grad_norm": 2.161400318145752, + "learning_rate": 4.997049349564814e-06, + "loss": 0.5170183777809143, + "mean_token_accuracy": 0.8287534117698669, + "num_tokens": 2337448.0, + "step": 257 + }, + { + "epoch": 0.196048632218845, + "grad_norm": 2.629669189453125, + "learning_rate": 4.996946746766602e-06, + "loss": 0.44650501012802124, + "mean_token_accuracy": 0.850114107131958, + "num_tokens": 2343207.0, + "step": 258 + }, + { + "epoch": 0.19680851063829788, + "grad_norm": 1.6735503673553467, + "learning_rate": 4.996842391515045e-06, + "loss": 0.5247820019721985, + "mean_token_accuracy": 0.8285071849822998, + "num_tokens": 2356801.0, + "step": 259 + }, + { + "epoch": 0.19756838905775076, + "grad_norm": 1.2753115892410278, + "learning_rate": 4.996736283883382e-06, + "loss": 0.41870927810668945, + "mean_token_accuracy": 0.8448047637939453, + "num_tokens": 2377306.0, + "step": 260 + }, + { + "epoch": 0.19832826747720364, + "grad_norm": 2.6947314739227295, + "learning_rate": 4.9966284239460875e-06, + "loss": 0.5059205889701843, + "mean_token_accuracy": 0.8430814743041992, + "num_tokens": 2383352.0, + "step": 261 + }, + { + "epoch": 0.19908814589665655, + "grad_norm": 2.0509963035583496, + "learning_rate": 4.996518811778858e-06, + "loss": 0.4565388560295105, + "mean_token_accuracy": 0.8453130722045898, + "num_tokens": 2391149.0, + "step": 262 + }, + { + "epoch": 0.19984802431610943, + "grad_norm": 2.1856348514556885, + "learning_rate": 4.996407447458626e-06, + "loss": 0.531380832195282, + "mean_token_accuracy": 0.8387004137039185, + "num_tokens": 2399875.0, + "step": 263 + }, + { + "epoch": 0.2006079027355623, + "grad_norm": 2.7348573207855225, + "learning_rate": 4.99629433106355e-06, + "loss": 0.5242817401885986, + "mean_token_accuracy": 0.8177423477172852, + "num_tokens": 2406586.0, + "step": 264 + }, + { + "epoch": 0.2013677811550152, + "grad_norm": 1.76587975025177, + "learning_rate": 4.99617946267302e-06, + "loss": 0.49298471212387085, + "mean_token_accuracy": 0.8271149396896362, + "num_tokens": 2418683.0, + "step": 265 + }, + { + "epoch": 0.20212765957446807, + "grad_norm": 2.8129730224609375, + "learning_rate": 4.996062842367655e-06, + "loss": 0.46420302987098694, + "mean_token_accuracy": 0.8453244566917419, + "num_tokens": 2422929.0, + "step": 266 + }, + { + "epoch": 0.20288753799392098, + "grad_norm": 2.575744152069092, + "learning_rate": 4.9959444702293025e-06, + "loss": 0.43208545446395874, + "mean_token_accuracy": 0.8494843244552612, + "num_tokens": 2429567.0, + "step": 267 + }, + { + "epoch": 0.20364741641337386, + "grad_norm": 2.7586750984191895, + "learning_rate": 4.995824346341041e-06, + "loss": 0.4390473961830139, + "mean_token_accuracy": 0.8348895311355591, + "num_tokens": 2434700.0, + "step": 268 + }, + { + "epoch": 0.20440729483282674, + "grad_norm": 1.972145438194275, + "learning_rate": 4.99570247078718e-06, + "loss": 0.6219544410705566, + "mean_token_accuracy": 0.7939999103546143, + "num_tokens": 2447007.0, + "step": 269 + }, + { + "epoch": 0.20516717325227962, + "grad_norm": 2.2963485717773438, + "learning_rate": 4.995578843653255e-06, + "loss": 0.5008970499038696, + "mean_token_accuracy": 0.8255308866500854, + "num_tokens": 2453936.0, + "step": 270 + }, + { + "epoch": 0.20592705167173253, + "grad_norm": 1.8897721767425537, + "learning_rate": 4.995453465026033e-06, + "loss": 0.5436089038848877, + "mean_token_accuracy": 0.819086492061615, + "num_tokens": 2464494.0, + "step": 271 + }, + { + "epoch": 0.2066869300911854, + "grad_norm": 2.319728374481201, + "learning_rate": 4.995326334993508e-06, + "loss": 0.5136368870735168, + "mean_token_accuracy": 0.820817232131958, + "num_tokens": 2470938.0, + "step": 272 + }, + { + "epoch": 0.2074468085106383, + "grad_norm": 2.230414390563965, + "learning_rate": 4.9951974536449055e-06, + "loss": 0.5272846817970276, + "mean_token_accuracy": 0.8203279972076416, + "num_tokens": 2478629.0, + "step": 273 + }, + { + "epoch": 0.20820668693009117, + "grad_norm": 3.401937484741211, + "learning_rate": 4.9950668210706795e-06, + "loss": 0.4389592111110687, + "mean_token_accuracy": 0.8647899031639099, + "num_tokens": 2482193.0, + "step": 274 + }, + { + "epoch": 0.20896656534954408, + "grad_norm": 2.1278507709503174, + "learning_rate": 4.994934437362513e-06, + "loss": 0.598863422870636, + "mean_token_accuracy": 0.7945119738578796, + "num_tokens": 2492465.0, + "step": 275 + }, + { + "epoch": 0.20972644376899696, + "grad_norm": 1.9259960651397705, + "learning_rate": 4.994800302613318e-06, + "loss": 0.49520939588546753, + "mean_token_accuracy": 0.8371536135673523, + "num_tokens": 2500825.0, + "step": 276 + }, + { + "epoch": 0.21048632218844984, + "grad_norm": 2.346418857574463, + "learning_rate": 4.994664416917236e-06, + "loss": 0.5412614345550537, + "mean_token_accuracy": 0.810661792755127, + "num_tokens": 2509513.0, + "step": 277 + }, + { + "epoch": 0.21124620060790272, + "grad_norm": 1.3092039823532104, + "learning_rate": 4.994526780369636e-06, + "loss": 0.46305379271507263, + "mean_token_accuracy": 0.8358527421951294, + "num_tokens": 2531405.0, + "step": 278 + }, + { + "epoch": 0.21200607902735563, + "grad_norm": 2.924611806869507, + "learning_rate": 4.9943873930671175e-06, + "loss": 0.6134544610977173, + "mean_token_accuracy": 0.7947378754615784, + "num_tokens": 2536744.0, + "step": 279 + }, + { + "epoch": 0.2127659574468085, + "grad_norm": 2.8290598392486572, + "learning_rate": 4.994246255107506e-06, + "loss": 0.465520441532135, + "mean_token_accuracy": 0.8440108299255371, + "num_tokens": 2541184.0, + "step": 280 + }, + { + "epoch": 0.2135258358662614, + "grad_norm": 3.8081259727478027, + "learning_rate": 4.994103366589859e-06, + "loss": 0.43394139409065247, + "mean_token_accuracy": 0.8579148054122925, + "num_tokens": 2545395.0, + "step": 281 + }, + { + "epoch": 0.21428571428571427, + "grad_norm": 1.7994529008865356, + "learning_rate": 4.993958727614462e-06, + "loss": 0.5076484680175781, + "mean_token_accuracy": 0.8270803093910217, + "num_tokens": 2556541.0, + "step": 282 + }, + { + "epoch": 0.21504559270516718, + "grad_norm": 2.5582659244537354, + "learning_rate": 4.993812338282826e-06, + "loss": 0.4453684389591217, + "mean_token_accuracy": 0.8488293886184692, + "num_tokens": 2562949.0, + "step": 283 + }, + { + "epoch": 0.21580547112462006, + "grad_norm": 1.6448938846588135, + "learning_rate": 4.993664198697694e-06, + "loss": 0.461971640586853, + "mean_token_accuracy": 0.824763298034668, + "num_tokens": 2576407.0, + "step": 284 + }, + { + "epoch": 0.21656534954407294, + "grad_norm": 2.1264469623565674, + "learning_rate": 4.993514308963037e-06, + "loss": 0.6241602897644043, + "mean_token_accuracy": 0.7916014790534973, + "num_tokens": 2585695.0, + "step": 285 + }, + { + "epoch": 0.21732522796352582, + "grad_norm": 3.629991292953491, + "learning_rate": 4.993362669184051e-06, + "loss": 0.610355019569397, + "mean_token_accuracy": 0.7847568988800049, + "num_tokens": 2589778.0, + "step": 286 + }, + { + "epoch": 0.21808510638297873, + "grad_norm": 1.9070756435394287, + "learning_rate": 4.993209279467164e-06, + "loss": 0.5513623952865601, + "mean_token_accuracy": 0.7911607027053833, + "num_tokens": 2600920.0, + "step": 287 + }, + { + "epoch": 0.2188449848024316, + "grad_norm": 1.761062741279602, + "learning_rate": 4.993054139920031e-06, + "loss": 0.4579957127571106, + "mean_token_accuracy": 0.8189530372619629, + "num_tokens": 2611856.0, + "step": 288 + }, + { + "epoch": 0.2196048632218845, + "grad_norm": 1.7264713048934937, + "learning_rate": 4.992897250651535e-06, + "loss": 0.5871305465698242, + "mean_token_accuracy": 0.7918527126312256, + "num_tokens": 2624730.0, + "step": 289 + }, + { + "epoch": 0.22036474164133737, + "grad_norm": 1.7455977201461792, + "learning_rate": 4.992738611771787e-06, + "loss": 0.5475119948387146, + "mean_token_accuracy": 0.8226917386054993, + "num_tokens": 2635705.0, + "step": 290 + }, + { + "epoch": 0.22112462006079028, + "grad_norm": 2.095095157623291, + "learning_rate": 4.992578223392124e-06, + "loss": 0.5952225923538208, + "mean_token_accuracy": 0.8078469038009644, + "num_tokens": 2643954.0, + "step": 291 + }, + { + "epoch": 0.22188449848024316, + "grad_norm": 2.994664192199707, + "learning_rate": 4.992416085625115e-06, + "loss": 0.5432442426681519, + "mean_token_accuracy": 0.8329008221626282, + "num_tokens": 2648800.0, + "step": 292 + }, + { + "epoch": 0.22264437689969604, + "grad_norm": 2.796790361404419, + "learning_rate": 4.992252198584554e-06, + "loss": 0.5168961882591248, + "mean_token_accuracy": 0.8393474817276001, + "num_tokens": 2653546.0, + "step": 293 + }, + { + "epoch": 0.22340425531914893, + "grad_norm": 1.8610522747039795, + "learning_rate": 4.992086562385462e-06, + "loss": 0.5728024244308472, + "mean_token_accuracy": 0.797406792640686, + "num_tokens": 2667483.0, + "step": 294 + }, + { + "epoch": 0.22416413373860183, + "grad_norm": 1.695472002029419, + "learning_rate": 4.9919191771440905e-06, + "loss": 0.5460028648376465, + "mean_token_accuracy": 0.8123016357421875, + "num_tokens": 2683574.0, + "step": 295 + }, + { + "epoch": 0.22492401215805471, + "grad_norm": 2.8627376556396484, + "learning_rate": 4.9917500429779165e-06, + "loss": 0.5566985011100769, + "mean_token_accuracy": 0.815531313419342, + "num_tokens": 2688985.0, + "step": 296 + }, + { + "epoch": 0.2256838905775076, + "grad_norm": 2.73323655128479, + "learning_rate": 4.991579160005644e-06, + "loss": 0.48197102546691895, + "mean_token_accuracy": 0.8471829295158386, + "num_tokens": 2694799.0, + "step": 297 + }, + { + "epoch": 0.22644376899696048, + "grad_norm": 1.8436161279678345, + "learning_rate": 4.991406528347206e-06, + "loss": 0.4528339207172394, + "mean_token_accuracy": 0.8603188395500183, + "num_tokens": 2707321.0, + "step": 298 + }, + { + "epoch": 0.22720364741641338, + "grad_norm": 2.6231515407562256, + "learning_rate": 4.9912321481237616e-06, + "loss": 0.5916541814804077, + "mean_token_accuracy": 0.8050242066383362, + "num_tokens": 2714233.0, + "step": 299 + }, + { + "epoch": 0.22796352583586627, + "grad_norm": 3.08776593208313, + "learning_rate": 4.991056019457697e-06, + "loss": 0.4860580563545227, + "mean_token_accuracy": 0.8464088439941406, + "num_tokens": 2718443.0, + "step": 300 + }, + { + "epoch": 0.22872340425531915, + "grad_norm": 2.2537803649902344, + "learning_rate": 4.990878142472628e-06, + "loss": 0.5158311128616333, + "mean_token_accuracy": 0.824694812297821, + "num_tokens": 2726158.0, + "step": 301 + }, + { + "epoch": 0.22948328267477203, + "grad_norm": 2.1122705936431885, + "learning_rate": 4.990698517293394e-06, + "loss": 0.495265394449234, + "mean_token_accuracy": 0.8343238830566406, + "num_tokens": 2735022.0, + "step": 302 + }, + { + "epoch": 0.23024316109422494, + "grad_norm": 3.5503528118133545, + "learning_rate": 4.9905171440460645e-06, + "loss": 0.46063232421875, + "mean_token_accuracy": 0.8420047760009766, + "num_tokens": 2738550.0, + "step": 303 + }, + { + "epoch": 0.23100303951367782, + "grad_norm": 3.9858486652374268, + "learning_rate": 4.990334022857932e-06, + "loss": 0.5832710266113281, + "mean_token_accuracy": 0.8144199848175049, + "num_tokens": 2741720.0, + "step": 304 + }, + { + "epoch": 0.2317629179331307, + "grad_norm": 2.407231330871582, + "learning_rate": 4.990149153857519e-06, + "loss": 0.4692630171775818, + "mean_token_accuracy": 0.8429223299026489, + "num_tokens": 2748693.0, + "step": 305 + }, + { + "epoch": 0.23252279635258358, + "grad_norm": 1.6996397972106934, + "learning_rate": 4.989962537174573e-06, + "loss": 0.49143946170806885, + "mean_token_accuracy": 0.8340128064155579, + "num_tokens": 2761254.0, + "step": 306 + }, + { + "epoch": 0.23328267477203649, + "grad_norm": 3.746432065963745, + "learning_rate": 4.989774172940071e-06, + "loss": 0.6282026767730713, + "mean_token_accuracy": 0.775698184967041, + "num_tokens": 2765115.0, + "step": 307 + }, + { + "epoch": 0.23404255319148937, + "grad_norm": 2.212872266769409, + "learning_rate": 4.989584061286211e-06, + "loss": 0.5193763971328735, + "mean_token_accuracy": 0.8168246746063232, + "num_tokens": 2772345.0, + "step": 308 + }, + { + "epoch": 0.23480243161094225, + "grad_norm": 1.752297282218933, + "learning_rate": 4.989392202346423e-06, + "loss": 0.4437984824180603, + "mean_token_accuracy": 0.8451256155967712, + "num_tokens": 2783072.0, + "step": 309 + }, + { + "epoch": 0.23556231003039513, + "grad_norm": 2.386019706726074, + "learning_rate": 4.989198596255361e-06, + "loss": 0.4090752899646759, + "mean_token_accuracy": 0.8480085134506226, + "num_tokens": 2788757.0, + "step": 310 + }, + { + "epoch": 0.23632218844984804, + "grad_norm": 3.9981489181518555, + "learning_rate": 4.989003243148904e-06, + "loss": 0.5149132013320923, + "mean_token_accuracy": 0.8179056644439697, + "num_tokens": 2792096.0, + "step": 311 + }, + { + "epoch": 0.23708206686930092, + "grad_norm": 1.8723100423812866, + "learning_rate": 4.988806143164159e-06, + "loss": 0.4531487822532654, + "mean_token_accuracy": 0.8400167226791382, + "num_tokens": 2802210.0, + "step": 312 + }, + { + "epoch": 0.2378419452887538, + "grad_norm": 2.3415136337280273, + "learning_rate": 4.988607296439459e-06, + "loss": 0.5974439978599548, + "mean_token_accuracy": 0.8035976886749268, + "num_tokens": 2810088.0, + "step": 313 + }, + { + "epoch": 0.23860182370820668, + "grad_norm": 1.5317577123641968, + "learning_rate": 4.98840670311436e-06, + "loss": 0.49247145652770996, + "mean_token_accuracy": 0.8292540311813354, + "num_tokens": 2824005.0, + "step": 314 + }, + { + "epoch": 0.2393617021276596, + "grad_norm": 2.170772075653076, + "learning_rate": 4.988204363329648e-06, + "loss": 0.6359974145889282, + "mean_token_accuracy": 0.7785564661026001, + "num_tokens": 2834680.0, + "step": 315 + }, + { + "epoch": 0.24012158054711247, + "grad_norm": 3.2655932903289795, + "learning_rate": 4.988000277227334e-06, + "loss": 0.5080196857452393, + "mean_token_accuracy": 0.8295877575874329, + "num_tokens": 2838735.0, + "step": 316 + }, + { + "epoch": 0.24088145896656535, + "grad_norm": 3.406589984893799, + "learning_rate": 4.987794444950651e-06, + "loss": 0.3939085006713867, + "mean_token_accuracy": 0.8700719475746155, + "num_tokens": 2842127.0, + "step": 317 + }, + { + "epoch": 0.24164133738601823, + "grad_norm": 1.8211106061935425, + "learning_rate": 4.987586866644061e-06, + "loss": 0.5270540118217468, + "mean_token_accuracy": 0.826683521270752, + "num_tokens": 2853656.0, + "step": 318 + }, + { + "epoch": 0.24240121580547114, + "grad_norm": 1.8429969549179077, + "learning_rate": 4.9873775424532515e-06, + "loss": 0.4705049991607666, + "mean_token_accuracy": 0.8355701565742493, + "num_tokens": 2863513.0, + "step": 319 + }, + { + "epoch": 0.24316109422492402, + "grad_norm": 2.2425320148468018, + "learning_rate": 4.9871664725251314e-06, + "loss": 0.485736608505249, + "mean_token_accuracy": 0.835182785987854, + "num_tokens": 2871556.0, + "step": 320 + }, + { + "epoch": 0.2439209726443769, + "grad_norm": 1.6202056407928467, + "learning_rate": 4.986953657007841e-06, + "loss": 0.4437887370586395, + "mean_token_accuracy": 0.8282591700553894, + "num_tokens": 2884335.0, + "step": 321 + }, + { + "epoch": 0.24468085106382978, + "grad_norm": 1.1027268171310425, + "learning_rate": 4.98673909605074e-06, + "loss": 0.3770800828933716, + "mean_token_accuracy": 0.8325437307357788, + "num_tokens": 2904286.0, + "step": 322 + }, + { + "epoch": 0.2454407294832827, + "grad_norm": 2.3239076137542725, + "learning_rate": 4.986522789804417e-06, + "loss": 0.5387254953384399, + "mean_token_accuracy": 0.806242823600769, + "num_tokens": 2910975.0, + "step": 323 + }, + { + "epoch": 0.24620060790273557, + "grad_norm": 2.243482828140259, + "learning_rate": 4.986304738420684e-06, + "loss": 0.4396553039550781, + "mean_token_accuracy": 0.8561904430389404, + "num_tokens": 2917087.0, + "step": 324 + }, + { + "epoch": 0.24696048632218845, + "grad_norm": 2.537264347076416, + "learning_rate": 4.986084942052577e-06, + "loss": 0.395110160112381, + "mean_token_accuracy": 0.8636915683746338, + "num_tokens": 2921887.0, + "step": 325 + }, + { + "epoch": 0.24772036474164133, + "grad_norm": 2.319399118423462, + "learning_rate": 4.9858634008543574e-06, + "loss": 0.581517219543457, + "mean_token_accuracy": 0.8157487511634827, + "num_tokens": 2928996.0, + "step": 326 + }, + { + "epoch": 0.24848024316109424, + "grad_norm": 1.9787474870681763, + "learning_rate": 4.985640114981513e-06, + "loss": 0.5084106922149658, + "mean_token_accuracy": 0.835221529006958, + "num_tokens": 2940302.0, + "step": 327 + }, + { + "epoch": 0.24924012158054712, + "grad_norm": 2.4783265590667725, + "learning_rate": 4.985415084590752e-06, + "loss": 0.6062222719192505, + "mean_token_accuracy": 0.7885516285896301, + "num_tokens": 2946386.0, + "step": 328 + }, + { + "epoch": 0.25, + "grad_norm": 2.4081411361694336, + "learning_rate": 4.985188309840012e-06, + "loss": 0.5079880356788635, + "mean_token_accuracy": 0.8313904404640198, + "num_tokens": 2952323.0, + "step": 329 + }, + { + "epoch": 0.2507598784194529, + "grad_norm": 2.64993953704834, + "learning_rate": 4.984959790888451e-06, + "loss": 0.5461447834968567, + "mean_token_accuracy": 0.8125468492507935, + "num_tokens": 2958119.0, + "step": 330 + }, + { + "epoch": 0.25151975683890576, + "grad_norm": 2.549734115600586, + "learning_rate": 4.984729527896451e-06, + "loss": 0.5998573303222656, + "mean_token_accuracy": 0.8076666593551636, + "num_tokens": 2964947.0, + "step": 331 + }, + { + "epoch": 0.25227963525835867, + "grad_norm": 3.2185161113739014, + "learning_rate": 4.984497521025622e-06, + "loss": 0.4232945442199707, + "mean_token_accuracy": 0.8543803095817566, + "num_tokens": 2968598.0, + "step": 332 + }, + { + "epoch": 0.2530395136778115, + "grad_norm": 2.588994264602661, + "learning_rate": 4.984263770438793e-06, + "loss": 0.460967481136322, + "mean_token_accuracy": 0.8416207432746887, + "num_tokens": 2974510.0, + "step": 333 + }, + { + "epoch": 0.25379939209726443, + "grad_norm": 2.1373162269592285, + "learning_rate": 4.984028276300021e-06, + "loss": 0.49382102489471436, + "mean_token_accuracy": 0.8388048410415649, + "num_tokens": 2981632.0, + "step": 334 + }, + { + "epoch": 0.25455927051671734, + "grad_norm": 2.2524826526641846, + "learning_rate": 4.983791038774585e-06, + "loss": 0.4947671890258789, + "mean_token_accuracy": 0.8066365122795105, + "num_tokens": 2988736.0, + "step": 335 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 1.7244199514389038, + "learning_rate": 4.983552058028985e-06, + "loss": 0.48096776008605957, + "mean_token_accuracy": 0.830735445022583, + "num_tokens": 3003576.0, + "step": 336 + }, + { + "epoch": 0.2560790273556231, + "grad_norm": 3.0628933906555176, + "learning_rate": 4.9833113342309495e-06, + "loss": 0.6027032136917114, + "mean_token_accuracy": 0.8008694648742676, + "num_tokens": 3009549.0, + "step": 337 + }, + { + "epoch": 0.256838905775076, + "grad_norm": 2.438674211502075, + "learning_rate": 4.983068867549427e-06, + "loss": 0.517090916633606, + "mean_token_accuracy": 0.827893853187561, + "num_tokens": 3015236.0, + "step": 338 + }, + { + "epoch": 0.25759878419452886, + "grad_norm": 2.131535053253174, + "learning_rate": 4.982824658154589e-06, + "loss": 0.6656812429428101, + "mean_token_accuracy": 0.7772425413131714, + "num_tokens": 3028142.0, + "step": 339 + }, + { + "epoch": 0.25835866261398177, + "grad_norm": 2.3206584453582764, + "learning_rate": 4.9825787062178315e-06, + "loss": 0.5757625699043274, + "mean_token_accuracy": 0.8073873519897461, + "num_tokens": 3040996.0, + "step": 340 + }, + { + "epoch": 0.2591185410334346, + "grad_norm": 1.3905521631240845, + "learning_rate": 4.982331011911774e-06, + "loss": 0.4193805456161499, + "mean_token_accuracy": 0.8399466872215271, + "num_tokens": 3061931.0, + "step": 341 + }, + { + "epoch": 0.25987841945288753, + "grad_norm": 2.184173345565796, + "learning_rate": 4.982081575410256e-06, + "loss": 0.4751223921775818, + "mean_token_accuracy": 0.8409271240234375, + "num_tokens": 3069081.0, + "step": 342 + }, + { + "epoch": 0.26063829787234044, + "grad_norm": 3.538764238357544, + "learning_rate": 4.9818303968883445e-06, + "loss": 0.8119601011276245, + "mean_token_accuracy": 0.7442739009857178, + "num_tokens": 3073628.0, + "step": 343 + }, + { + "epoch": 0.2613981762917933, + "grad_norm": 1.8063762187957764, + "learning_rate": 4.981577476522323e-06, + "loss": 0.5615730881690979, + "mean_token_accuracy": 0.8207751512527466, + "num_tokens": 3086596.0, + "step": 344 + }, + { + "epoch": 0.2621580547112462, + "grad_norm": 2.4346961975097656, + "learning_rate": 4.981322814489703e-06, + "loss": 0.5266709327697754, + "mean_token_accuracy": 0.8211277723312378, + "num_tokens": 3092631.0, + "step": 345 + }, + { + "epoch": 0.2629179331306991, + "grad_norm": 1.91289484500885, + "learning_rate": 4.981066410969215e-06, + "loss": 0.5047177672386169, + "mean_token_accuracy": 0.8356877565383911, + "num_tokens": 3101102.0, + "step": 346 + }, + { + "epoch": 0.26367781155015196, + "grad_norm": 2.1495707035064697, + "learning_rate": 4.980808266140813e-06, + "loss": 0.47876280546188354, + "mean_token_accuracy": 0.8364313244819641, + "num_tokens": 3107998.0, + "step": 347 + }, + { + "epoch": 0.26443768996960487, + "grad_norm": 2.5961992740631104, + "learning_rate": 4.9805483801856744e-06, + "loss": 0.5512958765029907, + "mean_token_accuracy": 0.8181467652320862, + "num_tokens": 3113848.0, + "step": 348 + }, + { + "epoch": 0.2651975683890577, + "grad_norm": 3.2828900814056396, + "learning_rate": 4.980286753286196e-06, + "loss": 0.4217945635318756, + "mean_token_accuracy": 0.8617103099822998, + "num_tokens": 3117652.0, + "step": 349 + }, + { + "epoch": 0.26595744680851063, + "grad_norm": 1.425554871559143, + "learning_rate": 4.980023385625996e-06, + "loss": 0.4042487144470215, + "mean_token_accuracy": 0.8492785692214966, + "num_tokens": 3132336.0, + "step": 350 + }, + { + "epoch": 0.26671732522796354, + "grad_norm": 2.933504104614258, + "learning_rate": 4.979758277389919e-06, + "loss": 0.5406704545021057, + "mean_token_accuracy": 0.8035423755645752, + "num_tokens": 3137544.0, + "step": 351 + }, + { + "epoch": 0.2674772036474164, + "grad_norm": 1.9958966970443726, + "learning_rate": 4.9794914287640264e-06, + "loss": 0.5857555270195007, + "mean_token_accuracy": 0.7965140342712402, + "num_tokens": 3149705.0, + "step": 352 + }, + { + "epoch": 0.2682370820668693, + "grad_norm": 2.467694044113159, + "learning_rate": 4.979222839935602e-06, + "loss": 0.6404043436050415, + "mean_token_accuracy": 0.7823755741119385, + "num_tokens": 3158353.0, + "step": 353 + }, + { + "epoch": 0.2689969604863222, + "grad_norm": 2.0102720260620117, + "learning_rate": 4.9789525110931545e-06, + "loss": 0.5681496858596802, + "mean_token_accuracy": 0.8108169436454773, + "num_tokens": 3167121.0, + "step": 354 + }, + { + "epoch": 0.26975683890577506, + "grad_norm": 2.6017866134643555, + "learning_rate": 4.978680442426409e-06, + "loss": 0.6309828162193298, + "mean_token_accuracy": 0.7742617130279541, + "num_tokens": 3175012.0, + "step": 355 + }, + { + "epoch": 0.270516717325228, + "grad_norm": 1.8799268007278442, + "learning_rate": 4.978406634126315e-06, + "loss": 0.524029016494751, + "mean_token_accuracy": 0.8317689895629883, + "num_tokens": 3185331.0, + "step": 356 + }, + { + "epoch": 0.2712765957446808, + "grad_norm": 1.508332371711731, + "learning_rate": 4.978131086385041e-06, + "loss": 0.46656402945518494, + "mean_token_accuracy": 0.8339117765426636, + "num_tokens": 3198813.0, + "step": 357 + }, + { + "epoch": 0.27203647416413373, + "grad_norm": 3.595707654953003, + "learning_rate": 4.977853799395976e-06, + "loss": 0.5101234912872314, + "mean_token_accuracy": 0.8251723051071167, + "num_tokens": 3206557.0, + "step": 358 + }, + { + "epoch": 0.27279635258358664, + "grad_norm": 3.5317916870117188, + "learning_rate": 4.977574773353732e-06, + "loss": 0.5684665441513062, + "mean_token_accuracy": 0.8124493360519409, + "num_tokens": 3210912.0, + "step": 359 + }, + { + "epoch": 0.2735562310030395, + "grad_norm": 2.8606204986572266, + "learning_rate": 4.97729400845414e-06, + "loss": 0.4746384620666504, + "mean_token_accuracy": 0.8195606470108032, + "num_tokens": 3215365.0, + "step": 360 + }, + { + "epoch": 0.2743161094224924, + "grad_norm": 1.8214033842086792, + "learning_rate": 4.977011504894253e-06, + "loss": 0.4842769503593445, + "mean_token_accuracy": 0.82928866147995, + "num_tokens": 3224037.0, + "step": 361 + }, + { + "epoch": 0.2750759878419453, + "grad_norm": 1.628746509552002, + "learning_rate": 4.97672726287234e-06, + "loss": 0.4397493302822113, + "mean_token_accuracy": 0.8606528043746948, + "num_tokens": 3235589.0, + "step": 362 + }, + { + "epoch": 0.27583586626139817, + "grad_norm": 3.557973861694336, + "learning_rate": 4.976441282587894e-06, + "loss": 0.5732032060623169, + "mean_token_accuracy": 0.8041545748710632, + "num_tokens": 3239958.0, + "step": 363 + }, + { + "epoch": 0.2765957446808511, + "grad_norm": 1.3467901945114136, + "learning_rate": 4.9761535642416284e-06, + "loss": 0.4525323510169983, + "mean_token_accuracy": 0.8281061053276062, + "num_tokens": 3257703.0, + "step": 364 + }, + { + "epoch": 0.2773556231003039, + "grad_norm": 2.2649986743927, + "learning_rate": 4.9758641080354745e-06, + "loss": 0.5074734687805176, + "mean_token_accuracy": 0.8447474241256714, + "num_tokens": 3264334.0, + "step": 365 + }, + { + "epoch": 0.27811550151975684, + "grad_norm": 2.8667566776275635, + "learning_rate": 4.975572914172581e-06, + "loss": 0.5759559869766235, + "mean_token_accuracy": 0.7976793050765991, + "num_tokens": 3269314.0, + "step": 366 + }, + { + "epoch": 0.27887537993920974, + "grad_norm": 2.2514986991882324, + "learning_rate": 4.975279982857324e-06, + "loss": 0.5786465406417847, + "mean_token_accuracy": 0.8058781623840332, + "num_tokens": 3277324.0, + "step": 367 + }, + { + "epoch": 0.2796352583586626, + "grad_norm": 1.3826723098754883, + "learning_rate": 4.97498531429529e-06, + "loss": 0.40801727771759033, + "mean_token_accuracy": 0.8601310849189758, + "num_tokens": 3290530.0, + "step": 368 + }, + { + "epoch": 0.2803951367781155, + "grad_norm": 2.084092617034912, + "learning_rate": 4.97468890869329e-06, + "loss": 0.47076648473739624, + "mean_token_accuracy": 0.8310186862945557, + "num_tokens": 3298325.0, + "step": 369 + }, + { + "epoch": 0.2811550151975684, + "grad_norm": 1.3467998504638672, + "learning_rate": 4.974390766259353e-06, + "loss": 0.44668465852737427, + "mean_token_accuracy": 0.8275353908538818, + "num_tokens": 3314302.0, + "step": 370 + }, + { + "epoch": 0.28191489361702127, + "grad_norm": 2.5921075344085693, + "learning_rate": 4.974090887202726e-06, + "loss": 0.5343953967094421, + "mean_token_accuracy": 0.8110706806182861, + "num_tokens": 3320963.0, + "step": 371 + }, + { + "epoch": 0.2826747720364742, + "grad_norm": 2.042781352996826, + "learning_rate": 4.973789271733877e-06, + "loss": 0.6293343305587769, + "mean_token_accuracy": 0.7800243496894836, + "num_tokens": 3332742.0, + "step": 372 + }, + { + "epoch": 0.28343465045592703, + "grad_norm": 4.822193145751953, + "learning_rate": 4.973485920064491e-06, + "loss": 0.6256728768348694, + "mean_token_accuracy": 0.7962433099746704, + "num_tokens": 3335872.0, + "step": 373 + }, + { + "epoch": 0.28419452887537994, + "grad_norm": 1.260988473892212, + "learning_rate": 4.973180832407471e-06, + "loss": 0.38731223344802856, + "mean_token_accuracy": 0.8385066986083984, + "num_tokens": 3351884.0, + "step": 374 + }, + { + "epoch": 0.28495440729483285, + "grad_norm": 2.669966697692871, + "learning_rate": 4.97287400897694e-06, + "loss": 0.5594710111618042, + "mean_token_accuracy": 0.8097212314605713, + "num_tokens": 3358197.0, + "step": 375 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 3.0344486236572266, + "learning_rate": 4.972565449988238e-06, + "loss": 0.34449583292007446, + "mean_token_accuracy": 0.8813316822052002, + "num_tokens": 3362133.0, + "step": 376 + }, + { + "epoch": 0.2864741641337386, + "grad_norm": 2.562251091003418, + "learning_rate": 4.972255155657925e-06, + "loss": 0.5331522822380066, + "mean_token_accuracy": 0.8212941288948059, + "num_tokens": 3370346.0, + "step": 377 + }, + { + "epoch": 0.2872340425531915, + "grad_norm": 2.7083740234375, + "learning_rate": 4.9719431262037755e-06, + "loss": 0.5403046011924744, + "mean_token_accuracy": 0.8108335733413696, + "num_tokens": 3375588.0, + "step": 378 + }, + { + "epoch": 0.28799392097264437, + "grad_norm": 1.396430492401123, + "learning_rate": 4.971629361844785e-06, + "loss": 0.4041529893875122, + "mean_token_accuracy": 0.8588063716888428, + "num_tokens": 3390749.0, + "step": 379 + }, + { + "epoch": 0.2887537993920973, + "grad_norm": 1.9872784614562988, + "learning_rate": 4.971313862801166e-06, + "loss": 0.4336993098258972, + "mean_token_accuracy": 0.8511303663253784, + "num_tokens": 3399064.0, + "step": 380 + }, + { + "epoch": 0.28951367781155013, + "grad_norm": 1.9652575254440308, + "learning_rate": 4.9709966292943455e-06, + "loss": 0.4578358232975006, + "mean_token_accuracy": 0.8229440450668335, + "num_tokens": 3407229.0, + "step": 381 + }, + { + "epoch": 0.29027355623100304, + "grad_norm": 1.6626898050308228, + "learning_rate": 4.970677661546972e-06, + "loss": 0.5427594184875488, + "mean_token_accuracy": 0.815427303314209, + "num_tokens": 3422321.0, + "step": 382 + }, + { + "epoch": 0.29103343465045595, + "grad_norm": 3.5265562534332275, + "learning_rate": 4.970356959782909e-06, + "loss": 0.6661460995674133, + "mean_token_accuracy": 0.7856965065002441, + "num_tokens": 3427442.0, + "step": 383 + }, + { + "epoch": 0.2917933130699088, + "grad_norm": 1.667205572128296, + "learning_rate": 4.970034524227239e-06, + "loss": 0.36256325244903564, + "mean_token_accuracy": 0.8711205720901489, + "num_tokens": 3436662.0, + "step": 384 + }, + { + "epoch": 0.2925531914893617, + "grad_norm": 1.3389486074447632, + "learning_rate": 4.969710355106256e-06, + "loss": 0.4282698631286621, + "mean_token_accuracy": 0.838951587677002, + "num_tokens": 3450060.0, + "step": 385 + }, + { + "epoch": 0.2933130699088146, + "grad_norm": 2.5163397789001465, + "learning_rate": 4.969384452647477e-06, + "loss": 0.5176984071731567, + "mean_token_accuracy": 0.8235267996788025, + "num_tokens": 3456990.0, + "step": 386 + }, + { + "epoch": 0.29407294832826747, + "grad_norm": 1.7588495016098022, + "learning_rate": 4.969056817079633e-06, + "loss": 0.49710947275161743, + "mean_token_accuracy": 0.818520724773407, + "num_tokens": 3468098.0, + "step": 387 + }, + { + "epoch": 0.2948328267477204, + "grad_norm": 2.6381046772003174, + "learning_rate": 4.968727448632669e-06, + "loss": 0.4425308108329773, + "mean_token_accuracy": 0.8451643586158752, + "num_tokens": 3472899.0, + "step": 388 + }, + { + "epoch": 0.29559270516717323, + "grad_norm": 1.6345038414001465, + "learning_rate": 4.968396347537751e-06, + "loss": 0.4177059829235077, + "mean_token_accuracy": 0.8498886227607727, + "num_tokens": 3484826.0, + "step": 389 + }, + { + "epoch": 0.29635258358662614, + "grad_norm": 3.0466468334198, + "learning_rate": 4.968063514027258e-06, + "loss": 0.4274463951587677, + "mean_token_accuracy": 0.8387278318405151, + "num_tokens": 3488610.0, + "step": 390 + }, + { + "epoch": 0.29711246200607905, + "grad_norm": 2.6509406566619873, + "learning_rate": 4.967728948334784e-06, + "loss": 0.5401753783226013, + "mean_token_accuracy": 0.8252490162849426, + "num_tokens": 3493657.0, + "step": 391 + }, + { + "epoch": 0.2978723404255319, + "grad_norm": 1.6372219324111938, + "learning_rate": 4.967392650695141e-06, + "loss": 0.3862472176551819, + "mean_token_accuracy": 0.8555525541305542, + "num_tokens": 3505588.0, + "step": 392 + }, + { + "epoch": 0.2986322188449848, + "grad_norm": 2.1615452766418457, + "learning_rate": 4.967054621344356e-06, + "loss": 0.57850581407547, + "mean_token_accuracy": 0.8222678899765015, + "num_tokens": 3514396.0, + "step": 393 + }, + { + "epoch": 0.2993920972644377, + "grad_norm": 1.8610916137695312, + "learning_rate": 4.96671486051967e-06, + "loss": 0.5440595149993896, + "mean_token_accuracy": 0.8196715116500854, + "num_tokens": 3523604.0, + "step": 394 + }, + { + "epoch": 0.30015197568389057, + "grad_norm": 2.9585862159729004, + "learning_rate": 4.966373368459542e-06, + "loss": 0.6921588182449341, + "mean_token_accuracy": 0.7816659808158875, + "num_tokens": 3529849.0, + "step": 395 + }, + { + "epoch": 0.3009118541033435, + "grad_norm": 1.9374035596847534, + "learning_rate": 4.966030145403642e-06, + "loss": 0.5494055151939392, + "mean_token_accuracy": 0.8126792907714844, + "num_tokens": 3539529.0, + "step": 396 + }, + { + "epoch": 0.30167173252279633, + "grad_norm": 1.730530023574829, + "learning_rate": 4.965685191592859e-06, + "loss": 0.4271572232246399, + "mean_token_accuracy": 0.8383668661117554, + "num_tokens": 3550972.0, + "step": 397 + }, + { + "epoch": 0.30243161094224924, + "grad_norm": 3.9635560512542725, + "learning_rate": 4.9653385072692935e-06, + "loss": 0.5576210021972656, + "mean_token_accuracy": 0.799404501914978, + "num_tokens": 3554147.0, + "step": 398 + }, + { + "epoch": 0.30319148936170215, + "grad_norm": 2.5731968879699707, + "learning_rate": 4.964990092676263e-06, + "loss": 0.5478942394256592, + "mean_token_accuracy": 0.8220961093902588, + "num_tokens": 3559972.0, + "step": 399 + }, + { + "epoch": 0.303951367781155, + "grad_norm": 2.2096588611602783, + "learning_rate": 4.964639948058297e-06, + "loss": 0.35461270809173584, + "mean_token_accuracy": 0.8640927076339722, + "num_tokens": 3565770.0, + "step": 400 + }, + { + "epoch": 0.3047112462006079, + "grad_norm": 1.7874189615249634, + "learning_rate": 4.964288073661142e-06, + "loss": 0.38849619030952454, + "mean_token_accuracy": 0.8443037271499634, + "num_tokens": 3574514.0, + "step": 401 + }, + { + "epoch": 0.30547112462006076, + "grad_norm": 1.5583146810531616, + "learning_rate": 4.963934469731756e-06, + "loss": 0.48909449577331543, + "mean_token_accuracy": 0.8429768681526184, + "num_tokens": 3585877.0, + "step": 402 + }, + { + "epoch": 0.30623100303951367, + "grad_norm": 3.026599645614624, + "learning_rate": 4.963579136518312e-06, + "loss": 0.5138992071151733, + "mean_token_accuracy": 0.8283728361129761, + "num_tokens": 3590412.0, + "step": 403 + }, + { + "epoch": 0.3069908814589666, + "grad_norm": 2.777505874633789, + "learning_rate": 4.963222074270197e-06, + "loss": 0.6241534948348999, + "mean_token_accuracy": 0.8130464553833008, + "num_tokens": 3596246.0, + "step": 404 + }, + { + "epoch": 0.30775075987841943, + "grad_norm": 2.4772839546203613, + "learning_rate": 4.962863283238011e-06, + "loss": 0.5930814146995544, + "mean_token_accuracy": 0.8036394715309143, + "num_tokens": 3602878.0, + "step": 405 + }, + { + "epoch": 0.30851063829787234, + "grad_norm": 1.5049982070922852, + "learning_rate": 4.962502763673566e-06, + "loss": 0.4903082549571991, + "mean_token_accuracy": 0.8184912204742432, + "num_tokens": 3617018.0, + "step": 406 + }, + { + "epoch": 0.30927051671732525, + "grad_norm": 2.453155040740967, + "learning_rate": 4.96214051582989e-06, + "loss": 0.5138067603111267, + "mean_token_accuracy": 0.8336835503578186, + "num_tokens": 3624188.0, + "step": 407 + }, + { + "epoch": 0.3100303951367781, + "grad_norm": 2.4038336277008057, + "learning_rate": 4.961776539961222e-06, + "loss": 0.5752760171890259, + "mean_token_accuracy": 0.8054730892181396, + "num_tokens": 3634152.0, + "step": 408 + }, + { + "epoch": 0.310790273556231, + "grad_norm": 2.629068374633789, + "learning_rate": 4.961410836323014e-06, + "loss": 0.5580606460571289, + "mean_token_accuracy": 0.8121089935302734, + "num_tokens": 3639528.0, + "step": 409 + }, + { + "epoch": 0.31155015197568386, + "grad_norm": 1.4245928525924683, + "learning_rate": 4.961043405171931e-06, + "loss": 0.5399882793426514, + "mean_token_accuracy": 0.812280535697937, + "num_tokens": 3655744.0, + "step": 410 + }, + { + "epoch": 0.3123100303951368, + "grad_norm": 1.5236459970474243, + "learning_rate": 4.9606742467658505e-06, + "loss": 0.5234690308570862, + "mean_token_accuracy": 0.8188928365707397, + "num_tokens": 3675010.0, + "step": 411 + }, + { + "epoch": 0.3130699088145897, + "grad_norm": 2.27961802482605, + "learning_rate": 4.960303361363863e-06, + "loss": 0.5502505898475647, + "mean_token_accuracy": 0.8161963224411011, + "num_tokens": 3682328.0, + "step": 412 + }, + { + "epoch": 0.31382978723404253, + "grad_norm": 1.554518222808838, + "learning_rate": 4.959930749226269e-06, + "loss": 0.420867919921875, + "mean_token_accuracy": 0.8499157428741455, + "num_tokens": 3694980.0, + "step": 413 + }, + { + "epoch": 0.31458966565349544, + "grad_norm": 2.609218120574951, + "learning_rate": 4.9595564106145825e-06, + "loss": 0.4706704318523407, + "mean_token_accuracy": 0.8412490487098694, + "num_tokens": 3700033.0, + "step": 414 + }, + { + "epoch": 0.31534954407294835, + "grad_norm": 1.5303231477737427, + "learning_rate": 4.959180345791528e-06, + "loss": 0.4668654799461365, + "mean_token_accuracy": 0.8125015497207642, + "num_tokens": 3715012.0, + "step": 415 + }, + { + "epoch": 0.3161094224924012, + "grad_norm": 1.2774665355682373, + "learning_rate": 4.958802555021042e-06, + "loss": 0.4339369237422943, + "mean_token_accuracy": 0.8442851901054382, + "num_tokens": 3733928.0, + "step": 416 + }, + { + "epoch": 0.3168693009118541, + "grad_norm": 2.1240181922912598, + "learning_rate": 4.958423038568274e-06, + "loss": 0.4029104709625244, + "mean_token_accuracy": 0.8627674579620361, + "num_tokens": 3740202.0, + "step": 417 + }, + { + "epoch": 0.31762917933130697, + "grad_norm": 2.00538969039917, + "learning_rate": 4.958041796699583e-06, + "loss": 0.5229607820510864, + "mean_token_accuracy": 0.8282366394996643, + "num_tokens": 3749308.0, + "step": 418 + }, + { + "epoch": 0.3183890577507599, + "grad_norm": 2.6555092334747314, + "learning_rate": 4.957658829682539e-06, + "loss": 0.5344101190567017, + "mean_token_accuracy": 0.8183202743530273, + "num_tokens": 3754595.0, + "step": 419 + }, + { + "epoch": 0.3191489361702128, + "grad_norm": 1.7468839883804321, + "learning_rate": 4.9572741377859225e-06, + "loss": 0.5667245984077454, + "mean_token_accuracy": 0.8080123662948608, + "num_tokens": 3765761.0, + "step": 420 + }, + { + "epoch": 0.31990881458966564, + "grad_norm": 2.9612457752227783, + "learning_rate": 4.956887721279726e-06, + "loss": 0.5389559864997864, + "mean_token_accuracy": 0.8019476532936096, + "num_tokens": 3770844.0, + "step": 421 + }, + { + "epoch": 0.32066869300911854, + "grad_norm": 1.842403769493103, + "learning_rate": 4.95649958043515e-06, + "loss": 0.38279837369918823, + "mean_token_accuracy": 0.858866810798645, + "num_tokens": 3778094.0, + "step": 422 + }, + { + "epoch": 0.32142857142857145, + "grad_norm": 2.3108131885528564, + "learning_rate": 4.956109715524609e-06, + "loss": 0.5453893542289734, + "mean_token_accuracy": 0.8085013031959534, + "num_tokens": 3785015.0, + "step": 423 + }, + { + "epoch": 0.3221884498480243, + "grad_norm": 3.0326945781707764, + "learning_rate": 4.9557181268217225e-06, + "loss": 0.5550523400306702, + "mean_token_accuracy": 0.8125876188278198, + "num_tokens": 3789830.0, + "step": 424 + }, + { + "epoch": 0.3229483282674772, + "grad_norm": 1.8851977586746216, + "learning_rate": 4.955324814601324e-06, + "loss": 0.4902324974536896, + "mean_token_accuracy": 0.8205406665802002, + "num_tokens": 3799862.0, + "step": 425 + }, + { + "epoch": 0.32370820668693007, + "grad_norm": 2.6018171310424805, + "learning_rate": 4.954929779139455e-06, + "loss": 0.5920133590698242, + "mean_token_accuracy": 0.8340690732002258, + "num_tokens": 3806617.0, + "step": 426 + }, + { + "epoch": 0.324468085106383, + "grad_norm": 2.4283878803253174, + "learning_rate": 4.954533020713367e-06, + "loss": 0.5305854082107544, + "mean_token_accuracy": 0.8137468099594116, + "num_tokens": 3813843.0, + "step": 427 + }, + { + "epoch": 0.3252279635258359, + "grad_norm": 2.667978525161743, + "learning_rate": 4.954134539601519e-06, + "loss": 0.5333638787269592, + "mean_token_accuracy": 0.8402629494667053, + "num_tokens": 3819450.0, + "step": 428 + }, + { + "epoch": 0.32598784194528874, + "grad_norm": 1.7302523851394653, + "learning_rate": 4.953734336083582e-06, + "loss": 0.422895610332489, + "mean_token_accuracy": 0.8709704875946045, + "num_tokens": 3831027.0, + "step": 429 + }, + { + "epoch": 0.32674772036474165, + "grad_norm": 2.427192211151123, + "learning_rate": 4.953332410440434e-06, + "loss": 0.6334598064422607, + "mean_token_accuracy": 0.7817479968070984, + "num_tokens": 3841776.0, + "step": 430 + }, + { + "epoch": 0.32750759878419455, + "grad_norm": 1.460949182510376, + "learning_rate": 4.952928762954161e-06, + "loss": 0.3654777705669403, + "mean_token_accuracy": 0.8780122995376587, + "num_tokens": 3852213.0, + "step": 431 + }, + { + "epoch": 0.3282674772036474, + "grad_norm": 1.9855005741119385, + "learning_rate": 4.952523393908059e-06, + "loss": 0.5117089748382568, + "mean_token_accuracy": 0.811911404132843, + "num_tokens": 3861176.0, + "step": 432 + }, + { + "epoch": 0.3290273556231003, + "grad_norm": 2.2653207778930664, + "learning_rate": 4.952116303586631e-06, + "loss": 0.42514950037002563, + "mean_token_accuracy": 0.8448518514633179, + "num_tokens": 3867164.0, + "step": 433 + }, + { + "epoch": 0.32978723404255317, + "grad_norm": 1.9780964851379395, + "learning_rate": 4.951707492275589e-06, + "loss": 0.5095293521881104, + "mean_token_accuracy": 0.8262748718261719, + "num_tokens": 3876406.0, + "step": 434 + }, + { + "epoch": 0.3305471124620061, + "grad_norm": 2.9480233192443848, + "learning_rate": 4.951296960261853e-06, + "loss": 0.3494448959827423, + "mean_token_accuracy": 0.8781307935714722, + "num_tokens": 3880298.0, + "step": 435 + }, + { + "epoch": 0.331306990881459, + "grad_norm": 2.335571527481079, + "learning_rate": 4.95088470783355e-06, + "loss": 0.5456914901733398, + "mean_token_accuracy": 0.816297173500061, + "num_tokens": 3886487.0, + "step": 436 + }, + { + "epoch": 0.33206686930091184, + "grad_norm": 2.3046419620513916, + "learning_rate": 4.950470735280013e-06, + "loss": 0.4835948944091797, + "mean_token_accuracy": 0.8539175391197205, + "num_tokens": 3892706.0, + "step": 437 + }, + { + "epoch": 0.33282674772036475, + "grad_norm": 2.44047474861145, + "learning_rate": 4.950055042891786e-06, + "loss": 0.5154092907905579, + "mean_token_accuracy": 0.8579919338226318, + "num_tokens": 3899532.0, + "step": 438 + }, + { + "epoch": 0.33358662613981765, + "grad_norm": 4.826764106750488, + "learning_rate": 4.949637630960618e-06, + "loss": 0.5270259976387024, + "mean_token_accuracy": 0.8172192573547363, + "num_tokens": 3902260.0, + "step": 439 + }, + { + "epoch": 0.3343465045592705, + "grad_norm": 2.001574754714966, + "learning_rate": 4.949218499779462e-06, + "loss": 0.5413002967834473, + "mean_token_accuracy": 0.8162837028503418, + "num_tokens": 3911706.0, + "step": 440 + }, + { + "epoch": 0.3351063829787234, + "grad_norm": 1.7998944520950317, + "learning_rate": 4.948797649642484e-06, + "loss": 0.5131614208221436, + "mean_token_accuracy": 0.8367440700531006, + "num_tokens": 3923490.0, + "step": 441 + }, + { + "epoch": 0.33586626139817627, + "grad_norm": 3.4566173553466797, + "learning_rate": 4.94837508084505e-06, + "loss": 0.7258909940719604, + "mean_token_accuracy": 0.771377444267273, + "num_tokens": 3928099.0, + "step": 442 + }, + { + "epoch": 0.3366261398176292, + "grad_norm": 2.0040442943573, + "learning_rate": 4.9479507936837364e-06, + "loss": 0.482135534286499, + "mean_token_accuracy": 0.8339327573776245, + "num_tokens": 3937328.0, + "step": 443 + }, + { + "epoch": 0.3373860182370821, + "grad_norm": 2.949502944946289, + "learning_rate": 4.947524788456325e-06, + "loss": 0.6474795341491699, + "mean_token_accuracy": 0.7951677441596985, + "num_tokens": 3942529.0, + "step": 444 + }, + { + "epoch": 0.33814589665653494, + "grad_norm": 1.5528364181518555, + "learning_rate": 4.947097065461801e-06, + "loss": 0.48791584372520447, + "mean_token_accuracy": 0.8425545692443848, + "num_tokens": 3955200.0, + "step": 445 + }, + { + "epoch": 0.33890577507598785, + "grad_norm": 1.8813284635543823, + "learning_rate": 4.946667625000358e-06, + "loss": 0.45922309160232544, + "mean_token_accuracy": 0.8206527233123779, + "num_tokens": 3962975.0, + "step": 446 + }, + { + "epoch": 0.33966565349544076, + "grad_norm": 1.7157847881317139, + "learning_rate": 4.946236467373392e-06, + "loss": 0.5454182028770447, + "mean_token_accuracy": 0.8049604892730713, + "num_tokens": 3973956.0, + "step": 447 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 2.008857250213623, + "learning_rate": 4.945803592883509e-06, + "loss": 0.5151860117912292, + "mean_token_accuracy": 0.8262045383453369, + "num_tokens": 3982853.0, + "step": 448 + }, + { + "epoch": 0.3411854103343465, + "grad_norm": 1.6632496118545532, + "learning_rate": 4.9453690018345144e-06, + "loss": 0.42710691690444946, + "mean_token_accuracy": 0.8521314859390259, + "num_tokens": 3993838.0, + "step": 449 + }, + { + "epoch": 0.34194528875379937, + "grad_norm": 1.365234375, + "learning_rate": 4.944932694531423e-06, + "loss": 0.5172526836395264, + "mean_token_accuracy": 0.8277045488357544, + "num_tokens": 4014179.0, + "step": 450 + }, + { + "epoch": 0.3427051671732523, + "grad_norm": 1.7610243558883667, + "learning_rate": 4.94449467128045e-06, + "loss": 0.42104798555374146, + "mean_token_accuracy": 0.8552065491676331, + "num_tokens": 4023663.0, + "step": 451 + }, + { + "epoch": 0.3434650455927052, + "grad_norm": 2.3732354640960693, + "learning_rate": 4.944054932389018e-06, + "loss": 0.5471175909042358, + "mean_token_accuracy": 0.8487317562103271, + "num_tokens": 4030100.0, + "step": 452 + }, + { + "epoch": 0.34422492401215804, + "grad_norm": 1.5973623991012573, + "learning_rate": 4.943613478165753e-06, + "loss": 0.419813871383667, + "mean_token_accuracy": 0.8484025001525879, + "num_tokens": 4041124.0, + "step": 453 + }, + { + "epoch": 0.34498480243161095, + "grad_norm": 2.966381549835205, + "learning_rate": 4.943170308920484e-06, + "loss": 0.5370652675628662, + "mean_token_accuracy": 0.8439491987228394, + "num_tokens": 4045675.0, + "step": 454 + }, + { + "epoch": 0.34574468085106386, + "grad_norm": 2.5097248554229736, + "learning_rate": 4.9427254249642445e-06, + "loss": 0.5776349306106567, + "mean_token_accuracy": 0.8060523867607117, + "num_tokens": 4053250.0, + "step": 455 + }, + { + "epoch": 0.3465045592705167, + "grad_norm": 1.6779125928878784, + "learning_rate": 4.942278826609272e-06, + "loss": 0.5245476961135864, + "mean_token_accuracy": 0.8168526887893677, + "num_tokens": 4064106.0, + "step": 456 + }, + { + "epoch": 0.3472644376899696, + "grad_norm": 1.5945546627044678, + "learning_rate": 4.9418305141690045e-06, + "loss": 0.4972047209739685, + "mean_token_accuracy": 0.8257735967636108, + "num_tokens": 4077687.0, + "step": 457 + }, + { + "epoch": 0.34802431610942247, + "grad_norm": 2.864778757095337, + "learning_rate": 4.9413804879580865e-06, + "loss": 0.5372499823570251, + "mean_token_accuracy": 0.8423776626586914, + "num_tokens": 4082632.0, + "step": 458 + }, + { + "epoch": 0.3487841945288754, + "grad_norm": 1.4797078371047974, + "learning_rate": 4.940928748292363e-06, + "loss": 0.5903409719467163, + "mean_token_accuracy": 0.8061295747756958, + "num_tokens": 4104218.0, + "step": 459 + }, + { + "epoch": 0.3495440729483283, + "grad_norm": 2.4376983642578125, + "learning_rate": 4.940475295488882e-06, + "loss": 0.4534894824028015, + "mean_token_accuracy": 0.8395825028419495, + "num_tokens": 4110530.0, + "step": 460 + }, + { + "epoch": 0.35030395136778114, + "grad_norm": 1.2955626249313354, + "learning_rate": 4.940020129865895e-06, + "loss": 0.47155818343162537, + "mean_token_accuracy": 0.8253582715988159, + "num_tokens": 4128398.0, + "step": 461 + }, + { + "epoch": 0.35106382978723405, + "grad_norm": 2.066575527191162, + "learning_rate": 4.9395632517428546e-06, + "loss": 0.5555641651153564, + "mean_token_accuracy": 0.814624547958374, + "num_tokens": 4137623.0, + "step": 462 + }, + { + "epoch": 0.3518237082066869, + "grad_norm": 1.6407525539398193, + "learning_rate": 4.939104661440415e-06, + "loss": 0.4361790418624878, + "mean_token_accuracy": 0.8544459342956543, + "num_tokens": 4152803.0, + "step": 463 + }, + { + "epoch": 0.3525835866261398, + "grad_norm": 2.1685116291046143, + "learning_rate": 4.938644359280433e-06, + "loss": 0.5347012877464294, + "mean_token_accuracy": 0.853853702545166, + "num_tokens": 4160778.0, + "step": 464 + }, + { + "epoch": 0.3533434650455927, + "grad_norm": 1.8824869394302368, + "learning_rate": 4.938182345585967e-06, + "loss": 0.5512481927871704, + "mean_token_accuracy": 0.7985891699790955, + "num_tokens": 4170380.0, + "step": 465 + }, + { + "epoch": 0.3541033434650456, + "grad_norm": 2.2229504585266113, + "learning_rate": 4.937718620681273e-06, + "loss": 0.516828179359436, + "mean_token_accuracy": 0.8265621066093445, + "num_tokens": 4178179.0, + "step": 466 + }, + { + "epoch": 0.3548632218844985, + "grad_norm": 1.955990195274353, + "learning_rate": 4.9372531848918145e-06, + "loss": 0.5586158037185669, + "mean_token_accuracy": 0.8367916345596313, + "num_tokens": 4188626.0, + "step": 467 + }, + { + "epoch": 0.3556231003039514, + "grad_norm": 1.9687023162841797, + "learning_rate": 4.936786038544251e-06, + "loss": 0.5517531633377075, + "mean_token_accuracy": 0.8134098052978516, + "num_tokens": 4198144.0, + "step": 468 + }, + { + "epoch": 0.35638297872340424, + "grad_norm": 1.405516505241394, + "learning_rate": 4.9363171819664434e-06, + "loss": 0.5305492877960205, + "mean_token_accuracy": 0.8014427423477173, + "num_tokens": 4222818.0, + "step": 469 + }, + { + "epoch": 0.35714285714285715, + "grad_norm": 2.6355695724487305, + "learning_rate": 4.9358466154874535e-06, + "loss": 0.5303391218185425, + "mean_token_accuracy": 0.8028861284255981, + "num_tokens": 4228318.0, + "step": 470 + }, + { + "epoch": 0.35790273556231, + "grad_norm": 1.5133824348449707, + "learning_rate": 4.935374339437543e-06, + "loss": 0.5329189300537109, + "mean_token_accuracy": 0.8479441404342651, + "num_tokens": 4244527.0, + "step": 471 + }, + { + "epoch": 0.3586626139817629, + "grad_norm": 3.4356725215911865, + "learning_rate": 4.934900354148173e-06, + "loss": 0.5431582927703857, + "mean_token_accuracy": 0.8328983783721924, + "num_tokens": 4248034.0, + "step": 472 + }, + { + "epoch": 0.3594224924012158, + "grad_norm": 2.5789499282836914, + "learning_rate": 4.934424659952006e-06, + "loss": 0.4141455292701721, + "mean_token_accuracy": 0.8658635020256042, + "num_tokens": 4252953.0, + "step": 473 + }, + { + "epoch": 0.3601823708206687, + "grad_norm": 1.145262598991394, + "learning_rate": 4.933947257182901e-06, + "loss": 0.40294092893600464, + "mean_token_accuracy": 0.8565847277641296, + "num_tokens": 4277813.0, + "step": 474 + }, + { + "epoch": 0.3609422492401216, + "grad_norm": 1.7242133617401123, + "learning_rate": 4.933468146175918e-06, + "loss": 0.6036738753318787, + "mean_token_accuracy": 0.8072597980499268, + "num_tokens": 4291088.0, + "step": 475 + }, + { + "epoch": 0.3617021276595745, + "grad_norm": 2.3490941524505615, + "learning_rate": 4.932987327267317e-06, + "loss": 0.49456146359443665, + "mean_token_accuracy": 0.8372673988342285, + "num_tokens": 4297376.0, + "step": 476 + }, + { + "epoch": 0.36246200607902734, + "grad_norm": 1.3605526685714722, + "learning_rate": 4.932504800794553e-06, + "loss": 0.43595948815345764, + "mean_token_accuracy": 0.8415953516960144, + "num_tokens": 4312054.0, + "step": 477 + }, + { + "epoch": 0.36322188449848025, + "grad_norm": 1.4525885581970215, + "learning_rate": 4.9320205670962815e-06, + "loss": 0.5390371680259705, + "mean_token_accuracy": 0.8101649284362793, + "num_tokens": 4328701.0, + "step": 478 + }, + { + "epoch": 0.3639817629179331, + "grad_norm": 1.9862419366836548, + "learning_rate": 4.931534626512359e-06, + "loss": 0.45436930656433105, + "mean_token_accuracy": 0.8352861404418945, + "num_tokens": 4338372.0, + "step": 479 + }, + { + "epoch": 0.364741641337386, + "grad_norm": 1.7804961204528809, + "learning_rate": 4.931046979383836e-06, + "loss": 0.4677754044532776, + "mean_token_accuracy": 0.840467095375061, + "num_tokens": 4347897.0, + "step": 480 + }, + { + "epoch": 0.3655015197568389, + "grad_norm": 2.066632032394409, + "learning_rate": 4.930557626052961e-06, + "loss": 0.42418140172958374, + "mean_token_accuracy": 0.8528275489807129, + "num_tokens": 4354061.0, + "step": 481 + }, + { + "epoch": 0.3662613981762918, + "grad_norm": 1.6155282258987427, + "learning_rate": 4.930066566863182e-06, + "loss": 0.5424284934997559, + "mean_token_accuracy": 0.825040876865387, + "num_tokens": 4370400.0, + "step": 482 + }, + { + "epoch": 0.3670212765957447, + "grad_norm": 2.1452953815460205, + "learning_rate": 4.929573802159143e-06, + "loss": 0.5105804204940796, + "mean_token_accuracy": 0.8284053802490234, + "num_tokens": 4377579.0, + "step": 483 + }, + { + "epoch": 0.3677811550151976, + "grad_norm": 1.8940945863723755, + "learning_rate": 4.929079332286685e-06, + "loss": 0.43478304147720337, + "mean_token_accuracy": 0.8505665063858032, + "num_tokens": 4385686.0, + "step": 484 + }, + { + "epoch": 0.36854103343465044, + "grad_norm": 1.6785860061645508, + "learning_rate": 4.928583157592846e-06, + "loss": 0.40227848291397095, + "mean_token_accuracy": 0.8623573780059814, + "num_tokens": 4396128.0, + "step": 485 + }, + { + "epoch": 0.36930091185410335, + "grad_norm": 1.6416733264923096, + "learning_rate": 4.928085278425862e-06, + "loss": 0.526267409324646, + "mean_token_accuracy": 0.8284667730331421, + "num_tokens": 4407963.0, + "step": 486 + }, + { + "epoch": 0.3700607902735562, + "grad_norm": 1.8882389068603516, + "learning_rate": 4.927585695135162e-06, + "loss": 0.5555213093757629, + "mean_token_accuracy": 0.8115293979644775, + "num_tokens": 4418057.0, + "step": 487 + }, + { + "epoch": 0.3708206686930091, + "grad_norm": 2.300248384475708, + "learning_rate": 4.9270844080713735e-06, + "loss": 0.5812339186668396, + "mean_token_accuracy": 0.800270676612854, + "num_tokens": 4425358.0, + "step": 488 + }, + { + "epoch": 0.371580547112462, + "grad_norm": 1.6802922487258911, + "learning_rate": 4.926581417586319e-06, + "loss": 0.5134941935539246, + "mean_token_accuracy": 0.8247408866882324, + "num_tokens": 4437702.0, + "step": 489 + }, + { + "epoch": 0.3723404255319149, + "grad_norm": 1.7620291709899902, + "learning_rate": 4.926076724033016e-06, + "loss": 0.5233973264694214, + "mean_token_accuracy": 0.8102161884307861, + "num_tokens": 4448584.0, + "step": 490 + }, + { + "epoch": 0.3731003039513678, + "grad_norm": 1.6911998987197876, + "learning_rate": 4.925570327765678e-06, + "loss": 0.5337274074554443, + "mean_token_accuracy": 0.845306396484375, + "num_tokens": 4462651.0, + "step": 491 + }, + { + "epoch": 0.3738601823708207, + "grad_norm": 1.7991242408752441, + "learning_rate": 4.9250622291397144e-06, + "loss": 0.31018948554992676, + "mean_token_accuracy": 0.8857606053352356, + "num_tokens": 4469971.0, + "step": 492 + }, + { + "epoch": 0.37462006079027355, + "grad_norm": 4.9776835441589355, + "learning_rate": 4.924552428511727e-06, + "loss": 0.44114983081817627, + "mean_token_accuracy": 0.8429906368255615, + "num_tokens": 4478275.0, + "step": 493 + }, + { + "epoch": 0.37537993920972645, + "grad_norm": 1.8007272481918335, + "learning_rate": 4.924040926239515e-06, + "loss": 0.574328601360321, + "mean_token_accuracy": 0.7669196128845215, + "num_tokens": 4491551.0, + "step": 494 + }, + { + "epoch": 0.3761398176291793, + "grad_norm": 2.021300792694092, + "learning_rate": 4.92352772268207e-06, + "loss": 0.45636120438575745, + "mean_token_accuracy": 0.840438723564148, + "num_tokens": 4498658.0, + "step": 495 + }, + { + "epoch": 0.3768996960486322, + "grad_norm": 2.369748592376709, + "learning_rate": 4.923012818199576e-06, + "loss": 0.5206376910209656, + "mean_token_accuracy": 0.8521823287010193, + "num_tokens": 4504648.0, + "step": 496 + }, + { + "epoch": 0.3776595744680851, + "grad_norm": 2.733485221862793, + "learning_rate": 4.922496213153416e-06, + "loss": 0.5067723989486694, + "mean_token_accuracy": 0.8168281316757202, + "num_tokens": 4509990.0, + "step": 497 + }, + { + "epoch": 0.378419452887538, + "grad_norm": 2.3751676082611084, + "learning_rate": 4.921977907906161e-06, + "loss": 0.49757206439971924, + "mean_token_accuracy": 0.8325017690658569, + "num_tokens": 4518373.0, + "step": 498 + }, + { + "epoch": 0.3791793313069909, + "grad_norm": 2.1672775745391846, + "learning_rate": 4.921457902821578e-06, + "loss": 0.4237566590309143, + "mean_token_accuracy": 0.8404698371887207, + "num_tokens": 4524338.0, + "step": 499 + }, + { + "epoch": 0.3799392097264438, + "grad_norm": 1.8374360799789429, + "learning_rate": 4.9209361982646275e-06, + "loss": 0.4995468854904175, + "mean_token_accuracy": 0.8299649953842163, + "num_tokens": 4533396.0, + "step": 500 + }, + { + "epoch": 0.38069908814589665, + "grad_norm": 2.083967924118042, + "learning_rate": 4.920412794601461e-06, + "loss": 0.489935040473938, + "mean_token_accuracy": 0.8315291404724121, + "num_tokens": 4540941.0, + "step": 501 + }, + { + "epoch": 0.38145896656534956, + "grad_norm": 2.2075610160827637, + "learning_rate": 4.919887692199423e-06, + "loss": 0.5233147740364075, + "mean_token_accuracy": 0.804171085357666, + "num_tokens": 4548215.0, + "step": 502 + }, + { + "epoch": 0.3822188449848024, + "grad_norm": 2.076775312423706, + "learning_rate": 4.9193608914270515e-06, + "loss": 0.5785550475120544, + "mean_token_accuracy": 0.7993186116218567, + "num_tokens": 4558204.0, + "step": 503 + }, + { + "epoch": 0.3829787234042553, + "grad_norm": 2.238546133041382, + "learning_rate": 4.918832392654075e-06, + "loss": 0.5287384390830994, + "mean_token_accuracy": 0.8214945793151855, + "num_tokens": 4565407.0, + "step": 504 + }, + { + "epoch": 0.3837386018237082, + "grad_norm": 1.6783074140548706, + "learning_rate": 4.9183021962514145e-06, + "loss": 0.6063359379768372, + "mean_token_accuracy": 0.7914625406265259, + "num_tokens": 4580991.0, + "step": 505 + }, + { + "epoch": 0.3844984802431611, + "grad_norm": 1.6287449598312378, + "learning_rate": 4.917770302591183e-06, + "loss": 0.3598247766494751, + "mean_token_accuracy": 0.8706809878349304, + "num_tokens": 4590579.0, + "step": 506 + }, + { + "epoch": 0.385258358662614, + "grad_norm": 1.5432041883468628, + "learning_rate": 4.917236712046682e-06, + "loss": 0.5267890095710754, + "mean_token_accuracy": 0.8032117486000061, + "num_tokens": 4608380.0, + "step": 507 + }, + { + "epoch": 0.3860182370820669, + "grad_norm": 1.7664037942886353, + "learning_rate": 4.9167014249924075e-06, + "loss": 0.3552354574203491, + "mean_token_accuracy": 0.8569793701171875, + "num_tokens": 4616426.0, + "step": 508 + }, + { + "epoch": 0.38677811550151975, + "grad_norm": 2.1147472858428955, + "learning_rate": 4.916164441804044e-06, + "loss": 0.5212404727935791, + "mean_token_accuracy": 0.8196578025817871, + "num_tokens": 4623908.0, + "step": 509 + }, + { + "epoch": 0.38753799392097266, + "grad_norm": 2.1092333793640137, + "learning_rate": 4.915625762858467e-06, + "loss": 0.5197038650512695, + "mean_token_accuracy": 0.8245604634284973, + "num_tokens": 4630956.0, + "step": 510 + }, + { + "epoch": 0.3882978723404255, + "grad_norm": 1.23331880569458, + "learning_rate": 4.915085388533743e-06, + "loss": 0.4759839177131653, + "mean_token_accuracy": 0.8192248344421387, + "num_tokens": 4651269.0, + "step": 511 + }, + { + "epoch": 0.3890577507598784, + "grad_norm": 2.424199104309082, + "learning_rate": 4.914543319209126e-06, + "loss": 0.5576270818710327, + "mean_token_accuracy": 0.8203302621841431, + "num_tokens": 4657296.0, + "step": 512 + }, + { + "epoch": 0.3898176291793313, + "grad_norm": 2.725156307220459, + "learning_rate": 4.913999555265062e-06, + "loss": 0.4337949752807617, + "mean_token_accuracy": 0.8382406234741211, + "num_tokens": 4661850.0, + "step": 513 + }, + { + "epoch": 0.3905775075987842, + "grad_norm": 2.3120534420013428, + "learning_rate": 4.913454097083185e-06, + "loss": 0.4941597580909729, + "mean_token_accuracy": 0.8302834033966064, + "num_tokens": 4667769.0, + "step": 514 + }, + { + "epoch": 0.3913373860182371, + "grad_norm": 2.3111207485198975, + "learning_rate": 4.912906945046319e-06, + "loss": 0.5253715515136719, + "mean_token_accuracy": 0.84515380859375, + "num_tokens": 4674537.0, + "step": 515 + }, + { + "epoch": 0.39209726443769, + "grad_norm": 1.4117841720581055, + "learning_rate": 4.912358099538476e-06, + "loss": 0.4521017074584961, + "mean_token_accuracy": 0.8208256959915161, + "num_tokens": 4690605.0, + "step": 516 + }, + { + "epoch": 0.39285714285714285, + "grad_norm": 2.3742799758911133, + "learning_rate": 4.911807560944858e-06, + "loss": 0.41572901606559753, + "mean_token_accuracy": 0.8550551533699036, + "num_tokens": 4706437.0, + "step": 517 + }, + { + "epoch": 0.39361702127659576, + "grad_norm": 2.4052202701568604, + "learning_rate": 4.911255329651852e-06, + "loss": 0.6003736257553101, + "mean_token_accuracy": 0.8247885704040527, + "num_tokens": 4712746.0, + "step": 518 + }, + { + "epoch": 0.3943768996960486, + "grad_norm": 1.9335490465164185, + "learning_rate": 4.910701406047037e-06, + "loss": 0.5457713603973389, + "mean_token_accuracy": 0.787429690361023, + "num_tokens": 4731937.0, + "step": 519 + }, + { + "epoch": 0.3951367781155015, + "grad_norm": 2.257706880569458, + "learning_rate": 4.910145790519177e-06, + "loss": 0.5300652980804443, + "mean_token_accuracy": 0.8192912936210632, + "num_tokens": 4739422.0, + "step": 520 + }, + { + "epoch": 0.3958966565349544, + "grad_norm": 1.2099462747573853, + "learning_rate": 4.9095884834582256e-06, + "loss": 0.45872747898101807, + "mean_token_accuracy": 0.8362667560577393, + "num_tokens": 4757113.0, + "step": 521 + }, + { + "epoch": 0.3966565349544073, + "grad_norm": 2.7991135120391846, + "learning_rate": 4.909029485255321e-06, + "loss": 0.49039560556411743, + "mean_token_accuracy": 0.8260016441345215, + "num_tokens": 4761709.0, + "step": 522 + }, + { + "epoch": 0.3974164133738602, + "grad_norm": 2.2360129356384277, + "learning_rate": 4.90846879630279e-06, + "loss": 0.49556830525398254, + "mean_token_accuracy": 0.827864408493042, + "num_tokens": 4769048.0, + "step": 523 + }, + { + "epoch": 0.3981762917933131, + "grad_norm": 2.5953688621520996, + "learning_rate": 4.907906416994146e-06, + "loss": 0.387208491563797, + "mean_token_accuracy": 0.8467001914978027, + "num_tokens": 4774637.0, + "step": 524 + }, + { + "epoch": 0.39893617021276595, + "grad_norm": 2.1046814918518066, + "learning_rate": 4.907342347724088e-06, + "loss": 0.5477259755134583, + "mean_token_accuracy": 0.8060322999954224, + "num_tokens": 4782774.0, + "step": 525 + }, + { + "epoch": 0.39969604863221886, + "grad_norm": 2.5622646808624268, + "learning_rate": 4.906776588888502e-06, + "loss": 0.5684159398078918, + "mean_token_accuracy": 0.8095303177833557, + "num_tokens": 4788766.0, + "step": 526 + }, + { + "epoch": 0.4004559270516717, + "grad_norm": 1.9027913808822632, + "learning_rate": 4.906209140884459e-06, + "loss": 0.535524845123291, + "mean_token_accuracy": 0.815237820148468, + "num_tokens": 4798492.0, + "step": 527 + }, + { + "epoch": 0.4012158054711246, + "grad_norm": 2.1447622776031494, + "learning_rate": 4.905640004110216e-06, + "loss": 0.5628632307052612, + "mean_token_accuracy": 0.8085395097732544, + "num_tokens": 4805737.0, + "step": 528 + }, + { + "epoch": 0.40197568389057753, + "grad_norm": 1.6754741668701172, + "learning_rate": 4.905069178965215e-06, + "loss": 0.5046736598014832, + "mean_token_accuracy": 0.8247535228729248, + "num_tokens": 4816912.0, + "step": 529 + }, + { + "epoch": 0.4027355623100304, + "grad_norm": 2.271230459213257, + "learning_rate": 4.904496665850083e-06, + "loss": 0.6086187958717346, + "mean_token_accuracy": 0.7935276627540588, + "num_tokens": 4824577.0, + "step": 530 + }, + { + "epoch": 0.4034954407294833, + "grad_norm": 2.107595205307007, + "learning_rate": 4.903922465166633e-06, + "loss": 0.5431341528892517, + "mean_token_accuracy": 0.8129537105560303, + "num_tokens": 4831772.0, + "step": 531 + }, + { + "epoch": 0.40425531914893614, + "grad_norm": 1.3860732316970825, + "learning_rate": 4.903346577317859e-06, + "loss": 0.45816320180892944, + "mean_token_accuracy": 0.8328287601470947, + "num_tokens": 4850302.0, + "step": 532 + }, + { + "epoch": 0.40501519756838905, + "grad_norm": 1.9186837673187256, + "learning_rate": 4.902769002707942e-06, + "loss": 0.3294633626937866, + "mean_token_accuracy": 0.8853933811187744, + "num_tokens": 4856624.0, + "step": 533 + }, + { + "epoch": 0.40577507598784196, + "grad_norm": 1.516194462776184, + "learning_rate": 4.902189741742247e-06, + "loss": 0.45482105016708374, + "mean_token_accuracy": 0.8370342254638672, + "num_tokens": 4870395.0, + "step": 534 + }, + { + "epoch": 0.4065349544072948, + "grad_norm": 2.3235628604888916, + "learning_rate": 4.901608794827321e-06, + "loss": 0.40688639879226685, + "mean_token_accuracy": 0.8643521666526794, + "num_tokens": 4875645.0, + "step": 535 + }, + { + "epoch": 0.4072948328267477, + "grad_norm": 2.29286527633667, + "learning_rate": 4.9010261623708945e-06, + "loss": 0.45482826232910156, + "mean_token_accuracy": 0.8429383039474487, + "num_tokens": 4881772.0, + "step": 536 + }, + { + "epoch": 0.40805471124620063, + "grad_norm": 1.5907070636749268, + "learning_rate": 4.900441844781882e-06, + "loss": 0.5266948342323303, + "mean_token_accuracy": 0.8348641395568848, + "num_tokens": 4894289.0, + "step": 537 + }, + { + "epoch": 0.4088145896656535, + "grad_norm": 2.1816294193267822, + "learning_rate": 4.89985584247038e-06, + "loss": 0.4797617793083191, + "mean_token_accuracy": 0.8549500703811646, + "num_tokens": 4901106.0, + "step": 538 + }, + { + "epoch": 0.4095744680851064, + "grad_norm": 1.7347146272659302, + "learning_rate": 4.899268155847667e-06, + "loss": 0.4754739999771118, + "mean_token_accuracy": 0.8278418183326721, + "num_tokens": 4912131.0, + "step": 539 + }, + { + "epoch": 0.41033434650455924, + "grad_norm": 2.0694527626037598, + "learning_rate": 4.898678785326205e-06, + "loss": 0.5071008801460266, + "mean_token_accuracy": 0.8157946467399597, + "num_tokens": 4921141.0, + "step": 540 + }, + { + "epoch": 0.41109422492401215, + "grad_norm": 2.570047616958618, + "learning_rate": 4.898087731319637e-06, + "loss": 0.43639278411865234, + "mean_token_accuracy": 0.8682913780212402, + "num_tokens": 4926182.0, + "step": 541 + }, + { + "epoch": 0.41185410334346506, + "grad_norm": 4.064006805419922, + "learning_rate": 4.8974949942427854e-06, + "loss": 0.539260745048523, + "mean_token_accuracy": 0.8225528001785278, + "num_tokens": 4929449.0, + "step": 542 + }, + { + "epoch": 0.4126139817629179, + "grad_norm": 1.7644332647323608, + "learning_rate": 4.896900574511657e-06, + "loss": 0.472618043422699, + "mean_token_accuracy": 0.8332902193069458, + "num_tokens": 4939443.0, + "step": 543 + }, + { + "epoch": 0.4133738601823708, + "grad_norm": 2.879918336868286, + "learning_rate": 4.89630447254344e-06, + "loss": 0.6360667943954468, + "mean_token_accuracy": 0.8215296268463135, + "num_tokens": 4950838.0, + "step": 544 + }, + { + "epoch": 0.41413373860182373, + "grad_norm": 1.4575570821762085, + "learning_rate": 4.8957066887565005e-06, + "loss": 0.45617997646331787, + "mean_token_accuracy": 0.8373187184333801, + "num_tokens": 4965222.0, + "step": 545 + }, + { + "epoch": 0.4148936170212766, + "grad_norm": 2.4829535484313965, + "learning_rate": 4.895107223570386e-06, + "loss": 0.42285341024398804, + "mean_token_accuracy": 0.8686380386352539, + "num_tokens": 4970724.0, + "step": 546 + }, + { + "epoch": 0.4156534954407295, + "grad_norm": 2.639474630355835, + "learning_rate": 4.894506077405824e-06, + "loss": 0.5906289219856262, + "mean_token_accuracy": 0.8174435496330261, + "num_tokens": 4976766.0, + "step": 547 + }, + { + "epoch": 0.41641337386018235, + "grad_norm": 2.7960562705993652, + "learning_rate": 4.893903250684723e-06, + "loss": 0.4518949091434479, + "mean_token_accuracy": 0.8387585282325745, + "num_tokens": 4980991.0, + "step": 548 + }, + { + "epoch": 0.41717325227963525, + "grad_norm": 2.184176206588745, + "learning_rate": 4.893298743830168e-06, + "loss": 0.5223842859268188, + "mean_token_accuracy": 0.8170937299728394, + "num_tokens": 4987781.0, + "step": 549 + }, + { + "epoch": 0.41793313069908816, + "grad_norm": 2.2393438816070557, + "learning_rate": 4.892692557266429e-06, + "loss": 0.5238431692123413, + "mean_token_accuracy": 0.8217905759811401, + "num_tokens": 4994321.0, + "step": 550 + }, + { + "epoch": 0.418693009118541, + "grad_norm": 3.579047441482544, + "learning_rate": 4.8920846914189465e-06, + "loss": 0.5367584228515625, + "mean_token_accuracy": 0.8312011361122131, + "num_tokens": 4997951.0, + "step": 551 + }, + { + "epoch": 0.4194528875379939, + "grad_norm": 1.6330240964889526, + "learning_rate": 4.891475146714348e-06, + "loss": 0.6054705381393433, + "mean_token_accuracy": 0.7938206791877747, + "num_tokens": 5012726.0, + "step": 552 + }, + { + "epoch": 0.42021276595744683, + "grad_norm": 1.5775716304779053, + "learning_rate": 4.8908639235804324e-06, + "loss": 0.4774656891822815, + "mean_token_accuracy": 0.828762948513031, + "num_tokens": 5026751.0, + "step": 553 + }, + { + "epoch": 0.4209726443768997, + "grad_norm": 1.5719101428985596, + "learning_rate": 4.890251022446181e-06, + "loss": 0.549429178237915, + "mean_token_accuracy": 0.8110791444778442, + "num_tokens": 5041861.0, + "step": 554 + }, + { + "epoch": 0.4217325227963526, + "grad_norm": 1.8585275411605835, + "learning_rate": 4.889636443741752e-06, + "loss": 0.4448118805885315, + "mean_token_accuracy": 0.8462690711021423, + "num_tokens": 5052690.0, + "step": 555 + }, + { + "epoch": 0.42249240121580545, + "grad_norm": 2.189202070236206, + "learning_rate": 4.88902018789848e-06, + "loss": 0.4296762943267822, + "mean_token_accuracy": 0.8488791584968567, + "num_tokens": 5058964.0, + "step": 556 + }, + { + "epoch": 0.42325227963525835, + "grad_norm": 1.9328460693359375, + "learning_rate": 4.888402255348877e-06, + "loss": 0.5369474291801453, + "mean_token_accuracy": 0.8184729814529419, + "num_tokens": 5068465.0, + "step": 557 + }, + { + "epoch": 0.42401215805471126, + "grad_norm": 1.6233323812484741, + "learning_rate": 4.887782646526631e-06, + "loss": 0.5284391641616821, + "mean_token_accuracy": 0.8276044726371765, + "num_tokens": 5081052.0, + "step": 558 + }, + { + "epoch": 0.4247720364741641, + "grad_norm": 2.222813844680786, + "learning_rate": 4.887161361866608e-06, + "loss": 0.5679137706756592, + "mean_token_accuracy": 0.8012375831604004, + "num_tokens": 5090001.0, + "step": 559 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 2.1062207221984863, + "learning_rate": 4.8865384018048494e-06, + "loss": 0.5554201602935791, + "mean_token_accuracy": 0.8128066062927246, + "num_tokens": 5097644.0, + "step": 560 + }, + { + "epoch": 0.42629179331306993, + "grad_norm": 1.5380984544754028, + "learning_rate": 4.8859137667785735e-06, + "loss": 0.4948265850543976, + "mean_token_accuracy": 0.8258291482925415, + "num_tokens": 5110069.0, + "step": 561 + }, + { + "epoch": 0.4270516717325228, + "grad_norm": 2.0290257930755615, + "learning_rate": 4.8852874572261715e-06, + "loss": 0.4969530403614044, + "mean_token_accuracy": 0.8297134637832642, + "num_tokens": 5117452.0, + "step": 562 + }, + { + "epoch": 0.4278115501519757, + "grad_norm": 1.5651452541351318, + "learning_rate": 4.884659473587213e-06, + "loss": 0.5353102087974548, + "mean_token_accuracy": 0.8161719441413879, + "num_tokens": 5133756.0, + "step": 563 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 2.2470998764038086, + "learning_rate": 4.884029816302441e-06, + "loss": 0.5104288458824158, + "mean_token_accuracy": 0.8081635236740112, + "num_tokens": 5140278.0, + "step": 564 + }, + { + "epoch": 0.42933130699088146, + "grad_norm": 1.726891279220581, + "learning_rate": 4.883398485813772e-06, + "loss": 0.4508771002292633, + "mean_token_accuracy": 0.8548800349235535, + "num_tokens": 5150115.0, + "step": 565 + }, + { + "epoch": 0.43009118541033436, + "grad_norm": 1.4779289960861206, + "learning_rate": 4.8827654825642984e-06, + "loss": 0.46861088275909424, + "mean_token_accuracy": 0.8209476470947266, + "num_tokens": 5163225.0, + "step": 566 + }, + { + "epoch": 0.4308510638297872, + "grad_norm": 1.2361034154891968, + "learning_rate": 4.882130806998287e-06, + "loss": 0.4591076672077179, + "mean_token_accuracy": 0.803041934967041, + "num_tokens": 5180342.0, + "step": 567 + }, + { + "epoch": 0.4316109422492401, + "grad_norm": 1.882467269897461, + "learning_rate": 4.881494459561177e-06, + "loss": 0.579258143901825, + "mean_token_accuracy": 0.8007112741470337, + "num_tokens": 5189595.0, + "step": 568 + }, + { + "epoch": 0.43237082066869303, + "grad_norm": 1.095462441444397, + "learning_rate": 4.880856440699582e-06, + "loss": 0.3806574046611786, + "mean_token_accuracy": 0.8650111556053162, + "num_tokens": 5211642.0, + "step": 569 + }, + { + "epoch": 0.4331306990881459, + "grad_norm": 1.6469846963882446, + "learning_rate": 4.880216750861288e-06, + "loss": 0.544589638710022, + "mean_token_accuracy": 0.8060122728347778, + "num_tokens": 5224137.0, + "step": 570 + }, + { + "epoch": 0.4338905775075988, + "grad_norm": 1.8561251163482666, + "learning_rate": 4.879575390495254e-06, + "loss": 0.4094924330711365, + "mean_token_accuracy": 0.8591406345367432, + "num_tokens": 5231588.0, + "step": 571 + }, + { + "epoch": 0.43465045592705165, + "grad_norm": 3.01326847076416, + "learning_rate": 4.878932360051611e-06, + "loss": 0.6139192581176758, + "mean_token_accuracy": 0.8108739852905273, + "num_tokens": 5236853.0, + "step": 572 + }, + { + "epoch": 0.43541033434650456, + "grad_norm": 2.1753034591674805, + "learning_rate": 4.878287659981663e-06, + "loss": 0.49082931876182556, + "mean_token_accuracy": 0.862828254699707, + "num_tokens": 5243264.0, + "step": 573 + }, + { + "epoch": 0.43617021276595747, + "grad_norm": 1.4437755346298218, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.5608728528022766, + "mean_token_accuracy": 0.8271626234054565, + "num_tokens": 5261757.0, + "step": 574 + }, + { + "epoch": 0.4369300911854103, + "grad_norm": 1.786683440208435, + "learning_rate": 4.876993252773923e-06, + "loss": 0.4377627968788147, + "mean_token_accuracy": 0.844936192035675, + "num_tokens": 5271038.0, + "step": 575 + }, + { + "epoch": 0.4376899696048632, + "grad_norm": 1.3425915241241455, + "learning_rate": 4.876343546544596e-06, + "loss": 0.44762521982192993, + "mean_token_accuracy": 0.8397793769836426, + "num_tokens": 5285555.0, + "step": 576 + }, + { + "epoch": 0.43844984802431614, + "grad_norm": 2.1549675464630127, + "learning_rate": 4.8756921725058935e-06, + "loss": 0.5332942008972168, + "mean_token_accuracy": 0.820149302482605, + "num_tokens": 5294595.0, + "step": 577 + }, + { + "epoch": 0.439209726443769, + "grad_norm": 1.5254042148590088, + "learning_rate": 4.875039131114975e-06, + "loss": 0.3646543622016907, + "mean_token_accuracy": 0.8442583084106445, + "num_tokens": 5304955.0, + "step": 578 + }, + { + "epoch": 0.4399696048632219, + "grad_norm": 1.5751557350158691, + "learning_rate": 4.8743844228301676e-06, + "loss": 0.4854734539985657, + "mean_token_accuracy": 0.8317523002624512, + "num_tokens": 5317351.0, + "step": 579 + }, + { + "epoch": 0.44072948328267475, + "grad_norm": 1.6950466632843018, + "learning_rate": 4.873728048110973e-06, + "loss": 0.5907570719718933, + "mean_token_accuracy": 0.7946986556053162, + "num_tokens": 5332542.0, + "step": 580 + }, + { + "epoch": 0.44148936170212766, + "grad_norm": 2.1180708408355713, + "learning_rate": 4.873070007418059e-06, + "loss": 0.5220296382904053, + "mean_token_accuracy": 0.8037363290786743, + "num_tokens": 5341722.0, + "step": 581 + }, + { + "epoch": 0.44224924012158057, + "grad_norm": 1.3643816709518433, + "learning_rate": 4.872410301213265e-06, + "loss": 0.4865502417087555, + "mean_token_accuracy": 0.8377852439880371, + "num_tokens": 5359359.0, + "step": 582 + }, + { + "epoch": 0.4430091185410334, + "grad_norm": 1.483280897140503, + "learning_rate": 4.871748929959598e-06, + "loss": 0.36856764554977417, + "mean_token_accuracy": 0.8709549903869629, + "num_tokens": 5369749.0, + "step": 583 + }, + { + "epoch": 0.44376899696048633, + "grad_norm": 1.6891541481018066, + "learning_rate": 4.871085894121234e-06, + "loss": 0.5768930912017822, + "mean_token_accuracy": 0.8030461668968201, + "num_tokens": 5383912.0, + "step": 584 + }, + { + "epoch": 0.44452887537993924, + "grad_norm": 2.1318740844726562, + "learning_rate": 4.870421194163515e-06, + "loss": 0.4337100386619568, + "mean_token_accuracy": 0.8562518358230591, + "num_tokens": 5389412.0, + "step": 585 + }, + { + "epoch": 0.4452887537993921, + "grad_norm": 2.540255546569824, + "learning_rate": 4.869754830552956e-06, + "loss": 0.4708256125450134, + "mean_token_accuracy": 0.8446552753448486, + "num_tokens": 5394762.0, + "step": 586 + }, + { + "epoch": 0.446048632218845, + "grad_norm": 2.048015594482422, + "learning_rate": 4.869086803757235e-06, + "loss": 0.5265800952911377, + "mean_token_accuracy": 0.8181137442588806, + "num_tokens": 5402379.0, + "step": 587 + }, + { + "epoch": 0.44680851063829785, + "grad_norm": 2.9821012020111084, + "learning_rate": 4.868417114245199e-06, + "loss": 0.6299797296524048, + "mean_token_accuracy": 0.8237329125404358, + "num_tokens": 5408229.0, + "step": 588 + }, + { + "epoch": 0.44756838905775076, + "grad_norm": 1.7807202339172363, + "learning_rate": 4.867745762486862e-06, + "loss": 0.5176759958267212, + "mean_token_accuracy": 0.8184244632720947, + "num_tokens": 5418383.0, + "step": 589 + }, + { + "epoch": 0.44832826747720367, + "grad_norm": 1.5466399192810059, + "learning_rate": 4.8670727489534035e-06, + "loss": 0.5137228965759277, + "mean_token_accuracy": 0.8365053534507751, + "num_tokens": 5432127.0, + "step": 590 + }, + { + "epoch": 0.4490881458966565, + "grad_norm": 2.9521141052246094, + "learning_rate": 4.866398074117173e-06, + "loss": 0.4056887924671173, + "mean_token_accuracy": 0.8561501502990723, + "num_tokens": 5436062.0, + "step": 591 + }, + { + "epoch": 0.44984802431610943, + "grad_norm": 2.058743953704834, + "learning_rate": 4.86572173845168e-06, + "loss": 0.6124799251556396, + "mean_token_accuracy": 0.8007957339286804, + "num_tokens": 5444989.0, + "step": 592 + }, + { + "epoch": 0.4506079027355623, + "grad_norm": 2.1243767738342285, + "learning_rate": 4.865043742431605e-06, + "loss": 0.5659694671630859, + "mean_token_accuracy": 0.8084750175476074, + "num_tokens": 5453865.0, + "step": 593 + }, + { + "epoch": 0.4513677811550152, + "grad_norm": 1.6732314825057983, + "learning_rate": 4.864364086532792e-06, + "loss": 0.47879064083099365, + "mean_token_accuracy": 0.8346436023712158, + "num_tokens": 5466398.0, + "step": 594 + }, + { + "epoch": 0.4521276595744681, + "grad_norm": 1.3793858289718628, + "learning_rate": 4.863682771232249e-06, + "loss": 0.45989373326301575, + "mean_token_accuracy": 0.8254791498184204, + "num_tokens": 5482121.0, + "step": 595 + }, + { + "epoch": 0.45288753799392095, + "grad_norm": 1.9812315702438354, + "learning_rate": 4.862999797008149e-06, + "loss": 0.5778874754905701, + "mean_token_accuracy": 0.8041508197784424, + "num_tokens": 5493000.0, + "step": 596 + }, + { + "epoch": 0.45364741641337386, + "grad_norm": 3.3065083026885986, + "learning_rate": 4.862315164339829e-06, + "loss": 0.4623975157737732, + "mean_token_accuracy": 0.8426318168640137, + "num_tokens": 5496723.0, + "step": 597 + }, + { + "epoch": 0.45440729483282677, + "grad_norm": 3.167119026184082, + "learning_rate": 4.861628873707792e-06, + "loss": 0.6984533667564392, + "mean_token_accuracy": 0.772136926651001, + "num_tokens": 5501161.0, + "step": 598 + }, + { + "epoch": 0.4551671732522796, + "grad_norm": 2.2130985260009766, + "learning_rate": 4.860940925593703e-06, + "loss": 0.4823192059993744, + "mean_token_accuracy": 0.8462972640991211, + "num_tokens": 5509544.0, + "step": 599 + }, + { + "epoch": 0.45592705167173253, + "grad_norm": 3.029191732406616, + "learning_rate": 4.86025132048039e-06, + "loss": 0.523664116859436, + "mean_token_accuracy": 0.8229140043258667, + "num_tokens": 5514586.0, + "step": 600 + }, + { + "epoch": 0.4566869300911854, + "grad_norm": 1.6983962059020996, + "learning_rate": 4.859560058851844e-06, + "loss": 0.4832698106765747, + "mean_token_accuracy": 0.8403248190879822, + "num_tokens": 5525773.0, + "step": 601 + }, + { + "epoch": 0.4574468085106383, + "grad_norm": 3.0504038333892822, + "learning_rate": 4.8588671411932195e-06, + "loss": 0.5158926248550415, + "mean_token_accuracy": 0.8098392486572266, + "num_tokens": 5529739.0, + "step": 602 + }, + { + "epoch": 0.4582066869300912, + "grad_norm": 2.584836483001709, + "learning_rate": 4.858172567990832e-06, + "loss": 0.5724587440490723, + "mean_token_accuracy": 0.8128519058227539, + "num_tokens": 5535763.0, + "step": 603 + }, + { + "epoch": 0.45896656534954405, + "grad_norm": 2.0514042377471924, + "learning_rate": 4.857476339732162e-06, + "loss": 0.4337679445743561, + "mean_token_accuracy": 0.8405929207801819, + "num_tokens": 5543075.0, + "step": 604 + }, + { + "epoch": 0.45972644376899696, + "grad_norm": 2.2949347496032715, + "learning_rate": 4.856778456905846e-06, + "loss": 0.46532145142555237, + "mean_token_accuracy": 0.8345137238502502, + "num_tokens": 5549035.0, + "step": 605 + }, + { + "epoch": 0.46048632218844987, + "grad_norm": 2.2067551612854004, + "learning_rate": 4.856078920001689e-06, + "loss": 0.5855136513710022, + "mean_token_accuracy": 0.8043795228004456, + "num_tokens": 5555545.0, + "step": 606 + }, + { + "epoch": 0.4612462006079027, + "grad_norm": 2.101945161819458, + "learning_rate": 4.855377729510648e-06, + "loss": 0.6071814298629761, + "mean_token_accuracy": 0.7973253130912781, + "num_tokens": 5563615.0, + "step": 607 + }, + { + "epoch": 0.46200607902735563, + "grad_norm": 2.5958821773529053, + "learning_rate": 4.8546748859248504e-06, + "loss": 0.6278061866760254, + "mean_token_accuracy": 0.7864972352981567, + "num_tokens": 5570078.0, + "step": 608 + }, + { + "epoch": 0.4627659574468085, + "grad_norm": 2.778101921081543, + "learning_rate": 4.853970389737576e-06, + "loss": 0.35521194338798523, + "mean_token_accuracy": 0.8752605319023132, + "num_tokens": 5573995.0, + "step": 609 + }, + { + "epoch": 0.4635258358662614, + "grad_norm": 2.600534677505493, + "learning_rate": 4.8532642414432675e-06, + "loss": 0.6541563868522644, + "mean_token_accuracy": 0.7843613028526306, + "num_tokens": 5580333.0, + "step": 610 + }, + { + "epoch": 0.4642857142857143, + "grad_norm": 1.778337836265564, + "learning_rate": 4.852556441537528e-06, + "loss": 0.3561405837535858, + "mean_token_accuracy": 0.8579353094100952, + "num_tokens": 5588430.0, + "step": 611 + }, + { + "epoch": 0.46504559270516715, + "grad_norm": 1.5653862953186035, + "learning_rate": 4.851846990517118e-06, + "loss": 0.6067906618118286, + "mean_token_accuracy": 0.7919317483901978, + "num_tokens": 5601700.0, + "step": 612 + }, + { + "epoch": 0.46580547112462006, + "grad_norm": 1.6097723245620728, + "learning_rate": 4.851135888879958e-06, + "loss": 0.446664422750473, + "mean_token_accuracy": 0.8441969156265259, + "num_tokens": 5612063.0, + "step": 613 + }, + { + "epoch": 0.46656534954407297, + "grad_norm": 1.961207389831543, + "learning_rate": 4.850423137125126e-06, + "loss": 0.5508605241775513, + "mean_token_accuracy": 0.8240450024604797, + "num_tokens": 5620245.0, + "step": 614 + }, + { + "epoch": 0.4673252279635258, + "grad_norm": 2.2189085483551025, + "learning_rate": 4.8497087357528585e-06, + "loss": 0.6805076599121094, + "mean_token_accuracy": 0.771978497505188, + "num_tokens": 5629590.0, + "step": 615 + }, + { + "epoch": 0.46808510638297873, + "grad_norm": 2.5176279544830322, + "learning_rate": 4.8489926852645505e-06, + "loss": 0.4512156844139099, + "mean_token_accuracy": 0.836459755897522, + "num_tokens": 5635259.0, + "step": 616 + }, + { + "epoch": 0.4688449848024316, + "grad_norm": 1.5327287912368774, + "learning_rate": 4.848274986162754e-06, + "loss": 0.4884302616119385, + "mean_token_accuracy": 0.8194037079811096, + "num_tokens": 5649993.0, + "step": 617 + }, + { + "epoch": 0.4696048632218845, + "grad_norm": 2.184554100036621, + "learning_rate": 4.847555638951177e-06, + "loss": 0.5141451358795166, + "mean_token_accuracy": 0.8245922327041626, + "num_tokens": 5657375.0, + "step": 618 + }, + { + "epoch": 0.4703647416413374, + "grad_norm": 1.6143407821655273, + "learning_rate": 4.846834644134686e-06, + "loss": 0.4276641607284546, + "mean_token_accuracy": 0.8481845855712891, + "num_tokens": 5667941.0, + "step": 619 + }, + { + "epoch": 0.47112462006079026, + "grad_norm": 2.3747270107269287, + "learning_rate": 4.846112002219301e-06, + "loss": 0.5608246922492981, + "mean_token_accuracy": 0.8073011040687561, + "num_tokens": 5675042.0, + "step": 620 + }, + { + "epoch": 0.47188449848024316, + "grad_norm": 2.390404224395752, + "learning_rate": 4.845387713712203e-06, + "loss": 0.46616724133491516, + "mean_token_accuracy": 0.8468319177627563, + "num_tokens": 5680207.0, + "step": 621 + }, + { + "epoch": 0.4726443768996961, + "grad_norm": 1.7245099544525146, + "learning_rate": 4.844661779121723e-06, + "loss": 0.5652435421943665, + "mean_token_accuracy": 0.8010749816894531, + "num_tokens": 5693759.0, + "step": 622 + }, + { + "epoch": 0.4734042553191489, + "grad_norm": 2.6923108100891113, + "learning_rate": 4.843934198957351e-06, + "loss": 0.6254661679267883, + "mean_token_accuracy": 0.8236024975776672, + "num_tokens": 5699916.0, + "step": 623 + }, + { + "epoch": 0.47416413373860183, + "grad_norm": 2.516901969909668, + "learning_rate": 4.84320497372973e-06, + "loss": 0.6334252953529358, + "mean_token_accuracy": 0.7803834676742554, + "num_tokens": 5706554.0, + "step": 624 + }, + { + "epoch": 0.4749240121580547, + "grad_norm": 2.3744447231292725, + "learning_rate": 4.842474103950658e-06, + "loss": 0.4221811890602112, + "mean_token_accuracy": 0.8639545440673828, + "num_tokens": 5711756.0, + "step": 625 + }, + { + "epoch": 0.4756838905775076, + "grad_norm": 3.2373476028442383, + "learning_rate": 4.841741590133089e-06, + "loss": 0.6637828946113586, + "mean_token_accuracy": 0.7968347072601318, + "num_tokens": 5716458.0, + "step": 626 + }, + { + "epoch": 0.4764437689969605, + "grad_norm": 2.153888463973999, + "learning_rate": 4.841007432791129e-06, + "loss": 0.4877486228942871, + "mean_token_accuracy": 0.8345249891281128, + "num_tokens": 5723155.0, + "step": 627 + }, + { + "epoch": 0.47720364741641336, + "grad_norm": 2.120497703552246, + "learning_rate": 4.8402716324400375e-06, + "loss": 0.37323033809661865, + "mean_token_accuracy": 0.8734050393104553, + "num_tokens": 5729171.0, + "step": 628 + }, + { + "epoch": 0.47796352583586627, + "grad_norm": 1.5294172763824463, + "learning_rate": 4.839534189596228e-06, + "loss": 0.4057067334651947, + "mean_token_accuracy": 0.8523319959640503, + "num_tokens": 5740112.0, + "step": 629 + }, + { + "epoch": 0.4787234042553192, + "grad_norm": 2.1913886070251465, + "learning_rate": 4.8387951047772656e-06, + "loss": 0.4835960865020752, + "mean_token_accuracy": 0.8438145518302917, + "num_tokens": 5746838.0, + "step": 630 + }, + { + "epoch": 0.479483282674772, + "grad_norm": 1.482897162437439, + "learning_rate": 4.838054378501868e-06, + "loss": 0.46967992186546326, + "mean_token_accuracy": 0.8315759897232056, + "num_tokens": 5760428.0, + "step": 631 + }, + { + "epoch": 0.48024316109422494, + "grad_norm": 1.38850998878479, + "learning_rate": 4.837312011289907e-06, + "loss": 0.41845446825027466, + "mean_token_accuracy": 0.8557186126708984, + "num_tokens": 5773437.0, + "step": 632 + }, + { + "epoch": 0.4810030395136778, + "grad_norm": 3.8337457180023193, + "learning_rate": 4.836568003662403e-06, + "loss": 0.5102912187576294, + "mean_token_accuracy": 0.830644965171814, + "num_tokens": 5776367.0, + "step": 633 + }, + { + "epoch": 0.4817629179331307, + "grad_norm": 1.2084007263183594, + "learning_rate": 4.8358223561415304e-06, + "loss": 0.3835333585739136, + "mean_token_accuracy": 0.8639016151428223, + "num_tokens": 5792246.0, + "step": 634 + }, + { + "epoch": 0.4825227963525836, + "grad_norm": 1.939408540725708, + "learning_rate": 4.835075069250613e-06, + "loss": 0.4044850468635559, + "mean_token_accuracy": 0.8488376140594482, + "num_tokens": 5799853.0, + "step": 635 + }, + { + "epoch": 0.48328267477203646, + "grad_norm": 1.345870852470398, + "learning_rate": 4.8343261435141245e-06, + "loss": 0.46660199761390686, + "mean_token_accuracy": 0.8371681571006775, + "num_tokens": 5817478.0, + "step": 636 + }, + { + "epoch": 0.48404255319148937, + "grad_norm": 1.6531339883804321, + "learning_rate": 4.833575579457691e-06, + "loss": 0.3886989951133728, + "mean_token_accuracy": 0.8763507008552551, + "num_tokens": 5825739.0, + "step": 637 + }, + { + "epoch": 0.4848024316109423, + "grad_norm": 1.6443969011306763, + "learning_rate": 4.832823377608088e-06, + "loss": 0.4070289731025696, + "mean_token_accuracy": 0.8586630821228027, + "num_tokens": 5837917.0, + "step": 638 + }, + { + "epoch": 0.48556231003039513, + "grad_norm": 2.005136013031006, + "learning_rate": 4.832069538493237e-06, + "loss": 0.40616685152053833, + "mean_token_accuracy": 0.8571510314941406, + "num_tokens": 5845250.0, + "step": 639 + }, + { + "epoch": 0.48632218844984804, + "grad_norm": 1.5244266986846924, + "learning_rate": 4.831314062642213e-06, + "loss": 0.49530288577079773, + "mean_token_accuracy": 0.8328841924667358, + "num_tokens": 5857407.0, + "step": 640 + }, + { + "epoch": 0.4870820668693009, + "grad_norm": 1.9876971244812012, + "learning_rate": 4.830556950585239e-06, + "loss": 0.4583776593208313, + "mean_token_accuracy": 0.8427221179008484, + "num_tokens": 5865391.0, + "step": 641 + }, + { + "epoch": 0.4878419452887538, + "grad_norm": 3.023336172103882, + "learning_rate": 4.829798202853683e-06, + "loss": 0.6134771108627319, + "mean_token_accuracy": 0.7981935739517212, + "num_tokens": 5870729.0, + "step": 642 + }, + { + "epoch": 0.4886018237082067, + "grad_norm": 1.8889515399932861, + "learning_rate": 4.829037819980065e-06, + "loss": 0.4420135021209717, + "mean_token_accuracy": 0.8480775356292725, + "num_tokens": 5878982.0, + "step": 643 + }, + { + "epoch": 0.48936170212765956, + "grad_norm": 2.2408435344696045, + "learning_rate": 4.828275802498051e-06, + "loss": 0.525706946849823, + "mean_token_accuracy": 0.8271557092666626, + "num_tokens": 5885097.0, + "step": 644 + }, + { + "epoch": 0.49012158054711247, + "grad_norm": 1.9734224081039429, + "learning_rate": 4.827512150942454e-06, + "loss": 0.44246578216552734, + "mean_token_accuracy": 0.8456668257713318, + "num_tokens": 5893941.0, + "step": 645 + }, + { + "epoch": 0.4908814589665654, + "grad_norm": 1.9618173837661743, + "learning_rate": 4.8267468658492335e-06, + "loss": 0.5119768381118774, + "mean_token_accuracy": 0.8355510830879211, + "num_tokens": 5902829.0, + "step": 646 + }, + { + "epoch": 0.49164133738601823, + "grad_norm": 1.7181587219238281, + "learning_rate": 4.825979947755496e-06, + "loss": 0.5666520595550537, + "mean_token_accuracy": 0.7951971888542175, + "num_tokens": 5915212.0, + "step": 647 + }, + { + "epoch": 0.49240121580547114, + "grad_norm": 3.0121164321899414, + "learning_rate": 4.8252113971994955e-06, + "loss": 0.628632128238678, + "mean_token_accuracy": 0.8041050434112549, + "num_tokens": 5921410.0, + "step": 648 + }, + { + "epoch": 0.493161094224924, + "grad_norm": 2.9980475902557373, + "learning_rate": 4.824441214720629e-06, + "loss": 0.4507424831390381, + "mean_token_accuracy": 0.8636263608932495, + "num_tokens": 5925179.0, + "step": 649 + }, + { + "epoch": 0.4939209726443769, + "grad_norm": 2.0096445083618164, + "learning_rate": 4.823669400859441e-06, + "loss": 0.602759838104248, + "mean_token_accuracy": 0.8104915618896484, + "num_tokens": 5934160.0, + "step": 650 + }, + { + "epoch": 0.4946808510638298, + "grad_norm": 1.1186442375183105, + "learning_rate": 4.8228959561576195e-06, + "loss": 0.41168469190597534, + "mean_token_accuracy": 0.8461419939994812, + "num_tokens": 5954163.0, + "step": 651 + }, + { + "epoch": 0.49544072948328266, + "grad_norm": 1.855465054512024, + "learning_rate": 4.822120881157998e-06, + "loss": 0.5049735307693481, + "mean_token_accuracy": 0.8225747346878052, + "num_tokens": 5963840.0, + "step": 652 + }, + { + "epoch": 0.49620060790273557, + "grad_norm": 3.550563335418701, + "learning_rate": 4.821344176404554e-06, + "loss": 0.49025264382362366, + "mean_token_accuracy": 0.8265978693962097, + "num_tokens": 5967358.0, + "step": 653 + }, + { + "epoch": 0.4969604863221885, + "grad_norm": 3.063910484313965, + "learning_rate": 4.820565842442408e-06, + "loss": 0.5652767419815063, + "mean_token_accuracy": 0.811700701713562, + "num_tokens": 5971858.0, + "step": 654 + }, + { + "epoch": 0.49772036474164133, + "grad_norm": 2.4613308906555176, + "learning_rate": 4.819785879817827e-06, + "loss": 0.5296125411987305, + "mean_token_accuracy": 0.8336488008499146, + "num_tokens": 5977442.0, + "step": 655 + }, + { + "epoch": 0.49848024316109424, + "grad_norm": 2.342519760131836, + "learning_rate": 4.819004289078217e-06, + "loss": 0.5753380060195923, + "mean_token_accuracy": 0.7922406792640686, + "num_tokens": 5984531.0, + "step": 656 + }, + { + "epoch": 0.4992401215805471, + "grad_norm": 2.0410680770874023, + "learning_rate": 4.818221070772129e-06, + "loss": 0.5433275699615479, + "mean_token_accuracy": 0.8043830990791321, + "num_tokens": 5992642.0, + "step": 657 + }, + { + "epoch": 0.5, + "grad_norm": 1.4999698400497437, + "learning_rate": 4.8174362254492555e-06, + "loss": 0.5248899459838867, + "mean_token_accuracy": 0.8107168674468994, + "num_tokens": 6005543.0, + "step": 658 + }, + { + "epoch": 0.5007598784194529, + "grad_norm": 1.9494401216506958, + "learning_rate": 4.816649753660431e-06, + "loss": 0.41291385889053345, + "mean_token_accuracy": 0.8650569915771484, + "num_tokens": 6012185.0, + "step": 659 + }, + { + "epoch": 0.5015197568389058, + "grad_norm": 2.7514095306396484, + "learning_rate": 4.815861655957632e-06, + "loss": 0.4244142770767212, + "mean_token_accuracy": 0.8485112190246582, + "num_tokens": 6016809.0, + "step": 660 + }, + { + "epoch": 0.5022796352583586, + "grad_norm": 1.4354928731918335, + "learning_rate": 4.815071932893976e-06, + "loss": 0.4332060217857361, + "mean_token_accuracy": 0.8386815786361694, + "num_tokens": 6034795.0, + "step": 661 + }, + { + "epoch": 0.5030395136778115, + "grad_norm": 1.3113417625427246, + "learning_rate": 4.81428058502372e-06, + "loss": 0.5415540933609009, + "mean_token_accuracy": 0.8115285038948059, + "num_tokens": 6053624.0, + "step": 662 + }, + { + "epoch": 0.5037993920972644, + "grad_norm": 1.820868730545044, + "learning_rate": 4.813487612902265e-06, + "loss": 0.5360245108604431, + "mean_token_accuracy": 0.8313555717468262, + "num_tokens": 6063399.0, + "step": 663 + }, + { + "epoch": 0.5045592705167173, + "grad_norm": 2.347001552581787, + "learning_rate": 4.812693017086145e-06, + "loss": 0.4926982820034027, + "mean_token_accuracy": 0.8137006759643555, + "num_tokens": 6070111.0, + "step": 664 + }, + { + "epoch": 0.5053191489361702, + "grad_norm": 1.8830888271331787, + "learning_rate": 4.811896798133042e-06, + "loss": 0.5419014692306519, + "mean_token_accuracy": 0.8027454614639282, + "num_tokens": 6081090.0, + "step": 665 + }, + { + "epoch": 0.506079027355623, + "grad_norm": 2.3258056640625, + "learning_rate": 4.811098956601772e-06, + "loss": 0.4629337787628174, + "mean_token_accuracy": 0.8416580557823181, + "num_tokens": 6087921.0, + "step": 666 + }, + { + "epoch": 0.506838905775076, + "grad_norm": 1.9578291177749634, + "learning_rate": 4.810299493052289e-06, + "loss": 0.40305402874946594, + "mean_token_accuracy": 0.8529061079025269, + "num_tokens": 6100034.0, + "step": 667 + }, + { + "epoch": 0.5075987841945289, + "grad_norm": 2.800635576248169, + "learning_rate": 4.809498408045691e-06, + "loss": 0.5087342262268066, + "mean_token_accuracy": 0.8214689493179321, + "num_tokens": 6104742.0, + "step": 668 + }, + { + "epoch": 0.5083586626139818, + "grad_norm": 1.5318149328231812, + "learning_rate": 4.808695702144206e-06, + "loss": 0.4733222723007202, + "mean_token_accuracy": 0.837577223777771, + "num_tokens": 6117242.0, + "step": 669 + }, + { + "epoch": 0.5091185410334347, + "grad_norm": 1.2368661165237427, + "learning_rate": 4.807891375911207e-06, + "loss": 0.3929097056388855, + "mean_token_accuracy": 0.8331400752067566, + "num_tokens": 6133509.0, + "step": 670 + }, + { + "epoch": 0.5098784194528876, + "grad_norm": 2.4711415767669678, + "learning_rate": 4.8070854299112e-06, + "loss": 0.6294851303100586, + "mean_token_accuracy": 0.7956781983375549, + "num_tokens": 6140294.0, + "step": 671 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 2.590961217880249, + "learning_rate": 4.806277864709828e-06, + "loss": 0.580160915851593, + "mean_token_accuracy": 0.809589684009552, + "num_tokens": 6145803.0, + "step": 672 + }, + { + "epoch": 0.5113981762917933, + "grad_norm": 2.4653842449188232, + "learning_rate": 4.805468680873874e-06, + "loss": 0.5262120366096497, + "mean_token_accuracy": 0.822458803653717, + "num_tokens": 6151236.0, + "step": 673 + }, + { + "epoch": 0.5121580547112462, + "grad_norm": 2.860720157623291, + "learning_rate": 4.804657878971252e-06, + "loss": 0.4007391035556793, + "mean_token_accuracy": 0.8637382984161377, + "num_tokens": 6155310.0, + "step": 674 + }, + { + "epoch": 0.5129179331306991, + "grad_norm": 2.520282030105591, + "learning_rate": 4.803845459571014e-06, + "loss": 0.45798182487487793, + "mean_token_accuracy": 0.8270114660263062, + "num_tokens": 6160326.0, + "step": 675 + }, + { + "epoch": 0.513677811550152, + "grad_norm": 2.7290921211242676, + "learning_rate": 4.803031423243349e-06, + "loss": 0.5745848417282104, + "mean_token_accuracy": 0.8401234745979309, + "num_tokens": 6165709.0, + "step": 676 + }, + { + "epoch": 0.5144376899696048, + "grad_norm": 1.6678650379180908, + "learning_rate": 4.802215770559578e-06, + "loss": 0.5257721543312073, + "mean_token_accuracy": 0.8241991996765137, + "num_tokens": 6177875.0, + "step": 677 + }, + { + "epoch": 0.5151975683890577, + "grad_norm": 2.1720468997955322, + "learning_rate": 4.801398502092156e-06, + "loss": 0.45342206954956055, + "mean_token_accuracy": 0.8463799953460693, + "num_tokens": 6185415.0, + "step": 678 + }, + { + "epoch": 0.5159574468085106, + "grad_norm": 2.282259702682495, + "learning_rate": 4.800579618414677e-06, + "loss": 0.4864169955253601, + "mean_token_accuracy": 0.8300632238388062, + "num_tokens": 6191832.0, + "step": 679 + }, + { + "epoch": 0.5167173252279635, + "grad_norm": 2.0092248916625977, + "learning_rate": 4.799759120101861e-06, + "loss": 0.5781463980674744, + "mean_token_accuracy": 0.8267031908035278, + "num_tokens": 6199440.0, + "step": 680 + }, + { + "epoch": 0.5174772036474165, + "grad_norm": 1.396580696105957, + "learning_rate": 4.798937007729568e-06, + "loss": 0.49689239263534546, + "mean_token_accuracy": 0.8257499933242798, + "num_tokens": 6213840.0, + "step": 681 + }, + { + "epoch": 0.5182370820668692, + "grad_norm": 1.9060769081115723, + "learning_rate": 4.798113281874788e-06, + "loss": 0.48969539999961853, + "mean_token_accuracy": 0.8171790838241577, + "num_tokens": 6223006.0, + "step": 682 + }, + { + "epoch": 0.5189969604863222, + "grad_norm": 1.6255282163619995, + "learning_rate": 4.797287943115642e-06, + "loss": 0.5532330870628357, + "mean_token_accuracy": 0.8173393607139587, + "num_tokens": 6234857.0, + "step": 683 + }, + { + "epoch": 0.5197568389057751, + "grad_norm": 1.6923905611038208, + "learning_rate": 4.796460992031386e-06, + "loss": 0.4880887269973755, + "mean_token_accuracy": 0.834983229637146, + "num_tokens": 6245252.0, + "step": 684 + }, + { + "epoch": 0.520516717325228, + "grad_norm": 2.13161301612854, + "learning_rate": 4.7956324292024045e-06, + "loss": 0.5687593817710876, + "mean_token_accuracy": 0.7996571063995361, + "num_tokens": 6253726.0, + "step": 685 + }, + { + "epoch": 0.5212765957446809, + "grad_norm": 2.509375810623169, + "learning_rate": 4.794802255210217e-06, + "loss": 0.5396929979324341, + "mean_token_accuracy": 0.8007107973098755, + "num_tokens": 6259238.0, + "step": 686 + }, + { + "epoch": 0.5220364741641338, + "grad_norm": 2.393710136413574, + "learning_rate": 4.793970470637469e-06, + "loss": 0.6165191531181335, + "mean_token_accuracy": 0.7891418933868408, + "num_tokens": 6266325.0, + "step": 687 + }, + { + "epoch": 0.5227963525835866, + "grad_norm": 1.511647343635559, + "learning_rate": 4.7931370760679415e-06, + "loss": 0.4773876965045929, + "mean_token_accuracy": 0.8381044864654541, + "num_tokens": 6277447.0, + "step": 688 + }, + { + "epoch": 0.5235562310030395, + "grad_norm": 2.206587314605713, + "learning_rate": 4.792302072086542e-06, + "loss": 0.5482058525085449, + "mean_token_accuracy": 0.8239108920097351, + "num_tokens": 6285163.0, + "step": 689 + }, + { + "epoch": 0.5243161094224924, + "grad_norm": 3.018146514892578, + "learning_rate": 4.7914654592793065e-06, + "loss": 0.4880615472793579, + "mean_token_accuracy": 0.8361308574676514, + "num_tokens": 6289386.0, + "step": 690 + }, + { + "epoch": 0.5250759878419453, + "grad_norm": 1.6469231843948364, + "learning_rate": 4.790627238233405e-06, + "loss": 0.4164774715900421, + "mean_token_accuracy": 0.8496290445327759, + "num_tokens": 6298915.0, + "step": 691 + }, + { + "epoch": 0.5258358662613982, + "grad_norm": 2.352505922317505, + "learning_rate": 4.789787409537131e-06, + "loss": 0.5366303324699402, + "mean_token_accuracy": 0.8350417613983154, + "num_tokens": 6306130.0, + "step": 692 + }, + { + "epoch": 0.526595744680851, + "grad_norm": 1.7463021278381348, + "learning_rate": 4.7889459737799105e-06, + "loss": 0.4389137923717499, + "mean_token_accuracy": 0.8463300466537476, + "num_tokens": 6315503.0, + "step": 693 + }, + { + "epoch": 0.5273556231003039, + "grad_norm": 2.257706642150879, + "learning_rate": 4.788102931552294e-06, + "loss": 0.5309344530105591, + "mean_token_accuracy": 0.8164352178573608, + "num_tokens": 6321852.0, + "step": 694 + }, + { + "epoch": 0.5281155015197568, + "grad_norm": 2.392732620239258, + "learning_rate": 4.787258283445962e-06, + "loss": 0.3956204056739807, + "mean_token_accuracy": 0.8671456575393677, + "num_tokens": 6327380.0, + "step": 695 + }, + { + "epoch": 0.5288753799392097, + "grad_norm": 2.210514545440674, + "learning_rate": 4.786412030053721e-06, + "loss": 0.4842875003814697, + "mean_token_accuracy": 0.8508446216583252, + "num_tokens": 6334898.0, + "step": 696 + }, + { + "epoch": 0.5296352583586627, + "grad_norm": 1.8678946495056152, + "learning_rate": 4.785564171969503e-06, + "loss": 0.47399595379829407, + "mean_token_accuracy": 0.8514996767044067, + "num_tokens": 6346374.0, + "step": 697 + }, + { + "epoch": 0.5303951367781155, + "grad_norm": 2.604079484939575, + "learning_rate": 4.784714709788368e-06, + "loss": 0.5950228571891785, + "mean_token_accuracy": 0.7983481884002686, + "num_tokens": 6351648.0, + "step": 698 + }, + { + "epoch": 0.5311550151975684, + "grad_norm": 1.662381649017334, + "learning_rate": 4.783863644106502e-06, + "loss": 0.41616758704185486, + "mean_token_accuracy": 0.8554803133010864, + "num_tokens": 6360506.0, + "step": 699 + }, + { + "epoch": 0.5319148936170213, + "grad_norm": 1.6300342082977295, + "learning_rate": 4.783010975521216e-06, + "loss": 0.43029269576072693, + "mean_token_accuracy": 0.8443028926849365, + "num_tokens": 6370675.0, + "step": 700 + }, + { + "epoch": 0.5326747720364742, + "grad_norm": 1.731873869895935, + "learning_rate": 4.782156704630944e-06, + "loss": 0.4383814334869385, + "mean_token_accuracy": 0.8443183898925781, + "num_tokens": 6381803.0, + "step": 701 + }, + { + "epoch": 0.5334346504559271, + "grad_norm": 3.1788413524627686, + "learning_rate": 4.7813008320352475e-06, + "loss": 0.32194480299949646, + "mean_token_accuracy": 0.8870962858200073, + "num_tokens": 6389263.0, + "step": 702 + }, + { + "epoch": 0.53419452887538, + "grad_norm": 2.099513530731201, + "learning_rate": 4.78044335833481e-06, + "loss": 0.36962923407554626, + "mean_token_accuracy": 0.8661133646965027, + "num_tokens": 6395589.0, + "step": 703 + }, + { + "epoch": 0.5349544072948328, + "grad_norm": 1.4859435558319092, + "learning_rate": 4.77958428413144e-06, + "loss": 0.4619954824447632, + "mean_token_accuracy": 0.8438555002212524, + "num_tokens": 6407470.0, + "step": 704 + }, + { + "epoch": 0.5357142857142857, + "grad_norm": 1.2561073303222656, + "learning_rate": 4.7787236100280685e-06, + "loss": 0.3770977258682251, + "mean_token_accuracy": 0.8515733480453491, + "num_tokens": 6422888.0, + "step": 705 + }, + { + "epoch": 0.5364741641337386, + "grad_norm": 1.4455817937850952, + "learning_rate": 4.777861336628751e-06, + "loss": 0.46481069922447205, + "mean_token_accuracy": 0.8502002954483032, + "num_tokens": 6441266.0, + "step": 706 + }, + { + "epoch": 0.5372340425531915, + "grad_norm": 1.1387295722961426, + "learning_rate": 4.7769974645386616e-06, + "loss": 0.36964765191078186, + "mean_token_accuracy": 0.8719524145126343, + "num_tokens": 6463686.0, + "step": 707 + }, + { + "epoch": 0.5379939209726444, + "grad_norm": 1.7179663181304932, + "learning_rate": 4.776131994364102e-06, + "loss": 0.4231719970703125, + "mean_token_accuracy": 0.8416585922241211, + "num_tokens": 6472956.0, + "step": 708 + }, + { + "epoch": 0.5387537993920972, + "grad_norm": 1.6328502893447876, + "learning_rate": 4.775264926712489e-06, + "loss": 0.5836569666862488, + "mean_token_accuracy": 0.8039724230766296, + "num_tokens": 6485773.0, + "step": 709 + }, + { + "epoch": 0.5395136778115501, + "grad_norm": 1.8515360355377197, + "learning_rate": 4.774396262192368e-06, + "loss": 0.5477553009986877, + "mean_token_accuracy": 0.8136521577835083, + "num_tokens": 6496379.0, + "step": 710 + }, + { + "epoch": 0.540273556231003, + "grad_norm": 1.741858959197998, + "learning_rate": 4.7735260014133986e-06, + "loss": 0.4663267731666565, + "mean_token_accuracy": 0.8473691940307617, + "num_tokens": 6507652.0, + "step": 711 + }, + { + "epoch": 0.541033434650456, + "grad_norm": 1.7516659498214722, + "learning_rate": 4.772654144986364e-06, + "loss": 0.374914288520813, + "mean_token_accuracy": 0.8600220680236816, + "num_tokens": 6519030.0, + "step": 712 + }, + { + "epoch": 0.5417933130699089, + "grad_norm": 2.662343978881836, + "learning_rate": 4.7717806935231665e-06, + "loss": 0.4206875264644623, + "mean_token_accuracy": 0.8544126749038696, + "num_tokens": 6523669.0, + "step": 713 + }, + { + "epoch": 0.5425531914893617, + "grad_norm": 1.4088834524154663, + "learning_rate": 4.770905647636828e-06, + "loss": 0.5824331045150757, + "mean_token_accuracy": 0.7857901453971863, + "num_tokens": 6540560.0, + "step": 714 + }, + { + "epoch": 0.5433130699088146, + "grad_norm": 2.173656940460205, + "learning_rate": 4.77002900794149e-06, + "loss": 0.555023729801178, + "mean_token_accuracy": 0.8067290782928467, + "num_tokens": 6548946.0, + "step": 715 + }, + { + "epoch": 0.5440729483282675, + "grad_norm": 2.121018648147583, + "learning_rate": 4.769150775052411e-06, + "loss": 0.559730052947998, + "mean_token_accuracy": 0.8166372776031494, + "num_tokens": 6556065.0, + "step": 716 + }, + { + "epoch": 0.5448328267477204, + "grad_norm": 3.335866928100586, + "learning_rate": 4.768270949585968e-06, + "loss": 0.6442267894744873, + "mean_token_accuracy": 0.7858607769012451, + "num_tokens": 6560615.0, + "step": 717 + }, + { + "epoch": 0.5455927051671733, + "grad_norm": 2.3813695907592773, + "learning_rate": 4.767389532159659e-06, + "loss": 0.4027421474456787, + "mean_token_accuracy": 0.8635619282722473, + "num_tokens": 6565841.0, + "step": 718 + }, + { + "epoch": 0.5463525835866262, + "grad_norm": 2.0657708644866943, + "learning_rate": 4.766506523392095e-06, + "loss": 0.38899827003479004, + "mean_token_accuracy": 0.8660480380058289, + "num_tokens": 6572362.0, + "step": 719 + }, + { + "epoch": 0.547112462006079, + "grad_norm": 1.093705415725708, + "learning_rate": 4.765621923903005e-06, + "loss": 0.45967352390289307, + "mean_token_accuracy": 0.8338102102279663, + "num_tokens": 6595998.0, + "step": 720 + }, + { + "epoch": 0.5478723404255319, + "grad_norm": 2.942065954208374, + "learning_rate": 4.764735734313236e-06, + "loss": 0.42910510301589966, + "mean_token_accuracy": 0.8406122922897339, + "num_tokens": 6601075.0, + "step": 721 + }, + { + "epoch": 0.5486322188449848, + "grad_norm": 2.049011707305908, + "learning_rate": 4.763847955244749e-06, + "loss": 0.5584231615066528, + "mean_token_accuracy": 0.8171684741973877, + "num_tokens": 6609310.0, + "step": 722 + }, + { + "epoch": 0.5493920972644377, + "grad_norm": 2.485543966293335, + "learning_rate": 4.762958587320623e-06, + "loss": 0.5396170020103455, + "mean_token_accuracy": 0.8158525824546814, + "num_tokens": 6616185.0, + "step": 723 + }, + { + "epoch": 0.5501519756838906, + "grad_norm": 1.87015962600708, + "learning_rate": 4.762067631165049e-06, + "loss": 0.49739527702331543, + "mean_token_accuracy": 0.8303765654563904, + "num_tokens": 6625629.0, + "step": 724 + }, + { + "epoch": 0.5509118541033434, + "grad_norm": 4.239654541015625, + "learning_rate": 4.761175087403336e-06, + "loss": 0.6029239296913147, + "mean_token_accuracy": 0.8123486042022705, + "num_tokens": 6629194.0, + "step": 725 + }, + { + "epoch": 0.5516717325227963, + "grad_norm": 2.0134730339050293, + "learning_rate": 4.760280956661904e-06, + "loss": 0.4777873754501343, + "mean_token_accuracy": 0.8283513784408569, + "num_tokens": 6636929.0, + "step": 726 + }, + { + "epoch": 0.5524316109422492, + "grad_norm": 1.991780400276184, + "learning_rate": 4.75938523956829e-06, + "loss": 0.4631248116493225, + "mean_token_accuracy": 0.8275107741355896, + "num_tokens": 6645135.0, + "step": 727 + }, + { + "epoch": 0.5531914893617021, + "grad_norm": 1.423792839050293, + "learning_rate": 4.75848793675114e-06, + "loss": 0.49630722403526306, + "mean_token_accuracy": 0.8388000130653381, + "num_tokens": 6662690.0, + "step": 728 + }, + { + "epoch": 0.5539513677811551, + "grad_norm": 2.345294952392578, + "learning_rate": 4.757589048840219e-06, + "loss": 0.37830638885498047, + "mean_token_accuracy": 0.8782080411911011, + "num_tokens": 6667285.0, + "step": 729 + }, + { + "epoch": 0.5547112462006079, + "grad_norm": 2.7452144622802734, + "learning_rate": 4.756688576466398e-06, + "loss": 0.51595538854599, + "mean_token_accuracy": 0.8441770672798157, + "num_tokens": 6672324.0, + "step": 730 + }, + { + "epoch": 0.5554711246200608, + "grad_norm": 1.5247859954833984, + "learning_rate": 4.755786520261666e-06, + "loss": 0.48365193605422974, + "mean_token_accuracy": 0.8276445269584656, + "num_tokens": 6685296.0, + "step": 731 + }, + { + "epoch": 0.5562310030395137, + "grad_norm": 1.4018276929855347, + "learning_rate": 4.75488288085912e-06, + "loss": 0.3876481354236603, + "mean_token_accuracy": 0.8612343072891235, + "num_tokens": 6697515.0, + "step": 732 + }, + { + "epoch": 0.5569908814589666, + "grad_norm": 2.9570324420928955, + "learning_rate": 4.753977658892967e-06, + "loss": 0.5468149185180664, + "mean_token_accuracy": 0.8054271340370178, + "num_tokens": 6702194.0, + "step": 733 + }, + { + "epoch": 0.5577507598784195, + "grad_norm": 1.9282715320587158, + "learning_rate": 4.753070854998529e-06, + "loss": 0.4758574962615967, + "mean_token_accuracy": 0.8379775285720825, + "num_tokens": 6709938.0, + "step": 734 + }, + { + "epoch": 0.5585106382978723, + "grad_norm": 1.981264591217041, + "learning_rate": 4.752162469812234e-06, + "loss": 0.48461222648620605, + "mean_token_accuracy": 0.833509087562561, + "num_tokens": 6718125.0, + "step": 735 + }, + { + "epoch": 0.5592705167173252, + "grad_norm": 1.1643427610397339, + "learning_rate": 4.751252503971624e-06, + "loss": 0.410121887922287, + "mean_token_accuracy": 0.8221402764320374, + "num_tokens": 6735125.0, + "step": 736 + }, + { + "epoch": 0.5600303951367781, + "grad_norm": 1.786566972732544, + "learning_rate": 4.750340958115346e-06, + "loss": 0.5964341163635254, + "mean_token_accuracy": 0.8038164377212524, + "num_tokens": 6747369.0, + "step": 737 + }, + { + "epoch": 0.560790273556231, + "grad_norm": 1.7256991863250732, + "learning_rate": 4.749427832883158e-06, + "loss": 0.48737066984176636, + "mean_token_accuracy": 0.830894947052002, + "num_tokens": 6758115.0, + "step": 738 + }, + { + "epoch": 0.5615501519756839, + "grad_norm": 1.997747540473938, + "learning_rate": 4.748513128915928e-06, + "loss": 0.5238886475563049, + "mean_token_accuracy": 0.8066858053207397, + "num_tokens": 6766111.0, + "step": 739 + }, + { + "epoch": 0.5623100303951368, + "grad_norm": 2.127016305923462, + "learning_rate": 4.747596846855629e-06, + "loss": 0.5045586228370667, + "mean_token_accuracy": 0.821424126625061, + "num_tokens": 6772893.0, + "step": 740 + }, + { + "epoch": 0.5630699088145896, + "grad_norm": 1.7664796113967896, + "learning_rate": 4.7466789873453446e-06, + "loss": 0.42954835295677185, + "mean_token_accuracy": 0.8533384799957275, + "num_tokens": 6785133.0, + "step": 741 + }, + { + "epoch": 0.5638297872340425, + "grad_norm": 1.4987404346466064, + "learning_rate": 4.7457595510292615e-06, + "loss": 0.5378558039665222, + "mean_token_accuracy": 0.8184819221496582, + "num_tokens": 6799563.0, + "step": 742 + }, + { + "epoch": 0.5645896656534954, + "grad_norm": 1.4444655179977417, + "learning_rate": 4.744838538552678e-06, + "loss": 0.42193782329559326, + "mean_token_accuracy": 0.837514340877533, + "num_tokens": 6812470.0, + "step": 743 + }, + { + "epoch": 0.5653495440729484, + "grad_norm": 3.867751121520996, + "learning_rate": 4.7439159505619946e-06, + "loss": 0.4457814693450928, + "mean_token_accuracy": 0.8630104660987854, + "num_tokens": 6815652.0, + "step": 744 + }, + { + "epoch": 0.5661094224924013, + "grad_norm": 2.1250710487365723, + "learning_rate": 4.74299178770472e-06, + "loss": 0.5638922452926636, + "mean_token_accuracy": 0.7969781160354614, + "num_tokens": 6824566.0, + "step": 745 + }, + { + "epoch": 0.5668693009118541, + "grad_norm": 2.547072410583496, + "learning_rate": 4.742066050629465e-06, + "loss": 0.5516207814216614, + "mean_token_accuracy": 0.8160669803619385, + "num_tokens": 6830589.0, + "step": 746 + }, + { + "epoch": 0.567629179331307, + "grad_norm": 1.2975233793258667, + "learning_rate": 4.741138739985951e-06, + "loss": 0.3823344111442566, + "mean_token_accuracy": 0.8668368458747864, + "num_tokens": 6842707.0, + "step": 747 + }, + { + "epoch": 0.5683890577507599, + "grad_norm": 1.3410450220108032, + "learning_rate": 4.740209856424998e-06, + "loss": 0.5148671269416809, + "mean_token_accuracy": 0.8188045024871826, + "num_tokens": 6857624.0, + "step": 748 + }, + { + "epoch": 0.5691489361702128, + "grad_norm": 1.219467282295227, + "learning_rate": 4.7392794005985324e-06, + "loss": 0.3998957872390747, + "mean_token_accuracy": 0.855175256729126, + "num_tokens": 6875064.0, + "step": 749 + }, + { + "epoch": 0.5699088145896657, + "grad_norm": 1.3530343770980835, + "learning_rate": 4.738347373159585e-06, + "loss": 0.5359633564949036, + "mean_token_accuracy": 0.8178457021713257, + "num_tokens": 6890911.0, + "step": 750 + }, + { + "epoch": 0.5706686930091185, + "grad_norm": 2.146988868713379, + "learning_rate": 4.737413774762287e-06, + "loss": 0.4460008144378662, + "mean_token_accuracy": 0.8172903060913086, + "num_tokens": 6896959.0, + "step": 751 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 1.456023097038269, + "learning_rate": 4.736478606061876e-06, + "loss": 0.43616920709609985, + "mean_token_accuracy": 0.8465108871459961, + "num_tokens": 6908904.0, + "step": 752 + }, + { + "epoch": 0.5721884498480243, + "grad_norm": 2.9696967601776123, + "learning_rate": 4.735541867714687e-06, + "loss": 0.43464532494544983, + "mean_token_accuracy": 0.8608652353286743, + "num_tokens": 6913026.0, + "step": 753 + }, + { + "epoch": 0.5729483282674772, + "grad_norm": 2.2990667819976807, + "learning_rate": 4.73460356037816e-06, + "loss": 0.6619116067886353, + "mean_token_accuracy": 0.7821142673492432, + "num_tokens": 6920588.0, + "step": 754 + }, + { + "epoch": 0.5737082066869301, + "grad_norm": 2.054746389389038, + "learning_rate": 4.733663684710835e-06, + "loss": 0.5304250717163086, + "mean_token_accuracy": 0.8265531063079834, + "num_tokens": 6928910.0, + "step": 755 + }, + { + "epoch": 0.574468085106383, + "grad_norm": 2.0050594806671143, + "learning_rate": 4.732722241372354e-06, + "loss": 0.6393026113510132, + "mean_token_accuracy": 0.796819806098938, + "num_tokens": 6940217.0, + "step": 756 + }, + { + "epoch": 0.5752279635258358, + "grad_norm": 1.4285320043563843, + "learning_rate": 4.731779231023456e-06, + "loss": 0.5432837009429932, + "mean_token_accuracy": 0.8104778528213501, + "num_tokens": 6959101.0, + "step": 757 + }, + { + "epoch": 0.5759878419452887, + "grad_norm": 2.3941943645477295, + "learning_rate": 4.730834654325984e-06, + "loss": 0.46550673246383667, + "mean_token_accuracy": 0.8444503545761108, + "num_tokens": 6965036.0, + "step": 758 + }, + { + "epoch": 0.5767477203647416, + "grad_norm": 2.3850574493408203, + "learning_rate": 4.729888511942877e-06, + "loss": 0.4916389584541321, + "mean_token_accuracy": 0.8228527307510376, + "num_tokens": 6971184.0, + "step": 759 + }, + { + "epoch": 0.5775075987841946, + "grad_norm": 1.627480149269104, + "learning_rate": 4.728940804538176e-06, + "loss": 0.5863215923309326, + "mean_token_accuracy": 0.7995302677154541, + "num_tokens": 6982569.0, + "step": 760 + }, + { + "epoch": 0.5782674772036475, + "grad_norm": 1.1723195314407349, + "learning_rate": 4.727991532777016e-06, + "loss": 0.36908864974975586, + "mean_token_accuracy": 0.8355655670166016, + "num_tokens": 6998659.0, + "step": 761 + }, + { + "epoch": 0.5790273556231003, + "grad_norm": 1.5324925184249878, + "learning_rate": 4.727040697325634e-06, + "loss": 0.557658851146698, + "mean_token_accuracy": 0.8141458034515381, + "num_tokens": 7012969.0, + "step": 762 + }, + { + "epoch": 0.5797872340425532, + "grad_norm": 2.4106390476226807, + "learning_rate": 4.726088298851362e-06, + "loss": 0.5004243850708008, + "mean_token_accuracy": 0.8376860618591309, + "num_tokens": 7018301.0, + "step": 763 + }, + { + "epoch": 0.5805471124620061, + "grad_norm": 2.2594921588897705, + "learning_rate": 4.725134338022631e-06, + "loss": 0.6067016124725342, + "mean_token_accuracy": 0.8100241422653198, + "num_tokens": 7025201.0, + "step": 764 + }, + { + "epoch": 0.581306990881459, + "grad_norm": 1.4649826288223267, + "learning_rate": 4.724178815508967e-06, + "loss": 0.36200693249702454, + "mean_token_accuracy": 0.8621826171875, + "num_tokens": 7035112.0, + "step": 765 + }, + { + "epoch": 0.5820668693009119, + "grad_norm": 2.3634560108184814, + "learning_rate": 4.723221731980993e-06, + "loss": 0.41862213611602783, + "mean_token_accuracy": 0.8541463613510132, + "num_tokens": 7040339.0, + "step": 766 + }, + { + "epoch": 0.5828267477203647, + "grad_norm": 2.7798104286193848, + "learning_rate": 4.722263088110426e-06, + "loss": 0.4647108018398285, + "mean_token_accuracy": 0.8505672216415405, + "num_tokens": 7044880.0, + "step": 767 + }, + { + "epoch": 0.5835866261398176, + "grad_norm": 2.070528507232666, + "learning_rate": 4.721302884570079e-06, + "loss": 0.5147565007209778, + "mean_token_accuracy": 0.8113877773284912, + "num_tokens": 7052433.0, + "step": 768 + }, + { + "epoch": 0.5843465045592705, + "grad_norm": 2.1953284740448, + "learning_rate": 4.720341122033862e-06, + "loss": 0.5075466632843018, + "mean_token_accuracy": 0.8474211096763611, + "num_tokens": 7058686.0, + "step": 769 + }, + { + "epoch": 0.5851063829787234, + "grad_norm": 1.9287755489349365, + "learning_rate": 4.719377801176774e-06, + "loss": 0.5382202863693237, + "mean_token_accuracy": 0.8148090243339539, + "num_tokens": 7067538.0, + "step": 770 + }, + { + "epoch": 0.5858662613981763, + "grad_norm": 1.5574456453323364, + "learning_rate": 4.718412922674913e-06, + "loss": 0.43406790494918823, + "mean_token_accuracy": 0.8477081060409546, + "num_tokens": 7077853.0, + "step": 771 + }, + { + "epoch": 0.5866261398176292, + "grad_norm": 1.5490336418151855, + "learning_rate": 4.717446487205466e-06, + "loss": 0.43164271116256714, + "mean_token_accuracy": 0.8504570126533508, + "num_tokens": 7091728.0, + "step": 772 + }, + { + "epoch": 0.587386018237082, + "grad_norm": 1.6945984363555908, + "learning_rate": 4.716478495446717e-06, + "loss": 0.5153743624687195, + "mean_token_accuracy": 0.8213579058647156, + "num_tokens": 7108680.0, + "step": 773 + }, + { + "epoch": 0.5881458966565349, + "grad_norm": 2.2633883953094482, + "learning_rate": 4.715508948078037e-06, + "loss": 0.45254790782928467, + "mean_token_accuracy": 0.8392219543457031, + "num_tokens": 7115546.0, + "step": 774 + }, + { + "epoch": 0.5889057750759878, + "grad_norm": 1.5731090307235718, + "learning_rate": 4.714537845779894e-06, + "loss": 0.38678881525993347, + "mean_token_accuracy": 0.8800252676010132, + "num_tokens": 7126360.0, + "step": 775 + }, + { + "epoch": 0.5896656534954408, + "grad_norm": 2.4873392581939697, + "learning_rate": 4.7135651892338445e-06, + "loss": 0.5190927386283875, + "mean_token_accuracy": 0.8145407438278198, + "num_tokens": 7135705.0, + "step": 776 + }, + { + "epoch": 0.5904255319148937, + "grad_norm": 1.2931004762649536, + "learning_rate": 4.712590979122534e-06, + "loss": 0.3686544895172119, + "mean_token_accuracy": 0.8720537424087524, + "num_tokens": 7150688.0, + "step": 777 + }, + { + "epoch": 0.5911854103343465, + "grad_norm": 1.6353671550750732, + "learning_rate": 4.7116152161297045e-06, + "loss": 0.49065062403678894, + "mean_token_accuracy": 0.8203760385513306, + "num_tokens": 7161040.0, + "step": 778 + }, + { + "epoch": 0.5919452887537994, + "grad_norm": 1.2345483303070068, + "learning_rate": 4.710637900940181e-06, + "loss": 0.4004976451396942, + "mean_token_accuracy": 0.8302007913589478, + "num_tokens": 7178074.0, + "step": 779 + }, + { + "epoch": 0.5927051671732523, + "grad_norm": 2.2506837844848633, + "learning_rate": 4.7096590342398825e-06, + "loss": 0.45142874121665955, + "mean_token_accuracy": 0.8481036424636841, + "num_tokens": 7184153.0, + "step": 780 + }, + { + "epoch": 0.5934650455927052, + "grad_norm": 1.420479416847229, + "learning_rate": 4.708678616715815e-06, + "loss": 0.4802100360393524, + "mean_token_accuracy": 0.8586992025375366, + "num_tokens": 7202810.0, + "step": 781 + }, + { + "epoch": 0.5942249240121581, + "grad_norm": 3.457632303237915, + "learning_rate": 4.707696649056073e-06, + "loss": 0.5265094041824341, + "mean_token_accuracy": 0.8260114192962646, + "num_tokens": 7206396.0, + "step": 782 + }, + { + "epoch": 0.5949848024316109, + "grad_norm": 1.1592093706130981, + "learning_rate": 4.706713131949839e-06, + "loss": 0.3708173632621765, + "mean_token_accuracy": 0.8476542234420776, + "num_tokens": 7225034.0, + "step": 783 + }, + { + "epoch": 0.5957446808510638, + "grad_norm": 1.6761400699615479, + "learning_rate": 4.705728066087384e-06, + "loss": 0.4137252867221832, + "mean_token_accuracy": 0.8462049961090088, + "num_tokens": 7237101.0, + "step": 784 + }, + { + "epoch": 0.5965045592705167, + "grad_norm": 2.320185422897339, + "learning_rate": 4.704741452160064e-06, + "loss": 0.5157154202461243, + "mean_token_accuracy": 0.8391785621643066, + "num_tokens": 7243826.0, + "step": 785 + }, + { + "epoch": 0.5972644376899696, + "grad_norm": 2.079423427581787, + "learning_rate": 4.703753290860323e-06, + "loss": 0.4734993278980255, + "mean_token_accuracy": 0.8353281021118164, + "num_tokens": 7250175.0, + "step": 786 + }, + { + "epoch": 0.5980243161094225, + "grad_norm": 1.8215159177780151, + "learning_rate": 4.702763582881692e-06, + "loss": 0.520193338394165, + "mean_token_accuracy": 0.844062864780426, + "num_tokens": 7258868.0, + "step": 787 + }, + { + "epoch": 0.5987841945288754, + "grad_norm": 1.3823071718215942, + "learning_rate": 4.701772328918784e-06, + "loss": 0.4177844822406769, + "mean_token_accuracy": 0.8363165259361267, + "num_tokens": 7271744.0, + "step": 788 + }, + { + "epoch": 0.5995440729483282, + "grad_norm": 2.4749298095703125, + "learning_rate": 4.700779529667301e-06, + "loss": 0.5115069150924683, + "mean_token_accuracy": 0.8473520278930664, + "num_tokens": 7277040.0, + "step": 789 + }, + { + "epoch": 0.6003039513677811, + "grad_norm": 1.7072296142578125, + "learning_rate": 4.699785185824026e-06, + "loss": 0.5265800952911377, + "mean_token_accuracy": 0.8161447048187256, + "num_tokens": 7288288.0, + "step": 790 + }, + { + "epoch": 0.601063829787234, + "grad_norm": 1.6479384899139404, + "learning_rate": 4.69878929808683e-06, + "loss": 0.4445168972015381, + "mean_token_accuracy": 0.8381255865097046, + "num_tokens": 7298640.0, + "step": 791 + }, + { + "epoch": 0.601823708206687, + "grad_norm": 1.9095896482467651, + "learning_rate": 4.6977918671546635e-06, + "loss": 0.5841238498687744, + "mean_token_accuracy": 0.7971454858779907, + "num_tokens": 7307220.0, + "step": 792 + }, + { + "epoch": 0.6025835866261399, + "grad_norm": 1.9614146947860718, + "learning_rate": 4.696792893727562e-06, + "loss": 0.34684082865715027, + "mean_token_accuracy": 0.8739526271820068, + "num_tokens": 7313875.0, + "step": 793 + }, + { + "epoch": 0.6033434650455927, + "grad_norm": 2.015570640563965, + "learning_rate": 4.695792378506645e-06, + "loss": 0.42779117822647095, + "mean_token_accuracy": 0.8625012636184692, + "num_tokens": 7321439.0, + "step": 794 + }, + { + "epoch": 0.6041033434650456, + "grad_norm": 2.8581228256225586, + "learning_rate": 4.694790322194111e-06, + "loss": 0.6519991159439087, + "mean_token_accuracy": 0.7629562616348267, + "num_tokens": 7326916.0, + "step": 795 + }, + { + "epoch": 0.6048632218844985, + "grad_norm": 2.482715368270874, + "learning_rate": 4.693786725493242e-06, + "loss": 0.532963216304779, + "mean_token_accuracy": 0.832184910774231, + "num_tokens": 7333311.0, + "step": 796 + }, + { + "epoch": 0.6056231003039514, + "grad_norm": 1.6076741218566895, + "learning_rate": 4.692781589108402e-06, + "loss": 0.43381205201148987, + "mean_token_accuracy": 0.8402494192123413, + "num_tokens": 7343731.0, + "step": 797 + }, + { + "epoch": 0.6063829787234043, + "grad_norm": 2.2133216857910156, + "learning_rate": 4.691774913745033e-06, + "loss": 0.4380851089954376, + "mean_token_accuracy": 0.8600908517837524, + "num_tokens": 7350224.0, + "step": 798 + }, + { + "epoch": 0.6071428571428571, + "grad_norm": 2.046280860900879, + "learning_rate": 4.690766700109659e-06, + "loss": 0.3821919560432434, + "mean_token_accuracy": 0.8691814541816711, + "num_tokens": 7356717.0, + "step": 799 + }, + { + "epoch": 0.60790273556231, + "grad_norm": 1.8482693433761597, + "learning_rate": 4.689756948909884e-06, + "loss": 0.5217651128768921, + "mean_token_accuracy": 0.803473711013794, + "num_tokens": 7365806.0, + "step": 800 + }, + { + "epoch": 0.6086626139817629, + "grad_norm": 2.192134141921997, + "learning_rate": 4.688745660854388e-06, + "loss": 0.573980987071991, + "mean_token_accuracy": 0.8198676109313965, + "num_tokens": 7380281.0, + "step": 801 + }, + { + "epoch": 0.6094224924012158, + "grad_norm": 2.363626718521118, + "learning_rate": 4.687732836652935e-06, + "loss": 0.5204599499702454, + "mean_token_accuracy": 0.8373252153396606, + "num_tokens": 7386938.0, + "step": 802 + }, + { + "epoch": 0.6101823708206687, + "grad_norm": 1.9320523738861084, + "learning_rate": 4.686718477016361e-06, + "loss": 0.47316622734069824, + "mean_token_accuracy": 0.830596923828125, + "num_tokens": 7395069.0, + "step": 803 + }, + { + "epoch": 0.6109422492401215, + "grad_norm": 2.6573057174682617, + "learning_rate": 4.6857025826565845e-06, + "loss": 0.5495861768722534, + "mean_token_accuracy": 0.8187421560287476, + "num_tokens": 7400563.0, + "step": 804 + }, + { + "epoch": 0.6117021276595744, + "grad_norm": 2.0893123149871826, + "learning_rate": 4.684685154286599e-06, + "loss": 0.5362675786018372, + "mean_token_accuracy": 0.8394701480865479, + "num_tokens": 7406973.0, + "step": 805 + }, + { + "epoch": 0.6124620060790273, + "grad_norm": 2.455130100250244, + "learning_rate": 4.683666192620474e-06, + "loss": 0.5405995845794678, + "mean_token_accuracy": 0.8079100847244263, + "num_tokens": 7412931.0, + "step": 806 + }, + { + "epoch": 0.6132218844984803, + "grad_norm": 2.311915636062622, + "learning_rate": 4.682645698373357e-06, + "loss": 0.5395106077194214, + "mean_token_accuracy": 0.8156260251998901, + "num_tokens": 7419699.0, + "step": 807 + }, + { + "epoch": 0.6139817629179332, + "grad_norm": 1.686838984489441, + "learning_rate": 4.6816236722614694e-06, + "loss": 0.6034521460533142, + "mean_token_accuracy": 0.7855954170227051, + "num_tokens": 7431899.0, + "step": 808 + }, + { + "epoch": 0.6147416413373861, + "grad_norm": 1.682759165763855, + "learning_rate": 4.680600115002109e-06, + "loss": 0.48593831062316895, + "mean_token_accuracy": 0.8229435682296753, + "num_tokens": 7443187.0, + "step": 809 + }, + { + "epoch": 0.6155015197568389, + "grad_norm": 2.064589738845825, + "learning_rate": 4.679575027313649e-06, + "loss": 0.5098468661308289, + "mean_token_accuracy": 0.8234638571739197, + "num_tokens": 7450868.0, + "step": 810 + }, + { + "epoch": 0.6162613981762918, + "grad_norm": 2.2063486576080322, + "learning_rate": 4.6785484099155324e-06, + "loss": 0.5138497352600098, + "mean_token_accuracy": 0.8152111172676086, + "num_tokens": 7457176.0, + "step": 811 + }, + { + "epoch": 0.6170212765957447, + "grad_norm": 1.6258726119995117, + "learning_rate": 4.67752026352828e-06, + "loss": 0.4064181447029114, + "mean_token_accuracy": 0.8720619678497314, + "num_tokens": 7466557.0, + "step": 812 + }, + { + "epoch": 0.6177811550151976, + "grad_norm": 2.3309383392333984, + "learning_rate": 4.676490588873486e-06, + "loss": 0.5180112719535828, + "mean_token_accuracy": 0.8233879804611206, + "num_tokens": 7472650.0, + "step": 813 + }, + { + "epoch": 0.6185410334346505, + "grad_norm": 1.4545246362686157, + "learning_rate": 4.675459386673815e-06, + "loss": 0.37917959690093994, + "mean_token_accuracy": 0.8598103523254395, + "num_tokens": 7485171.0, + "step": 814 + }, + { + "epoch": 0.6193009118541033, + "grad_norm": 2.654231071472168, + "learning_rate": 4.674426657653003e-06, + "loss": 0.554074227809906, + "mean_token_accuracy": 0.8026446104049683, + "num_tokens": 7490787.0, + "step": 815 + }, + { + "epoch": 0.6200607902735562, + "grad_norm": 1.5543994903564453, + "learning_rate": 4.67339240253586e-06, + "loss": 0.6335440278053284, + "mean_token_accuracy": 0.783241868019104, + "num_tokens": 7505975.0, + "step": 816 + }, + { + "epoch": 0.6208206686930091, + "grad_norm": 2.079998016357422, + "learning_rate": 4.672356622048266e-06, + "loss": 0.5169394016265869, + "mean_token_accuracy": 0.8088761568069458, + "num_tokens": 7513470.0, + "step": 817 + }, + { + "epoch": 0.621580547112462, + "grad_norm": 1.5971896648406982, + "learning_rate": 4.671319316917172e-06, + "loss": 0.44588586688041687, + "mean_token_accuracy": 0.8518649339675903, + "num_tokens": 7524352.0, + "step": 818 + }, + { + "epoch": 0.6223404255319149, + "grad_norm": 2.477579116821289, + "learning_rate": 4.670280487870599e-06, + "loss": 0.5713893175125122, + "mean_token_accuracy": 0.8116940259933472, + "num_tokens": 7530359.0, + "step": 819 + }, + { + "epoch": 0.6231003039513677, + "grad_norm": 2.066211700439453, + "learning_rate": 4.669240135637635e-06, + "loss": 0.5295331478118896, + "mean_token_accuracy": 0.819536566734314, + "num_tokens": 7536963.0, + "step": 820 + }, + { + "epoch": 0.6238601823708206, + "grad_norm": 2.1217997074127197, + "learning_rate": 4.668198260948442e-06, + "loss": 0.6146406531333923, + "mean_token_accuracy": 0.7932635545730591, + "num_tokens": 7545800.0, + "step": 821 + }, + { + "epoch": 0.6246200607902735, + "grad_norm": 2.0173542499542236, + "learning_rate": 4.667154864534245e-06, + "loss": 0.6240535974502563, + "mean_token_accuracy": 0.7883644104003906, + "num_tokens": 7556165.0, + "step": 822 + }, + { + "epoch": 0.6253799392097265, + "grad_norm": 2.014526128768921, + "learning_rate": 4.666109947127343e-06, + "loss": 0.40367332100868225, + "mean_token_accuracy": 0.8653522729873657, + "num_tokens": 7562665.0, + "step": 823 + }, + { + "epoch": 0.6261398176291794, + "grad_norm": 2.5078861713409424, + "learning_rate": 4.665063509461098e-06, + "loss": 0.5903617739677429, + "mean_token_accuracy": 0.7902897596359253, + "num_tokens": 7568922.0, + "step": 824 + }, + { + "epoch": 0.6268996960486323, + "grad_norm": 2.454622745513916, + "learning_rate": 4.664015552269938e-06, + "loss": 0.5238361358642578, + "mean_token_accuracy": 0.838546872138977, + "num_tokens": 7575965.0, + "step": 825 + }, + { + "epoch": 0.6276595744680851, + "grad_norm": 2.920919418334961, + "learning_rate": 4.662966076289363e-06, + "loss": 0.5028782486915588, + "mean_token_accuracy": 0.8311152458190918, + "num_tokens": 7580193.0, + "step": 826 + }, + { + "epoch": 0.628419452887538, + "grad_norm": 1.545382022857666, + "learning_rate": 4.661915082255932e-06, + "loss": 0.4817378520965576, + "mean_token_accuracy": 0.8373227119445801, + "num_tokens": 7593024.0, + "step": 827 + }, + { + "epoch": 0.6291793313069909, + "grad_norm": 1.5152469873428345, + "learning_rate": 4.6608625709072766e-06, + "loss": 0.4693033695220947, + "mean_token_accuracy": 0.8150848150253296, + "num_tokens": 7606459.0, + "step": 828 + }, + { + "epoch": 0.6299392097264438, + "grad_norm": 2.1310224533081055, + "learning_rate": 4.659808542982089e-06, + "loss": 0.4653395414352417, + "mean_token_accuracy": 0.8286294341087341, + "num_tokens": 7613036.0, + "step": 829 + }, + { + "epoch": 0.6306990881458967, + "grad_norm": 2.1949679851531982, + "learning_rate": 4.658752999220125e-06, + "loss": 0.3698633909225464, + "mean_token_accuracy": 0.871590793132782, + "num_tokens": 7618527.0, + "step": 830 + }, + { + "epoch": 0.6314589665653495, + "grad_norm": 2.2770416736602783, + "learning_rate": 4.657695940362207e-06, + "loss": 0.5202419757843018, + "mean_token_accuracy": 0.817577600479126, + "num_tokens": 7624459.0, + "step": 831 + }, + { + "epoch": 0.6322188449848024, + "grad_norm": 1.402042269706726, + "learning_rate": 4.65663736715022e-06, + "loss": 0.51531583070755, + "mean_token_accuracy": 0.8228116631507874, + "num_tokens": 7639371.0, + "step": 832 + }, + { + "epoch": 0.6329787234042553, + "grad_norm": 3.3554883003234863, + "learning_rate": 4.65557728032711e-06, + "loss": 0.6771188378334045, + "mean_token_accuracy": 0.7880028486251831, + "num_tokens": 7643924.0, + "step": 833 + }, + { + "epoch": 0.6337386018237082, + "grad_norm": 2.081040143966675, + "learning_rate": 4.654515680636888e-06, + "loss": 0.5712796449661255, + "mean_token_accuracy": 0.8177868127822876, + "num_tokens": 7651881.0, + "step": 834 + }, + { + "epoch": 0.6344984802431611, + "grad_norm": 0.9128716588020325, + "learning_rate": 4.653452568824625e-06, + "loss": 0.3423936069011688, + "mean_token_accuracy": 0.8782886266708374, + "num_tokens": 7677829.0, + "step": 835 + }, + { + "epoch": 0.6352583586626139, + "grad_norm": 3.49015736579895, + "learning_rate": 4.652387945636454e-06, + "loss": 0.34657734632492065, + "mean_token_accuracy": 0.8770567178726196, + "num_tokens": 7680796.0, + "step": 836 + }, + { + "epoch": 0.6360182370820668, + "grad_norm": 2.026247501373291, + "learning_rate": 4.651321811819568e-06, + "loss": 0.5098431706428528, + "mean_token_accuracy": 0.8216961622238159, + "num_tokens": 7688746.0, + "step": 837 + }, + { + "epoch": 0.6367781155015197, + "grad_norm": 2.444343090057373, + "learning_rate": 4.650254168122222e-06, + "loss": 0.5490090250968933, + "mean_token_accuracy": 0.8092857599258423, + "num_tokens": 7695220.0, + "step": 838 + }, + { + "epoch": 0.6375379939209727, + "grad_norm": 2.0171122550964355, + "learning_rate": 4.649185015293728e-06, + "loss": 0.47221142053604126, + "mean_token_accuracy": 0.8514408469200134, + "num_tokens": 7702759.0, + "step": 839 + }, + { + "epoch": 0.6382978723404256, + "grad_norm": 1.9800984859466553, + "learning_rate": 4.64811435408446e-06, + "loss": 0.5238803625106812, + "mean_token_accuracy": 0.8479194641113281, + "num_tokens": 7714017.0, + "step": 840 + }, + { + "epoch": 0.6390577507598785, + "grad_norm": 3.0674357414245605, + "learning_rate": 4.647042185245848e-06, + "loss": 0.4668245315551758, + "mean_token_accuracy": 0.8381714820861816, + "num_tokens": 7717801.0, + "step": 841 + }, + { + "epoch": 0.6398176291793313, + "grad_norm": 1.5672820806503296, + "learning_rate": 4.645968509530381e-06, + "loss": 0.4428741931915283, + "mean_token_accuracy": 0.8416479825973511, + "num_tokens": 7728342.0, + "step": 842 + }, + { + "epoch": 0.6405775075987842, + "grad_norm": 2.3042354583740234, + "learning_rate": 4.644893327691608e-06, + "loss": 0.49937760829925537, + "mean_token_accuracy": 0.827070951461792, + "num_tokens": 7734576.0, + "step": 843 + }, + { + "epoch": 0.6413373860182371, + "grad_norm": 2.057772159576416, + "learning_rate": 4.6438166404841316e-06, + "loss": 0.5912986993789673, + "mean_token_accuracy": 0.805509090423584, + "num_tokens": 7742481.0, + "step": 844 + }, + { + "epoch": 0.64209726443769, + "grad_norm": 1.9688186645507812, + "learning_rate": 4.6427384486636115e-06, + "loss": 0.482401967048645, + "mean_token_accuracy": 0.8358086347579956, + "num_tokens": 7750002.0, + "step": 845 + }, + { + "epoch": 0.6428571428571429, + "grad_norm": 2.6852948665618896, + "learning_rate": 4.6416587529867665e-06, + "loss": 0.5479315519332886, + "mean_token_accuracy": 0.8091106414794922, + "num_tokens": 7755578.0, + "step": 846 + }, + { + "epoch": 0.6436170212765957, + "grad_norm": 2.0547337532043457, + "learning_rate": 4.640577554211366e-06, + "loss": 0.5327274203300476, + "mean_token_accuracy": 0.8280376195907593, + "num_tokens": 7763513.0, + "step": 847 + }, + { + "epoch": 0.6443768996960486, + "grad_norm": 2.0328633785247803, + "learning_rate": 4.63949485309624e-06, + "loss": 0.4814409613609314, + "mean_token_accuracy": 0.8527672290802002, + "num_tokens": 7771131.0, + "step": 848 + }, + { + "epoch": 0.6451367781155015, + "grad_norm": 1.5892863273620605, + "learning_rate": 4.638410650401267e-06, + "loss": 0.4492785334587097, + "mean_token_accuracy": 0.846997857093811, + "num_tokens": 7781572.0, + "step": 849 + }, + { + "epoch": 0.6458966565349544, + "grad_norm": 1.8295910358428955, + "learning_rate": 4.637324946887384e-06, + "loss": 0.37088239192962646, + "mean_token_accuracy": 0.8616628646850586, + "num_tokens": 7788604.0, + "step": 850 + }, + { + "epoch": 0.6466565349544073, + "grad_norm": 3.380040168762207, + "learning_rate": 4.636237743316578e-06, + "loss": 0.4737280607223511, + "mean_token_accuracy": 0.855940580368042, + "num_tokens": 7792504.0, + "step": 851 + }, + { + "epoch": 0.6474164133738601, + "grad_norm": 2.8790009021759033, + "learning_rate": 4.635149040451891e-06, + "loss": 0.39790448546409607, + "mean_token_accuracy": 0.8710698485374451, + "num_tokens": 7796333.0, + "step": 852 + }, + { + "epoch": 0.648176291793313, + "grad_norm": 1.914914608001709, + "learning_rate": 4.634058839057417e-06, + "loss": 0.2954312562942505, + "mean_token_accuracy": 0.8880234956741333, + "num_tokens": 7802456.0, + "step": 853 + }, + { + "epoch": 0.648936170212766, + "grad_norm": 1.3709120750427246, + "learning_rate": 4.632967139898301e-06, + "loss": 0.43224576115608215, + "mean_token_accuracy": 0.8446190357208252, + "num_tokens": 7816770.0, + "step": 854 + }, + { + "epoch": 0.6496960486322189, + "grad_norm": 1.6579312086105347, + "learning_rate": 4.63187394374074e-06, + "loss": 0.3535553514957428, + "mean_token_accuracy": 0.8738704919815063, + "num_tokens": 7824963.0, + "step": 855 + }, + { + "epoch": 0.6504559270516718, + "grad_norm": 2.4055678844451904, + "learning_rate": 4.63077925135198e-06, + "loss": 0.5078744292259216, + "mean_token_accuracy": 0.8430874347686768, + "num_tokens": 7830962.0, + "step": 856 + }, + { + "epoch": 0.6512158054711246, + "grad_norm": 2.5171499252319336, + "learning_rate": 4.629683063500319e-06, + "loss": 0.5172419548034668, + "mean_token_accuracy": 0.8087141513824463, + "num_tokens": 7836638.0, + "step": 857 + }, + { + "epoch": 0.6519756838905775, + "grad_norm": 1.7588486671447754, + "learning_rate": 4.628585380955104e-06, + "loss": 0.5759496092796326, + "mean_token_accuracy": 0.8043236136436462, + "num_tokens": 7844654.0, + "step": 858 + }, + { + "epoch": 0.6527355623100304, + "grad_norm": 1.5887070894241333, + "learning_rate": 4.62748620448673e-06, + "loss": 0.41849038004875183, + "mean_token_accuracy": 0.8556643724441528, + "num_tokens": 7855642.0, + "step": 859 + }, + { + "epoch": 0.6534954407294833, + "grad_norm": 3.227942705154419, + "learning_rate": 4.626385534866642e-06, + "loss": 0.5279449224472046, + "mean_token_accuracy": 0.8250958323478699, + "num_tokens": 7859890.0, + "step": 860 + }, + { + "epoch": 0.6542553191489362, + "grad_norm": 2.440467119216919, + "learning_rate": 4.625283372867333e-06, + "loss": 0.5294933319091797, + "mean_token_accuracy": 0.8235013484954834, + "num_tokens": 7866766.0, + "step": 861 + }, + { + "epoch": 0.6550151975683891, + "grad_norm": 2.4106903076171875, + "learning_rate": 4.624179719262342e-06, + "loss": 0.5662813186645508, + "mean_token_accuracy": 0.8061668872833252, + "num_tokens": 7872809.0, + "step": 862 + }, + { + "epoch": 0.6557750759878419, + "grad_norm": 3.5151145458221436, + "learning_rate": 4.623074574826254e-06, + "loss": 0.5471097230911255, + "mean_token_accuracy": 0.8220691084861755, + "num_tokens": 7876136.0, + "step": 863 + }, + { + "epoch": 0.6565349544072948, + "grad_norm": 1.5319840908050537, + "learning_rate": 4.621967940334705e-06, + "loss": 0.4178982377052307, + "mean_token_accuracy": 0.8517135977745056, + "num_tokens": 7886113.0, + "step": 864 + }, + { + "epoch": 0.6572948328267477, + "grad_norm": 1.63701331615448, + "learning_rate": 4.620859816564371e-06, + "loss": 0.4666512608528137, + "mean_token_accuracy": 0.8223508596420288, + "num_tokens": 7897982.0, + "step": 865 + }, + { + "epoch": 0.6580547112462006, + "grad_norm": 2.1515414714813232, + "learning_rate": 4.619750204292978e-06, + "loss": 0.5359305143356323, + "mean_token_accuracy": 0.8192868232727051, + "num_tokens": 7904947.0, + "step": 866 + }, + { + "epoch": 0.6588145896656535, + "grad_norm": 2.2140955924987793, + "learning_rate": 4.618639104299294e-06, + "loss": 0.5275633931159973, + "mean_token_accuracy": 0.8120715618133545, + "num_tokens": 7913913.0, + "step": 867 + }, + { + "epoch": 0.6595744680851063, + "grad_norm": 1.3956893682479858, + "learning_rate": 4.6175265173631304e-06, + "loss": 0.4378768503665924, + "mean_token_accuracy": 0.8479125499725342, + "num_tokens": 7927979.0, + "step": 868 + }, + { + "epoch": 0.6603343465045592, + "grad_norm": 2.98103928565979, + "learning_rate": 4.616412444265344e-06, + "loss": 0.42614591121673584, + "mean_token_accuracy": 0.8595094680786133, + "num_tokens": 7934293.0, + "step": 869 + }, + { + "epoch": 0.6610942249240122, + "grad_norm": 2.554845094680786, + "learning_rate": 4.6152968857878365e-06, + "loss": 0.3698030412197113, + "mean_token_accuracy": 0.8717041015625, + "num_tokens": 7938547.0, + "step": 870 + }, + { + "epoch": 0.6618541033434651, + "grad_norm": 3.0901825428009033, + "learning_rate": 4.6141798427135475e-06, + "loss": 0.5037497282028198, + "mean_token_accuracy": 0.8354041576385498, + "num_tokens": 7942829.0, + "step": 871 + }, + { + "epoch": 0.662613981762918, + "grad_norm": 2.8692073822021484, + "learning_rate": 4.6130613158264605e-06, + "loss": 0.5418164134025574, + "mean_token_accuracy": 0.8298909664154053, + "num_tokens": 7949303.0, + "step": 872 + }, + { + "epoch": 0.6633738601823708, + "grad_norm": 3.960404396057129, + "learning_rate": 4.611941305911602e-06, + "loss": 0.6284480094909668, + "mean_token_accuracy": 0.837495744228363, + "num_tokens": 7952486.0, + "step": 873 + }, + { + "epoch": 0.6641337386018237, + "grad_norm": 2.6690115928649902, + "learning_rate": 4.610819813755038e-06, + "loss": 0.5214360952377319, + "mean_token_accuracy": 0.8213508129119873, + "num_tokens": 7957559.0, + "step": 874 + }, + { + "epoch": 0.6648936170212766, + "grad_norm": 2.3376171588897705, + "learning_rate": 4.609696840143875e-06, + "loss": 0.46887528896331787, + "mean_token_accuracy": 0.8438819646835327, + "num_tokens": 7962826.0, + "step": 875 + }, + { + "epoch": 0.6656534954407295, + "grad_norm": 2.2222683429718018, + "learning_rate": 4.6085723858662575e-06, + "loss": 0.5607719421386719, + "mean_token_accuracy": 0.8128405809402466, + "num_tokens": 7970131.0, + "step": 876 + }, + { + "epoch": 0.6664133738601824, + "grad_norm": 2.069091558456421, + "learning_rate": 4.607446451711372e-06, + "loss": 0.506301760673523, + "mean_token_accuracy": 0.8256827592849731, + "num_tokens": 7977524.0, + "step": 877 + }, + { + "epoch": 0.6671732522796353, + "grad_norm": 1.3724967241287231, + "learning_rate": 4.606319038469443e-06, + "loss": 0.43285101652145386, + "mean_token_accuracy": 0.8525032997131348, + "num_tokens": 7989174.0, + "step": 878 + }, + { + "epoch": 0.6679331306990881, + "grad_norm": 2.278205156326294, + "learning_rate": 4.605190146931731e-06, + "loss": 0.4845905303955078, + "mean_token_accuracy": 0.8284652829170227, + "num_tokens": 7998524.0, + "step": 879 + }, + { + "epoch": 0.668693009118541, + "grad_norm": 1.3871766328811646, + "learning_rate": 4.604059777890537e-06, + "loss": 0.5736679434776306, + "mean_token_accuracy": 0.8223285675048828, + "num_tokens": 8015776.0, + "step": 880 + }, + { + "epoch": 0.6694528875379939, + "grad_norm": 1.926164984703064, + "learning_rate": 4.602927932139197e-06, + "loss": 0.4133230447769165, + "mean_token_accuracy": 0.8653768301010132, + "num_tokens": 8022979.0, + "step": 881 + }, + { + "epoch": 0.6702127659574468, + "grad_norm": 2.109272003173828, + "learning_rate": 4.601794610472083e-06, + "loss": 0.7005600929260254, + "mean_token_accuracy": 0.7777010202407837, + "num_tokens": 8032618.0, + "step": 882 + }, + { + "epoch": 0.6709726443768997, + "grad_norm": 2.077977418899536, + "learning_rate": 4.6006598136846056e-06, + "loss": 0.5278208255767822, + "mean_token_accuracy": 0.8230358958244324, + "num_tokens": 8040534.0, + "step": 883 + }, + { + "epoch": 0.6717325227963525, + "grad_norm": 1.678581714630127, + "learning_rate": 4.599523542573207e-06, + "loss": 0.4955351650714874, + "mean_token_accuracy": 0.8270003795623779, + "num_tokens": 8052249.0, + "step": 884 + }, + { + "epoch": 0.6724924012158054, + "grad_norm": 2.0751662254333496, + "learning_rate": 4.598385797935368e-06, + "loss": 0.5266247987747192, + "mean_token_accuracy": 0.8263581991195679, + "num_tokens": 8060600.0, + "step": 885 + }, + { + "epoch": 0.6732522796352584, + "grad_norm": 2.418405771255493, + "learning_rate": 4.5972465805696e-06, + "loss": 0.4481425881385803, + "mean_token_accuracy": 0.846164345741272, + "num_tokens": 8066025.0, + "step": 886 + }, + { + "epoch": 0.6740121580547113, + "grad_norm": 2.3936474323272705, + "learning_rate": 4.596105891275449e-06, + "loss": 0.4553404450416565, + "mean_token_accuracy": 0.8412896394729614, + "num_tokens": 8071544.0, + "step": 887 + }, + { + "epoch": 0.6747720364741642, + "grad_norm": 2.2024407386779785, + "learning_rate": 4.594963730853497e-06, + "loss": 0.6218541860580444, + "mean_token_accuracy": 0.7890232801437378, + "num_tokens": 8079061.0, + "step": 888 + }, + { + "epoch": 0.675531914893617, + "grad_norm": 2.51015567779541, + "learning_rate": 4.593820100105355e-06, + "loss": 0.5149124264717102, + "mean_token_accuracy": 0.8241918087005615, + "num_tokens": 8084293.0, + "step": 889 + }, + { + "epoch": 0.6762917933130699, + "grad_norm": 1.8748939037322998, + "learning_rate": 4.5926749998336665e-06, + "loss": 0.50836181640625, + "mean_token_accuracy": 0.8067223429679871, + "num_tokens": 8092511.0, + "step": 890 + }, + { + "epoch": 0.6770516717325228, + "grad_norm": 1.801193118095398, + "learning_rate": 4.5915284308421075e-06, + "loss": 0.4372861683368683, + "mean_token_accuracy": 0.8510604500770569, + "num_tokens": 8101174.0, + "step": 891 + }, + { + "epoch": 0.6778115501519757, + "grad_norm": 2.6476457118988037, + "learning_rate": 4.590380393935383e-06, + "loss": 0.38700711727142334, + "mean_token_accuracy": 0.8659796714782715, + "num_tokens": 8105398.0, + "step": 892 + }, + { + "epoch": 0.6785714285714286, + "grad_norm": 1.1147183179855347, + "learning_rate": 4.589230889919232e-06, + "loss": 0.38546115159988403, + "mean_token_accuracy": 0.8570581674575806, + "num_tokens": 8127394.0, + "step": 893 + }, + { + "epoch": 0.6793313069908815, + "grad_norm": 2.908905506134033, + "learning_rate": 4.588079919600419e-06, + "loss": 0.5108504295349121, + "mean_token_accuracy": 0.8121406435966492, + "num_tokens": 8131801.0, + "step": 894 + }, + { + "epoch": 0.6800911854103343, + "grad_norm": 3.1522326469421387, + "learning_rate": 4.586927483786739e-06, + "loss": 0.44059112668037415, + "mean_token_accuracy": 0.8448011875152588, + "num_tokens": 8154416.0, + "step": 895 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 1.5142440795898438, + "learning_rate": 4.585773583287017e-06, + "loss": 0.513217568397522, + "mean_token_accuracy": 0.8386049270629883, + "num_tokens": 8171156.0, + "step": 896 + }, + { + "epoch": 0.6816109422492401, + "grad_norm": 2.597881317138672, + "learning_rate": 4.584618218911104e-06, + "loss": 0.4937712550163269, + "mean_token_accuracy": 0.8223681449890137, + "num_tokens": 8176124.0, + "step": 897 + }, + { + "epoch": 0.682370820668693, + "grad_norm": 1.8185619115829468, + "learning_rate": 4.583461391469879e-06, + "loss": 0.519811749458313, + "mean_token_accuracy": 0.8169777393341064, + "num_tokens": 8185136.0, + "step": 898 + }, + { + "epoch": 0.6831306990881459, + "grad_norm": 3.2061994075775146, + "learning_rate": 4.582303101775249e-06, + "loss": 0.4655115008354187, + "mean_token_accuracy": 0.8425977230072021, + "num_tokens": 8188864.0, + "step": 899 + }, + { + "epoch": 0.6838905775075987, + "grad_norm": 1.3485229015350342, + "learning_rate": 4.581143350640146e-06, + "loss": 0.5014470815658569, + "mean_token_accuracy": 0.8273109197616577, + "num_tokens": 8203460.0, + "step": 900 + }, + { + "epoch": 0.6846504559270516, + "grad_norm": 1.3264713287353516, + "learning_rate": 4.579982138878527e-06, + "loss": 0.5073703527450562, + "mean_token_accuracy": 0.8259357213973999, + "num_tokens": 8219348.0, + "step": 901 + }, + { + "epoch": 0.6854103343465046, + "grad_norm": 2.4436347484588623, + "learning_rate": 4.578819467305375e-06, + "loss": 0.47020310163497925, + "mean_token_accuracy": 0.8567265272140503, + "num_tokens": 8224427.0, + "step": 902 + }, + { + "epoch": 0.6861702127659575, + "grad_norm": 1.921749234199524, + "learning_rate": 4.5776553367367e-06, + "loss": 0.622514009475708, + "mean_token_accuracy": 0.7863982319831848, + "num_tokens": 8233151.0, + "step": 903 + }, + { + "epoch": 0.6869300911854104, + "grad_norm": 1.8815616369247437, + "learning_rate": 4.576489747989532e-06, + "loss": 0.4910545349121094, + "mean_token_accuracy": 0.8147122859954834, + "num_tokens": 8240762.0, + "step": 904 + }, + { + "epoch": 0.6876899696048632, + "grad_norm": 1.2366989850997925, + "learning_rate": 4.575322701881926e-06, + "loss": 0.3947566747665405, + "mean_token_accuracy": 0.873993992805481, + "num_tokens": 8259381.0, + "step": 905 + }, + { + "epoch": 0.6884498480243161, + "grad_norm": 1.5767735242843628, + "learning_rate": 4.57415419923296e-06, + "loss": 0.57136070728302, + "mean_token_accuracy": 0.8028088808059692, + "num_tokens": 8273296.0, + "step": 906 + }, + { + "epoch": 0.689209726443769, + "grad_norm": 2.378675699234009, + "learning_rate": 4.572984240862733e-06, + "loss": 0.5894849896430969, + "mean_token_accuracy": 0.7977708578109741, + "num_tokens": 8280083.0, + "step": 907 + }, + { + "epoch": 0.6899696048632219, + "grad_norm": 2.0401132106781006, + "learning_rate": 4.57181282759237e-06, + "loss": 0.5524613261222839, + "mean_token_accuracy": 0.8138598203659058, + "num_tokens": 8288236.0, + "step": 908 + }, + { + "epoch": 0.6907294832826748, + "grad_norm": 2.293701648712158, + "learning_rate": 4.570639960244011e-06, + "loss": 0.5154546499252319, + "mean_token_accuracy": 0.8234660625457764, + "num_tokens": 8294493.0, + "step": 909 + }, + { + "epoch": 0.6914893617021277, + "grad_norm": 1.9286527633666992, + "learning_rate": 4.56946563964082e-06, + "loss": 0.5364264845848083, + "mean_token_accuracy": 0.8147368431091309, + "num_tokens": 8303441.0, + "step": 910 + }, + { + "epoch": 0.6922492401215805, + "grad_norm": 1.2571251392364502, + "learning_rate": 4.5682898666069815e-06, + "loss": 0.43535223603248596, + "mean_token_accuracy": 0.859239935874939, + "num_tokens": 8321548.0, + "step": 911 + }, + { + "epoch": 0.6930091185410334, + "grad_norm": 1.2224860191345215, + "learning_rate": 4.567112641967697e-06, + "loss": 0.40205076336860657, + "mean_token_accuracy": 0.8724711537361145, + "num_tokens": 8335205.0, + "step": 912 + }, + { + "epoch": 0.6937689969604863, + "grad_norm": 1.2064491510391235, + "learning_rate": 4.5659339665491894e-06, + "loss": 0.37790587544441223, + "mean_token_accuracy": 0.8464339971542358, + "num_tokens": 8350926.0, + "step": 913 + }, + { + "epoch": 0.6945288753799392, + "grad_norm": 2.1755270957946777, + "learning_rate": 4.5647538411786965e-06, + "loss": 0.42034298181533813, + "mean_token_accuracy": 0.84148108959198, + "num_tokens": 8356739.0, + "step": 914 + }, + { + "epoch": 0.6952887537993921, + "grad_norm": 1.234864592552185, + "learning_rate": 4.563572266684478e-06, + "loss": 0.5062938332557678, + "mean_token_accuracy": 0.8132052421569824, + "num_tokens": 8373660.0, + "step": 915 + }, + { + "epoch": 0.6960486322188449, + "grad_norm": 2.4250621795654297, + "learning_rate": 4.562389243895807e-06, + "loss": 0.4907791018486023, + "mean_token_accuracy": 0.8337979912757874, + "num_tokens": 8378661.0, + "step": 916 + }, + { + "epoch": 0.6968085106382979, + "grad_norm": 1.5018314123153687, + "learning_rate": 4.561204773642974e-06, + "loss": 0.41041281819343567, + "mean_token_accuracy": 0.8569784164428711, + "num_tokens": 8390322.0, + "step": 917 + }, + { + "epoch": 0.6975683890577508, + "grad_norm": 2.797269344329834, + "learning_rate": 4.5600188567572874e-06, + "loss": 0.3146931529045105, + "mean_token_accuracy": 0.8913302421569824, + "num_tokens": 8393567.0, + "step": 918 + }, + { + "epoch": 0.6983282674772037, + "grad_norm": 1.4002827405929565, + "learning_rate": 4.558831494071069e-06, + "loss": 0.4275597333908081, + "mean_token_accuracy": 0.8504893779754639, + "num_tokens": 8407119.0, + "step": 919 + }, + { + "epoch": 0.6990881458966566, + "grad_norm": 1.7045831680297852, + "learning_rate": 4.557642686417654e-06, + "loss": 0.49593430757522583, + "mean_token_accuracy": 0.8185091018676758, + "num_tokens": 8417408.0, + "step": 920 + }, + { + "epoch": 0.6998480243161094, + "grad_norm": 2.8818066120147705, + "learning_rate": 4.556452434631396e-06, + "loss": 0.637908935546875, + "mean_token_accuracy": 0.7883946895599365, + "num_tokens": 8422319.0, + "step": 921 + }, + { + "epoch": 0.7006079027355623, + "grad_norm": 2.3587265014648438, + "learning_rate": 4.555260739547657e-06, + "loss": 0.38749319314956665, + "mean_token_accuracy": 0.8774704933166504, + "num_tokens": 8427315.0, + "step": 922 + }, + { + "epoch": 0.7013677811550152, + "grad_norm": 1.6648749113082886, + "learning_rate": 4.554067602002815e-06, + "loss": 0.4044865369796753, + "mean_token_accuracy": 0.8524141311645508, + "num_tokens": 8438662.0, + "step": 923 + }, + { + "epoch": 0.7021276595744681, + "grad_norm": 3.467787742614746, + "learning_rate": 4.55287302283426e-06, + "loss": 0.591016411781311, + "mean_token_accuracy": 0.81184983253479, + "num_tokens": 8442237.0, + "step": 924 + }, + { + "epoch": 0.702887537993921, + "grad_norm": 2.1458635330200195, + "learning_rate": 4.551677002880395e-06, + "loss": 0.5017476677894592, + "mean_token_accuracy": 0.822914183139801, + "num_tokens": 8449494.0, + "step": 925 + }, + { + "epoch": 0.7036474164133738, + "grad_norm": 2.521714448928833, + "learning_rate": 4.550479542980632e-06, + "loss": 0.531912088394165, + "mean_token_accuracy": 0.8225687742233276, + "num_tokens": 8454983.0, + "step": 926 + }, + { + "epoch": 0.7044072948328267, + "grad_norm": 3.5248100757598877, + "learning_rate": 4.549280643975394e-06, + "loss": 0.4631815254688263, + "mean_token_accuracy": 0.8443771600723267, + "num_tokens": 8458504.0, + "step": 927 + }, + { + "epoch": 0.7051671732522796, + "grad_norm": 2.5105819702148438, + "learning_rate": 4.548080306706114e-06, + "loss": 0.30487123131752014, + "mean_token_accuracy": 0.9018767476081848, + "num_tokens": 8462589.0, + "step": 928 + }, + { + "epoch": 0.7059270516717325, + "grad_norm": 1.3367713689804077, + "learning_rate": 4.5468785320152365e-06, + "loss": 0.4355026185512543, + "mean_token_accuracy": 0.8323584794998169, + "num_tokens": 8478450.0, + "step": 929 + }, + { + "epoch": 0.7066869300911854, + "grad_norm": 2.2506282329559326, + "learning_rate": 4.545675320746212e-06, + "loss": 0.5082957744598389, + "mean_token_accuracy": 0.823430597782135, + "num_tokens": 8485991.0, + "step": 930 + }, + { + "epoch": 0.7074468085106383, + "grad_norm": 1.7164632081985474, + "learning_rate": 4.544470673743502e-06, + "loss": 0.3960164785385132, + "mean_token_accuracy": 0.8592486381530762, + "num_tokens": 8495217.0, + "step": 931 + }, + { + "epoch": 0.7082066869300911, + "grad_norm": 1.5864969491958618, + "learning_rate": 4.543264591852572e-06, + "loss": 0.49114471673965454, + "mean_token_accuracy": 0.8330780267715454, + "num_tokens": 8508904.0, + "step": 932 + }, + { + "epoch": 0.708966565349544, + "grad_norm": 2.1707003116607666, + "learning_rate": 4.542057075919898e-06, + "loss": 0.49895772337913513, + "mean_token_accuracy": 0.8327431082725525, + "num_tokens": 8515792.0, + "step": 933 + }, + { + "epoch": 0.709726443768997, + "grad_norm": 1.9002083539962769, + "learning_rate": 4.54084812679296e-06, + "loss": 0.4548531472682953, + "mean_token_accuracy": 0.834532618522644, + "num_tokens": 8524006.0, + "step": 934 + }, + { + "epoch": 0.7104863221884499, + "grad_norm": 1.8505141735076904, + "learning_rate": 4.539637745320247e-06, + "loss": 0.35716521739959717, + "mean_token_accuracy": 0.872222900390625, + "num_tokens": 8533647.0, + "step": 935 + }, + { + "epoch": 0.7112462006079028, + "grad_norm": 2.092620849609375, + "learning_rate": 4.53842593235125e-06, + "loss": 0.4673694372177124, + "mean_token_accuracy": 0.8460999131202698, + "num_tokens": 8540734.0, + "step": 936 + }, + { + "epoch": 0.7120060790273556, + "grad_norm": 2.689514636993408, + "learning_rate": 4.537212688736466e-06, + "loss": 0.45461273193359375, + "mean_token_accuracy": 0.8450704216957092, + "num_tokens": 8544948.0, + "step": 937 + }, + { + "epoch": 0.7127659574468085, + "grad_norm": 2.4507734775543213, + "learning_rate": 4.535998015327396e-06, + "loss": 0.4571906626224518, + "mean_token_accuracy": 0.8429360389709473, + "num_tokens": 8550445.0, + "step": 938 + }, + { + "epoch": 0.7135258358662614, + "grad_norm": 1.8960013389587402, + "learning_rate": 4.534781912976546e-06, + "loss": 0.4461391568183899, + "mean_token_accuracy": 0.8487973213195801, + "num_tokens": 8557630.0, + "step": 939 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 1.602611780166626, + "learning_rate": 4.533564382537421e-06, + "loss": 0.5277102589607239, + "mean_token_accuracy": 0.8330916166305542, + "num_tokens": 8570397.0, + "step": 940 + }, + { + "epoch": 0.7150455927051672, + "grad_norm": 1.8936395645141602, + "learning_rate": 4.532345424864533e-06, + "loss": 0.38619571924209595, + "mean_token_accuracy": 0.8514572381973267, + "num_tokens": 8582673.0, + "step": 941 + }, + { + "epoch": 0.71580547112462, + "grad_norm": 1.3898619413375854, + "learning_rate": 4.531125040813392e-06, + "loss": 0.4825032949447632, + "mean_token_accuracy": 0.833012580871582, + "num_tokens": 8597239.0, + "step": 942 + }, + { + "epoch": 0.7165653495440729, + "grad_norm": 2.128230571746826, + "learning_rate": 4.529903231240511e-06, + "loss": 0.4862118065357208, + "mean_token_accuracy": 0.8210917711257935, + "num_tokens": 8605877.0, + "step": 943 + }, + { + "epoch": 0.7173252279635258, + "grad_norm": 1.6552259922027588, + "learning_rate": 4.528679997003403e-06, + "loss": 0.5092059373855591, + "mean_token_accuracy": 0.8247389793395996, + "num_tokens": 8617060.0, + "step": 944 + }, + { + "epoch": 0.7180851063829787, + "grad_norm": 2.1174771785736084, + "learning_rate": 4.52745533896058e-06, + "loss": 0.39110174775123596, + "mean_token_accuracy": 0.8672944903373718, + "num_tokens": 8623306.0, + "step": 945 + }, + { + "epoch": 0.7188449848024316, + "grad_norm": 2.8648383617401123, + "learning_rate": 4.526229257971556e-06, + "loss": 0.49864327907562256, + "mean_token_accuracy": 0.8305130004882812, + "num_tokens": 8627466.0, + "step": 946 + }, + { + "epoch": 0.7196048632218845, + "grad_norm": 2.155514717102051, + "learning_rate": 4.52500175489684e-06, + "loss": 0.5070191025733948, + "mean_token_accuracy": 0.8311188817024231, + "num_tokens": 8634759.0, + "step": 947 + }, + { + "epoch": 0.7203647416413373, + "grad_norm": 1.8432683944702148, + "learning_rate": 4.523772830597942e-06, + "loss": 0.5569252371788025, + "mean_token_accuracy": 0.8070821762084961, + "num_tokens": 8644160.0, + "step": 948 + }, + { + "epoch": 0.7211246200607903, + "grad_norm": 2.8912241458892822, + "learning_rate": 4.522542485937369e-06, + "loss": 0.4799427390098572, + "mean_token_accuracy": 0.8443552851676941, + "num_tokens": 8648377.0, + "step": 949 + }, + { + "epoch": 0.7218844984802432, + "grad_norm": 3.3449625968933105, + "learning_rate": 4.521310721778622e-06, + "loss": 0.44043463468551636, + "mean_token_accuracy": 0.8521315455436707, + "num_tokens": 8651846.0, + "step": 950 + }, + { + "epoch": 0.7226443768996961, + "grad_norm": 1.4127917289733887, + "learning_rate": 4.520077538986203e-06, + "loss": 0.4700999855995178, + "mean_token_accuracy": 0.8377952575683594, + "num_tokens": 8665199.0, + "step": 951 + }, + { + "epoch": 0.723404255319149, + "grad_norm": 2.1607301235198975, + "learning_rate": 4.518842938425606e-06, + "loss": 0.4374256730079651, + "mean_token_accuracy": 0.8448896408081055, + "num_tokens": 8672158.0, + "step": 952 + }, + { + "epoch": 0.7241641337386018, + "grad_norm": 1.3442779779434204, + "learning_rate": 4.51760692096332e-06, + "loss": 0.38948923349380493, + "mean_token_accuracy": 0.8598923683166504, + "num_tokens": 8684532.0, + "step": 953 + }, + { + "epoch": 0.7249240121580547, + "grad_norm": 2.0003178119659424, + "learning_rate": 4.516369487466832e-06, + "loss": 0.3797217011451721, + "mean_token_accuracy": 0.8652102947235107, + "num_tokens": 8691460.0, + "step": 954 + }, + { + "epoch": 0.7256838905775076, + "grad_norm": 1.8196535110473633, + "learning_rate": 4.5151306388046175e-06, + "loss": 0.5676811933517456, + "mean_token_accuracy": 0.818500816822052, + "num_tokens": 8701624.0, + "step": 955 + }, + { + "epoch": 0.7264437689969605, + "grad_norm": 2.1962296962738037, + "learning_rate": 4.513890375846152e-06, + "loss": 0.45399484038352966, + "mean_token_accuracy": 0.8463879227638245, + "num_tokens": 8707410.0, + "step": 956 + }, + { + "epoch": 0.7272036474164134, + "grad_norm": 1.8798872232437134, + "learning_rate": 4.512648699461897e-06, + "loss": 0.5679811239242554, + "mean_token_accuracy": 0.8089900016784668, + "num_tokens": 8715630.0, + "step": 957 + }, + { + "epoch": 0.7279635258358662, + "grad_norm": 2.3540258407592773, + "learning_rate": 4.511405610523309e-06, + "loss": 0.5282865762710571, + "mean_token_accuracy": 0.8196114301681519, + "num_tokens": 8721934.0, + "step": 958 + }, + { + "epoch": 0.7287234042553191, + "grad_norm": 2.5630908012390137, + "learning_rate": 4.510161109902837e-06, + "loss": 0.39442378282546997, + "mean_token_accuracy": 0.8400980830192566, + "num_tokens": 8726511.0, + "step": 959 + }, + { + "epoch": 0.729483282674772, + "grad_norm": 1.9829226732254028, + "learning_rate": 4.508915198473919e-06, + "loss": 0.4611976742744446, + "mean_token_accuracy": 0.8439624309539795, + "num_tokens": 8733460.0, + "step": 960 + }, + { + "epoch": 0.7302431610942249, + "grad_norm": 3.0291950702667236, + "learning_rate": 4.507667877110982e-06, + "loss": 0.5158340930938721, + "mean_token_accuracy": 0.8300060033798218, + "num_tokens": 8737629.0, + "step": 961 + }, + { + "epoch": 0.7310030395136778, + "grad_norm": 1.9208252429962158, + "learning_rate": 4.506419146689445e-06, + "loss": 0.3807099163532257, + "mean_token_accuracy": 0.871469259262085, + "num_tokens": 8744615.0, + "step": 962 + }, + { + "epoch": 0.7317629179331308, + "grad_norm": 3.051565408706665, + "learning_rate": 4.505169008085717e-06, + "loss": 0.38461726903915405, + "mean_token_accuracy": 0.874465823173523, + "num_tokens": 8748154.0, + "step": 963 + }, + { + "epoch": 0.7325227963525835, + "grad_norm": 1.375466227531433, + "learning_rate": 4.503917462177192e-06, + "loss": 0.42490679025650024, + "mean_token_accuracy": 0.8457326889038086, + "num_tokens": 8760965.0, + "step": 964 + }, + { + "epoch": 0.7332826747720365, + "grad_norm": 2.216681957244873, + "learning_rate": 4.5026645098422515e-06, + "loss": 0.43149900436401367, + "mean_token_accuracy": 0.8527278900146484, + "num_tokens": 8766996.0, + "step": 965 + }, + { + "epoch": 0.7340425531914894, + "grad_norm": 1.9422595500946045, + "learning_rate": 4.5014101519602684e-06, + "loss": 0.4964504539966583, + "mean_token_accuracy": 0.8137556314468384, + "num_tokens": 8774411.0, + "step": 966 + }, + { + "epoch": 0.7348024316109423, + "grad_norm": 2.058887004852295, + "learning_rate": 4.500154389411598e-06, + "loss": 0.4977570176124573, + "mean_token_accuracy": 0.8254626989364624, + "num_tokens": 8782220.0, + "step": 967 + }, + { + "epoch": 0.7355623100303952, + "grad_norm": 2.9977786540985107, + "learning_rate": 4.498897223077582e-06, + "loss": 0.4061415195465088, + "mean_token_accuracy": 0.8752427101135254, + "num_tokens": 8786120.0, + "step": 968 + }, + { + "epoch": 0.736322188449848, + "grad_norm": 2.2636303901672363, + "learning_rate": 4.49763865384055e-06, + "loss": 0.5062161087989807, + "mean_token_accuracy": 0.8171653747558594, + "num_tokens": 8792459.0, + "step": 969 + }, + { + "epoch": 0.7370820668693009, + "grad_norm": 1.8850842714309692, + "learning_rate": 4.496378682583813e-06, + "loss": 0.5014280676841736, + "mean_token_accuracy": 0.8547511100769043, + "num_tokens": 8800675.0, + "step": 970 + }, + { + "epoch": 0.7378419452887538, + "grad_norm": 1.191985011100769, + "learning_rate": 4.495117310191667e-06, + "loss": 0.4713883101940155, + "mean_token_accuracy": 0.8213596343994141, + "num_tokens": 8820740.0, + "step": 971 + }, + { + "epoch": 0.7386018237082067, + "grad_norm": 1.823000192642212, + "learning_rate": 4.493854537549393e-06, + "loss": 0.46332645416259766, + "mean_token_accuracy": 0.8359860777854919, + "num_tokens": 8828884.0, + "step": 972 + }, + { + "epoch": 0.7393617021276596, + "grad_norm": 2.590446949005127, + "learning_rate": 4.492590365543253e-06, + "loss": 0.49074703454971313, + "mean_token_accuracy": 0.8433758020401001, + "num_tokens": 8833859.0, + "step": 973 + }, + { + "epoch": 0.7401215805471124, + "grad_norm": 2.2762670516967773, + "learning_rate": 4.491324795060491e-06, + "loss": 0.39465656876564026, + "mean_token_accuracy": 0.8734766244888306, + "num_tokens": 8839350.0, + "step": 974 + }, + { + "epoch": 0.7408814589665653, + "grad_norm": 2.698725461959839, + "learning_rate": 4.490057826989333e-06, + "loss": 0.5552085041999817, + "mean_token_accuracy": 0.8132266998291016, + "num_tokens": 8844373.0, + "step": 975 + }, + { + "epoch": 0.7416413373860182, + "grad_norm": 2.704606294631958, + "learning_rate": 4.488789462218988e-06, + "loss": 0.3447791635990143, + "mean_token_accuracy": 0.8736170530319214, + "num_tokens": 8848236.0, + "step": 976 + }, + { + "epoch": 0.7424012158054711, + "grad_norm": 3.1260716915130615, + "learning_rate": 4.487519701639641e-06, + "loss": 0.5945233702659607, + "mean_token_accuracy": 0.7997599840164185, + "num_tokens": 8852935.0, + "step": 977 + }, + { + "epoch": 0.743161094224924, + "grad_norm": 1.6895452737808228, + "learning_rate": 4.486248546142459e-06, + "loss": 0.4823892116546631, + "mean_token_accuracy": 0.8279662132263184, + "num_tokens": 8861743.0, + "step": 978 + }, + { + "epoch": 0.743920972644377, + "grad_norm": 1.9161452054977417, + "learning_rate": 4.4849759966195885e-06, + "loss": 0.5266581773757935, + "mean_token_accuracy": 0.8218623399734497, + "num_tokens": 8870601.0, + "step": 979 + }, + { + "epoch": 0.7446808510638298, + "grad_norm": 1.6894301176071167, + "learning_rate": 4.483702053964154e-06, + "loss": 0.4186219573020935, + "mean_token_accuracy": 0.8471781015396118, + "num_tokens": 8885617.0, + "step": 980 + }, + { + "epoch": 0.7454407294832827, + "grad_norm": 1.6319992542266846, + "learning_rate": 4.482426719070258e-06, + "loss": 0.541317880153656, + "mean_token_accuracy": 0.8216162323951721, + "num_tokens": 8897595.0, + "step": 981 + }, + { + "epoch": 0.7462006079027356, + "grad_norm": 5.102413177490234, + "learning_rate": 4.4811499928329775e-06, + "loss": 0.3928517699241638, + "mean_token_accuracy": 0.858033299446106, + "num_tokens": 8901682.0, + "step": 982 + }, + { + "epoch": 0.7469604863221885, + "grad_norm": 2.213860273361206, + "learning_rate": 4.479871876148368e-06, + "loss": 0.4276347756385803, + "mean_token_accuracy": 0.8529798984527588, + "num_tokens": 8908088.0, + "step": 983 + }, + { + "epoch": 0.7477203647416414, + "grad_norm": 1.2180038690567017, + "learning_rate": 4.478592369913464e-06, + "loss": 0.3941590189933777, + "mean_token_accuracy": 0.8608149290084839, + "num_tokens": 8925876.0, + "step": 984 + }, + { + "epoch": 0.7484802431610942, + "grad_norm": 2.849802255630493, + "learning_rate": 4.477311475026271e-06, + "loss": 0.42190325260162354, + "mean_token_accuracy": 0.860505223274231, + "num_tokens": 8930190.0, + "step": 985 + }, + { + "epoch": 0.7492401215805471, + "grad_norm": 1.704128384590149, + "learning_rate": 4.476029192385769e-06, + "loss": 0.4786282777786255, + "mean_token_accuracy": 0.8302322626113892, + "num_tokens": 8938340.0, + "step": 986 + }, + { + "epoch": 0.75, + "grad_norm": 2.06322979927063, + "learning_rate": 4.474745522891915e-06, + "loss": 0.4648786187171936, + "mean_token_accuracy": 0.8366481065750122, + "num_tokens": 8944633.0, + "step": 987 + }, + { + "epoch": 0.7507598784194529, + "grad_norm": 2.0745396614074707, + "learning_rate": 4.473460467445637e-06, + "loss": 0.5744885206222534, + "mean_token_accuracy": 0.8357284069061279, + "num_tokens": 8954457.0, + "step": 988 + }, + { + "epoch": 0.7515197568389058, + "grad_norm": 1.9281407594680786, + "learning_rate": 4.472174026948836e-06, + "loss": 0.528974175453186, + "mean_token_accuracy": 0.8083580732345581, + "num_tokens": 8962701.0, + "step": 989 + }, + { + "epoch": 0.7522796352583586, + "grad_norm": 3.012381076812744, + "learning_rate": 4.470886202304385e-06, + "loss": 0.48754751682281494, + "mean_token_accuracy": 0.8368391990661621, + "num_tokens": 8967272.0, + "step": 990 + }, + { + "epoch": 0.7530395136778115, + "grad_norm": 1.691826581954956, + "learning_rate": 4.469596994416131e-06, + "loss": 0.484740674495697, + "mean_token_accuracy": 0.8500643968582153, + "num_tokens": 8976615.0, + "step": 991 + }, + { + "epoch": 0.7537993920972644, + "grad_norm": 2.4961965084075928, + "learning_rate": 4.468306404188887e-06, + "loss": 0.50777268409729, + "mean_token_accuracy": 0.8168395757675171, + "num_tokens": 8983235.0, + "step": 992 + }, + { + "epoch": 0.7545592705167173, + "grad_norm": 1.512007713317871, + "learning_rate": 4.467014432528441e-06, + "loss": 0.4583340287208557, + "mean_token_accuracy": 0.8465162515640259, + "num_tokens": 8993815.0, + "step": 993 + }, + { + "epoch": 0.7553191489361702, + "grad_norm": 1.9362257719039917, + "learning_rate": 4.465721080341547e-06, + "loss": 0.6027892827987671, + "mean_token_accuracy": 0.8052380084991455, + "num_tokens": 9002697.0, + "step": 994 + }, + { + "epoch": 0.756079027355623, + "grad_norm": 2.473632335662842, + "learning_rate": 4.4644263485359316e-06, + "loss": 0.5394320487976074, + "mean_token_accuracy": 0.834665834903717, + "num_tokens": 9007428.0, + "step": 995 + }, + { + "epoch": 0.756838905775076, + "grad_norm": 2.2527434825897217, + "learning_rate": 4.463130238020284e-06, + "loss": 0.5485198497772217, + "mean_token_accuracy": 0.8090173006057739, + "num_tokens": 9013570.0, + "step": 996 + }, + { + "epoch": 0.7575987841945289, + "grad_norm": 1.4130940437316895, + "learning_rate": 4.4618327497042676e-06, + "loss": 0.37994423508644104, + "mean_token_accuracy": 0.8625167012214661, + "num_tokens": 9025485.0, + "step": 997 + }, + { + "epoch": 0.7583586626139818, + "grad_norm": 2.685115098953247, + "learning_rate": 4.460533884498509e-06, + "loss": 0.447973370552063, + "mean_token_accuracy": 0.8564165234565735, + "num_tokens": 9030355.0, + "step": 998 + }, + { + "epoch": 0.7591185410334347, + "grad_norm": 3.2743139266967773, + "learning_rate": 4.4592336433146e-06, + "loss": 0.45275989174842834, + "mean_token_accuracy": 0.8462578058242798, + "num_tokens": 9034406.0, + "step": 999 + }, + { + "epoch": 0.7598784194528876, + "grad_norm": 1.9383049011230469, + "learning_rate": 4.457932027065102e-06, + "loss": 0.5387729406356812, + "mean_token_accuracy": 0.8357330560684204, + "num_tokens": 9041502.0, + "step": 1000 + }, + { + "epoch": 0.7606382978723404, + "grad_norm": 2.7348275184631348, + "learning_rate": 4.456629036663537e-06, + "loss": 0.4448447823524475, + "mean_token_accuracy": 0.8453642129898071, + "num_tokens": 9046088.0, + "step": 1001 + }, + { + "epoch": 0.7613981762917933, + "grad_norm": 1.8477401733398438, + "learning_rate": 4.455324673024396e-06, + "loss": 0.5766505002975464, + "mean_token_accuracy": 0.8074213862419128, + "num_tokens": 9055678.0, + "step": 1002 + }, + { + "epoch": 0.7621580547112462, + "grad_norm": 3.134481430053711, + "learning_rate": 4.4540189370631315e-06, + "loss": 0.5690872669219971, + "mean_token_accuracy": 0.8414670825004578, + "num_tokens": 9062006.0, + "step": 1003 + }, + { + "epoch": 0.7629179331306991, + "grad_norm": 1.7933398485183716, + "learning_rate": 4.452711829696158e-06, + "loss": 0.4898291826248169, + "mean_token_accuracy": 0.8259007930755615, + "num_tokens": 9070754.0, + "step": 1004 + }, + { + "epoch": 0.763677811550152, + "grad_norm": 1.2552275657653809, + "learning_rate": 4.451403351840855e-06, + "loss": 0.4280198812484741, + "mean_token_accuracy": 0.8409112691879272, + "num_tokens": 9085306.0, + "step": 1005 + }, + { + "epoch": 0.7644376899696048, + "grad_norm": 1.6749331951141357, + "learning_rate": 4.450093504415562e-06, + "loss": 0.3723178505897522, + "mean_token_accuracy": 0.8545734882354736, + "num_tokens": 9102453.0, + "step": 1006 + }, + { + "epoch": 0.7651975683890577, + "grad_norm": 2.7514500617980957, + "learning_rate": 4.44878228833958e-06, + "loss": 0.5463190674781799, + "mean_token_accuracy": 0.8121639490127563, + "num_tokens": 9108342.0, + "step": 1007 + }, + { + "epoch": 0.7659574468085106, + "grad_norm": 1.3322733640670776, + "learning_rate": 4.447469704533172e-06, + "loss": 0.573723316192627, + "mean_token_accuracy": 0.8065711259841919, + "num_tokens": 9123712.0, + "step": 1008 + }, + { + "epoch": 0.7667173252279635, + "grad_norm": 2.6893765926361084, + "learning_rate": 4.446155753917559e-06, + "loss": 0.6856257915496826, + "mean_token_accuracy": 0.7718256711959839, + "num_tokens": 9130728.0, + "step": 1009 + }, + { + "epoch": 0.7674772036474165, + "grad_norm": 1.792765498161316, + "learning_rate": 4.444840437414923e-06, + "loss": 0.48203110694885254, + "mean_token_accuracy": 0.8419194221496582, + "num_tokens": 9137983.0, + "step": 1010 + }, + { + "epoch": 0.7682370820668692, + "grad_norm": 1.4957399368286133, + "learning_rate": 4.443523755948401e-06, + "loss": 0.4372181296348572, + "mean_token_accuracy": 0.8491764664649963, + "num_tokens": 9148081.0, + "step": 1011 + }, + { + "epoch": 0.7689969604863222, + "grad_norm": 1.7294867038726807, + "learning_rate": 4.442205710442095e-06, + "loss": 0.54277503490448, + "mean_token_accuracy": 0.8196806907653809, + "num_tokens": 9158407.0, + "step": 1012 + }, + { + "epoch": 0.7697568389057751, + "grad_norm": 2.2091221809387207, + "learning_rate": 4.4408863018210564e-06, + "loss": 0.4888187646865845, + "mean_token_accuracy": 0.8384175300598145, + "num_tokens": 9164754.0, + "step": 1013 + }, + { + "epoch": 0.770516717325228, + "grad_norm": 1.7615830898284912, + "learning_rate": 4.439565531011299e-06, + "loss": 0.4640008211135864, + "mean_token_accuracy": 0.8424701690673828, + "num_tokens": 9172715.0, + "step": 1014 + }, + { + "epoch": 0.7712765957446809, + "grad_norm": 1.6796128749847412, + "learning_rate": 4.43824339893979e-06, + "loss": 0.5227609276771545, + "mean_token_accuracy": 0.8135923743247986, + "num_tokens": 9183214.0, + "step": 1015 + }, + { + "epoch": 0.7720364741641338, + "grad_norm": 2.1485698223114014, + "learning_rate": 4.436919906534452e-06, + "loss": 0.4857056140899658, + "mean_token_accuracy": 0.8323013782501221, + "num_tokens": 9190360.0, + "step": 1016 + }, + { + "epoch": 0.7727963525835866, + "grad_norm": 2.7842206954956055, + "learning_rate": 4.4355950547241645e-06, + "loss": 0.46406883001327515, + "mean_token_accuracy": 0.859869122505188, + "num_tokens": 9194523.0, + "step": 1017 + }, + { + "epoch": 0.7735562310030395, + "grad_norm": 2.3774640560150146, + "learning_rate": 4.434268844438758e-06, + "loss": 0.5625549554824829, + "mean_token_accuracy": 0.8188897371292114, + "num_tokens": 9201155.0, + "step": 1018 + }, + { + "epoch": 0.7743161094224924, + "grad_norm": 2.004427909851074, + "learning_rate": 4.432941276609018e-06, + "loss": 0.5164387226104736, + "mean_token_accuracy": 0.829569935798645, + "num_tokens": 9209269.0, + "step": 1019 + }, + { + "epoch": 0.7750759878419453, + "grad_norm": 1.7218989133834839, + "learning_rate": 4.431612352166684e-06, + "loss": 0.481005996465683, + "mean_token_accuracy": 0.8359906673431396, + "num_tokens": 9220860.0, + "step": 1020 + }, + { + "epoch": 0.7758358662613982, + "grad_norm": 2.197108507156372, + "learning_rate": 4.4302820720444454e-06, + "loss": 0.440413236618042, + "mean_token_accuracy": 0.8412867784500122, + "num_tokens": 9226414.0, + "step": 1021 + }, + { + "epoch": 0.776595744680851, + "grad_norm": 2.6995162963867188, + "learning_rate": 4.428950437175944e-06, + "loss": 0.3884299397468567, + "mean_token_accuracy": 0.8696021437644958, + "num_tokens": 9230898.0, + "step": 1022 + }, + { + "epoch": 0.7773556231003039, + "grad_norm": 2.1671667098999023, + "learning_rate": 4.427617448495772e-06, + "loss": 0.5747478008270264, + "mean_token_accuracy": 0.7842930555343628, + "num_tokens": 9238479.0, + "step": 1023 + }, + { + "epoch": 0.7781155015197568, + "grad_norm": 1.6299028396606445, + "learning_rate": 4.426283106939474e-06, + "loss": 0.39478403329849243, + "mean_token_accuracy": 0.8685503602027893, + "num_tokens": 9248263.0, + "step": 1024 + }, + { + "epoch": 0.7788753799392097, + "grad_norm": 2.2621798515319824, + "learning_rate": 4.424947413443539e-06, + "loss": 0.4582178592681885, + "mean_token_accuracy": 0.8312377333641052, + "num_tokens": 9254168.0, + "step": 1025 + }, + { + "epoch": 0.7796352583586627, + "grad_norm": 2.121091365814209, + "learning_rate": 4.423610368945411e-06, + "loss": 0.5315121412277222, + "mean_token_accuracy": 0.8121483325958252, + "num_tokens": 9261808.0, + "step": 1026 + }, + { + "epoch": 0.7803951367781155, + "grad_norm": 1.8558297157287598, + "learning_rate": 4.422271974383479e-06, + "loss": 0.4299176037311554, + "mean_token_accuracy": 0.8452648520469666, + "num_tokens": 9269264.0, + "step": 1027 + }, + { + "epoch": 0.7811550151975684, + "grad_norm": 1.9089949131011963, + "learning_rate": 4.420932230697079e-06, + "loss": 0.43876272439956665, + "mean_token_accuracy": 0.8434094190597534, + "num_tokens": 9277381.0, + "step": 1028 + }, + { + "epoch": 0.7819148936170213, + "grad_norm": 1.8619649410247803, + "learning_rate": 4.419591138826495e-06, + "loss": 0.48798668384552, + "mean_token_accuracy": 0.8281317353248596, + "num_tokens": 9285413.0, + "step": 1029 + }, + { + "epoch": 0.7826747720364742, + "grad_norm": 1.3273087739944458, + "learning_rate": 4.418248699712955e-06, + "loss": 0.4611460864543915, + "mean_token_accuracy": 0.8233213424682617, + "num_tokens": 9300805.0, + "step": 1030 + }, + { + "epoch": 0.7834346504559271, + "grad_norm": 1.0473746061325073, + "learning_rate": 4.416904914298637e-06, + "loss": 0.36537665128707886, + "mean_token_accuracy": 0.8671857118606567, + "num_tokens": 9320035.0, + "step": 1031 + }, + { + "epoch": 0.78419452887538, + "grad_norm": 1.9130918979644775, + "learning_rate": 4.415559783526661e-06, + "loss": 0.4916655123233795, + "mean_token_accuracy": 0.8266351222991943, + "num_tokens": 9326795.0, + "step": 1032 + }, + { + "epoch": 0.7849544072948328, + "grad_norm": 2.0001816749572754, + "learning_rate": 4.414213308341092e-06, + "loss": 0.5711008310317993, + "mean_token_accuracy": 0.8093076348304749, + "num_tokens": 9335625.0, + "step": 1033 + }, + { + "epoch": 0.7857142857142857, + "grad_norm": 3.933542251586914, + "learning_rate": 4.412865489686936e-06, + "loss": 0.621616542339325, + "mean_token_accuracy": 0.7938898801803589, + "num_tokens": 9339080.0, + "step": 1034 + }, + { + "epoch": 0.7864741641337386, + "grad_norm": 2.061558961868286, + "learning_rate": 4.411516328510145e-06, + "loss": 0.583686113357544, + "mean_token_accuracy": 0.8216883540153503, + "num_tokens": 9348581.0, + "step": 1035 + }, + { + "epoch": 0.7872340425531915, + "grad_norm": 1.9401264190673828, + "learning_rate": 4.410165825757613e-06, + "loss": 0.4905240535736084, + "mean_token_accuracy": 0.8229951858520508, + "num_tokens": 9356032.0, + "step": 1036 + }, + { + "epoch": 0.7879939209726444, + "grad_norm": 3.620547294616699, + "learning_rate": 4.408813982377175e-06, + "loss": 0.4269888997077942, + "mean_token_accuracy": 0.8713940978050232, + "num_tokens": 9359061.0, + "step": 1037 + }, + { + "epoch": 0.7887537993920972, + "grad_norm": 1.2027851343154907, + "learning_rate": 4.407460799317605e-06, + "loss": 0.39972418546676636, + "mean_token_accuracy": 0.8610097765922546, + "num_tokens": 9377068.0, + "step": 1038 + }, + { + "epoch": 0.7895136778115501, + "grad_norm": 2.566753387451172, + "learning_rate": 4.40610627752862e-06, + "loss": 0.45267152786254883, + "mean_token_accuracy": 0.83243328332901, + "num_tokens": 9383604.0, + "step": 1039 + }, + { + "epoch": 0.790273556231003, + "grad_norm": 2.940094470977783, + "learning_rate": 4.404750417960876e-06, + "loss": 0.42862242460250854, + "mean_token_accuracy": 0.8582849502563477, + "num_tokens": 9387541.0, + "step": 1040 + }, + { + "epoch": 0.791033434650456, + "grad_norm": 2.0223944187164307, + "learning_rate": 4.403393221565966e-06, + "loss": 0.4349963665008545, + "mean_token_accuracy": 0.8453047871589661, + "num_tokens": 9394382.0, + "step": 1041 + }, + { + "epoch": 0.7917933130699089, + "grad_norm": 2.9399030208587646, + "learning_rate": 4.402034689296425e-06, + "loss": 0.32197174429893494, + "mean_token_accuracy": 0.8953392505645752, + "num_tokens": 9397741.0, + "step": 1042 + }, + { + "epoch": 0.7925531914893617, + "grad_norm": 2.819016456604004, + "learning_rate": 4.400674822105721e-06, + "loss": 0.6790289878845215, + "mean_token_accuracy": 0.8135063648223877, + "num_tokens": 9403509.0, + "step": 1043 + }, + { + "epoch": 0.7933130699088146, + "grad_norm": 1.3225977420806885, + "learning_rate": 4.399313620948262e-06, + "loss": 0.42203834652900696, + "mean_token_accuracy": 0.8399381637573242, + "num_tokens": 9418870.0, + "step": 1044 + }, + { + "epoch": 0.7940729483282675, + "grad_norm": 1.7822176218032837, + "learning_rate": 4.397951086779392e-06, + "loss": 0.4666554927825928, + "mean_token_accuracy": 0.8364764451980591, + "num_tokens": 9427640.0, + "step": 1045 + }, + { + "epoch": 0.7948328267477204, + "grad_norm": 3.186439037322998, + "learning_rate": 4.396587220555389e-06, + "loss": 0.6048363447189331, + "mean_token_accuracy": 0.7806557416915894, + "num_tokens": 9431927.0, + "step": 1046 + }, + { + "epoch": 0.7955927051671733, + "grad_norm": 3.0804805755615234, + "learning_rate": 4.395222023233467e-06, + "loss": 0.445969820022583, + "mean_token_accuracy": 0.850671112537384, + "num_tokens": 9436136.0, + "step": 1047 + }, + { + "epoch": 0.7963525835866262, + "grad_norm": 1.675968885421753, + "learning_rate": 4.393855495771774e-06, + "loss": 0.4311422109603882, + "mean_token_accuracy": 0.8449079990386963, + "num_tokens": 9445189.0, + "step": 1048 + }, + { + "epoch": 0.797112462006079, + "grad_norm": 2.342410087585449, + "learning_rate": 4.3924876391293915e-06, + "loss": 0.5733606219291687, + "mean_token_accuracy": 0.8156592845916748, + "num_tokens": 9451939.0, + "step": 1049 + }, + { + "epoch": 0.7978723404255319, + "grad_norm": 1.5967470407485962, + "learning_rate": 4.391118454266335e-06, + "loss": 0.46664729714393616, + "mean_token_accuracy": 0.8091695308685303, + "num_tokens": 9463968.0, + "step": 1050 + }, + { + "epoch": 0.7986322188449848, + "grad_norm": 1.5777863264083862, + "learning_rate": 4.389747942143549e-06, + "loss": 0.46028903126716614, + "mean_token_accuracy": 0.8347330093383789, + "num_tokens": 9475561.0, + "step": 1051 + }, + { + "epoch": 0.7993920972644377, + "grad_norm": 2.7630488872528076, + "learning_rate": 4.388376103722914e-06, + "loss": 0.5618188977241516, + "mean_token_accuracy": 0.8273467421531677, + "num_tokens": 9480661.0, + "step": 1052 + }, + { + "epoch": 0.8001519756838906, + "grad_norm": 2.093397378921509, + "learning_rate": 4.387002939967237e-06, + "loss": 0.2998353838920593, + "mean_token_accuracy": 0.8905231952667236, + "num_tokens": 9485924.0, + "step": 1053 + }, + { + "epoch": 0.8009118541033434, + "grad_norm": 1.4385871887207031, + "learning_rate": 4.38562845184026e-06, + "loss": 0.4944111704826355, + "mean_token_accuracy": 0.8403056263923645, + "num_tokens": 9500128.0, + "step": 1054 + }, + { + "epoch": 0.8016717325227963, + "grad_norm": 1.6393156051635742, + "learning_rate": 4.384252640306649e-06, + "loss": 0.5727907419204712, + "mean_token_accuracy": 0.7849414348602295, + "num_tokens": 9511569.0, + "step": 1055 + }, + { + "epoch": 0.8024316109422492, + "grad_norm": 2.3909664154052734, + "learning_rate": 4.382875506332002e-06, + "loss": 0.4760419726371765, + "mean_token_accuracy": 0.8408266305923462, + "num_tokens": 9517244.0, + "step": 1056 + }, + { + "epoch": 0.8031914893617021, + "grad_norm": 1.7288594245910645, + "learning_rate": 4.381497050882845e-06, + "loss": 0.5375926494598389, + "mean_token_accuracy": 0.8138614892959595, + "num_tokens": 9528736.0, + "step": 1057 + }, + { + "epoch": 0.8039513677811551, + "grad_norm": 2.093407392501831, + "learning_rate": 4.380117274926632e-06, + "loss": 0.46659404039382935, + "mean_token_accuracy": 0.8450702428817749, + "num_tokens": 9536200.0, + "step": 1058 + }, + { + "epoch": 0.8047112462006079, + "grad_norm": 1.6835898160934448, + "learning_rate": 4.3787361794317405e-06, + "loss": 0.43157699704170227, + "mean_token_accuracy": 0.8279973268508911, + "num_tokens": 9546314.0, + "step": 1059 + }, + { + "epoch": 0.8054711246200608, + "grad_norm": 1.983067512512207, + "learning_rate": 4.377353765367479e-06, + "loss": 0.5021739602088928, + "mean_token_accuracy": 0.8274815082550049, + "num_tokens": 9554375.0, + "step": 1060 + }, + { + "epoch": 0.8062310030395137, + "grad_norm": 2.0472030639648438, + "learning_rate": 4.375970033704078e-06, + "loss": 0.34298190474510193, + "mean_token_accuracy": 0.8900876045227051, + "num_tokens": 9560230.0, + "step": 1061 + }, + { + "epoch": 0.8069908814589666, + "grad_norm": 1.9613717794418335, + "learning_rate": 4.374584985412692e-06, + "loss": 0.3826758861541748, + "mean_token_accuracy": 0.839923620223999, + "num_tokens": 9566809.0, + "step": 1062 + }, + { + "epoch": 0.8077507598784195, + "grad_norm": 1.991289496421814, + "learning_rate": 4.373198621465405e-06, + "loss": 0.5492525100708008, + "mean_token_accuracy": 0.8153272867202759, + "num_tokens": 9576810.0, + "step": 1063 + }, + { + "epoch": 0.8085106382978723, + "grad_norm": 2.421370506286621, + "learning_rate": 4.3718109428352155e-06, + "loss": 0.5240297317504883, + "mean_token_accuracy": 0.8087242245674133, + "num_tokens": 9582906.0, + "step": 1064 + }, + { + "epoch": 0.8092705167173252, + "grad_norm": 3.697765588760376, + "learning_rate": 4.370421950496055e-06, + "loss": 0.6096476912498474, + "mean_token_accuracy": 0.787585973739624, + "num_tokens": 9586920.0, + "step": 1065 + }, + { + "epoch": 0.8100303951367781, + "grad_norm": 2.0767786502838135, + "learning_rate": 4.369031645422768e-06, + "loss": 0.41120079159736633, + "mean_token_accuracy": 0.8513731956481934, + "num_tokens": 9593902.0, + "step": 1066 + }, + { + "epoch": 0.810790273556231, + "grad_norm": 2.5968732833862305, + "learning_rate": 4.367640028591126e-06, + "loss": 0.3364982008934021, + "mean_token_accuracy": 0.8786963224411011, + "num_tokens": 9597745.0, + "step": 1067 + }, + { + "epoch": 0.8115501519756839, + "grad_norm": 2.165742874145508, + "learning_rate": 4.366247100977818e-06, + "loss": 0.406129390001297, + "mean_token_accuracy": 0.868243932723999, + "num_tokens": 9603496.0, + "step": 1068 + }, + { + "epoch": 0.8123100303951368, + "grad_norm": 2.0493404865264893, + "learning_rate": 4.364852863560456e-06, + "loss": 0.5356296300888062, + "mean_token_accuracy": 0.8191947340965271, + "num_tokens": 9610898.0, + "step": 1069 + }, + { + "epoch": 0.8130699088145896, + "grad_norm": 2.3224308490753174, + "learning_rate": 4.363457317317568e-06, + "loss": 0.41461923718452454, + "mean_token_accuracy": 0.8537945747375488, + "num_tokens": 9616626.0, + "step": 1070 + }, + { + "epoch": 0.8138297872340425, + "grad_norm": 1.7387986183166504, + "learning_rate": 4.362060463228603e-06, + "loss": 0.5134786367416382, + "mean_token_accuracy": 0.8511737585067749, + "num_tokens": 9626223.0, + "step": 1071 + }, + { + "epoch": 0.8145896656534954, + "grad_norm": 3.0270655155181885, + "learning_rate": 4.360662302273926e-06, + "loss": 0.3410695791244507, + "mean_token_accuracy": 0.8746449947357178, + "num_tokens": 9629455.0, + "step": 1072 + }, + { + "epoch": 0.8153495440729484, + "grad_norm": 1.7727062702178955, + "learning_rate": 4.35926283543482e-06, + "loss": 0.4610968828201294, + "mean_token_accuracy": 0.8444793224334717, + "num_tokens": 9638070.0, + "step": 1073 + }, + { + "epoch": 0.8161094224924013, + "grad_norm": 3.6333565711975098, + "learning_rate": 4.357862063693486e-06, + "loss": 0.3881273865699768, + "mean_token_accuracy": 0.8757344484329224, + "num_tokens": 9641028.0, + "step": 1074 + }, + { + "epoch": 0.8168693009118541, + "grad_norm": 3.024042844772339, + "learning_rate": 4.356459988033039e-06, + "loss": 0.3853808641433716, + "mean_token_accuracy": 0.8602254390716553, + "num_tokens": 9645730.0, + "step": 1075 + }, + { + "epoch": 0.817629179331307, + "grad_norm": 2.3359482288360596, + "learning_rate": 4.355056609437509e-06, + "loss": 0.4852045476436615, + "mean_token_accuracy": 0.8502728343009949, + "num_tokens": 9650975.0, + "step": 1076 + }, + { + "epoch": 0.8183890577507599, + "grad_norm": 2.2390685081481934, + "learning_rate": 4.353651928891842e-06, + "loss": 0.5287341475486755, + "mean_token_accuracy": 0.8247801065444946, + "num_tokens": 9657471.0, + "step": 1077 + }, + { + "epoch": 0.8191489361702128, + "grad_norm": 2.3809144496917725, + "learning_rate": 4.352245947381897e-06, + "loss": 0.5218510627746582, + "mean_token_accuracy": 0.8149170875549316, + "num_tokens": 9664108.0, + "step": 1078 + }, + { + "epoch": 0.8199088145896657, + "grad_norm": 1.7072309255599976, + "learning_rate": 4.3508386658944455e-06, + "loss": 0.46481168270111084, + "mean_token_accuracy": 0.834963321685791, + "num_tokens": 9673175.0, + "step": 1079 + }, + { + "epoch": 0.8206686930091185, + "grad_norm": 1.7383702993392944, + "learning_rate": 4.349430085417171e-06, + "loss": 0.4505952000617981, + "mean_token_accuracy": 0.8507769107818604, + "num_tokens": 9682800.0, + "step": 1080 + }, + { + "epoch": 0.8214285714285714, + "grad_norm": 2.4308547973632812, + "learning_rate": 4.348020206938672e-06, + "loss": 0.4832455515861511, + "mean_token_accuracy": 0.8538393974304199, + "num_tokens": 9688123.0, + "step": 1081 + }, + { + "epoch": 0.8221884498480243, + "grad_norm": 2.2686192989349365, + "learning_rate": 4.3466090314484526e-06, + "loss": 0.5112563371658325, + "mean_token_accuracy": 0.8308460712432861, + "num_tokens": 9694299.0, + "step": 1082 + }, + { + "epoch": 0.8229483282674772, + "grad_norm": 2.806093454360962, + "learning_rate": 4.345196559936931e-06, + "loss": 0.4818246364593506, + "mean_token_accuracy": 0.86617112159729, + "num_tokens": 9698471.0, + "step": 1083 + }, + { + "epoch": 0.8237082066869301, + "grad_norm": 1.7340706586837769, + "learning_rate": 4.343782793395435e-06, + "loss": 0.38246971368789673, + "mean_token_accuracy": 0.8675198554992676, + "num_tokens": 9706444.0, + "step": 1084 + }, + { + "epoch": 0.824468085106383, + "grad_norm": 1.664942741394043, + "learning_rate": 4.3423677328162e-06, + "loss": 0.498797208070755, + "mean_token_accuracy": 0.8447319865226746, + "num_tokens": 9716765.0, + "step": 1085 + }, + { + "epoch": 0.8252279635258358, + "grad_norm": 1.3608235120773315, + "learning_rate": 4.340951379192369e-06, + "loss": 0.41961491107940674, + "mean_token_accuracy": 0.8339346647262573, + "num_tokens": 9729564.0, + "step": 1086 + }, + { + "epoch": 0.8259878419452887, + "grad_norm": 1.642503261566162, + "learning_rate": 4.3395337335179945e-06, + "loss": 0.5477945804595947, + "mean_token_accuracy": 0.8117889761924744, + "num_tokens": 9741217.0, + "step": 1087 + }, + { + "epoch": 0.8267477203647416, + "grad_norm": 3.0345044136047363, + "learning_rate": 4.338114796788035e-06, + "loss": 0.5024623870849609, + "mean_token_accuracy": 0.8333141207695007, + "num_tokens": 9744941.0, + "step": 1088 + }, + { + "epoch": 0.8275075987841946, + "grad_norm": 1.3096630573272705, + "learning_rate": 4.336694569998354e-06, + "loss": 0.44169723987579346, + "mean_token_accuracy": 0.859926700592041, + "num_tokens": 9757854.0, + "step": 1089 + }, + { + "epoch": 0.8282674772036475, + "grad_norm": 2.203279495239258, + "learning_rate": 4.3352730541457215e-06, + "loss": 0.5283265113830566, + "mean_token_accuracy": 0.8053759932518005, + "num_tokens": 9764096.0, + "step": 1090 + }, + { + "epoch": 0.8290273556231003, + "grad_norm": 1.3774312734603882, + "learning_rate": 4.333850250227814e-06, + "loss": 0.4584103226661682, + "mean_token_accuracy": 0.8342611193656921, + "num_tokens": 9777768.0, + "step": 1091 + }, + { + "epoch": 0.8297872340425532, + "grad_norm": 1.822637915611267, + "learning_rate": 4.332426159243206e-06, + "loss": 0.5432791709899902, + "mean_token_accuracy": 0.8136210441589355, + "num_tokens": 9791276.0, + "step": 1092 + }, + { + "epoch": 0.8305471124620061, + "grad_norm": 3.0190067291259766, + "learning_rate": 4.331000782191384e-06, + "loss": 0.5018150806427002, + "mean_token_accuracy": 0.8234807252883911, + "num_tokens": 9794902.0, + "step": 1093 + }, + { + "epoch": 0.831306990881459, + "grad_norm": 2.09987735748291, + "learning_rate": 4.329574120072728e-06, + "loss": 0.4270891547203064, + "mean_token_accuracy": 0.8544977903366089, + "num_tokens": 9800903.0, + "step": 1094 + }, + { + "epoch": 0.8320668693009119, + "grad_norm": 1.969549536705017, + "learning_rate": 4.328146173888528e-06, + "loss": 0.45801427960395813, + "mean_token_accuracy": 0.8334714770317078, + "num_tokens": 9808719.0, + "step": 1095 + }, + { + "epoch": 0.8328267477203647, + "grad_norm": 1.4565571546554565, + "learning_rate": 4.32671694464097e-06, + "loss": 0.34864288568496704, + "mean_token_accuracy": 0.8689061999320984, + "num_tokens": 9818262.0, + "step": 1096 + }, + { + "epoch": 0.8335866261398176, + "grad_norm": 1.2163832187652588, + "learning_rate": 4.3252864333331424e-06, + "loss": 0.37953704595565796, + "mean_token_accuracy": 0.866554856300354, + "num_tokens": 9833942.0, + "step": 1097 + }, + { + "epoch": 0.8343465045592705, + "grad_norm": 1.6112010478973389, + "learning_rate": 4.323854640969033e-06, + "loss": 0.5442801713943481, + "mean_token_accuracy": 0.8190416097640991, + "num_tokens": 9844765.0, + "step": 1098 + }, + { + "epoch": 0.8351063829787234, + "grad_norm": 1.8190315961837769, + "learning_rate": 4.322421568553529e-06, + "loss": 0.48271381855010986, + "mean_token_accuracy": 0.8203652501106262, + "num_tokens": 9852625.0, + "step": 1099 + }, + { + "epoch": 0.8358662613981763, + "grad_norm": 2.7897756099700928, + "learning_rate": 4.320987217092416e-06, + "loss": 0.4086323380470276, + "mean_token_accuracy": 0.8504934310913086, + "num_tokens": 9856888.0, + "step": 1100 + }, + { + "epoch": 0.8366261398176292, + "grad_norm": 1.7035977840423584, + "learning_rate": 4.319551587592377e-06, + "loss": 0.6325064301490784, + "mean_token_accuracy": 0.788190484046936, + "num_tokens": 9869419.0, + "step": 1101 + }, + { + "epoch": 0.837386018237082, + "grad_norm": 2.609731912612915, + "learning_rate": 4.318114681060989e-06, + "loss": 0.519314706325531, + "mean_token_accuracy": 0.8469992280006409, + "num_tokens": 9874553.0, + "step": 1102 + }, + { + "epoch": 0.8381458966565349, + "grad_norm": 1.2519766092300415, + "learning_rate": 4.316676498506735e-06, + "loss": 0.3566005825996399, + "mean_token_accuracy": 0.8588439226150513, + "num_tokens": 9886498.0, + "step": 1103 + }, + { + "epoch": 0.8389057750759878, + "grad_norm": 1.430892825126648, + "learning_rate": 4.3152370409389795e-06, + "loss": 0.5250182747840881, + "mean_token_accuracy": 0.8164948225021362, + "num_tokens": 9900256.0, + "step": 1104 + }, + { + "epoch": 0.8396656534954408, + "grad_norm": 3.1245436668395996, + "learning_rate": 4.3137963093679945e-06, + "loss": 0.3173971176147461, + "mean_token_accuracy": 0.8835347890853882, + "num_tokens": 9903899.0, + "step": 1105 + }, + { + "epoch": 0.8404255319148937, + "grad_norm": 3.131812572479248, + "learning_rate": 4.3123543048049395e-06, + "loss": 0.6567763090133667, + "mean_token_accuracy": 0.8233605027198792, + "num_tokens": 9908798.0, + "step": 1106 + }, + { + "epoch": 0.8411854103343465, + "grad_norm": 1.3551725149154663, + "learning_rate": 4.310911028261867e-06, + "loss": 0.3993729054927826, + "mean_token_accuracy": 0.8529655933380127, + "num_tokens": 9922577.0, + "step": 1107 + }, + { + "epoch": 0.8419452887537994, + "grad_norm": 2.572533130645752, + "learning_rate": 4.309466480751726e-06, + "loss": 0.40906503796577454, + "mean_token_accuracy": 0.8630726933479309, + "num_tokens": 9926890.0, + "step": 1108 + }, + { + "epoch": 0.8427051671732523, + "grad_norm": 1.9146469831466675, + "learning_rate": 4.308020663288356e-06, + "loss": 0.48423194885253906, + "mean_token_accuracy": 0.8370280861854553, + "num_tokens": 9934293.0, + "step": 1109 + }, + { + "epoch": 0.8434650455927052, + "grad_norm": 1.6178001165390015, + "learning_rate": 4.306573576886485e-06, + "loss": 0.4262213408946991, + "mean_token_accuracy": 0.839401125907898, + "num_tokens": 9944513.0, + "step": 1110 + }, + { + "epoch": 0.8442249240121581, + "grad_norm": 2.4444572925567627, + "learning_rate": 4.305125222561736e-06, + "loss": 0.5199950933456421, + "mean_token_accuracy": 0.8507720232009888, + "num_tokens": 9949512.0, + "step": 1111 + }, + { + "epoch": 0.8449848024316109, + "grad_norm": 1.7983134984970093, + "learning_rate": 4.303675601330618e-06, + "loss": 0.36155956983566284, + "mean_token_accuracy": 0.8568712472915649, + "num_tokens": 9956402.0, + "step": 1112 + }, + { + "epoch": 0.8457446808510638, + "grad_norm": 2.391096353530884, + "learning_rate": 4.302224714210532e-06, + "loss": 0.5391949415206909, + "mean_token_accuracy": 0.8183057308197021, + "num_tokens": 9961606.0, + "step": 1113 + }, + { + "epoch": 0.8465045592705167, + "grad_norm": 1.8520214557647705, + "learning_rate": 4.3007725622197675e-06, + "loss": 0.5758882761001587, + "mean_token_accuracy": 0.7924330234527588, + "num_tokens": 9971473.0, + "step": 1114 + }, + { + "epoch": 0.8472644376899696, + "grad_norm": 2.436640739440918, + "learning_rate": 4.2993191463775e-06, + "loss": 0.3837985396385193, + "mean_token_accuracy": 0.8620110750198364, + "num_tokens": 9976333.0, + "step": 1115 + }, + { + "epoch": 0.8480243161094225, + "grad_norm": 1.7287120819091797, + "learning_rate": 4.29786446770379e-06, + "loss": 0.40066856145858765, + "mean_token_accuracy": 0.8618333339691162, + "num_tokens": 9985617.0, + "step": 1116 + }, + { + "epoch": 0.8487841945288754, + "grad_norm": 2.0310518741607666, + "learning_rate": 4.296408527219592e-06, + "loss": 0.5465943217277527, + "mean_token_accuracy": 0.812044620513916, + "num_tokens": 9995363.0, + "step": 1117 + }, + { + "epoch": 0.8495440729483282, + "grad_norm": 1.4858589172363281, + "learning_rate": 4.294951325946737e-06, + "loss": 0.45840176939964294, + "mean_token_accuracy": 0.8432979583740234, + "num_tokens": 10006400.0, + "step": 1118 + }, + { + "epoch": 0.8503039513677811, + "grad_norm": 1.6153514385223389, + "learning_rate": 4.293492864907947e-06, + "loss": 0.5225611925125122, + "mean_token_accuracy": 0.8180211186408997, + "num_tokens": 10018352.0, + "step": 1119 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 2.1178412437438965, + "learning_rate": 4.2920331451268246e-06, + "loss": 0.5580621361732483, + "mean_token_accuracy": 0.8211709260940552, + "num_tokens": 10025614.0, + "step": 1120 + }, + { + "epoch": 0.851823708206687, + "grad_norm": 2.036839246749878, + "learning_rate": 4.2905721676278585e-06, + "loss": 0.4658433198928833, + "mean_token_accuracy": 0.8380423784255981, + "num_tokens": 10032489.0, + "step": 1121 + }, + { + "epoch": 0.8525835866261399, + "grad_norm": 2.0056262016296387, + "learning_rate": 4.28910993343642e-06, + "loss": 0.47023308277130127, + "mean_token_accuracy": 0.8340359926223755, + "num_tokens": 10040050.0, + "step": 1122 + }, + { + "epoch": 0.8533434650455927, + "grad_norm": 2.540024518966675, + "learning_rate": 4.2876464435787576e-06, + "loss": 0.502303957939148, + "mean_token_accuracy": 0.8288739919662476, + "num_tokens": 10045042.0, + "step": 1123 + }, + { + "epoch": 0.8541033434650456, + "grad_norm": 1.7894693613052368, + "learning_rate": 4.286181699082008e-06, + "loss": 0.4732973575592041, + "mean_token_accuracy": 0.8340568542480469, + "num_tokens": 10054424.0, + "step": 1124 + }, + { + "epoch": 0.8548632218844985, + "grad_norm": 1.5601223707199097, + "learning_rate": 4.284715700974186e-06, + "loss": 0.472471684217453, + "mean_token_accuracy": 0.8274722695350647, + "num_tokens": 10065523.0, + "step": 1125 + }, + { + "epoch": 0.8556231003039514, + "grad_norm": 1.7326055765151978, + "learning_rate": 4.283248450284182e-06, + "loss": 0.5924872159957886, + "mean_token_accuracy": 0.7943467497825623, + "num_tokens": 10076839.0, + "step": 1126 + }, + { + "epoch": 0.8563829787234043, + "grad_norm": 1.5165479183197021, + "learning_rate": 4.281779948041772e-06, + "loss": 0.44768425822257996, + "mean_token_accuracy": 0.8394696712493896, + "num_tokens": 10088168.0, + "step": 1127 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 1.5448920726776123, + "learning_rate": 4.280310195277606e-06, + "loss": 0.4458175003528595, + "mean_token_accuracy": 0.835773229598999, + "num_tokens": 10100306.0, + "step": 1128 + }, + { + "epoch": 0.85790273556231, + "grad_norm": 1.6311609745025635, + "learning_rate": 4.278839193023214e-06, + "loss": 0.4158072769641876, + "mean_token_accuracy": 0.8482539653778076, + "num_tokens": 10110581.0, + "step": 1129 + }, + { + "epoch": 0.8586626139817629, + "grad_norm": 1.6714754104614258, + "learning_rate": 4.277366942311001e-06, + "loss": 0.3686875104904175, + "mean_token_accuracy": 0.8681533336639404, + "num_tokens": 10118799.0, + "step": 1130 + }, + { + "epoch": 0.8594224924012158, + "grad_norm": 2.1604413986206055, + "learning_rate": 4.2758934441742494e-06, + "loss": 0.37267982959747314, + "mean_token_accuracy": 0.8520427346229553, + "num_tokens": 10124734.0, + "step": 1131 + }, + { + "epoch": 0.8601823708206687, + "grad_norm": 2.123013973236084, + "learning_rate": 4.274418699647117e-06, + "loss": 0.49963313341140747, + "mean_token_accuracy": 0.8248758912086487, + "num_tokens": 10131965.0, + "step": 1132 + }, + { + "epoch": 0.8609422492401215, + "grad_norm": 1.4308786392211914, + "learning_rate": 4.272942709764638e-06, + "loss": 0.48666873574256897, + "mean_token_accuracy": 0.8304717540740967, + "num_tokens": 10145164.0, + "step": 1133 + }, + { + "epoch": 0.8617021276595744, + "grad_norm": 1.7952618598937988, + "learning_rate": 4.271465475562716e-06, + "loss": 0.5536223649978638, + "mean_token_accuracy": 0.8093959093093872, + "num_tokens": 10154083.0, + "step": 1134 + }, + { + "epoch": 0.8624620060790273, + "grad_norm": 2.0622456073760986, + "learning_rate": 4.269986998078132e-06, + "loss": 0.5173629522323608, + "mean_token_accuracy": 0.8285619020462036, + "num_tokens": 10161889.0, + "step": 1135 + }, + { + "epoch": 0.8632218844984803, + "grad_norm": 2.0707509517669678, + "learning_rate": 4.268507278348539e-06, + "loss": 0.5871608257293701, + "mean_token_accuracy": 0.7827386856079102, + "num_tokens": 10170726.0, + "step": 1136 + }, + { + "epoch": 0.8639817629179332, + "grad_norm": 2.054368257522583, + "learning_rate": 4.2670263174124615e-06, + "loss": 0.5788969993591309, + "mean_token_accuracy": 0.7967237234115601, + "num_tokens": 10178474.0, + "step": 1137 + }, + { + "epoch": 0.8647416413373861, + "grad_norm": 1.901846170425415, + "learning_rate": 4.265544116309294e-06, + "loss": 0.5405587553977966, + "mean_token_accuracy": 0.8151819705963135, + "num_tokens": 10187013.0, + "step": 1138 + }, + { + "epoch": 0.8655015197568389, + "grad_norm": 2.901285409927368, + "learning_rate": 4.264060676079302e-06, + "loss": 0.44101861119270325, + "mean_token_accuracy": 0.8433429002761841, + "num_tokens": 10191517.0, + "step": 1139 + }, + { + "epoch": 0.8662613981762918, + "grad_norm": 2.4168388843536377, + "learning_rate": 4.262575997763622e-06, + "loss": 0.4686204195022583, + "mean_token_accuracy": 0.8505309820175171, + "num_tokens": 10196948.0, + "step": 1140 + }, + { + "epoch": 0.8670212765957447, + "grad_norm": 1.9588396549224854, + "learning_rate": 4.2610900824042575e-06, + "loss": 0.47056013345718384, + "mean_token_accuracy": 0.8280024528503418, + "num_tokens": 10204292.0, + "step": 1141 + }, + { + "epoch": 0.8677811550151976, + "grad_norm": 2.569150924682617, + "learning_rate": 4.2596029310440826e-06, + "loss": 0.573108434677124, + "mean_token_accuracy": 0.8108246326446533, + "num_tokens": 10209571.0, + "step": 1142 + }, + { + "epoch": 0.8685410334346505, + "grad_norm": 2.038032293319702, + "learning_rate": 4.258114544726835e-06, + "loss": 0.40545332431793213, + "mean_token_accuracy": 0.8611703515052795, + "num_tokens": 10215716.0, + "step": 1143 + }, + { + "epoch": 0.8693009118541033, + "grad_norm": 1.9884231090545654, + "learning_rate": 4.256624924497124e-06, + "loss": 0.40085992217063904, + "mean_token_accuracy": 0.8615031242370605, + "num_tokens": 10222775.0, + "step": 1144 + }, + { + "epoch": 0.8700607902735562, + "grad_norm": 1.912842035293579, + "learning_rate": 4.25513407140042e-06, + "loss": 0.41022324562072754, + "mean_token_accuracy": 0.8459607362747192, + "num_tokens": 10229589.0, + "step": 1145 + }, + { + "epoch": 0.8708206686930091, + "grad_norm": 1.9190576076507568, + "learning_rate": 4.253641986483063e-06, + "loss": 0.5541447401046753, + "mean_token_accuracy": 0.8256468772888184, + "num_tokens": 10240633.0, + "step": 1146 + }, + { + "epoch": 0.871580547112462, + "grad_norm": 1.3742294311523438, + "learning_rate": 4.2521486707922545e-06, + "loss": 0.3680543899536133, + "mean_token_accuracy": 0.8654477596282959, + "num_tokens": 10251252.0, + "step": 1147 + }, + { + "epoch": 0.8723404255319149, + "grad_norm": 1.4438525438308716, + "learning_rate": 4.250654125376062e-06, + "loss": 0.45830875635147095, + "mean_token_accuracy": 0.8433834314346313, + "num_tokens": 10263980.0, + "step": 1148 + }, + { + "epoch": 0.8731003039513677, + "grad_norm": 2.1273653507232666, + "learning_rate": 4.249158351283414e-06, + "loss": 0.4129376709461212, + "mean_token_accuracy": 0.861556351184845, + "num_tokens": 10270426.0, + "step": 1149 + }, + { + "epoch": 0.8738601823708206, + "grad_norm": 2.598440647125244, + "learning_rate": 4.247661349564103e-06, + "loss": 0.418030709028244, + "mean_token_accuracy": 0.86553955078125, + "num_tokens": 10275493.0, + "step": 1150 + }, + { + "epoch": 0.8746200607902735, + "grad_norm": 1.6852490901947021, + "learning_rate": 4.246163121268782e-06, + "loss": 0.6403408050537109, + "mean_token_accuracy": 0.7966094017028809, + "num_tokens": 10287989.0, + "step": 1151 + }, + { + "epoch": 0.8753799392097265, + "grad_norm": 2.5013794898986816, + "learning_rate": 4.244663667448965e-06, + "loss": 0.49922505021095276, + "mean_token_accuracy": 0.8318735361099243, + "num_tokens": 10293360.0, + "step": 1152 + }, + { + "epoch": 0.8761398176291794, + "grad_norm": 1.2022709846496582, + "learning_rate": 4.243162989157027e-06, + "loss": 0.4414965510368347, + "mean_token_accuracy": 0.8338693380355835, + "num_tokens": 10310558.0, + "step": 1153 + }, + { + "epoch": 0.8768996960486323, + "grad_norm": 1.9903281927108765, + "learning_rate": 4.241661087446202e-06, + "loss": 0.4277610778808594, + "mean_token_accuracy": 0.8560749292373657, + "num_tokens": 10316983.0, + "step": 1154 + }, + { + "epoch": 0.8776595744680851, + "grad_norm": 2.104923725128174, + "learning_rate": 4.240157963370583e-06, + "loss": 0.44431713223457336, + "mean_token_accuracy": 0.8785282969474792, + "num_tokens": 10323294.0, + "step": 1155 + }, + { + "epoch": 0.878419452887538, + "grad_norm": 2.8364813327789307, + "learning_rate": 4.2386536179851175e-06, + "loss": 0.49948397278785706, + "mean_token_accuracy": 0.8305255174636841, + "num_tokens": 10327662.0, + "step": 1156 + }, + { + "epoch": 0.8791793313069909, + "grad_norm": 1.9493682384490967, + "learning_rate": 4.2371480523456156e-06, + "loss": 0.45867404341697693, + "mean_token_accuracy": 0.8373264074325562, + "num_tokens": 10335699.0, + "step": 1157 + }, + { + "epoch": 0.8799392097264438, + "grad_norm": 2.268616199493408, + "learning_rate": 4.235641267508741e-06, + "loss": 0.4547857940196991, + "mean_token_accuracy": 0.8252766132354736, + "num_tokens": 10342464.0, + "step": 1158 + }, + { + "epoch": 0.8806990881458967, + "grad_norm": 2.1334283351898193, + "learning_rate": 4.234133264532012e-06, + "loss": 0.39503124356269836, + "mean_token_accuracy": 0.8648351430892944, + "num_tokens": 10347514.0, + "step": 1159 + }, + { + "epoch": 0.8814589665653495, + "grad_norm": 1.2775357961654663, + "learning_rate": 4.232624044473805e-06, + "loss": 0.39945733547210693, + "mean_token_accuracy": 0.8369829654693604, + "num_tokens": 10363316.0, + "step": 1160 + }, + { + "epoch": 0.8822188449848024, + "grad_norm": 2.458413600921631, + "learning_rate": 4.231113608393348e-06, + "loss": 0.5020045638084412, + "mean_token_accuracy": 0.8295938968658447, + "num_tokens": 10368401.0, + "step": 1161 + }, + { + "epoch": 0.8829787234042553, + "grad_norm": 1.7464948892593384, + "learning_rate": 4.229601957350722e-06, + "loss": 0.5335392951965332, + "mean_token_accuracy": 0.8134858012199402, + "num_tokens": 10378337.0, + "step": 1162 + }, + { + "epoch": 0.8837386018237082, + "grad_norm": 3.1152119636535645, + "learning_rate": 4.228089092406863e-06, + "loss": 0.4811682105064392, + "mean_token_accuracy": 0.8460187315940857, + "num_tokens": 10382362.0, + "step": 1163 + }, + { + "epoch": 0.8844984802431611, + "grad_norm": 2.190847158432007, + "learning_rate": 4.226575014623557e-06, + "loss": 0.4428049921989441, + "mean_token_accuracy": 0.8382467031478882, + "num_tokens": 10388211.0, + "step": 1164 + }, + { + "epoch": 0.8852583586626139, + "grad_norm": 1.860153079032898, + "learning_rate": 4.225059725063444e-06, + "loss": 0.5265918970108032, + "mean_token_accuracy": 0.8181334733963013, + "num_tokens": 10398873.0, + "step": 1165 + }, + { + "epoch": 0.8860182370820668, + "grad_norm": 1.3372713327407837, + "learning_rate": 4.22354322479001e-06, + "loss": 0.43202850222587585, + "mean_token_accuracy": 0.8432420492172241, + "num_tokens": 10413158.0, + "step": 1166 + }, + { + "epoch": 0.8867781155015197, + "grad_norm": 1.3653379678726196, + "learning_rate": 4.222025514867596e-06, + "loss": 0.43780991435050964, + "mean_token_accuracy": 0.8441485166549683, + "num_tokens": 10428137.0, + "step": 1167 + }, + { + "epoch": 0.8875379939209727, + "grad_norm": 3.0230672359466553, + "learning_rate": 4.220506596361387e-06, + "loss": 0.6039337515830994, + "mean_token_accuracy": 0.8274872303009033, + "num_tokens": 10432586.0, + "step": 1168 + }, + { + "epoch": 0.8882978723404256, + "grad_norm": 2.2180392742156982, + "learning_rate": 4.218986470337419e-06, + "loss": 0.5453792810440063, + "mean_token_accuracy": 0.8127184510231018, + "num_tokens": 10439471.0, + "step": 1169 + }, + { + "epoch": 0.8890577507598785, + "grad_norm": 1.8519103527069092, + "learning_rate": 4.217465137862575e-06, + "loss": 0.5145469903945923, + "mean_token_accuracy": 0.8178654909133911, + "num_tokens": 10450471.0, + "step": 1170 + }, + { + "epoch": 0.8898176291793313, + "grad_norm": 2.034008026123047, + "learning_rate": 4.215942600004586e-06, + "loss": 0.44061461091041565, + "mean_token_accuracy": 0.8572084307670593, + "num_tokens": 10457382.0, + "step": 1171 + }, + { + "epoch": 0.8905775075987842, + "grad_norm": 3.4304304122924805, + "learning_rate": 4.214418857832025e-06, + "loss": 0.44397830963134766, + "mean_token_accuracy": 0.842149019241333, + "num_tokens": 10460650.0, + "step": 1172 + }, + { + "epoch": 0.8913373860182371, + "grad_norm": 1.9021750688552856, + "learning_rate": 4.212893912414316e-06, + "loss": 0.3769867420196533, + "mean_token_accuracy": 0.8806171417236328, + "num_tokens": 10468214.0, + "step": 1173 + }, + { + "epoch": 0.89209726443769, + "grad_norm": 1.9704062938690186, + "learning_rate": 4.211367764821722e-06, + "loss": 0.5501819849014282, + "mean_token_accuracy": 0.8176811337471008, + "num_tokens": 10476739.0, + "step": 1174 + }, + { + "epoch": 0.8928571428571429, + "grad_norm": 1.4350415468215942, + "learning_rate": 4.209840416125353e-06, + "loss": 0.41897401213645935, + "mean_token_accuracy": 0.8498011827468872, + "num_tokens": 10491769.0, + "step": 1175 + }, + { + "epoch": 0.8936170212765957, + "grad_norm": 3.8237783908843994, + "learning_rate": 4.208311867397162e-06, + "loss": 0.5296977162361145, + "mean_token_accuracy": 0.8168715834617615, + "num_tokens": 10494958.0, + "step": 1176 + }, + { + "epoch": 0.8943768996960486, + "grad_norm": 2.04784893989563, + "learning_rate": 4.206782119709942e-06, + "loss": 0.476105272769928, + "mean_token_accuracy": 0.834011435508728, + "num_tokens": 10502077.0, + "step": 1177 + }, + { + "epoch": 0.8951367781155015, + "grad_norm": 1.8839610815048218, + "learning_rate": 4.205251174137329e-06, + "loss": 0.49628815054893494, + "mean_token_accuracy": 0.8212119936943054, + "num_tokens": 10510077.0, + "step": 1178 + }, + { + "epoch": 0.8958966565349544, + "grad_norm": 1.2100634574890137, + "learning_rate": 4.2037190317538e-06, + "loss": 0.4931519329547882, + "mean_token_accuracy": 0.8170043230056763, + "num_tokens": 10528373.0, + "step": 1179 + }, + { + "epoch": 0.8966565349544073, + "grad_norm": 1.884637713432312, + "learning_rate": 4.202185693634671e-06, + "loss": 0.4913347363471985, + "mean_token_accuracy": 0.8234949707984924, + "num_tokens": 10537108.0, + "step": 1180 + }, + { + "epoch": 0.8974164133738601, + "grad_norm": 1.5062434673309326, + "learning_rate": 4.200651160856099e-06, + "loss": 0.4160492420196533, + "mean_token_accuracy": 0.845937192440033, + "num_tokens": 10547577.0, + "step": 1181 + }, + { + "epoch": 0.898176291793313, + "grad_norm": 2.331169605255127, + "learning_rate": 4.1991154344950755e-06, + "loss": 0.6532632112503052, + "mean_token_accuracy": 0.7743191123008728, + "num_tokens": 10556328.0, + "step": 1182 + }, + { + "epoch": 0.898936170212766, + "grad_norm": 1.3538362979888916, + "learning_rate": 4.197578515629435e-06, + "loss": 0.4437566101551056, + "mean_token_accuracy": 0.8427901268005371, + "num_tokens": 10570026.0, + "step": 1183 + }, + { + "epoch": 0.8996960486322189, + "grad_norm": 2.3828957080841064, + "learning_rate": 4.196040405337846e-06, + "loss": 0.6185290217399597, + "mean_token_accuracy": 0.7969824075698853, + "num_tokens": 10576465.0, + "step": 1184 + }, + { + "epoch": 0.9004559270516718, + "grad_norm": 2.4759042263031006, + "learning_rate": 4.194501104699813e-06, + "loss": 0.46489226818084717, + "mean_token_accuracy": 0.8472316265106201, + "num_tokens": 10582034.0, + "step": 1185 + }, + { + "epoch": 0.9012158054711246, + "grad_norm": 1.9215164184570312, + "learning_rate": 4.192960614795676e-06, + "loss": 0.48001551628112793, + "mean_token_accuracy": 0.8371596336364746, + "num_tokens": 10590556.0, + "step": 1186 + }, + { + "epoch": 0.9019756838905775, + "grad_norm": 2.2717080116271973, + "learning_rate": 4.19141893670661e-06, + "loss": 0.40083563327789307, + "mean_token_accuracy": 0.8464195728302002, + "num_tokens": 10595661.0, + "step": 1187 + }, + { + "epoch": 0.9027355623100304, + "grad_norm": 2.187122344970703, + "learning_rate": 4.189876071514624e-06, + "loss": 0.4942901134490967, + "mean_token_accuracy": 0.8186990022659302, + "num_tokens": 10603366.0, + "step": 1188 + }, + { + "epoch": 0.9034954407294833, + "grad_norm": 1.542414665222168, + "learning_rate": 4.188332020302561e-06, + "loss": 0.4731982946395874, + "mean_token_accuracy": 0.8487229347229004, + "num_tokens": 10616203.0, + "step": 1189 + }, + { + "epoch": 0.9042553191489362, + "grad_norm": 0.9957579970359802, + "learning_rate": 4.186786784154096e-06, + "loss": 0.33211836218833923, + "mean_token_accuracy": 0.870644748210907, + "num_tokens": 10633294.0, + "step": 1190 + }, + { + "epoch": 0.9050151975683891, + "grad_norm": 2.593867540359497, + "learning_rate": 4.1852403641537344e-06, + "loss": 0.6825464963912964, + "mean_token_accuracy": 0.7716869115829468, + "num_tokens": 10640615.0, + "step": 1191 + }, + { + "epoch": 0.9057750759878419, + "grad_norm": 2.0424516201019287, + "learning_rate": 4.183692761386813e-06, + "loss": 0.5672709941864014, + "mean_token_accuracy": 0.7973801493644714, + "num_tokens": 10649845.0, + "step": 1192 + }, + { + "epoch": 0.9065349544072948, + "grad_norm": 1.429018259048462, + "learning_rate": 4.1821439769395e-06, + "loss": 0.5427846908569336, + "mean_token_accuracy": 0.8200292587280273, + "num_tokens": 10665898.0, + "step": 1193 + }, + { + "epoch": 0.9072948328267477, + "grad_norm": 1.9764264822006226, + "learning_rate": 4.180594011898791e-06, + "loss": 0.4784567356109619, + "mean_token_accuracy": 0.82924485206604, + "num_tokens": 10673595.0, + "step": 1194 + }, + { + "epoch": 0.9080547112462006, + "grad_norm": 1.4004309177398682, + "learning_rate": 4.1790428673525104e-06, + "loss": 0.4791432023048401, + "mean_token_accuracy": 0.8334879875183105, + "num_tokens": 10687892.0, + "step": 1195 + }, + { + "epoch": 0.9088145896656535, + "grad_norm": 2.2207727432250977, + "learning_rate": 4.177490544389313e-06, + "loss": 0.5089365243911743, + "mean_token_accuracy": 0.8270776271820068, + "num_tokens": 10694911.0, + "step": 1196 + }, + { + "epoch": 0.9095744680851063, + "grad_norm": 2.2890450954437256, + "learning_rate": 4.175937044098678e-06, + "loss": 0.5152267813682556, + "mean_token_accuracy": 0.8527299165725708, + "num_tokens": 10700512.0, + "step": 1197 + }, + { + "epoch": 0.9103343465045592, + "grad_norm": 1.7938050031661987, + "learning_rate": 4.1743823675709115e-06, + "loss": 0.3507300615310669, + "mean_token_accuracy": 0.8694599866867065, + "num_tokens": 10707953.0, + "step": 1198 + }, + { + "epoch": 0.9110942249240122, + "grad_norm": 1.4368808269500732, + "learning_rate": 4.172826515897146e-06, + "loss": 0.407418429851532, + "mean_token_accuracy": 0.8432893753051758, + "num_tokens": 10717485.0, + "step": 1199 + }, + { + "epoch": 0.9118541033434651, + "grad_norm": 1.735339879989624, + "learning_rate": 4.171269490169337e-06, + "loss": 0.46996885538101196, + "mean_token_accuracy": 0.8331948518753052, + "num_tokens": 10726160.0, + "step": 1200 + }, + { + "epoch": 0.912613981762918, + "grad_norm": 1.7859221696853638, + "learning_rate": 4.1697112914802665e-06, + "loss": 0.5325199365615845, + "mean_token_accuracy": 0.8179605007171631, + "num_tokens": 10736284.0, + "step": 1201 + }, + { + "epoch": 0.9133738601823708, + "grad_norm": 2.6394896507263184, + "learning_rate": 4.168151920923536e-06, + "loss": 0.4039744734764099, + "mean_token_accuracy": 0.8545527458190918, + "num_tokens": 10740673.0, + "step": 1202 + }, + { + "epoch": 0.9141337386018237, + "grad_norm": 1.910988211631775, + "learning_rate": 4.1665913795935755e-06, + "loss": 0.5190291404724121, + "mean_token_accuracy": 0.8203921318054199, + "num_tokens": 10751946.0, + "step": 1203 + }, + { + "epoch": 0.9148936170212766, + "grad_norm": 3.0006964206695557, + "learning_rate": 4.16502966858563e-06, + "loss": 0.5856777429580688, + "mean_token_accuracy": 0.8061224222183228, + "num_tokens": 10756795.0, + "step": 1204 + }, + { + "epoch": 0.9156534954407295, + "grad_norm": 1.7396167516708374, + "learning_rate": 4.163466788995768e-06, + "loss": 0.54935222864151, + "mean_token_accuracy": 0.8052443265914917, + "num_tokens": 10767202.0, + "step": 1205 + }, + { + "epoch": 0.9164133738601824, + "grad_norm": 2.143735885620117, + "learning_rate": 4.161902741920881e-06, + "loss": 0.5020298361778259, + "mean_token_accuracy": 0.8249630928039551, + "num_tokens": 10774329.0, + "step": 1206 + }, + { + "epoch": 0.9171732522796353, + "grad_norm": 2.8871893882751465, + "learning_rate": 4.160337528458676e-06, + "loss": 0.5154489278793335, + "mean_token_accuracy": 0.8276848793029785, + "num_tokens": 10778929.0, + "step": 1207 + }, + { + "epoch": 0.9179331306990881, + "grad_norm": 1.4642788171768188, + "learning_rate": 4.15877114970768e-06, + "loss": 0.5033774375915527, + "mean_token_accuracy": 0.8296241164207458, + "num_tokens": 10790928.0, + "step": 1208 + }, + { + "epoch": 0.918693009118541, + "grad_norm": 1.8313497304916382, + "learning_rate": 4.1572036067672386e-06, + "loss": 0.5674909353256226, + "mean_token_accuracy": 0.7975562214851379, + "num_tokens": 10801372.0, + "step": 1209 + }, + { + "epoch": 0.9194528875379939, + "grad_norm": 2.005958080291748, + "learning_rate": 4.155634900737513e-06, + "loss": 0.5557019114494324, + "mean_token_accuracy": 0.8141391277313232, + "num_tokens": 10809150.0, + "step": 1210 + }, + { + "epoch": 0.9202127659574468, + "grad_norm": 2.333519697189331, + "learning_rate": 4.154065032719482e-06, + "loss": 0.6990420818328857, + "mean_token_accuracy": 0.7565394043922424, + "num_tokens": 10816612.0, + "step": 1211 + }, + { + "epoch": 0.9209726443768997, + "grad_norm": 1.4472655057907104, + "learning_rate": 4.152494003814939e-06, + "loss": 0.541398286819458, + "mean_token_accuracy": 0.8027358055114746, + "num_tokens": 10833840.0, + "step": 1212 + }, + { + "epoch": 0.9217325227963525, + "grad_norm": 1.6183619499206543, + "learning_rate": 4.150921815126493e-06, + "loss": 0.6096762418746948, + "mean_token_accuracy": 0.7994354963302612, + "num_tokens": 10846367.0, + "step": 1213 + }, + { + "epoch": 0.9224924012158054, + "grad_norm": 2.614919900894165, + "learning_rate": 4.149348467757566e-06, + "loss": 0.41846764087677, + "mean_token_accuracy": 0.8555068969726562, + "num_tokens": 10850836.0, + "step": 1214 + }, + { + "epoch": 0.9232522796352584, + "grad_norm": 1.4419831037521362, + "learning_rate": 4.147773962812393e-06, + "loss": 0.4139535427093506, + "mean_token_accuracy": 0.845671534538269, + "num_tokens": 10864228.0, + "step": 1215 + }, + { + "epoch": 0.9240121580547113, + "grad_norm": 2.3868865966796875, + "learning_rate": 4.146198301396025e-06, + "loss": 0.3357275128364563, + "mean_token_accuracy": 0.8829520344734192, + "num_tokens": 10868920.0, + "step": 1216 + }, + { + "epoch": 0.9247720364741642, + "grad_norm": 1.7685474157333374, + "learning_rate": 4.14462148461432e-06, + "loss": 0.45333072543144226, + "mean_token_accuracy": 0.8505891561508179, + "num_tokens": 10877286.0, + "step": 1217 + }, + { + "epoch": 0.925531914893617, + "grad_norm": 1.7627625465393066, + "learning_rate": 4.143043513573949e-06, + "loss": 0.5028705596923828, + "mean_token_accuracy": 0.825471043586731, + "num_tokens": 10887047.0, + "step": 1218 + }, + { + "epoch": 0.9262917933130699, + "grad_norm": 1.3168725967407227, + "learning_rate": 4.141464389382392e-06, + "loss": 0.5494637489318848, + "mean_token_accuracy": 0.8121747970581055, + "num_tokens": 10903599.0, + "step": 1219 + }, + { + "epoch": 0.9270516717325228, + "grad_norm": 2.5180399417877197, + "learning_rate": 4.13988411314794e-06, + "loss": 0.6134277582168579, + "mean_token_accuracy": 0.7983006834983826, + "num_tokens": 10909791.0, + "step": 1220 + }, + { + "epoch": 0.9278115501519757, + "grad_norm": 1.1889166831970215, + "learning_rate": 4.13830268597969e-06, + "loss": 0.36713096499443054, + "mean_token_accuracy": 0.8416121006011963, + "num_tokens": 10925794.0, + "step": 1221 + }, + { + "epoch": 0.9285714285714286, + "grad_norm": 2.142422676086426, + "learning_rate": 4.136720108987552e-06, + "loss": 0.4427933096885681, + "mean_token_accuracy": 0.8427745699882507, + "num_tokens": 10931622.0, + "step": 1222 + }, + { + "epoch": 0.9293313069908815, + "grad_norm": 1.908564567565918, + "learning_rate": 4.1351363832822364e-06, + "loss": 0.5088109374046326, + "mean_token_accuracy": 0.8309272527694702, + "num_tokens": 10940843.0, + "step": 1223 + }, + { + "epoch": 0.9300911854103343, + "grad_norm": 1.2862322330474854, + "learning_rate": 4.133551509975264e-06, + "loss": 0.3963761329650879, + "mean_token_accuracy": 0.8602159023284912, + "num_tokens": 10954481.0, + "step": 1224 + }, + { + "epoch": 0.9308510638297872, + "grad_norm": 1.5876200199127197, + "learning_rate": 4.13196549017896e-06, + "loss": 0.4311184287071228, + "mean_token_accuracy": 0.8460899591445923, + "num_tokens": 10963501.0, + "step": 1225 + }, + { + "epoch": 0.9316109422492401, + "grad_norm": 2.459878444671631, + "learning_rate": 4.130378325006453e-06, + "loss": 0.5016295313835144, + "mean_token_accuracy": 0.8125218152999878, + "num_tokens": 10968850.0, + "step": 1226 + }, + { + "epoch": 0.932370820668693, + "grad_norm": 2.059718370437622, + "learning_rate": 4.128790015571679e-06, + "loss": 0.48982277512550354, + "mean_token_accuracy": 0.8327049016952515, + "num_tokens": 10976642.0, + "step": 1227 + }, + { + "epoch": 0.9331306990881459, + "grad_norm": 1.3719185590744019, + "learning_rate": 4.127200562989372e-06, + "loss": 0.38778752088546753, + "mean_token_accuracy": 0.8623501062393188, + "num_tokens": 10988703.0, + "step": 1228 + }, + { + "epoch": 0.9338905775075987, + "grad_norm": 1.302140712738037, + "learning_rate": 4.125609968375073e-06, + "loss": 0.4887842535972595, + "mean_token_accuracy": 0.8322232961654663, + "num_tokens": 11005981.0, + "step": 1229 + }, + { + "epoch": 0.9346504559270516, + "grad_norm": 1.819624423980713, + "learning_rate": 4.12401823284512e-06, + "loss": 0.49825209379196167, + "mean_token_accuracy": 0.8278916478157043, + "num_tokens": 11014145.0, + "step": 1230 + }, + { + "epoch": 0.9354103343465046, + "grad_norm": 1.2762807607650757, + "learning_rate": 4.122425357516658e-06, + "loss": 0.433994323015213, + "mean_token_accuracy": 0.853028416633606, + "num_tokens": 11029232.0, + "step": 1231 + }, + { + "epoch": 0.9361702127659575, + "grad_norm": 2.2171671390533447, + "learning_rate": 4.1208313435076255e-06, + "loss": 0.38436949253082275, + "mean_token_accuracy": 0.8616260290145874, + "num_tokens": 11034743.0, + "step": 1232 + }, + { + "epoch": 0.9369300911854104, + "grad_norm": 1.355879545211792, + "learning_rate": 4.119236191936764e-06, + "loss": 0.5378084182739258, + "mean_token_accuracy": 0.8256701231002808, + "num_tokens": 11048149.0, + "step": 1233 + }, + { + "epoch": 0.9376899696048632, + "grad_norm": 2.66812801361084, + "learning_rate": 4.117639903923611e-06, + "loss": 0.5236451625823975, + "mean_token_accuracy": 0.8431973457336426, + "num_tokens": 11052295.0, + "step": 1234 + }, + { + "epoch": 0.9384498480243161, + "grad_norm": 1.5740545988082886, + "learning_rate": 4.116042480588505e-06, + "loss": 0.44322824478149414, + "mean_token_accuracy": 0.8436908721923828, + "num_tokens": 11062066.0, + "step": 1235 + }, + { + "epoch": 0.939209726443769, + "grad_norm": 1.230706810951233, + "learning_rate": 4.114443923052577e-06, + "loss": 0.3325323462486267, + "mean_token_accuracy": 0.8674666881561279, + "num_tokens": 11074300.0, + "step": 1236 + }, + { + "epoch": 0.9399696048632219, + "grad_norm": 1.9870070219039917, + "learning_rate": 4.112844232437757e-06, + "loss": 0.5711548328399658, + "mean_token_accuracy": 0.8081738948822021, + "num_tokens": 11082297.0, + "step": 1237 + }, + { + "epoch": 0.9407294832826748, + "grad_norm": 1.3020970821380615, + "learning_rate": 4.11124340986677e-06, + "loss": 0.4187922477722168, + "mean_token_accuracy": 0.8566171526908875, + "num_tokens": 11096810.0, + "step": 1238 + }, + { + "epoch": 0.9414893617021277, + "grad_norm": 2.1399197578430176, + "learning_rate": 4.109641456463135e-06, + "loss": 0.5293116569519043, + "mean_token_accuracy": 0.8176157474517822, + "num_tokens": 11102761.0, + "step": 1239 + }, + { + "epoch": 0.9422492401215805, + "grad_norm": 1.3503763675689697, + "learning_rate": 4.108038373351163e-06, + "loss": 0.4907652735710144, + "mean_token_accuracy": 0.8204987049102783, + "num_tokens": 11118480.0, + "step": 1240 + }, + { + "epoch": 0.9430091185410334, + "grad_norm": 1.9571399688720703, + "learning_rate": 4.106434161655962e-06, + "loss": 0.4709656536579132, + "mean_token_accuracy": 0.8371885418891907, + "num_tokens": 11126265.0, + "step": 1241 + }, + { + "epoch": 0.9437689969604863, + "grad_norm": 2.1277313232421875, + "learning_rate": 4.104828822503427e-06, + "loss": 0.4010283350944519, + "mean_token_accuracy": 0.8586333990097046, + "num_tokens": 11133022.0, + "step": 1242 + }, + { + "epoch": 0.9445288753799392, + "grad_norm": 1.6745036840438843, + "learning_rate": 4.103222357020248e-06, + "loss": 0.562545657157898, + "mean_token_accuracy": 0.8052060604095459, + "num_tokens": 11145255.0, + "step": 1243 + }, + { + "epoch": 0.9452887537993921, + "grad_norm": 2.3616299629211426, + "learning_rate": 4.101614766333904e-06, + "loss": 0.5878340601921082, + "mean_token_accuracy": 0.796745777130127, + "num_tokens": 11152020.0, + "step": 1244 + }, + { + "epoch": 0.9460486322188449, + "grad_norm": 1.6182078123092651, + "learning_rate": 4.100006051572664e-06, + "loss": 0.5357589721679688, + "mean_token_accuracy": 0.8089962005615234, + "num_tokens": 11163112.0, + "step": 1245 + }, + { + "epoch": 0.9468085106382979, + "grad_norm": 1.911770224571228, + "learning_rate": 4.098396213865587e-06, + "loss": 0.49805426597595215, + "mean_token_accuracy": 0.8289647102355957, + "num_tokens": 11171768.0, + "step": 1246 + }, + { + "epoch": 0.9475683890577508, + "grad_norm": 1.649155616760254, + "learning_rate": 4.096785254342518e-06, + "loss": 0.5756166577339172, + "mean_token_accuracy": 0.807680606842041, + "num_tokens": 11183527.0, + "step": 1247 + }, + { + "epoch": 0.9483282674772037, + "grad_norm": 1.8922761678695679, + "learning_rate": 4.095173174134091e-06, + "loss": 0.44688963890075684, + "mean_token_accuracy": 0.8375608921051025, + "num_tokens": 11191494.0, + "step": 1248 + }, + { + "epoch": 0.9490881458966566, + "grad_norm": 2.9044547080993652, + "learning_rate": 4.093559974371725e-06, + "loss": 0.48609739542007446, + "mean_token_accuracy": 0.8404892086982727, + "num_tokens": 11195837.0, + "step": 1249 + }, + { + "epoch": 0.9498480243161094, + "grad_norm": 2.287506580352783, + "learning_rate": 4.091945656187626e-06, + "loss": 0.5260225534439087, + "mean_token_accuracy": 0.8181945085525513, + "num_tokens": 11202174.0, + "step": 1250 + }, + { + "epoch": 0.9506079027355623, + "grad_norm": 1.7908886671066284, + "learning_rate": 4.090330220714785e-06, + "loss": 0.4207724928855896, + "mean_token_accuracy": 0.8616912364959717, + "num_tokens": 11209995.0, + "step": 1251 + }, + { + "epoch": 0.9513677811550152, + "grad_norm": 2.905418634414673, + "learning_rate": 4.0887136690869774e-06, + "loss": 0.4209241271018982, + "mean_token_accuracy": 0.8561323285102844, + "num_tokens": 11213799.0, + "step": 1252 + }, + { + "epoch": 0.9521276595744681, + "grad_norm": 2.814150333404541, + "learning_rate": 4.08709600243876e-06, + "loss": 0.36855608224868774, + "mean_token_accuracy": 0.8764539361000061, + "num_tokens": 11217643.0, + "step": 1253 + }, + { + "epoch": 0.952887537993921, + "grad_norm": 1.9385707378387451, + "learning_rate": 4.0854772219054735e-06, + "loss": 0.531031608581543, + "mean_token_accuracy": 0.80600905418396, + "num_tokens": 11225871.0, + "step": 1254 + }, + { + "epoch": 0.9536474164133738, + "grad_norm": 2.103058099746704, + "learning_rate": 4.083857328623243e-06, + "loss": 0.4576364755630493, + "mean_token_accuracy": 0.8447524905204773, + "num_tokens": 11231829.0, + "step": 1255 + }, + { + "epoch": 0.9544072948328267, + "grad_norm": 1.7518818378448486, + "learning_rate": 4.082236323728969e-06, + "loss": 0.5386767983436584, + "mean_token_accuracy": 0.8055596351623535, + "num_tokens": 11240977.0, + "step": 1256 + }, + { + "epoch": 0.9551671732522796, + "grad_norm": 1.8434966802597046, + "learning_rate": 4.0806142083603365e-06, + "loss": 0.5415925979614258, + "mean_token_accuracy": 0.809962272644043, + "num_tokens": 11249616.0, + "step": 1257 + }, + { + "epoch": 0.9559270516717325, + "grad_norm": 1.7341015338897705, + "learning_rate": 4.078990983655807e-06, + "loss": 0.4621101915836334, + "mean_token_accuracy": 0.8330386877059937, + "num_tokens": 11258616.0, + "step": 1258 + }, + { + "epoch": 0.9566869300911854, + "grad_norm": 1.8589727878570557, + "learning_rate": 4.077366650754624e-06, + "loss": 0.4031238555908203, + "mean_token_accuracy": 0.842434287071228, + "num_tokens": 11266006.0, + "step": 1259 + }, + { + "epoch": 0.9574468085106383, + "grad_norm": 1.657175898551941, + "learning_rate": 4.075741210796806e-06, + "loss": 0.41686388850212097, + "mean_token_accuracy": 0.8443650007247925, + "num_tokens": 11275601.0, + "step": 1260 + }, + { + "epoch": 0.9582066869300911, + "grad_norm": 2.4303717613220215, + "learning_rate": 4.07411466492315e-06, + "loss": 0.4554435610771179, + "mean_token_accuracy": 0.853043794631958, + "num_tokens": 11280650.0, + "step": 1261 + }, + { + "epoch": 0.958966565349544, + "grad_norm": 2.3653745651245117, + "learning_rate": 4.072487014275228e-06, + "loss": 0.4304995536804199, + "mean_token_accuracy": 0.8462260961532593, + "num_tokens": 11285637.0, + "step": 1262 + }, + { + "epoch": 0.959726443768997, + "grad_norm": 1.6689718961715698, + "learning_rate": 4.070858259995388e-06, + "loss": 0.5290807485580444, + "mean_token_accuracy": 0.8176917433738708, + "num_tokens": 11299110.0, + "step": 1263 + }, + { + "epoch": 0.9604863221884499, + "grad_norm": 2.103879451751709, + "learning_rate": 4.069228403226751e-06, + "loss": 0.4620879888534546, + "mean_token_accuracy": 0.835270345211029, + "num_tokens": 11305564.0, + "step": 1264 + }, + { + "epoch": 0.9612462006079028, + "grad_norm": 2.139012575149536, + "learning_rate": 4.067597445113216e-06, + "loss": 0.5143396258354187, + "mean_token_accuracy": 0.8191739320755005, + "num_tokens": 11311870.0, + "step": 1265 + }, + { + "epoch": 0.9620060790273556, + "grad_norm": 1.3971210718154907, + "learning_rate": 4.06596538679945e-06, + "loss": 0.472080260515213, + "mean_token_accuracy": 0.8321092128753662, + "num_tokens": 11323970.0, + "step": 1266 + }, + { + "epoch": 0.9627659574468085, + "grad_norm": 1.4965174198150635, + "learning_rate": 4.064332229430895e-06, + "loss": 0.359701007604599, + "mean_token_accuracy": 0.8903120160102844, + "num_tokens": 11333412.0, + "step": 1267 + }, + { + "epoch": 0.9635258358662614, + "grad_norm": 1.1898726224899292, + "learning_rate": 4.062697974153764e-06, + "loss": 0.3423798084259033, + "mean_token_accuracy": 0.8661491870880127, + "num_tokens": 11347657.0, + "step": 1268 + }, + { + "epoch": 0.9642857142857143, + "grad_norm": 1.4952168464660645, + "learning_rate": 4.06106262211504e-06, + "loss": 0.4214417338371277, + "mean_token_accuracy": 0.8362159729003906, + "num_tokens": 11357786.0, + "step": 1269 + }, + { + "epoch": 0.9650455927051672, + "grad_norm": 1.7949583530426025, + "learning_rate": 4.059426174462476e-06, + "loss": 0.59087735414505, + "mean_token_accuracy": 0.7965556979179382, + "num_tokens": 11370561.0, + "step": 1270 + }, + { + "epoch": 0.96580547112462, + "grad_norm": 1.8973214626312256, + "learning_rate": 4.057788632344594e-06, + "loss": 0.47525322437286377, + "mean_token_accuracy": 0.8317365050315857, + "num_tokens": 11378507.0, + "step": 1271 + }, + { + "epoch": 0.9665653495440729, + "grad_norm": 1.8665250539779663, + "learning_rate": 4.056149996910683e-06, + "loss": 0.3537125587463379, + "mean_token_accuracy": 0.8921569585800171, + "num_tokens": 11385186.0, + "step": 1272 + }, + { + "epoch": 0.9673252279635258, + "grad_norm": 1.5072317123413086, + "learning_rate": 4.054510269310803e-06, + "loss": 0.5145624876022339, + "mean_token_accuracy": 0.8265488147735596, + "num_tokens": 11397125.0, + "step": 1273 + }, + { + "epoch": 0.9680851063829787, + "grad_norm": 1.520525574684143, + "learning_rate": 4.052869450695776e-06, + "loss": 0.44322293996810913, + "mean_token_accuracy": 0.8403642177581787, + "num_tokens": 11409919.0, + "step": 1274 + }, + { + "epoch": 0.9688449848024316, + "grad_norm": 1.3764475584030151, + "learning_rate": 4.051227542217192e-06, + "loss": 0.5774400234222412, + "mean_token_accuracy": 0.804118275642395, + "num_tokens": 11425900.0, + "step": 1275 + }, + { + "epoch": 0.9696048632218845, + "grad_norm": 1.3922648429870605, + "learning_rate": 4.049584545027406e-06, + "loss": 0.42727944254875183, + "mean_token_accuracy": 0.8654505014419556, + "num_tokens": 11438787.0, + "step": 1276 + }, + { + "epoch": 0.9703647416413373, + "grad_norm": 1.8505840301513672, + "learning_rate": 4.047940460279537e-06, + "loss": 0.490803062915802, + "mean_token_accuracy": 0.8340574502944946, + "num_tokens": 11447997.0, + "step": 1277 + }, + { + "epoch": 0.9711246200607903, + "grad_norm": 2.28271222114563, + "learning_rate": 4.046295289127466e-06, + "loss": 0.588828444480896, + "mean_token_accuracy": 0.833497166633606, + "num_tokens": 11454072.0, + "step": 1278 + }, + { + "epoch": 0.9718844984802432, + "grad_norm": 2.4242560863494873, + "learning_rate": 4.044649032725836e-06, + "loss": 0.5128831267356873, + "mean_token_accuracy": 0.8225122690200806, + "num_tokens": 11460211.0, + "step": 1279 + }, + { + "epoch": 0.9726443768996961, + "grad_norm": 2.1738455295562744, + "learning_rate": 4.0430016922300566e-06, + "loss": 0.441631942987442, + "mean_token_accuracy": 0.841723620891571, + "num_tokens": 11466814.0, + "step": 1280 + }, + { + "epoch": 0.973404255319149, + "grad_norm": 2.541599988937378, + "learning_rate": 4.0413532687962926e-06, + "loss": 0.5062629580497742, + "mean_token_accuracy": 0.8013502359390259, + "num_tokens": 11472371.0, + "step": 1281 + }, + { + "epoch": 0.9741641337386018, + "grad_norm": 2.8011014461517334, + "learning_rate": 4.039703763581472e-06, + "loss": 0.5061966776847839, + "mean_token_accuracy": 0.829810380935669, + "num_tokens": 11476672.0, + "step": 1282 + }, + { + "epoch": 0.9749240121580547, + "grad_norm": 2.4505462646484375, + "learning_rate": 4.038053177743279e-06, + "loss": 0.43407535552978516, + "mean_token_accuracy": 0.8428469896316528, + "num_tokens": 11481297.0, + "step": 1283 + }, + { + "epoch": 0.9756838905775076, + "grad_norm": 2.1618378162384033, + "learning_rate": 4.036401512440161e-06, + "loss": 0.6056663393974304, + "mean_token_accuracy": 0.7977457642555237, + "num_tokens": 11488657.0, + "step": 1284 + }, + { + "epoch": 0.9764437689969605, + "grad_norm": 1.9192147254943848, + "learning_rate": 4.034748768831319e-06, + "loss": 0.524390697479248, + "mean_token_accuracy": 0.8120636940002441, + "num_tokens": 11496485.0, + "step": 1285 + }, + { + "epoch": 0.9772036474164134, + "grad_norm": 2.766435384750366, + "learning_rate": 4.033094948076713e-06, + "loss": 0.5494908690452576, + "mean_token_accuracy": 0.8141890168190002, + "num_tokens": 11501341.0, + "step": 1286 + }, + { + "epoch": 0.9779635258358662, + "grad_norm": 1.3519539833068848, + "learning_rate": 4.031440051337056e-06, + "loss": 0.4339691400527954, + "mean_token_accuracy": 0.8400131464004517, + "num_tokens": 11512843.0, + "step": 1287 + }, + { + "epoch": 0.9787234042553191, + "grad_norm": 1.2492141723632812, + "learning_rate": 4.02978407977382e-06, + "loss": 0.4433518052101135, + "mean_token_accuracy": 0.8432940244674683, + "num_tokens": 11530227.0, + "step": 1288 + }, + { + "epoch": 0.979483282674772, + "grad_norm": 1.6597715616226196, + "learning_rate": 4.02812703454923e-06, + "loss": 0.602222204208374, + "mean_token_accuracy": 0.786965548992157, + "num_tokens": 11543955.0, + "step": 1289 + }, + { + "epoch": 0.9802431610942249, + "grad_norm": 1.6621816158294678, + "learning_rate": 4.026468916826262e-06, + "loss": 0.35662174224853516, + "mean_token_accuracy": 0.8716133832931519, + "num_tokens": 11552064.0, + "step": 1290 + }, + { + "epoch": 0.9810030395136778, + "grad_norm": 4.539844989776611, + "learning_rate": 4.024809727768648e-06, + "loss": 0.543423593044281, + "mean_token_accuracy": 0.8293194770812988, + "num_tokens": 11555595.0, + "step": 1291 + }, + { + "epoch": 0.9817629179331308, + "grad_norm": 1.4026556015014648, + "learning_rate": 4.023149468540871e-06, + "loss": 0.4301237165927887, + "mean_token_accuracy": 0.8358224630355835, + "num_tokens": 11572275.0, + "step": 1292 + }, + { + "epoch": 0.9825227963525835, + "grad_norm": 1.611262321472168, + "learning_rate": 4.021488140308165e-06, + "loss": 0.5378580689430237, + "mean_token_accuracy": 0.8173760771751404, + "num_tokens": 11584299.0, + "step": 1293 + }, + { + "epoch": 0.9832826747720365, + "grad_norm": 4.138631820678711, + "learning_rate": 4.019825744236514e-06, + "loss": 0.40272149443626404, + "mean_token_accuracy": 0.8648844957351685, + "num_tokens": 11586705.0, + "step": 1294 + }, + { + "epoch": 0.9840425531914894, + "grad_norm": 3.177703619003296, + "learning_rate": 4.018162281492651e-06, + "loss": 0.5320103168487549, + "mean_token_accuracy": 0.8250276446342468, + "num_tokens": 11590689.0, + "step": 1295 + }, + { + "epoch": 0.9848024316109423, + "grad_norm": 2.727597713470459, + "learning_rate": 4.016497753244058e-06, + "loss": 0.5662774443626404, + "mean_token_accuracy": 0.8074625730514526, + "num_tokens": 11596092.0, + "step": 1296 + }, + { + "epoch": 0.9855623100303952, + "grad_norm": 1.485139012336731, + "learning_rate": 4.014832160658966e-06, + "loss": 0.5414972305297852, + "mean_token_accuracy": 0.8082696199417114, + "num_tokens": 11613785.0, + "step": 1297 + }, + { + "epoch": 0.986322188449848, + "grad_norm": 2.4025990962982178, + "learning_rate": 4.013165504906352e-06, + "loss": 0.6556503772735596, + "mean_token_accuracy": 0.7785214781761169, + "num_tokens": 11620421.0, + "step": 1298 + }, + { + "epoch": 0.9870820668693009, + "grad_norm": 1.878273606300354, + "learning_rate": 4.011497787155938e-06, + "loss": 0.4221133887767792, + "mean_token_accuracy": 0.850035548210144, + "num_tokens": 11627998.0, + "step": 1299 + }, + { + "epoch": 0.9878419452887538, + "grad_norm": 2.0430715084075928, + "learning_rate": 4.009829008578192e-06, + "loss": 0.5205984711647034, + "mean_token_accuracy": 0.819183349609375, + "num_tokens": 11636279.0, + "step": 1300 + }, + { + "epoch": 0.9886018237082067, + "grad_norm": 3.4769439697265625, + "learning_rate": 4.00815917034433e-06, + "loss": 0.5449948310852051, + "mean_token_accuracy": 0.8240023851394653, + "num_tokens": 11639638.0, + "step": 1301 + }, + { + "epoch": 0.9893617021276596, + "grad_norm": 2.4783987998962402, + "learning_rate": 4.006488273626307e-06, + "loss": 0.4316832423210144, + "mean_token_accuracy": 0.8474695086479187, + "num_tokens": 11645463.0, + "step": 1302 + }, + { + "epoch": 0.9901215805471124, + "grad_norm": 1.881475567817688, + "learning_rate": 4.004816319596822e-06, + "loss": 0.5157331824302673, + "mean_token_accuracy": 0.826042652130127, + "num_tokens": 11653955.0, + "step": 1303 + }, + { + "epoch": 0.9908814589665653, + "grad_norm": 2.6569254398345947, + "learning_rate": 4.003143309429317e-06, + "loss": 0.46492767333984375, + "mean_token_accuracy": 0.8320850133895874, + "num_tokens": 11659357.0, + "step": 1304 + }, + { + "epoch": 0.9916413373860182, + "grad_norm": 2.4917593002319336, + "learning_rate": 4.0014692442979756e-06, + "loss": 0.459585040807724, + "mean_token_accuracy": 0.8457611799240112, + "num_tokens": 11664207.0, + "step": 1305 + }, + { + "epoch": 0.9924012158054711, + "grad_norm": 2.6885526180267334, + "learning_rate": 3.999794125377721e-06, + "loss": 0.4677402973175049, + "mean_token_accuracy": 0.8307361602783203, + "num_tokens": 11668879.0, + "step": 1306 + }, + { + "epoch": 0.993161094224924, + "grad_norm": 1.9737319946289062, + "learning_rate": 3.998117953844215e-06, + "loss": 0.44684839248657227, + "mean_token_accuracy": 0.8367687463760376, + "num_tokens": 11676081.0, + "step": 1307 + }, + { + "epoch": 0.993920972644377, + "grad_norm": 1.4333021640777588, + "learning_rate": 3.996440730873861e-06, + "loss": 0.526146650314331, + "mean_token_accuracy": 0.816251814365387, + "num_tokens": 11689333.0, + "step": 1308 + }, + { + "epoch": 0.9946808510638298, + "grad_norm": 1.3689230680465698, + "learning_rate": 3.9947624576437975e-06, + "loss": 0.40214329957962036, + "mean_token_accuracy": 0.8610327839851379, + "num_tokens": 11701540.0, + "step": 1309 + }, + { + "epoch": 0.9954407294832827, + "grad_norm": 1.2435375452041626, + "learning_rate": 3.9930831353319025e-06, + "loss": 0.4532913267612457, + "mean_token_accuracy": 0.8415389060974121, + "num_tokens": 11717920.0, + "step": 1310 + }, + { + "epoch": 0.9962006079027356, + "grad_norm": 1.9968011379241943, + "learning_rate": 3.9914027651167866e-06, + "loss": 0.46954160928726196, + "mean_token_accuracy": 0.8351103663444519, + "num_tokens": 11724999.0, + "step": 1311 + }, + { + "epoch": 0.9969604863221885, + "grad_norm": 1.9521311521530151, + "learning_rate": 3.989721348177801e-06, + "loss": 0.5068016052246094, + "mean_token_accuracy": 0.8220845460891724, + "num_tokens": 11732569.0, + "step": 1312 + }, + { + "epoch": 0.9977203647416414, + "grad_norm": 2.7332582473754883, + "learning_rate": 3.988038885695028e-06, + "loss": 0.4154692590236664, + "mean_token_accuracy": 0.8493857383728027, + "num_tokens": 11736759.0, + "step": 1313 + }, + { + "epoch": 0.9984802431610942, + "grad_norm": 1.8656952381134033, + "learning_rate": 3.986355378849284e-06, + "loss": 0.4151354134082794, + "mean_token_accuracy": 0.83440101146698, + "num_tokens": 11743827.0, + "step": 1314 + }, + { + "epoch": 0.9992401215805471, + "grad_norm": 1.304006576538086, + "learning_rate": 3.984670828822118e-06, + "loss": 0.4926128089427948, + "mean_token_accuracy": 0.8603005409240723, + "num_tokens": 11757707.0, + "step": 1315 + }, + { + "epoch": 1.0, + "grad_norm": 1.497079610824585, + "learning_rate": 3.982985236795815e-06, + "loss": 0.43342477083206177, + "mean_token_accuracy": 0.8550825119018555, + "num_tokens": 11769678.0, + "step": 1316 + }, + { + "epoch": 1.000759878419453, + "grad_norm": 2.870274543762207, + "learning_rate": 3.981298603953385e-06, + "loss": 0.3723528981208801, + "mean_token_accuracy": 0.8745899796485901, + "num_tokens": 11773290.0, + "step": 1317 + }, + { + "epoch": 1.0015197568389058, + "grad_norm": 1.3442503213882446, + "learning_rate": 3.979610931478574e-06, + "loss": 0.34688329696655273, + "mean_token_accuracy": 0.8749074935913086, + "num_tokens": 11786400.0, + "step": 1318 + }, + { + "epoch": 1.0022796352583587, + "grad_norm": 1.7272238731384277, + "learning_rate": 3.977922220555855e-06, + "loss": 0.28274932503700256, + "mean_token_accuracy": 0.896713137626648, + "num_tokens": 11793059.0, + "step": 1319 + }, + { + "epoch": 1.0030395136778116, + "grad_norm": 1.7362451553344727, + "learning_rate": 3.976232472370431e-06, + "loss": 0.5494794845581055, + "mean_token_accuracy": 0.8341718912124634, + "num_tokens": 11802593.0, + "step": 1320 + }, + { + "epoch": 1.0037993920972645, + "grad_norm": 1.3316494226455688, + "learning_rate": 3.97454168810823e-06, + "loss": 0.41505366563796997, + "mean_token_accuracy": 0.8581969738006592, + "num_tokens": 11813925.0, + "step": 1321 + }, + { + "epoch": 1.0045592705167172, + "grad_norm": 1.6152615547180176, + "learning_rate": 3.972849868955913e-06, + "loss": 0.44761013984680176, + "mean_token_accuracy": 0.8413045406341553, + "num_tokens": 11825709.0, + "step": 1322 + }, + { + "epoch": 1.0053191489361701, + "grad_norm": 2.1172471046447754, + "learning_rate": 3.97115701610086e-06, + "loss": 0.3903353810310364, + "mean_token_accuracy": 0.8662760257720947, + "num_tokens": 11832070.0, + "step": 1323 + }, + { + "epoch": 1.006079027355623, + "grad_norm": 1.5923868417739868, + "learning_rate": 3.969463130731183e-06, + "loss": 0.4491051137447357, + "mean_token_accuracy": 0.8677828311920166, + "num_tokens": 11843154.0, + "step": 1324 + }, + { + "epoch": 1.006838905775076, + "grad_norm": 1.6848995685577393, + "learning_rate": 3.967768214035716e-06, + "loss": 0.45765817165374756, + "mean_token_accuracy": 0.8401060104370117, + "num_tokens": 11854826.0, + "step": 1325 + }, + { + "epoch": 1.0075987841945289, + "grad_norm": 2.3739020824432373, + "learning_rate": 3.966072267204014e-06, + "loss": 0.4482722580432892, + "mean_token_accuracy": 0.8368916511535645, + "num_tokens": 11860559.0, + "step": 1326 + }, + { + "epoch": 1.0083586626139818, + "grad_norm": 1.5403034687042236, + "learning_rate": 3.964375291426361e-06, + "loss": 0.35589972138404846, + "mean_token_accuracy": 0.8728118538856506, + "num_tokens": 11871959.0, + "step": 1327 + }, + { + "epoch": 1.0091185410334347, + "grad_norm": 1.6750119924545288, + "learning_rate": 3.962677287893758e-06, + "loss": 0.35873427987098694, + "mean_token_accuracy": 0.9027186632156372, + "num_tokens": 11881818.0, + "step": 1328 + }, + { + "epoch": 1.0098784194528876, + "grad_norm": 1.5489170551300049, + "learning_rate": 3.9609782577979305e-06, + "loss": 0.3634672462940216, + "mean_token_accuracy": 0.8582607507705688, + "num_tokens": 11891084.0, + "step": 1329 + }, + { + "epoch": 1.0106382978723405, + "grad_norm": 2.43859601020813, + "learning_rate": 3.959278202331323e-06, + "loss": 0.3640799820423126, + "mean_token_accuracy": 0.88062584400177, + "num_tokens": 11896032.0, + "step": 1330 + }, + { + "epoch": 1.0113981762917934, + "grad_norm": 3.612184524536133, + "learning_rate": 3.9575771226870986e-06, + "loss": 0.3733130097389221, + "mean_token_accuracy": 0.8946067094802856, + "num_tokens": 11899479.0, + "step": 1331 + }, + { + "epoch": 1.012158054711246, + "grad_norm": 1.541355848312378, + "learning_rate": 3.955875020059141e-06, + "loss": 0.320593923330307, + "mean_token_accuracy": 0.9057406783103943, + "num_tokens": 11910179.0, + "step": 1332 + }, + { + "epoch": 1.012917933130699, + "grad_norm": 2.0565030574798584, + "learning_rate": 3.954171895642052e-06, + "loss": 0.3341682553291321, + "mean_token_accuracy": 0.8829344511032104, + "num_tokens": 11916489.0, + "step": 1333 + }, + { + "epoch": 1.013677811550152, + "grad_norm": 2.9732539653778076, + "learning_rate": 3.9524677506311505e-06, + "loss": 0.38488566875457764, + "mean_token_accuracy": 0.8752974271774292, + "num_tokens": 11920682.0, + "step": 1334 + }, + { + "epoch": 1.0144376899696048, + "grad_norm": 2.7697458267211914, + "learning_rate": 3.950762586222469e-06, + "loss": 0.39864760637283325, + "mean_token_accuracy": 0.8593167662620544, + "num_tokens": 11925233.0, + "step": 1335 + }, + { + "epoch": 1.0151975683890577, + "grad_norm": 2.2302119731903076, + "learning_rate": 3.949056403612758e-06, + "loss": 0.3985682725906372, + "mean_token_accuracy": 0.8677899837493896, + "num_tokens": 11932000.0, + "step": 1336 + }, + { + "epoch": 1.0159574468085106, + "grad_norm": 2.360572576522827, + "learning_rate": 3.947349203999485e-06, + "loss": 0.36940714716911316, + "mean_token_accuracy": 0.8760676383972168, + "num_tokens": 11937569.0, + "step": 1337 + }, + { + "epoch": 1.0167173252279635, + "grad_norm": 1.3383921384811401, + "learning_rate": 3.945640988580824e-06, + "loss": 0.40628793835639954, + "mean_token_accuracy": 0.866442084312439, + "num_tokens": 11955679.0, + "step": 1338 + }, + { + "epoch": 1.0174772036474165, + "grad_norm": 2.1502623558044434, + "learning_rate": 3.943931758555669e-06, + "loss": 0.4493565559387207, + "mean_token_accuracy": 0.8307522535324097, + "num_tokens": 11962734.0, + "step": 1339 + }, + { + "epoch": 1.0182370820668694, + "grad_norm": 2.4737331867218018, + "learning_rate": 3.942221515123624e-06, + "loss": 0.28508758544921875, + "mean_token_accuracy": 0.8967142105102539, + "num_tokens": 11967783.0, + "step": 1340 + }, + { + "epoch": 1.0189969604863223, + "grad_norm": 2.4525370597839355, + "learning_rate": 3.940510259485002e-06, + "loss": 0.40227818489074707, + "mean_token_accuracy": 0.8618967533111572, + "num_tokens": 11972918.0, + "step": 1341 + }, + { + "epoch": 1.0197568389057752, + "grad_norm": 1.7299731969833374, + "learning_rate": 3.938797992840828e-06, + "loss": 0.26339593529701233, + "mean_token_accuracy": 0.9004406929016113, + "num_tokens": 11981250.0, + "step": 1342 + }, + { + "epoch": 1.0205167173252279, + "grad_norm": 2.8756747245788574, + "learning_rate": 3.937084716392839e-06, + "loss": 0.47792482376098633, + "mean_token_accuracy": 0.8440839052200317, + "num_tokens": 11986356.0, + "step": 1343 + }, + { + "epoch": 1.0212765957446808, + "grad_norm": 2.104473114013672, + "learning_rate": 3.935370431343475e-06, + "loss": 0.36723971366882324, + "mean_token_accuracy": 0.8831232786178589, + "num_tokens": 11994495.0, + "step": 1344 + }, + { + "epoch": 1.0220364741641337, + "grad_norm": 1.9173074960708618, + "learning_rate": 3.933655138895889e-06, + "loss": 0.409319669008255, + "mean_token_accuracy": 0.8632645606994629, + "num_tokens": 12002060.0, + "step": 1345 + }, + { + "epoch": 1.0227963525835866, + "grad_norm": 2.958311080932617, + "learning_rate": 3.9319388402539395e-06, + "loss": 0.5390093922615051, + "mean_token_accuracy": 0.8204828500747681, + "num_tokens": 12007588.0, + "step": 1346 + }, + { + "epoch": 1.0235562310030395, + "grad_norm": 1.6470831632614136, + "learning_rate": 3.930221536622192e-06, + "loss": 0.4524633288383484, + "mean_token_accuracy": 0.8516575694084167, + "num_tokens": 12018831.0, + "step": 1347 + }, + { + "epoch": 1.0243161094224924, + "grad_norm": 1.3160780668258667, + "learning_rate": 3.928503229205913e-06, + "loss": 0.4180558919906616, + "mean_token_accuracy": 0.8495022058486938, + "num_tokens": 12033947.0, + "step": 1348 + }, + { + "epoch": 1.0250759878419453, + "grad_norm": 1.9686089754104614, + "learning_rate": 3.92678391921108e-06, + "loss": 0.41927334666252136, + "mean_token_accuracy": 0.8462997674942017, + "num_tokens": 12042005.0, + "step": 1349 + }, + { + "epoch": 1.0258358662613982, + "grad_norm": 2.351778507232666, + "learning_rate": 3.92506360784437e-06, + "loss": 0.2946245074272156, + "mean_token_accuracy": 0.9170923233032227, + "num_tokens": 12046579.0, + "step": 1350 + }, + { + "epoch": 1.0265957446808511, + "grad_norm": 2.0636913776397705, + "learning_rate": 3.923342296313162e-06, + "loss": 0.3422774076461792, + "mean_token_accuracy": 0.8809213638305664, + "num_tokens": 12053214.0, + "step": 1351 + }, + { + "epoch": 1.027355623100304, + "grad_norm": 1.7272592782974243, + "learning_rate": 3.92161998582554e-06, + "loss": 0.5864541530609131, + "mean_token_accuracy": 0.7986117601394653, + "num_tokens": 12068522.0, + "step": 1352 + }, + { + "epoch": 1.028115501519757, + "grad_norm": 0.8980231881141663, + "learning_rate": 3.919896677590289e-06, + "loss": 0.2964550256729126, + "mean_token_accuracy": 0.8911845088005066, + "num_tokens": 12093834.0, + "step": 1353 + }, + { + "epoch": 1.0288753799392096, + "grad_norm": 1.6031712293624878, + "learning_rate": 3.918172372816892e-06, + "loss": 0.37254488468170166, + "mean_token_accuracy": 0.8615843057632446, + "num_tokens": 12104393.0, + "step": 1354 + }, + { + "epoch": 1.0296352583586625, + "grad_norm": 1.282134771347046, + "learning_rate": 3.916447072715531e-06, + "loss": 0.3522927761077881, + "mean_token_accuracy": 0.8713657259941101, + "num_tokens": 12118671.0, + "step": 1355 + }, + { + "epoch": 1.0303951367781155, + "grad_norm": 2.1986680030822754, + "learning_rate": 3.914720778497091e-06, + "loss": 0.3716316223144531, + "mean_token_accuracy": 0.8661249279975891, + "num_tokens": 12125178.0, + "step": 1356 + }, + { + "epoch": 1.0311550151975684, + "grad_norm": 1.5937882661819458, + "learning_rate": 3.91299349137315e-06, + "loss": 0.48067355155944824, + "mean_token_accuracy": 0.8284252882003784, + "num_tokens": 12136785.0, + "step": 1357 + }, + { + "epoch": 1.0319148936170213, + "grad_norm": 1.6743099689483643, + "learning_rate": 3.9112652125559845e-06, + "loss": 0.4461551308631897, + "mean_token_accuracy": 0.8381845355033875, + "num_tokens": 12150066.0, + "step": 1358 + }, + { + "epoch": 1.0326747720364742, + "grad_norm": 2.2346715927124023, + "learning_rate": 3.909535943258567e-06, + "loss": 0.3148220181465149, + "mean_token_accuracy": 0.8797591924667358, + "num_tokens": 12155506.0, + "step": 1359 + }, + { + "epoch": 1.033434650455927, + "grad_norm": 1.9608992338180542, + "learning_rate": 3.907805684694567e-06, + "loss": 0.32598960399627686, + "mean_token_accuracy": 0.8819410800933838, + "num_tokens": 12163261.0, + "step": 1360 + }, + { + "epoch": 1.03419452887538, + "grad_norm": 2.413477897644043, + "learning_rate": 3.906074438078343e-06, + "loss": 0.38179588317871094, + "mean_token_accuracy": 0.8739585876464844, + "num_tokens": 12169254.0, + "step": 1361 + }, + { + "epoch": 1.034954407294833, + "grad_norm": 2.0258278846740723, + "learning_rate": 3.904342204624955e-06, + "loss": 0.33240315318107605, + "mean_token_accuracy": 0.8808181285858154, + "num_tokens": 12175379.0, + "step": 1362 + }, + { + "epoch": 1.0357142857142858, + "grad_norm": 2.4111437797546387, + "learning_rate": 3.9026089855501475e-06, + "loss": 0.412802517414093, + "mean_token_accuracy": 0.8504396677017212, + "num_tokens": 12182007.0, + "step": 1363 + }, + { + "epoch": 1.0364741641337385, + "grad_norm": 2.0424840450286865, + "learning_rate": 3.900874782070362e-06, + "loss": 0.2914797067642212, + "mean_token_accuracy": 0.8731886148452759, + "num_tokens": 12187743.0, + "step": 1364 + }, + { + "epoch": 1.0372340425531914, + "grad_norm": 2.9248716831207275, + "learning_rate": 3.899139595402729e-06, + "loss": 0.34071338176727295, + "mean_token_accuracy": 0.8736443519592285, + "num_tokens": 12191830.0, + "step": 1365 + }, + { + "epoch": 1.0379939209726443, + "grad_norm": 2.240220785140991, + "learning_rate": 3.8974034267650695e-06, + "loss": 0.23049014806747437, + "mean_token_accuracy": 0.9000070691108704, + "num_tokens": 12196460.0, + "step": 1366 + }, + { + "epoch": 1.0387537993920972, + "grad_norm": 1.5038460493087769, + "learning_rate": 3.895666277375892e-06, + "loss": 0.32255327701568604, + "mean_token_accuracy": 0.873004674911499, + "num_tokens": 12206230.0, + "step": 1367 + }, + { + "epoch": 1.0395136778115501, + "grad_norm": 1.2339142560958862, + "learning_rate": 3.893928148454398e-06, + "loss": 0.4069131314754486, + "mean_token_accuracy": 0.8461740016937256, + "num_tokens": 12226502.0, + "step": 1368 + }, + { + "epoch": 1.040273556231003, + "grad_norm": 2.531553268432617, + "learning_rate": 3.89218904122047e-06, + "loss": 0.43681037425994873, + "mean_token_accuracy": 0.8497104048728943, + "num_tokens": 12232241.0, + "step": 1369 + }, + { + "epoch": 1.041033434650456, + "grad_norm": 3.8404815196990967, + "learning_rate": 3.890448956894682e-06, + "loss": 0.3241814970970154, + "mean_token_accuracy": 0.884732723236084, + "num_tokens": 12235126.0, + "step": 1370 + }, + { + "epoch": 1.0417933130699089, + "grad_norm": 2.9608030319213867, + "learning_rate": 3.888707896698293e-06, + "loss": 0.4641021490097046, + "mean_token_accuracy": 0.8496800661087036, + "num_tokens": 12240630.0, + "step": 1371 + }, + { + "epoch": 1.0425531914893618, + "grad_norm": 2.1166417598724365, + "learning_rate": 3.886965861853243e-06, + "loss": 0.42038479447364807, + "mean_token_accuracy": 0.8512747287750244, + "num_tokens": 12247969.0, + "step": 1372 + }, + { + "epoch": 1.0433130699088147, + "grad_norm": 2.5918161869049072, + "learning_rate": 3.885222853582163e-06, + "loss": 0.2871917188167572, + "mean_token_accuracy": 0.9129709601402283, + "num_tokens": 12252161.0, + "step": 1373 + }, + { + "epoch": 1.0440729483282676, + "grad_norm": 2.4261348247528076, + "learning_rate": 3.88347887310836e-06, + "loss": 0.4003123342990875, + "mean_token_accuracy": 0.8570356369018555, + "num_tokens": 12258135.0, + "step": 1374 + }, + { + "epoch": 1.0448328267477203, + "grad_norm": 1.3439548015594482, + "learning_rate": 3.881733921655829e-06, + "loss": 0.3278140425682068, + "mean_token_accuracy": 0.8831373453140259, + "num_tokens": 12272849.0, + "step": 1375 + }, + { + "epoch": 1.0455927051671732, + "grad_norm": 1.527989387512207, + "learning_rate": 3.879988000449243e-06, + "loss": 0.33789363503456116, + "mean_token_accuracy": 0.8825669884681702, + "num_tokens": 12283281.0, + "step": 1376 + }, + { + "epoch": 1.046352583586626, + "grad_norm": 1.6755503416061401, + "learning_rate": 3.878241110713957e-06, + "loss": 0.4816160798072815, + "mean_token_accuracy": 0.8193758726119995, + "num_tokens": 12295422.0, + "step": 1377 + }, + { + "epoch": 1.047112462006079, + "grad_norm": 2.8110361099243164, + "learning_rate": 3.876493253676004e-06, + "loss": 0.38662949204444885, + "mean_token_accuracy": 0.8611986637115479, + "num_tokens": 12299806.0, + "step": 1378 + }, + { + "epoch": 1.047872340425532, + "grad_norm": 1.86097252368927, + "learning_rate": 3.8747444305621e-06, + "loss": 0.27612629532814026, + "mean_token_accuracy": 0.8984048366546631, + "num_tokens": 12306599.0, + "step": 1379 + }, + { + "epoch": 1.0486322188449848, + "grad_norm": 2.361828565597534, + "learning_rate": 3.872994642599635e-06, + "loss": 0.469953715801239, + "mean_token_accuracy": 0.8464452028274536, + "num_tokens": 12314249.0, + "step": 1380 + }, + { + "epoch": 1.0493920972644377, + "grad_norm": 1.9524794816970825, + "learning_rate": 3.871243891016676e-06, + "loss": 0.5419625043869019, + "mean_token_accuracy": 0.8468329906463623, + "num_tokens": 12324987.0, + "step": 1381 + }, + { + "epoch": 1.0501519756838906, + "grad_norm": 1.6931511163711548, + "learning_rate": 3.869492177041971e-06, + "loss": 0.3791416883468628, + "mean_token_accuracy": 0.8692882061004639, + "num_tokens": 12336864.0, + "step": 1382 + }, + { + "epoch": 1.0509118541033435, + "grad_norm": 1.909692406654358, + "learning_rate": 3.867739501904938e-06, + "loss": 0.27974557876586914, + "mean_token_accuracy": 0.9004636406898499, + "num_tokens": 12343093.0, + "step": 1383 + }, + { + "epoch": 1.0516717325227964, + "grad_norm": 1.415162205696106, + "learning_rate": 3.8659858668356735e-06, + "loss": 0.38928335905075073, + "mean_token_accuracy": 0.8491984009742737, + "num_tokens": 12356613.0, + "step": 1384 + }, + { + "epoch": 1.0524316109422491, + "grad_norm": 1.8195741176605225, + "learning_rate": 3.864231273064944e-06, + "loss": 0.3798758089542389, + "mean_token_accuracy": 0.8728072047233582, + "num_tokens": 12364860.0, + "step": 1385 + }, + { + "epoch": 1.053191489361702, + "grad_norm": 1.8481454849243164, + "learning_rate": 3.862475721824193e-06, + "loss": 0.269635945558548, + "mean_token_accuracy": 0.899247407913208, + "num_tokens": 12371841.0, + "step": 1386 + }, + { + "epoch": 1.053951367781155, + "grad_norm": 1.7838784456253052, + "learning_rate": 3.8607192143455325e-06, + "loss": 0.36971768736839294, + "mean_token_accuracy": 0.8833638429641724, + "num_tokens": 12380685.0, + "step": 1387 + }, + { + "epoch": 1.0547112462006079, + "grad_norm": 1.333358645439148, + "learning_rate": 3.858961751861748e-06, + "loss": 0.4039418399333954, + "mean_token_accuracy": 0.8541078567504883, + "num_tokens": 12394072.0, + "step": 1388 + }, + { + "epoch": 1.0554711246200608, + "grad_norm": 2.1600265502929688, + "learning_rate": 3.857203335606294e-06, + "loss": 0.38211894035339355, + "mean_token_accuracy": 0.8549972772598267, + "num_tokens": 12400449.0, + "step": 1389 + }, + { + "epoch": 1.0562310030395137, + "grad_norm": 2.914902687072754, + "learning_rate": 3.855443966813295e-06, + "loss": 0.2237374186515808, + "mean_token_accuracy": 0.9253600835800171, + "num_tokens": 12403758.0, + "step": 1390 + }, + { + "epoch": 1.0569908814589666, + "grad_norm": 2.2361080646514893, + "learning_rate": 3.853683646717543e-06, + "loss": 0.3359566926956177, + "mean_token_accuracy": 0.898173451423645, + "num_tokens": 12410374.0, + "step": 1391 + }, + { + "epoch": 1.0577507598784195, + "grad_norm": 2.3639304637908936, + "learning_rate": 3.8519223765544985e-06, + "loss": 0.3844943046569824, + "mean_token_accuracy": 0.863599419593811, + "num_tokens": 12416016.0, + "step": 1392 + }, + { + "epoch": 1.0585106382978724, + "grad_norm": 2.202971935272217, + "learning_rate": 3.85016015756029e-06, + "loss": 0.3546281158924103, + "mean_token_accuracy": 0.8907540440559387, + "num_tokens": 12422026.0, + "step": 1393 + }, + { + "epoch": 1.0592705167173253, + "grad_norm": 1.1279661655426025, + "learning_rate": 3.848396990971709e-06, + "loss": 0.31522464752197266, + "mean_token_accuracy": 0.8662257194519043, + "num_tokens": 12439964.0, + "step": 1394 + }, + { + "epoch": 1.0600303951367782, + "grad_norm": 2.4731740951538086, + "learning_rate": 3.846632878026214e-06, + "loss": 0.456442266702652, + "mean_token_accuracy": 0.8516958951950073, + "num_tokens": 12446231.0, + "step": 1395 + }, + { + "epoch": 1.060790273556231, + "grad_norm": 1.7631878852844238, + "learning_rate": 3.844867819961928e-06, + "loss": 0.487227201461792, + "mean_token_accuracy": 0.8466947078704834, + "num_tokens": 12459989.0, + "step": 1396 + }, + { + "epoch": 1.0615501519756838, + "grad_norm": 2.4468278884887695, + "learning_rate": 3.843101818017637e-06, + "loss": 0.3367291986942291, + "mean_token_accuracy": 0.8734689950942993, + "num_tokens": 12465741.0, + "step": 1397 + }, + { + "epoch": 1.0623100303951367, + "grad_norm": 1.9045145511627197, + "learning_rate": 3.841334873432789e-06, + "loss": 0.4652615487575531, + "mean_token_accuracy": 0.8333107233047485, + "num_tokens": 12474963.0, + "step": 1398 + }, + { + "epoch": 1.0630699088145896, + "grad_norm": 1.6816917657852173, + "learning_rate": 3.839566987447492e-06, + "loss": 0.4144279956817627, + "mean_token_accuracy": 0.8472539186477661, + "num_tokens": 12485521.0, + "step": 1399 + }, + { + "epoch": 1.0638297872340425, + "grad_norm": 1.8990092277526855, + "learning_rate": 3.837798161302518e-06, + "loss": 0.4040985405445099, + "mean_token_accuracy": 0.8514704704284668, + "num_tokens": 12493495.0, + "step": 1400 + }, + { + "epoch": 1.0645896656534954, + "grad_norm": 2.27785325050354, + "learning_rate": 3.836028396239297e-06, + "loss": 0.43425723910331726, + "mean_token_accuracy": 0.8795069456100464, + "num_tokens": 12499789.0, + "step": 1401 + }, + { + "epoch": 1.0653495440729484, + "grad_norm": 2.5130882263183594, + "learning_rate": 3.8342576934999184e-06, + "loss": 0.33892524242401123, + "mean_token_accuracy": 0.8717449903488159, + "num_tokens": 12504885.0, + "step": 1402 + }, + { + "epoch": 1.0661094224924013, + "grad_norm": 2.650040864944458, + "learning_rate": 3.832486054327131e-06, + "loss": 0.4200317859649658, + "mean_token_accuracy": 0.8616159558296204, + "num_tokens": 12509783.0, + "step": 1403 + }, + { + "epoch": 1.0668693009118542, + "grad_norm": 2.9176881313323975, + "learning_rate": 3.830713479964335e-06, + "loss": 0.37018489837646484, + "mean_token_accuracy": 0.8676021695137024, + "num_tokens": 12514441.0, + "step": 1404 + }, + { + "epoch": 1.067629179331307, + "grad_norm": 1.6430318355560303, + "learning_rate": 3.828939971655595e-06, + "loss": 0.27539193630218506, + "mean_token_accuracy": 0.9077831506729126, + "num_tokens": 12523677.0, + "step": 1405 + }, + { + "epoch": 1.06838905775076, + "grad_norm": 1.3683708906173706, + "learning_rate": 3.827165530645627e-06, + "loss": 0.4085099697113037, + "mean_token_accuracy": 0.8579255938529968, + "num_tokens": 12540104.0, + "step": 1406 + }, + { + "epoch": 1.0691489361702127, + "grad_norm": 2.528465747833252, + "learning_rate": 3.825390158179802e-06, + "loss": 0.42462456226348877, + "mean_token_accuracy": 0.852813720703125, + "num_tokens": 12548239.0, + "step": 1407 + }, + { + "epoch": 1.0699088145896656, + "grad_norm": 1.8288795948028564, + "learning_rate": 3.823613855504144e-06, + "loss": 0.412417471408844, + "mean_token_accuracy": 0.8622130751609802, + "num_tokens": 12557316.0, + "step": 1408 + }, + { + "epoch": 1.0706686930091185, + "grad_norm": 2.341794490814209, + "learning_rate": 3.82183662386533e-06, + "loss": 0.2996668815612793, + "mean_token_accuracy": 0.8964041471481323, + "num_tokens": 12562377.0, + "step": 1409 + }, + { + "epoch": 1.0714285714285714, + "grad_norm": 2.555877208709717, + "learning_rate": 3.82005846451069e-06, + "loss": 0.4184221625328064, + "mean_token_accuracy": 0.8678828477859497, + "num_tokens": 12568516.0, + "step": 1410 + }, + { + "epoch": 1.0721884498480243, + "grad_norm": 2.081308126449585, + "learning_rate": 3.8182793786882065e-06, + "loss": 0.4376835823059082, + "mean_token_accuracy": 0.8409077525138855, + "num_tokens": 12576598.0, + "step": 1411 + }, + { + "epoch": 1.0729483282674772, + "grad_norm": 2.0272316932678223, + "learning_rate": 3.816499367646508e-06, + "loss": 0.3630060851573944, + "mean_token_accuracy": 0.8762413263320923, + "num_tokens": 12584587.0, + "step": 1412 + }, + { + "epoch": 1.0737082066869301, + "grad_norm": 2.6382484436035156, + "learning_rate": 3.814718432634877e-06, + "loss": 0.4244990348815918, + "mean_token_accuracy": 0.8509312272071838, + "num_tokens": 12590028.0, + "step": 1413 + }, + { + "epoch": 1.074468085106383, + "grad_norm": 2.429800271987915, + "learning_rate": 3.8129365749032398e-06, + "loss": 0.36990004777908325, + "mean_token_accuracy": 0.8749774098396301, + "num_tokens": 12594984.0, + "step": 1414 + }, + { + "epoch": 1.075227963525836, + "grad_norm": 3.5939090251922607, + "learning_rate": 3.8111537957021736e-06, + "loss": 0.4245661199092865, + "mean_token_accuracy": 0.8481623530387878, + "num_tokens": 12598494.0, + "step": 1415 + }, + { + "epoch": 1.0759878419452888, + "grad_norm": 2.705955982208252, + "learning_rate": 3.809370096282903e-06, + "loss": 0.41851678490638733, + "mean_token_accuracy": 0.8548051714897156, + "num_tokens": 12603876.0, + "step": 1416 + }, + { + "epoch": 1.0767477203647418, + "grad_norm": 1.7812079191207886, + "learning_rate": 3.807585477897296e-06, + "loss": 0.47113919258117676, + "mean_token_accuracy": 0.8346904516220093, + "num_tokens": 12613402.0, + "step": 1417 + }, + { + "epoch": 1.0775075987841944, + "grad_norm": 1.4335212707519531, + "learning_rate": 3.8057999417978654e-06, + "loss": 0.3802063465118408, + "mean_token_accuracy": 0.8563423156738281, + "num_tokens": 12626865.0, + "step": 1418 + }, + { + "epoch": 1.0782674772036474, + "grad_norm": 1.9171305894851685, + "learning_rate": 3.8040134892377702e-06, + "loss": 0.20898357033729553, + "mean_token_accuracy": 0.9189738035202026, + "num_tokens": 12632593.0, + "step": 1419 + }, + { + "epoch": 1.0790273556231003, + "grad_norm": 1.4996821880340576, + "learning_rate": 3.802226121470811e-06, + "loss": 0.4203261137008667, + "mean_token_accuracy": 0.8479211330413818, + "num_tokens": 12646395.0, + "step": 1420 + }, + { + "epoch": 1.0797872340425532, + "grad_norm": 2.2007253170013428, + "learning_rate": 3.800437839751432e-06, + "loss": 0.40370577573776245, + "mean_token_accuracy": 0.8427679538726807, + "num_tokens": 12653508.0, + "step": 1421 + }, + { + "epoch": 1.080547112462006, + "grad_norm": 1.7266581058502197, + "learning_rate": 3.7986486453347183e-06, + "loss": 0.46750491857528687, + "mean_token_accuracy": 0.8429205417633057, + "num_tokens": 12666329.0, + "step": 1422 + }, + { + "epoch": 1.081306990881459, + "grad_norm": 1.4716318845748901, + "learning_rate": 3.796858539476394e-06, + "loss": 0.3330317735671997, + "mean_token_accuracy": 0.879012942314148, + "num_tokens": 12676741.0, + "step": 1423 + }, + { + "epoch": 1.082066869300912, + "grad_norm": 2.652127265930176, + "learning_rate": 3.795067523432826e-06, + "loss": 0.35365715622901917, + "mean_token_accuracy": 0.8796792030334473, + "num_tokens": 12681479.0, + "step": 1424 + }, + { + "epoch": 1.0828267477203648, + "grad_norm": 1.2937829494476318, + "learning_rate": 3.793275598461017e-06, + "loss": 0.25272446870803833, + "mean_token_accuracy": 0.9231734275817871, + "num_tokens": 12694238.0, + "step": 1425 + }, + { + "epoch": 1.0835866261398177, + "grad_norm": 1.3831220865249634, + "learning_rate": 3.7914827658186104e-06, + "loss": 0.4935331344604492, + "mean_token_accuracy": 0.8417420387268066, + "num_tokens": 12712857.0, + "step": 1426 + }, + { + "epoch": 1.0843465045592706, + "grad_norm": 3.059525728225708, + "learning_rate": 3.7896890267638832e-06, + "loss": 0.2592190206050873, + "mean_token_accuracy": 0.9040263295173645, + "num_tokens": 12716766.0, + "step": 1427 + }, + { + "epoch": 1.0851063829787233, + "grad_norm": 2.8399202823638916, + "learning_rate": 3.787894382555752e-06, + "loss": 0.32098138332366943, + "mean_token_accuracy": 0.8838302493095398, + "num_tokens": 12720774.0, + "step": 1428 + }, + { + "epoch": 1.0858662613981762, + "grad_norm": 2.618479013442993, + "learning_rate": 3.7860988344537664e-06, + "loss": 0.425255686044693, + "mean_token_accuracy": 0.8564130067825317, + "num_tokens": 12726506.0, + "step": 1429 + }, + { + "epoch": 1.0866261398176291, + "grad_norm": 1.3108669519424438, + "learning_rate": 3.7843023837181126e-06, + "loss": 0.40220165252685547, + "mean_token_accuracy": 0.8588873147964478, + "num_tokens": 12742814.0, + "step": 1430 + }, + { + "epoch": 1.087386018237082, + "grad_norm": 2.2083566188812256, + "learning_rate": 3.782505031609607e-06, + "loss": 0.318379282951355, + "mean_token_accuracy": 0.8887606859207153, + "num_tokens": 12748388.0, + "step": 1431 + }, + { + "epoch": 1.088145896656535, + "grad_norm": 1.922358751296997, + "learning_rate": 3.7807067793897006e-06, + "loss": 0.2519589364528656, + "mean_token_accuracy": 0.8936764001846313, + "num_tokens": 12754761.0, + "step": 1432 + }, + { + "epoch": 1.0889057750759878, + "grad_norm": 1.7367439270019531, + "learning_rate": 3.778907628320477e-06, + "loss": 0.3970367908477783, + "mean_token_accuracy": 0.858735203742981, + "num_tokens": 12764016.0, + "step": 1433 + }, + { + "epoch": 1.0896656534954408, + "grad_norm": 2.1931066513061523, + "learning_rate": 3.77710757966465e-06, + "loss": 0.5250554084777832, + "mean_token_accuracy": 0.8356746435165405, + "num_tokens": 12772272.0, + "step": 1434 + }, + { + "epoch": 1.0904255319148937, + "grad_norm": 1.718337893486023, + "learning_rate": 3.775306634685562e-06, + "loss": 0.283231645822525, + "mean_token_accuracy": 0.9009919166564941, + "num_tokens": 12780706.0, + "step": 1435 + }, + { + "epoch": 1.0911854103343466, + "grad_norm": 2.1985926628112793, + "learning_rate": 3.773504794647187e-06, + "loss": 0.3913170397281647, + "mean_token_accuracy": 0.8909255266189575, + "num_tokens": 12787052.0, + "step": 1436 + }, + { + "epoch": 1.0919452887537995, + "grad_norm": 2.8687937259674072, + "learning_rate": 3.771702060814123e-06, + "loss": 0.3135771155357361, + "mean_token_accuracy": 0.9016125202178955, + "num_tokens": 12791854.0, + "step": 1437 + }, + { + "epoch": 1.0927051671732522, + "grad_norm": 4.203946590423584, + "learning_rate": 3.7698984344516e-06, + "loss": 0.3642737865447998, + "mean_token_accuracy": 0.8842349052429199, + "num_tokens": 12794969.0, + "step": 1438 + }, + { + "epoch": 1.093465045592705, + "grad_norm": 1.5134642124176025, + "learning_rate": 3.7680939168254733e-06, + "loss": 0.3732057213783264, + "mean_token_accuracy": 0.8671083450317383, + "num_tokens": 12808480.0, + "step": 1439 + }, + { + "epoch": 1.094224924012158, + "grad_norm": 3.2103970050811768, + "learning_rate": 3.7662885092022206e-06, + "loss": 0.3556194603443146, + "mean_token_accuracy": 0.8786529302597046, + "num_tokens": 12812654.0, + "step": 1440 + }, + { + "epoch": 1.094984802431611, + "grad_norm": 2.2774064540863037, + "learning_rate": 3.7644822128489476e-06, + "loss": 0.38409674167633057, + "mean_token_accuracy": 0.866563081741333, + "num_tokens": 12819854.0, + "step": 1441 + }, + { + "epoch": 1.0957446808510638, + "grad_norm": 1.8250885009765625, + "learning_rate": 3.7626750290333824e-06, + "loss": 0.3812350034713745, + "mean_token_accuracy": 0.8676212430000305, + "num_tokens": 12830338.0, + "step": 1442 + }, + { + "epoch": 1.0965045592705167, + "grad_norm": 1.8337891101837158, + "learning_rate": 3.7608669590238765e-06, + "loss": 0.3892471194267273, + "mean_token_accuracy": 0.8616238832473755, + "num_tokens": 12840340.0, + "step": 1443 + }, + { + "epoch": 1.0972644376899696, + "grad_norm": 1.5300254821777344, + "learning_rate": 3.7590580040894025e-06, + "loss": 0.35288217663764954, + "mean_token_accuracy": 0.8625509738922119, + "num_tokens": 12853144.0, + "step": 1444 + }, + { + "epoch": 1.0980243161094225, + "grad_norm": 2.152683734893799, + "learning_rate": 3.7572481654995554e-06, + "loss": 0.4004772901535034, + "mean_token_accuracy": 0.858427107334137, + "num_tokens": 12859970.0, + "step": 1445 + }, + { + "epoch": 1.0987841945288754, + "grad_norm": 1.532832145690918, + "learning_rate": 3.755437444524548e-06, + "loss": 0.46820127964019775, + "mean_token_accuracy": 0.8585472106933594, + "num_tokens": 12875243.0, + "step": 1446 + }, + { + "epoch": 1.0995440729483283, + "grad_norm": 1.6485342979431152, + "learning_rate": 3.7536258424352164e-06, + "loss": 0.46329325437545776, + "mean_token_accuracy": 0.8376060724258423, + "num_tokens": 12886383.0, + "step": 1447 + }, + { + "epoch": 1.1003039513677813, + "grad_norm": 2.402256488800049, + "learning_rate": 3.75181336050301e-06, + "loss": 0.43916207551956177, + "mean_token_accuracy": 0.8448786735534668, + "num_tokens": 12892613.0, + "step": 1448 + }, + { + "epoch": 1.101063829787234, + "grad_norm": 1.3893651962280273, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.3919021785259247, + "mean_token_accuracy": 0.8495820760726929, + "num_tokens": 12905523.0, + "step": 1449 + }, + { + "epoch": 1.1018237082066868, + "grad_norm": 1.5519827604293823, + "learning_rate": 3.7481857621988734e-06, + "loss": 0.4710700809955597, + "mean_token_accuracy": 0.8387632369995117, + "num_tokens": 12918236.0, + "step": 1450 + }, + { + "epoch": 1.1025835866261398, + "grad_norm": 2.0141353607177734, + "learning_rate": 3.74637064837293e-06, + "loss": 0.30866751074790955, + "mean_token_accuracy": 0.9059321880340576, + "num_tokens": 12924391.0, + "step": 1451 + }, + { + "epoch": 1.1033434650455927, + "grad_norm": 1.2201496362686157, + "learning_rate": 3.7445546597960882e-06, + "loss": 0.3938257396221161, + "mean_token_accuracy": 0.8726630210876465, + "num_tokens": 12943338.0, + "step": 1452 + }, + { + "epoch": 1.1041033434650456, + "grad_norm": 2.29434871673584, + "learning_rate": 3.742737797742878e-06, + "loss": 0.4347776174545288, + "mean_token_accuracy": 0.840569257736206, + "num_tokens": 12950636.0, + "step": 1453 + }, + { + "epoch": 1.1048632218844985, + "grad_norm": 2.3875105381011963, + "learning_rate": 3.7409200634884425e-06, + "loss": 0.48353564739227295, + "mean_token_accuracy": 0.8207056522369385, + "num_tokens": 12957635.0, + "step": 1454 + }, + { + "epoch": 1.1056231003039514, + "grad_norm": 2.3539648056030273, + "learning_rate": 3.7391014583085384e-06, + "loss": 0.3532431721687317, + "mean_token_accuracy": 0.8903788924217224, + "num_tokens": 12963032.0, + "step": 1455 + }, + { + "epoch": 1.1063829787234043, + "grad_norm": 1.5611135959625244, + "learning_rate": 3.737281983479534e-06, + "loss": 0.4734863042831421, + "mean_token_accuracy": 0.8413879871368408, + "num_tokens": 12977170.0, + "step": 1456 + }, + { + "epoch": 1.1071428571428572, + "grad_norm": 1.474320411682129, + "learning_rate": 3.735461640278404e-06, + "loss": 0.41854286193847656, + "mean_token_accuracy": 0.8499876856803894, + "num_tokens": 12993750.0, + "step": 1457 + }, + { + "epoch": 1.1079027355623101, + "grad_norm": 2.6873273849487305, + "learning_rate": 3.733640429982738e-06, + "loss": 0.47637903690338135, + "mean_token_accuracy": 0.83599853515625, + "num_tokens": 12999058.0, + "step": 1458 + }, + { + "epoch": 1.108662613981763, + "grad_norm": 1.4575026035308838, + "learning_rate": 3.731818353870729e-06, + "loss": 0.38441652059555054, + "mean_token_accuracy": 0.8582364320755005, + "num_tokens": 13013864.0, + "step": 1459 + }, + { + "epoch": 1.1094224924012157, + "grad_norm": 1.7722690105438232, + "learning_rate": 3.729995413221183e-06, + "loss": 0.4224998950958252, + "mean_token_accuracy": 0.8511888384819031, + "num_tokens": 13023714.0, + "step": 1460 + }, + { + "epoch": 1.1101823708206686, + "grad_norm": 2.625760555267334, + "learning_rate": 3.7281716093135068e-06, + "loss": 0.3487582802772522, + "mean_token_accuracy": 0.8834779262542725, + "num_tokens": 13028608.0, + "step": 1461 + }, + { + "epoch": 1.1109422492401215, + "grad_norm": 1.2554056644439697, + "learning_rate": 3.726346943427719e-06, + "loss": 0.33312469720840454, + "mean_token_accuracy": 0.8704153299331665, + "num_tokens": 13044901.0, + "step": 1462 + }, + { + "epoch": 1.1117021276595744, + "grad_norm": 2.1109910011291504, + "learning_rate": 3.7245214168444388e-06, + "loss": 0.387290894985199, + "mean_token_accuracy": 0.860816240310669, + "num_tokens": 13051452.0, + "step": 1463 + }, + { + "epoch": 1.1124620060790273, + "grad_norm": 3.159201145172119, + "learning_rate": 3.722695030844891e-06, + "loss": 0.37690871953964233, + "mean_token_accuracy": 0.8717561960220337, + "num_tokens": 13055131.0, + "step": 1464 + }, + { + "epoch": 1.1132218844984803, + "grad_norm": 1.3810011148452759, + "learning_rate": 3.7208677867109042e-06, + "loss": 0.36598485708236694, + "mean_token_accuracy": 0.8683375120162964, + "num_tokens": 13069798.0, + "step": 1465 + }, + { + "epoch": 1.1139817629179332, + "grad_norm": 2.500849485397339, + "learning_rate": 3.7190396857249087e-06, + "loss": 0.2781746983528137, + "mean_token_accuracy": 0.9026005268096924, + "num_tokens": 13075127.0, + "step": 1466 + }, + { + "epoch": 1.114741641337386, + "grad_norm": 1.7445712089538574, + "learning_rate": 3.7172107291699356e-06, + "loss": 0.5055314302444458, + "mean_token_accuracy": 0.8252174258232117, + "num_tokens": 13084843.0, + "step": 1467 + }, + { + "epoch": 1.115501519756839, + "grad_norm": 1.6386256217956543, + "learning_rate": 3.7153809183296174e-06, + "loss": 0.38478314876556396, + "mean_token_accuracy": 0.8600847721099854, + "num_tokens": 13096517.0, + "step": 1468 + }, + { + "epoch": 1.1162613981762919, + "grad_norm": 2.3818395137786865, + "learning_rate": 3.713550254488185e-06, + "loss": 0.40308547019958496, + "mean_token_accuracy": 0.8628184795379639, + "num_tokens": 13102324.0, + "step": 1469 + }, + { + "epoch": 1.1170212765957448, + "grad_norm": 1.73163640499115, + "learning_rate": 3.7117187389304703e-06, + "loss": 0.5035421848297119, + "mean_token_accuracy": 0.8229597210884094, + "num_tokens": 13113763.0, + "step": 1470 + }, + { + "epoch": 1.1177811550151975, + "grad_norm": 3.147177219390869, + "learning_rate": 3.7098863729418997e-06, + "loss": 0.557449221611023, + "mean_token_accuracy": 0.8266849517822266, + "num_tokens": 13118849.0, + "step": 1471 + }, + { + "epoch": 1.1185410334346504, + "grad_norm": 1.5061391592025757, + "learning_rate": 3.7080531578085e-06, + "loss": 0.3759554922580719, + "mean_token_accuracy": 0.8541903495788574, + "num_tokens": 13131337.0, + "step": 1472 + }, + { + "epoch": 1.1193009118541033, + "grad_norm": 2.172346353530884, + "learning_rate": 3.7062190948168906e-06, + "loss": 0.41491609811782837, + "mean_token_accuracy": 0.8531454801559448, + "num_tokens": 13139767.0, + "step": 1473 + }, + { + "epoch": 1.1200607902735562, + "grad_norm": 2.1527154445648193, + "learning_rate": 3.7043841852542884e-06, + "loss": 0.4309239387512207, + "mean_token_accuracy": 0.8327745199203491, + "num_tokens": 13147210.0, + "step": 1474 + }, + { + "epoch": 1.1208206686930091, + "grad_norm": 1.8342832326889038, + "learning_rate": 3.7025484304085035e-06, + "loss": 0.34393298625946045, + "mean_token_accuracy": 0.8948153257369995, + "num_tokens": 13154831.0, + "step": 1475 + }, + { + "epoch": 1.121580547112462, + "grad_norm": 2.509291172027588, + "learning_rate": 3.7007118315679384e-06, + "loss": 0.4479471445083618, + "mean_token_accuracy": 0.8280234336853027, + "num_tokens": 13161040.0, + "step": 1476 + }, + { + "epoch": 1.122340425531915, + "grad_norm": 2.914710521697998, + "learning_rate": 3.6988743900215895e-06, + "loss": 0.3724832832813263, + "mean_token_accuracy": 0.863893985748291, + "num_tokens": 13164975.0, + "step": 1477 + }, + { + "epoch": 1.1231003039513678, + "grad_norm": 3.274808645248413, + "learning_rate": 3.6970361070590443e-06, + "loss": 0.4088161885738373, + "mean_token_accuracy": 0.8474822044372559, + "num_tokens": 13168826.0, + "step": 1478 + }, + { + "epoch": 1.1238601823708207, + "grad_norm": 2.861546277999878, + "learning_rate": 3.695196983970481e-06, + "loss": 0.45837992429733276, + "mean_token_accuracy": 0.8579759001731873, + "num_tokens": 13173794.0, + "step": 1479 + }, + { + "epoch": 1.1246200607902737, + "grad_norm": 1.9491597414016724, + "learning_rate": 3.6933570220466654e-06, + "loss": 0.4333910346031189, + "mean_token_accuracy": 0.8444236516952515, + "num_tokens": 13181598.0, + "step": 1480 + }, + { + "epoch": 1.1253799392097266, + "grad_norm": 1.329848051071167, + "learning_rate": 3.6915162225789546e-06, + "loss": 0.36404621601104736, + "mean_token_accuracy": 0.8694117069244385, + "num_tokens": 13196381.0, + "step": 1481 + }, + { + "epoch": 1.1261398176291793, + "grad_norm": 1.8854197263717651, + "learning_rate": 3.6896745868592924e-06, + "loss": 0.4085756838321686, + "mean_token_accuracy": 0.855188250541687, + "num_tokens": 13205236.0, + "step": 1482 + }, + { + "epoch": 1.1268996960486322, + "grad_norm": 3.01684832572937, + "learning_rate": 3.6878321161802106e-06, + "loss": 0.28105655312538147, + "mean_token_accuracy": 0.9009426236152649, + "num_tokens": 13209380.0, + "step": 1483 + }, + { + "epoch": 1.127659574468085, + "grad_norm": 1.8051308393478394, + "learning_rate": 3.685988811834823e-06, + "loss": 0.3314531147480011, + "mean_token_accuracy": 0.8805814385414124, + "num_tokens": 13217714.0, + "step": 1484 + }, + { + "epoch": 1.128419452887538, + "grad_norm": 1.61757493019104, + "learning_rate": 3.684144675116836e-06, + "loss": 0.4543863534927368, + "mean_token_accuracy": 0.8400536775588989, + "num_tokens": 13229330.0, + "step": 1485 + }, + { + "epoch": 1.1291793313069909, + "grad_norm": 1.602686882019043, + "learning_rate": 3.682299707320532e-06, + "loss": 0.3653204143047333, + "mean_token_accuracy": 0.8655825853347778, + "num_tokens": 13242872.0, + "step": 1486 + }, + { + "epoch": 1.1299392097264438, + "grad_norm": 2.3093113899230957, + "learning_rate": 3.680453909740782e-06, + "loss": 0.4383693039417267, + "mean_token_accuracy": 0.839782178401947, + "num_tokens": 13248976.0, + "step": 1487 + }, + { + "epoch": 1.1306990881458967, + "grad_norm": 1.180559754371643, + "learning_rate": 3.6786072836730376e-06, + "loss": 0.5354755520820618, + "mean_token_accuracy": 0.8151205778121948, + "num_tokens": 13272896.0, + "step": 1488 + }, + { + "epoch": 1.1314589665653496, + "grad_norm": 1.9554040431976318, + "learning_rate": 3.6767598304133325e-06, + "loss": 0.4485316872596741, + "mean_token_accuracy": 0.8399936556816101, + "num_tokens": 13280757.0, + "step": 1489 + }, + { + "epoch": 1.1322188449848025, + "grad_norm": 2.236471176147461, + "learning_rate": 3.674911551258279e-06, + "loss": 0.45594364404678345, + "mean_token_accuracy": 0.8552400469779968, + "num_tokens": 13287328.0, + "step": 1490 + }, + { + "epoch": 1.1329787234042552, + "grad_norm": 2.5228686332702637, + "learning_rate": 3.673062447505072e-06, + "loss": 0.4048641622066498, + "mean_token_accuracy": 0.8617376685142517, + "num_tokens": 13292716.0, + "step": 1491 + }, + { + "epoch": 1.1337386018237081, + "grad_norm": 1.1274473667144775, + "learning_rate": 3.6712125204514836e-06, + "loss": 0.3848876357078552, + "mean_token_accuracy": 0.8672975301742554, + "num_tokens": 13313403.0, + "step": 1492 + }, + { + "epoch": 1.134498480243161, + "grad_norm": 2.349541425704956, + "learning_rate": 3.6693617713958633e-06, + "loss": 0.3166058361530304, + "mean_token_accuracy": 0.8896721601486206, + "num_tokens": 13318720.0, + "step": 1493 + }, + { + "epoch": 1.135258358662614, + "grad_norm": 2.2438278198242188, + "learning_rate": 3.6675102016371387e-06, + "loss": 0.5418218970298767, + "mean_token_accuracy": 0.8256527185440063, + "num_tokens": 13325360.0, + "step": 1494 + }, + { + "epoch": 1.1360182370820668, + "grad_norm": 2.21268892288208, + "learning_rate": 3.665657812474812e-06, + "loss": 0.48603951930999756, + "mean_token_accuracy": 0.8273470401763916, + "num_tokens": 13333217.0, + "step": 1495 + }, + { + "epoch": 1.1367781155015197, + "grad_norm": 2.6105997562408447, + "learning_rate": 3.6638046052089614e-06, + "loss": 0.31221291422843933, + "mean_token_accuracy": 0.888375997543335, + "num_tokens": 13338413.0, + "step": 1496 + }, + { + "epoch": 1.1375379939209727, + "grad_norm": 3.655658483505249, + "learning_rate": 3.661950581140239e-06, + "loss": 0.3609023988246918, + "mean_token_accuracy": 0.8838576078414917, + "num_tokens": 13341499.0, + "step": 1497 + }, + { + "epoch": 1.1382978723404256, + "grad_norm": 2.242009162902832, + "learning_rate": 3.660095741569871e-06, + "loss": 0.40022802352905273, + "mean_token_accuracy": 0.8559960722923279, + "num_tokens": 13347917.0, + "step": 1498 + }, + { + "epoch": 1.1390577507598785, + "grad_norm": 1.7958979606628418, + "learning_rate": 3.658240087799655e-06, + "loss": 0.499157190322876, + "mean_token_accuracy": 0.8423802256584167, + "num_tokens": 13361570.0, + "step": 1499 + }, + { + "epoch": 1.1398176291793314, + "grad_norm": 2.5406908988952637, + "learning_rate": 3.6563836211319593e-06, + "loss": 0.4090137481689453, + "mean_token_accuracy": 0.8769663572311401, + "num_tokens": 13367183.0, + "step": 1500 + }, + { + "epoch": 1.1405775075987843, + "grad_norm": 1.9861716032028198, + "learning_rate": 3.654526342869724e-06, + "loss": 0.5125207304954529, + "mean_token_accuracy": 0.8315266370773315, + "num_tokens": 13376767.0, + "step": 1501 + }, + { + "epoch": 1.141337386018237, + "grad_norm": 1.731188178062439, + "learning_rate": 3.65266825431646e-06, + "loss": 0.39452576637268066, + "mean_token_accuracy": 0.8585706353187561, + "num_tokens": 13388437.0, + "step": 1502 + }, + { + "epoch": 1.1420972644376899, + "grad_norm": 1.5203773975372314, + "learning_rate": 3.6508093567762425e-06, + "loss": 0.39466819167137146, + "mean_token_accuracy": 0.8584027886390686, + "num_tokens": 13399727.0, + "step": 1503 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 2.606462001800537, + "learning_rate": 3.6489496515537204e-06, + "loss": 0.4521079361438751, + "mean_token_accuracy": 0.8413360118865967, + "num_tokens": 13408426.0, + "step": 1504 + }, + { + "epoch": 1.1436170212765957, + "grad_norm": 2.6207993030548096, + "learning_rate": 3.647089139954104e-06, + "loss": 0.4709353446960449, + "mean_token_accuracy": 0.8397113084793091, + "num_tokens": 13413506.0, + "step": 1505 + }, + { + "epoch": 1.1443768996960486, + "grad_norm": 1.7214165925979614, + "learning_rate": 3.6452278232831734e-06, + "loss": 0.45506367087364197, + "mean_token_accuracy": 0.8466023206710815, + "num_tokens": 13424592.0, + "step": 1506 + }, + { + "epoch": 1.1451367781155015, + "grad_norm": 1.7111759185791016, + "learning_rate": 3.643365702847272e-06, + "loss": 0.5016278624534607, + "mean_token_accuracy": 0.8196234703063965, + "num_tokens": 13434421.0, + "step": 1507 + }, + { + "epoch": 1.1458966565349544, + "grad_norm": 1.7528148889541626, + "learning_rate": 3.641502779953307e-06, + "loss": 0.5020896196365356, + "mean_token_accuracy": 0.826249361038208, + "num_tokens": 13445286.0, + "step": 1508 + }, + { + "epoch": 1.1466565349544073, + "grad_norm": 1.3470909595489502, + "learning_rate": 3.639639055908751e-06, + "loss": 0.45765724778175354, + "mean_token_accuracy": 0.8380560278892517, + "num_tokens": 13465030.0, + "step": 1509 + }, + { + "epoch": 1.1474164133738602, + "grad_norm": 2.4846835136413574, + "learning_rate": 3.6377745320216346e-06, + "loss": 0.46488267183303833, + "mean_token_accuracy": 0.8393925428390503, + "num_tokens": 13470883.0, + "step": 1510 + }, + { + "epoch": 1.1481762917933132, + "grad_norm": 1.770201563835144, + "learning_rate": 3.635909209600555e-06, + "loss": 0.5262179374694824, + "mean_token_accuracy": 0.8201162815093994, + "num_tokens": 13482558.0, + "step": 1511 + }, + { + "epoch": 1.148936170212766, + "grad_norm": 1.5955098867416382, + "learning_rate": 3.6340430899546656e-06, + "loss": 0.430621862411499, + "mean_token_accuracy": 0.8488553762435913, + "num_tokens": 13493003.0, + "step": 1512 + }, + { + "epoch": 1.1496960486322187, + "grad_norm": 2.846176862716675, + "learning_rate": 3.632176174393682e-06, + "loss": 0.23461638391017914, + "mean_token_accuracy": 0.9218817353248596, + "num_tokens": 13496566.0, + "step": 1513 + }, + { + "epoch": 1.1504559270516717, + "grad_norm": 1.9606610536575317, + "learning_rate": 3.630308464227877e-06, + "loss": 0.4940161108970642, + "mean_token_accuracy": 0.8474864959716797, + "num_tokens": 13504843.0, + "step": 1514 + }, + { + "epoch": 1.1512158054711246, + "grad_norm": 1.1588608026504517, + "learning_rate": 3.628439960768082e-06, + "loss": 0.32650992274284363, + "mean_token_accuracy": 0.8797246217727661, + "num_tokens": 13521513.0, + "step": 1515 + }, + { + "epoch": 1.1519756838905775, + "grad_norm": 1.3566495180130005, + "learning_rate": 3.6265706653256837e-06, + "loss": 0.4359064996242523, + "mean_token_accuracy": 0.8379859328269958, + "num_tokens": 13540608.0, + "step": 1516 + }, + { + "epoch": 1.1527355623100304, + "grad_norm": 1.4728609323501587, + "learning_rate": 3.624700579212626e-06, + "loss": 0.29939693212509155, + "mean_token_accuracy": 0.8831408023834229, + "num_tokens": 13550641.0, + "step": 1517 + }, + { + "epoch": 1.1534954407294833, + "grad_norm": 2.162325382232666, + "learning_rate": 3.6228297037414077e-06, + "loss": 0.4097636938095093, + "mean_token_accuracy": 0.8575425148010254, + "num_tokens": 13556931.0, + "step": 1518 + }, + { + "epoch": 1.1542553191489362, + "grad_norm": 1.754439353942871, + "learning_rate": 3.6209580402250816e-06, + "loss": 0.400202214717865, + "mean_token_accuracy": 0.8569821119308472, + "num_tokens": 13565491.0, + "step": 1519 + }, + { + "epoch": 1.155015197568389, + "grad_norm": 1.5250083208084106, + "learning_rate": 3.619085589977251e-06, + "loss": 0.43330419063568115, + "mean_token_accuracy": 0.8492985963821411, + "num_tokens": 13577147.0, + "step": 1520 + }, + { + "epoch": 1.155775075987842, + "grad_norm": 1.9108905792236328, + "learning_rate": 3.617212354312076e-06, + "loss": 0.30567464232444763, + "mean_token_accuracy": 0.8850164413452148, + "num_tokens": 13584366.0, + "step": 1521 + }, + { + "epoch": 1.156534954407295, + "grad_norm": 2.2574243545532227, + "learning_rate": 3.615338334544265e-06, + "loss": 0.4391738772392273, + "mean_token_accuracy": 0.839765727519989, + "num_tokens": 13591816.0, + "step": 1522 + }, + { + "epoch": 1.1572948328267478, + "grad_norm": 2.1235218048095703, + "learning_rate": 3.6134635319890763e-06, + "loss": 0.45043107867240906, + "mean_token_accuracy": 0.8385299444198608, + "num_tokens": 13599736.0, + "step": 1523 + }, + { + "epoch": 1.1580547112462005, + "grad_norm": 2.2274110317230225, + "learning_rate": 3.611587947962319e-06, + "loss": 0.3623226284980774, + "mean_token_accuracy": 0.8724044561386108, + "num_tokens": 13605354.0, + "step": 1524 + }, + { + "epoch": 1.1588145896656534, + "grad_norm": 3.414236545562744, + "learning_rate": 3.6097115837803504e-06, + "loss": 0.30060696601867676, + "mean_token_accuracy": 0.8971061706542969, + "num_tokens": 13608851.0, + "step": 1525 + }, + { + "epoch": 1.1595744680851063, + "grad_norm": 2.496264696121216, + "learning_rate": 3.6078344407600744e-06, + "loss": 0.3567180037498474, + "mean_token_accuracy": 0.8596180081367493, + "num_tokens": 13614339.0, + "step": 1526 + }, + { + "epoch": 1.1603343465045592, + "grad_norm": 2.0191843509674072, + "learning_rate": 3.6059565202189433e-06, + "loss": 0.43206095695495605, + "mean_token_accuracy": 0.8464000821113586, + "num_tokens": 13622395.0, + "step": 1527 + }, + { + "epoch": 1.1610942249240122, + "grad_norm": 1.5475906133651733, + "learning_rate": 3.604077823474954e-06, + "loss": 0.4535648226737976, + "mean_token_accuracy": 0.8391586542129517, + "num_tokens": 13635356.0, + "step": 1528 + }, + { + "epoch": 1.161854103343465, + "grad_norm": 2.1348211765289307, + "learning_rate": 3.6021983518466468e-06, + "loss": 0.2733963429927826, + "mean_token_accuracy": 0.9007417559623718, + "num_tokens": 13640641.0, + "step": 1529 + }, + { + "epoch": 1.162613981762918, + "grad_norm": 2.8452792167663574, + "learning_rate": 3.600318106653108e-06, + "loss": 0.29591235518455505, + "mean_token_accuracy": 0.8934413194656372, + "num_tokens": 13644995.0, + "step": 1530 + }, + { + "epoch": 1.1633738601823709, + "grad_norm": 2.342907190322876, + "learning_rate": 3.5984370892139663e-06, + "loss": 0.4675130248069763, + "mean_token_accuracy": 0.8352028131484985, + "num_tokens": 13652695.0, + "step": 1531 + }, + { + "epoch": 1.1641337386018238, + "grad_norm": 2.3480238914489746, + "learning_rate": 3.5965553008493924e-06, + "loss": 0.3114515542984009, + "mean_token_accuracy": 0.8845353126525879, + "num_tokens": 13658101.0, + "step": 1532 + }, + { + "epoch": 1.1648936170212765, + "grad_norm": 1.8608155250549316, + "learning_rate": 3.594672742880097e-06, + "loss": 0.3864145278930664, + "mean_token_accuracy": 0.867354154586792, + "num_tokens": 13666042.0, + "step": 1533 + }, + { + "epoch": 1.1656534954407296, + "grad_norm": 1.4756088256835938, + "learning_rate": 3.5927894166273324e-06, + "loss": 0.3671600818634033, + "mean_token_accuracy": 0.8695988655090332, + "num_tokens": 13678253.0, + "step": 1534 + }, + { + "epoch": 1.1664133738601823, + "grad_norm": 2.8831355571746826, + "learning_rate": 3.5909053234128893e-06, + "loss": 0.267184317111969, + "mean_token_accuracy": 0.9008115530014038, + "num_tokens": 13681790.0, + "step": 1535 + }, + { + "epoch": 1.1671732522796352, + "grad_norm": 2.1984763145446777, + "learning_rate": 3.5890204645590964e-06, + "loss": 0.4431505799293518, + "mean_token_accuracy": 0.8623673915863037, + "num_tokens": 13688444.0, + "step": 1536 + }, + { + "epoch": 1.167933130699088, + "grad_norm": 1.8271523714065552, + "learning_rate": 3.5871348413888207e-06, + "loss": 0.3861040771007538, + "mean_token_accuracy": 0.8624277114868164, + "num_tokens": 13696872.0, + "step": 1537 + }, + { + "epoch": 1.168693009118541, + "grad_norm": 1.6313756704330444, + "learning_rate": 3.585248455225466e-06, + "loss": 0.3775154948234558, + "mean_token_accuracy": 0.8624461889266968, + "num_tokens": 13706167.0, + "step": 1538 + }, + { + "epoch": 1.169452887537994, + "grad_norm": 2.4377901554107666, + "learning_rate": 3.5833613073929684e-06, + "loss": 0.2308957427740097, + "mean_token_accuracy": 0.920600175857544, + "num_tokens": 13710367.0, + "step": 1539 + }, + { + "epoch": 1.1702127659574468, + "grad_norm": 2.2621750831604004, + "learning_rate": 3.5814733992158025e-06, + "loss": 0.33167219161987305, + "mean_token_accuracy": 0.8963261842727661, + "num_tokens": 13716384.0, + "step": 1540 + }, + { + "epoch": 1.1709726443768997, + "grad_norm": 1.3178150653839111, + "learning_rate": 3.579584732018975e-06, + "loss": 0.3276631832122803, + "mean_token_accuracy": 0.8853521347045898, + "num_tokens": 13731031.0, + "step": 1541 + }, + { + "epoch": 1.1717325227963526, + "grad_norm": 2.177750587463379, + "learning_rate": 3.577695307128024e-06, + "loss": 0.48177266120910645, + "mean_token_accuracy": 0.830329418182373, + "num_tokens": 13737925.0, + "step": 1542 + }, + { + "epoch": 1.1724924012158056, + "grad_norm": 2.2268829345703125, + "learning_rate": 3.5758051258690223e-06, + "loss": 0.48843517899513245, + "mean_token_accuracy": 0.8310644030570984, + "num_tokens": 13746039.0, + "step": 1543 + }, + { + "epoch": 1.1732522796352582, + "grad_norm": 1.498701572418213, + "learning_rate": 3.5739141895685708e-06, + "loss": 0.4542962312698364, + "mean_token_accuracy": 0.8500330448150635, + "num_tokens": 13765002.0, + "step": 1544 + }, + { + "epoch": 1.1740121580547112, + "grad_norm": 1.786670446395874, + "learning_rate": 3.5720224995538023e-06, + "loss": 0.27367928624153137, + "mean_token_accuracy": 0.8916142582893372, + "num_tokens": 13774113.0, + "step": 1545 + }, + { + "epoch": 1.174772036474164, + "grad_norm": 2.0311272144317627, + "learning_rate": 3.5701300571523757e-06, + "loss": 0.559987485408783, + "mean_token_accuracy": 0.8266973495483398, + "num_tokens": 13783912.0, + "step": 1546 + }, + { + "epoch": 1.175531914893617, + "grad_norm": 1.8732186555862427, + "learning_rate": 3.5682368636924825e-06, + "loss": 0.5184751152992249, + "mean_token_accuracy": 0.8450918197631836, + "num_tokens": 13792728.0, + "step": 1547 + }, + { + "epoch": 1.1762917933130699, + "grad_norm": 1.4410661458969116, + "learning_rate": 3.566342920502837e-06, + "loss": 0.383536696434021, + "mean_token_accuracy": 0.8672217726707458, + "num_tokens": 13813590.0, + "step": 1548 + }, + { + "epoch": 1.1770516717325228, + "grad_norm": 3.06056547164917, + "learning_rate": 3.564448228912682e-06, + "loss": 0.3941686153411865, + "mean_token_accuracy": 0.8696402311325073, + "num_tokens": 13817704.0, + "step": 1549 + }, + { + "epoch": 1.1778115501519757, + "grad_norm": 1.6150329113006592, + "learning_rate": 3.562552790251785e-06, + "loss": 0.41606605052948, + "mean_token_accuracy": 0.8488572835922241, + "num_tokens": 13831303.0, + "step": 1550 + }, + { + "epoch": 1.1785714285714286, + "grad_norm": 2.1199934482574463, + "learning_rate": 3.5606566058504377e-06, + "loss": 0.3974752426147461, + "mean_token_accuracy": 0.8686345219612122, + "num_tokens": 13837613.0, + "step": 1551 + }, + { + "epoch": 1.1793313069908815, + "grad_norm": 1.5683876276016235, + "learning_rate": 3.558759677039455e-06, + "loss": 0.35225993394851685, + "mean_token_accuracy": 0.8710784316062927, + "num_tokens": 13846779.0, + "step": 1552 + }, + { + "epoch": 1.1800911854103344, + "grad_norm": 1.4644675254821777, + "learning_rate": 3.5568620051501755e-06, + "loss": 0.38400042057037354, + "mean_token_accuracy": 0.8548328876495361, + "num_tokens": 13860713.0, + "step": 1553 + }, + { + "epoch": 1.1808510638297873, + "grad_norm": 1.461491346359253, + "learning_rate": 3.5549635915144578e-06, + "loss": 0.4572640061378479, + "mean_token_accuracy": 0.8506045937538147, + "num_tokens": 13877289.0, + "step": 1554 + }, + { + "epoch": 1.18161094224924, + "grad_norm": 2.6364715099334717, + "learning_rate": 3.553064437464682e-06, + "loss": 0.3954341411590576, + "mean_token_accuracy": 0.8561649322509766, + "num_tokens": 13882064.0, + "step": 1555 + }, + { + "epoch": 1.182370820668693, + "grad_norm": 2.027273654937744, + "learning_rate": 3.551164544333745e-06, + "loss": 0.47625732421875, + "mean_token_accuracy": 0.8349384069442749, + "num_tokens": 13890306.0, + "step": 1556 + }, + { + "epoch": 1.1831306990881458, + "grad_norm": 2.8427743911743164, + "learning_rate": 3.549263913455069e-06, + "loss": 0.4273033142089844, + "mean_token_accuracy": 0.8541387319564819, + "num_tokens": 13894882.0, + "step": 1557 + }, + { + "epoch": 1.1838905775075987, + "grad_norm": 1.6298975944519043, + "learning_rate": 3.5473625461625884e-06, + "loss": 0.4378639757633209, + "mean_token_accuracy": 0.8634963631629944, + "num_tokens": 13906152.0, + "step": 1558 + }, + { + "epoch": 1.1846504559270516, + "grad_norm": 2.4098947048187256, + "learning_rate": 3.5454604437907535e-06, + "loss": 0.47236716747283936, + "mean_token_accuracy": 0.8646864891052246, + "num_tokens": 13911803.0, + "step": 1559 + }, + { + "epoch": 1.1854103343465046, + "grad_norm": 1.5972497463226318, + "learning_rate": 3.543557607674537e-06, + "loss": 0.3001407980918884, + "mean_token_accuracy": 0.8927055597305298, + "num_tokens": 13921304.0, + "step": 1560 + }, + { + "epoch": 1.1861702127659575, + "grad_norm": 2.1140005588531494, + "learning_rate": 3.54165403914942e-06, + "loss": 0.41898271441459656, + "mean_token_accuracy": 0.8542245626449585, + "num_tokens": 13929434.0, + "step": 1561 + }, + { + "epoch": 1.1869300911854104, + "grad_norm": 1.8733803033828735, + "learning_rate": 3.539749739551401e-06, + "loss": 0.35469961166381836, + "mean_token_accuracy": 0.8805290460586548, + "num_tokens": 13937781.0, + "step": 1562 + }, + { + "epoch": 1.1876899696048633, + "grad_norm": 2.2805802822113037, + "learning_rate": 3.53784471021699e-06, + "loss": 0.44496792554855347, + "mean_token_accuracy": 0.8454172611236572, + "num_tokens": 13944394.0, + "step": 1563 + }, + { + "epoch": 1.1884498480243162, + "grad_norm": 0.9728449583053589, + "learning_rate": 3.535938952483211e-06, + "loss": 0.3156968355178833, + "mean_token_accuracy": 0.8739837408065796, + "num_tokens": 13966712.0, + "step": 1564 + }, + { + "epoch": 1.189209726443769, + "grad_norm": 3.025338888168335, + "learning_rate": 3.534032467687597e-06, + "loss": 0.30036938190460205, + "mean_token_accuracy": 0.9058252573013306, + "num_tokens": 13970183.0, + "step": 1565 + }, + { + "epoch": 1.1899696048632218, + "grad_norm": 2.0659425258636475, + "learning_rate": 3.532125257168193e-06, + "loss": 0.30619731545448303, + "mean_token_accuracy": 0.9041587710380554, + "num_tokens": 13976657.0, + "step": 1566 + }, + { + "epoch": 1.1907294832826747, + "grad_norm": 3.2036776542663574, + "learning_rate": 3.5302173222635526e-06, + "loss": 0.4145944118499756, + "mean_token_accuracy": 0.8502328395843506, + "num_tokens": 13981198.0, + "step": 1567 + }, + { + "epoch": 1.1914893617021276, + "grad_norm": 1.7767539024353027, + "learning_rate": 3.5283086643127396e-06, + "loss": 0.437128484249115, + "mean_token_accuracy": 0.8965631723403931, + "num_tokens": 13990259.0, + "step": 1568 + }, + { + "epoch": 1.1922492401215805, + "grad_norm": 1.7777384519577026, + "learning_rate": 3.5263992846553203e-06, + "loss": 0.33831220865249634, + "mean_token_accuracy": 0.8734279870986938, + "num_tokens": 13999363.0, + "step": 1569 + }, + { + "epoch": 1.1930091185410334, + "grad_norm": 1.6710708141326904, + "learning_rate": 3.5244891846313733e-06, + "loss": 0.4005590081214905, + "mean_token_accuracy": 0.8820298314094543, + "num_tokens": 14008719.0, + "step": 1570 + }, + { + "epoch": 1.1937689969604863, + "grad_norm": 1.0378777980804443, + "learning_rate": 3.5225783655814798e-06, + "loss": 0.3174915313720703, + "mean_token_accuracy": 0.8894162774085999, + "num_tokens": 14025806.0, + "step": 1571 + }, + { + "epoch": 1.1945288753799392, + "grad_norm": 1.2647521495819092, + "learning_rate": 3.520666828846726e-06, + "loss": 0.4173050820827484, + "mean_token_accuracy": 0.8437265157699585, + "num_tokens": 14046445.0, + "step": 1572 + }, + { + "epoch": 1.1952887537993921, + "grad_norm": 2.8625528812408447, + "learning_rate": 3.518754575768702e-06, + "loss": 0.37182557582855225, + "mean_token_accuracy": 0.8660947680473328, + "num_tokens": 14051197.0, + "step": 1573 + }, + { + "epoch": 1.196048632218845, + "grad_norm": 1.1213171482086182, + "learning_rate": 3.516841607689501e-06, + "loss": 0.332731157541275, + "mean_token_accuracy": 0.8573278784751892, + "num_tokens": 14070817.0, + "step": 1574 + }, + { + "epoch": 1.196808510638298, + "grad_norm": 1.197508692741394, + "learning_rate": 3.5149279259517165e-06, + "loss": 0.34058472514152527, + "mean_token_accuracy": 0.8603571653366089, + "num_tokens": 14085301.0, + "step": 1575 + }, + { + "epoch": 1.1975683890577509, + "grad_norm": 4.019949913024902, + "learning_rate": 3.5130135318984454e-06, + "loss": 0.3094622492790222, + "mean_token_accuracy": 0.8905094861984253, + "num_tokens": 14088107.0, + "step": 1576 + }, + { + "epoch": 1.1983282674772036, + "grad_norm": 2.591181755065918, + "learning_rate": 3.5110984268732827e-06, + "loss": 0.3407078981399536, + "mean_token_accuracy": 0.880385160446167, + "num_tokens": 14092887.0, + "step": 1577 + }, + { + "epoch": 1.1990881458966565, + "grad_norm": 1.3069331645965576, + "learning_rate": 3.509182612220322e-06, + "loss": 0.3761988878250122, + "mean_token_accuracy": 0.862013041973114, + "num_tokens": 14109216.0, + "step": 1578 + }, + { + "epoch": 1.1998480243161094, + "grad_norm": 1.7802022695541382, + "learning_rate": 3.507266089284157e-06, + "loss": 0.3824652135372162, + "mean_token_accuracy": 0.8707721829414368, + "num_tokens": 14119645.0, + "step": 1579 + }, + { + "epoch": 1.2006079027355623, + "grad_norm": 2.7937185764312744, + "learning_rate": 3.5053488594098763e-06, + "loss": 0.33828890323638916, + "mean_token_accuracy": 0.8765541315078735, + "num_tokens": 14124628.0, + "step": 1580 + }, + { + "epoch": 1.2013677811550152, + "grad_norm": 1.892671823501587, + "learning_rate": 3.5034309239430664e-06, + "loss": 0.3476094603538513, + "mean_token_accuracy": 0.9053795337677002, + "num_tokens": 14131756.0, + "step": 1581 + }, + { + "epoch": 1.202127659574468, + "grad_norm": 1.6857695579528809, + "learning_rate": 3.501512284229807e-06, + "loss": 0.5397108793258667, + "mean_token_accuracy": 0.8173421025276184, + "num_tokens": 14143024.0, + "step": 1582 + }, + { + "epoch": 1.202887537993921, + "grad_norm": 2.501737117767334, + "learning_rate": 3.4995929416166756e-06, + "loss": 0.4192458391189575, + "mean_token_accuracy": 0.8558136224746704, + "num_tokens": 14149499.0, + "step": 1583 + }, + { + "epoch": 1.203647416413374, + "grad_norm": 2.0133907794952393, + "learning_rate": 3.4976728974507387e-06, + "loss": 0.4791576564311981, + "mean_token_accuracy": 0.8253597021102905, + "num_tokens": 14158381.0, + "step": 1584 + }, + { + "epoch": 1.2044072948328268, + "grad_norm": 2.984611988067627, + "learning_rate": 3.4957521530795576e-06, + "loss": 0.3040750026702881, + "mean_token_accuracy": 0.8902391791343689, + "num_tokens": 14162419.0, + "step": 1585 + }, + { + "epoch": 1.2051671732522795, + "grad_norm": 1.518591284751892, + "learning_rate": 3.493830709851185e-06, + "loss": 0.35539618134498596, + "mean_token_accuracy": 0.8737183809280396, + "num_tokens": 14173048.0, + "step": 1586 + }, + { + "epoch": 1.2059270516717326, + "grad_norm": 2.628758192062378, + "learning_rate": 3.4919085691141636e-06, + "loss": 0.33340200781822205, + "mean_token_accuracy": 0.8705098628997803, + "num_tokens": 14178255.0, + "step": 1587 + }, + { + "epoch": 1.2066869300911853, + "grad_norm": 2.5565974712371826, + "learning_rate": 3.4899857322175252e-06, + "loss": 0.44939476251602173, + "mean_token_accuracy": 0.8315504193305969, + "num_tokens": 14183808.0, + "step": 1588 + }, + { + "epoch": 1.2074468085106382, + "grad_norm": 1.7521045207977295, + "learning_rate": 3.4880622005107916e-06, + "loss": 0.3168621063232422, + "mean_token_accuracy": 0.8824669122695923, + "num_tokens": 14192186.0, + "step": 1589 + }, + { + "epoch": 1.2082066869300911, + "grad_norm": 1.9816104173660278, + "learning_rate": 3.486137975343971e-06, + "loss": 0.3892582058906555, + "mean_token_accuracy": 0.8524188995361328, + "num_tokens": 14200512.0, + "step": 1590 + }, + { + "epoch": 1.208966565349544, + "grad_norm": 1.459800124168396, + "learning_rate": 3.484213058067559e-06, + "loss": 0.45930033922195435, + "mean_token_accuracy": 0.8408471345901489, + "num_tokens": 14215232.0, + "step": 1591 + }, + { + "epoch": 1.209726443768997, + "grad_norm": 2.015493154525757, + "learning_rate": 3.482287450032536e-06, + "loss": 0.5514016151428223, + "mean_token_accuracy": 0.8456779718399048, + "num_tokens": 14225402.0, + "step": 1592 + }, + { + "epoch": 1.2104863221884499, + "grad_norm": 3.4511911869049072, + "learning_rate": 3.4803611525903687e-06, + "loss": 0.4772771894931793, + "mean_token_accuracy": 0.8558698892593384, + "num_tokens": 14229038.0, + "step": 1593 + }, + { + "epoch": 1.2112462006079028, + "grad_norm": 2.2247982025146484, + "learning_rate": 3.4784341670930067e-06, + "loss": 0.4042825996875763, + "mean_token_accuracy": 0.8635870218276978, + "num_tokens": 14237057.0, + "step": 1594 + }, + { + "epoch": 1.2120060790273557, + "grad_norm": 2.0534820556640625, + "learning_rate": 3.4765064948928813e-06, + "loss": 0.34057414531707764, + "mean_token_accuracy": 0.8800770044326782, + "num_tokens": 14243013.0, + "step": 1595 + }, + { + "epoch": 1.2127659574468086, + "grad_norm": 2.594703197479248, + "learning_rate": 3.474578137342909e-06, + "loss": 0.4997410774230957, + "mean_token_accuracy": 0.8302106261253357, + "num_tokens": 14251210.0, + "step": 1596 + }, + { + "epoch": 1.2135258358662613, + "grad_norm": 2.517833948135376, + "learning_rate": 3.4726490957964836e-06, + "loss": 0.3630390465259552, + "mean_token_accuracy": 0.8679884672164917, + "num_tokens": 14255893.0, + "step": 1597 + }, + { + "epoch": 1.2142857142857142, + "grad_norm": 1.5177065134048462, + "learning_rate": 3.4707193716074816e-06, + "loss": 0.36218544840812683, + "mean_token_accuracy": 0.879178524017334, + "num_tokens": 14268143.0, + "step": 1598 + }, + { + "epoch": 1.215045592705167, + "grad_norm": 2.215291738510132, + "learning_rate": 3.4687889661302577e-06, + "loss": 0.4166645407676697, + "mean_token_accuracy": 0.8495793342590332, + "num_tokens": 14276794.0, + "step": 1599 + }, + { + "epoch": 1.21580547112462, + "grad_norm": 1.534294843673706, + "learning_rate": 3.466857880719645e-06, + "loss": 0.2635883092880249, + "mean_token_accuracy": 0.8971712589263916, + "num_tokens": 14287000.0, + "step": 1600 + }, + { + "epoch": 1.216565349544073, + "grad_norm": 1.2338658571243286, + "learning_rate": 3.464926116730953e-06, + "loss": 0.339110404253006, + "mean_token_accuracy": 0.895592987537384, + "num_tokens": 14303217.0, + "step": 1601 + }, + { + "epoch": 1.2173252279635258, + "grad_norm": 1.8717178106307983, + "learning_rate": 3.462993675519968e-06, + "loss": 0.41204726696014404, + "mean_token_accuracy": 0.8560728430747986, + "num_tokens": 14311372.0, + "step": 1602 + }, + { + "epoch": 1.2180851063829787, + "grad_norm": 2.844160795211792, + "learning_rate": 3.4610605584429526e-06, + "loss": 0.4129520058631897, + "mean_token_accuracy": 0.8555002212524414, + "num_tokens": 14316244.0, + "step": 1603 + }, + { + "epoch": 1.2188449848024316, + "grad_norm": 1.099926471710205, + "learning_rate": 3.4591267668566412e-06, + "loss": 0.35783132910728455, + "mean_token_accuracy": 0.8693175315856934, + "num_tokens": 14338414.0, + "step": 1604 + }, + { + "epoch": 1.2196048632218845, + "grad_norm": 1.6448384523391724, + "learning_rate": 3.457192302118244e-06, + "loss": 0.42060258984565735, + "mean_token_accuracy": 0.8557323217391968, + "num_tokens": 14349143.0, + "step": 1605 + }, + { + "epoch": 1.2203647416413375, + "grad_norm": 2.097529888153076, + "learning_rate": 3.455257165585444e-06, + "loss": 0.5227499008178711, + "mean_token_accuracy": 0.828961968421936, + "num_tokens": 14360032.0, + "step": 1606 + }, + { + "epoch": 1.2211246200607904, + "grad_norm": 1.602988600730896, + "learning_rate": 3.453321358616393e-06, + "loss": 0.3537187874317169, + "mean_token_accuracy": 0.8776708841323853, + "num_tokens": 14370005.0, + "step": 1607 + }, + { + "epoch": 1.221884498480243, + "grad_norm": 2.358971357345581, + "learning_rate": 3.4513848825697145e-06, + "loss": 0.3448919653892517, + "mean_token_accuracy": 0.8887944221496582, + "num_tokens": 14375718.0, + "step": 1608 + }, + { + "epoch": 1.222644376899696, + "grad_norm": 1.72306227684021, + "learning_rate": 3.4494477388045035e-06, + "loss": 0.36985084414482117, + "mean_token_accuracy": 0.859595537185669, + "num_tokens": 14385016.0, + "step": 1609 + }, + { + "epoch": 1.2234042553191489, + "grad_norm": 1.5494085550308228, + "learning_rate": 3.4475099286803204e-06, + "loss": 0.49003708362579346, + "mean_token_accuracy": 0.8701964616775513, + "num_tokens": 14399277.0, + "step": 1610 + }, + { + "epoch": 1.2241641337386018, + "grad_norm": 2.6874046325683594, + "learning_rate": 3.445571453557196e-06, + "loss": 0.3424490690231323, + "mean_token_accuracy": 0.8835943937301636, + "num_tokens": 14404182.0, + "step": 1611 + }, + { + "epoch": 1.2249240121580547, + "grad_norm": 2.2163190841674805, + "learning_rate": 3.443632314795627e-06, + "loss": 0.40944457054138184, + "mean_token_accuracy": 0.8649888038635254, + "num_tokens": 14410158.0, + "step": 1612 + }, + { + "epoch": 1.2256838905775076, + "grad_norm": 2.7961158752441406, + "learning_rate": 3.4416925137565756e-06, + "loss": 0.17890746891498566, + "mean_token_accuracy": 0.9439430832862854, + "num_tokens": 14413285.0, + "step": 1613 + }, + { + "epoch": 1.2264437689969605, + "grad_norm": 1.421451210975647, + "learning_rate": 3.439752051801467e-06, + "loss": 0.33948683738708496, + "mean_token_accuracy": 0.8754585981369019, + "num_tokens": 14424674.0, + "step": 1614 + }, + { + "epoch": 1.2272036474164134, + "grad_norm": 2.105196237564087, + "learning_rate": 3.4378109302921946e-06, + "loss": 0.40009379386901855, + "mean_token_accuracy": 0.8600341081619263, + "num_tokens": 14432400.0, + "step": 1615 + }, + { + "epoch": 1.2279635258358663, + "grad_norm": 2.004122734069824, + "learning_rate": 3.4358691505911105e-06, + "loss": 0.46013444662094116, + "mean_token_accuracy": 0.8400925993919373, + "num_tokens": 14440741.0, + "step": 1616 + }, + { + "epoch": 1.2287234042553192, + "grad_norm": 1.8407535552978516, + "learning_rate": 3.4339267140610317e-06, + "loss": 0.38828906416893005, + "mean_token_accuracy": 0.8582802414894104, + "num_tokens": 14448698.0, + "step": 1617 + }, + { + "epoch": 1.2294832826747721, + "grad_norm": 2.4285924434661865, + "learning_rate": 3.4319836220652334e-06, + "loss": 0.3109283447265625, + "mean_token_accuracy": 0.8888344764709473, + "num_tokens": 14453674.0, + "step": 1618 + }, + { + "epoch": 1.2302431610942248, + "grad_norm": 1.6322550773620605, + "learning_rate": 3.430039875967454e-06, + "loss": 0.5222204327583313, + "mean_token_accuracy": 0.825019121170044, + "num_tokens": 14465736.0, + "step": 1619 + }, + { + "epoch": 1.2310030395136777, + "grad_norm": 2.307573080062866, + "learning_rate": 3.428095477131888e-06, + "loss": 0.29477375745773315, + "mean_token_accuracy": 0.8899064660072327, + "num_tokens": 14471266.0, + "step": 1620 + }, + { + "epoch": 1.2317629179331306, + "grad_norm": 1.8044531345367432, + "learning_rate": 3.4261504269231904e-06, + "loss": 0.4883342981338501, + "mean_token_accuracy": 0.8310165405273438, + "num_tokens": 14481679.0, + "step": 1621 + }, + { + "epoch": 1.2325227963525835, + "grad_norm": 2.7585411071777344, + "learning_rate": 3.4242047267064714e-06, + "loss": 0.45369645953178406, + "mean_token_accuracy": 0.8432134985923767, + "num_tokens": 14487299.0, + "step": 1622 + }, + { + "epoch": 1.2332826747720365, + "grad_norm": 2.687490701675415, + "learning_rate": 3.4222583778472997e-06, + "loss": 0.5627540349960327, + "mean_token_accuracy": 0.8186438083648682, + "num_tokens": 14494254.0, + "step": 1623 + }, + { + "epoch": 1.2340425531914894, + "grad_norm": 2.622443199157715, + "learning_rate": 3.4203113817116955e-06, + "loss": 0.28697147965431213, + "mean_token_accuracy": 0.8861737847328186, + "num_tokens": 14498632.0, + "step": 1624 + }, + { + "epoch": 1.2348024316109423, + "grad_norm": 2.6943359375, + "learning_rate": 3.4183637396661372e-06, + "loss": 0.25273287296295166, + "mean_token_accuracy": 0.9104914665222168, + "num_tokens": 14502797.0, + "step": 1625 + }, + { + "epoch": 1.2355623100303952, + "grad_norm": 2.428189992904663, + "learning_rate": 3.4164154530775552e-06, + "loss": 0.4213451147079468, + "mean_token_accuracy": 0.851524293422699, + "num_tokens": 14508503.0, + "step": 1626 + }, + { + "epoch": 1.236322188449848, + "grad_norm": 2.1722824573516846, + "learning_rate": 3.4144665233133318e-06, + "loss": 0.35238856077194214, + "mean_token_accuracy": 0.8730837106704712, + "num_tokens": 14516126.0, + "step": 1627 + }, + { + "epoch": 1.237082066869301, + "grad_norm": 2.291365146636963, + "learning_rate": 3.4125169517413005e-06, + "loss": 0.43963465094566345, + "mean_token_accuracy": 0.8525444865226746, + "num_tokens": 14522507.0, + "step": 1628 + }, + { + "epoch": 1.237841945288754, + "grad_norm": 1.6181648969650269, + "learning_rate": 3.410566739729746e-06, + "loss": 0.2799680233001709, + "mean_token_accuracy": 0.8915654420852661, + "num_tokens": 14531025.0, + "step": 1629 + }, + { + "epoch": 1.2386018237082066, + "grad_norm": 1.4039218425750732, + "learning_rate": 3.408615888647402e-06, + "loss": 0.29756587743759155, + "mean_token_accuracy": 0.8951715230941772, + "num_tokens": 14543770.0, + "step": 1630 + }, + { + "epoch": 1.2393617021276595, + "grad_norm": 2.148325204849243, + "learning_rate": 3.4066643998634506e-06, + "loss": 0.3983418345451355, + "mean_token_accuracy": 0.8635951280593872, + "num_tokens": 14550896.0, + "step": 1631 + }, + { + "epoch": 1.2401215805471124, + "grad_norm": 1.5225859880447388, + "learning_rate": 3.4047122747475227e-06, + "loss": 0.3247569799423218, + "mean_token_accuracy": 0.8727027177810669, + "num_tokens": 14562181.0, + "step": 1632 + }, + { + "epoch": 1.2408814589665653, + "grad_norm": 3.99835467338562, + "learning_rate": 3.402759514669694e-06, + "loss": 0.4317352771759033, + "mean_token_accuracy": 0.8488142490386963, + "num_tokens": 14565521.0, + "step": 1633 + }, + { + "epoch": 1.2416413373860182, + "grad_norm": 1.7306902408599854, + "learning_rate": 3.4008061210004872e-06, + "loss": 0.389854371547699, + "mean_token_accuracy": 0.8553084135055542, + "num_tokens": 14574633.0, + "step": 1634 + }, + { + "epoch": 1.2424012158054711, + "grad_norm": 2.3614673614501953, + "learning_rate": 3.3988520951108683e-06, + "loss": 0.3150152564048767, + "mean_token_accuracy": 0.8865959644317627, + "num_tokens": 14580240.0, + "step": 1635 + }, + { + "epoch": 1.243161094224924, + "grad_norm": 1.5625747442245483, + "learning_rate": 3.3968974383722497e-06, + "loss": 0.43160033226013184, + "mean_token_accuracy": 0.840155839920044, + "num_tokens": 14594255.0, + "step": 1636 + }, + { + "epoch": 1.243920972644377, + "grad_norm": 1.871620535850525, + "learning_rate": 3.3949421521564825e-06, + "loss": 0.49550193548202515, + "mean_token_accuracy": 0.8315126299858093, + "num_tokens": 14605416.0, + "step": 1637 + }, + { + "epoch": 1.2446808510638299, + "grad_norm": 2.111304759979248, + "learning_rate": 3.392986237835863e-06, + "loss": 0.2794899046421051, + "mean_token_accuracy": 0.9049773216247559, + "num_tokens": 14611711.0, + "step": 1638 + }, + { + "epoch": 1.2454407294832828, + "grad_norm": 3.7479894161224365, + "learning_rate": 3.391029696783127e-06, + "loss": 0.469397634267807, + "mean_token_accuracy": 0.8352956771850586, + "num_tokens": 14615536.0, + "step": 1639 + }, + { + "epoch": 1.2462006079027357, + "grad_norm": 3.277726650238037, + "learning_rate": 3.389072530371451e-06, + "loss": 0.35431790351867676, + "mean_token_accuracy": 0.8822286128997803, + "num_tokens": 14619390.0, + "step": 1640 + }, + { + "epoch": 1.2469604863221884, + "grad_norm": 1.9583072662353516, + "learning_rate": 3.3871147399744482e-06, + "loss": 0.3708694577217102, + "mean_token_accuracy": 0.8720351457595825, + "num_tokens": 14626573.0, + "step": 1641 + }, + { + "epoch": 1.2477203647416413, + "grad_norm": 1.8734042644500732, + "learning_rate": 3.385156326966173e-06, + "loss": 0.48163774609565735, + "mean_token_accuracy": 0.8479621410369873, + "num_tokens": 14636382.0, + "step": 1642 + }, + { + "epoch": 1.2484802431610942, + "grad_norm": 2.0085532665252686, + "learning_rate": 3.383197292721114e-06, + "loss": 0.4893198311328888, + "mean_token_accuracy": 0.838238000869751, + "num_tokens": 14645083.0, + "step": 1643 + }, + { + "epoch": 1.249240121580547, + "grad_norm": 2.0874593257904053, + "learning_rate": 3.3812376386141966e-06, + "loss": 0.4610505700111389, + "mean_token_accuracy": 0.8441368341445923, + "num_tokens": 14654048.0, + "step": 1644 + }, + { + "epoch": 1.25, + "grad_norm": 1.6887420415878296, + "learning_rate": 3.379277366020782e-06, + "loss": 0.3628596067428589, + "mean_token_accuracy": 0.8838590383529663, + "num_tokens": 14662317.0, + "step": 1645 + }, + { + "epoch": 1.250759878419453, + "grad_norm": 2.389002561569214, + "learning_rate": 3.3773164763166653e-06, + "loss": 0.21903495490550995, + "mean_token_accuracy": 0.9249413013458252, + "num_tokens": 14666394.0, + "step": 1646 + }, + { + "epoch": 1.2515197568389058, + "grad_norm": 1.7091087102890015, + "learning_rate": 3.3753549708780736e-06, + "loss": 0.37802332639694214, + "mean_token_accuracy": 0.8644627332687378, + "num_tokens": 14676214.0, + "step": 1647 + }, + { + "epoch": 1.2522796352583587, + "grad_norm": 2.5717999935150146, + "learning_rate": 3.3733928510816677e-06, + "loss": 0.4236462116241455, + "mean_token_accuracy": 0.8519910573959351, + "num_tokens": 14681681.0, + "step": 1648 + }, + { + "epoch": 1.2530395136778116, + "grad_norm": 1.958856463432312, + "learning_rate": 3.3714301183045382e-06, + "loss": 0.3923419415950775, + "mean_token_accuracy": 0.8720202445983887, + "num_tokens": 14690419.0, + "step": 1649 + }, + { + "epoch": 1.2537993920972643, + "grad_norm": 1.5900038480758667, + "learning_rate": 3.369466773924207e-06, + "loss": 0.4182325601577759, + "mean_token_accuracy": 0.8515387177467346, + "num_tokens": 14699790.0, + "step": 1650 + }, + { + "epoch": 1.2545592705167175, + "grad_norm": 1.260547161102295, + "learning_rate": 3.3675028193186243e-06, + "loss": 0.3915718197822571, + "mean_token_accuracy": 0.8536830544471741, + "num_tokens": 14717502.0, + "step": 1651 + }, + { + "epoch": 1.2553191489361701, + "grad_norm": 1.8152283430099487, + "learning_rate": 3.365538255866169e-06, + "loss": 0.424524188041687, + "mean_token_accuracy": 0.8434420824050903, + "num_tokens": 14726591.0, + "step": 1652 + }, + { + "epoch": 1.256079027355623, + "grad_norm": 1.3357285261154175, + "learning_rate": 3.3635730849456484e-06, + "loss": 0.2949739396572113, + "mean_token_accuracy": 0.8868321180343628, + "num_tokens": 14739911.0, + "step": 1653 + }, + { + "epoch": 1.256838905775076, + "grad_norm": 1.1770358085632324, + "learning_rate": 3.3616073079362925e-06, + "loss": 0.29939576983451843, + "mean_token_accuracy": 0.8923654556274414, + "num_tokens": 14755521.0, + "step": 1654 + }, + { + "epoch": 1.2575987841945289, + "grad_norm": 2.059162139892578, + "learning_rate": 3.3596409262177633e-06, + "loss": 0.4562555253505707, + "mean_token_accuracy": 0.8585271239280701, + "num_tokens": 14764173.0, + "step": 1655 + }, + { + "epoch": 1.2583586626139818, + "grad_norm": 1.430752158164978, + "learning_rate": 3.357673941170139e-06, + "loss": 0.35301265120506287, + "mean_token_accuracy": 0.8920517563819885, + "num_tokens": 14775596.0, + "step": 1656 + }, + { + "epoch": 1.2591185410334347, + "grad_norm": 1.6066302061080933, + "learning_rate": 3.3557063541739283e-06, + "loss": 0.41129636764526367, + "mean_token_accuracy": 0.8512256145477295, + "num_tokens": 14786289.0, + "step": 1657 + }, + { + "epoch": 1.2598784194528876, + "grad_norm": 1.5471590757369995, + "learning_rate": 3.353738166610058e-06, + "loss": 0.3935067057609558, + "mean_token_accuracy": 0.8514131903648376, + "num_tokens": 14798672.0, + "step": 1658 + }, + { + "epoch": 1.2606382978723405, + "grad_norm": 1.3455181121826172, + "learning_rate": 3.35176937985988e-06, + "loss": 0.3486790657043457, + "mean_token_accuracy": 0.8644362688064575, + "num_tokens": 14811603.0, + "step": 1659 + }, + { + "epoch": 1.2613981762917934, + "grad_norm": 1.891432762145996, + "learning_rate": 3.349799995305162e-06, + "loss": 0.3325638175010681, + "mean_token_accuracy": 0.8844645023345947, + "num_tokens": 14819256.0, + "step": 1660 + }, + { + "epoch": 1.262158054711246, + "grad_norm": 2.600614309310913, + "learning_rate": 3.3478300143280946e-06, + "loss": 0.30310919880867004, + "mean_token_accuracy": 0.9103429317474365, + "num_tokens": 14823706.0, + "step": 1661 + }, + { + "epoch": 1.2629179331306992, + "grad_norm": 3.8636202812194824, + "learning_rate": 3.3458594383112868e-06, + "loss": 0.28377676010131836, + "mean_token_accuracy": 0.9047091007232666, + "num_tokens": 14826688.0, + "step": 1662 + }, + { + "epoch": 1.263677811550152, + "grad_norm": 2.3100268840789795, + "learning_rate": 3.343888268637765e-06, + "loss": 0.4723394513130188, + "mean_token_accuracy": 0.8306777477264404, + "num_tokens": 14835471.0, + "step": 1663 + }, + { + "epoch": 1.2644376899696048, + "grad_norm": 1.7582160234451294, + "learning_rate": 3.341916506690971e-06, + "loss": 0.48168784379959106, + "mean_token_accuracy": 0.8281306028366089, + "num_tokens": 14846513.0, + "step": 1664 + }, + { + "epoch": 1.2651975683890577, + "grad_norm": 2.166055917739868, + "learning_rate": 3.3399441538547638e-06, + "loss": 0.4626024067401886, + "mean_token_accuracy": 0.8377980589866638, + "num_tokens": 14853408.0, + "step": 1665 + }, + { + "epoch": 1.2659574468085106, + "grad_norm": 2.23038911819458, + "learning_rate": 3.337971211513417e-06, + "loss": 0.38434159755706787, + "mean_token_accuracy": 0.8708412647247314, + "num_tokens": 14859919.0, + "step": 1666 + }, + { + "epoch": 1.2667173252279635, + "grad_norm": 2.092505693435669, + "learning_rate": 3.3359976810516164e-06, + "loss": 0.35072219371795654, + "mean_token_accuracy": 0.8761640191078186, + "num_tokens": 14865624.0, + "step": 1667 + }, + { + "epoch": 1.2674772036474165, + "grad_norm": 1.8255130052566528, + "learning_rate": 3.3340235638544633e-06, + "loss": 0.4404270648956299, + "mean_token_accuracy": 0.836356520652771, + "num_tokens": 14874181.0, + "step": 1668 + }, + { + "epoch": 1.2682370820668694, + "grad_norm": 1.9889036417007446, + "learning_rate": 3.332048861307467e-06, + "loss": 0.4199368357658386, + "mean_token_accuracy": 0.8508217334747314, + "num_tokens": 14882275.0, + "step": 1669 + }, + { + "epoch": 1.2689969604863223, + "grad_norm": 4.050281047821045, + "learning_rate": 3.330073574796551e-06, + "loss": 0.4271625280380249, + "mean_token_accuracy": 0.8471108675003052, + "num_tokens": 14893633.0, + "step": 1670 + }, + { + "epoch": 1.2697568389057752, + "grad_norm": 1.998838186264038, + "learning_rate": 3.328097705708047e-06, + "loss": 0.34743767976760864, + "mean_token_accuracy": 0.8771528005599976, + "num_tokens": 14899859.0, + "step": 1671 + }, + { + "epoch": 1.2705167173252279, + "grad_norm": 1.7989062070846558, + "learning_rate": 3.3261212554286977e-06, + "loss": 0.5267184376716614, + "mean_token_accuracy": 0.8323302268981934, + "num_tokens": 14911131.0, + "step": 1672 + }, + { + "epoch": 1.2712765957446808, + "grad_norm": 1.312070369720459, + "learning_rate": 3.324144225345649e-06, + "loss": 0.4675425887107849, + "mean_token_accuracy": 0.8157106637954712, + "num_tokens": 14928955.0, + "step": 1673 + }, + { + "epoch": 1.2720364741641337, + "grad_norm": 2.0547919273376465, + "learning_rate": 3.3221666168464584e-06, + "loss": 0.33704331517219543, + "mean_token_accuracy": 0.8621441125869751, + "num_tokens": 14935536.0, + "step": 1674 + }, + { + "epoch": 1.2727963525835866, + "grad_norm": 2.810413122177124, + "learning_rate": 3.320188431319088e-06, + "loss": 0.4007563292980194, + "mean_token_accuracy": 0.8649672269821167, + "num_tokens": 14940219.0, + "step": 1675 + }, + { + "epoch": 1.2735562310030395, + "grad_norm": 1.3516674041748047, + "learning_rate": 3.318209670151904e-06, + "loss": 0.3457040786743164, + "mean_token_accuracy": 0.8698287010192871, + "num_tokens": 14952904.0, + "step": 1676 + }, + { + "epoch": 1.2743161094224924, + "grad_norm": 2.440643310546875, + "learning_rate": 3.3162303347336765e-06, + "loss": 0.5195086002349854, + "mean_token_accuracy": 0.8348199129104614, + "num_tokens": 14958623.0, + "step": 1677 + }, + { + "epoch": 1.2750759878419453, + "grad_norm": 1.3264343738555908, + "learning_rate": 3.3142504264535808e-06, + "loss": 0.2990425229072571, + "mean_token_accuracy": 0.8961933851242065, + "num_tokens": 14971494.0, + "step": 1678 + }, + { + "epoch": 1.2758358662613982, + "grad_norm": 1.3106894493103027, + "learning_rate": 3.3122699467011913e-06, + "loss": 0.291853666305542, + "mean_token_accuracy": 0.893449068069458, + "num_tokens": 14985239.0, + "step": 1679 + }, + { + "epoch": 1.2765957446808511, + "grad_norm": 2.5387396812438965, + "learning_rate": 3.3102888968664857e-06, + "loss": 0.4336916208267212, + "mean_token_accuracy": 0.8447890877723694, + "num_tokens": 14991453.0, + "step": 1680 + }, + { + "epoch": 1.2773556231003038, + "grad_norm": 2.7052135467529297, + "learning_rate": 3.308307278339842e-06, + "loss": 0.3279378116130829, + "mean_token_accuracy": 0.8935879468917847, + "num_tokens": 14995428.0, + "step": 1681 + }, + { + "epoch": 1.278115501519757, + "grad_norm": 1.6251261234283447, + "learning_rate": 3.306325092512034e-06, + "loss": 0.32066458463668823, + "mean_token_accuracy": 0.8909799456596375, + "num_tokens": 15004841.0, + "step": 1682 + }, + { + "epoch": 1.2788753799392096, + "grad_norm": 2.3014605045318604, + "learning_rate": 3.3043423407742374e-06, + "loss": 0.3523373603820801, + "mean_token_accuracy": 0.8810735940933228, + "num_tokens": 15010742.0, + "step": 1683 + }, + { + "epoch": 1.2796352583586625, + "grad_norm": 2.9563019275665283, + "learning_rate": 3.3023590245180237e-06, + "loss": 0.39715707302093506, + "mean_token_accuracy": 0.8779881000518799, + "num_tokens": 15015357.0, + "step": 1684 + }, + { + "epoch": 1.2803951367781155, + "grad_norm": 1.5787957906723022, + "learning_rate": 3.300375145135361e-06, + "loss": 0.44630166888237, + "mean_token_accuracy": 0.8400174975395203, + "num_tokens": 15031360.0, + "step": 1685 + }, + { + "epoch": 1.2811550151975684, + "grad_norm": 1.6753438711166382, + "learning_rate": 3.2983907040186112e-06, + "loss": 0.3235800862312317, + "mean_token_accuracy": 0.8938044309616089, + "num_tokens": 15040276.0, + "step": 1686 + }, + { + "epoch": 1.2819148936170213, + "grad_norm": 1.7331148386001587, + "learning_rate": 3.296405702560532e-06, + "loss": 0.39061424136161804, + "mean_token_accuracy": 0.8599754571914673, + "num_tokens": 15049725.0, + "step": 1687 + }, + { + "epoch": 1.2826747720364742, + "grad_norm": 2.2029430866241455, + "learning_rate": 3.294420142154274e-06, + "loss": 0.43598297238349915, + "mean_token_accuracy": 0.8663698434829712, + "num_tokens": 15058182.0, + "step": 1688 + }, + { + "epoch": 1.283434650455927, + "grad_norm": 2.943964958190918, + "learning_rate": 3.29243402419338e-06, + "loss": 0.405210942029953, + "mean_token_accuracy": 0.854996919631958, + "num_tokens": 15062920.0, + "step": 1689 + }, + { + "epoch": 1.28419452887538, + "grad_norm": 1.9343379735946655, + "learning_rate": 3.2904473500717826e-06, + "loss": 0.35011449456214905, + "mean_token_accuracy": 0.8745867013931274, + "num_tokens": 15070298.0, + "step": 1690 + }, + { + "epoch": 1.284954407294833, + "grad_norm": 2.559859037399292, + "learning_rate": 3.2884601211838087e-06, + "loss": 0.38816407322883606, + "mean_token_accuracy": 0.854763388633728, + "num_tokens": 15075667.0, + "step": 1691 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 1.4357839822769165, + "learning_rate": 3.2864723389241697e-06, + "loss": 0.4512745141983032, + "mean_token_accuracy": 0.8398592472076416, + "num_tokens": 15090291.0, + "step": 1692 + }, + { + "epoch": 1.2864741641337387, + "grad_norm": 1.7643728256225586, + "learning_rate": 3.284484004687969e-06, + "loss": 0.3536742627620697, + "mean_token_accuracy": 0.8726381063461304, + "num_tokens": 15099325.0, + "step": 1693 + }, + { + "epoch": 1.2872340425531914, + "grad_norm": 1.853173017501831, + "learning_rate": 3.2824951198706958e-06, + "loss": 0.36579740047454834, + "mean_token_accuracy": 0.8988048434257507, + "num_tokens": 15107090.0, + "step": 1694 + }, + { + "epoch": 1.2879939209726443, + "grad_norm": 1.6526862382888794, + "learning_rate": 3.280505685868226e-06, + "loss": 0.3853636682033539, + "mean_token_accuracy": 0.8743607997894287, + "num_tokens": 15117818.0, + "step": 1695 + }, + { + "epoch": 1.2887537993920972, + "grad_norm": 2.790398597717285, + "learning_rate": 3.278515704076821e-06, + "loss": 0.2707311511039734, + "mean_token_accuracy": 0.9034668803215027, + "num_tokens": 15121641.0, + "step": 1696 + }, + { + "epoch": 1.2895136778115501, + "grad_norm": 1.69557523727417, + "learning_rate": 3.276525175893126e-06, + "loss": 0.3707970082759857, + "mean_token_accuracy": 0.8617855906486511, + "num_tokens": 15130414.0, + "step": 1697 + }, + { + "epoch": 1.290273556231003, + "grad_norm": 1.1360478401184082, + "learning_rate": 3.274534102714172e-06, + "loss": 0.3368082344532013, + "mean_token_accuracy": 0.8781654834747314, + "num_tokens": 15148307.0, + "step": 1698 + }, + { + "epoch": 1.291033434650456, + "grad_norm": 1.5894653797149658, + "learning_rate": 3.272542485937369e-06, + "loss": 0.3870658278465271, + "mean_token_accuracy": 0.8830926418304443, + "num_tokens": 15161841.0, + "step": 1699 + }, + { + "epoch": 1.2917933130699089, + "grad_norm": 2.3735709190368652, + "learning_rate": 3.270550326960511e-06, + "loss": 0.3873991370201111, + "mean_token_accuracy": 0.8729057908058167, + "num_tokens": 15167733.0, + "step": 1700 + }, + { + "epoch": 1.2925531914893618, + "grad_norm": 1.3739598989486694, + "learning_rate": 3.268557627181772e-06, + "loss": 0.30831626057624817, + "mean_token_accuracy": 0.8695719242095947, + "num_tokens": 15180861.0, + "step": 1701 + }, + { + "epoch": 1.2933130699088147, + "grad_norm": 1.7526969909667969, + "learning_rate": 3.2665643879997054e-06, + "loss": 0.4716024398803711, + "mean_token_accuracy": 0.8303275108337402, + "num_tokens": 15191642.0, + "step": 1702 + }, + { + "epoch": 1.2940729483282674, + "grad_norm": 2.7866084575653076, + "learning_rate": 3.2645706108132426e-06, + "loss": 0.33337634801864624, + "mean_token_accuracy": 0.8790726065635681, + "num_tokens": 15196038.0, + "step": 1703 + }, + { + "epoch": 1.2948328267477205, + "grad_norm": 2.319765090942383, + "learning_rate": 3.2625762970216944e-06, + "loss": 0.3999716639518738, + "mean_token_accuracy": 0.8693568706512451, + "num_tokens": 15202075.0, + "step": 1704 + }, + { + "epoch": 1.2955927051671732, + "grad_norm": 3.18292498588562, + "learning_rate": 3.2605814480247454e-06, + "loss": 0.4579541087150574, + "mean_token_accuracy": 0.8516187071800232, + "num_tokens": 15206886.0, + "step": 1705 + }, + { + "epoch": 1.296352583586626, + "grad_norm": 2.1816933155059814, + "learning_rate": 3.258586065222459e-06, + "loss": 0.5198885202407837, + "mean_token_accuracy": 0.8170592784881592, + "num_tokens": 15214088.0, + "step": 1706 + }, + { + "epoch": 1.297112462006079, + "grad_norm": 1.9076340198516846, + "learning_rate": 3.2565901500152702e-06, + "loss": 0.49752360582351685, + "mean_token_accuracy": 0.8681992292404175, + "num_tokens": 15226046.0, + "step": 1707 + }, + { + "epoch": 1.297872340425532, + "grad_norm": 2.0223331451416016, + "learning_rate": 3.2545937038039904e-06, + "loss": 0.4515793025493622, + "mean_token_accuracy": 0.8429619073867798, + "num_tokens": 15234993.0, + "step": 1708 + }, + { + "epoch": 1.2986322188449848, + "grad_norm": 2.5089669227600098, + "learning_rate": 3.2525967279898017e-06, + "loss": 0.43628376722335815, + "mean_token_accuracy": 0.8493682146072388, + "num_tokens": 15240575.0, + "step": 1709 + }, + { + "epoch": 1.2993920972644377, + "grad_norm": 2.8347091674804688, + "learning_rate": 3.2505992239742582e-06, + "loss": 0.25112441182136536, + "mean_token_accuracy": 0.908825159072876, + "num_tokens": 15244085.0, + "step": 1710 + }, + { + "epoch": 1.3001519756838906, + "grad_norm": 2.3157572746276855, + "learning_rate": 3.2486011931592863e-06, + "loss": 0.482818067073822, + "mean_token_accuracy": 0.8305923938751221, + "num_tokens": 15250377.0, + "step": 1711 + }, + { + "epoch": 1.3009118541033435, + "grad_norm": 3.169052839279175, + "learning_rate": 3.2466026369471804e-06, + "loss": 0.3493242561817169, + "mean_token_accuracy": 0.86913001537323, + "num_tokens": 15255041.0, + "step": 1712 + }, + { + "epoch": 1.3016717325227964, + "grad_norm": 1.4475083351135254, + "learning_rate": 3.2446035567406033e-06, + "loss": 0.4177290201187134, + "mean_token_accuracy": 0.8497589826583862, + "num_tokens": 15266946.0, + "step": 1713 + }, + { + "epoch": 1.3024316109422491, + "grad_norm": 1.6473008394241333, + "learning_rate": 3.2426039539425875e-06, + "loss": 0.5272886753082275, + "mean_token_accuracy": 0.8440133333206177, + "num_tokens": 15279263.0, + "step": 1714 + }, + { + "epoch": 1.3031914893617023, + "grad_norm": 2.3996543884277344, + "learning_rate": 3.240603829956531e-06, + "loss": 0.4272066652774811, + "mean_token_accuracy": 0.8495640754699707, + "num_tokens": 15285213.0, + "step": 1715 + }, + { + "epoch": 1.303951367781155, + "grad_norm": 1.63034987449646, + "learning_rate": 3.238603186186198e-06, + "loss": 0.4034635126590729, + "mean_token_accuracy": 0.8638584613800049, + "num_tokens": 15295974.0, + "step": 1716 + }, + { + "epoch": 1.3047112462006079, + "grad_norm": 2.153608798980713, + "learning_rate": 3.2366020240357166e-06, + "loss": 0.30712565779685974, + "mean_token_accuracy": 0.8863866329193115, + "num_tokens": 15302220.0, + "step": 1717 + }, + { + "epoch": 1.3054711246200608, + "grad_norm": 2.9814558029174805, + "learning_rate": 3.2346003449095803e-06, + "loss": 0.3922840356826782, + "mean_token_accuracy": 0.868030309677124, + "num_tokens": 15306747.0, + "step": 1718 + }, + { + "epoch": 1.3062310030395137, + "grad_norm": 3.3417985439300537, + "learning_rate": 3.2325981502126434e-06, + "loss": 0.30750396847724915, + "mean_token_accuracy": 0.9065356850624084, + "num_tokens": 15310309.0, + "step": 1719 + }, + { + "epoch": 1.3069908814589666, + "grad_norm": 2.237682819366455, + "learning_rate": 3.2305954413501252e-06, + "loss": 0.35068294405937195, + "mean_token_accuracy": 0.8887614011764526, + "num_tokens": 15316463.0, + "step": 1720 + }, + { + "epoch": 1.3077507598784195, + "grad_norm": 1.9526605606079102, + "learning_rate": 3.228592219727602e-06, + "loss": 0.42061835527420044, + "mean_token_accuracy": 0.8456839323043823, + "num_tokens": 15323984.0, + "step": 1721 + }, + { + "epoch": 1.3085106382978724, + "grad_norm": 1.6454212665557861, + "learning_rate": 3.226588486751012e-06, + "loss": 0.5189976692199707, + "mean_token_accuracy": 0.8187375068664551, + "num_tokens": 15338807.0, + "step": 1722 + }, + { + "epoch": 1.3092705167173253, + "grad_norm": 1.4521609544754028, + "learning_rate": 3.2245842438266526e-06, + "loss": 0.329673171043396, + "mean_token_accuracy": 0.853867769241333, + "num_tokens": 15350400.0, + "step": 1723 + }, + { + "epoch": 1.3100303951367782, + "grad_norm": 1.8750989437103271, + "learning_rate": 3.222579492361179e-06, + "loss": 0.4635341167449951, + "mean_token_accuracy": 0.8393422365188599, + "num_tokens": 15360557.0, + "step": 1724 + }, + { + "epoch": 1.310790273556231, + "grad_norm": 1.2728849649429321, + "learning_rate": 3.220574233761603e-06, + "loss": 0.3255572021007538, + "mean_token_accuracy": 0.8989741802215576, + "num_tokens": 15376548.0, + "step": 1725 + }, + { + "epoch": 1.3115501519756838, + "grad_norm": 3.5155694484710693, + "learning_rate": 3.2185684694352913e-06, + "loss": 0.34204089641571045, + "mean_token_accuracy": 0.8781906366348267, + "num_tokens": 15380304.0, + "step": 1726 + }, + { + "epoch": 1.3123100303951367, + "grad_norm": 2.059800148010254, + "learning_rate": 3.216562200789968e-06, + "loss": 0.36288338899612427, + "mean_token_accuracy": 0.8595278263092041, + "num_tokens": 15387653.0, + "step": 1727 + }, + { + "epoch": 1.3130699088145896, + "grad_norm": 3.5388240814208984, + "learning_rate": 3.214555429233707e-06, + "loss": 0.5434849858283997, + "mean_token_accuracy": 0.8074631690979004, + "num_tokens": 15391662.0, + "step": 1728 + }, + { + "epoch": 1.3138297872340425, + "grad_norm": 2.8595592975616455, + "learning_rate": 3.2125481561749406e-06, + "loss": 0.5113687515258789, + "mean_token_accuracy": 0.8448649644851685, + "num_tokens": 15397536.0, + "step": 1729 + }, + { + "epoch": 1.3145896656534954, + "grad_norm": 2.50386905670166, + "learning_rate": 3.210540383022449e-06, + "loss": 0.5293697118759155, + "mean_token_accuracy": 0.8096445798873901, + "num_tokens": 15403478.0, + "step": 1730 + }, + { + "epoch": 1.3153495440729484, + "grad_norm": 1.880035400390625, + "learning_rate": 3.208532111185365e-06, + "loss": 0.5344835519790649, + "mean_token_accuracy": 0.8172965049743652, + "num_tokens": 15413812.0, + "step": 1731 + }, + { + "epoch": 1.3161094224924013, + "grad_norm": 1.3688768148422241, + "learning_rate": 3.2065233420731717e-06, + "loss": 0.2577427327632904, + "mean_token_accuracy": 0.9142681360244751, + "num_tokens": 15423583.0, + "step": 1732 + }, + { + "epoch": 1.3168693009118542, + "grad_norm": 1.7945705652236938, + "learning_rate": 3.2045140770956987e-06, + "loss": 0.3983926773071289, + "mean_token_accuracy": 0.8652000427246094, + "num_tokens": 15432473.0, + "step": 1733 + }, + { + "epoch": 1.3176291793313069, + "grad_norm": 1.8243350982666016, + "learning_rate": 3.2025043176631283e-06, + "loss": 0.48644185066223145, + "mean_token_accuracy": 0.8319193124771118, + "num_tokens": 15445463.0, + "step": 1734 + }, + { + "epoch": 1.31838905775076, + "grad_norm": 2.000094175338745, + "learning_rate": 3.2004940651859844e-06, + "loss": 0.43567317724227905, + "mean_token_accuracy": 0.8857482671737671, + "num_tokens": 15452382.0, + "step": 1735 + }, + { + "epoch": 1.3191489361702127, + "grad_norm": 2.379974365234375, + "learning_rate": 3.198483321075141e-06, + "loss": 0.5153506398200989, + "mean_token_accuracy": 0.8295865654945374, + "num_tokens": 15458740.0, + "step": 1736 + }, + { + "epoch": 1.3199088145896656, + "grad_norm": 1.6564184427261353, + "learning_rate": 3.196472086741815e-06, + "loss": 0.508430540561676, + "mean_token_accuracy": 0.8181540369987488, + "num_tokens": 15471844.0, + "step": 1737 + }, + { + "epoch": 1.3206686930091185, + "grad_norm": 2.006925344467163, + "learning_rate": 3.194460363597569e-06, + "loss": 0.34542378783226013, + "mean_token_accuracy": 0.8827437162399292, + "num_tokens": 15478414.0, + "step": 1738 + }, + { + "epoch": 1.3214285714285714, + "grad_norm": 3.589045763015747, + "learning_rate": 3.192448153054306e-06, + "loss": 0.4385780096054077, + "mean_token_accuracy": 0.8480287790298462, + "num_tokens": 15482063.0, + "step": 1739 + }, + { + "epoch": 1.3221884498480243, + "grad_norm": 1.9797427654266357, + "learning_rate": 3.190435456524275e-06, + "loss": 0.4330386519432068, + "mean_token_accuracy": 0.8458058834075928, + "num_tokens": 15489803.0, + "step": 1740 + }, + { + "epoch": 1.3229483282674772, + "grad_norm": 1.4777411222457886, + "learning_rate": 3.188422275420063e-06, + "loss": 0.3997895419597626, + "mean_token_accuracy": 0.8639512062072754, + "num_tokens": 15501103.0, + "step": 1741 + }, + { + "epoch": 1.3237082066869301, + "grad_norm": 2.882338523864746, + "learning_rate": 3.186408611154597e-06, + "loss": 0.2336438149213791, + "mean_token_accuracy": 0.9176726937294006, + "num_tokens": 15504854.0, + "step": 1742 + }, + { + "epoch": 1.324468085106383, + "grad_norm": 2.353503704071045, + "learning_rate": 3.184394465141146e-06, + "loss": 0.4107069671154022, + "mean_token_accuracy": 0.8677014112472534, + "num_tokens": 15510662.0, + "step": 1743 + }, + { + "epoch": 1.325227963525836, + "grad_norm": 2.6551976203918457, + "learning_rate": 3.1823798387933134e-06, + "loss": 0.3862302899360657, + "mean_token_accuracy": 0.8819445371627808, + "num_tokens": 15515681.0, + "step": 1744 + }, + { + "epoch": 1.3259878419452886, + "grad_norm": 1.478572964668274, + "learning_rate": 3.180364733525043e-06, + "loss": 0.43972986936569214, + "mean_token_accuracy": 0.832388162612915, + "num_tokens": 15529542.0, + "step": 1745 + }, + { + "epoch": 1.3267477203647418, + "grad_norm": 1.6003550291061401, + "learning_rate": 3.178349150750612e-06, + "loss": 0.3404902219772339, + "mean_token_accuracy": 0.8764007091522217, + "num_tokens": 15538865.0, + "step": 1746 + }, + { + "epoch": 1.3275075987841944, + "grad_norm": 2.130689859390259, + "learning_rate": 3.1763330918846347e-06, + "loss": 0.383136510848999, + "mean_token_accuracy": 0.8652247190475464, + "num_tokens": 15545567.0, + "step": 1747 + }, + { + "epoch": 1.3282674772036474, + "grad_norm": 2.395937442779541, + "learning_rate": 3.1743165583420586e-06, + "loss": 0.3870319128036499, + "mean_token_accuracy": 0.8618065118789673, + "num_tokens": 15551090.0, + "step": 1748 + }, + { + "epoch": 1.3290273556231003, + "grad_norm": 2.0841057300567627, + "learning_rate": 3.1722995515381644e-06, + "loss": 0.4838739335536957, + "mean_token_accuracy": 0.8548711538314819, + "num_tokens": 15558913.0, + "step": 1749 + }, + { + "epoch": 1.3297872340425532, + "grad_norm": 1.4237847328186035, + "learning_rate": 3.1702820728885657e-06, + "loss": 0.40350261330604553, + "mean_token_accuracy": 0.858984649181366, + "num_tokens": 15572045.0, + "step": 1750 + }, + { + "epoch": 1.330547112462006, + "grad_norm": 2.2641282081604004, + "learning_rate": 3.1682641238092064e-06, + "loss": 0.5117636919021606, + "mean_token_accuracy": 0.8078924417495728, + "num_tokens": 15579753.0, + "step": 1751 + }, + { + "epoch": 1.331306990881459, + "grad_norm": 1.0010309219360352, + "learning_rate": 3.1662457057163603e-06, + "loss": 0.3220978379249573, + "mean_token_accuracy": 0.8786559104919434, + "num_tokens": 15602823.0, + "step": 1752 + }, + { + "epoch": 1.332066869300912, + "grad_norm": 2.441230535507202, + "learning_rate": 3.164226820026632e-06, + "loss": 0.37529727816581726, + "mean_token_accuracy": 0.8886898756027222, + "num_tokens": 15608473.0, + "step": 1753 + }, + { + "epoch": 1.3328267477203648, + "grad_norm": 1.2960991859436035, + "learning_rate": 3.162207468156952e-06, + "loss": 0.3393767476081848, + "mean_token_accuracy": 0.8766993284225464, + "num_tokens": 15620893.0, + "step": 1754 + }, + { + "epoch": 1.3335866261398177, + "grad_norm": 2.0806996822357178, + "learning_rate": 3.16018765152458e-06, + "loss": 0.38034507632255554, + "mean_token_accuracy": 0.8854838609695435, + "num_tokens": 15627068.0, + "step": 1755 + }, + { + "epoch": 1.3343465045592704, + "grad_norm": 1.4316699504852295, + "learning_rate": 3.1581673715471007e-06, + "loss": 0.3665890693664551, + "mean_token_accuracy": 0.870919406414032, + "num_tokens": 15641070.0, + "step": 1756 + }, + { + "epoch": 1.3351063829787235, + "grad_norm": 1.3466622829437256, + "learning_rate": 3.1561466296424247e-06, + "loss": 0.37387198209762573, + "mean_token_accuracy": 0.8633951544761658, + "num_tokens": 15653777.0, + "step": 1757 + }, + { + "epoch": 1.3358662613981762, + "grad_norm": 1.8108628988265991, + "learning_rate": 3.154125427228786e-06, + "loss": 0.38428938388824463, + "mean_token_accuracy": 0.85402512550354, + "num_tokens": 15662494.0, + "step": 1758 + }, + { + "epoch": 1.3366261398176291, + "grad_norm": 1.3221700191497803, + "learning_rate": 3.152103765724743e-06, + "loss": 0.42825520038604736, + "mean_token_accuracy": 0.8435465097427368, + "num_tokens": 15677552.0, + "step": 1759 + }, + { + "epoch": 1.337386018237082, + "grad_norm": 2.6247692108154297, + "learning_rate": 3.150081646549174e-06, + "loss": 0.36186715960502625, + "mean_token_accuracy": 0.8767328262329102, + "num_tokens": 15682103.0, + "step": 1760 + }, + { + "epoch": 1.338145896656535, + "grad_norm": 2.1469814777374268, + "learning_rate": 3.1480590711212823e-06, + "loss": 0.3734385669231415, + "mean_token_accuracy": 0.8711104393005371, + "num_tokens": 15689182.0, + "step": 1761 + }, + { + "epoch": 1.3389057750759878, + "grad_norm": 2.1702585220336914, + "learning_rate": 3.1460360408605866e-06, + "loss": 0.2795315086841583, + "mean_token_accuracy": 0.8892190456390381, + "num_tokens": 15694272.0, + "step": 1762 + }, + { + "epoch": 1.3396656534954408, + "grad_norm": 1.918797254562378, + "learning_rate": 3.144012557186931e-06, + "loss": 0.4363473057746887, + "mean_token_accuracy": 0.8573931455612183, + "num_tokens": 15703532.0, + "step": 1763 + }, + { + "epoch": 1.3404255319148937, + "grad_norm": 2.5579960346221924, + "learning_rate": 3.14198862152047e-06, + "loss": 0.406247079372406, + "mean_token_accuracy": 0.8617593050003052, + "num_tokens": 15708652.0, + "step": 1764 + }, + { + "epoch": 1.3411854103343466, + "grad_norm": 2.3617870807647705, + "learning_rate": 3.1399642352816825e-06, + "loss": 0.2839522659778595, + "mean_token_accuracy": 0.8996064066886902, + "num_tokens": 15713598.0, + "step": 1765 + }, + { + "epoch": 1.3419452887537995, + "grad_norm": 1.248302936553955, + "learning_rate": 3.1379393998913594e-06, + "loss": 0.2922290861606598, + "mean_token_accuracy": 0.8948773145675659, + "num_tokens": 15726693.0, + "step": 1766 + }, + { + "epoch": 1.3427051671732522, + "grad_norm": 2.143599510192871, + "learning_rate": 3.135914116770609e-06, + "loss": 0.32176223397254944, + "mean_token_accuracy": 0.8808754682540894, + "num_tokens": 15731901.0, + "step": 1767 + }, + { + "epoch": 1.3434650455927053, + "grad_norm": 4.226369857788086, + "learning_rate": 3.1338883873408517e-06, + "loss": 0.4682556390762329, + "mean_token_accuracy": 0.8566025495529175, + "num_tokens": 15735029.0, + "step": 1768 + }, + { + "epoch": 1.344224924012158, + "grad_norm": 1.8695988655090332, + "learning_rate": 3.1318622130238237e-06, + "loss": 0.4297192394733429, + "mean_token_accuracy": 0.8419148921966553, + "num_tokens": 15744310.0, + "step": 1769 + }, + { + "epoch": 1.344984802431611, + "grad_norm": 2.4321305751800537, + "learning_rate": 3.1298355952415714e-06, + "loss": 0.36076444387435913, + "mean_token_accuracy": 0.8826035261154175, + "num_tokens": 15749337.0, + "step": 1770 + }, + { + "epoch": 1.3457446808510638, + "grad_norm": 1.5500011444091797, + "learning_rate": 3.127808535416454e-06, + "loss": 0.48664039373397827, + "mean_token_accuracy": 0.844344437122345, + "num_tokens": 15761096.0, + "step": 1771 + }, + { + "epoch": 1.3465045592705167, + "grad_norm": 2.1498289108276367, + "learning_rate": 3.1257810349711388e-06, + "loss": 0.4841752052307129, + "mean_token_accuracy": 0.8324567079544067, + "num_tokens": 15768646.0, + "step": 1772 + }, + { + "epoch": 1.3472644376899696, + "grad_norm": 1.2995187044143677, + "learning_rate": 3.1237530953286046e-06, + "loss": 0.492019385099411, + "mean_token_accuracy": 0.8285316228866577, + "num_tokens": 15788401.0, + "step": 1773 + }, + { + "epoch": 1.3480243161094225, + "grad_norm": 2.324819803237915, + "learning_rate": 3.121724717912138e-06, + "loss": 0.33166298270225525, + "mean_token_accuracy": 0.8856451511383057, + "num_tokens": 15794097.0, + "step": 1774 + }, + { + "epoch": 1.3487841945288754, + "grad_norm": 1.9611430168151855, + "learning_rate": 3.11969590414533e-06, + "loss": 0.3974284827709198, + "mean_token_accuracy": 0.8751305937767029, + "num_tokens": 15801065.0, + "step": 1775 + }, + { + "epoch": 1.3495440729483283, + "grad_norm": 1.7084417343139648, + "learning_rate": 3.1176666554520827e-06, + "loss": 0.38729435205459595, + "mean_token_accuracy": 0.8680770397186279, + "num_tokens": 15810353.0, + "step": 1776 + }, + { + "epoch": 1.3503039513677813, + "grad_norm": 1.7616240978240967, + "learning_rate": 3.1156369732566006e-06, + "loss": 0.4271578788757324, + "mean_token_accuracy": 0.843730092048645, + "num_tokens": 15821889.0, + "step": 1777 + }, + { + "epoch": 1.351063829787234, + "grad_norm": 2.030747413635254, + "learning_rate": 3.113606858983391e-06, + "loss": 0.361891508102417, + "mean_token_accuracy": 0.8522407412528992, + "num_tokens": 15830800.0, + "step": 1778 + }, + { + "epoch": 1.3518237082066868, + "grad_norm": 1.4842649698257446, + "learning_rate": 3.1115763140572686e-06, + "loss": 0.466334730386734, + "mean_token_accuracy": 0.8433995246887207, + "num_tokens": 15849422.0, + "step": 1779 + }, + { + "epoch": 1.3525835866261398, + "grad_norm": 1.6595379114151, + "learning_rate": 3.109545339903347e-06, + "loss": 0.4622533321380615, + "mean_token_accuracy": 0.8526314496994019, + "num_tokens": 15860431.0, + "step": 1780 + }, + { + "epoch": 1.3533434650455927, + "grad_norm": 2.1235809326171875, + "learning_rate": 3.107513937947041e-06, + "loss": 0.42694270610809326, + "mean_token_accuracy": 0.854864239692688, + "num_tokens": 15869044.0, + "step": 1781 + }, + { + "epoch": 1.3541033434650456, + "grad_norm": 1.5889263153076172, + "learning_rate": 3.1054821096140675e-06, + "loss": 0.41838499903678894, + "mean_token_accuracy": 0.8671513795852661, + "num_tokens": 15878598.0, + "step": 1782 + }, + { + "epoch": 1.3548632218844985, + "grad_norm": 2.2261741161346436, + "learning_rate": 3.1034498563304435e-06, + "loss": 0.4045066237449646, + "mean_token_accuracy": 0.843826949596405, + "num_tokens": 15885167.0, + "step": 1783 + }, + { + "epoch": 1.3556231003039514, + "grad_norm": 2.2569329738616943, + "learning_rate": 3.1014171795224794e-06, + "loss": 0.36677104234695435, + "mean_token_accuracy": 0.8747833967208862, + "num_tokens": 15891308.0, + "step": 1784 + }, + { + "epoch": 1.3563829787234043, + "grad_norm": 2.1027088165283203, + "learning_rate": 3.0993840806167884e-06, + "loss": 0.437946081161499, + "mean_token_accuracy": 0.8370785117149353, + "num_tokens": 15898952.0, + "step": 1785 + }, + { + "epoch": 1.3571428571428572, + "grad_norm": 1.8768929243087769, + "learning_rate": 3.0973505610402767e-06, + "loss": 0.4201734662055969, + "mean_token_accuracy": 0.8474810123443604, + "num_tokens": 15907340.0, + "step": 1786 + }, + { + "epoch": 1.35790273556231, + "grad_norm": 1.7216229438781738, + "learning_rate": 3.0953166222201474e-06, + "loss": 0.4225231409072876, + "mean_token_accuracy": 0.8437749147415161, + "num_tokens": 15917852.0, + "step": 1787 + }, + { + "epoch": 1.358662613981763, + "grad_norm": 2.6256966590881348, + "learning_rate": 3.093282265583895e-06, + "loss": 0.435439795255661, + "mean_token_accuracy": 0.8452040553092957, + "num_tokens": 15923739.0, + "step": 1788 + }, + { + "epoch": 1.3594224924012157, + "grad_norm": 2.90028977394104, + "learning_rate": 3.0912474925593124e-06, + "loss": 0.3730456829071045, + "mean_token_accuracy": 0.8766646385192871, + "num_tokens": 15927943.0, + "step": 1789 + }, + { + "epoch": 1.3601823708206686, + "grad_norm": 1.5966626405715942, + "learning_rate": 3.0892123045744787e-06, + "loss": 0.42150455713272095, + "mean_token_accuracy": 0.854656457901001, + "num_tokens": 15939922.0, + "step": 1790 + }, + { + "epoch": 1.3609422492401215, + "grad_norm": 1.8069748878479004, + "learning_rate": 3.0871767030577686e-06, + "loss": 0.4954872131347656, + "mean_token_accuracy": 0.8289790153503418, + "num_tokens": 15950095.0, + "step": 1791 + }, + { + "epoch": 1.3617021276595744, + "grad_norm": 2.0855250358581543, + "learning_rate": 3.085140689437846e-06, + "loss": 0.41999945044517517, + "mean_token_accuracy": 0.8517382144927979, + "num_tokens": 15957972.0, + "step": 1792 + }, + { + "epoch": 1.3624620060790273, + "grad_norm": 2.108659267425537, + "learning_rate": 3.0831042651436634e-06, + "loss": 0.3668023645877838, + "mean_token_accuracy": 0.8710855841636658, + "num_tokens": 15965614.0, + "step": 1793 + }, + { + "epoch": 1.3632218844984803, + "grad_norm": 1.3799632787704468, + "learning_rate": 3.0810674316044602e-06, + "loss": 0.351409375667572, + "mean_token_accuracy": 0.870837390422821, + "num_tokens": 15978854.0, + "step": 1794 + }, + { + "epoch": 1.3639817629179332, + "grad_norm": 1.540397047996521, + "learning_rate": 3.0790301902497664e-06, + "loss": 0.403600811958313, + "mean_token_accuracy": 0.8485002517700195, + "num_tokens": 15993324.0, + "step": 1795 + }, + { + "epoch": 1.364741641337386, + "grad_norm": 1.946882963180542, + "learning_rate": 3.076992542509396e-06, + "loss": 0.40118327736854553, + "mean_token_accuracy": 0.8607497811317444, + "num_tokens": 16001937.0, + "step": 1796 + }, + { + "epoch": 1.365501519756839, + "grad_norm": 2.0464305877685547, + "learning_rate": 3.0749544898134487e-06, + "loss": 0.31742292642593384, + "mean_token_accuracy": 0.8878391981124878, + "num_tokens": 16009277.0, + "step": 1797 + }, + { + "epoch": 1.3662613981762917, + "grad_norm": 2.091754913330078, + "learning_rate": 3.072916033592307e-06, + "loss": 0.31580421328544617, + "mean_token_accuracy": 0.8875244855880737, + "num_tokens": 16015756.0, + "step": 1798 + }, + { + "epoch": 1.3670212765957448, + "grad_norm": 3.4449212551116943, + "learning_rate": 3.0708771752766397e-06, + "loss": 0.4692591726779938, + "mean_token_accuracy": 0.8456202149391174, + "num_tokens": 16019912.0, + "step": 1799 + }, + { + "epoch": 1.3677811550151975, + "grad_norm": 1.600419521331787, + "learning_rate": 3.068837916297396e-06, + "loss": 0.40389442443847656, + "mean_token_accuracy": 0.8378961086273193, + "num_tokens": 16032637.0, + "step": 1800 + }, + { + "epoch": 1.3685410334346504, + "grad_norm": 1.5282686948776245, + "learning_rate": 3.0667982580858047e-06, + "loss": 0.379841685295105, + "mean_token_accuracy": 0.8752143383026123, + "num_tokens": 16045205.0, + "step": 1801 + }, + { + "epoch": 1.3693009118541033, + "grad_norm": 2.486079454421997, + "learning_rate": 3.0647582020733773e-06, + "loss": 0.41060030460357666, + "mean_token_accuracy": 0.8575131893157959, + "num_tokens": 16051189.0, + "step": 1802 + }, + { + "epoch": 1.3700607902735562, + "grad_norm": 1.9458621740341187, + "learning_rate": 3.062717749691904e-06, + "loss": 0.4442213773727417, + "mean_token_accuracy": 0.8451495170593262, + "num_tokens": 16059700.0, + "step": 1803 + }, + { + "epoch": 1.3708206686930091, + "grad_norm": 1.4333001375198364, + "learning_rate": 3.0606769023734535e-06, + "loss": 0.39132001996040344, + "mean_token_accuracy": 0.8609901666641235, + "num_tokens": 16072458.0, + "step": 1804 + }, + { + "epoch": 1.371580547112462, + "grad_norm": 1.490355372428894, + "learning_rate": 3.0586356615503693e-06, + "loss": 0.4108564257621765, + "mean_token_accuracy": 0.8871046304702759, + "num_tokens": 16083142.0, + "step": 1805 + }, + { + "epoch": 1.372340425531915, + "grad_norm": 1.7765129804611206, + "learning_rate": 3.056594028655274e-06, + "loss": 0.3850266635417938, + "mean_token_accuracy": 0.8923365473747253, + "num_tokens": 16092519.0, + "step": 1806 + }, + { + "epoch": 1.3731003039513678, + "grad_norm": 1.955661416053772, + "learning_rate": 3.0545520051210637e-06, + "loss": 0.4665378928184509, + "mean_token_accuracy": 0.837419867515564, + "num_tokens": 16100618.0, + "step": 1807 + }, + { + "epoch": 1.3738601823708207, + "grad_norm": 3.259265422821045, + "learning_rate": 3.052509592380909e-06, + "loss": 0.24722981452941895, + "mean_token_accuracy": 0.9106054306030273, + "num_tokens": 16103836.0, + "step": 1808 + }, + { + "epoch": 1.3746200607902734, + "grad_norm": 1.7995736598968506, + "learning_rate": 3.050466791868254e-06, + "loss": 0.4982220530509949, + "mean_token_accuracy": 0.8298169374465942, + "num_tokens": 16114727.0, + "step": 1809 + }, + { + "epoch": 1.3753799392097266, + "grad_norm": 1.9643093347549438, + "learning_rate": 3.048423605016815e-06, + "loss": 0.5076829195022583, + "mean_token_accuracy": 0.8303098678588867, + "num_tokens": 16129491.0, + "step": 1810 + }, + { + "epoch": 1.3761398176291793, + "grad_norm": 3.505594491958618, + "learning_rate": 3.0463800332605787e-06, + "loss": 0.27466052770614624, + "mean_token_accuracy": 0.9018045663833618, + "num_tokens": 16132640.0, + "step": 1811 + }, + { + "epoch": 1.3768996960486322, + "grad_norm": 1.798437237739563, + "learning_rate": 3.0443360780338034e-06, + "loss": 0.4004853069782257, + "mean_token_accuracy": 0.8569544553756714, + "num_tokens": 16143317.0, + "step": 1812 + }, + { + "epoch": 1.377659574468085, + "grad_norm": 2.276740789413452, + "learning_rate": 3.042291740771014e-06, + "loss": 0.3823797106742859, + "mean_token_accuracy": 0.8764113783836365, + "num_tokens": 16148898.0, + "step": 1813 + }, + { + "epoch": 1.378419452887538, + "grad_norm": 2.5051357746124268, + "learning_rate": 3.0402470229070057e-06, + "loss": 0.40365856885910034, + "mean_token_accuracy": 0.8809891939163208, + "num_tokens": 16153815.0, + "step": 1814 + }, + { + "epoch": 1.3791793313069909, + "grad_norm": 1.2379236221313477, + "learning_rate": 3.03820192587684e-06, + "loss": 0.3955119848251343, + "mean_token_accuracy": 0.8536627292633057, + "num_tokens": 16167783.0, + "step": 1815 + }, + { + "epoch": 1.3799392097264438, + "grad_norm": 2.2286343574523926, + "learning_rate": 3.036156451115846e-06, + "loss": 0.39647501707077026, + "mean_token_accuracy": 0.8621993064880371, + "num_tokens": 16174707.0, + "step": 1816 + }, + { + "epoch": 1.3806990881458967, + "grad_norm": 1.884639024734497, + "learning_rate": 3.034110600059616e-06, + "loss": 0.31612110137939453, + "mean_token_accuracy": 0.8942475318908691, + "num_tokens": 16181919.0, + "step": 1817 + }, + { + "epoch": 1.3814589665653496, + "grad_norm": 1.891312599182129, + "learning_rate": 3.0320643741440052e-06, + "loss": 0.46209126710891724, + "mean_token_accuracy": 0.8374713659286499, + "num_tokens": 16189276.0, + "step": 1818 + }, + { + "epoch": 1.3822188449848025, + "grad_norm": 2.507478713989258, + "learning_rate": 3.0300177748051375e-06, + "loss": 0.37601593136787415, + "mean_token_accuracy": 0.8633589148521423, + "num_tokens": 16194346.0, + "step": 1819 + }, + { + "epoch": 1.3829787234042552, + "grad_norm": 1.5046696662902832, + "learning_rate": 3.0279708034793907e-06, + "loss": 0.3284982144832611, + "mean_token_accuracy": 0.8792630434036255, + "num_tokens": 16205457.0, + "step": 1820 + }, + { + "epoch": 1.3837386018237083, + "grad_norm": 2.4244449138641357, + "learning_rate": 3.025923461603412e-06, + "loss": 0.40939009189605713, + "mean_token_accuracy": 0.8596426248550415, + "num_tokens": 16211866.0, + "step": 1821 + }, + { + "epoch": 1.384498480243161, + "grad_norm": 2.8656933307647705, + "learning_rate": 3.0238757506141013e-06, + "loss": 0.4397110044956207, + "mean_token_accuracy": 0.8597331047058105, + "num_tokens": 16216607.0, + "step": 1822 + }, + { + "epoch": 1.385258358662614, + "grad_norm": 2.0718610286712646, + "learning_rate": 3.0218276719486245e-06, + "loss": 0.49057573080062866, + "mean_token_accuracy": 0.8325331211090088, + "num_tokens": 16224014.0, + "step": 1823 + }, + { + "epoch": 1.3860182370820668, + "grad_norm": 1.054450273513794, + "learning_rate": 3.019779227044398e-06, + "loss": 0.3758106827735901, + "mean_token_accuracy": 0.8689473867416382, + "num_tokens": 16248627.0, + "step": 1824 + }, + { + "epoch": 1.3867781155015197, + "grad_norm": 2.1115148067474365, + "learning_rate": 3.0177304173391038e-06, + "loss": 0.502967119216919, + "mean_token_accuracy": 0.823198676109314, + "num_tokens": 16256255.0, + "step": 1825 + }, + { + "epoch": 1.3875379939209727, + "grad_norm": 2.207277297973633, + "learning_rate": 3.015681244270672e-06, + "loss": 0.3458971083164215, + "mean_token_accuracy": 0.8930196762084961, + "num_tokens": 16261823.0, + "step": 1826 + }, + { + "epoch": 1.3882978723404256, + "grad_norm": 1.289669156074524, + "learning_rate": 3.0136317092772923e-06, + "loss": 0.4422765374183655, + "mean_token_accuracy": 0.8358346819877625, + "num_tokens": 16280659.0, + "step": 1827 + }, + { + "epoch": 1.3890577507598785, + "grad_norm": 2.233865737915039, + "learning_rate": 3.0115818137974066e-06, + "loss": 0.3643006384372711, + "mean_token_accuracy": 0.8682862520217896, + "num_tokens": 16286356.0, + "step": 1828 + }, + { + "epoch": 1.3898176291793314, + "grad_norm": 1.0950042009353638, + "learning_rate": 3.0095315592697126e-06, + "loss": 0.34712421894073486, + "mean_token_accuracy": 0.8578766584396362, + "num_tokens": 16307298.0, + "step": 1829 + }, + { + "epoch": 1.3905775075987843, + "grad_norm": 1.1708037853240967, + "learning_rate": 3.007480947133155e-06, + "loss": 0.33152541518211365, + "mean_token_accuracy": 0.894973874092102, + "num_tokens": 16323232.0, + "step": 1830 + }, + { + "epoch": 1.391337386018237, + "grad_norm": 1.2226970195770264, + "learning_rate": 3.0054299788269343e-06, + "loss": 0.3915635943412781, + "mean_token_accuracy": 0.8575779795646667, + "num_tokens": 16339273.0, + "step": 1831 + }, + { + "epoch": 1.39209726443769, + "grad_norm": 1.2226042747497559, + "learning_rate": 3.0033786557904982e-06, + "loss": 0.45846253633499146, + "mean_token_accuracy": 0.8290432691574097, + "num_tokens": 16360145.0, + "step": 1832 + }, + { + "epoch": 1.3928571428571428, + "grad_norm": 2.0117406845092773, + "learning_rate": 3.001326979463545e-06, + "loss": 0.3837882876396179, + "mean_token_accuracy": 0.8941739797592163, + "num_tokens": 16366602.0, + "step": 1833 + }, + { + "epoch": 1.3936170212765957, + "grad_norm": 1.8419997692108154, + "learning_rate": 2.9992749512860177e-06, + "loss": 0.40777021646499634, + "mean_token_accuracy": 0.854655385017395, + "num_tokens": 16375611.0, + "step": 1834 + }, + { + "epoch": 1.3943768996960486, + "grad_norm": 1.9405122995376587, + "learning_rate": 2.9972225726981114e-06, + "loss": 0.46685922145843506, + "mean_token_accuracy": 0.8493201732635498, + "num_tokens": 16384878.0, + "step": 1835 + }, + { + "epoch": 1.3951367781155015, + "grad_norm": 1.2425674200057983, + "learning_rate": 2.995169845140264e-06, + "loss": 0.394692063331604, + "mean_token_accuracy": 0.851348876953125, + "num_tokens": 16404452.0, + "step": 1836 + }, + { + "epoch": 1.3958966565349544, + "grad_norm": 1.2215365171432495, + "learning_rate": 2.9931167700531575e-06, + "loss": 0.31412452459335327, + "mean_token_accuracy": 0.882760763168335, + "num_tokens": 16419358.0, + "step": 1837 + }, + { + "epoch": 1.3966565349544073, + "grad_norm": 1.912168025970459, + "learning_rate": 2.9910633488777198e-06, + "loss": 0.5065487623214722, + "mean_token_accuracy": 0.8524355292320251, + "num_tokens": 16430418.0, + "step": 1838 + }, + { + "epoch": 1.3974164133738602, + "grad_norm": 2.2173948287963867, + "learning_rate": 2.989009583055121e-06, + "loss": 0.4290938377380371, + "mean_token_accuracy": 0.8381836414337158, + "num_tokens": 16438267.0, + "step": 1839 + }, + { + "epoch": 1.3981762917933132, + "grad_norm": 1.8293484449386597, + "learning_rate": 2.9869554740267726e-06, + "loss": 0.41683733463287354, + "mean_token_accuracy": 0.8548779487609863, + "num_tokens": 16447382.0, + "step": 1840 + }, + { + "epoch": 1.398936170212766, + "grad_norm": 1.835015892982483, + "learning_rate": 2.9849010232343274e-06, + "loss": 0.5080599784851074, + "mean_token_accuracy": 0.8193596601486206, + "num_tokens": 16458541.0, + "step": 1841 + }, + { + "epoch": 1.3996960486322187, + "grad_norm": 2.031339645385742, + "learning_rate": 2.982846232119679e-06, + "loss": 0.5168882012367249, + "mean_token_accuracy": 0.8525956869125366, + "num_tokens": 16467747.0, + "step": 1842 + }, + { + "epoch": 1.4004559270516717, + "grad_norm": 1.5554167032241821, + "learning_rate": 2.9807911021249573e-06, + "loss": 0.35098958015441895, + "mean_token_accuracy": 0.888373851776123, + "num_tokens": 16479319.0, + "step": 1843 + }, + { + "epoch": 1.4012158054711246, + "grad_norm": 1.7183740139007568, + "learning_rate": 2.9787356346925327e-06, + "loss": 0.41263148188591003, + "mean_token_accuracy": 0.8478364944458008, + "num_tokens": 16489952.0, + "step": 1844 + }, + { + "epoch": 1.4019756838905775, + "grad_norm": 1.7743209600448608, + "learning_rate": 2.9766798312650112e-06, + "loss": 0.4211183190345764, + "mean_token_accuracy": 0.8641136884689331, + "num_tokens": 16498655.0, + "step": 1845 + }, + { + "epoch": 1.4027355623100304, + "grad_norm": 2.141300916671753, + "learning_rate": 2.9746236932852355e-06, + "loss": 0.49548980593681335, + "mean_token_accuracy": 0.8304252028465271, + "num_tokens": 16506348.0, + "step": 1846 + }, + { + "epoch": 1.4034954407294833, + "grad_norm": 2.341571807861328, + "learning_rate": 2.9725672221962804e-06, + "loss": 0.40804803371429443, + "mean_token_accuracy": 0.8545800447463989, + "num_tokens": 16513091.0, + "step": 1847 + }, + { + "epoch": 1.4042553191489362, + "grad_norm": 1.934428095817566, + "learning_rate": 2.9705104194414587e-06, + "loss": 0.30029812455177307, + "mean_token_accuracy": 0.9032052755355835, + "num_tokens": 16519455.0, + "step": 1848 + }, + { + "epoch": 1.405015197568389, + "grad_norm": 1.420804500579834, + "learning_rate": 2.9684532864643123e-06, + "loss": 0.4384060502052307, + "mean_token_accuracy": 0.8465110063552856, + "num_tokens": 16533222.0, + "step": 1849 + }, + { + "epoch": 1.405775075987842, + "grad_norm": 2.1180737018585205, + "learning_rate": 2.9663958247086165e-06, + "loss": 0.3915565609931946, + "mean_token_accuracy": 0.8633890748023987, + "num_tokens": 16539489.0, + "step": 1850 + }, + { + "epoch": 1.4065349544072947, + "grad_norm": 1.408048152923584, + "learning_rate": 2.964338035618378e-06, + "loss": 0.46166157722473145, + "mean_token_accuracy": 0.8305013179779053, + "num_tokens": 16555785.0, + "step": 1851 + }, + { + "epoch": 1.4072948328267478, + "grad_norm": 1.3418530225753784, + "learning_rate": 2.9622799206378306e-06, + "loss": 0.5314373970031738, + "mean_token_accuracy": 0.81779944896698, + "num_tokens": 16578111.0, + "step": 1852 + }, + { + "epoch": 1.4080547112462005, + "grad_norm": 1.4634262323379517, + "learning_rate": 2.9602214812114414e-06, + "loss": 0.4859408140182495, + "mean_token_accuracy": 0.8261818885803223, + "num_tokens": 16591976.0, + "step": 1853 + }, + { + "epoch": 1.4088145896656534, + "grad_norm": 1.4840295314788818, + "learning_rate": 2.9581627187838997e-06, + "loss": 0.4079628586769104, + "mean_token_accuracy": 0.8549603223800659, + "num_tokens": 16603631.0, + "step": 1854 + }, + { + "epoch": 1.4095744680851063, + "grad_norm": 2.1474642753601074, + "learning_rate": 2.956103634800126e-06, + "loss": 0.32997995615005493, + "mean_token_accuracy": 0.8836915493011475, + "num_tokens": 16609875.0, + "step": 1855 + }, + { + "epoch": 1.4103343465045592, + "grad_norm": 2.627460241317749, + "learning_rate": 2.9540442307052643e-06, + "loss": 0.3229186236858368, + "mean_token_accuracy": 0.8852157592773438, + "num_tokens": 16614113.0, + "step": 1856 + }, + { + "epoch": 1.4110942249240122, + "grad_norm": 1.9569811820983887, + "learning_rate": 2.9519845079446824e-06, + "loss": 0.5057883858680725, + "mean_token_accuracy": 0.8585711717605591, + "num_tokens": 16624611.0, + "step": 1857 + }, + { + "epoch": 1.411854103343465, + "grad_norm": 2.0604090690612793, + "learning_rate": 2.949924467963975e-06, + "loss": 0.4681510329246521, + "mean_token_accuracy": 0.8390560150146484, + "num_tokens": 16632938.0, + "step": 1858 + }, + { + "epoch": 1.412613981762918, + "grad_norm": 2.5430450439453125, + "learning_rate": 2.9478641122089563e-06, + "loss": 0.3090999126434326, + "mean_token_accuracy": 0.8943990468978882, + "num_tokens": 16637135.0, + "step": 1859 + }, + { + "epoch": 1.4133738601823709, + "grad_norm": 1.3275387287139893, + "learning_rate": 2.945803442125663e-06, + "loss": 0.3592180013656616, + "mean_token_accuracy": 0.8678265810012817, + "num_tokens": 16650322.0, + "step": 1860 + }, + { + "epoch": 1.4141337386018238, + "grad_norm": 1.9070929288864136, + "learning_rate": 2.943742459160354e-06, + "loss": 0.5332518815994263, + "mean_token_accuracy": 0.8475706577301025, + "num_tokens": 16660240.0, + "step": 1861 + }, + { + "epoch": 1.4148936170212765, + "grad_norm": 2.8724546432495117, + "learning_rate": 2.9416811647595052e-06, + "loss": 0.5052884817123413, + "mean_token_accuracy": 0.8363175392150879, + "num_tokens": 16665481.0, + "step": 1862 + }, + { + "epoch": 1.4156534954407296, + "grad_norm": 4.203817844390869, + "learning_rate": 2.939619560369813e-06, + "loss": 0.546925961971283, + "mean_token_accuracy": 0.834044337272644, + "num_tokens": 16669615.0, + "step": 1863 + }, + { + "epoch": 1.4164133738601823, + "grad_norm": 1.6466281414031982, + "learning_rate": 2.9375576474381907e-06, + "loss": 0.3474533259868622, + "mean_token_accuracy": 0.8571163415908813, + "num_tokens": 16678893.0, + "step": 1864 + }, + { + "epoch": 1.4171732522796352, + "grad_norm": 1.8885842561721802, + "learning_rate": 2.9354954274117683e-06, + "loss": 0.3726021349430084, + "mean_token_accuracy": 0.8629094958305359, + "num_tokens": 16685939.0, + "step": 1865 + }, + { + "epoch": 1.417933130699088, + "grad_norm": 2.830599784851074, + "learning_rate": 2.9334329017378898e-06, + "loss": 0.4138668477535248, + "mean_token_accuracy": 0.8670746088027954, + "num_tokens": 16690012.0, + "step": 1866 + }, + { + "epoch": 1.418693009118541, + "grad_norm": 1.6838961839675903, + "learning_rate": 2.9313700718641167e-06, + "loss": 0.33954259753227234, + "mean_token_accuracy": 0.8660278916358948, + "num_tokens": 16700061.0, + "step": 1867 + }, + { + "epoch": 1.419452887537994, + "grad_norm": 2.8767011165618896, + "learning_rate": 2.9293069392382224e-06, + "loss": 0.4650302827358246, + "mean_token_accuracy": 0.8448452949523926, + "num_tokens": 16705072.0, + "step": 1868 + }, + { + "epoch": 1.4202127659574468, + "grad_norm": 1.5901305675506592, + "learning_rate": 2.927243505308192e-06, + "loss": 0.40838998556137085, + "mean_token_accuracy": 0.8560664653778076, + "num_tokens": 16714763.0, + "step": 1869 + }, + { + "epoch": 1.4209726443768997, + "grad_norm": 1.3293657302856445, + "learning_rate": 2.925179771522223e-06, + "loss": 0.34712862968444824, + "mean_token_accuracy": 0.8633697032928467, + "num_tokens": 16729575.0, + "step": 1870 + }, + { + "epoch": 1.4217325227963526, + "grad_norm": 1.7465964555740356, + "learning_rate": 2.9231157393287234e-06, + "loss": 0.48190903663635254, + "mean_token_accuracy": 0.8255834579467773, + "num_tokens": 16742529.0, + "step": 1871 + }, + { + "epoch": 1.4224924012158056, + "grad_norm": 1.865749716758728, + "learning_rate": 2.9210514101763116e-06, + "loss": 0.4912028908729553, + "mean_token_accuracy": 0.8309572339057922, + "num_tokens": 16753989.0, + "step": 1872 + }, + { + "epoch": 1.4232522796352582, + "grad_norm": 2.55780291557312, + "learning_rate": 2.9189867855138103e-06, + "loss": 0.4550635814666748, + "mean_token_accuracy": 0.8584091067314148, + "num_tokens": 16758906.0, + "step": 1873 + }, + { + "epoch": 1.4240121580547114, + "grad_norm": 1.867530107498169, + "learning_rate": 2.9169218667902562e-06, + "loss": 0.3524911105632782, + "mean_token_accuracy": 0.8715004920959473, + "num_tokens": 16765969.0, + "step": 1874 + }, + { + "epoch": 1.424772036474164, + "grad_norm": 1.8886862993240356, + "learning_rate": 2.9148566554548857e-06, + "loss": 0.37144535779953003, + "mean_token_accuracy": 0.8640961050987244, + "num_tokens": 16773935.0, + "step": 1875 + }, + { + "epoch": 1.425531914893617, + "grad_norm": 1.266065239906311, + "learning_rate": 2.912791152957145e-06, + "loss": 0.3341747522354126, + "mean_token_accuracy": 0.8929134607315063, + "num_tokens": 16787780.0, + "step": 1876 + }, + { + "epoch": 1.4262917933130699, + "grad_norm": 2.524888753890991, + "learning_rate": 2.9107253607466833e-06, + "loss": 0.33709171414375305, + "mean_token_accuracy": 0.8857531547546387, + "num_tokens": 16792753.0, + "step": 1877 + }, + { + "epoch": 1.4270516717325228, + "grad_norm": 1.9269018173217773, + "learning_rate": 2.908659280273354e-06, + "loss": 0.32599249482154846, + "mean_token_accuracy": 0.8777773380279541, + "num_tokens": 16799904.0, + "step": 1878 + }, + { + "epoch": 1.4278115501519757, + "grad_norm": 1.9844375848770142, + "learning_rate": 2.9065929129872097e-06, + "loss": 0.4086732268333435, + "mean_token_accuracy": 0.8505409955978394, + "num_tokens": 16807774.0, + "step": 1879 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 4.0958662033081055, + "learning_rate": 2.9045262603385073e-06, + "loss": 0.3838827610015869, + "mean_token_accuracy": 0.877601146697998, + "num_tokens": 16810908.0, + "step": 1880 + }, + { + "epoch": 1.4293313069908815, + "grad_norm": 1.7323768138885498, + "learning_rate": 2.902459323777704e-06, + "loss": 0.37459003925323486, + "mean_token_accuracy": 0.8655836582183838, + "num_tokens": 16819494.0, + "step": 1881 + }, + { + "epoch": 1.4300911854103344, + "grad_norm": 2.608043670654297, + "learning_rate": 2.900392104755455e-06, + "loss": 0.5798726677894592, + "mean_token_accuracy": 0.8382592797279358, + "num_tokens": 16827745.0, + "step": 1882 + }, + { + "epoch": 1.4308510638297873, + "grad_norm": 1.3262078762054443, + "learning_rate": 2.8983246047226137e-06, + "loss": 0.3724595904350281, + "mean_token_accuracy": 0.8651963472366333, + "num_tokens": 16844171.0, + "step": 1883 + }, + { + "epoch": 1.43161094224924, + "grad_norm": 1.7250545024871826, + "learning_rate": 2.8962568251302327e-06, + "loss": 0.3478979468345642, + "mean_token_accuracy": 0.8807886242866516, + "num_tokens": 16852838.0, + "step": 1884 + }, + { + "epoch": 1.4323708206686931, + "grad_norm": 2.114525318145752, + "learning_rate": 2.8941887674295573e-06, + "loss": 0.5156140327453613, + "mean_token_accuracy": 0.825178861618042, + "num_tokens": 16861087.0, + "step": 1885 + }, + { + "epoch": 1.4331306990881458, + "grad_norm": 2.400829792022705, + "learning_rate": 2.892120433072031e-06, + "loss": 0.2807392477989197, + "mean_token_accuracy": 0.8907361030578613, + "num_tokens": 16866557.0, + "step": 1886 + }, + { + "epoch": 1.4338905775075987, + "grad_norm": 2.490880012512207, + "learning_rate": 2.8900518235092908e-06, + "loss": 0.2615952491760254, + "mean_token_accuracy": 0.9152894020080566, + "num_tokens": 16871357.0, + "step": 1887 + }, + { + "epoch": 1.4346504559270516, + "grad_norm": 1.9058431386947632, + "learning_rate": 2.887982940193165e-06, + "loss": 0.43623363971710205, + "mean_token_accuracy": 0.84696364402771, + "num_tokens": 16879016.0, + "step": 1888 + }, + { + "epoch": 1.4354103343465046, + "grad_norm": 1.4520210027694702, + "learning_rate": 2.8859137845756785e-06, + "loss": 0.3961856961250305, + "mean_token_accuracy": 0.8518897294998169, + "num_tokens": 16892254.0, + "step": 1889 + }, + { + "epoch": 1.4361702127659575, + "grad_norm": 2.500274896621704, + "learning_rate": 2.8838443581090415e-06, + "loss": 0.41457289457321167, + "mean_token_accuracy": 0.8751448392868042, + "num_tokens": 16897156.0, + "step": 1890 + }, + { + "epoch": 1.4369300911854104, + "grad_norm": 2.9312057495117188, + "learning_rate": 2.8817746622456585e-06, + "loss": 0.45875269174575806, + "mean_token_accuracy": 0.8411039113998413, + "num_tokens": 16902291.0, + "step": 1891 + }, + { + "epoch": 1.4376899696048633, + "grad_norm": 2.367419481277466, + "learning_rate": 2.879704698438121e-06, + "loss": 0.3643629848957062, + "mean_token_accuracy": 0.8771071434020996, + "num_tokens": 16908128.0, + "step": 1892 + }, + { + "epoch": 1.4384498480243162, + "grad_norm": 1.9907705783843994, + "learning_rate": 2.8776344681392106e-06, + "loss": 0.3206835389137268, + "mean_token_accuracy": 0.879996657371521, + "num_tokens": 16914918.0, + "step": 1893 + }, + { + "epoch": 1.439209726443769, + "grad_norm": 3.536956310272217, + "learning_rate": 2.875563972801893e-06, + "loss": 0.3640141785144806, + "mean_token_accuracy": 0.8814959526062012, + "num_tokens": 16918187.0, + "step": 1894 + }, + { + "epoch": 1.4399696048632218, + "grad_norm": 1.3451156616210938, + "learning_rate": 2.8734932138793226e-06, + "loss": 0.3427346348762512, + "mean_token_accuracy": 0.8835382461547852, + "num_tokens": 16931135.0, + "step": 1895 + }, + { + "epoch": 1.4407294832826747, + "grad_norm": 2.0735955238342285, + "learning_rate": 2.871422192824837e-06, + "loss": 0.4265315532684326, + "mean_token_accuracy": 0.8452677726745605, + "num_tokens": 16937995.0, + "step": 1896 + }, + { + "epoch": 1.4414893617021276, + "grad_norm": 1.5124932527542114, + "learning_rate": 2.8693509110919597e-06, + "loss": 0.497121661901474, + "mean_token_accuracy": 0.815092921257019, + "num_tokens": 16952743.0, + "step": 1897 + }, + { + "epoch": 1.4422492401215805, + "grad_norm": 3.716669797897339, + "learning_rate": 2.867279370134395e-06, + "loss": 0.5452651381492615, + "mean_token_accuracy": 0.8150380849838257, + "num_tokens": 16956797.0, + "step": 1898 + }, + { + "epoch": 1.4430091185410334, + "grad_norm": 1.3571398258209229, + "learning_rate": 2.8652075714060296e-06, + "loss": 0.4249724745750427, + "mean_token_accuracy": 0.8675867915153503, + "num_tokens": 16974494.0, + "step": 1899 + }, + { + "epoch": 1.4437689969604863, + "grad_norm": 2.310673475265503, + "learning_rate": 2.863135516360932e-06, + "loss": 0.39368677139282227, + "mean_token_accuracy": 0.878392219543457, + "num_tokens": 16980612.0, + "step": 1900 + }, + { + "epoch": 1.4445288753799392, + "grad_norm": 1.9025533199310303, + "learning_rate": 2.8610632064533517e-06, + "loss": 0.4786127805709839, + "mean_token_accuracy": 0.8720556497573853, + "num_tokens": 16992262.0, + "step": 1901 + }, + { + "epoch": 1.4452887537993921, + "grad_norm": 2.528564453125, + "learning_rate": 2.8589906431377133e-06, + "loss": 0.4223094582557678, + "mean_token_accuracy": 0.8513246178627014, + "num_tokens": 16997717.0, + "step": 1902 + }, + { + "epoch": 1.446048632218845, + "grad_norm": 1.010425329208374, + "learning_rate": 2.8569178278686222e-06, + "loss": 0.3908255696296692, + "mean_token_accuracy": 0.8620463609695435, + "num_tokens": 17020903.0, + "step": 1903 + }, + { + "epoch": 1.4468085106382977, + "grad_norm": 1.5760232210159302, + "learning_rate": 2.8548447621008614e-06, + "loss": 0.4134044051170349, + "mean_token_accuracy": 0.8472093343734741, + "num_tokens": 17035250.0, + "step": 1904 + }, + { + "epoch": 1.4475683890577509, + "grad_norm": 2.0668535232543945, + "learning_rate": 2.8527714472893866e-06, + "loss": 0.44095730781555176, + "mean_token_accuracy": 0.881983757019043, + "num_tokens": 17042170.0, + "step": 1905 + }, + { + "epoch": 1.4483282674772036, + "grad_norm": 1.1620599031448364, + "learning_rate": 2.85069788488933e-06, + "loss": 0.3607163429260254, + "mean_token_accuracy": 0.8684282898902893, + "num_tokens": 17061937.0, + "step": 1906 + }, + { + "epoch": 1.4490881458966565, + "grad_norm": 2.1316568851470947, + "learning_rate": 2.8486240763559984e-06, + "loss": 0.3478124141693115, + "mean_token_accuracy": 0.8772403001785278, + "num_tokens": 17068628.0, + "step": 1907 + }, + { + "epoch": 1.4498480243161094, + "grad_norm": 2.4756391048431396, + "learning_rate": 2.8465500231448707e-06, + "loss": 0.46441152691841125, + "mean_token_accuracy": 0.8436450958251953, + "num_tokens": 17075495.0, + "step": 1908 + }, + { + "epoch": 1.4506079027355623, + "grad_norm": 2.249720573425293, + "learning_rate": 2.844475726711595e-06, + "loss": 0.41565513610839844, + "mean_token_accuracy": 0.8525094985961914, + "num_tokens": 17080940.0, + "step": 1909 + }, + { + "epoch": 1.4513677811550152, + "grad_norm": 2.3081841468811035, + "learning_rate": 2.8424011885119956e-06, + "loss": 0.49903199076652527, + "mean_token_accuracy": 0.8212426900863647, + "num_tokens": 17092024.0, + "step": 1910 + }, + { + "epoch": 1.452127659574468, + "grad_norm": 1.2929959297180176, + "learning_rate": 2.8403264100020613e-06, + "loss": 0.47038257122039795, + "mean_token_accuracy": 0.8319816589355469, + "num_tokens": 17108840.0, + "step": 1911 + }, + { + "epoch": 1.452887537993921, + "grad_norm": 1.6476463079452515, + "learning_rate": 2.8382513926379508e-06, + "loss": 0.42287829518318176, + "mean_token_accuracy": 0.8555682897567749, + "num_tokens": 17119704.0, + "step": 1912 + }, + { + "epoch": 1.453647416413374, + "grad_norm": 1.759998083114624, + "learning_rate": 2.836176137875993e-06, + "loss": 0.40904951095581055, + "mean_token_accuracy": 0.8698266744613647, + "num_tokens": 17130676.0, + "step": 1913 + }, + { + "epoch": 1.4544072948328268, + "grad_norm": 1.510909914970398, + "learning_rate": 2.8341006471726817e-06, + "loss": 0.47834792733192444, + "mean_token_accuracy": 0.8335825204849243, + "num_tokens": 17146304.0, + "step": 1914 + }, + { + "epoch": 1.4551671732522795, + "grad_norm": 3.538071632385254, + "learning_rate": 2.832024921984674e-06, + "loss": 0.34059035778045654, + "mean_token_accuracy": 0.8769031763076782, + "num_tokens": 17150458.0, + "step": 1915 + }, + { + "epoch": 1.4559270516717326, + "grad_norm": 2.3368659019470215, + "learning_rate": 2.8299489637687955e-06, + "loss": 0.43068382143974304, + "mean_token_accuracy": 0.845360517501831, + "num_tokens": 17157368.0, + "step": 1916 + }, + { + "epoch": 1.4566869300911853, + "grad_norm": 1.8720396757125854, + "learning_rate": 2.8278727739820334e-06, + "loss": 0.37013399600982666, + "mean_token_accuracy": 0.854241132736206, + "num_tokens": 17166325.0, + "step": 1917 + }, + { + "epoch": 1.4574468085106382, + "grad_norm": 1.6706892251968384, + "learning_rate": 2.825796354081537e-06, + "loss": 0.5397020578384399, + "mean_token_accuracy": 0.8309713006019592, + "num_tokens": 17178920.0, + "step": 1918 + }, + { + "epoch": 1.4582066869300911, + "grad_norm": 2.729210376739502, + "learning_rate": 2.8237197055246175e-06, + "loss": 0.25137859582901, + "mean_token_accuracy": 0.9148792028427124, + "num_tokens": 17183107.0, + "step": 1919 + }, + { + "epoch": 1.458966565349544, + "grad_norm": 3.023500680923462, + "learning_rate": 2.821642829768748e-06, + "loss": 0.43312495946884155, + "mean_token_accuracy": 0.8481811285018921, + "num_tokens": 17187853.0, + "step": 1920 + }, + { + "epoch": 1.459726443768997, + "grad_norm": 1.8108519315719604, + "learning_rate": 2.8195657282715595e-06, + "loss": 0.5101792216300964, + "mean_token_accuracy": 0.8315553069114685, + "num_tokens": 17199247.0, + "step": 1921 + }, + { + "epoch": 1.4604863221884499, + "grad_norm": 2.0262672901153564, + "learning_rate": 2.817488402490841e-06, + "loss": 0.4449934959411621, + "mean_token_accuracy": 0.8634527325630188, + "num_tokens": 17206348.0, + "step": 1922 + }, + { + "epoch": 1.4612462006079028, + "grad_norm": 2.6163926124572754, + "learning_rate": 2.8154108538845405e-06, + "loss": 0.43052345514297485, + "mean_token_accuracy": 0.8375401496887207, + "num_tokens": 17211702.0, + "step": 1923 + }, + { + "epoch": 1.4620060790273557, + "grad_norm": 2.0854408740997314, + "learning_rate": 2.813333083910761e-06, + "loss": 0.5011380910873413, + "mean_token_accuracy": 0.8359915018081665, + "num_tokens": 17219096.0, + "step": 1924 + }, + { + "epoch": 1.4627659574468086, + "grad_norm": 2.2081687450408936, + "learning_rate": 2.8112550940277615e-06, + "loss": 0.5239193439483643, + "mean_token_accuracy": 0.8499593734741211, + "num_tokens": 17229266.0, + "step": 1925 + }, + { + "epoch": 1.4635258358662613, + "grad_norm": 1.798343539237976, + "learning_rate": 2.809176885693956e-06, + "loss": 0.4515029191970825, + "mean_token_accuracy": 0.8400485515594482, + "num_tokens": 17239280.0, + "step": 1926 + }, + { + "epoch": 1.4642857142857144, + "grad_norm": 1.897887945175171, + "learning_rate": 2.807098460367911e-06, + "loss": 0.35935714840888977, + "mean_token_accuracy": 0.8776072263717651, + "num_tokens": 17247132.0, + "step": 1927 + }, + { + "epoch": 1.465045592705167, + "grad_norm": 2.705836296081543, + "learning_rate": 2.8050198195083445e-06, + "loss": 0.3728443682193756, + "mean_token_accuracy": 0.8649885654449463, + "num_tokens": 17251865.0, + "step": 1928 + }, + { + "epoch": 1.46580547112462, + "grad_norm": 1.841178059577942, + "learning_rate": 2.802940964574127e-06, + "loss": 0.40604841709136963, + "mean_token_accuracy": 0.8537783622741699, + "num_tokens": 17260163.0, + "step": 1929 + }, + { + "epoch": 1.466565349544073, + "grad_norm": 2.7393605709075928, + "learning_rate": 2.800861897024279e-06, + "loss": 0.39346879720687866, + "mean_token_accuracy": 0.8628787994384766, + "num_tokens": 17264876.0, + "step": 1930 + }, + { + "epoch": 1.4673252279635258, + "grad_norm": 1.84367835521698, + "learning_rate": 2.798782618317971e-06, + "loss": 0.37411895394325256, + "mean_token_accuracy": 0.8605265617370605, + "num_tokens": 17273049.0, + "step": 1931 + }, + { + "epoch": 1.4680851063829787, + "grad_norm": 1.6546733379364014, + "learning_rate": 2.796703129914519e-06, + "loss": 0.4997844099998474, + "mean_token_accuracy": 0.8267433643341064, + "num_tokens": 17285074.0, + "step": 1932 + }, + { + "epoch": 1.4688449848024316, + "grad_norm": 2.2749221324920654, + "learning_rate": 2.79462343327339e-06, + "loss": 0.35453367233276367, + "mean_token_accuracy": 0.8746850490570068, + "num_tokens": 17290273.0, + "step": 1933 + }, + { + "epoch": 1.4696048632218845, + "grad_norm": 1.7142518758773804, + "learning_rate": 2.7925435298541944e-06, + "loss": 0.345878541469574, + "mean_token_accuracy": 0.8600981831550598, + "num_tokens": 17301045.0, + "step": 1934 + }, + { + "epoch": 1.4703647416413375, + "grad_norm": 3.163342237472534, + "learning_rate": 2.7904634211166877e-06, + "loss": 0.4356975853443146, + "mean_token_accuracy": 0.8460350036621094, + "num_tokens": 17305108.0, + "step": 1935 + }, + { + "epoch": 1.4711246200607904, + "grad_norm": 1.6377612352371216, + "learning_rate": 2.7883831085207707e-06, + "loss": 0.4459729790687561, + "mean_token_accuracy": 0.8463394641876221, + "num_tokens": 17315479.0, + "step": 1936 + }, + { + "epoch": 1.471884498480243, + "grad_norm": 1.865268588066101, + "learning_rate": 2.7863025935264876e-06, + "loss": 0.394723117351532, + "mean_token_accuracy": 0.864177942276001, + "num_tokens": 17324795.0, + "step": 1937 + }, + { + "epoch": 1.4726443768996962, + "grad_norm": 1.241937518119812, + "learning_rate": 2.784221877594024e-06, + "loss": 0.2752220630645752, + "mean_token_accuracy": 0.8998259902000427, + "num_tokens": 17338000.0, + "step": 1938 + }, + { + "epoch": 1.4734042553191489, + "grad_norm": 1.8013651371002197, + "learning_rate": 2.7821409621837042e-06, + "loss": 0.4251005947589874, + "mean_token_accuracy": 0.8518919348716736, + "num_tokens": 17347351.0, + "step": 1939 + }, + { + "epoch": 1.4741641337386018, + "grad_norm": 1.2902207374572754, + "learning_rate": 2.7800598487559976e-06, + "loss": 0.3640727400779724, + "mean_token_accuracy": 0.8592870235443115, + "num_tokens": 17362335.0, + "step": 1940 + }, + { + "epoch": 1.4749240121580547, + "grad_norm": 2.5427513122558594, + "learning_rate": 2.777978538771508e-06, + "loss": 0.38166797161102295, + "mean_token_accuracy": 0.8653234839439392, + "num_tokens": 17367733.0, + "step": 1941 + }, + { + "epoch": 1.4756838905775076, + "grad_norm": 1.7793641090393066, + "learning_rate": 2.7758970336909795e-06, + "loss": 0.3113783895969391, + "mean_token_accuracy": 0.8812868595123291, + "num_tokens": 17375267.0, + "step": 1942 + }, + { + "epoch": 1.4764437689969605, + "grad_norm": 3.4031741619110107, + "learning_rate": 2.7738153349752923e-06, + "loss": 0.4800986647605896, + "mean_token_accuracy": 0.8336698412895203, + "num_tokens": 17379549.0, + "step": 1943 + }, + { + "epoch": 1.4772036474164134, + "grad_norm": 1.3451651334762573, + "learning_rate": 2.7717334440854634e-06, + "loss": 0.3115345239639282, + "mean_token_accuracy": 0.908623218536377, + "num_tokens": 17394455.0, + "step": 1944 + }, + { + "epoch": 1.4779635258358663, + "grad_norm": 1.980919599533081, + "learning_rate": 2.7696513624826422e-06, + "loss": 0.391154944896698, + "mean_token_accuracy": 0.8650267720222473, + "num_tokens": 17401931.0, + "step": 1945 + }, + { + "epoch": 1.4787234042553192, + "grad_norm": 1.0118765830993652, + "learning_rate": 2.7675690916281158e-06, + "loss": 0.3157956600189209, + "mean_token_accuracy": 0.8827471733093262, + "num_tokens": 17424144.0, + "step": 1946 + }, + { + "epoch": 1.4794832826747721, + "grad_norm": 1.579654335975647, + "learning_rate": 2.7654866329833e-06, + "loss": 0.4578486382961273, + "mean_token_accuracy": 0.8361750245094299, + "num_tokens": 17435769.0, + "step": 1947 + }, + { + "epoch": 1.4802431610942248, + "grad_norm": 1.7706717252731323, + "learning_rate": 2.763403988009746e-06, + "loss": 0.3564416170120239, + "mean_token_accuracy": 0.8689201474189758, + "num_tokens": 17444088.0, + "step": 1948 + }, + { + "epoch": 1.4810030395136777, + "grad_norm": 1.2264244556427002, + "learning_rate": 2.761321158169134e-06, + "loss": 0.30763837695121765, + "mean_token_accuracy": 0.8960219621658325, + "num_tokens": 17458096.0, + "step": 1949 + }, + { + "epoch": 1.4817629179331306, + "grad_norm": 1.214431881904602, + "learning_rate": 2.759238144923274e-06, + "loss": 0.49099457263946533, + "mean_token_accuracy": 0.8279136419296265, + "num_tokens": 17481062.0, + "step": 1950 + }, + { + "epoch": 1.4825227963525835, + "grad_norm": 1.593892216682434, + "learning_rate": 2.7571549497341044e-06, + "loss": 0.3745320737361908, + "mean_token_accuracy": 0.8690779209136963, + "num_tokens": 17490874.0, + "step": 1951 + }, + { + "epoch": 1.4832826747720365, + "grad_norm": 2.409924268722534, + "learning_rate": 2.755071574063692e-06, + "loss": 0.4310247600078583, + "mean_token_accuracy": 0.8521159291267395, + "num_tokens": 17496942.0, + "step": 1952 + }, + { + "epoch": 1.4840425531914894, + "grad_norm": 1.2557463645935059, + "learning_rate": 2.7529880193742297e-06, + "loss": 0.34304720163345337, + "mean_token_accuracy": 0.8748183250427246, + "num_tokens": 17514391.0, + "step": 1953 + }, + { + "epoch": 1.4848024316109423, + "grad_norm": 1.17310631275177, + "learning_rate": 2.7509042871280373e-06, + "loss": 0.3835817277431488, + "mean_token_accuracy": 0.8853274583816528, + "num_tokens": 17533289.0, + "step": 1954 + }, + { + "epoch": 1.4855623100303952, + "grad_norm": 1.5261479616165161, + "learning_rate": 2.748820378787558e-06, + "loss": 0.4799988865852356, + "mean_token_accuracy": 0.8252149820327759, + "num_tokens": 17544118.0, + "step": 1955 + }, + { + "epoch": 1.486322188449848, + "grad_norm": 2.030930757522583, + "learning_rate": 2.7467362958153585e-06, + "loss": 0.35690805315971375, + "mean_token_accuracy": 0.8959587216377258, + "num_tokens": 17550431.0, + "step": 1956 + }, + { + "epoch": 1.4870820668693008, + "grad_norm": 2.376520872116089, + "learning_rate": 2.7446520396741293e-06, + "loss": 0.262234091758728, + "mean_token_accuracy": 0.9054547548294067, + "num_tokens": 17554853.0, + "step": 1957 + }, + { + "epoch": 1.487841945288754, + "grad_norm": 1.6944479942321777, + "learning_rate": 2.742567611826681e-06, + "loss": 0.529259979724884, + "mean_token_accuracy": 0.8195339441299438, + "num_tokens": 17568016.0, + "step": 1958 + }, + { + "epoch": 1.4886018237082066, + "grad_norm": 2.833029270172119, + "learning_rate": 2.7404830137359445e-06, + "loss": 0.30229634046554565, + "mean_token_accuracy": 0.8933001756668091, + "num_tokens": 17572587.0, + "step": 1959 + }, + { + "epoch": 1.4893617021276595, + "grad_norm": 1.7040144205093384, + "learning_rate": 2.7383982468649715e-06, + "loss": 0.3166356682777405, + "mean_token_accuracy": 0.8871906399726868, + "num_tokens": 17580966.0, + "step": 1960 + }, + { + "epoch": 1.4901215805471124, + "grad_norm": 1.7539052963256836, + "learning_rate": 2.7363133126769326e-06, + "loss": 0.4231064021587372, + "mean_token_accuracy": 0.8708304166793823, + "num_tokens": 17590907.0, + "step": 1961 + }, + { + "epoch": 1.4908814589665653, + "grad_norm": 1.6198650598526, + "learning_rate": 2.7342282126351145e-06, + "loss": 0.4198967218399048, + "mean_token_accuracy": 0.8723280429840088, + "num_tokens": 17604291.0, + "step": 1962 + }, + { + "epoch": 1.4916413373860182, + "grad_norm": 1.8437711000442505, + "learning_rate": 2.73214294820292e-06, + "loss": 0.38923323154449463, + "mean_token_accuracy": 0.8697006106376648, + "num_tokens": 17612291.0, + "step": 1963 + }, + { + "epoch": 1.4924012158054711, + "grad_norm": 1.1129369735717773, + "learning_rate": 2.7300575208438684e-06, + "loss": 0.3107512593269348, + "mean_token_accuracy": 0.878618597984314, + "num_tokens": 17630073.0, + "step": 1964 + }, + { + "epoch": 1.493161094224924, + "grad_norm": 3.0210442543029785, + "learning_rate": 2.7279719320215924e-06, + "loss": 0.4630751609802246, + "mean_token_accuracy": 0.8567075729370117, + "num_tokens": 17634758.0, + "step": 1965 + }, + { + "epoch": 1.493920972644377, + "grad_norm": 2.8825972080230713, + "learning_rate": 2.725886183199839e-06, + "loss": 0.35351765155792236, + "mean_token_accuracy": 0.8711981773376465, + "num_tokens": 17639613.0, + "step": 1966 + }, + { + "epoch": 1.4946808510638299, + "grad_norm": 2.111238718032837, + "learning_rate": 2.723800275842468e-06, + "loss": 0.3529569208621979, + "mean_token_accuracy": 0.8679244518280029, + "num_tokens": 17645308.0, + "step": 1967 + }, + { + "epoch": 1.4954407294832825, + "grad_norm": 2.080509901046753, + "learning_rate": 2.7217142114134466e-06, + "loss": 0.43321219086647034, + "mean_token_accuracy": 0.8848220109939575, + "num_tokens": 17652292.0, + "step": 1968 + }, + { + "epoch": 1.4962006079027357, + "grad_norm": 2.8686363697052, + "learning_rate": 2.7196279913768587e-06, + "loss": 0.417035311460495, + "mean_token_accuracy": 0.8724601864814758, + "num_tokens": 17656908.0, + "step": 1969 + }, + { + "epoch": 1.4969604863221884, + "grad_norm": 3.294193744659424, + "learning_rate": 2.717541617196891e-06, + "loss": 0.3551934063434601, + "mean_token_accuracy": 0.8838565349578857, + "num_tokens": 17660590.0, + "step": 1970 + }, + { + "epoch": 1.4977203647416413, + "grad_norm": 1.766292929649353, + "learning_rate": 2.7154550903378425e-06, + "loss": 0.36521971225738525, + "mean_token_accuracy": 0.8810199499130249, + "num_tokens": 17668214.0, + "step": 1971 + }, + { + "epoch": 1.4984802431610942, + "grad_norm": 1.2127676010131836, + "learning_rate": 2.713368412264118e-06, + "loss": 0.35184425115585327, + "mean_token_accuracy": 0.8672580718994141, + "num_tokens": 17684736.0, + "step": 1972 + }, + { + "epoch": 1.499240121580547, + "grad_norm": 2.268256664276123, + "learning_rate": 2.711281584440228e-06, + "loss": 0.40115267038345337, + "mean_token_accuracy": 0.8517841100692749, + "num_tokens": 17691510.0, + "step": 1973 + }, + { + "epoch": 1.5, + "grad_norm": 2.7196054458618164, + "learning_rate": 2.70919460833079e-06, + "loss": 0.3819037675857544, + "mean_token_accuracy": 0.8765411376953125, + "num_tokens": 17696179.0, + "step": 1974 + }, + { + "epoch": 1.500759878419453, + "grad_norm": 2.969406843185425, + "learning_rate": 2.7071074854005206e-06, + "loss": 0.3922455608844757, + "mean_token_accuracy": 0.8796037435531616, + "num_tokens": 17700597.0, + "step": 1975 + }, + { + "epoch": 1.5015197568389058, + "grad_norm": 2.2965853214263916, + "learning_rate": 2.705020217114248e-06, + "loss": 0.5433666110038757, + "mean_token_accuracy": 0.809639036655426, + "num_tokens": 17708895.0, + "step": 1976 + }, + { + "epoch": 1.5022796352583585, + "grad_norm": 1.5584394931793213, + "learning_rate": 2.7029328049368942e-06, + "loss": 0.4736343324184418, + "mean_token_accuracy": 0.8197190761566162, + "num_tokens": 17725202.0, + "step": 1977 + }, + { + "epoch": 1.5030395136778116, + "grad_norm": 1.3903142213821411, + "learning_rate": 2.700845250333486e-06, + "loss": 0.4471571445465088, + "mean_token_accuracy": 0.839043140411377, + "num_tokens": 17742835.0, + "step": 1978 + }, + { + "epoch": 1.5037993920972643, + "grad_norm": 3.080716609954834, + "learning_rate": 2.69875755476915e-06, + "loss": 0.45760005712509155, + "mean_token_accuracy": 0.8366328477859497, + "num_tokens": 17747324.0, + "step": 1979 + }, + { + "epoch": 1.5045592705167175, + "grad_norm": 1.0150405168533325, + "learning_rate": 2.696669719709111e-06, + "loss": 0.33638954162597656, + "mean_token_accuracy": 0.8591676354408264, + "num_tokens": 17765565.0, + "step": 1980 + }, + { + "epoch": 1.5053191489361701, + "grad_norm": 2.402927875518799, + "learning_rate": 2.694581746618691e-06, + "loss": 0.4086601436138153, + "mean_token_accuracy": 0.8769911527633667, + "num_tokens": 17771275.0, + "step": 1981 + }, + { + "epoch": 1.506079027355623, + "grad_norm": 2.030583381652832, + "learning_rate": 2.6924936369633126e-06, + "loss": 0.5115457773208618, + "mean_token_accuracy": 0.8054746389389038, + "num_tokens": 17779999.0, + "step": 1982 + }, + { + "epoch": 1.506838905775076, + "grad_norm": 2.575199604034424, + "learning_rate": 2.6904053922084893e-06, + "loss": 0.363183856010437, + "mean_token_accuracy": 0.8716042637825012, + "num_tokens": 17785473.0, + "step": 1983 + }, + { + "epoch": 1.5075987841945289, + "grad_norm": 1.8497480154037476, + "learning_rate": 2.688317013819832e-06, + "loss": 0.4254384934902191, + "mean_token_accuracy": 0.8549597263336182, + "num_tokens": 17793812.0, + "step": 1984 + }, + { + "epoch": 1.5083586626139818, + "grad_norm": 1.7786511182785034, + "learning_rate": 2.686228503263045e-06, + "loss": 0.33400774002075195, + "mean_token_accuracy": 0.9027615189552307, + "num_tokens": 17801783.0, + "step": 1985 + }, + { + "epoch": 1.5091185410334347, + "grad_norm": 1.8365367650985718, + "learning_rate": 2.684139862003927e-06, + "loss": 0.35765063762664795, + "mean_token_accuracy": 0.8663736581802368, + "num_tokens": 17809562.0, + "step": 1986 + }, + { + "epoch": 1.5098784194528876, + "grad_norm": 1.8817477226257324, + "learning_rate": 2.682051091508365e-06, + "loss": 0.4627506732940674, + "mean_token_accuracy": 0.8358862400054932, + "num_tokens": 17819094.0, + "step": 1987 + }, + { + "epoch": 1.5106382978723403, + "grad_norm": 2.221547842025757, + "learning_rate": 2.679962193242338e-06, + "loss": 0.577020525932312, + "mean_token_accuracy": 0.80013108253479, + "num_tokens": 17826666.0, + "step": 1988 + }, + { + "epoch": 1.5113981762917934, + "grad_norm": 2.6618270874023438, + "learning_rate": 2.6778731686719177e-06, + "loss": 0.44632256031036377, + "mean_token_accuracy": 0.8611289262771606, + "num_tokens": 17833172.0, + "step": 1989 + }, + { + "epoch": 1.512158054711246, + "grad_norm": 2.9495689868927, + "learning_rate": 2.67578401926326e-06, + "loss": 0.3482511043548584, + "mean_token_accuracy": 0.8703314661979675, + "num_tokens": 17837220.0, + "step": 1990 + }, + { + "epoch": 1.5129179331306992, + "grad_norm": 2.0943644046783447, + "learning_rate": 2.6736947464826107e-06, + "loss": 0.2354314625263214, + "mean_token_accuracy": 0.9137634038925171, + "num_tokens": 17842712.0, + "step": 1991 + }, + { + "epoch": 1.513677811550152, + "grad_norm": 1.1303033828735352, + "learning_rate": 2.671605351796302e-06, + "loss": 0.3624761700630188, + "mean_token_accuracy": 0.8769594430923462, + "num_tokens": 17860902.0, + "step": 1992 + }, + { + "epoch": 1.5144376899696048, + "grad_norm": 2.8921146392822266, + "learning_rate": 2.6695158366707526e-06, + "loss": 0.2517220973968506, + "mean_token_accuracy": 0.8974182605743408, + "num_tokens": 17865160.0, + "step": 1993 + }, + { + "epoch": 1.5151975683890577, + "grad_norm": 2.320587158203125, + "learning_rate": 2.667426202572463e-06, + "loss": 0.4589889943599701, + "mean_token_accuracy": 0.8379613161087036, + "num_tokens": 17871994.0, + "step": 1994 + }, + { + "epoch": 1.5159574468085106, + "grad_norm": 1.1407674551010132, + "learning_rate": 2.665336450968019e-06, + "loss": 0.34412115812301636, + "mean_token_accuracy": 0.8776306509971619, + "num_tokens": 17889941.0, + "step": 1995 + }, + { + "epoch": 1.5167173252279635, + "grad_norm": 2.069814920425415, + "learning_rate": 2.6632465833240895e-06, + "loss": 0.47524404525756836, + "mean_token_accuracy": 0.830310046672821, + "num_tokens": 17898447.0, + "step": 1996 + }, + { + "epoch": 1.5174772036474165, + "grad_norm": 1.822415828704834, + "learning_rate": 2.661156601107424e-06, + "loss": 0.4541318416595459, + "mean_token_accuracy": 0.8856616020202637, + "num_tokens": 17908729.0, + "step": 1997 + }, + { + "epoch": 1.5182370820668694, + "grad_norm": 2.851428985595703, + "learning_rate": 2.659066505784852e-06, + "loss": 0.41761666536331177, + "mean_token_accuracy": 0.8710572719573975, + "num_tokens": 17913860.0, + "step": 1998 + }, + { + "epoch": 1.518996960486322, + "grad_norm": 1.8483710289001465, + "learning_rate": 2.6569762988232838e-06, + "loss": 0.45517268776893616, + "mean_token_accuracy": 0.8411115407943726, + "num_tokens": 17923497.0, + "step": 1999 + }, + { + "epoch": 1.5197568389057752, + "grad_norm": 1.9044219255447388, + "learning_rate": 2.654885981689706e-06, + "loss": 0.42533189058303833, + "mean_token_accuracy": 0.8597894906997681, + "num_tokens": 17932670.0, + "step": 2000 + } + ], + "logging_steps": 1.0, + "max_steps": 3948, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.9547571235271475e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}