diff --git "a/checkpoint-817/trainer_state.json" "b/checkpoint-817/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-817/trainer_state.json" @@ -0,0 +1,8204 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 817, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.416592299938202, + "epoch": 0.0012239902080783353, + "grad_norm": 3.6496078968048096, + "learning_rate": 0.0, + "loss": 3.9699, + "mean_token_accuracy": 0.44650959968566895, + "num_tokens": 3072.0, + "step": 1 + }, + { + "entropy": 1.4768912494182587, + "epoch": 0.0024479804161566705, + "grad_norm": 3.895545721054077, + "learning_rate": 8.000000000000001e-06, + "loss": 4.1303, + "mean_token_accuracy": 0.4473574683070183, + "num_tokens": 5791.0, + "step": 2 + }, + { + "entropy": 1.4352566003799438, + "epoch": 0.0036719706242350062, + "grad_norm": 3.807893753051758, + "learning_rate": 1.6000000000000003e-05, + "loss": 3.9861, + "mean_token_accuracy": 0.45180729776620865, + "num_tokens": 8666.0, + "step": 3 + }, + { + "entropy": 1.4534677863121033, + "epoch": 0.004895960832313341, + "grad_norm": 4.164114952087402, + "learning_rate": 2.4e-05, + "loss": 3.8968, + "mean_token_accuracy": 0.45359135419130325, + "num_tokens": 11690.0, + "step": 4 + }, + { + "entropy": 1.5395768284797668, + "epoch": 0.006119951040391677, + "grad_norm": 3.188849925994873, + "learning_rate": 3.2000000000000005e-05, + "loss": 3.7347, + "mean_token_accuracy": 0.4653988927602768, + "num_tokens": 14462.0, + "step": 5 + }, + { + "entropy": 1.6238888204097748, + "epoch": 0.0073439412484700125, + "grad_norm": 2.827573537826538, + "learning_rate": 4e-05, + "loss": 3.2967, + "mean_token_accuracy": 0.5028589144349098, + "num_tokens": 17273.0, + "step": 6 + }, + { + "entropy": 1.7926653325557709, + "epoch": 0.008567931456548347, + "grad_norm": 2.5692360401153564, + "learning_rate": 4.8e-05, + "loss": 2.894, + "mean_token_accuracy": 0.5247886776924133, + "num_tokens": 19995.0, + "step": 7 + }, + { + "entropy": 1.8918583691120148, + "epoch": 0.009791921664626682, + "grad_norm": 2.534435272216797, + "learning_rate": 5.6000000000000006e-05, + "loss": 2.5324, + "mean_token_accuracy": 0.5552134364843369, + "num_tokens": 22726.0, + "step": 8 + }, + { + "entropy": 2.017778754234314, + "epoch": 0.011015911872705019, + "grad_norm": 2.5614025592803955, + "learning_rate": 6.400000000000001e-05, + "loss": 2.3022, + "mean_token_accuracy": 0.5543268471956253, + "num_tokens": 25563.0, + "step": 9 + }, + { + "entropy": 1.917980670928955, + "epoch": 0.012239902080783354, + "grad_norm": 1.070860505104065, + "learning_rate": 7.2e-05, + "loss": 2.029, + "mean_token_accuracy": 0.5919588804244995, + "num_tokens": 28550.0, + "step": 10 + }, + { + "entropy": 1.7829480171203613, + "epoch": 0.01346389228886169, + "grad_norm": 1.1343615055084229, + "learning_rate": 8e-05, + "loss": 1.8033, + "mean_token_accuracy": 0.6609392613172531, + "num_tokens": 31437.0, + "step": 11 + }, + { + "entropy": 1.5484703183174133, + "epoch": 0.014687882496940025, + "grad_norm": 0.6140472292900085, + "learning_rate": 8.800000000000001e-05, + "loss": 1.6052, + "mean_token_accuracy": 0.6900880038738251, + "num_tokens": 34383.0, + "step": 12 + }, + { + "entropy": 1.5514098107814789, + "epoch": 0.01591187270501836, + "grad_norm": 0.7041146755218506, + "learning_rate": 9.6e-05, + "loss": 1.5633, + "mean_token_accuracy": 0.6756852567195892, + "num_tokens": 37362.0, + "step": 13 + }, + { + "entropy": 1.4420583844184875, + "epoch": 0.017135862913096694, + "grad_norm": 0.6664041876792908, + "learning_rate": 0.00010400000000000001, + "loss": 1.4821, + "mean_token_accuracy": 0.6965851336717606, + "num_tokens": 40351.0, + "step": 14 + }, + { + "entropy": 1.368016004562378, + "epoch": 0.01835985312117503, + "grad_norm": 0.6962308287620544, + "learning_rate": 0.00011200000000000001, + "loss": 1.4026, + "mean_token_accuracy": 0.7051477432250977, + "num_tokens": 43287.0, + "step": 15 + }, + { + "entropy": 1.296469360589981, + "epoch": 0.019583843329253364, + "grad_norm": 0.7701613306999207, + "learning_rate": 0.00012, + "loss": 1.3131, + "mean_token_accuracy": 0.7211279273033142, + "num_tokens": 46229.0, + "step": 16 + }, + { + "entropy": 1.152268797159195, + "epoch": 0.0208078335373317, + "grad_norm": 0.909450113773346, + "learning_rate": 0.00012800000000000002, + "loss": 1.1256, + "mean_token_accuracy": 0.7591490298509598, + "num_tokens": 49120.0, + "step": 17 + }, + { + "entropy": 1.0438900291919708, + "epoch": 0.022031823745410038, + "grad_norm": 1.0241806507110596, + "learning_rate": 0.00013600000000000003, + "loss": 0.9839, + "mean_token_accuracy": 0.7901723682880402, + "num_tokens": 52053.0, + "step": 18 + }, + { + "entropy": 0.8640953153371811, + "epoch": 0.023255813953488372, + "grad_norm": 1.6977486610412598, + "learning_rate": 0.000144, + "loss": 0.9125, + "mean_token_accuracy": 0.8010950088500977, + "num_tokens": 54931.0, + "step": 19 + }, + { + "entropy": 0.7629655003547668, + "epoch": 0.02447980416156671, + "grad_norm": 1.1721726655960083, + "learning_rate": 0.000152, + "loss": 0.8104, + "mean_token_accuracy": 0.826074481010437, + "num_tokens": 57673.0, + "step": 20 + }, + { + "entropy": 0.7300416678190231, + "epoch": 0.025703794369645042, + "grad_norm": 0.8219130635261536, + "learning_rate": 0.00016, + "loss": 0.743, + "mean_token_accuracy": 0.8358514606952667, + "num_tokens": 60663.0, + "step": 21 + }, + { + "entropy": 0.5352765321731567, + "epoch": 0.02692778457772338, + "grad_norm": 0.6964390873908997, + "learning_rate": 0.000168, + "loss": 0.5853, + "mean_token_accuracy": 0.8740498572587967, + "num_tokens": 63504.0, + "step": 22 + }, + { + "entropy": 0.507995679974556, + "epoch": 0.028151774785801713, + "grad_norm": 0.7009959816932678, + "learning_rate": 0.00017600000000000002, + "loss": 0.6038, + "mean_token_accuracy": 0.8758978694677353, + "num_tokens": 66389.0, + "step": 23 + }, + { + "entropy": 0.46718765795230865, + "epoch": 0.02937576499388005, + "grad_norm": 0.6730883121490479, + "learning_rate": 0.00018400000000000003, + "loss": 0.5644, + "mean_token_accuracy": 0.8818197697401047, + "num_tokens": 69247.0, + "step": 24 + }, + { + "entropy": 0.43733184039592743, + "epoch": 0.030599755201958383, + "grad_norm": 0.5168209075927734, + "learning_rate": 0.000192, + "loss": 0.5242, + "mean_token_accuracy": 0.8911139518022537, + "num_tokens": 72171.0, + "step": 25 + }, + { + "entropy": 0.43337714672088623, + "epoch": 0.03182374541003672, + "grad_norm": 0.4224091172218323, + "learning_rate": 0.0002, + "loss": 0.5136, + "mean_token_accuracy": 0.89336296916008, + "num_tokens": 75016.0, + "step": 26 + }, + { + "entropy": 0.434584341943264, + "epoch": 0.033047735618115054, + "grad_norm": 0.5749392509460449, + "learning_rate": 0.00019999929195423986, + "loss": 0.531, + "mean_token_accuracy": 0.8906401246786118, + "num_tokens": 77874.0, + "step": 27 + }, + { + "entropy": 0.396505244076252, + "epoch": 0.03427172582619339, + "grad_norm": 0.35490888357162476, + "learning_rate": 0.00019999716782809999, + "loss": 0.4452, + "mean_token_accuracy": 0.9087431579828262, + "num_tokens": 80655.0, + "step": 28 + }, + { + "entropy": 0.41393596678972244, + "epoch": 0.03549571603427173, + "grad_norm": 0.38403013348579407, + "learning_rate": 0.00019999362765500217, + "loss": 0.4706, + "mean_token_accuracy": 0.9030248671770096, + "num_tokens": 83481.0, + "step": 29 + }, + { + "entropy": 0.4634091928601265, + "epoch": 0.03671970624235006, + "grad_norm": 0.5506224036216736, + "learning_rate": 0.00019998867149064877, + "loss": 0.5283, + "mean_token_accuracy": 0.8818501979112625, + "num_tokens": 86446.0, + "step": 30 + }, + { + "entropy": 0.409931480884552, + "epoch": 0.037943696450428395, + "grad_norm": 0.3509610593318939, + "learning_rate": 0.00019998229941302174, + "loss": 0.4703, + "mean_token_accuracy": 0.9026807993650436, + "num_tokens": 89371.0, + "step": 31 + }, + { + "entropy": 0.4063503071665764, + "epoch": 0.03916768665850673, + "grad_norm": 0.38866373896598816, + "learning_rate": 0.00019997451152238161, + "loss": 0.4341, + "mean_token_accuracy": 0.9072516858577728, + "num_tokens": 92406.0, + "step": 32 + }, + { + "entropy": 0.40793775767087936, + "epoch": 0.04039167686658507, + "grad_norm": 0.3879442811012268, + "learning_rate": 0.00019996530794126583, + "loss": 0.4517, + "mean_token_accuracy": 0.8937678188085556, + "num_tokens": 95218.0, + "step": 33 + }, + { + "entropy": 0.4148586392402649, + "epoch": 0.0416156670746634, + "grad_norm": 0.3270847797393799, + "learning_rate": 0.00019995468881448667, + "loss": 0.4243, + "mean_token_accuracy": 0.9040537923574448, + "num_tokens": 98237.0, + "step": 34 + }, + { + "entropy": 0.4188431203365326, + "epoch": 0.042839657282741736, + "grad_norm": 0.2955991327762604, + "learning_rate": 0.00019994265430912928, + "loss": 0.4046, + "mean_token_accuracy": 0.9037134796380997, + "num_tokens": 101061.0, + "step": 35 + }, + { + "entropy": 0.3870430439710617, + "epoch": 0.044063647490820076, + "grad_norm": 0.32372933626174927, + "learning_rate": 0.00019992920461454872, + "loss": 0.3927, + "mean_token_accuracy": 0.9039850383996964, + "num_tokens": 104011.0, + "step": 36 + }, + { + "entropy": 0.40124695003032684, + "epoch": 0.04528763769889841, + "grad_norm": 0.3195420503616333, + "learning_rate": 0.0001999143399423672, + "loss": 0.4265, + "mean_token_accuracy": 0.9024457037448883, + "num_tokens": 106952.0, + "step": 37 + }, + { + "entropy": 0.3772233575582504, + "epoch": 0.046511627906976744, + "grad_norm": 0.3649533987045288, + "learning_rate": 0.00019989806052647072, + "loss": 0.4027, + "mean_token_accuracy": 0.9043892174959183, + "num_tokens": 109702.0, + "step": 38 + }, + { + "entropy": 0.397039070725441, + "epoch": 0.04773561811505508, + "grad_norm": 0.37454473972320557, + "learning_rate": 0.0001998803666230053, + "loss": 0.4165, + "mean_token_accuracy": 0.8971301317214966, + "num_tokens": 112558.0, + "step": 39 + }, + { + "entropy": 0.38474544882774353, + "epoch": 0.04895960832313342, + "grad_norm": 0.32726719975471497, + "learning_rate": 0.00019986125851037298, + "loss": 0.3903, + "mean_token_accuracy": 0.908382922410965, + "num_tokens": 115683.0, + "step": 40 + }, + { + "entropy": 0.37939534336328506, + "epoch": 0.05018359853121175, + "grad_norm": 0.3345027267932892, + "learning_rate": 0.00019984073648922753, + "loss": 0.3709, + "mean_token_accuracy": 0.9170250296592712, + "num_tokens": 118516.0, + "step": 41 + }, + { + "entropy": 0.38560714572668076, + "epoch": 0.051407588739290085, + "grad_norm": 0.3319763243198395, + "learning_rate": 0.0001998188008824696, + "loss": 0.3188, + "mean_token_accuracy": 0.9174977540969849, + "num_tokens": 121545.0, + "step": 42 + }, + { + "entropy": 0.35668643563985825, + "epoch": 0.05263157894736842, + "grad_norm": 1.2033625841140747, + "learning_rate": 0.00019979545203524174, + "loss": 0.3481, + "mean_token_accuracy": 0.9099445939064026, + "num_tokens": 124413.0, + "step": 43 + }, + { + "entropy": 0.3702196255326271, + "epoch": 0.05385556915544676, + "grad_norm": 0.3984542787075043, + "learning_rate": 0.0001997706903149228, + "loss": 0.3453, + "mean_token_accuracy": 0.9133437126874924, + "num_tokens": 127274.0, + "step": 44 + }, + { + "entropy": 0.43558043986558914, + "epoch": 0.05507955936352509, + "grad_norm": 0.3805478811264038, + "learning_rate": 0.00019974451611112247, + "loss": 0.4235, + "mean_token_accuracy": 0.8955093175172806, + "num_tokens": 130311.0, + "step": 45 + }, + { + "entropy": 0.333396352827549, + "epoch": 0.056303549571603426, + "grad_norm": 0.3033160865306854, + "learning_rate": 0.00019971692983567483, + "loss": 0.3369, + "mean_token_accuracy": 0.9142611026763916, + "num_tokens": 133115.0, + "step": 46 + }, + { + "entropy": 0.35850197821855545, + "epoch": 0.05752753977968176, + "grad_norm": 0.2995292842388153, + "learning_rate": 0.0001996879319226319, + "loss": 0.3542, + "mean_token_accuracy": 0.9052408188581467, + "num_tokens": 136085.0, + "step": 47 + }, + { + "entropy": 0.30701782926917076, + "epoch": 0.0587515299877601, + "grad_norm": 0.2741283178329468, + "learning_rate": 0.00019965752282825712, + "loss": 0.303, + "mean_token_accuracy": 0.9169407337903976, + "num_tokens": 138929.0, + "step": 48 + }, + { + "entropy": 0.33652304112911224, + "epoch": 0.05997552019583843, + "grad_norm": 0.3663140833377838, + "learning_rate": 0.00019962570303101772, + "loss": 0.3026, + "mean_token_accuracy": 0.9183420985937119, + "num_tokens": 141794.0, + "step": 49 + }, + { + "entropy": 0.3210769593715668, + "epoch": 0.06119951040391677, + "grad_norm": 0.2857617139816284, + "learning_rate": 0.00019959247303157762, + "loss": 0.3241, + "mean_token_accuracy": 0.9151087701320648, + "num_tokens": 144739.0, + "step": 50 + }, + { + "entropy": 0.27277031913399696, + "epoch": 0.06242350061199511, + "grad_norm": 0.2530350387096405, + "learning_rate": 0.00019955783335278923, + "loss": 0.2999, + "mean_token_accuracy": 0.9225717633962631, + "num_tokens": 147540.0, + "step": 51 + }, + { + "entropy": 0.30361150950193405, + "epoch": 0.06364749082007344, + "grad_norm": 0.2551971971988678, + "learning_rate": 0.0001995217845396854, + "loss": 0.289, + "mean_token_accuracy": 0.9233588874340057, + "num_tokens": 150425.0, + "step": 52 + }, + { + "entropy": 0.30236151814460754, + "epoch": 0.06487148102815178, + "grad_norm": 0.2915125787258148, + "learning_rate": 0.00019948432715947078, + "loss": 0.3188, + "mean_token_accuracy": 0.9164365231990814, + "num_tokens": 153374.0, + "step": 53 + }, + { + "entropy": 0.28662972152233124, + "epoch": 0.06609547123623011, + "grad_norm": 0.24113047122955322, + "learning_rate": 0.00019944546180151288, + "loss": 0.2779, + "mean_token_accuracy": 0.9231595695018768, + "num_tokens": 156298.0, + "step": 54 + }, + { + "entropy": 0.2960902750492096, + "epoch": 0.06731946144430845, + "grad_norm": 0.2141588032245636, + "learning_rate": 0.0001994051890773329, + "loss": 0.2751, + "mean_token_accuracy": 0.9259757697582245, + "num_tokens": 159156.0, + "step": 55 + }, + { + "entropy": 0.2784420847892761, + "epoch": 0.06854345165238677, + "grad_norm": 0.24995379149913788, + "learning_rate": 0.00019936350962059595, + "loss": 0.2751, + "mean_token_accuracy": 0.9251870810985565, + "num_tokens": 161998.0, + "step": 56 + }, + { + "entropy": 0.262157179415226, + "epoch": 0.06976744186046512, + "grad_norm": 0.22412163019180298, + "learning_rate": 0.00019932042408710122, + "loss": 0.2383, + "mean_token_accuracy": 0.9338927119970322, + "num_tokens": 164942.0, + "step": 57 + }, + { + "entropy": 0.2535906545817852, + "epoch": 0.07099143206854346, + "grad_norm": 0.29803067445755005, + "learning_rate": 0.00019927593315477159, + "loss": 0.2628, + "mean_token_accuracy": 0.9259988814592361, + "num_tokens": 167861.0, + "step": 58 + }, + { + "entropy": 0.261662982404232, + "epoch": 0.07221542227662178, + "grad_norm": 0.2842896282672882, + "learning_rate": 0.00019923003752364297, + "loss": 0.2601, + "mean_token_accuracy": 0.9260873794555664, + "num_tokens": 170634.0, + "step": 59 + }, + { + "entropy": 0.26782306656241417, + "epoch": 0.07343941248470012, + "grad_norm": 0.24825100600719452, + "learning_rate": 0.00019918273791585324, + "loss": 0.2817, + "mean_token_accuracy": 0.9298474043607712, + "num_tokens": 173548.0, + "step": 60 + }, + { + "entropy": 0.329378105700016, + "epoch": 0.07466340269277846, + "grad_norm": 0.25208738446235657, + "learning_rate": 0.00019913403507563105, + "loss": 0.3223, + "mean_token_accuracy": 0.917021855711937, + "num_tokens": 176457.0, + "step": 61 + }, + { + "entropy": 0.3131766691803932, + "epoch": 0.07588739290085679, + "grad_norm": 0.2592296004295349, + "learning_rate": 0.00019908392976928396, + "loss": 0.2837, + "mean_token_accuracy": 0.9208065420389175, + "num_tokens": 179171.0, + "step": 62 + }, + { + "entropy": 0.2586611472070217, + "epoch": 0.07711138310893513, + "grad_norm": 0.24113550782203674, + "learning_rate": 0.00019903242278518636, + "loss": 0.242, + "mean_token_accuracy": 0.9357094168663025, + "num_tokens": 181950.0, + "step": 63 + }, + { + "entropy": 0.30254383385181427, + "epoch": 0.07833537331701346, + "grad_norm": 0.28910815715789795, + "learning_rate": 0.0001989795149337672, + "loss": 0.2953, + "mean_token_accuracy": 0.9184342175722122, + "num_tokens": 184894.0, + "step": 64 + }, + { + "entropy": 0.24705467373132706, + "epoch": 0.0795593635250918, + "grad_norm": 0.27466851472854614, + "learning_rate": 0.00019892520704749715, + "loss": 0.2642, + "mean_token_accuracy": 0.9294987320899963, + "num_tokens": 187759.0, + "step": 65 + }, + { + "entropy": 0.25479237735271454, + "epoch": 0.08078335373317014, + "grad_norm": 0.26741182804107666, + "learning_rate": 0.0001988694999808755, + "loss": 0.2587, + "mean_token_accuracy": 0.9268348515033722, + "num_tokens": 190651.0, + "step": 66 + }, + { + "entropy": 0.23988736048340797, + "epoch": 0.08200734394124846, + "grad_norm": 0.39416372776031494, + "learning_rate": 0.00019881239461041675, + "loss": 0.2641, + "mean_token_accuracy": 0.9270584881305695, + "num_tokens": 193536.0, + "step": 67 + }, + { + "entropy": 0.259944386780262, + "epoch": 0.0832313341493268, + "grad_norm": 0.3866312801837921, + "learning_rate": 0.00019875389183463682, + "loss": 0.2856, + "mean_token_accuracy": 0.927163615822792, + "num_tokens": 196399.0, + "step": 68 + }, + { + "entropy": 0.269474558532238, + "epoch": 0.08445532435740515, + "grad_norm": 0.2812719941139221, + "learning_rate": 0.0001986939925740388, + "loss": 0.2634, + "mean_token_accuracy": 0.9294895380735397, + "num_tokens": 199165.0, + "step": 69 + }, + { + "entropy": 0.26512395218014717, + "epoch": 0.08567931456548347, + "grad_norm": 0.3766966760158539, + "learning_rate": 0.00019863269777109873, + "loss": 0.2747, + "mean_token_accuracy": 0.9229115545749664, + "num_tokens": 202135.0, + "step": 70 + }, + { + "entropy": 0.24634146317839622, + "epoch": 0.08690330477356181, + "grad_norm": 0.22530341148376465, + "learning_rate": 0.00019857000839025043, + "loss": 0.2086, + "mean_token_accuracy": 0.9400379657745361, + "num_tokens": 204831.0, + "step": 71 + }, + { + "entropy": 0.24204711988568306, + "epoch": 0.08812729498164015, + "grad_norm": 0.35891616344451904, + "learning_rate": 0.00019850592541787064, + "loss": 0.2496, + "mean_token_accuracy": 0.9370903074741364, + "num_tokens": 207651.0, + "step": 72 + }, + { + "entropy": 0.2562050223350525, + "epoch": 0.08935128518971848, + "grad_norm": 0.29746514558792114, + "learning_rate": 0.00019844044986226328, + "loss": 0.2634, + "mean_token_accuracy": 0.9323393702507019, + "num_tokens": 210494.0, + "step": 73 + }, + { + "entropy": 0.28995781391859055, + "epoch": 0.09057527539779682, + "grad_norm": 0.2609274983406067, + "learning_rate": 0.0001983735827536436, + "loss": 0.3043, + "mean_token_accuracy": 0.9166684448719025, + "num_tokens": 213424.0, + "step": 74 + }, + { + "entropy": 0.2909741774201393, + "epoch": 0.09179926560587515, + "grad_norm": 0.36394810676574707, + "learning_rate": 0.0001983053251441222, + "loss": 0.3448, + "mean_token_accuracy": 0.9131789356470108, + "num_tokens": 216176.0, + "step": 75 + }, + { + "entropy": 0.24828237295150757, + "epoch": 0.09302325581395349, + "grad_norm": 0.23761487007141113, + "learning_rate": 0.00019823567810768814, + "loss": 0.2481, + "mean_token_accuracy": 0.9312362223863602, + "num_tokens": 219041.0, + "step": 76 + }, + { + "entropy": 0.2604760490357876, + "epoch": 0.09424724602203183, + "grad_norm": 0.23865053057670593, + "learning_rate": 0.00019816464274019234, + "loss": 0.2588, + "mean_token_accuracy": 0.9291370064020157, + "num_tokens": 221951.0, + "step": 77 + }, + { + "entropy": 0.27589165046811104, + "epoch": 0.09547123623011015, + "grad_norm": 0.34605616331100464, + "learning_rate": 0.0001980922201593301, + "loss": 0.2684, + "mean_token_accuracy": 0.9269371181726456, + "num_tokens": 224942.0, + "step": 78 + }, + { + "entropy": 0.2712624557316303, + "epoch": 0.0966952264381885, + "grad_norm": 0.30021557211875916, + "learning_rate": 0.0001980184115046237, + "loss": 0.2822, + "mean_token_accuracy": 0.9206813722848892, + "num_tokens": 227829.0, + "step": 79 + }, + { + "entropy": 0.2824466861784458, + "epoch": 0.09791921664626684, + "grad_norm": 0.2761493921279907, + "learning_rate": 0.00019794321793740435, + "loss": 0.2713, + "mean_token_accuracy": 0.9215365201234818, + "num_tokens": 230901.0, + "step": 80 + }, + { + "entropy": 0.28565530478954315, + "epoch": 0.09914320685434516, + "grad_norm": 0.2485370635986328, + "learning_rate": 0.00019786664064079401, + "loss": 0.2694, + "mean_token_accuracy": 0.9203636944293976, + "num_tokens": 233726.0, + "step": 81 + }, + { + "entropy": 0.26477914303541183, + "epoch": 0.1003671970624235, + "grad_norm": 0.3845198452472687, + "learning_rate": 0.00019778868081968664, + "loss": 0.2748, + "mean_token_accuracy": 0.9220184534788132, + "num_tokens": 236696.0, + "step": 82 + }, + { + "entropy": 0.2523234784603119, + "epoch": 0.10159118727050184, + "grad_norm": 0.268271267414093, + "learning_rate": 0.00019770933970072937, + "loss": 0.2392, + "mean_token_accuracy": 0.9307764321565628, + "num_tokens": 239556.0, + "step": 83 + }, + { + "entropy": 0.2606973946094513, + "epoch": 0.10281517747858017, + "grad_norm": 0.22910454869270325, + "learning_rate": 0.0001976286185323032, + "loss": 0.2289, + "mean_token_accuracy": 0.9336722493171692, + "num_tokens": 242377.0, + "step": 84 + }, + { + "entropy": 0.23228657618165016, + "epoch": 0.10403916768665851, + "grad_norm": 0.20917530357837677, + "learning_rate": 0.00019754651858450322, + "loss": 0.2044, + "mean_token_accuracy": 0.9373443573713303, + "num_tokens": 245143.0, + "step": 85 + }, + { + "entropy": 0.2621064595878124, + "epoch": 0.10526315789473684, + "grad_norm": 0.24214881658554077, + "learning_rate": 0.00019746304114911877, + "loss": 0.2592, + "mean_token_accuracy": 0.9262759685516357, + "num_tokens": 248120.0, + "step": 86 + }, + { + "entropy": 0.23933247849345207, + "epoch": 0.10648714810281518, + "grad_norm": 0.2383556067943573, + "learning_rate": 0.00019737818753961305, + "loss": 0.2534, + "mean_token_accuracy": 0.932378962635994, + "num_tokens": 250884.0, + "step": 87 + }, + { + "entropy": 0.25300582498311996, + "epoch": 0.10771113831089352, + "grad_norm": 0.23573297262191772, + "learning_rate": 0.00019729195909110248, + "loss": 0.2704, + "mean_token_accuracy": 0.9268384575843811, + "num_tokens": 253834.0, + "step": 88 + }, + { + "entropy": 0.24110553786158562, + "epoch": 0.10893512851897184, + "grad_norm": 0.2606668770313263, + "learning_rate": 0.00019720435716033568, + "loss": 0.2662, + "mean_token_accuracy": 0.9226417541503906, + "num_tokens": 256737.0, + "step": 89 + }, + { + "entropy": 0.22608189284801483, + "epoch": 0.11015911872705018, + "grad_norm": 0.24541468918323517, + "learning_rate": 0.00019711538312567207, + "loss": 0.2536, + "mean_token_accuracy": 0.9286776632070541, + "num_tokens": 259531.0, + "step": 90 + }, + { + "entropy": 0.2727613262832165, + "epoch": 0.11138310893512852, + "grad_norm": 0.2059955894947052, + "learning_rate": 0.00019702503838706033, + "loss": 0.2544, + "mean_token_accuracy": 0.9291167855262756, + "num_tokens": 262476.0, + "step": 91 + }, + { + "entropy": 0.23863257840275764, + "epoch": 0.11260709914320685, + "grad_norm": 0.21770501136779785, + "learning_rate": 0.00019693332436601614, + "loss": 0.2457, + "mean_token_accuracy": 0.930979534983635, + "num_tokens": 265297.0, + "step": 92 + }, + { + "entropy": 0.23057948052883148, + "epoch": 0.11383108935128519, + "grad_norm": 0.23513388633728027, + "learning_rate": 0.00019684024250560015, + "loss": 0.2624, + "mean_token_accuracy": 0.9302163869142532, + "num_tokens": 268114.0, + "step": 93 + }, + { + "entropy": 0.26527583599090576, + "epoch": 0.11505507955936352, + "grad_norm": 0.20624403655529022, + "learning_rate": 0.0001967457942703948, + "loss": 0.2612, + "mean_token_accuracy": 0.9261021316051483, + "num_tokens": 271032.0, + "step": 94 + }, + { + "entropy": 0.2497776485979557, + "epoch": 0.11627906976744186, + "grad_norm": 0.19916735589504242, + "learning_rate": 0.0001966499811464818, + "loss": 0.2347, + "mean_token_accuracy": 0.9336029142141342, + "num_tokens": 273871.0, + "step": 95 + }, + { + "entropy": 0.2606111876666546, + "epoch": 0.1175030599755202, + "grad_norm": 0.20342691242694855, + "learning_rate": 0.0001965528046414184, + "loss": 0.2606, + "mean_token_accuracy": 0.9287164211273193, + "num_tokens": 276640.0, + "step": 96 + }, + { + "entropy": 0.25923800840973854, + "epoch": 0.11872705018359853, + "grad_norm": 0.18066218495368958, + "learning_rate": 0.00019645426628421371, + "loss": 0.2227, + "mean_token_accuracy": 0.933046743273735, + "num_tokens": 279480.0, + "step": 97 + }, + { + "entropy": 0.2528052106499672, + "epoch": 0.11995104039167687, + "grad_norm": 0.20160138607025146, + "learning_rate": 0.0001963543676253048, + "loss": 0.2344, + "mean_token_accuracy": 0.9323443472385406, + "num_tokens": 282399.0, + "step": 98 + }, + { + "entropy": 0.23929034918546677, + "epoch": 0.1211750305997552, + "grad_norm": 0.2336963713169098, + "learning_rate": 0.0001962531102365321, + "loss": 0.2287, + "mean_token_accuracy": 0.9343612790107727, + "num_tokens": 285089.0, + "step": 99 + }, + { + "entropy": 0.23155220225453377, + "epoch": 0.12239902080783353, + "grad_norm": 0.24503269791603088, + "learning_rate": 0.00019615049571111487, + "loss": 0.2642, + "mean_token_accuracy": 0.928838461637497, + "num_tokens": 287824.0, + "step": 100 + }, + { + "entropy": 0.19689469784498215, + "epoch": 0.12362301101591187, + "grad_norm": 0.23204699158668518, + "learning_rate": 0.00019604652566362603, + "loss": 0.2021, + "mean_token_accuracy": 0.9407057166099548, + "num_tokens": 290544.0, + "step": 101 + }, + { + "entropy": 0.24404483661055565, + "epoch": 0.12484700122399021, + "grad_norm": 0.23614054918289185, + "learning_rate": 0.00019594120172996665, + "loss": 0.2477, + "mean_token_accuracy": 0.9329334795475006, + "num_tokens": 293501.0, + "step": 102 + }, + { + "entropy": 0.22669777646660805, + "epoch": 0.12607099143206854, + "grad_norm": 0.25718390941619873, + "learning_rate": 0.00019583452556734044, + "loss": 0.2447, + "mean_token_accuracy": 0.9315554350614548, + "num_tokens": 296362.0, + "step": 103 + }, + { + "entropy": 0.19787690415978432, + "epoch": 0.12729498164014688, + "grad_norm": 0.22311726212501526, + "learning_rate": 0.00019572649885422745, + "loss": 0.2088, + "mean_token_accuracy": 0.9419765770435333, + "num_tokens": 299202.0, + "step": 104 + }, + { + "entropy": 0.21104953065514565, + "epoch": 0.12851897184822522, + "grad_norm": 0.26679009199142456, + "learning_rate": 0.00019561712329035788, + "loss": 0.2387, + "mean_token_accuracy": 0.9316558390855789, + "num_tokens": 301928.0, + "step": 105 + }, + { + "entropy": 0.1988048255443573, + "epoch": 0.12974296205630356, + "grad_norm": 0.20205506682395935, + "learning_rate": 0.00019550640059668508, + "loss": 0.2009, + "mean_token_accuracy": 0.9414614140987396, + "num_tokens": 304673.0, + "step": 106 + }, + { + "entropy": 0.2244689166545868, + "epoch": 0.13096695226438188, + "grad_norm": 0.23207706212997437, + "learning_rate": 0.00019539433251535877, + "loss": 0.2412, + "mean_token_accuracy": 0.930425688624382, + "num_tokens": 307598.0, + "step": 107 + }, + { + "entropy": 0.2184968665242195, + "epoch": 0.13219094247246022, + "grad_norm": 0.2206183671951294, + "learning_rate": 0.00019528092080969736, + "loss": 0.2132, + "mean_token_accuracy": 0.940753623843193, + "num_tokens": 310366.0, + "step": 108 + }, + { + "entropy": 0.2545708566904068, + "epoch": 0.13341493268053856, + "grad_norm": 0.2612517178058624, + "learning_rate": 0.00019516616726416032, + "loss": 0.2409, + "mean_token_accuracy": 0.933647871017456, + "num_tokens": 313288.0, + "step": 109 + }, + { + "entropy": 0.21715376898646355, + "epoch": 0.1346389228886169, + "grad_norm": 0.24301758408546448, + "learning_rate": 0.00019505007368432014, + "loss": 0.2021, + "mean_token_accuracy": 0.9370223134756088, + "num_tokens": 316150.0, + "step": 110 + }, + { + "entropy": 0.23507707566022873, + "epoch": 0.13586291309669524, + "grad_norm": 0.1846926510334015, + "learning_rate": 0.00019493264189683393, + "loss": 0.2252, + "mean_token_accuracy": 0.9359188079833984, + "num_tokens": 319247.0, + "step": 111 + }, + { + "entropy": 0.2184566780924797, + "epoch": 0.13708690330477355, + "grad_norm": 0.22856658697128296, + "learning_rate": 0.00019481387374941446, + "loss": 0.2186, + "mean_token_accuracy": 0.9293907731771469, + "num_tokens": 322084.0, + "step": 112 + }, + { + "entropy": 0.21156685426831245, + "epoch": 0.1383108935128519, + "grad_norm": 0.24209032952785492, + "learning_rate": 0.00019469377111080142, + "loss": 0.2342, + "mean_token_accuracy": 0.9346568733453751, + "num_tokens": 325099.0, + "step": 113 + }, + { + "entropy": 0.2099447324872017, + "epoch": 0.13953488372093023, + "grad_norm": 0.2272947132587433, + "learning_rate": 0.00019457233587073176, + "loss": 0.223, + "mean_token_accuracy": 0.9308805018663406, + "num_tokens": 327989.0, + "step": 114 + }, + { + "entropy": 0.20421232655644417, + "epoch": 0.14075887392900857, + "grad_norm": 0.22891324758529663, + "learning_rate": 0.0001944495699399101, + "loss": 0.1877, + "mean_token_accuracy": 0.9408667683601379, + "num_tokens": 330675.0, + "step": 115 + }, + { + "entropy": 0.20948100090026855, + "epoch": 0.1419828641370869, + "grad_norm": 0.2184046357870102, + "learning_rate": 0.00019432547524997858, + "loss": 0.1914, + "mean_token_accuracy": 0.9385400116443634, + "num_tokens": 333513.0, + "step": 116 + }, + { + "entropy": 0.2362695150077343, + "epoch": 0.14320685434516525, + "grad_norm": 0.22537009418010712, + "learning_rate": 0.00019420005375348648, + "loss": 0.2293, + "mean_token_accuracy": 0.9345899522304535, + "num_tokens": 336461.0, + "step": 117 + }, + { + "entropy": 0.20704597607254982, + "epoch": 0.14443084455324356, + "grad_norm": 0.382014662027359, + "learning_rate": 0.0001940733074238596, + "loss": 0.2105, + "mean_token_accuracy": 0.9430104196071625, + "num_tokens": 339252.0, + "step": 118 + }, + { + "entropy": 0.19709878414869308, + "epoch": 0.1456548347613219, + "grad_norm": 0.23572185635566711, + "learning_rate": 0.00019394523825536907, + "loss": 0.206, + "mean_token_accuracy": 0.9364204704761505, + "num_tokens": 342131.0, + "step": 119 + }, + { + "entropy": 0.18216918036341667, + "epoch": 0.14687882496940025, + "grad_norm": 0.20929844677448273, + "learning_rate": 0.00019381584826310007, + "loss": 0.1832, + "mean_token_accuracy": 0.9486173689365387, + "num_tokens": 344909.0, + "step": 120 + }, + { + "entropy": 0.22762800008058548, + "epoch": 0.1481028151774786, + "grad_norm": 0.2817568778991699, + "learning_rate": 0.00019368513948291995, + "loss": 0.2539, + "mean_token_accuracy": 0.9296244382858276, + "num_tokens": 347701.0, + "step": 121 + }, + { + "entropy": 0.20401224493980408, + "epoch": 0.14932680538555693, + "grad_norm": 0.19746211171150208, + "learning_rate": 0.00019355311397144652, + "loss": 0.2098, + "mean_token_accuracy": 0.9393266439437866, + "num_tokens": 350593.0, + "step": 122 + }, + { + "entropy": 0.21518001705408096, + "epoch": 0.15055079559363524, + "grad_norm": 0.22002124786376953, + "learning_rate": 0.00019341977380601542, + "loss": 0.2222, + "mean_token_accuracy": 0.9344192147254944, + "num_tokens": 353499.0, + "step": 123 + }, + { + "entropy": 0.2497701682150364, + "epoch": 0.15177478580171358, + "grad_norm": 0.2191598266363144, + "learning_rate": 0.00019328512108464748, + "loss": 0.239, + "mean_token_accuracy": 0.9364775866270065, + "num_tokens": 356322.0, + "step": 124 + }, + { + "entropy": 0.21236486360430717, + "epoch": 0.15299877600979192, + "grad_norm": 0.1937335729598999, + "learning_rate": 0.00019314915792601581, + "loss": 0.2089, + "mean_token_accuracy": 0.9417513012886047, + "num_tokens": 359139.0, + "step": 125 + }, + { + "entropy": 0.19794157519936562, + "epoch": 0.15422276621787026, + "grad_norm": 0.2636359930038452, + "learning_rate": 0.00019301188646941235, + "loss": 0.1783, + "mean_token_accuracy": 0.9427973330020905, + "num_tokens": 361963.0, + "step": 126 + }, + { + "entropy": 0.22416212409734726, + "epoch": 0.1554467564259486, + "grad_norm": 0.19302886724472046, + "learning_rate": 0.00019287330887471425, + "loss": 0.2141, + "mean_token_accuracy": 0.9382138699293137, + "num_tokens": 364880.0, + "step": 127 + }, + { + "entropy": 0.22023902088403702, + "epoch": 0.15667074663402691, + "grad_norm": 0.18515250086784363, + "learning_rate": 0.00019273342732234992, + "loss": 0.2207, + "mean_token_accuracy": 0.9345718026161194, + "num_tokens": 367753.0, + "step": 128 + }, + { + "entropy": 0.24657950550317764, + "epoch": 0.15789473684210525, + "grad_norm": 0.21514572203159332, + "learning_rate": 0.00019259224401326473, + "loss": 0.2442, + "mean_token_accuracy": 0.9303484708070755, + "num_tokens": 370693.0, + "step": 129 + }, + { + "entropy": 0.2127302624285221, + "epoch": 0.1591187270501836, + "grad_norm": 0.18327434360980988, + "learning_rate": 0.00019244976116888628, + "loss": 0.2088, + "mean_token_accuracy": 0.9381862580776215, + "num_tokens": 373542.0, + "step": 130 + }, + { + "entropy": 0.1992739662528038, + "epoch": 0.16034271725826194, + "grad_norm": 0.18069583177566528, + "learning_rate": 0.00019230598103108957, + "loss": 0.1921, + "mean_token_accuracy": 0.9439662545919418, + "num_tokens": 376483.0, + "step": 131 + }, + { + "entropy": 0.18468357622623444, + "epoch": 0.16156670746634028, + "grad_norm": 0.2800804674625397, + "learning_rate": 0.0001921609058621616, + "loss": 0.212, + "mean_token_accuracy": 0.9394632279872894, + "num_tokens": 379411.0, + "step": 132 + }, + { + "entropy": 0.1912625916302204, + "epoch": 0.16279069767441862, + "grad_norm": 0.26857990026474, + "learning_rate": 0.00019201453794476594, + "loss": 0.2096, + "mean_token_accuracy": 0.9378114342689514, + "num_tokens": 382242.0, + "step": 133 + }, + { + "entropy": 0.19179615378379822, + "epoch": 0.16401468788249693, + "grad_norm": 0.174366757273674, + "learning_rate": 0.00019186687958190667, + "loss": 0.186, + "mean_token_accuracy": 0.9466377645730972, + "num_tokens": 385030.0, + "step": 134 + }, + { + "entropy": 0.18204596638679504, + "epoch": 0.16523867809057527, + "grad_norm": 0.17768488824367523, + "learning_rate": 0.00019171793309689215, + "loss": 0.1796, + "mean_token_accuracy": 0.9431753903627396, + "num_tokens": 387956.0, + "step": 135 + }, + { + "entropy": 0.2265869863331318, + "epoch": 0.1664626682986536, + "grad_norm": 0.2548235058784485, + "learning_rate": 0.0001915677008332985, + "loss": 0.229, + "mean_token_accuracy": 0.9329653680324554, + "num_tokens": 390775.0, + "step": 136 + }, + { + "entropy": 0.1722266748547554, + "epoch": 0.16768665850673195, + "grad_norm": 0.18111248314380646, + "learning_rate": 0.00019141618515493286, + "loss": 0.1659, + "mean_token_accuracy": 0.9526980966329575, + "num_tokens": 393556.0, + "step": 137 + }, + { + "entropy": 0.2015659138560295, + "epoch": 0.1689106487148103, + "grad_norm": 0.18690569698810577, + "learning_rate": 0.00019126338844579593, + "loss": 0.1963, + "mean_token_accuracy": 0.9430024325847626, + "num_tokens": 396521.0, + "step": 138 + }, + { + "entropy": 0.20409919321537018, + "epoch": 0.1701346389228886, + "grad_norm": 0.23977841436862946, + "learning_rate": 0.00019110931311004466, + "loss": 0.2003, + "mean_token_accuracy": 0.9417325705289841, + "num_tokens": 399356.0, + "step": 139 + }, + { + "entropy": 0.21006100624799728, + "epoch": 0.17135862913096694, + "grad_norm": 0.25729039311408997, + "learning_rate": 0.00019095396157195428, + "loss": 0.2098, + "mean_token_accuracy": 0.9325631260871887, + "num_tokens": 402203.0, + "step": 140 + }, + { + "entropy": 0.19390207529067993, + "epoch": 0.17258261933904528, + "grad_norm": 0.21522146463394165, + "learning_rate": 0.00019079733627588042, + "loss": 0.2097, + "mean_token_accuracy": 0.9346577823162079, + "num_tokens": 405248.0, + "step": 141 + }, + { + "entropy": 0.18020372465252876, + "epoch": 0.17380660954712362, + "grad_norm": 0.19819976389408112, + "learning_rate": 0.00019063943968622023, + "loss": 0.1748, + "mean_token_accuracy": 0.9436092972755432, + "num_tokens": 408141.0, + "step": 142 + }, + { + "entropy": 0.18706470355391502, + "epoch": 0.17503059975520197, + "grad_norm": 0.23346590995788574, + "learning_rate": 0.00019048027428737402, + "loss": 0.2069, + "mean_token_accuracy": 0.9444806575775146, + "num_tokens": 411115.0, + "step": 143 + }, + { + "entropy": 0.20094066113233566, + "epoch": 0.1762545899632803, + "grad_norm": 0.21647650003433228, + "learning_rate": 0.0001903198425837059, + "loss": 0.2176, + "mean_token_accuracy": 0.9335754662752151, + "num_tokens": 413994.0, + "step": 144 + }, + { + "entropy": 0.21303684264421463, + "epoch": 0.17747858017135862, + "grad_norm": 0.18361839652061462, + "learning_rate": 0.00019015814709950446, + "loss": 0.2082, + "mean_token_accuracy": 0.9361512362957001, + "num_tokens": 416929.0, + "step": 145 + }, + { + "entropy": 0.20951109752058983, + "epoch": 0.17870257037943696, + "grad_norm": 0.20885874330997467, + "learning_rate": 0.00018999519037894312, + "loss": 0.2242, + "mean_token_accuracy": 0.9334231615066528, + "num_tokens": 419742.0, + "step": 146 + }, + { + "entropy": 0.21443912014365196, + "epoch": 0.1799265605875153, + "grad_norm": 0.23752816021442413, + "learning_rate": 0.00018983097498603995, + "loss": 0.2172, + "mean_token_accuracy": 0.9317984580993652, + "num_tokens": 422816.0, + "step": 147 + }, + { + "entropy": 0.18117666244506836, + "epoch": 0.18115055079559364, + "grad_norm": 0.15705910325050354, + "learning_rate": 0.0001896655035046175, + "loss": 0.1674, + "mean_token_accuracy": 0.9513498842716217, + "num_tokens": 425459.0, + "step": 148 + }, + { + "entropy": 0.1969112940132618, + "epoch": 0.18237454100367198, + "grad_norm": 0.19693946838378906, + "learning_rate": 0.00018949877853826196, + "loss": 0.1785, + "mean_token_accuracy": 0.9427243173122406, + "num_tokens": 428356.0, + "step": 149 + }, + { + "entropy": 0.18811774626374245, + "epoch": 0.1835985312117503, + "grad_norm": 0.16914179921150208, + "learning_rate": 0.0001893308027102824, + "loss": 0.191, + "mean_token_accuracy": 0.9430782794952393, + "num_tokens": 431146.0, + "step": 150 + }, + { + "entropy": 0.19649061560630798, + "epoch": 0.18482252141982863, + "grad_norm": 0.1536487638950348, + "learning_rate": 0.00018916157866366926, + "loss": 0.1808, + "mean_token_accuracy": 0.9403559118509293, + "num_tokens": 434039.0, + "step": 151 + }, + { + "entropy": 0.17708027735352516, + "epoch": 0.18604651162790697, + "grad_norm": 0.15281440317630768, + "learning_rate": 0.00018899110906105303, + "loss": 0.1802, + "mean_token_accuracy": 0.9457653909921646, + "num_tokens": 437034.0, + "step": 152 + }, + { + "entropy": 0.17960237711668015, + "epoch": 0.18727050183598531, + "grad_norm": 0.15672357380390167, + "learning_rate": 0.00018881939658466207, + "loss": 0.1713, + "mean_token_accuracy": 0.9508736878633499, + "num_tokens": 439943.0, + "step": 153 + }, + { + "entropy": 0.19353603944182396, + "epoch": 0.18849449204406366, + "grad_norm": 0.18015268445014954, + "learning_rate": 0.00018864644393628066, + "loss": 0.1926, + "mean_token_accuracy": 0.9375617653131485, + "num_tokens": 442812.0, + "step": 154 + }, + { + "entropy": 0.18558422103524208, + "epoch": 0.189718482252142, + "grad_norm": 0.21271689236164093, + "learning_rate": 0.00018847225383720632, + "loss": 0.2166, + "mean_token_accuracy": 0.9351175427436829, + "num_tokens": 445682.0, + "step": 155 + }, + { + "entropy": 0.1903727501630783, + "epoch": 0.1909424724602203, + "grad_norm": 0.22528168559074402, + "learning_rate": 0.0001882968290282071, + "loss": 0.2265, + "mean_token_accuracy": 0.9346236139535904, + "num_tokens": 448612.0, + "step": 156 + }, + { + "entropy": 0.15618859976530075, + "epoch": 0.19216646266829865, + "grad_norm": 0.15713728964328766, + "learning_rate": 0.0001881201722694784, + "loss": 0.1581, + "mean_token_accuracy": 0.9467688798904419, + "num_tokens": 451315.0, + "step": 157 + }, + { + "entropy": 0.1818033643066883, + "epoch": 0.193390452876377, + "grad_norm": 0.2185610979795456, + "learning_rate": 0.0001879422863405995, + "loss": 0.1759, + "mean_token_accuracy": 0.9469911754131317, + "num_tokens": 454091.0, + "step": 158 + }, + { + "entropy": 0.18168357759714127, + "epoch": 0.19461444308445533, + "grad_norm": 0.19382953643798828, + "learning_rate": 0.0001877631740404899, + "loss": 0.1784, + "mean_token_accuracy": 0.9449680894613266, + "num_tokens": 456994.0, + "step": 159 + }, + { + "entropy": 0.18659062683582306, + "epoch": 0.19583843329253367, + "grad_norm": 0.2154880166053772, + "learning_rate": 0.00018758283818736524, + "loss": 0.1975, + "mean_token_accuracy": 0.9374001771211624, + "num_tokens": 459742.0, + "step": 160 + }, + { + "entropy": 0.20422787964344025, + "epoch": 0.19706242350061198, + "grad_norm": 0.23205743730068207, + "learning_rate": 0.00018740128161869306, + "loss": 0.2154, + "mean_token_accuracy": 0.9365729838609695, + "num_tokens": 462436.0, + "step": 161 + }, + { + "entropy": 0.19083555042743683, + "epoch": 0.19828641370869032, + "grad_norm": 0.19085632264614105, + "learning_rate": 0.00018721850719114795, + "loss": 0.1672, + "mean_token_accuracy": 0.9514075368642807, + "num_tokens": 465348.0, + "step": 162 + }, + { + "entropy": 0.17570912465453148, + "epoch": 0.19951040391676866, + "grad_norm": 0.15577992796897888, + "learning_rate": 0.00018703451778056675, + "loss": 0.1594, + "mean_token_accuracy": 0.9493329524993896, + "num_tokens": 468284.0, + "step": 163 + }, + { + "entropy": 0.20927099511027336, + "epoch": 0.200734394124847, + "grad_norm": 0.18378165364265442, + "learning_rate": 0.00018684931628190334, + "loss": 0.2163, + "mean_token_accuracy": 0.9379559308290482, + "num_tokens": 471121.0, + "step": 164 + }, + { + "entropy": 0.19773852080106735, + "epoch": 0.20195838433292534, + "grad_norm": 0.16502057015895844, + "learning_rate": 0.00018666290560918292, + "loss": 0.1789, + "mean_token_accuracy": 0.9469153583049774, + "num_tokens": 473881.0, + "step": 165 + }, + { + "entropy": 0.19184665754437447, + "epoch": 0.20318237454100369, + "grad_norm": 0.2153913825750351, + "learning_rate": 0.00018647528869545631, + "loss": 0.1861, + "mean_token_accuracy": 0.9431813210248947, + "num_tokens": 476925.0, + "step": 166 + }, + { + "entropy": 0.1783909946680069, + "epoch": 0.204406364749082, + "grad_norm": 0.1640944629907608, + "learning_rate": 0.00018628646849275373, + "loss": 0.1896, + "mean_token_accuracy": 0.9449851214885712, + "num_tokens": 479728.0, + "step": 167 + }, + { + "entropy": 0.19220372289419174, + "epoch": 0.20563035495716034, + "grad_norm": 0.20433193445205688, + "learning_rate": 0.0001860964479720384, + "loss": 0.1756, + "mean_token_accuracy": 0.9430384337902069, + "num_tokens": 482619.0, + "step": 168 + }, + { + "entropy": 0.19230392575263977, + "epoch": 0.20685434516523868, + "grad_norm": 0.20530015230178833, + "learning_rate": 0.00018590523012315972, + "loss": 0.1868, + "mean_token_accuracy": 0.9438671320676804, + "num_tokens": 485570.0, + "step": 169 + }, + { + "entropy": 0.17511818930506706, + "epoch": 0.20807833537331702, + "grad_norm": 0.19694867730140686, + "learning_rate": 0.0001857128179548063, + "loss": 0.1996, + "mean_token_accuracy": 0.9444852769374847, + "num_tokens": 488445.0, + "step": 170 + }, + { + "entropy": 0.18230733647942543, + "epoch": 0.20930232558139536, + "grad_norm": 0.18150445818901062, + "learning_rate": 0.0001855192144944586, + "loss": 0.187, + "mean_token_accuracy": 0.9412347078323364, + "num_tokens": 491221.0, + "step": 171 + }, + { + "entropy": 0.17679818719625473, + "epoch": 0.21052631578947367, + "grad_norm": 0.16929912567138672, + "learning_rate": 0.00018532442278834129, + "loss": 0.1629, + "mean_token_accuracy": 0.949176624417305, + "num_tokens": 494111.0, + "step": 172 + }, + { + "entropy": 0.19764605909585953, + "epoch": 0.211750305997552, + "grad_norm": 0.15386848151683807, + "learning_rate": 0.0001851284459013752, + "loss": 0.18, + "mean_token_accuracy": 0.9428203105926514, + "num_tokens": 496985.0, + "step": 173 + }, + { + "entropy": 0.1830284371972084, + "epoch": 0.21297429620563035, + "grad_norm": 0.19857825338840485, + "learning_rate": 0.00018493128691712943, + "loss": 0.1966, + "mean_token_accuracy": 0.939018040895462, + "num_tokens": 499957.0, + "step": 174 + }, + { + "entropy": 0.18198782950639725, + "epoch": 0.2141982864137087, + "grad_norm": 0.25519418716430664, + "learning_rate": 0.00018473294893777241, + "loss": 0.1989, + "mean_token_accuracy": 0.9403301775455475, + "num_tokens": 502971.0, + "step": 175 + }, + { + "entropy": 0.18772383406758308, + "epoch": 0.21542227662178703, + "grad_norm": 0.1947707086801529, + "learning_rate": 0.00018453343508402338, + "loss": 0.1981, + "mean_token_accuracy": 0.9359713792800903, + "num_tokens": 505903.0, + "step": 176 + }, + { + "entropy": 0.18177012354135513, + "epoch": 0.21664626682986537, + "grad_norm": 0.20763908326625824, + "learning_rate": 0.00018433274849510321, + "loss": 0.171, + "mean_token_accuracy": 0.9494579136371613, + "num_tokens": 508726.0, + "step": 177 + }, + { + "entropy": 0.1637764871120453, + "epoch": 0.2178702570379437, + "grad_norm": 0.17779627442359924, + "learning_rate": 0.00018413089232868497, + "loss": 0.1462, + "mean_token_accuracy": 0.9507607966661453, + "num_tokens": 511563.0, + "step": 178 + }, + { + "entropy": 0.20556450262665749, + "epoch": 0.21909424724602203, + "grad_norm": 0.21839852631092072, + "learning_rate": 0.00018392786976084425, + "loss": 0.2128, + "mean_token_accuracy": 0.9350947886705399, + "num_tokens": 514569.0, + "step": 179 + }, + { + "entropy": 0.16066605225205421, + "epoch": 0.22031823745410037, + "grad_norm": 0.18180422484874725, + "learning_rate": 0.00018372368398600927, + "loss": 0.1572, + "mean_token_accuracy": 0.94941745698452, + "num_tokens": 517347.0, + "step": 180 + }, + { + "entropy": 0.1602637805044651, + "epoch": 0.2215422276621787, + "grad_norm": 0.17300912737846375, + "learning_rate": 0.0001835183382169105, + "loss": 0.1588, + "mean_token_accuracy": 0.9485157877206802, + "num_tokens": 520187.0, + "step": 181 + }, + { + "entropy": 0.16499871760606766, + "epoch": 0.22276621787025705, + "grad_norm": 0.16628842055797577, + "learning_rate": 0.00018331183568453022, + "loss": 0.1608, + "mean_token_accuracy": 0.9501471370458603, + "num_tokens": 523052.0, + "step": 182 + }, + { + "entropy": 0.18369654193520546, + "epoch": 0.22399020807833536, + "grad_norm": 0.19967518746852875, + "learning_rate": 0.00018310417963805155, + "loss": 0.2117, + "mean_token_accuracy": 0.9424542188644409, + "num_tokens": 525908.0, + "step": 183 + }, + { + "entropy": 0.16439638286828995, + "epoch": 0.2252141982864137, + "grad_norm": 0.18265485763549805, + "learning_rate": 0.00018289537334480752, + "loss": 0.1646, + "mean_token_accuracy": 0.9496809095144272, + "num_tokens": 528771.0, + "step": 184 + }, + { + "entropy": 0.1878223903477192, + "epoch": 0.22643818849449204, + "grad_norm": 0.1926940232515335, + "learning_rate": 0.0001826854200902294, + "loss": 0.1905, + "mean_token_accuracy": 0.9376941174268723, + "num_tokens": 531695.0, + "step": 185 + }, + { + "entropy": 0.1661827340722084, + "epoch": 0.22766217870257038, + "grad_norm": 0.18506911396980286, + "learning_rate": 0.0001824743231777953, + "loss": 0.1595, + "mean_token_accuracy": 0.9480125308036804, + "num_tokens": 534608.0, + "step": 186 + }, + { + "entropy": 0.1938113383948803, + "epoch": 0.22888616891064872, + "grad_norm": 0.18394246697425842, + "learning_rate": 0.0001822620859289779, + "loss": 0.2047, + "mean_token_accuracy": 0.9360768646001816, + "num_tokens": 537451.0, + "step": 187 + }, + { + "entropy": 0.18548545241355896, + "epoch": 0.23011015911872704, + "grad_norm": 0.19763712584972382, + "learning_rate": 0.00018204871168319244, + "loss": 0.208, + "mean_token_accuracy": 0.9336741715669632, + "num_tokens": 540273.0, + "step": 188 + }, + { + "entropy": 0.16884029656648636, + "epoch": 0.23133414932680538, + "grad_norm": 0.16999542713165283, + "learning_rate": 0.00018183420379774408, + "loss": 0.1723, + "mean_token_accuracy": 0.946614533662796, + "num_tokens": 543087.0, + "step": 189 + }, + { + "entropy": 0.18270950391888618, + "epoch": 0.23255813953488372, + "grad_norm": 0.14409801363945007, + "learning_rate": 0.0001816185656477749, + "loss": 0.1574, + "mean_token_accuracy": 0.9510624706745148, + "num_tokens": 546170.0, + "step": 190 + }, + { + "entropy": 0.18100232258439064, + "epoch": 0.23378212974296206, + "grad_norm": 0.15344634652137756, + "learning_rate": 0.00018140180062621117, + "loss": 0.1685, + "mean_token_accuracy": 0.9480974972248077, + "num_tokens": 548999.0, + "step": 191 + }, + { + "entropy": 0.17836323380470276, + "epoch": 0.2350061199510404, + "grad_norm": 0.2597241997718811, + "learning_rate": 0.00018118391214370965, + "loss": 0.1804, + "mean_token_accuracy": 0.9419548660516739, + "num_tokens": 551819.0, + "step": 192 + }, + { + "entropy": 0.18338828161358833, + "epoch": 0.23623011015911874, + "grad_norm": 0.19250500202178955, + "learning_rate": 0.00018096490362860397, + "loss": 0.1927, + "mean_token_accuracy": 0.9372808635234833, + "num_tokens": 554797.0, + "step": 193 + }, + { + "entropy": 0.1730855070054531, + "epoch": 0.23745410036719705, + "grad_norm": 0.12724949419498444, + "learning_rate": 0.00018074477852685089, + "loss": 0.1485, + "mean_token_accuracy": 0.9545421898365021, + "num_tokens": 557676.0, + "step": 194 + }, + { + "entropy": 0.15416070446372032, + "epoch": 0.2386780905752754, + "grad_norm": 0.16263695061206818, + "learning_rate": 0.00018052354030197577, + "loss": 0.1528, + "mean_token_accuracy": 0.9509410858154297, + "num_tokens": 560571.0, + "step": 195 + }, + { + "entropy": 0.16779664903879166, + "epoch": 0.23990208078335373, + "grad_norm": 0.22550639510154724, + "learning_rate": 0.00018030119243501842, + "loss": 0.1887, + "mean_token_accuracy": 0.9451108574867249, + "num_tokens": 563319.0, + "step": 196 + }, + { + "entropy": 0.16639390960335732, + "epoch": 0.24112607099143207, + "grad_norm": 0.17296341061592102, + "learning_rate": 0.00018007773842447798, + "loss": 0.1522, + "mean_token_accuracy": 0.9470518976449966, + "num_tokens": 566212.0, + "step": 197 + }, + { + "entropy": 0.16828778013586998, + "epoch": 0.2423500611995104, + "grad_norm": 0.18193146586418152, + "learning_rate": 0.0001798531817862581, + "loss": 0.1687, + "mean_token_accuracy": 0.9467720240354538, + "num_tokens": 569073.0, + "step": 198 + }, + { + "entropy": 0.1462346874177456, + "epoch": 0.24357405140758873, + "grad_norm": 0.2021908164024353, + "learning_rate": 0.00017962752605361166, + "loss": 0.1513, + "mean_token_accuracy": 0.9474725276231766, + "num_tokens": 572002.0, + "step": 199 + }, + { + "entropy": 0.15045399963855743, + "epoch": 0.24479804161566707, + "grad_norm": 0.1962203085422516, + "learning_rate": 0.0001794007747770849, + "loss": 0.1574, + "mean_token_accuracy": 0.9493848234415054, + "num_tokens": 574874.0, + "step": 200 + }, + { + "entropy": 0.15729429945349693, + "epoch": 0.2460220318237454, + "grad_norm": 0.20367532968521118, + "learning_rate": 0.00017917293152446183, + "loss": 0.1583, + "mean_token_accuracy": 0.9445710331201553, + "num_tokens": 577808.0, + "step": 201 + }, + { + "entropy": 0.1622149646282196, + "epoch": 0.24724602203182375, + "grad_norm": 0.22396479547023773, + "learning_rate": 0.00017894399988070803, + "loss": 0.1643, + "mean_token_accuracy": 0.9492162019014359, + "num_tokens": 580747.0, + "step": 202 + }, + { + "entropy": 0.16159441694617271, + "epoch": 0.2484700122399021, + "grad_norm": 0.23492002487182617, + "learning_rate": 0.00017871398344791414, + "loss": 0.1959, + "mean_token_accuracy": 0.9469300806522369, + "num_tokens": 583559.0, + "step": 203 + }, + { + "entropy": 0.17220193520188332, + "epoch": 0.24969400244798043, + "grad_norm": 0.28471776843070984, + "learning_rate": 0.00017848288584523925, + "loss": 0.1811, + "mean_token_accuracy": 0.9446441382169724, + "num_tokens": 586487.0, + "step": 204 + }, + { + "entropy": 0.17497995868325233, + "epoch": 0.25091799265605874, + "grad_norm": 0.18369029462337494, + "learning_rate": 0.000178250710708854, + "loss": 0.1641, + "mean_token_accuracy": 0.9473273903131485, + "num_tokens": 589220.0, + "step": 205 + }, + { + "entropy": 0.17878996208310127, + "epoch": 0.2521419828641371, + "grad_norm": 0.20050311088562012, + "learning_rate": 0.00017801746169188328, + "loss": 0.184, + "mean_token_accuracy": 0.93953637778759, + "num_tokens": 592071.0, + "step": 206 + }, + { + "entropy": 0.15223676338791847, + "epoch": 0.2533659730722154, + "grad_norm": 0.15353935956954956, + "learning_rate": 0.0001777831424643488, + "loss": 0.1495, + "mean_token_accuracy": 0.95431187748909, + "num_tokens": 594864.0, + "step": 207 + }, + { + "entropy": 0.17296412959694862, + "epoch": 0.25458996328029376, + "grad_norm": 0.16742084920406342, + "learning_rate": 0.0001775477567131114, + "loss": 0.168, + "mean_token_accuracy": 0.9455537647008896, + "num_tokens": 597826.0, + "step": 208 + }, + { + "entropy": 0.17634878307580948, + "epoch": 0.2558139534883721, + "grad_norm": 0.16971459984779358, + "learning_rate": 0.00017731130814181288, + "loss": 0.161, + "mean_token_accuracy": 0.9493403434753418, + "num_tokens": 600497.0, + "step": 209 + }, + { + "entropy": 0.158625029027462, + "epoch": 0.25703794369645044, + "grad_norm": 0.20032508671283722, + "learning_rate": 0.00017707380047081794, + "loss": 0.1731, + "mean_token_accuracy": 0.9465848356485367, + "num_tokens": 603483.0, + "step": 210 + }, + { + "entropy": 0.1820707693696022, + "epoch": 0.2582619339045288, + "grad_norm": 0.17230579257011414, + "learning_rate": 0.00017683523743715538, + "loss": 0.1819, + "mean_token_accuracy": 0.9470725059509277, + "num_tokens": 606184.0, + "step": 211 + }, + { + "entropy": 0.18469195812940598, + "epoch": 0.2594859241126071, + "grad_norm": 0.19344152510166168, + "learning_rate": 0.00017659562279445957, + "loss": 0.2055, + "mean_token_accuracy": 0.9400018602609634, + "num_tokens": 609017.0, + "step": 212 + }, + { + "entropy": 0.17892184108495712, + "epoch": 0.2607099143206854, + "grad_norm": 0.16015875339508057, + "learning_rate": 0.00017635496031291115, + "loss": 0.1767, + "mean_token_accuracy": 0.9444658905267715, + "num_tokens": 611917.0, + "step": 213 + }, + { + "entropy": 0.16957420110702515, + "epoch": 0.26193390452876375, + "grad_norm": 0.15975043177604675, + "learning_rate": 0.00017611325377917797, + "loss": 0.1782, + "mean_token_accuracy": 0.9468631595373154, + "num_tokens": 614670.0, + "step": 214 + }, + { + "entropy": 0.1795290894806385, + "epoch": 0.2631578947368421, + "grad_norm": 0.1569456160068512, + "learning_rate": 0.0001758705069963553, + "loss": 0.1718, + "mean_token_accuracy": 0.9439380913972855, + "num_tokens": 617542.0, + "step": 215 + }, + { + "entropy": 0.181200310587883, + "epoch": 0.26438188494492043, + "grad_norm": 0.1478230208158493, + "learning_rate": 0.00017562672378390595, + "loss": 0.1628, + "mean_token_accuracy": 0.9471941888332367, + "num_tokens": 620416.0, + "step": 216 + }, + { + "entropy": 0.1878260225057602, + "epoch": 0.26560587515299877, + "grad_norm": 0.19499057531356812, + "learning_rate": 0.00017538190797760042, + "loss": 0.1916, + "mean_token_accuracy": 0.9431721717119217, + "num_tokens": 623235.0, + "step": 217 + }, + { + "entropy": 0.15607567131519318, + "epoch": 0.2668298653610771, + "grad_norm": 0.2671367824077606, + "learning_rate": 0.00017513606342945632, + "loss": 0.1436, + "mean_token_accuracy": 0.9530814439058304, + "num_tokens": 625982.0, + "step": 218 + }, + { + "entropy": 0.17111649736762047, + "epoch": 0.26805385556915545, + "grad_norm": 0.15476374328136444, + "learning_rate": 0.00017488919400767788, + "loss": 0.1758, + "mean_token_accuracy": 0.946123331785202, + "num_tokens": 628858.0, + "step": 219 + }, + { + "entropy": 0.17895303294062614, + "epoch": 0.2692778457772338, + "grad_norm": 0.1611027717590332, + "learning_rate": 0.00017464130359659509, + "loss": 0.1873, + "mean_token_accuracy": 0.9450231343507767, + "num_tokens": 631891.0, + "step": 220 + }, + { + "entropy": 0.17547280713915825, + "epoch": 0.27050183598531213, + "grad_norm": 0.16108718514442444, + "learning_rate": 0.0001743923960966024, + "loss": 0.1672, + "mean_token_accuracy": 0.947683796286583, + "num_tokens": 634772.0, + "step": 221 + }, + { + "entropy": 0.16660896316170692, + "epoch": 0.2717258261933905, + "grad_norm": 0.1638018786907196, + "learning_rate": 0.00017414247542409767, + "loss": 0.1687, + "mean_token_accuracy": 0.941192552447319, + "num_tokens": 637630.0, + "step": 222 + }, + { + "entropy": 0.15990225411951542, + "epoch": 0.2729498164014688, + "grad_norm": 0.18220585584640503, + "learning_rate": 0.00017389154551142026, + "loss": 0.1661, + "mean_token_accuracy": 0.9494237303733826, + "num_tokens": 640448.0, + "step": 223 + }, + { + "entropy": 0.16279374808073044, + "epoch": 0.2741738066095471, + "grad_norm": 0.16498099267482758, + "learning_rate": 0.00017363961030678927, + "loss": 0.1629, + "mean_token_accuracy": 0.9454193711280823, + "num_tokens": 643272.0, + "step": 224 + }, + { + "entropy": 0.15560708194971085, + "epoch": 0.27539779681762544, + "grad_norm": 0.13630273938179016, + "learning_rate": 0.00017338667377424151, + "loss": 0.1575, + "mean_token_accuracy": 0.9491967558860779, + "num_tokens": 646174.0, + "step": 225 + }, + { + "entropy": 0.1652889847755432, + "epoch": 0.2766217870257038, + "grad_norm": 0.15164905786514282, + "learning_rate": 0.00017313273989356893, + "loss": 0.1605, + "mean_token_accuracy": 0.9483360350131989, + "num_tokens": 649049.0, + "step": 226 + }, + { + "entropy": 0.16370249167084694, + "epoch": 0.2778457772337821, + "grad_norm": 0.15450038015842438, + "learning_rate": 0.00017287781266025614, + "loss": 0.1797, + "mean_token_accuracy": 0.947179839015007, + "num_tokens": 651922.0, + "step": 227 + }, + { + "entropy": 0.16513775661587715, + "epoch": 0.27906976744186046, + "grad_norm": 0.15598060190677643, + "learning_rate": 0.00017262189608541748, + "loss": 0.1547, + "mean_token_accuracy": 0.9510365873575211, + "num_tokens": 654711.0, + "step": 228 + }, + { + "entropy": 0.15433086827397346, + "epoch": 0.2802937576499388, + "grad_norm": 0.1199483796954155, + "learning_rate": 0.00017236499419573403, + "loss": 0.1341, + "mean_token_accuracy": 0.9524166136980057, + "num_tokens": 657689.0, + "step": 229 + }, + { + "entropy": 0.17272919043898582, + "epoch": 0.28151774785801714, + "grad_norm": 0.19626493752002716, + "learning_rate": 0.0001721071110333901, + "loss": 0.1976, + "mean_token_accuracy": 0.9397020041942596, + "num_tokens": 660632.0, + "step": 230 + }, + { + "entropy": 0.16541118174791336, + "epoch": 0.2827417380660955, + "grad_norm": 0.16486576199531555, + "learning_rate": 0.00017184825065600962, + "loss": 0.1617, + "mean_token_accuracy": 0.9517865031957626, + "num_tokens": 663562.0, + "step": 231 + }, + { + "entropy": 0.164056695997715, + "epoch": 0.2839657282741738, + "grad_norm": 0.17284096777439117, + "learning_rate": 0.00017158841713659246, + "loss": 0.1626, + "mean_token_accuracy": 0.9469199180603027, + "num_tokens": 666428.0, + "step": 232 + }, + { + "entropy": 0.15050045400857925, + "epoch": 0.28518971848225216, + "grad_norm": 0.14317849278450012, + "learning_rate": 0.00017132761456345024, + "loss": 0.1423, + "mean_token_accuracy": 0.955637976527214, + "num_tokens": 669279.0, + "step": 233 + }, + { + "entropy": 0.16040299087762833, + "epoch": 0.2864137086903305, + "grad_norm": 0.12360849976539612, + "learning_rate": 0.00017106584704014191, + "loss": 0.143, + "mean_token_accuracy": 0.9516117870807648, + "num_tokens": 672183.0, + "step": 234 + }, + { + "entropy": 0.17236219719052315, + "epoch": 0.2876376988984088, + "grad_norm": 0.17805422842502594, + "learning_rate": 0.00017080311868540943, + "loss": 0.1783, + "mean_token_accuracy": 0.9467423856258392, + "num_tokens": 675072.0, + "step": 235 + }, + { + "entropy": 0.1514708250761032, + "epoch": 0.28886168910648713, + "grad_norm": 0.20059767365455627, + "learning_rate": 0.00017053943363311275, + "loss": 0.1638, + "mean_token_accuracy": 0.9483214765787125, + "num_tokens": 677851.0, + "step": 236 + }, + { + "entropy": 0.16743789613246918, + "epoch": 0.29008567931456547, + "grad_norm": 0.16960926353931427, + "learning_rate": 0.0001702747960321648, + "loss": 0.1542, + "mean_token_accuracy": 0.9489087015390396, + "num_tokens": 680654.0, + "step": 237 + }, + { + "entropy": 0.16658775135874748, + "epoch": 0.2913096695226438, + "grad_norm": 0.1454268842935562, + "learning_rate": 0.00017000921004646627, + "loss": 0.1675, + "mean_token_accuracy": 0.9475969672203064, + "num_tokens": 683572.0, + "step": 238 + }, + { + "entropy": 0.15546391159296036, + "epoch": 0.29253365973072215, + "grad_norm": 0.24349121749401093, + "learning_rate": 0.00016974267985484008, + "loss": 0.1627, + "mean_token_accuracy": 0.9483270943164825, + "num_tokens": 686452.0, + "step": 239 + }, + { + "entropy": 0.1651969887316227, + "epoch": 0.2937576499388005, + "grad_norm": 0.16770996153354645, + "learning_rate": 0.00016947520965096563, + "loss": 0.18, + "mean_token_accuracy": 0.9437374174594879, + "num_tokens": 689462.0, + "step": 240 + }, + { + "entropy": 0.14968755841255188, + "epoch": 0.29498164014687883, + "grad_norm": 0.14128656685352325, + "learning_rate": 0.00016920680364331282, + "loss": 0.1564, + "mean_token_accuracy": 0.9496227204799652, + "num_tokens": 692265.0, + "step": 241 + }, + { + "entropy": 0.14512823522090912, + "epoch": 0.2962056303549572, + "grad_norm": 0.12016550451517105, + "learning_rate": 0.0001689374660550757, + "loss": 0.1389, + "mean_token_accuracy": 0.9531503915786743, + "num_tokens": 695216.0, + "step": 242 + }, + { + "entropy": 0.16646289825439453, + "epoch": 0.2974296205630355, + "grad_norm": 0.13769562542438507, + "learning_rate": 0.0001686672011241062, + "loss": 0.1596, + "mean_token_accuracy": 0.9455363601446152, + "num_tokens": 698149.0, + "step": 243 + }, + { + "entropy": 0.15644201636314392, + "epoch": 0.29865361077111385, + "grad_norm": 0.1847907304763794, + "learning_rate": 0.0001683960131028475, + "loss": 0.1585, + "mean_token_accuracy": 0.9488164186477661, + "num_tokens": 700908.0, + "step": 244 + }, + { + "entropy": 0.16788003966212273, + "epoch": 0.2998776009791922, + "grad_norm": 0.15104271471500397, + "learning_rate": 0.00016812390625826678, + "loss": 0.1642, + "mean_token_accuracy": 0.9465269148349762, + "num_tokens": 703841.0, + "step": 245 + }, + { + "entropy": 0.16064897924661636, + "epoch": 0.3011015911872705, + "grad_norm": 0.1464117020368576, + "learning_rate": 0.00016785088487178854, + "loss": 0.1673, + "mean_token_accuracy": 0.9491890221834183, + "num_tokens": 706746.0, + "step": 246 + }, + { + "entropy": 0.1823849119246006, + "epoch": 0.3023255813953488, + "grad_norm": 0.13486997783184052, + "learning_rate": 0.00016757695323922687, + "loss": 0.1699, + "mean_token_accuracy": 0.9434429407119751, + "num_tokens": 709662.0, + "step": 247 + }, + { + "entropy": 0.1726757325232029, + "epoch": 0.30354957160342716, + "grad_norm": 0.16196493804454803, + "learning_rate": 0.00016730211567071794, + "loss": 0.1748, + "mean_token_accuracy": 0.9453353434801102, + "num_tokens": 712483.0, + "step": 248 + }, + { + "entropy": 0.15389468148350716, + "epoch": 0.3047735618115055, + "grad_norm": 0.13152503967285156, + "learning_rate": 0.00016702637649065237, + "loss": 0.1486, + "mean_token_accuracy": 0.9476341009140015, + "num_tokens": 715262.0, + "step": 249 + }, + { + "entropy": 0.15640466287732124, + "epoch": 0.30599755201958384, + "grad_norm": 0.16535264253616333, + "learning_rate": 0.00016674974003760704, + "loss": 0.1726, + "mean_token_accuracy": 0.9492945671081543, + "num_tokens": 717974.0, + "step": 250 + }, + { + "entropy": 0.15911602601408958, + "epoch": 0.3072215422276622, + "grad_norm": 0.18389979004859924, + "learning_rate": 0.0001664722106642767, + "loss": 0.1781, + "mean_token_accuracy": 0.9488258361816406, + "num_tokens": 720779.0, + "step": 251 + }, + { + "entropy": 0.16676464304327965, + "epoch": 0.3084455324357405, + "grad_norm": 0.1483651101589203, + "learning_rate": 0.00016619379273740572, + "loss": 0.1607, + "mean_token_accuracy": 0.94488126039505, + "num_tokens": 723592.0, + "step": 252 + }, + { + "entropy": 0.15845325589179993, + "epoch": 0.30966952264381886, + "grad_norm": 0.1452375054359436, + "learning_rate": 0.00016591449063771928, + "loss": 0.1499, + "mean_token_accuracy": 0.9465564489364624, + "num_tokens": 726410.0, + "step": 253 + }, + { + "entropy": 0.14364412799477577, + "epoch": 0.3108935128518972, + "grad_norm": 0.13499148190021515, + "learning_rate": 0.00016563430875985447, + "loss": 0.1483, + "mean_token_accuracy": 0.9524897783994675, + "num_tokens": 729132.0, + "step": 254 + }, + { + "entropy": 0.15591862052679062, + "epoch": 0.31211750305997554, + "grad_norm": 0.1372753530740738, + "learning_rate": 0.00016535325151229106, + "loss": 0.1505, + "mean_token_accuracy": 0.9490492343902588, + "num_tokens": 731974.0, + "step": 255 + }, + { + "entropy": 0.16933650523424149, + "epoch": 0.31334149326805383, + "grad_norm": 0.13696317374706268, + "learning_rate": 0.00016507132331728221, + "loss": 0.1546, + "mean_token_accuracy": 0.9503903985023499, + "num_tokens": 734714.0, + "step": 256 + }, + { + "entropy": 0.16402816027402878, + "epoch": 0.31456548347613217, + "grad_norm": 0.2016628384590149, + "learning_rate": 0.00016478852861078486, + "loss": 0.1943, + "mean_token_accuracy": 0.9415709674358368, + "num_tokens": 737424.0, + "step": 257 + }, + { + "entropy": 0.14688615500926971, + "epoch": 0.3157894736842105, + "grad_norm": 0.12992976605892181, + "learning_rate": 0.00016450487184239, + "loss": 0.1491, + "mean_token_accuracy": 0.9528147131204605, + "num_tokens": 740271.0, + "step": 258 + }, + { + "entropy": 0.15046832337975502, + "epoch": 0.31701346389228885, + "grad_norm": 0.986178457736969, + "learning_rate": 0.00016422035747525257, + "loss": 0.16, + "mean_token_accuracy": 0.9460361450910568, + "num_tokens": 743131.0, + "step": 259 + }, + { + "entropy": 0.16561544686555862, + "epoch": 0.3182374541003672, + "grad_norm": 0.12990634143352509, + "learning_rate": 0.0001639349899860212, + "loss": 0.1585, + "mean_token_accuracy": 0.9492562413215637, + "num_tokens": 745991.0, + "step": 260 + }, + { + "entropy": 0.1497195027768612, + "epoch": 0.31946144430844553, + "grad_norm": 0.12717635929584503, + "learning_rate": 0.00016364877386476802, + "loss": 0.1464, + "mean_token_accuracy": 0.9557505398988724, + "num_tokens": 748821.0, + "step": 261 + }, + { + "entropy": 0.14798679575324059, + "epoch": 0.32068543451652387, + "grad_norm": 0.11717986315488815, + "learning_rate": 0.00016336171361491764, + "loss": 0.1297, + "mean_token_accuracy": 0.9543893784284592, + "num_tokens": 751573.0, + "step": 262 + }, + { + "entropy": 0.15282408520579338, + "epoch": 0.3219094247246022, + "grad_norm": 0.17172206938266754, + "learning_rate": 0.00016307381375317662, + "loss": 0.1748, + "mean_token_accuracy": 0.9438786953687668, + "num_tokens": 754477.0, + "step": 263 + }, + { + "entropy": 0.14917079731822014, + "epoch": 0.32313341493268055, + "grad_norm": 0.12849947810173035, + "learning_rate": 0.0001627850788094621, + "loss": 0.1401, + "mean_token_accuracy": 0.9506666511297226, + "num_tokens": 757335.0, + "step": 264 + }, + { + "entropy": 0.1572648622095585, + "epoch": 0.3243574051407589, + "grad_norm": 0.1511721909046173, + "learning_rate": 0.000162495513326831, + "loss": 0.1687, + "mean_token_accuracy": 0.9507835209369659, + "num_tokens": 760137.0, + "step": 265 + }, + { + "entropy": 0.16095808148384094, + "epoch": 0.32558139534883723, + "grad_norm": 0.16972914338111877, + "learning_rate": 0.00016220512186140786, + "loss": 0.1709, + "mean_token_accuracy": 0.9424894452095032, + "num_tokens": 763131.0, + "step": 266 + }, + { + "entropy": 0.16730071604251862, + "epoch": 0.3268053855569155, + "grad_norm": 0.15815693140029907, + "learning_rate": 0.00016191390898231378, + "loss": 0.1654, + "mean_token_accuracy": 0.9475398361682892, + "num_tokens": 766074.0, + "step": 267 + }, + { + "entropy": 0.15982737392187119, + "epoch": 0.32802937576499386, + "grad_norm": 0.13941867649555206, + "learning_rate": 0.00016162187927159415, + "loss": 0.1562, + "mean_token_accuracy": 0.9470800459384918, + "num_tokens": 768940.0, + "step": 268 + }, + { + "entropy": 0.1737975850701332, + "epoch": 0.3292533659730722, + "grad_norm": 0.1692054122686386, + "learning_rate": 0.00016132903732414677, + "loss": 0.1706, + "mean_token_accuracy": 0.9451922178268433, + "num_tokens": 771876.0, + "step": 269 + }, + { + "entropy": 0.17409133166074753, + "epoch": 0.33047735618115054, + "grad_norm": 0.1440180093050003, + "learning_rate": 0.00016103538774764936, + "loss": 0.1593, + "mean_token_accuracy": 0.9453429281711578, + "num_tokens": 774815.0, + "step": 270 + }, + { + "entropy": 0.14217909425497055, + "epoch": 0.3317013463892289, + "grad_norm": 0.10787917673587799, + "learning_rate": 0.00016074093516248725, + "loss": 0.123, + "mean_token_accuracy": 0.9575345516204834, + "num_tokens": 777753.0, + "step": 271 + }, + { + "entropy": 0.14409229345619678, + "epoch": 0.3329253365973072, + "grad_norm": 0.16516132652759552, + "learning_rate": 0.00016044568420168047, + "loss": 0.1346, + "mean_token_accuracy": 0.9506991654634476, + "num_tokens": 780575.0, + "step": 272 + }, + { + "entropy": 0.15628314018249512, + "epoch": 0.33414932680538556, + "grad_norm": 0.19409486651420593, + "learning_rate": 0.0001601496395108111, + "loss": 0.1601, + "mean_token_accuracy": 0.9453092515468597, + "num_tokens": 783522.0, + "step": 273 + }, + { + "entropy": 0.1545437127351761, + "epoch": 0.3353733170134639, + "grad_norm": 0.17519228160381317, + "learning_rate": 0.00015985280574794993, + "loss": 0.1517, + "mean_token_accuracy": 0.9514786303043365, + "num_tokens": 786437.0, + "step": 274 + }, + { + "entropy": 0.16318928822875023, + "epoch": 0.33659730722154224, + "grad_norm": 0.15866655111312866, + "learning_rate": 0.00015955518758358338, + "loss": 0.1664, + "mean_token_accuracy": 0.9402740448713303, + "num_tokens": 789398.0, + "step": 275 + }, + { + "entropy": 0.1511749029159546, + "epoch": 0.3378212974296206, + "grad_norm": 0.18697375059127808, + "learning_rate": 0.0001592567897005399, + "loss": 0.1636, + "mean_token_accuracy": 0.9464134275913239, + "num_tokens": 792236.0, + "step": 276 + }, + { + "entropy": 0.16786139458417892, + "epoch": 0.3390452876376989, + "grad_norm": 0.2146279662847519, + "learning_rate": 0.0001589576167939163, + "loss": 0.197, + "mean_token_accuracy": 0.9406168162822723, + "num_tokens": 795216.0, + "step": 277 + }, + { + "entropy": 0.14407162740826607, + "epoch": 0.3402692778457772, + "grad_norm": 0.1653442531824112, + "learning_rate": 0.0001586576735710038, + "loss": 0.1455, + "mean_token_accuracy": 0.9496916979551315, + "num_tokens": 798024.0, + "step": 278 + }, + { + "entropy": 0.1474698670208454, + "epoch": 0.34149326805385555, + "grad_norm": 0.16035403311252594, + "learning_rate": 0.00015835696475121418, + "loss": 0.161, + "mean_token_accuracy": 0.9472674876451492, + "num_tokens": 800893.0, + "step": 279 + }, + { + "entropy": 0.14612959697842598, + "epoch": 0.3427172582619339, + "grad_norm": 0.1387489289045334, + "learning_rate": 0.00015805549506600527, + "loss": 0.1396, + "mean_token_accuracy": 0.9526460319757462, + "num_tokens": 803927.0, + "step": 280 + }, + { + "entropy": 0.1554742231965065, + "epoch": 0.34394124847001223, + "grad_norm": 0.17447642982006073, + "learning_rate": 0.00015775326925880674, + "loss": 0.1693, + "mean_token_accuracy": 0.9476045221090317, + "num_tokens": 806841.0, + "step": 281 + }, + { + "entropy": 0.15349746868014336, + "epoch": 0.34516523867809057, + "grad_norm": 0.16736829280853271, + "learning_rate": 0.0001574502920849452, + "loss": 0.1553, + "mean_token_accuracy": 0.9502989798784256, + "num_tokens": 809692.0, + "step": 282 + }, + { + "entropy": 0.1701544038951397, + "epoch": 0.3463892288861689, + "grad_norm": 0.12485537678003311, + "learning_rate": 0.00015714656831156975, + "loss": 0.1563, + "mean_token_accuracy": 0.9486553072929382, + "num_tokens": 812598.0, + "step": 283 + }, + { + "entropy": 0.16421806812286377, + "epoch": 0.34761321909424725, + "grad_norm": 0.12782849371433258, + "learning_rate": 0.00015684210271757652, + "loss": 0.1499, + "mean_token_accuracy": 0.9465761780738831, + "num_tokens": 815366.0, + "step": 284 + }, + { + "entropy": 0.15485062822699547, + "epoch": 0.3488372093023256, + "grad_norm": 0.13289634883403778, + "learning_rate": 0.00015653690009353378, + "loss": 0.1532, + "mean_token_accuracy": 0.9480918049812317, + "num_tokens": 818412.0, + "step": 285 + }, + { + "entropy": 0.16174999251961708, + "epoch": 0.35006119951040393, + "grad_norm": 0.16009384393692017, + "learning_rate": 0.00015623096524160657, + "loss": 0.1494, + "mean_token_accuracy": 0.9480710923671722, + "num_tokens": 821196.0, + "step": 286 + }, + { + "entropy": 0.1417551077902317, + "epoch": 0.35128518971848227, + "grad_norm": 0.1461164653301239, + "learning_rate": 0.000155924302975481, + "loss": 0.1447, + "mean_token_accuracy": 0.9538660943508148, + "num_tokens": 824152.0, + "step": 287 + }, + { + "entropy": 0.16920017078518867, + "epoch": 0.3525091799265606, + "grad_norm": 0.23985892534255981, + "learning_rate": 0.0001556169181202885, + "loss": 0.1858, + "mean_token_accuracy": 0.9420565366744995, + "num_tokens": 827060.0, + "step": 288 + }, + { + "entropy": 0.1464613676071167, + "epoch": 0.3537331701346389, + "grad_norm": 0.11802126467227936, + "learning_rate": 0.00015530881551253003, + "loss": 0.1419, + "mean_token_accuracy": 0.9536632001399994, + "num_tokens": 829964.0, + "step": 289 + }, + { + "entropy": 0.18416478484869003, + "epoch": 0.35495716034271724, + "grad_norm": 0.15151479840278625, + "learning_rate": 0.000155, + "loss": 0.1728, + "mean_token_accuracy": 0.945660188794136, + "num_tokens": 832751.0, + "step": 290 + }, + { + "entropy": 0.14679842442274094, + "epoch": 0.3561811505507956, + "grad_norm": 0.12537342309951782, + "learning_rate": 0.0001546904764417098, + "loss": 0.1392, + "mean_token_accuracy": 0.9564869701862335, + "num_tokens": 835440.0, + "step": 291 + }, + { + "entropy": 0.15377968922257423, + "epoch": 0.3574051407588739, + "grad_norm": 0.15351682901382446, + "learning_rate": 0.00015438024970781153, + "loss": 0.154, + "mean_token_accuracy": 0.94918292760849, + "num_tokens": 838345.0, + "step": 292 + }, + { + "entropy": 0.14738023281097412, + "epoch": 0.35862913096695226, + "grad_norm": 0.14002980291843414, + "learning_rate": 0.00015406932467952123, + "loss": 0.147, + "mean_token_accuracy": 0.9494003355503082, + "num_tokens": 841128.0, + "step": 293 + }, + { + "entropy": 0.15242433547973633, + "epoch": 0.3598531211750306, + "grad_norm": 0.1466827392578125, + "learning_rate": 0.00015375770624904218, + "loss": 0.1542, + "mean_token_accuracy": 0.9480858147144318, + "num_tokens": 844077.0, + "step": 294 + }, + { + "entropy": 0.15137697756290436, + "epoch": 0.36107711138310894, + "grad_norm": 0.11579515039920807, + "learning_rate": 0.00015344539931948795, + "loss": 0.1399, + "mean_token_accuracy": 0.951460674405098, + "num_tokens": 846851.0, + "step": 295 + }, + { + "entropy": 0.1322315465658903, + "epoch": 0.3623011015911873, + "grad_norm": 0.10845408588647842, + "learning_rate": 0.00015313240880480514, + "loss": 0.1294, + "mean_token_accuracy": 0.9559520184993744, + "num_tokens": 849772.0, + "step": 296 + }, + { + "entropy": 0.16083206236362457, + "epoch": 0.3635250917992656, + "grad_norm": 0.16092537343502045, + "learning_rate": 0.00015281873962969607, + "loss": 0.1686, + "mean_token_accuracy": 0.9470204263925552, + "num_tokens": 852848.0, + "step": 297 + }, + { + "entropy": 0.149605181068182, + "epoch": 0.36474908200734396, + "grad_norm": 0.2061191350221634, + "learning_rate": 0.00015250439672954147, + "loss": 0.1714, + "mean_token_accuracy": 0.9444099068641663, + "num_tokens": 855715.0, + "step": 298 + }, + { + "entropy": 0.14552443847060204, + "epoch": 0.3659730722154223, + "grad_norm": 0.13903836905956268, + "learning_rate": 0.00015218938505032255, + "loss": 0.1364, + "mean_token_accuracy": 0.9531468152999878, + "num_tokens": 858683.0, + "step": 299 + }, + { + "entropy": 0.1348921526223421, + "epoch": 0.3671970624235006, + "grad_norm": 0.1601039171218872, + "learning_rate": 0.00015187370954854347, + "loss": 0.1401, + "mean_token_accuracy": 0.9545067846775055, + "num_tokens": 861544.0, + "step": 300 + }, + { + "entropy": 0.14761453121900558, + "epoch": 0.3684210526315789, + "grad_norm": 0.1800871193408966, + "learning_rate": 0.00015155737519115307, + "loss": 0.1592, + "mean_token_accuracy": 0.945025846362114, + "num_tokens": 864399.0, + "step": 301 + }, + { + "entropy": 0.14218927919864655, + "epoch": 0.36964504283965727, + "grad_norm": 0.12443792819976807, + "learning_rate": 0.00015124038695546693, + "loss": 0.1414, + "mean_token_accuracy": 0.9529111981391907, + "num_tokens": 867281.0, + "step": 302 + }, + { + "entropy": 0.15534481592476368, + "epoch": 0.3708690330477356, + "grad_norm": 0.18965572118759155, + "learning_rate": 0.000150922749829089, + "loss": 0.1588, + "mean_token_accuracy": 0.945900559425354, + "num_tokens": 869987.0, + "step": 303 + }, + { + "entropy": 0.156289491802454, + "epoch": 0.37209302325581395, + "grad_norm": 0.15650850534439087, + "learning_rate": 0.0001506044688098331, + "loss": 0.1573, + "mean_token_accuracy": 0.94992895424366, + "num_tokens": 872839.0, + "step": 304 + }, + { + "entropy": 0.1312422826886177, + "epoch": 0.3733170134638923, + "grad_norm": 0.15822833776474, + "learning_rate": 0.0001502855489056441, + "loss": 0.1308, + "mean_token_accuracy": 0.9549101144075394, + "num_tokens": 875626.0, + "step": 305 + }, + { + "entropy": 0.13904886320233345, + "epoch": 0.37454100367197063, + "grad_norm": 0.15255236625671387, + "learning_rate": 0.0001499659951345197, + "loss": 0.1494, + "mean_token_accuracy": 0.9524371922016144, + "num_tokens": 878478.0, + "step": 306 + }, + { + "entropy": 0.1630525141954422, + "epoch": 0.37576499388004897, + "grad_norm": 0.16016660630702972, + "learning_rate": 0.00014964581252443067, + "loss": 0.1647, + "mean_token_accuracy": 0.9454061985015869, + "num_tokens": 881325.0, + "step": 307 + }, + { + "entropy": 0.14576981589198112, + "epoch": 0.3769889840881273, + "grad_norm": 0.16387522220611572, + "learning_rate": 0.00014932500611324236, + "loss": 0.1483, + "mean_token_accuracy": 0.9511110782623291, + "num_tokens": 884323.0, + "step": 308 + }, + { + "entropy": 0.14862747862935066, + "epoch": 0.37821297429620565, + "grad_norm": 0.11072476953268051, + "learning_rate": 0.00014900358094863526, + "loss": 0.1333, + "mean_token_accuracy": 0.9556140899658203, + "num_tokens": 887286.0, + "step": 309 + }, + { + "entropy": 0.134588822722435, + "epoch": 0.379436964504284, + "grad_norm": 0.13213595747947693, + "learning_rate": 0.00014868154208802546, + "loss": 0.1381, + "mean_token_accuracy": 0.9509967118501663, + "num_tokens": 890136.0, + "step": 310 + }, + { + "entropy": 0.1505202241241932, + "epoch": 0.3806609547123623, + "grad_norm": 0.1447887420654297, + "learning_rate": 0.00014835889459848515, + "loss": 0.1484, + "mean_token_accuracy": 0.9513203948736191, + "num_tokens": 892884.0, + "step": 311 + }, + { + "entropy": 0.15479392185807228, + "epoch": 0.3818849449204406, + "grad_norm": 0.14951001107692719, + "learning_rate": 0.00014803564355666296, + "loss": 0.1632, + "mean_token_accuracy": 0.9497022181749344, + "num_tokens": 895729.0, + "step": 312 + }, + { + "entropy": 0.15362760052084923, + "epoch": 0.38310893512851896, + "grad_norm": 0.13406336307525635, + "learning_rate": 0.00014771179404870398, + "loss": 0.1539, + "mean_token_accuracy": 0.9479725211858749, + "num_tokens": 898747.0, + "step": 313 + }, + { + "entropy": 0.16546957194805145, + "epoch": 0.3843329253365973, + "grad_norm": 0.15698297321796417, + "learning_rate": 0.00014738735117016978, + "loss": 0.1707, + "mean_token_accuracy": 0.9463748782873154, + "num_tokens": 901655.0, + "step": 314 + }, + { + "entropy": 0.13706695102155209, + "epoch": 0.38555691554467564, + "grad_norm": 0.09877198189496994, + "learning_rate": 0.00014706232002595828, + "loss": 0.1237, + "mean_token_accuracy": 0.9570258110761642, + "num_tokens": 904462.0, + "step": 315 + }, + { + "entropy": 0.14380010589957237, + "epoch": 0.386780905752754, + "grad_norm": 0.13186849653720856, + "learning_rate": 0.0001467367057302233, + "loss": 0.1424, + "mean_token_accuracy": 0.9487275928258896, + "num_tokens": 907272.0, + "step": 316 + }, + { + "entropy": 0.1601439192891121, + "epoch": 0.3880048959608323, + "grad_norm": 0.13778473436832428, + "learning_rate": 0.00014641051340629418, + "loss": 0.1523, + "mean_token_accuracy": 0.945304661989212, + "num_tokens": 910146.0, + "step": 317 + }, + { + "entropy": 0.15473321080207825, + "epoch": 0.38922888616891066, + "grad_norm": 0.12191618233919144, + "learning_rate": 0.00014608374818659526, + "loss": 0.1414, + "mean_token_accuracy": 0.9572496116161346, + "num_tokens": 912724.0, + "step": 318 + }, + { + "entropy": 0.1558196023106575, + "epoch": 0.390452876376989, + "grad_norm": 0.1394515037536621, + "learning_rate": 0.00014575641521256486, + "loss": 0.1583, + "mean_token_accuracy": 0.9498332440853119, + "num_tokens": 915752.0, + "step": 319 + }, + { + "entropy": 0.15329847112298012, + "epoch": 0.39167686658506734, + "grad_norm": 0.13842739164829254, + "learning_rate": 0.00014542851963457473, + "loss": 0.1401, + "mean_token_accuracy": 0.9502763897180557, + "num_tokens": 918643.0, + "step": 320 + }, + { + "entropy": 0.15282956510782242, + "epoch": 0.3929008567931457, + "grad_norm": 0.12368538230657578, + "learning_rate": 0.00014510006661184866, + "loss": 0.1413, + "mean_token_accuracy": 0.9517961591482162, + "num_tokens": 921586.0, + "step": 321 + }, + { + "entropy": 0.1521201990544796, + "epoch": 0.39412484700122397, + "grad_norm": 0.1309376358985901, + "learning_rate": 0.00014477106131238155, + "loss": 0.1495, + "mean_token_accuracy": 0.9491784572601318, + "num_tokens": 924447.0, + "step": 322 + }, + { + "entropy": 0.14049221947789192, + "epoch": 0.3953488372093023, + "grad_norm": 0.10751700401306152, + "learning_rate": 0.00014444150891285807, + "loss": 0.1299, + "mean_token_accuracy": 0.9556889832019806, + "num_tokens": 927397.0, + "step": 323 + }, + { + "entropy": 0.13667663745582104, + "epoch": 0.39657282741738065, + "grad_norm": 0.11559297889471054, + "learning_rate": 0.00014411141459857104, + "loss": 0.1367, + "mean_token_accuracy": 0.951929360628128, + "num_tokens": 930216.0, + "step": 324 + }, + { + "entropy": 0.15966885164380074, + "epoch": 0.397796817625459, + "grad_norm": 0.1926850527524948, + "learning_rate": 0.00014378078356334, + "loss": 0.1699, + "mean_token_accuracy": 0.9389549642801285, + "num_tokens": 933097.0, + "step": 325 + }, + { + "entropy": 0.149288147687912, + "epoch": 0.3990208078335373, + "grad_norm": 0.1551034152507782, + "learning_rate": 0.00014344962100942946, + "loss": 0.1657, + "mean_token_accuracy": 0.9461280554533005, + "num_tokens": 935986.0, + "step": 326 + }, + { + "entropy": 0.12885337509214878, + "epoch": 0.40024479804161567, + "grad_norm": 0.14290180802345276, + "learning_rate": 0.00014311793214746708, + "loss": 0.1431, + "mean_token_accuracy": 0.9530783593654633, + "num_tokens": 938992.0, + "step": 327 + }, + { + "entropy": 0.15230268985033035, + "epoch": 0.401468788249694, + "grad_norm": 0.11795686185359955, + "learning_rate": 0.00014278572219636148, + "loss": 0.1446, + "mean_token_accuracy": 0.9465717673301697, + "num_tokens": 941936.0, + "step": 328 + }, + { + "entropy": 0.1401187777519226, + "epoch": 0.40269277845777235, + "grad_norm": 0.11777287721633911, + "learning_rate": 0.0001424529963832205, + "loss": 0.1409, + "mean_token_accuracy": 0.9511523544788361, + "num_tokens": 944795.0, + "step": 329 + }, + { + "entropy": 0.14652667939662933, + "epoch": 0.4039167686658507, + "grad_norm": 0.1276416778564453, + "learning_rate": 0.00014211975994326848, + "loss": 0.1389, + "mean_token_accuracy": 0.9489726424217224, + "num_tokens": 947810.0, + "step": 330 + }, + { + "entropy": 0.15301398932933807, + "epoch": 0.40514075887392903, + "grad_norm": 0.15384003520011902, + "learning_rate": 0.00014178601811976434, + "loss": 0.146, + "mean_token_accuracy": 0.9490158259868622, + "num_tokens": 950662.0, + "step": 331 + }, + { + "entropy": 0.14242121577262878, + "epoch": 0.40636474908200737, + "grad_norm": 0.152218297123909, + "learning_rate": 0.00014145177616391887, + "loss": 0.1405, + "mean_token_accuracy": 0.9497435092926025, + "num_tokens": 953563.0, + "step": 332 + }, + { + "entropy": 0.14127517119050026, + "epoch": 0.40758873929008566, + "grad_norm": 0.11781765520572662, + "learning_rate": 0.00014111703933481198, + "loss": 0.1382, + "mean_token_accuracy": 0.9530066698789597, + "num_tokens": 956433.0, + "step": 333 + }, + { + "entropy": 0.1441398561000824, + "epoch": 0.408812729498164, + "grad_norm": 0.15119579434394836, + "learning_rate": 0.0001407818128993102, + "loss": 0.136, + "mean_token_accuracy": 0.9519618451595306, + "num_tokens": 959277.0, + "step": 334 + }, + { + "entropy": 0.13222733698785305, + "epoch": 0.41003671970624234, + "grad_norm": 0.119105763733387, + "learning_rate": 0.0001404461021319837, + "loss": 0.1382, + "mean_token_accuracy": 0.9542486071586609, + "num_tokens": 962041.0, + "step": 335 + }, + { + "entropy": 0.14200424402952194, + "epoch": 0.4112607099143207, + "grad_norm": 0.14100365340709686, + "learning_rate": 0.00014010991231502317, + "loss": 0.1437, + "mean_token_accuracy": 0.949680283665657, + "num_tokens": 964886.0, + "step": 336 + }, + { + "entropy": 0.133393544703722, + "epoch": 0.412484700122399, + "grad_norm": 0.1519688367843628, + "learning_rate": 0.0001397732487381571, + "loss": 0.1356, + "mean_token_accuracy": 0.9563153833150864, + "num_tokens": 967649.0, + "step": 337 + }, + { + "entropy": 0.12752126902341843, + "epoch": 0.41370869033047736, + "grad_norm": 0.14037086069583893, + "learning_rate": 0.000139436116698568, + "loss": 0.1485, + "mean_token_accuracy": 0.9543791562318802, + "num_tokens": 970507.0, + "step": 338 + }, + { + "entropy": 0.13891296088695526, + "epoch": 0.4149326805385557, + "grad_norm": 0.1560559719800949, + "learning_rate": 0.0001390985215008094, + "loss": 0.1397, + "mean_token_accuracy": 0.9537011981010437, + "num_tokens": 973306.0, + "step": 339 + }, + { + "entropy": 0.1506254430860281, + "epoch": 0.41615667074663404, + "grad_norm": 0.13996510207653046, + "learning_rate": 0.0001387604684567225, + "loss": 0.1402, + "mean_token_accuracy": 0.9540493190288544, + "num_tokens": 976318.0, + "step": 340 + }, + { + "entropy": 0.13520655781030655, + "epoch": 0.4173806609547124, + "grad_norm": 0.13306720554828644, + "learning_rate": 0.00013842196288535224, + "loss": 0.1419, + "mean_token_accuracy": 0.9543610662221909, + "num_tokens": 979291.0, + "step": 341 + }, + { + "entropy": 0.14067644998431206, + "epoch": 0.4186046511627907, + "grad_norm": 0.1677086055278778, + "learning_rate": 0.00013808301011286383, + "loss": 0.1578, + "mean_token_accuracy": 0.9484180212020874, + "num_tokens": 982128.0, + "step": 342 + }, + { + "entropy": 0.14619873464107513, + "epoch": 0.41982864137086906, + "grad_norm": 0.13375866413116455, + "learning_rate": 0.00013774361547245893, + "loss": 0.1443, + "mean_token_accuracy": 0.9509466886520386, + "num_tokens": 984969.0, + "step": 343 + }, + { + "entropy": 0.15119155496358871, + "epoch": 0.42105263157894735, + "grad_norm": 0.14939425885677338, + "learning_rate": 0.00013740378430429165, + "loss": 0.1458, + "mean_token_accuracy": 0.9547770172357559, + "num_tokens": 987747.0, + "step": 344 + }, + { + "entropy": 0.15035705268383026, + "epoch": 0.4222766217870257, + "grad_norm": 0.14231523871421814, + "learning_rate": 0.00013706352195538458, + "loss": 0.1516, + "mean_token_accuracy": 0.9481893628835678, + "num_tokens": 990610.0, + "step": 345 + }, + { + "entropy": 0.13821907714009285, + "epoch": 0.423500611995104, + "grad_norm": 0.10501782596111298, + "learning_rate": 0.00013672283377954477, + "loss": 0.122, + "mean_token_accuracy": 0.9573890268802643, + "num_tokens": 993370.0, + "step": 346 + }, + { + "entropy": 0.14097687229514122, + "epoch": 0.42472460220318237, + "grad_norm": 0.15070122480392456, + "learning_rate": 0.0001363817251372792, + "loss": 0.1395, + "mean_token_accuracy": 0.9575945734977722, + "num_tokens": 996321.0, + "step": 347 + }, + { + "entropy": 0.143765177577734, + "epoch": 0.4259485924112607, + "grad_norm": 0.1590331643819809, + "learning_rate": 0.00013604020139571077, + "loss": 0.1481, + "mean_token_accuracy": 0.9514146149158478, + "num_tokens": 999124.0, + "step": 348 + }, + { + "entropy": 0.13779877312481403, + "epoch": 0.42717258261933905, + "grad_norm": 0.20401647686958313, + "learning_rate": 0.00013569826792849361, + "loss": 0.1604, + "mean_token_accuracy": 0.9505531638860703, + "num_tokens": 1001930.0, + "step": 349 + }, + { + "entropy": 0.14049612544476986, + "epoch": 0.4283965728274174, + "grad_norm": 0.1355866640806198, + "learning_rate": 0.0001353559301157287, + "loss": 0.1425, + "mean_token_accuracy": 0.9528732299804688, + "num_tokens": 1004783.0, + "step": 350 + }, + { + "entropy": 0.1371607892215252, + "epoch": 0.42962056303549573, + "grad_norm": 0.13285967707633972, + "learning_rate": 0.000135013193343879, + "loss": 0.1401, + "mean_token_accuracy": 0.9513075202703476, + "num_tokens": 1007693.0, + "step": 351 + }, + { + "entropy": 0.13895654678344727, + "epoch": 0.43084455324357407, + "grad_norm": 0.14023812115192413, + "learning_rate": 0.00013467006300568502, + "loss": 0.1416, + "mean_token_accuracy": 0.954132542014122, + "num_tokens": 1010519.0, + "step": 352 + }, + { + "entropy": 0.14657279290258884, + "epoch": 0.4320685434516524, + "grad_norm": 0.16405871510505676, + "learning_rate": 0.00013432654450007956, + "loss": 0.1467, + "mean_token_accuracy": 0.9481655806303024, + "num_tokens": 1013202.0, + "step": 353 + }, + { + "entropy": 0.14048417285084724, + "epoch": 0.43329253365973075, + "grad_norm": 0.1337006688117981, + "learning_rate": 0.00013398264323210315, + "loss": 0.1308, + "mean_token_accuracy": 0.9549468904733658, + "num_tokens": 1016029.0, + "step": 354 + }, + { + "entropy": 0.1600259207189083, + "epoch": 0.43451652386780903, + "grad_norm": 0.18126504123210907, + "learning_rate": 0.0001336383646128188, + "loss": 0.162, + "mean_token_accuracy": 0.9478375762701035, + "num_tokens": 1019046.0, + "step": 355 + }, + { + "entropy": 0.16760462522506714, + "epoch": 0.4357405140758874, + "grad_norm": 0.14033575356006622, + "learning_rate": 0.00013329371405922688, + "loss": 0.1569, + "mean_token_accuracy": 0.9465935677289963, + "num_tokens": 1021986.0, + "step": 356 + }, + { + "entropy": 0.1519397497177124, + "epoch": 0.4369645042839657, + "grad_norm": 0.11691096425056458, + "learning_rate": 0.00013294869699417988, + "loss": 0.1309, + "mean_token_accuracy": 0.9530302435159683, + "num_tokens": 1024758.0, + "step": 357 + }, + { + "entropy": 0.15269140526652336, + "epoch": 0.43818849449204406, + "grad_norm": 0.12390341609716415, + "learning_rate": 0.00013260331884629712, + "loss": 0.1358, + "mean_token_accuracy": 0.9525894522666931, + "num_tokens": 1027662.0, + "step": 358 + }, + { + "entropy": 0.14871510863304138, + "epoch": 0.4394124847001224, + "grad_norm": 0.10898353159427643, + "learning_rate": 0.00013225758504987938, + "loss": 0.1432, + "mean_token_accuracy": 0.9485438913106918, + "num_tokens": 1030516.0, + "step": 359 + }, + { + "entropy": 0.14364299923181534, + "epoch": 0.44063647490820074, + "grad_norm": 0.10840337723493576, + "learning_rate": 0.0001319115010448233, + "loss": 0.1348, + "mean_token_accuracy": 0.9544093608856201, + "num_tokens": 1033332.0, + "step": 360 + }, + { + "entropy": 0.15100236237049103, + "epoch": 0.4418604651162791, + "grad_norm": 0.14509782195091248, + "learning_rate": 0.0001315650722765358, + "loss": 0.1512, + "mean_token_accuracy": 0.9526737630367279, + "num_tokens": 1036286.0, + "step": 361 + }, + { + "entropy": 0.11828767322003841, + "epoch": 0.4430844553243574, + "grad_norm": 0.08955942094326019, + "learning_rate": 0.00013121830419584847, + "loss": 0.1184, + "mean_token_accuracy": 0.9560969471931458, + "num_tokens": 1039039.0, + "step": 362 + }, + { + "entropy": 0.1256970874965191, + "epoch": 0.44430844553243576, + "grad_norm": 0.1434059888124466, + "learning_rate": 0.00013087120225893168, + "loss": 0.1409, + "mean_token_accuracy": 0.9529259949922562, + "num_tokens": 1041845.0, + "step": 363 + }, + { + "entropy": 0.13888518884778023, + "epoch": 0.4455324357405141, + "grad_norm": 0.13521118462085724, + "learning_rate": 0.00013052377192720896, + "loss": 0.1498, + "mean_token_accuracy": 0.94832943379879, + "num_tokens": 1044843.0, + "step": 364 + }, + { + "entropy": 0.13128922879695892, + "epoch": 0.4467564259485924, + "grad_norm": 0.14544379711151123, + "learning_rate": 0.00013017601866727074, + "loss": 0.1355, + "mean_token_accuracy": 0.9550573527812958, + "num_tokens": 1047823.0, + "step": 365 + }, + { + "entropy": 0.1458285804837942, + "epoch": 0.4479804161566707, + "grad_norm": 0.15188312530517578, + "learning_rate": 0.00012982794795078867, + "loss": 0.1676, + "mean_token_accuracy": 0.9446605145931244, + "num_tokens": 1050764.0, + "step": 366 + }, + { + "entropy": 0.1299880761653185, + "epoch": 0.44920440636474906, + "grad_norm": 0.12781859934329987, + "learning_rate": 0.00012947956525442925, + "loss": 0.1392, + "mean_token_accuracy": 0.9538396149873734, + "num_tokens": 1053811.0, + "step": 367 + }, + { + "entropy": 0.14886001124978065, + "epoch": 0.4504283965728274, + "grad_norm": 0.15223422646522522, + "learning_rate": 0.0001291308760597679, + "loss": 0.1621, + "mean_token_accuracy": 0.9419552832841873, + "num_tokens": 1056695.0, + "step": 368 + }, + { + "entropy": 0.1366724669933319, + "epoch": 0.45165238678090575, + "grad_norm": 0.12155237793922424, + "learning_rate": 0.0001287818858532025, + "loss": 0.1381, + "mean_token_accuracy": 0.955892339348793, + "num_tokens": 1059708.0, + "step": 369 + }, + { + "entropy": 0.1326532531529665, + "epoch": 0.4528763769889841, + "grad_norm": 0.14630113542079926, + "learning_rate": 0.00012843260012586718, + "loss": 0.1434, + "mean_token_accuracy": 0.9520125985145569, + "num_tokens": 1062554.0, + "step": 370 + }, + { + "entropy": 0.15683763474225998, + "epoch": 0.4541003671970624, + "grad_norm": 0.13688820600509644, + "learning_rate": 0.00012808302437354588, + "loss": 0.1534, + "mean_token_accuracy": 0.9499343633651733, + "num_tokens": 1065640.0, + "step": 371 + }, + { + "entropy": 0.14376649633049965, + "epoch": 0.45532435740514077, + "grad_norm": 0.12063167244195938, + "learning_rate": 0.000127733164096586, + "loss": 0.1357, + "mean_token_accuracy": 0.9498518407344818, + "num_tokens": 1068553.0, + "step": 372 + }, + { + "entropy": 0.13551764748990536, + "epoch": 0.4565483476132191, + "grad_norm": 0.10880531370639801, + "learning_rate": 0.00012738302479981152, + "loss": 0.127, + "mean_token_accuracy": 0.9537849426269531, + "num_tokens": 1071400.0, + "step": 373 + }, + { + "entropy": 0.1419544517993927, + "epoch": 0.45777233782129745, + "grad_norm": 0.13044477999210358, + "learning_rate": 0.0001270326119924369, + "loss": 0.1502, + "mean_token_accuracy": 0.9481295794248581, + "num_tokens": 1074135.0, + "step": 374 + }, + { + "entropy": 0.13699105754494667, + "epoch": 0.4589963280293758, + "grad_norm": 0.10245759785175323, + "learning_rate": 0.00012668193118797998, + "loss": 0.1265, + "mean_token_accuracy": 0.9543553292751312, + "num_tokens": 1076979.0, + "step": 375 + }, + { + "entropy": 0.15189310163259506, + "epoch": 0.4602203182374541, + "grad_norm": 0.10509387403726578, + "learning_rate": 0.00012633098790417523, + "loss": 0.1396, + "mean_token_accuracy": 0.950060248374939, + "num_tokens": 1079891.0, + "step": 376 + }, + { + "entropy": 0.13161081075668335, + "epoch": 0.4614443084455324, + "grad_norm": 0.1440083086490631, + "learning_rate": 0.00012597978766288733, + "loss": 0.1276, + "mean_token_accuracy": 0.9539390951395035, + "num_tokens": 1082838.0, + "step": 377 + }, + { + "entropy": 0.14994242414832115, + "epoch": 0.46266829865361075, + "grad_norm": 0.12164657562971115, + "learning_rate": 0.00012562833599002375, + "loss": 0.1484, + "mean_token_accuracy": 0.9505013078451157, + "num_tokens": 1085712.0, + "step": 378 + }, + { + "entropy": 0.14206018671393394, + "epoch": 0.4638922888616891, + "grad_norm": 0.13283054530620575, + "learning_rate": 0.00012527663841544828, + "loss": 0.1476, + "mean_token_accuracy": 0.9502213299274445, + "num_tokens": 1088632.0, + "step": 379 + }, + { + "entropy": 0.14087211713194847, + "epoch": 0.46511627906976744, + "grad_norm": 0.11404265463352203, + "learning_rate": 0.0001249247004728937, + "loss": 0.132, + "mean_token_accuracy": 0.9549631029367447, + "num_tokens": 1091323.0, + "step": 380 + }, + { + "entropy": 0.1236678697168827, + "epoch": 0.4663402692778458, + "grad_norm": 0.10191497951745987, + "learning_rate": 0.00012457252769987483, + "loss": 0.1258, + "mean_token_accuracy": 0.9562105983495712, + "num_tokens": 1094195.0, + "step": 381 + }, + { + "entropy": 0.13154175132513046, + "epoch": 0.4675642594859241, + "grad_norm": 0.14269708096981049, + "learning_rate": 0.0001242201256376015, + "loss": 0.1294, + "mean_token_accuracy": 0.9532541036605835, + "num_tokens": 1096971.0, + "step": 382 + }, + { + "entropy": 0.12461301125586033, + "epoch": 0.46878824969400246, + "grad_norm": 0.11677689850330353, + "learning_rate": 0.0001238674998308911, + "loss": 0.1321, + "mean_token_accuracy": 0.9538192600011826, + "num_tokens": 1099892.0, + "step": 383 + }, + { + "entropy": 0.1387389786541462, + "epoch": 0.4700122399020808, + "grad_norm": 0.12390587478876114, + "learning_rate": 0.00012351465582808164, + "loss": 0.1376, + "mean_token_accuracy": 0.9543635100126266, + "num_tokens": 1102651.0, + "step": 384 + }, + { + "entropy": 0.1256554275751114, + "epoch": 0.47123623011015914, + "grad_norm": 0.11699675768613815, + "learning_rate": 0.00012316159918094417, + "loss": 0.14, + "mean_token_accuracy": 0.9536283612251282, + "num_tokens": 1105440.0, + "step": 385 + }, + { + "entropy": 0.13146117515861988, + "epoch": 0.4724602203182375, + "grad_norm": 0.1133599728345871, + "learning_rate": 0.00012280833544459568, + "loss": 0.1305, + "mean_token_accuracy": 0.9555511027574539, + "num_tokens": 1108212.0, + "step": 386 + }, + { + "entropy": 0.1284522730857134, + "epoch": 0.47368421052631576, + "grad_norm": 0.10900208353996277, + "learning_rate": 0.00012245487017741147, + "loss": 0.1248, + "mean_token_accuracy": 0.9536370187997818, + "num_tokens": 1111013.0, + "step": 387 + }, + { + "entropy": 0.14456385374069214, + "epoch": 0.4749082007343941, + "grad_norm": 0.14733007550239563, + "learning_rate": 0.00012210120894093788, + "loss": 0.1608, + "mean_token_accuracy": 0.9453224092721939, + "num_tokens": 1113889.0, + "step": 388 + }, + { + "entropy": 0.13321279920637608, + "epoch": 0.47613219094247244, + "grad_norm": 0.14787733554840088, + "learning_rate": 0.00012174735729980466, + "loss": 0.1279, + "mean_token_accuracy": 0.9532034695148468, + "num_tokens": 1116968.0, + "step": 389 + }, + { + "entropy": 0.13903686590492725, + "epoch": 0.4773561811505508, + "grad_norm": 0.10082501918077469, + "learning_rate": 0.00012139332082163744, + "loss": 0.1258, + "mean_token_accuracy": 0.9499611258506775, + "num_tokens": 1119802.0, + "step": 390 + }, + { + "entropy": 0.14177364856004715, + "epoch": 0.4785801713586291, + "grad_norm": 0.12191358208656311, + "learning_rate": 0.0001210391050769702, + "loss": 0.134, + "mean_token_accuracy": 0.9570882767438889, + "num_tokens": 1122800.0, + "step": 391 + }, + { + "entropy": 0.14397892355918884, + "epoch": 0.47980416156670747, + "grad_norm": 0.12011052668094635, + "learning_rate": 0.00012068471563915747, + "loss": 0.1345, + "mean_token_accuracy": 0.9518181681632996, + "num_tokens": 1125486.0, + "step": 392 + }, + { + "entropy": 0.14021234214305878, + "epoch": 0.4810281517747858, + "grad_norm": 0.1147322952747345, + "learning_rate": 0.00012033015808428683, + "loss": 0.1346, + "mean_token_accuracy": 0.9503093957901001, + "num_tokens": 1128217.0, + "step": 393 + }, + { + "entropy": 0.13452493213117123, + "epoch": 0.48225214198286415, + "grad_norm": 0.1172524094581604, + "learning_rate": 0.00011997543799109102, + "loss": 0.1379, + "mean_token_accuracy": 0.9532226026058197, + "num_tokens": 1131037.0, + "step": 394 + }, + { + "entropy": 0.13081609271466732, + "epoch": 0.4834761321909425, + "grad_norm": 0.11866399645805359, + "learning_rate": 0.00011962056094086018, + "loss": 0.1319, + "mean_token_accuracy": 0.9528060108423233, + "num_tokens": 1133798.0, + "step": 395 + }, + { + "entropy": 0.1321812979876995, + "epoch": 0.4847001223990208, + "grad_norm": 0.12555596232414246, + "learning_rate": 0.00011926553251735412, + "loss": 0.1471, + "mean_token_accuracy": 0.9510558545589447, + "num_tokens": 1136804.0, + "step": 396 + }, + { + "entropy": 0.12046686001121998, + "epoch": 0.48592411260709917, + "grad_norm": 0.10109179466962814, + "learning_rate": 0.0001189103583067144, + "loss": 0.1235, + "mean_token_accuracy": 0.9547515362501144, + "num_tokens": 1139693.0, + "step": 397 + }, + { + "entropy": 0.1354110036045313, + "epoch": 0.48714810281517745, + "grad_norm": 0.12665265798568726, + "learning_rate": 0.00011855504389737644, + "loss": 0.1404, + "mean_token_accuracy": 0.9554544538259506, + "num_tokens": 1142552.0, + "step": 398 + }, + { + "entropy": 0.1443750038743019, + "epoch": 0.4883720930232558, + "grad_norm": 0.1255348026752472, + "learning_rate": 0.00011819959487998162, + "loss": 0.1398, + "mean_token_accuracy": 0.9517905861139297, + "num_tokens": 1145470.0, + "step": 399 + }, + { + "entropy": 0.14072296395897865, + "epoch": 0.48959608323133413, + "grad_norm": 0.161954864859581, + "learning_rate": 0.00011784401684728925, + "loss": 0.1452, + "mean_token_accuracy": 0.9535402804613113, + "num_tokens": 1148391.0, + "step": 400 + }, + { + "entropy": 0.11881877668201923, + "epoch": 0.4908200734394125, + "grad_norm": 0.10032571107149124, + "learning_rate": 0.00011748831539408862, + "loss": 0.1202, + "mean_token_accuracy": 0.9560712575912476, + "num_tokens": 1151278.0, + "step": 401 + }, + { + "entropy": 0.13250385597348213, + "epoch": 0.4920440636474908, + "grad_norm": 0.1807660609483719, + "learning_rate": 0.00011713249611711096, + "loss": 0.1226, + "mean_token_accuracy": 0.9529293328523636, + "num_tokens": 1154128.0, + "step": 402 + }, + { + "entropy": 0.13556111231446266, + "epoch": 0.49326805385556916, + "grad_norm": 0.10021672397851944, + "learning_rate": 0.00011677656461494144, + "loss": 0.1184, + "mean_token_accuracy": 0.9557460844516754, + "num_tokens": 1156945.0, + "step": 403 + }, + { + "entropy": 0.13336747325956821, + "epoch": 0.4944920440636475, + "grad_norm": 0.16353298723697662, + "learning_rate": 0.00011642052648793092, + "loss": 0.1352, + "mean_token_accuracy": 0.948143258690834, + "num_tokens": 1159797.0, + "step": 404 + }, + { + "entropy": 0.14198456145823002, + "epoch": 0.49571603427172584, + "grad_norm": 0.20091497898101807, + "learning_rate": 0.000116064387338108, + "loss": 0.1618, + "mean_token_accuracy": 0.9499198794364929, + "num_tokens": 1162521.0, + "step": 405 + }, + { + "entropy": 0.1324360091239214, + "epoch": 0.4969400244798042, + "grad_norm": 0.11334196478128433, + "learning_rate": 0.00011570815276909082, + "loss": 0.1316, + "mean_token_accuracy": 0.954832911491394, + "num_tokens": 1165438.0, + "step": 406 + }, + { + "entropy": 0.1498626135289669, + "epoch": 0.4981640146878825, + "grad_norm": 0.15647770464420319, + "learning_rate": 0.00011535182838599881, + "loss": 0.1491, + "mean_token_accuracy": 0.9514666050672531, + "num_tokens": 1168324.0, + "step": 407 + }, + { + "entropy": 0.12485288083553314, + "epoch": 0.49938800489596086, + "grad_norm": 0.11484766751527786, + "learning_rate": 0.00011499541979536463, + "loss": 0.1303, + "mean_token_accuracy": 0.9557812511920929, + "num_tokens": 1171217.0, + "step": 408 + }, + { + "entropy": 0.13768774271011353, + "epoch": 0.5006119951040392, + "grad_norm": 0.13695263862609863, + "learning_rate": 0.00011463893260504588, + "loss": 0.1399, + "mean_token_accuracy": 0.9533586949110031, + "num_tokens": 1174093.0, + "step": 409 + }, + { + "entropy": 0.140621118247509, + "epoch": 0.5018359853121175, + "grad_norm": 0.12973180413246155, + "learning_rate": 0.00011428237242413685, + "loss": 0.1479, + "mean_token_accuracy": 0.9566087573766708, + "num_tokens": 1176928.0, + "step": 410 + }, + { + "entropy": 0.12642132490873337, + "epoch": 0.5030599755201959, + "grad_norm": 0.12001452594995499, + "learning_rate": 0.00011392574486288026, + "loss": 0.1323, + "mean_token_accuracy": 0.9554480165243149, + "num_tokens": 1179838.0, + "step": 411 + }, + { + "entropy": 0.14914321154356003, + "epoch": 0.5042839657282742, + "grad_norm": 0.13202784955501556, + "learning_rate": 0.00011356905553257913, + "loss": 0.1528, + "mean_token_accuracy": 0.9465901255607605, + "num_tokens": 1182691.0, + "step": 412 + }, + { + "entropy": 0.13755455240607262, + "epoch": 0.5055079559363526, + "grad_norm": 0.09685217589139938, + "learning_rate": 0.00011321231004550826, + "loss": 0.1284, + "mean_token_accuracy": 0.9547086507081985, + "num_tokens": 1185544.0, + "step": 413 + }, + { + "entropy": 0.13209095411002636, + "epoch": 0.5067319461444308, + "grad_norm": 0.15801644325256348, + "learning_rate": 0.00011285551401482611, + "loss": 0.1368, + "mean_token_accuracy": 0.9538587778806686, + "num_tokens": 1188394.0, + "step": 414 + }, + { + "entropy": 0.15032725408673286, + "epoch": 0.5079559363525091, + "grad_norm": 0.11453884094953537, + "learning_rate": 0.00011249867305448638, + "loss": 0.1436, + "mean_token_accuracy": 0.9490989446640015, + "num_tokens": 1191293.0, + "step": 415 + }, + { + "entropy": 0.15779422968626022, + "epoch": 0.5091799265605875, + "grad_norm": 0.1386803239583969, + "learning_rate": 0.00011214179277914967, + "loss": 0.1454, + "mean_token_accuracy": 0.952106773853302, + "num_tokens": 1194097.0, + "step": 416 + }, + { + "entropy": 0.13475570268929005, + "epoch": 0.5104039167686658, + "grad_norm": 0.119731105864048, + "learning_rate": 0.00011178487880409532, + "loss": 0.1327, + "mean_token_accuracy": 0.9547446072101593, + "num_tokens": 1196968.0, + "step": 417 + }, + { + "entropy": 0.1360163651406765, + "epoch": 0.5116279069767442, + "grad_norm": 0.10502903908491135, + "learning_rate": 0.00011142793674513272, + "loss": 0.1287, + "mean_token_accuracy": 0.9539955258369446, + "num_tokens": 1199774.0, + "step": 418 + }, + { + "entropy": 0.12870127893984318, + "epoch": 0.5128518971848225, + "grad_norm": 0.11175281554460526, + "learning_rate": 0.0001110709722185133, + "loss": 0.1312, + "mean_token_accuracy": 0.9530057311058044, + "num_tokens": 1202514.0, + "step": 419 + }, + { + "entropy": 0.13849590718746185, + "epoch": 0.5140758873929009, + "grad_norm": 0.14228886365890503, + "learning_rate": 0.00011071399084084189, + "loss": 0.1313, + "mean_token_accuracy": 0.9532709866762161, + "num_tokens": 1205387.0, + "step": 420 + }, + { + "entropy": 0.15055324137210846, + "epoch": 0.5152998776009792, + "grad_norm": 0.12884847819805145, + "learning_rate": 0.00011035699822898851, + "loss": 0.1494, + "mean_token_accuracy": 0.9454932361841202, + "num_tokens": 1208286.0, + "step": 421 + }, + { + "entropy": 0.1303107663989067, + "epoch": 0.5165238678090576, + "grad_norm": 0.13567277789115906, + "learning_rate": 0.00011000000000000002, + "loss": 0.1358, + "mean_token_accuracy": 0.9512244313955307, + "num_tokens": 1211158.0, + "step": 422 + }, + { + "entropy": 0.1351293083280325, + "epoch": 0.5177478580171359, + "grad_norm": 0.15768244862556458, + "learning_rate": 0.00010964300177101147, + "loss": 0.1406, + "mean_token_accuracy": 0.9545091390609741, + "num_tokens": 1213841.0, + "step": 423 + }, + { + "entropy": 0.1322871558368206, + "epoch": 0.5189718482252142, + "grad_norm": 0.1405155509710312, + "learning_rate": 0.00010928600915915814, + "loss": 0.1431, + "mean_token_accuracy": 0.9523000717163086, + "num_tokens": 1216780.0, + "step": 424 + }, + { + "entropy": 0.13802196085453033, + "epoch": 0.5201958384332925, + "grad_norm": 0.10787899047136307, + "learning_rate": 0.00010892902778148672, + "loss": 0.1259, + "mean_token_accuracy": 0.9590041935443878, + "num_tokens": 1219656.0, + "step": 425 + }, + { + "entropy": 0.14560194313526154, + "epoch": 0.5214198286413708, + "grad_norm": 0.12928056716918945, + "learning_rate": 0.00010857206325486729, + "loss": 0.1407, + "mean_token_accuracy": 0.9513566195964813, + "num_tokens": 1222655.0, + "step": 426 + }, + { + "entropy": 0.13380536437034607, + "epoch": 0.5226438188494492, + "grad_norm": 0.1850769966840744, + "learning_rate": 0.00010821512119590473, + "loss": 0.1465, + "mean_token_accuracy": 0.9528696835041046, + "num_tokens": 1225454.0, + "step": 427 + }, + { + "entropy": 0.13339932076632977, + "epoch": 0.5238678090575275, + "grad_norm": 0.15202596783638, + "learning_rate": 0.00010785820722085034, + "loss": 0.1522, + "mean_token_accuracy": 0.9473523050546646, + "num_tokens": 1228360.0, + "step": 428 + }, + { + "entropy": 0.1458057165145874, + "epoch": 0.5250917992656059, + "grad_norm": 0.11800245940685272, + "learning_rate": 0.00010750132694551364, + "loss": 0.1407, + "mean_token_accuracy": 0.9527064561843872, + "num_tokens": 1231291.0, + "step": 429 + }, + { + "entropy": 0.13939331099390984, + "epoch": 0.5263157894736842, + "grad_norm": 0.09993872791528702, + "learning_rate": 0.00010714448598517393, + "loss": 0.1234, + "mean_token_accuracy": 0.9530506879091263, + "num_tokens": 1234183.0, + "step": 430 + }, + { + "entropy": 0.1464698538184166, + "epoch": 0.5275397796817626, + "grad_norm": 0.11575373262166977, + "learning_rate": 0.00010678768995449178, + "loss": 0.1448, + "mean_token_accuracy": 0.9480034857988358, + "num_tokens": 1237004.0, + "step": 431 + }, + { + "entropy": 0.14314524829387665, + "epoch": 0.5287637698898409, + "grad_norm": 0.12237000465393066, + "learning_rate": 0.0001064309444674209, + "loss": 0.1447, + "mean_token_accuracy": 0.9496198147535324, + "num_tokens": 1239942.0, + "step": 432 + }, + { + "entropy": 0.1364418864250183, + "epoch": 0.5299877600979193, + "grad_norm": 0.10764491558074951, + "learning_rate": 0.00010607425513711977, + "loss": 0.1313, + "mean_token_accuracy": 0.9532888233661652, + "num_tokens": 1242836.0, + "step": 433 + }, + { + "entropy": 0.14419133216142654, + "epoch": 0.5312117503059975, + "grad_norm": 0.1218903660774231, + "learning_rate": 0.00010571762757586322, + "loss": 0.1425, + "mean_token_accuracy": 0.9510030448436737, + "num_tokens": 1245784.0, + "step": 434 + }, + { + "entropy": 0.14930906891822815, + "epoch": 0.5324357405140759, + "grad_norm": 0.1454317718744278, + "learning_rate": 0.00010536106739495414, + "loss": 0.1475, + "mean_token_accuracy": 0.9519994705915451, + "num_tokens": 1248676.0, + "step": 435 + }, + { + "entropy": 0.1356421373784542, + "epoch": 0.5336597307221542, + "grad_norm": 0.09756147861480713, + "learning_rate": 0.00010500458020463542, + "loss": 0.125, + "mean_token_accuracy": 0.9552226662635803, + "num_tokens": 1251546.0, + "step": 436 + }, + { + "entropy": 0.14133083447813988, + "epoch": 0.5348837209302325, + "grad_norm": 0.11094829440116882, + "learning_rate": 0.00010464817161400123, + "loss": 0.1348, + "mean_token_accuracy": 0.9499975889921188, + "num_tokens": 1254368.0, + "step": 437 + }, + { + "entropy": 0.14045517146587372, + "epoch": 0.5361077111383109, + "grad_norm": 0.13847166299819946, + "learning_rate": 0.00010429184723090922, + "loss": 0.1416, + "mean_token_accuracy": 0.950296625494957, + "num_tokens": 1257235.0, + "step": 438 + }, + { + "entropy": 0.1331184972077608, + "epoch": 0.5373317013463892, + "grad_norm": 0.10256988555192947, + "learning_rate": 0.000103935612661892, + "loss": 0.1237, + "mean_token_accuracy": 0.9578738659620285, + "num_tokens": 1260101.0, + "step": 439 + }, + { + "entropy": 0.11664273031055927, + "epoch": 0.5385556915544676, + "grad_norm": 0.09503508359193802, + "learning_rate": 0.00010357947351206912, + "loss": 0.1233, + "mean_token_accuracy": 0.9538908898830414, + "num_tokens": 1262999.0, + "step": 440 + }, + { + "entropy": 0.12164262495934963, + "epoch": 0.5397796817625459, + "grad_norm": 0.10633799433708191, + "learning_rate": 0.00010322343538505858, + "loss": 0.1269, + "mean_token_accuracy": 0.9542957246303558, + "num_tokens": 1265888.0, + "step": 441 + }, + { + "entropy": 0.13625743240118027, + "epoch": 0.5410036719706243, + "grad_norm": 0.12708348035812378, + "learning_rate": 0.00010286750388288904, + "loss": 0.1386, + "mean_token_accuracy": 0.9525193274021149, + "num_tokens": 1268770.0, + "step": 442 + }, + { + "entropy": 0.13524017110466957, + "epoch": 0.5422276621787026, + "grad_norm": 0.13129781186580658, + "learning_rate": 0.00010251168460591141, + "loss": 0.1287, + "mean_token_accuracy": 0.9525767266750336, + "num_tokens": 1271696.0, + "step": 443 + }, + { + "entropy": 0.12868662923574448, + "epoch": 0.543451652386781, + "grad_norm": 0.11591530591249466, + "learning_rate": 0.00010215598315271076, + "loss": 0.1292, + "mean_token_accuracy": 0.9566308557987213, + "num_tokens": 1274452.0, + "step": 444 + }, + { + "entropy": 0.1305578351020813, + "epoch": 0.5446756425948592, + "grad_norm": 0.12397347390651703, + "learning_rate": 0.00010180040512001838, + "loss": 0.1342, + "mean_token_accuracy": 0.9553821682929993, + "num_tokens": 1277399.0, + "step": 445 + }, + { + "entropy": 0.1372597012668848, + "epoch": 0.5458996328029376, + "grad_norm": 0.14591890573501587, + "learning_rate": 0.00010144495610262359, + "loss": 0.1506, + "mean_token_accuracy": 0.9508340656757355, + "num_tokens": 1280339.0, + "step": 446 + }, + { + "entropy": 0.134957080706954, + "epoch": 0.5471236230110159, + "grad_norm": 0.12342193722724915, + "learning_rate": 0.00010108964169328563, + "loss": 0.1387, + "mean_token_accuracy": 0.9538850635290146, + "num_tokens": 1283168.0, + "step": 447 + }, + { + "entropy": 0.13791638240218163, + "epoch": 0.5483476132190942, + "grad_norm": 0.14071674644947052, + "learning_rate": 0.0001007344674826459, + "loss": 0.1452, + "mean_token_accuracy": 0.9521060883998871, + "num_tokens": 1286026.0, + "step": 448 + }, + { + "entropy": 0.13752142526209354, + "epoch": 0.5495716034271726, + "grad_norm": 0.1364283412694931, + "learning_rate": 0.00010037943905913987, + "loss": 0.1492, + "mean_token_accuracy": 0.9498987942934036, + "num_tokens": 1289089.0, + "step": 449 + }, + { + "entropy": 0.13567536883056164, + "epoch": 0.5507955936352509, + "grad_norm": 0.10228797793388367, + "learning_rate": 0.00010002456200890902, + "loss": 0.1262, + "mean_token_accuracy": 0.954885944724083, + "num_tokens": 1292137.0, + "step": 450 + }, + { + "entropy": 0.14596708118915558, + "epoch": 0.5520195838433293, + "grad_norm": 0.12808726727962494, + "learning_rate": 9.966984191571318e-05, + "loss": 0.14, + "mean_token_accuracy": 0.9511357396841049, + "num_tokens": 1294977.0, + "step": 451 + }, + { + "entropy": 0.13888927921652794, + "epoch": 0.5532435740514076, + "grad_norm": 0.1234591007232666, + "learning_rate": 9.931528436084257e-05, + "loss": 0.1376, + "mean_token_accuracy": 0.9533981531858444, + "num_tokens": 1298031.0, + "step": 452 + }, + { + "entropy": 0.1446320004761219, + "epoch": 0.554467564259486, + "grad_norm": 0.11091554909944534, + "learning_rate": 9.896089492302982e-05, + "loss": 0.1342, + "mean_token_accuracy": 0.9506274461746216, + "num_tokens": 1300941.0, + "step": 453 + }, + { + "entropy": 0.12542426958680153, + "epoch": 0.5556915544675642, + "grad_norm": 0.08640747517347336, + "learning_rate": 9.860667917836257e-05, + "loss": 0.1188, + "mean_token_accuracy": 0.9547301977872849, + "num_tokens": 1303891.0, + "step": 454 + }, + { + "entropy": 0.11943003907799721, + "epoch": 0.5569155446756426, + "grad_norm": 0.09961756318807602, + "learning_rate": 9.825264270019538e-05, + "loss": 0.1252, + "mean_token_accuracy": 0.9609936475753784, + "num_tokens": 1306534.0, + "step": 455 + }, + { + "entropy": 0.15065762773156166, + "epoch": 0.5581395348837209, + "grad_norm": 0.1277482807636261, + "learning_rate": 9.789879105906214e-05, + "loss": 0.1522, + "mean_token_accuracy": 0.9470720738172531, + "num_tokens": 1309499.0, + "step": 456 + }, + { + "entropy": 0.13451961800456047, + "epoch": 0.5593635250917993, + "grad_norm": 0.10938668996095657, + "learning_rate": 9.754512982258854e-05, + "loss": 0.1432, + "mean_token_accuracy": 0.9527524411678314, + "num_tokens": 1312225.0, + "step": 457 + }, + { + "entropy": 0.1409507505595684, + "epoch": 0.5605875152998776, + "grad_norm": 0.1285836547613144, + "learning_rate": 9.719166455540436e-05, + "loss": 0.1362, + "mean_token_accuracy": 0.9491998702287674, + "num_tokens": 1315103.0, + "step": 458 + }, + { + "entropy": 0.14875612407922745, + "epoch": 0.5618115055079559, + "grad_norm": 0.1150604709982872, + "learning_rate": 9.683840081905584e-05, + "loss": 0.1314, + "mean_token_accuracy": 0.9485827386379242, + "num_tokens": 1318001.0, + "step": 459 + }, + { + "entropy": 0.13133428804576397, + "epoch": 0.5630354957160343, + "grad_norm": 0.14611689746379852, + "learning_rate": 9.648534417191836e-05, + "loss": 0.1496, + "mean_token_accuracy": 0.9502135962247849, + "num_tokens": 1320854.0, + "step": 460 + }, + { + "entropy": 0.1351902335882187, + "epoch": 0.5642594859241126, + "grad_norm": 0.11204855144023895, + "learning_rate": 9.613250016910892e-05, + "loss": 0.1295, + "mean_token_accuracy": 0.9572872370481491, + "num_tokens": 1323726.0, + "step": 461 + }, + { + "entropy": 0.12648485973477364, + "epoch": 0.565483476132191, + "grad_norm": 0.11017502099275589, + "learning_rate": 9.57798743623985e-05, + "loss": 0.1256, + "mean_token_accuracy": 0.9586693197488785, + "num_tokens": 1326607.0, + "step": 462 + }, + { + "entropy": 0.13961461000144482, + "epoch": 0.5667074663402693, + "grad_norm": 0.10312923789024353, + "learning_rate": 9.542747230012516e-05, + "loss": 0.1343, + "mean_token_accuracy": 0.9497461169958115, + "num_tokens": 1329489.0, + "step": 463 + }, + { + "entropy": 0.13521714136004448, + "epoch": 0.5679314565483476, + "grad_norm": 0.12328015267848969, + "learning_rate": 9.507529952710634e-05, + "loss": 0.137, + "mean_token_accuracy": 0.9524849355220795, + "num_tokens": 1332260.0, + "step": 464 + }, + { + "entropy": 0.13796503469347954, + "epoch": 0.5691554467564259, + "grad_norm": 0.10105682909488678, + "learning_rate": 9.472336158455178e-05, + "loss": 0.1353, + "mean_token_accuracy": 0.9537227302789688, + "num_tokens": 1335123.0, + "step": 465 + }, + { + "entropy": 0.14300861582159996, + "epoch": 0.5703794369645043, + "grad_norm": 0.14959032833576202, + "learning_rate": 9.437166400997628e-05, + "loss": 0.143, + "mean_token_accuracy": 0.9513982832431793, + "num_tokens": 1337876.0, + "step": 466 + }, + { + "entropy": 0.1236543133854866, + "epoch": 0.5716034271725826, + "grad_norm": 0.09893029928207397, + "learning_rate": 9.402021233711273e-05, + "loss": 0.1201, + "mean_token_accuracy": 0.9587667882442474, + "num_tokens": 1340778.0, + "step": 467 + }, + { + "entropy": 0.12385170720517635, + "epoch": 0.572827417380661, + "grad_norm": 0.10692918300628662, + "learning_rate": 9.366901209582478e-05, + "loss": 0.1221, + "mean_token_accuracy": 0.9581068903207779, + "num_tokens": 1343592.0, + "step": 468 + }, + { + "entropy": 0.13142681308090687, + "epoch": 0.5740514075887393, + "grad_norm": 0.09896130114793777, + "learning_rate": 9.331806881202005e-05, + "loss": 0.1347, + "mean_token_accuracy": 0.9522279798984528, + "num_tokens": 1346433.0, + "step": 469 + }, + { + "entropy": 0.1318274401128292, + "epoch": 0.5752753977968176, + "grad_norm": 0.11954781413078308, + "learning_rate": 9.296738800756312e-05, + "loss": 0.1287, + "mean_token_accuracy": 0.9536316990852356, + "num_tokens": 1349365.0, + "step": 470 + }, + { + "entropy": 0.12821925058960915, + "epoch": 0.576499388004896, + "grad_norm": 0.1039518415927887, + "learning_rate": 9.261697520018849e-05, + "loss": 0.1309, + "mean_token_accuracy": 0.9541867524385452, + "num_tokens": 1352151.0, + "step": 471 + }, + { + "entropy": 0.13313505239784718, + "epoch": 0.5777233782129743, + "grad_norm": 0.12443695217370987, + "learning_rate": 9.226683590341405e-05, + "loss": 0.1428, + "mean_token_accuracy": 0.9529053568840027, + "num_tokens": 1354968.0, + "step": 472 + }, + { + "entropy": 0.1336048524826765, + "epoch": 0.5789473684210527, + "grad_norm": 0.10788223892450333, + "learning_rate": 9.191697562645412e-05, + "loss": 0.1244, + "mean_token_accuracy": 0.9543289393186569, + "num_tokens": 1357780.0, + "step": 473 + }, + { + "entropy": 0.14546431228518486, + "epoch": 0.5801713586291309, + "grad_norm": 0.13798163831233978, + "learning_rate": 9.156739987413286e-05, + "loss": 0.1511, + "mean_token_accuracy": 0.9486000686883926, + "num_tokens": 1360739.0, + "step": 474 + }, + { + "entropy": 0.1306052915751934, + "epoch": 0.5813953488372093, + "grad_norm": 0.1168493926525116, + "learning_rate": 9.121811414679753e-05, + "loss": 0.1258, + "mean_token_accuracy": 0.9548768550157547, + "num_tokens": 1363545.0, + "step": 475 + }, + { + "entropy": 0.12160796672105789, + "epoch": 0.5826193390452876, + "grad_norm": 0.08270232379436493, + "learning_rate": 9.08691239402321e-05, + "loss": 0.1165, + "mean_token_accuracy": 0.9583134055137634, + "num_tokens": 1366421.0, + "step": 476 + }, + { + "entropy": 0.13790954649448395, + "epoch": 0.583843329253366, + "grad_norm": 0.1251978725194931, + "learning_rate": 9.052043474557075e-05, + "loss": 0.1301, + "mean_token_accuracy": 0.9540035426616669, + "num_tokens": 1369279.0, + "step": 477 + }, + { + "entropy": 0.12729504518210888, + "epoch": 0.5850673194614443, + "grad_norm": 0.10904013365507126, + "learning_rate": 9.017205204921135e-05, + "loss": 0.1327, + "mean_token_accuracy": 0.952341616153717, + "num_tokens": 1372269.0, + "step": 478 + }, + { + "entropy": 0.12838926911354065, + "epoch": 0.5862913096695227, + "grad_norm": 0.10749886929988861, + "learning_rate": 8.982398133272927e-05, + "loss": 0.1251, + "mean_token_accuracy": 0.9540759325027466, + "num_tokens": 1375221.0, + "step": 479 + }, + { + "entropy": 0.1277572326362133, + "epoch": 0.587515299877601, + "grad_norm": 0.13413143157958984, + "learning_rate": 8.94762280727911e-05, + "loss": 0.126, + "mean_token_accuracy": 0.9548780173063278, + "num_tokens": 1378214.0, + "step": 480 + }, + { + "entropy": 0.1269283927977085, + "epoch": 0.5887392900856793, + "grad_norm": 0.12498117983341217, + "learning_rate": 8.912879774106833e-05, + "loss": 0.1248, + "mean_token_accuracy": 0.9582810252904892, + "num_tokens": 1381063.0, + "step": 481 + }, + { + "entropy": 0.12393281422555447, + "epoch": 0.5899632802937577, + "grad_norm": 0.10295683145523071, + "learning_rate": 8.878169580415155e-05, + "loss": 0.1198, + "mean_token_accuracy": 0.9530059397220612, + "num_tokens": 1383892.0, + "step": 482 + }, + { + "entropy": 0.13822954520583153, + "epoch": 0.591187270501836, + "grad_norm": 0.14562058448791504, + "learning_rate": 8.843492772346423e-05, + "loss": 0.138, + "mean_token_accuracy": 0.9512495547533035, + "num_tokens": 1386868.0, + "step": 483 + }, + { + "entropy": 0.13343922048807144, + "epoch": 0.5924112607099143, + "grad_norm": 0.12628866732120514, + "learning_rate": 8.80884989551767e-05, + "loss": 0.134, + "mean_token_accuracy": 0.9502186179161072, + "num_tokens": 1389677.0, + "step": 484 + }, + { + "entropy": 0.12989217042922974, + "epoch": 0.5936352509179926, + "grad_norm": 0.09793204814195633, + "learning_rate": 8.774241495012061e-05, + "loss": 0.1217, + "mean_token_accuracy": 0.9539603888988495, + "num_tokens": 1392579.0, + "step": 485 + }, + { + "entropy": 0.12106052599847317, + "epoch": 0.594859241126071, + "grad_norm": 0.148179829120636, + "learning_rate": 8.73966811537029e-05, + "loss": 0.1256, + "mean_token_accuracy": 0.9536247402429581, + "num_tokens": 1395562.0, + "step": 486 + }, + { + "entropy": 0.1189131960272789, + "epoch": 0.5960832313341493, + "grad_norm": 0.11518009752035141, + "learning_rate": 8.705130300582015e-05, + "loss": 0.125, + "mean_token_accuracy": 0.9558690935373306, + "num_tokens": 1398536.0, + "step": 487 + }, + { + "entropy": 0.11341707967221737, + "epoch": 0.5973072215422277, + "grad_norm": 0.12490274012088776, + "learning_rate": 8.670628594077313e-05, + "loss": 0.1255, + "mean_token_accuracy": 0.9554395079612732, + "num_tokens": 1401423.0, + "step": 488 + }, + { + "entropy": 0.11649352684617043, + "epoch": 0.598531211750306, + "grad_norm": 0.11428402364253998, + "learning_rate": 8.636163538718123e-05, + "loss": 0.1263, + "mean_token_accuracy": 0.9540867656469345, + "num_tokens": 1404449.0, + "step": 489 + }, + { + "entropy": 0.12423662655055523, + "epoch": 0.5997552019583844, + "grad_norm": 0.1465883105993271, + "learning_rate": 8.601735676789685e-05, + "loss": 0.1362, + "mean_token_accuracy": 0.9492033571004868, + "num_tokens": 1407328.0, + "step": 490 + }, + { + "entropy": 0.12803378142416477, + "epoch": 0.6009791921664627, + "grad_norm": 0.13481834530830383, + "learning_rate": 8.567345549992045e-05, + "loss": 0.1399, + "mean_token_accuracy": 0.9494926184415817, + "num_tokens": 1410264.0, + "step": 491 + }, + { + "entropy": 0.13425504229962826, + "epoch": 0.602203182374541, + "grad_norm": 0.13619279861450195, + "learning_rate": 8.532993699431505e-05, + "loss": 0.1466, + "mean_token_accuracy": 0.9480967670679092, + "num_tokens": 1413215.0, + "step": 492 + }, + { + "entropy": 0.13283822126686573, + "epoch": 0.6034271725826194, + "grad_norm": 0.1255907565355301, + "learning_rate": 8.498680665612103e-05, + "loss": 0.1427, + "mean_token_accuracy": 0.9555611908435822, + "num_tokens": 1415986.0, + "step": 493 + }, + { + "entropy": 0.1271696649491787, + "epoch": 0.6046511627906976, + "grad_norm": 0.1262376308441162, + "learning_rate": 8.464406988427134e-05, + "loss": 0.1286, + "mean_token_accuracy": 0.9554323703050613, + "num_tokens": 1418961.0, + "step": 494 + }, + { + "entropy": 0.1388858687132597, + "epoch": 0.605875152998776, + "grad_norm": 0.14236120879650116, + "learning_rate": 8.430173207150642e-05, + "loss": 0.1375, + "mean_token_accuracy": 0.9519379138946533, + "num_tokens": 1421639.0, + "step": 495 + }, + { + "entropy": 0.12536496482789516, + "epoch": 0.6070991432068543, + "grad_norm": 0.07469351589679718, + "learning_rate": 8.395979860428927e-05, + "loss": 0.1161, + "mean_token_accuracy": 0.9578710049390793, + "num_tokens": 1424353.0, + "step": 496 + }, + { + "entropy": 0.1260237004607916, + "epoch": 0.6083231334149327, + "grad_norm": 0.11367585510015488, + "learning_rate": 8.361827486272083e-05, + "loss": 0.1211, + "mean_token_accuracy": 0.9585512578487396, + "num_tokens": 1427109.0, + "step": 497 + }, + { + "entropy": 0.12655403837561607, + "epoch": 0.609547123623011, + "grad_norm": 0.11390203982591629, + "learning_rate": 8.32771662204553e-05, + "loss": 0.131, + "mean_token_accuracy": 0.9558133631944656, + "num_tokens": 1430014.0, + "step": 498 + }, + { + "entropy": 0.12216490879654884, + "epoch": 0.6107711138310894, + "grad_norm": 0.11213718354701996, + "learning_rate": 8.293647804461544e-05, + "loss": 0.1238, + "mean_token_accuracy": 0.9572701454162598, + "num_tokens": 1432871.0, + "step": 499 + }, + { + "entropy": 0.12777983769774437, + "epoch": 0.6119951040391677, + "grad_norm": 0.08813431113958359, + "learning_rate": 8.259621569570838e-05, + "loss": 0.1199, + "mean_token_accuracy": 0.9563022404909134, + "num_tokens": 1435708.0, + "step": 500 + }, + { + "entropy": 0.12374689616262913, + "epoch": 0.6132190942472461, + "grad_norm": 0.12710627913475037, + "learning_rate": 8.22563845275411e-05, + "loss": 0.1164, + "mean_token_accuracy": 0.9588872045278549, + "num_tokens": 1438640.0, + "step": 501 + }, + { + "entropy": 0.1331765539944172, + "epoch": 0.6144430844553244, + "grad_norm": 0.11184730380773544, + "learning_rate": 8.191698988713619e-05, + "loss": 0.1278, + "mean_token_accuracy": 0.9535772055387497, + "num_tokens": 1441584.0, + "step": 502 + }, + { + "entropy": 0.13547160848975182, + "epoch": 0.6156670746634026, + "grad_norm": 0.11172328144311905, + "learning_rate": 8.157803711464778e-05, + "loss": 0.1272, + "mean_token_accuracy": 0.95878566801548, + "num_tokens": 1444361.0, + "step": 503 + }, + { + "entropy": 0.13545426167547703, + "epoch": 0.616891064871481, + "grad_norm": 0.12405485659837723, + "learning_rate": 8.123953154327752e-05, + "loss": 0.1223, + "mean_token_accuracy": 0.9598063975572586, + "num_tokens": 1447137.0, + "step": 504 + }, + { + "entropy": 0.13814809173345566, + "epoch": 0.6181150550795593, + "grad_norm": 0.08983740210533142, + "learning_rate": 8.09014784991906e-05, + "loss": 0.1184, + "mean_token_accuracy": 0.9545447528362274, + "num_tokens": 1450064.0, + "step": 505 + }, + { + "entropy": 0.12397347018122673, + "epoch": 0.6193390452876377, + "grad_norm": 0.11594085395336151, + "learning_rate": 8.056388330143205e-05, + "loss": 0.1302, + "mean_token_accuracy": 0.9496136009693146, + "num_tokens": 1452785.0, + "step": 506 + }, + { + "entropy": 0.1278697457164526, + "epoch": 0.620563035495716, + "grad_norm": 0.1257164478302002, + "learning_rate": 8.022675126184294e-05, + "loss": 0.1316, + "mean_token_accuracy": 0.950615406036377, + "num_tokens": 1455607.0, + "step": 507 + }, + { + "entropy": 0.12996536493301392, + "epoch": 0.6217870257037944, + "grad_norm": 0.09567246586084366, + "learning_rate": 7.989008768497683e-05, + "loss": 0.1216, + "mean_token_accuracy": 0.957462728023529, + "num_tokens": 1458358.0, + "step": 508 + }, + { + "entropy": 0.1335059516131878, + "epoch": 0.6230110159118727, + "grad_norm": 0.11693516373634338, + "learning_rate": 7.955389786801638e-05, + "loss": 0.1278, + "mean_token_accuracy": 0.9599097669124603, + "num_tokens": 1461006.0, + "step": 509 + }, + { + "entropy": 0.12592827714979649, + "epoch": 0.6242350061199511, + "grad_norm": 0.13386186957359314, + "learning_rate": 7.921818710068983e-05, + "loss": 0.1327, + "mean_token_accuracy": 0.9515922516584396, + "num_tokens": 1463889.0, + "step": 510 + }, + { + "entropy": 0.13952646404504776, + "epoch": 0.6254589963280294, + "grad_norm": 0.1409122347831726, + "learning_rate": 7.888296066518806e-05, + "loss": 0.1502, + "mean_token_accuracy": 0.9512711465358734, + "num_tokens": 1466798.0, + "step": 511 + }, + { + "entropy": 0.11477003432810307, + "epoch": 0.6266829865361077, + "grad_norm": 0.08158725500106812, + "learning_rate": 7.854822383608116e-05, + "loss": 0.1154, + "mean_token_accuracy": 0.956229642033577, + "num_tokens": 1469652.0, + "step": 512 + }, + { + "entropy": 0.13199845142662525, + "epoch": 0.627906976744186, + "grad_norm": 0.11513564735651016, + "learning_rate": 7.821398188023565e-05, + "loss": 0.1389, + "mean_token_accuracy": 0.9491001516580582, + "num_tokens": 1472579.0, + "step": 513 + }, + { + "entropy": 0.11838826537132263, + "epoch": 0.6291309669522643, + "grad_norm": 0.11191154271364212, + "learning_rate": 7.788024005673155e-05, + "loss": 0.1194, + "mean_token_accuracy": 0.9559880048036575, + "num_tokens": 1475410.0, + "step": 514 + }, + { + "entropy": 0.12664253264665604, + "epoch": 0.6303549571603427, + "grad_norm": 0.14261458814144135, + "learning_rate": 7.754700361677956e-05, + "loss": 0.1368, + "mean_token_accuracy": 0.9549832046031952, + "num_tokens": 1478265.0, + "step": 515 + }, + { + "entropy": 0.12632698938250542, + "epoch": 0.631578947368421, + "grad_norm": 0.09330065548419952, + "learning_rate": 7.72142778036385e-05, + "loss": 0.1213, + "mean_token_accuracy": 0.9531398117542267, + "num_tokens": 1480975.0, + "step": 516 + }, + { + "entropy": 0.1315625235438347, + "epoch": 0.6328029375764994, + "grad_norm": 0.09596612304449081, + "learning_rate": 7.688206785253296e-05, + "loss": 0.1169, + "mean_token_accuracy": 0.9544998854398727, + "num_tokens": 1483917.0, + "step": 517 + }, + { + "entropy": 0.12857342697679996, + "epoch": 0.6340269277845777, + "grad_norm": 0.11642349511384964, + "learning_rate": 7.655037899057054e-05, + "loss": 0.1277, + "mean_token_accuracy": 0.9583034366369247, + "num_tokens": 1486569.0, + "step": 518 + }, + { + "entropy": 0.14083325490355492, + "epoch": 0.6352509179926561, + "grad_norm": 0.10357247292995453, + "learning_rate": 7.621921643666001e-05, + "loss": 0.1289, + "mean_token_accuracy": 0.9523238837718964, + "num_tokens": 1489470.0, + "step": 519 + }, + { + "entropy": 0.12058390490710735, + "epoch": 0.6364749082007344, + "grad_norm": 0.07763651013374329, + "learning_rate": 7.588858540142898e-05, + "loss": 0.1202, + "mean_token_accuracy": 0.9588678926229477, + "num_tokens": 1492426.0, + "step": 520 + }, + { + "entropy": 0.13268422335386276, + "epoch": 0.6376988984088128, + "grad_norm": 0.11909055709838867, + "learning_rate": 7.555849108714192e-05, + "loss": 0.1442, + "mean_token_accuracy": 0.9491792619228363, + "num_tokens": 1495317.0, + "step": 521 + }, + { + "entropy": 0.12748325243592262, + "epoch": 0.6389228886168911, + "grad_norm": 0.10915423929691315, + "learning_rate": 7.522893868761842e-05, + "loss": 0.1306, + "mean_token_accuracy": 0.9525194317102432, + "num_tokens": 1498194.0, + "step": 522 + }, + { + "entropy": 0.12249485403299332, + "epoch": 0.6401468788249693, + "grad_norm": 0.09855267405509949, + "learning_rate": 7.489993338815136e-05, + "loss": 0.1263, + "mean_token_accuracy": 0.9492330700159073, + "num_tokens": 1501124.0, + "step": 523 + }, + { + "entropy": 0.11524965800344944, + "epoch": 0.6413708690330477, + "grad_norm": 0.11626867949962616, + "learning_rate": 7.45714803654253e-05, + "loss": 0.1357, + "mean_token_accuracy": 0.9552188813686371, + "num_tokens": 1504068.0, + "step": 524 + }, + { + "entropy": 0.12853862158954144, + "epoch": 0.642594859241126, + "grad_norm": 0.10208721458911896, + "learning_rate": 7.424358478743514e-05, + "loss": 0.1211, + "mean_token_accuracy": 0.9536338299512863, + "num_tokens": 1506953.0, + "step": 525 + }, + { + "entropy": 0.12070119939744473, + "epoch": 0.6438188494492044, + "grad_norm": 0.08820836991071701, + "learning_rate": 7.39162518134048e-05, + "loss": 0.1241, + "mean_token_accuracy": 0.9584237486124039, + "num_tokens": 1509898.0, + "step": 526 + }, + { + "entropy": 0.11714869923889637, + "epoch": 0.6450428396572827, + "grad_norm": 0.09070708602666855, + "learning_rate": 7.358948659370586e-05, + "loss": 0.1224, + "mean_token_accuracy": 0.9552429467439651, + "num_tokens": 1512791.0, + "step": 527 + }, + { + "entropy": 0.12014897167682648, + "epoch": 0.6462668298653611, + "grad_norm": 0.10571181774139404, + "learning_rate": 7.326329426977674e-05, + "loss": 0.1287, + "mean_token_accuracy": 0.95228411257267, + "num_tokens": 1515715.0, + "step": 528 + }, + { + "entropy": 0.13048417307436466, + "epoch": 0.6474908200734394, + "grad_norm": 0.11094783246517181, + "learning_rate": 7.293767997404175e-05, + "loss": 0.1336, + "mean_token_accuracy": 0.9517211019992828, + "num_tokens": 1518544.0, + "step": 529 + }, + { + "entropy": 0.12477540224790573, + "epoch": 0.6487148102815178, + "grad_norm": 0.08558245748281479, + "learning_rate": 7.261264882983024e-05, + "loss": 0.1149, + "mean_token_accuracy": 0.9544852375984192, + "num_tokens": 1521417.0, + "step": 530 + }, + { + "entropy": 0.12751509435474873, + "epoch": 0.6499388004895961, + "grad_norm": 0.11244817078113556, + "learning_rate": 7.228820595129604e-05, + "loss": 0.1319, + "mean_token_accuracy": 0.9543735980987549, + "num_tokens": 1524141.0, + "step": 531 + }, + { + "entropy": 0.12362748570740223, + "epoch": 0.6511627906976745, + "grad_norm": 0.0944586843252182, + "learning_rate": 7.196435644333708e-05, + "loss": 0.1234, + "mean_token_accuracy": 0.9570346027612686, + "num_tokens": 1526923.0, + "step": 532 + }, + { + "entropy": 0.12174053117632866, + "epoch": 0.6523867809057528, + "grad_norm": 0.11414114385843277, + "learning_rate": 7.164110540151487e-05, + "loss": 0.1217, + "mean_token_accuracy": 0.9561756104230881, + "num_tokens": 1529721.0, + "step": 533 + }, + { + "entropy": 0.12335773184895515, + "epoch": 0.653610771113831, + "grad_norm": 0.12079119682312012, + "learning_rate": 7.131845791197456e-05, + "loss": 0.1278, + "mean_token_accuracy": 0.952575296163559, + "num_tokens": 1532463.0, + "step": 534 + }, + { + "entropy": 0.13432800769805908, + "epoch": 0.6548347613219094, + "grad_norm": 0.16789008677005768, + "learning_rate": 7.099641905136474e-05, + "loss": 0.1516, + "mean_token_accuracy": 0.9491488337516785, + "num_tokens": 1535436.0, + "step": 535 + }, + { + "entropy": 0.12504513375461102, + "epoch": 0.6560587515299877, + "grad_norm": 0.09669282287359238, + "learning_rate": 7.067499388675767e-05, + "loss": 0.1218, + "mean_token_accuracy": 0.9547172486782074, + "num_tokens": 1538239.0, + "step": 536 + }, + { + "entropy": 0.12883107364177704, + "epoch": 0.6572827417380661, + "grad_norm": 0.12195903807878494, + "learning_rate": 7.035418747556937e-05, + "loss": 0.1295, + "mean_token_accuracy": 0.9529392123222351, + "num_tokens": 1541133.0, + "step": 537 + }, + { + "entropy": 0.13597192242741585, + "epoch": 0.6585067319461444, + "grad_norm": 0.12069480866193771, + "learning_rate": 7.003400486548033e-05, + "loss": 0.142, + "mean_token_accuracy": 0.9506565481424332, + "num_tokens": 1544057.0, + "step": 538 + }, + { + "entropy": 0.12465078942477703, + "epoch": 0.6597307221542228, + "grad_norm": 0.13766269385814667, + "learning_rate": 6.971445109435588e-05, + "loss": 0.1244, + "mean_token_accuracy": 0.9561842232942581, + "num_tokens": 1546913.0, + "step": 539 + }, + { + "entropy": 0.11525347083806992, + "epoch": 0.6609547123623011, + "grad_norm": 0.0794203132390976, + "learning_rate": 6.939553119016696e-05, + "loss": 0.1064, + "mean_token_accuracy": 0.9619592726230621, + "num_tokens": 1549858.0, + "step": 540 + }, + { + "entropy": 0.1330459900200367, + "epoch": 0.6621787025703795, + "grad_norm": 0.09169530123472214, + "learning_rate": 6.9077250170911e-05, + "loss": 0.1222, + "mean_token_accuracy": 0.9564371109008789, + "num_tokens": 1552683.0, + "step": 541 + }, + { + "entropy": 0.1317736990749836, + "epoch": 0.6634026927784578, + "grad_norm": 0.11017370969057083, + "learning_rate": 6.875961304453306e-05, + "loss": 0.1335, + "mean_token_accuracy": 0.9531339704990387, + "num_tokens": 1555644.0, + "step": 542 + }, + { + "entropy": 0.129679873585701, + "epoch": 0.6646266829865362, + "grad_norm": 0.11367757618427277, + "learning_rate": 6.844262480884697e-05, + "loss": 0.1274, + "mean_token_accuracy": 0.953017383813858, + "num_tokens": 1558517.0, + "step": 543 + }, + { + "entropy": 0.13118655420839787, + "epoch": 0.6658506731946144, + "grad_norm": 0.1445721685886383, + "learning_rate": 6.812629045145655e-05, + "loss": 0.1463, + "mean_token_accuracy": 0.9499836713075638, + "num_tokens": 1561419.0, + "step": 544 + }, + { + "entropy": 0.1299305446445942, + "epoch": 0.6670746634026927, + "grad_norm": 0.1013883724808693, + "learning_rate": 6.781061494967746e-05, + "loss": 0.122, + "mean_token_accuracy": 0.9561431407928467, + "num_tokens": 1564376.0, + "step": 545 + }, + { + "entropy": 0.1277548111975193, + "epoch": 0.6682986536107711, + "grad_norm": 0.09661097079515457, + "learning_rate": 6.749560327045858e-05, + "loss": 0.1219, + "mean_token_accuracy": 0.9534917026758194, + "num_tokens": 1567247.0, + "step": 546 + }, + { + "entropy": 0.14297610148787498, + "epoch": 0.6695226438188494, + "grad_norm": 0.11284302920103073, + "learning_rate": 6.718126037030392e-05, + "loss": 0.1413, + "mean_token_accuracy": 0.9474358260631561, + "num_tokens": 1570137.0, + "step": 547 + }, + { + "entropy": 0.13623189367353916, + "epoch": 0.6707466340269278, + "grad_norm": 0.09880812466144562, + "learning_rate": 6.686759119519493e-05, + "loss": 0.1261, + "mean_token_accuracy": 0.9548007994890213, + "num_tokens": 1572943.0, + "step": 548 + }, + { + "entropy": 0.12064841762185097, + "epoch": 0.6719706242350061, + "grad_norm": 0.07386547327041626, + "learning_rate": 6.655460068051207e-05, + "loss": 0.1133, + "mean_token_accuracy": 0.9589287787675858, + "num_tokens": 1575734.0, + "step": 549 + }, + { + "entropy": 0.11477799527347088, + "epoch": 0.6731946144430845, + "grad_norm": 0.09374738484621048, + "learning_rate": 6.624229375095782e-05, + "loss": 0.1073, + "mean_token_accuracy": 0.9592752158641815, + "num_tokens": 1578713.0, + "step": 550 + }, + { + "entropy": 0.11568047851324081, + "epoch": 0.6744186046511628, + "grad_norm": 0.07430195063352585, + "learning_rate": 6.593067532047882e-05, + "loss": 0.1124, + "mean_token_accuracy": 0.9580425918102264, + "num_tokens": 1581585.0, + "step": 551 + }, + { + "entropy": 0.11425567418336868, + "epoch": 0.6756425948592412, + "grad_norm": 0.08985903114080429, + "learning_rate": 6.561975029218848e-05, + "loss": 0.1149, + "mean_token_accuracy": 0.9551949352025986, + "num_tokens": 1584528.0, + "step": 552 + }, + { + "entropy": 0.12063229456543922, + "epoch": 0.6768665850673194, + "grad_norm": 0.1338767558336258, + "learning_rate": 6.530952355829021e-05, + "loss": 0.1258, + "mean_token_accuracy": 0.9540190994739532, + "num_tokens": 1587367.0, + "step": 553 + }, + { + "entropy": 0.11773077212274075, + "epoch": 0.6780905752753978, + "grad_norm": 0.10604891926050186, + "learning_rate": 6.500000000000002e-05, + "loss": 0.1229, + "mean_token_accuracy": 0.9551961719989777, + "num_tokens": 1590355.0, + "step": 554 + }, + { + "entropy": 0.12415792979300022, + "epoch": 0.6793145654834761, + "grad_norm": 0.09846334904432297, + "learning_rate": 6.469118448746997e-05, + "loss": 0.1196, + "mean_token_accuracy": 0.9554211050271988, + "num_tokens": 1593211.0, + "step": 555 + }, + { + "entropy": 0.12992994859814644, + "epoch": 0.6805385556915544, + "grad_norm": 0.11943725496530533, + "learning_rate": 6.438308187971154e-05, + "loss": 0.1336, + "mean_token_accuracy": 0.9517884999513626, + "num_tokens": 1596160.0, + "step": 556 + }, + { + "entropy": 0.12936868704855442, + "epoch": 0.6817625458996328, + "grad_norm": 0.11231625825166702, + "learning_rate": 6.407569702451905e-05, + "loss": 0.1279, + "mean_token_accuracy": 0.9515047073364258, + "num_tokens": 1599018.0, + "step": 557 + }, + { + "entropy": 0.11204122938215733, + "epoch": 0.6829865361077111, + "grad_norm": 0.06850454211235046, + "learning_rate": 6.376903475839346e-05, + "loss": 0.107, + "mean_token_accuracy": 0.961060494184494, + "num_tokens": 1601962.0, + "step": 558 + }, + { + "entropy": 0.13684387877583504, + "epoch": 0.6842105263157895, + "grad_norm": 0.11774871498346329, + "learning_rate": 6.346309990646622e-05, + "loss": 0.1352, + "mean_token_accuracy": 0.9500558078289032, + "num_tokens": 1604919.0, + "step": 559 + }, + { + "entropy": 0.11776763759553432, + "epoch": 0.6854345165238678, + "grad_norm": 0.12762795388698578, + "learning_rate": 6.315789728242351e-05, + "loss": 0.1195, + "mean_token_accuracy": 0.9557198733091354, + "num_tokens": 1607879.0, + "step": 560 + }, + { + "entropy": 0.1305858213454485, + "epoch": 0.6866585067319462, + "grad_norm": 0.08963537216186523, + "learning_rate": 6.285343168843027e-05, + "loss": 0.1214, + "mean_token_accuracy": 0.9584052562713623, + "num_tokens": 1610744.0, + "step": 561 + }, + { + "entropy": 0.12002053111791611, + "epoch": 0.6878824969400245, + "grad_norm": 0.0804935023188591, + "learning_rate": 6.254970791505478e-05, + "loss": 0.1145, + "mean_token_accuracy": 0.9570070058107376, + "num_tokens": 1613644.0, + "step": 562 + }, + { + "entropy": 0.119348905980587, + "epoch": 0.6891064871481029, + "grad_norm": 0.11004110425710678, + "learning_rate": 6.224673074119329e-05, + "loss": 0.1255, + "mean_token_accuracy": 0.9556309878826141, + "num_tokens": 1616386.0, + "step": 563 + }, + { + "entropy": 0.132030563428998, + "epoch": 0.6903304773561811, + "grad_norm": 0.08969442546367645, + "learning_rate": 6.194450493399476e-05, + "loss": 0.1158, + "mean_token_accuracy": 0.9559299945831299, + "num_tokens": 1619250.0, + "step": 564 + }, + { + "entropy": 0.12700083293020725, + "epoch": 0.6915544675642595, + "grad_norm": 0.11120559275150299, + "learning_rate": 6.164303524878586e-05, + "loss": 0.1301, + "mean_token_accuracy": 0.955559253692627, + "num_tokens": 1622222.0, + "step": 565 + }, + { + "entropy": 0.11796704679727554, + "epoch": 0.6927784577723378, + "grad_norm": 0.09868547320365906, + "learning_rate": 6.134232642899626e-05, + "loss": 0.1261, + "mean_token_accuracy": 0.9551449418067932, + "num_tokens": 1625221.0, + "step": 566 + }, + { + "entropy": 0.12740915641188622, + "epoch": 0.6940024479804161, + "grad_norm": 0.11309243738651276, + "learning_rate": 6.104238320608374e-05, + "loss": 0.1239, + "mean_token_accuracy": 0.9568604677915573, + "num_tokens": 1628144.0, + "step": 567 + }, + { + "entropy": 0.12536575458943844, + "epoch": 0.6952264381884945, + "grad_norm": 0.09379520267248154, + "learning_rate": 6.074321029946011e-05, + "loss": 0.1176, + "mean_token_accuracy": 0.9565268158912659, + "num_tokens": 1630851.0, + "step": 568 + }, + { + "entropy": 0.11337637901306152, + "epoch": 0.6964504283965728, + "grad_norm": 0.09800992161035538, + "learning_rate": 6.0444812416416644e-05, + "loss": 0.1176, + "mean_token_accuracy": 0.9542988985776901, + "num_tokens": 1633690.0, + "step": 569 + }, + { + "entropy": 0.12905778735876083, + "epoch": 0.6976744186046512, + "grad_norm": 0.15460340678691864, + "learning_rate": 6.0147194252050086e-05, + "loss": 0.1227, + "mean_token_accuracy": 0.9560205787420273, + "num_tokens": 1636641.0, + "step": 570 + }, + { + "entropy": 0.12652179971337318, + "epoch": 0.6988984088127295, + "grad_norm": 0.11568696796894073, + "learning_rate": 5.985036048918894e-05, + "loss": 0.1324, + "mean_token_accuracy": 0.9555312991142273, + "num_tokens": 1639577.0, + "step": 571 + }, + { + "entropy": 0.13362854346632957, + "epoch": 0.7001223990208079, + "grad_norm": 0.09579101949930191, + "learning_rate": 5.9554315798319535e-05, + "loss": 0.1235, + "mean_token_accuracy": 0.9550480097532272, + "num_tokens": 1642408.0, + "step": 572 + }, + { + "entropy": 0.12491125613451004, + "epoch": 0.7013463892288861, + "grad_norm": 0.07888595014810562, + "learning_rate": 5.9259064837512776e-05, + "loss": 0.1132, + "mean_token_accuracy": 0.9588579535484314, + "num_tokens": 1645247.0, + "step": 573 + }, + { + "entropy": 0.12114104256033897, + "epoch": 0.7025703794369645, + "grad_norm": 0.07899249345064163, + "learning_rate": 5.896461225235065e-05, + "loss": 0.1168, + "mean_token_accuracy": 0.9541361033916473, + "num_tokens": 1648088.0, + "step": 574 + }, + { + "entropy": 0.1321531981229782, + "epoch": 0.7037943696450428, + "grad_norm": 0.13727176189422607, + "learning_rate": 5.8670962675853224e-05, + "loss": 0.1295, + "mean_token_accuracy": 0.9476852715015411, + "num_tokens": 1651002.0, + "step": 575 + }, + { + "entropy": 0.11790530569851398, + "epoch": 0.7050183598531212, + "grad_norm": 0.0890548825263977, + "learning_rate": 5.8378120728405885e-05, + "loss": 0.1149, + "mean_token_accuracy": 0.9562715291976929, + "num_tokens": 1653873.0, + "step": 576 + }, + { + "entropy": 0.13041403330862522, + "epoch": 0.7062423500611995, + "grad_norm": 0.10745251178741455, + "learning_rate": 5.808609101768625e-05, + "loss": 0.1256, + "mean_token_accuracy": 0.9547127336263657, + "num_tokens": 1656853.0, + "step": 577 + }, + { + "entropy": 0.12002996355295181, + "epoch": 0.7074663402692778, + "grad_norm": 0.10615383088588715, + "learning_rate": 5.779487813859218e-05, + "loss": 0.1237, + "mean_token_accuracy": 0.952038049697876, + "num_tokens": 1659705.0, + "step": 578 + }, + { + "entropy": 0.12046261318027973, + "epoch": 0.7086903304773562, + "grad_norm": 0.08489915728569031, + "learning_rate": 5.750448667316905e-05, + "loss": 0.1161, + "mean_token_accuracy": 0.9567108601331711, + "num_tokens": 1662507.0, + "step": 579 + }, + { + "entropy": 0.12017075717449188, + "epoch": 0.7099143206854345, + "grad_norm": 0.09946948289871216, + "learning_rate": 5.7214921190537884e-05, + "loss": 0.1215, + "mean_token_accuracy": 0.9569592475891113, + "num_tokens": 1665257.0, + "step": 580 + }, + { + "entropy": 0.12242058292031288, + "epoch": 0.7111383108935129, + "grad_norm": 0.10300292819738388, + "learning_rate": 5.692618624682342e-05, + "loss": 0.1247, + "mean_token_accuracy": 0.9553636610507965, + "num_tokens": 1668179.0, + "step": 581 + }, + { + "entropy": 0.12047308683395386, + "epoch": 0.7123623011015912, + "grad_norm": 0.0785590261220932, + "learning_rate": 5.663828638508237e-05, + "loss": 0.1133, + "mean_token_accuracy": 0.953291729092598, + "num_tokens": 1671018.0, + "step": 582 + }, + { + "entropy": 0.12438478693366051, + "epoch": 0.7135862913096696, + "grad_norm": 0.10865124315023422, + "learning_rate": 5.635122613523198e-05, + "loss": 0.116, + "mean_token_accuracy": 0.955594390630722, + "num_tokens": 1673971.0, + "step": 583 + }, + { + "entropy": 0.11624068021774292, + "epoch": 0.7148102815177478, + "grad_norm": 0.09714595973491669, + "learning_rate": 5.606501001397878e-05, + "loss": 0.1196, + "mean_token_accuracy": 0.9614518582820892, + "num_tokens": 1676702.0, + "step": 584 + }, + { + "entropy": 0.12332681939005852, + "epoch": 0.7160342717258262, + "grad_norm": 0.11641556024551392, + "learning_rate": 5.577964252474745e-05, + "loss": 0.1228, + "mean_token_accuracy": 0.9534997195005417, + "num_tokens": 1679538.0, + "step": 585 + }, + { + "entropy": 0.12291526794433594, + "epoch": 0.7172582619339045, + "grad_norm": 0.11812041699886322, + "learning_rate": 5.549512815761002e-05, + "loss": 0.1261, + "mean_token_accuracy": 0.9553936123847961, + "num_tokens": 1682358.0, + "step": 586 + }, + { + "entropy": 0.1213561687618494, + "epoch": 0.7184822521419829, + "grad_norm": 0.11882919073104858, + "learning_rate": 5.521147138921514e-05, + "loss": 0.1271, + "mean_token_accuracy": 0.9529240280389786, + "num_tokens": 1685361.0, + "step": 587 + }, + { + "entropy": 0.12020848132669926, + "epoch": 0.7197062423500612, + "grad_norm": 0.09805058687925339, + "learning_rate": 5.4928676682717814e-05, + "loss": 0.1161, + "mean_token_accuracy": 0.9567732810974121, + "num_tokens": 1688350.0, + "step": 588 + }, + { + "entropy": 0.11253302171826363, + "epoch": 0.7209302325581395, + "grad_norm": 0.11090445518493652, + "learning_rate": 5.4646748487708965e-05, + "loss": 0.1235, + "mean_token_accuracy": 0.9579932391643524, + "num_tokens": 1691159.0, + "step": 589 + }, + { + "entropy": 0.1259227003902197, + "epoch": 0.7221542227662179, + "grad_norm": 0.12484165281057358, + "learning_rate": 5.4365691240145525e-05, + "loss": 0.1432, + "mean_token_accuracy": 0.9509315192699432, + "num_tokens": 1694026.0, + "step": 590 + }, + { + "entropy": 0.1159481406211853, + "epoch": 0.7233782129742962, + "grad_norm": 0.08420033752918243, + "learning_rate": 5.4085509362280715e-05, + "loss": 0.1143, + "mean_token_accuracy": 0.9556987583637238, + "num_tokens": 1696864.0, + "step": 591 + }, + { + "entropy": 0.12152973748743534, + "epoch": 0.7246022031823746, + "grad_norm": 0.09311600029468536, + "learning_rate": 5.380620726259431e-05, + "loss": 0.1164, + "mean_token_accuracy": 0.9551445990800858, + "num_tokens": 1699748.0, + "step": 592 + }, + { + "entropy": 0.12369105778634548, + "epoch": 0.7258261933904528, + "grad_norm": 0.10568439215421677, + "learning_rate": 5.3527789335723314e-05, + "loss": 0.1245, + "mean_token_accuracy": 0.95469531416893, + "num_tokens": 1702609.0, + "step": 593 + }, + { + "entropy": 0.12211555987596512, + "epoch": 0.7270501835985312, + "grad_norm": 0.11776036024093628, + "learning_rate": 5.325025996239301e-05, + "loss": 0.1178, + "mean_token_accuracy": 0.9583615809679031, + "num_tokens": 1705472.0, + "step": 594 + }, + { + "entropy": 0.12680583074688911, + "epoch": 0.7282741738066095, + "grad_norm": 0.10097135603427887, + "learning_rate": 5.2973623509347617e-05, + "loss": 0.1225, + "mean_token_accuracy": 0.9526026099920273, + "num_tokens": 1708336.0, + "step": 595 + }, + { + "entropy": 0.11793944239616394, + "epoch": 0.7294981640146879, + "grad_norm": 0.08076831698417664, + "learning_rate": 5.2697884329282065e-05, + "loss": 0.1109, + "mean_token_accuracy": 0.9595013856887817, + "num_tokens": 1711163.0, + "step": 596 + }, + { + "entropy": 0.11948729492723942, + "epoch": 0.7307221542227662, + "grad_norm": 0.0952032133936882, + "learning_rate": 5.2423046760773174e-05, + "loss": 0.1273, + "mean_token_accuracy": 0.9568696320056915, + "num_tokens": 1713982.0, + "step": 597 + }, + { + "entropy": 0.11885470524430275, + "epoch": 0.7319461444308446, + "grad_norm": 0.08250678330659866, + "learning_rate": 5.214911512821145e-05, + "loss": 0.1152, + "mean_token_accuracy": 0.958494171500206, + "num_tokens": 1716762.0, + "step": 598 + }, + { + "entropy": 0.1254271175712347, + "epoch": 0.7331701346389229, + "grad_norm": 0.13346175849437714, + "learning_rate": 5.1876093741733224e-05, + "loss": 0.1362, + "mean_token_accuracy": 0.9497381746768951, + "num_tokens": 1719652.0, + "step": 599 + }, + { + "entropy": 0.11632178910076618, + "epoch": 0.7343941248470012, + "grad_norm": 0.11797245591878891, + "learning_rate": 5.1603986897152555e-05, + "loss": 0.1233, + "mean_token_accuracy": 0.9527034163475037, + "num_tokens": 1722551.0, + "step": 600 + }, + { + "entropy": 0.11504321731626987, + "epoch": 0.7356181150550796, + "grad_norm": 0.1295362114906311, + "learning_rate": 5.1332798875893805e-05, + "loss": 0.1298, + "mean_token_accuracy": 0.9546101838350296, + "num_tokens": 1725446.0, + "step": 601 + }, + { + "entropy": 0.12104380317032337, + "epoch": 0.7368421052631579, + "grad_norm": 0.13073742389678955, + "learning_rate": 5.106253394492435e-05, + "loss": 0.1257, + "mean_token_accuracy": 0.9527780264616013, + "num_tokens": 1728405.0, + "step": 602 + }, + { + "entropy": 0.13663169369101524, + "epoch": 0.7380660954712362, + "grad_norm": 0.15089236199855804, + "learning_rate": 5.0793196356687244e-05, + "loss": 0.144, + "mean_token_accuracy": 0.9498279839754105, + "num_tokens": 1731492.0, + "step": 603 + }, + { + "entropy": 0.12454614788293839, + "epoch": 0.7392900856793145, + "grad_norm": 0.09851514548063278, + "learning_rate": 5.052479034903439e-05, + "loss": 0.1183, + "mean_token_accuracy": 0.9516207277774811, + "num_tokens": 1734409.0, + "step": 604 + }, + { + "entropy": 0.1272672340273857, + "epoch": 0.7405140758873929, + "grad_norm": 0.1096467450261116, + "learning_rate": 5.0257320145159925e-05, + "loss": 0.1188, + "mean_token_accuracy": 0.9545516818761826, + "num_tokens": 1737325.0, + "step": 605 + }, + { + "entropy": 0.1244563814252615, + "epoch": 0.7417380660954712, + "grad_norm": 0.09899038076400757, + "learning_rate": 4.9990789953533767e-05, + "loss": 0.1149, + "mean_token_accuracy": 0.9555229246616364, + "num_tokens": 1740214.0, + "step": 606 + }, + { + "entropy": 0.12568113766610622, + "epoch": 0.7429620563035496, + "grad_norm": 0.12539620697498322, + "learning_rate": 4.972520396783525e-05, + "loss": 0.1356, + "mean_token_accuracy": 0.9545812457799911, + "num_tokens": 1743151.0, + "step": 607 + }, + { + "entropy": 0.131044153124094, + "epoch": 0.7441860465116279, + "grad_norm": 0.11705169081687927, + "learning_rate": 4.946056636688728e-05, + "loss": 0.133, + "mean_token_accuracy": 0.9500038176774979, + "num_tokens": 1745924.0, + "step": 608 + }, + { + "entropy": 0.1339541431516409, + "epoch": 0.7454100367197063, + "grad_norm": 0.10650363564491272, + "learning_rate": 4.919688131459058e-05, + "loss": 0.1234, + "mean_token_accuracy": 0.9516549110412598, + "num_tokens": 1748799.0, + "step": 609 + }, + { + "entropy": 0.1278488989919424, + "epoch": 0.7466340269277846, + "grad_norm": 0.07948151975870132, + "learning_rate": 4.893415295985812e-05, + "loss": 0.1163, + "mean_token_accuracy": 0.9581326395273209, + "num_tokens": 1751609.0, + "step": 610 + }, + { + "entropy": 0.12218378856778145, + "epoch": 0.7478580171358629, + "grad_norm": 0.09037908911705017, + "learning_rate": 4.86723854365498e-05, + "loss": 0.1193, + "mean_token_accuracy": 0.958241879940033, + "num_tokens": 1754479.0, + "step": 611 + }, + { + "entropy": 0.12393893301486969, + "epoch": 0.7490820073439413, + "grad_norm": 0.1619521975517273, + "learning_rate": 4.841158286340756e-05, + "loss": 0.1411, + "mean_token_accuracy": 0.9510868191719055, + "num_tokens": 1757435.0, + "step": 612 + }, + { + "entropy": 0.12574773095548153, + "epoch": 0.7503059975520195, + "grad_norm": 0.09683439880609512, + "learning_rate": 4.8151749343990416e-05, + "loss": 0.1244, + "mean_token_accuracy": 0.9555336087942123, + "num_tokens": 1760411.0, + "step": 613 + }, + { + "entropy": 0.11535865068435669, + "epoch": 0.7515299877600979, + "grad_norm": 0.07456349581480026, + "learning_rate": 4.789288896660994e-05, + "loss": 0.1117, + "mean_token_accuracy": 0.9621769338846207, + "num_tokens": 1763218.0, + "step": 614 + }, + { + "entropy": 0.12458016909658909, + "epoch": 0.7527539779681762, + "grad_norm": 0.10451184213161469, + "learning_rate": 4.7635005804265956e-05, + "loss": 0.1251, + "mean_token_accuracy": 0.9555677026510239, + "num_tokens": 1766149.0, + "step": 615 + }, + { + "entropy": 0.12277675047516823, + "epoch": 0.7539779681762546, + "grad_norm": 0.13702791929244995, + "learning_rate": 4.737810391458252e-05, + "loss": 0.1354, + "mean_token_accuracy": 0.9528448432683945, + "num_tokens": 1768990.0, + "step": 616 + }, + { + "entropy": 0.12686723470687866, + "epoch": 0.7552019583843329, + "grad_norm": 0.11136746406555176, + "learning_rate": 4.71221873397439e-05, + "loss": 0.1256, + "mean_token_accuracy": 0.9526865780353546, + "num_tokens": 1771791.0, + "step": 617 + }, + { + "entropy": 0.12564765103161335, + "epoch": 0.7564259485924113, + "grad_norm": 0.09914430230855942, + "learning_rate": 4.686726010643108e-05, + "loss": 0.1196, + "mean_token_accuracy": 0.9513101577758789, + "num_tokens": 1774674.0, + "step": 618 + }, + { + "entropy": 0.1232032049447298, + "epoch": 0.7576499388004896, + "grad_norm": 0.09024959802627563, + "learning_rate": 4.661332622575853e-05, + "loss": 0.1243, + "mean_token_accuracy": 0.9561030119657516, + "num_tokens": 1777659.0, + "step": 619 + }, + { + "entropy": 0.1146812904626131, + "epoch": 0.758873929008568, + "grad_norm": 0.0843362957239151, + "learning_rate": 4.6360389693210735e-05, + "loss": 0.119, + "mean_token_accuracy": 0.9573349505662918, + "num_tokens": 1780392.0, + "step": 620 + }, + { + "entropy": 0.1336996890604496, + "epoch": 0.7600979192166463, + "grad_norm": 0.115569107234478, + "learning_rate": 4.610845448857975e-05, + "loss": 0.1278, + "mean_token_accuracy": 0.9514962285757065, + "num_tokens": 1783245.0, + "step": 621 + }, + { + "entropy": 0.11678878776729107, + "epoch": 0.7613219094247246, + "grad_norm": 0.0956239253282547, + "learning_rate": 4.5857524575902375e-05, + "loss": 0.1193, + "mean_token_accuracy": 0.9576065689325333, + "num_tokens": 1786089.0, + "step": 622 + }, + { + "entropy": 0.12730038166046143, + "epoch": 0.762545899632803, + "grad_norm": 0.11260199546813965, + "learning_rate": 4.560760390339762e-05, + "loss": 0.1233, + "mean_token_accuracy": 0.9550187587738037, + "num_tokens": 1788932.0, + "step": 623 + }, + { + "entropy": 0.11902902461588383, + "epoch": 0.7637698898408812, + "grad_norm": 0.08219322562217712, + "learning_rate": 4.535869640340494e-05, + "loss": 0.1214, + "mean_token_accuracy": 0.9574473947286606, + "num_tokens": 1791721.0, + "step": 624 + }, + { + "entropy": 0.1282477881759405, + "epoch": 0.7649938800489596, + "grad_norm": 0.10945278406143188, + "learning_rate": 4.511080599232214e-05, + "loss": 0.1339, + "mean_token_accuracy": 0.950457438826561, + "num_tokens": 1794652.0, + "step": 625 + }, + { + "entropy": 0.11842477694153786, + "epoch": 0.7662178702570379, + "grad_norm": 0.09658168256282806, + "learning_rate": 4.486393657054369e-05, + "loss": 0.1234, + "mean_token_accuracy": 0.9555472582578659, + "num_tokens": 1797478.0, + "step": 626 + }, + { + "entropy": 0.13103088922798634, + "epoch": 0.7674418604651163, + "grad_norm": 0.11139772087335587, + "learning_rate": 4.461809202239961e-05, + "loss": 0.1394, + "mean_token_accuracy": 0.9520025104284286, + "num_tokens": 1800327.0, + "step": 627 + }, + { + "entropy": 0.12021008506417274, + "epoch": 0.7686658506731946, + "grad_norm": 0.09884604811668396, + "learning_rate": 4.4373276216094075e-05, + "loss": 0.1252, + "mean_token_accuracy": 0.9551528096199036, + "num_tokens": 1803244.0, + "step": 628 + }, + { + "entropy": 0.12243433855473995, + "epoch": 0.769889840881273, + "grad_norm": 0.10126375406980515, + "learning_rate": 4.412949300364473e-05, + "loss": 0.1241, + "mean_token_accuracy": 0.9508773386478424, + "num_tokens": 1806165.0, + "step": 629 + }, + { + "entropy": 0.13152855448424816, + "epoch": 0.7711138310893513, + "grad_norm": 0.11569098383188248, + "learning_rate": 4.388674622082202e-05, + "loss": 0.1286, + "mean_token_accuracy": 0.9547057598829269, + "num_tokens": 1809059.0, + "step": 630 + }, + { + "entropy": 0.13421803154051304, + "epoch": 0.7723378212974297, + "grad_norm": 0.13831324875354767, + "learning_rate": 4.364503968708885e-05, + "loss": 0.131, + "mean_token_accuracy": 0.9558353126049042, + "num_tokens": 1812085.0, + "step": 631 + }, + { + "entropy": 0.1279985811561346, + "epoch": 0.773561811505508, + "grad_norm": 0.10028630495071411, + "learning_rate": 4.340437720554047e-05, + "loss": 0.1233, + "mean_token_accuracy": 0.9506480246782303, + "num_tokens": 1814950.0, + "step": 632 + }, + { + "entropy": 0.11865941248834133, + "epoch": 0.7747858017135862, + "grad_norm": 0.08036880195140839, + "learning_rate": 4.316476256284464e-05, + "loss": 0.1133, + "mean_token_accuracy": 0.957447960972786, + "num_tokens": 1817801.0, + "step": 633 + }, + { + "entropy": 0.12325590662658215, + "epoch": 0.7760097919216646, + "grad_norm": 0.09269207715988159, + "learning_rate": 4.2926199529182085e-05, + "loss": 0.1152, + "mean_token_accuracy": 0.9558061808347702, + "num_tokens": 1820680.0, + "step": 634 + }, + { + "entropy": 0.1231058668345213, + "epoch": 0.7772337821297429, + "grad_norm": 0.09198976308107376, + "learning_rate": 4.268869185818712e-05, + "loss": 0.1184, + "mean_token_accuracy": 0.9557169675827026, + "num_tokens": 1823538.0, + "step": 635 + }, + { + "entropy": 0.13273220881819725, + "epoch": 0.7784577723378213, + "grad_norm": 0.10118620097637177, + "learning_rate": 4.245224328688861e-05, + "loss": 0.1218, + "mean_token_accuracy": 0.9580679535865784, + "num_tokens": 1826215.0, + "step": 636 + }, + { + "entropy": 0.11764829978346825, + "epoch": 0.7796817625458996, + "grad_norm": 0.08317483961582184, + "learning_rate": 4.221685753565121e-05, + "loss": 0.1152, + "mean_token_accuracy": 0.9570571631193161, + "num_tokens": 1829208.0, + "step": 637 + }, + { + "entropy": 0.1261823009699583, + "epoch": 0.780905752753978, + "grad_norm": 0.08955132216215134, + "learning_rate": 4.198253830811677e-05, + "loss": 0.1211, + "mean_token_accuracy": 0.954292356967926, + "num_tokens": 1831946.0, + "step": 638 + }, + { + "entropy": 0.12664373591542244, + "epoch": 0.7821297429620563, + "grad_norm": 0.08577796816825867, + "learning_rate": 4.1749289291146054e-05, + "loss": 0.1185, + "mean_token_accuracy": 0.956654354929924, + "num_tokens": 1834860.0, + "step": 639 + }, + { + "entropy": 0.12379522807896137, + "epoch": 0.7833537331701347, + "grad_norm": 0.09162270277738571, + "learning_rate": 4.1517114154760774e-05, + "loss": 0.1196, + "mean_token_accuracy": 0.953244611620903, + "num_tokens": 1837744.0, + "step": 640 + }, + { + "entropy": 0.14236753806471825, + "epoch": 0.784577723378213, + "grad_norm": 0.12266378104686737, + "learning_rate": 4.1286016552085885e-05, + "loss": 0.1362, + "mean_token_accuracy": 0.9490169733762741, + "num_tokens": 1840617.0, + "step": 641 + }, + { + "entropy": 0.12323613837361336, + "epoch": 0.7858017135862914, + "grad_norm": 0.0957561731338501, + "learning_rate": 4.1056000119291995e-05, + "loss": 0.122, + "mean_token_accuracy": 0.9566615670919418, + "num_tokens": 1843509.0, + "step": 642 + }, + { + "entropy": 0.1151697114109993, + "epoch": 0.7870257037943696, + "grad_norm": 0.06570401787757874, + "learning_rate": 4.082706847553817e-05, + "loss": 0.1084, + "mean_token_accuracy": 0.9622858166694641, + "num_tokens": 1846205.0, + "step": 643 + }, + { + "entropy": 0.1207436453551054, + "epoch": 0.7882496940024479, + "grad_norm": 0.08406161516904831, + "learning_rate": 4.0599225222915115e-05, + "loss": 0.1141, + "mean_token_accuracy": 0.9559642970561981, + "num_tokens": 1849064.0, + "step": 644 + }, + { + "entropy": 0.11759369447827339, + "epoch": 0.7894736842105263, + "grad_norm": 0.11583132296800613, + "learning_rate": 4.0372473946388367e-05, + "loss": 0.1262, + "mean_token_accuracy": 0.9552195817232132, + "num_tokens": 1852072.0, + "step": 645 + }, + { + "entropy": 0.12121845223009586, + "epoch": 0.7906976744186046, + "grad_norm": 0.10158070176839828, + "learning_rate": 4.0146818213741875e-05, + "loss": 0.1212, + "mean_token_accuracy": 0.9539550244808197, + "num_tokens": 1855059.0, + "step": 646 + }, + { + "entropy": 0.1155339702963829, + "epoch": 0.791921664626683, + "grad_norm": 0.09081114083528519, + "learning_rate": 3.992226157552206e-05, + "loss": 0.1153, + "mean_token_accuracy": 0.9551489055156708, + "num_tokens": 1857910.0, + "step": 647 + }, + { + "entropy": 0.11592069268226624, + "epoch": 0.7931456548347613, + "grad_norm": 0.07916578650474548, + "learning_rate": 3.96988075649816e-05, + "loss": 0.1104, + "mean_token_accuracy": 0.958372950553894, + "num_tokens": 1860735.0, + "step": 648 + }, + { + "entropy": 0.12923837453126907, + "epoch": 0.7943696450428397, + "grad_norm": 0.10375561565160751, + "learning_rate": 3.9476459698024234e-05, + "loss": 0.1191, + "mean_token_accuracy": 0.9554652124643326, + "num_tokens": 1863572.0, + "step": 649 + }, + { + "entropy": 0.11863233521580696, + "epoch": 0.795593635250918, + "grad_norm": 0.11345972865819931, + "learning_rate": 3.925522147314915e-05, + "loss": 0.1182, + "mean_token_accuracy": 0.951953649520874, + "num_tokens": 1866462.0, + "step": 650 + }, + { + "entropy": 0.11698413081467152, + "epoch": 0.7968176254589964, + "grad_norm": 0.08065979927778244, + "learning_rate": 3.903509637139604e-05, + "loss": 0.1129, + "mean_token_accuracy": 0.9577423930168152, + "num_tokens": 1869225.0, + "step": 651 + }, + { + "entropy": 0.11750395968556404, + "epoch": 0.7980416156670747, + "grad_norm": 0.10913274437189102, + "learning_rate": 3.881608785629038e-05, + "loss": 0.1196, + "mean_token_accuracy": 0.9569533616304398, + "num_tokens": 1872162.0, + "step": 652 + }, + { + "entropy": 0.12503521516919136, + "epoch": 0.799265605875153, + "grad_norm": 0.1187901571393013, + "learning_rate": 3.8598199373788846e-05, + "loss": 0.1206, + "mean_token_accuracy": 0.9528372883796692, + "num_tokens": 1874929.0, + "step": 653 + }, + { + "entropy": 0.12583926506340504, + "epoch": 0.8004895960832313, + "grad_norm": 0.11558634042739868, + "learning_rate": 3.83814343522251e-05, + "loss": 0.1198, + "mean_token_accuracy": 0.9533436447381973, + "num_tokens": 1877744.0, + "step": 654 + }, + { + "entropy": 0.11790541745722294, + "epoch": 0.8017135862913096, + "grad_norm": 0.10242120176553726, + "learning_rate": 3.8165796202255966e-05, + "loss": 0.1231, + "mean_token_accuracy": 0.9561738073825836, + "num_tokens": 1880633.0, + "step": 655 + }, + { + "entropy": 0.11141330376267433, + "epoch": 0.802937576499388, + "grad_norm": 0.10717166215181351, + "learning_rate": 3.795128831680758e-05, + "loss": 0.1193, + "mean_token_accuracy": 0.9573391228914261, + "num_tokens": 1883429.0, + "step": 656 + }, + { + "entropy": 0.12505029328167439, + "epoch": 0.8041615667074663, + "grad_norm": 0.10949593782424927, + "learning_rate": 3.773791407102214e-05, + "loss": 0.121, + "mean_token_accuracy": 0.9530365169048309, + "num_tokens": 1886313.0, + "step": 657 + }, + { + "entropy": 0.1187698245048523, + "epoch": 0.8053855569155447, + "grad_norm": 0.08860643208026886, + "learning_rate": 3.7525676822204724e-05, + "loss": 0.1135, + "mean_token_accuracy": 0.959896519780159, + "num_tokens": 1889205.0, + "step": 658 + }, + { + "entropy": 0.11789131909608841, + "epoch": 0.806609547123623, + "grad_norm": 0.09528093039989471, + "learning_rate": 3.731457990977062e-05, + "loss": 0.1096, + "mean_token_accuracy": 0.9584230929613113, + "num_tokens": 1892171.0, + "step": 659 + }, + { + "entropy": 0.1153755821287632, + "epoch": 0.8078335373317014, + "grad_norm": 0.07718336582183838, + "learning_rate": 3.710462665519251e-05, + "loss": 0.1105, + "mean_token_accuracy": 0.9586650729179382, + "num_tokens": 1894978.0, + "step": 660 + }, + { + "entropy": 0.10932862013578415, + "epoch": 0.8090575275397797, + "grad_norm": 0.0870288535952568, + "learning_rate": 3.689582036194844e-05, + "loss": 0.1089, + "mean_token_accuracy": 0.9568508118391037, + "num_tokens": 1897846.0, + "step": 661 + }, + { + "entropy": 0.1201893761754036, + "epoch": 0.8102815177478581, + "grad_norm": 0.09508999437093735, + "learning_rate": 3.6688164315469785e-05, + "loss": 0.113, + "mean_token_accuracy": 0.957280620932579, + "num_tokens": 1900758.0, + "step": 662 + }, + { + "entropy": 0.1151625607162714, + "epoch": 0.8115055079559363, + "grad_norm": 0.13085341453552246, + "learning_rate": 3.64816617830895e-05, + "loss": 0.1213, + "mean_token_accuracy": 0.957948312163353, + "num_tokens": 1903641.0, + "step": 663 + }, + { + "entropy": 0.12170935794711113, + "epoch": 0.8127294981640147, + "grad_norm": 0.12254071980714798, + "learning_rate": 3.627631601399073e-05, + "loss": 0.1193, + "mean_token_accuracy": 0.953119084239006, + "num_tokens": 1906546.0, + "step": 664 + }, + { + "entropy": 0.10714775323867798, + "epoch": 0.813953488372093, + "grad_norm": 0.0794091746211052, + "learning_rate": 3.607213023915579e-05, + "loss": 0.1106, + "mean_token_accuracy": 0.9621934741735458, + "num_tokens": 1909369.0, + "step": 665 + }, + { + "entropy": 0.11147678643465042, + "epoch": 0.8151774785801713, + "grad_norm": 0.1575685441493988, + "learning_rate": 3.586910767131506e-05, + "loss": 0.123, + "mean_token_accuracy": 0.9597513526678085, + "num_tokens": 1912398.0, + "step": 666 + }, + { + "entropy": 0.1143040880560875, + "epoch": 0.8164014687882497, + "grad_norm": 0.08880111575126648, + "learning_rate": 3.566725150489681e-05, + "loss": 0.1091, + "mean_token_accuracy": 0.9554980844259262, + "num_tokens": 1915184.0, + "step": 667 + }, + { + "entropy": 0.11480196006596088, + "epoch": 0.817625458996328, + "grad_norm": 0.1281009167432785, + "learning_rate": 3.5466564915976644e-05, + "loss": 0.1151, + "mean_token_accuracy": 0.9588472992181778, + "num_tokens": 1918061.0, + "step": 668 + }, + { + "entropy": 0.12977596558630466, + "epoch": 0.8188494492044064, + "grad_norm": 0.1156676709651947, + "learning_rate": 3.526705106222761e-05, + "loss": 0.118, + "mean_token_accuracy": 0.9537792056798935, + "num_tokens": 1920891.0, + "step": 669 + }, + { + "entropy": 0.11432594619691372, + "epoch": 0.8200734394124847, + "grad_norm": 0.12913236021995544, + "learning_rate": 3.506871308287059e-05, + "loss": 0.1131, + "mean_token_accuracy": 0.9583148658275604, + "num_tokens": 1923809.0, + "step": 670 + }, + { + "entropy": 0.12306522391736507, + "epoch": 0.8212974296205631, + "grad_norm": 0.17781168222427368, + "learning_rate": 3.4871554098624783e-05, + "loss": 0.1403, + "mean_token_accuracy": 0.9469188004732132, + "num_tokens": 1926645.0, + "step": 671 + }, + { + "entropy": 0.11788520961999893, + "epoch": 0.8225214198286414, + "grad_norm": 0.13206161558628082, + "learning_rate": 3.467557721165873e-05, + "loss": 0.134, + "mean_token_accuracy": 0.9531625211238861, + "num_tokens": 1929502.0, + "step": 672 + }, + { + "entropy": 0.11731364019215107, + "epoch": 0.8237454100367197, + "grad_norm": 0.10521397739648819, + "learning_rate": 3.44807855055414e-05, + "loss": 0.1184, + "mean_token_accuracy": 0.9579882621765137, + "num_tokens": 1932259.0, + "step": 673 + }, + { + "entropy": 0.1247300636023283, + "epoch": 0.824969400244798, + "grad_norm": 0.12718763947486877, + "learning_rate": 3.428718204519369e-05, + "loss": 0.1235, + "mean_token_accuracy": 0.9586955606937408, + "num_tokens": 1935083.0, + "step": 674 + }, + { + "entropy": 0.12399046681821346, + "epoch": 0.8261933904528764, + "grad_norm": 0.0943707749247551, + "learning_rate": 3.409476987684031e-05, + "loss": 0.1123, + "mean_token_accuracy": 0.9583354443311691, + "num_tokens": 1937955.0, + "step": 675 + }, + { + "entropy": 0.11029119975864887, + "epoch": 0.8274173806609547, + "grad_norm": 0.07151154428720474, + "learning_rate": 3.3903552027961615e-05, + "loss": 0.1067, + "mean_token_accuracy": 0.9561154991388321, + "num_tokens": 1940923.0, + "step": 676 + }, + { + "entropy": 0.11008168756961823, + "epoch": 0.828641370869033, + "grad_norm": 0.07453305274248123, + "learning_rate": 3.371353150724629e-05, + "loss": 0.1118, + "mean_token_accuracy": 0.9586080759763718, + "num_tokens": 1943760.0, + "step": 677 + }, + { + "entropy": 0.12976218573749065, + "epoch": 0.8298653610771114, + "grad_norm": 0.11454939842224121, + "learning_rate": 3.352471130454372e-05, + "loss": 0.1225, + "mean_token_accuracy": 0.9511912018060684, + "num_tokens": 1946796.0, + "step": 678 + }, + { + "entropy": 0.11308914795517921, + "epoch": 0.8310893512851897, + "grad_norm": 0.1108722910284996, + "learning_rate": 3.3337094390817105e-05, + "loss": 0.1177, + "mean_token_accuracy": 0.9581098556518555, + "num_tokens": 1949629.0, + "step": 679 + }, + { + "entropy": 0.12419293634593487, + "epoch": 0.8323133414932681, + "grad_norm": 0.09454985707998276, + "learning_rate": 3.315068371809668e-05, + "loss": 0.1173, + "mean_token_accuracy": 0.9585745632648468, + "num_tokens": 1952583.0, + "step": 680 + }, + { + "entropy": 0.11919602937996387, + "epoch": 0.8335373317013464, + "grad_norm": 0.12401589751243591, + "learning_rate": 3.296548221943326e-05, + "loss": 0.1255, + "mean_token_accuracy": 0.9559617787599564, + "num_tokens": 1955444.0, + "step": 681 + }, + { + "entropy": 0.12096353992819786, + "epoch": 0.8347613219094248, + "grad_norm": 0.21755661070346832, + "learning_rate": 3.2781492808852066e-05, + "loss": 0.1292, + "mean_token_accuracy": 0.9521990418434143, + "num_tokens": 1958361.0, + "step": 682 + }, + { + "entropy": 0.11680376157164574, + "epoch": 0.835985312117503, + "grad_norm": 0.08833280950784683, + "learning_rate": 3.259871838130695e-05, + "loss": 0.1107, + "mean_token_accuracy": 0.95879727602005, + "num_tokens": 1961199.0, + "step": 683 + }, + { + "entropy": 0.11357714422047138, + "epoch": 0.8372093023255814, + "grad_norm": 0.07641427963972092, + "learning_rate": 3.241716181263477e-05, + "loss": 0.1092, + "mean_token_accuracy": 0.9609952569007874, + "num_tokens": 1964058.0, + "step": 684 + }, + { + "entropy": 0.12019915319979191, + "epoch": 0.8384332925336597, + "grad_norm": 0.1069817990064621, + "learning_rate": 3.2236825959510145e-05, + "loss": 0.1212, + "mean_token_accuracy": 0.9510638415813446, + "num_tokens": 1966879.0, + "step": 685 + }, + { + "entropy": 0.12963379174470901, + "epoch": 0.8396572827417381, + "grad_norm": 0.11880864948034286, + "learning_rate": 3.205771365940052e-05, + "loss": 0.12, + "mean_token_accuracy": 0.9553559422492981, + "num_tokens": 1969728.0, + "step": 686 + }, + { + "entropy": 0.1285783015191555, + "epoch": 0.8408812729498164, + "grad_norm": 0.11469470709562302, + "learning_rate": 3.1879827730521614e-05, + "loss": 0.119, + "mean_token_accuracy": 0.9536116123199463, + "num_tokens": 1972620.0, + "step": 687 + }, + { + "entropy": 0.12211041525006294, + "epoch": 0.8421052631578947, + "grad_norm": 0.10508140921592712, + "learning_rate": 3.1703170971792894e-05, + "loss": 0.1184, + "mean_token_accuracy": 0.9563441425561905, + "num_tokens": 1975450.0, + "step": 688 + }, + { + "entropy": 0.11813651584088802, + "epoch": 0.8433292533659731, + "grad_norm": 0.08082893490791321, + "learning_rate": 3.1527746162793667e-05, + "loss": 0.113, + "mean_token_accuracy": 0.9517210870981216, + "num_tokens": 1978332.0, + "step": 689 + }, + { + "entropy": 0.1302896160632372, + "epoch": 0.8445532435740514, + "grad_norm": 0.13204847276210785, + "learning_rate": 3.135355606371936e-05, + "loss": 0.1277, + "mean_token_accuracy": 0.9551141262054443, + "num_tokens": 1981177.0, + "step": 690 + }, + { + "entropy": 0.11883826553821564, + "epoch": 0.8457772337821298, + "grad_norm": 0.08550182729959488, + "learning_rate": 3.118060341533795e-05, + "loss": 0.113, + "mean_token_accuracy": 0.9571241289377213, + "num_tokens": 1983903.0, + "step": 691 + }, + { + "entropy": 0.11867485381662846, + "epoch": 0.847001223990208, + "grad_norm": 0.09806519001722336, + "learning_rate": 3.100889093894699e-05, + "loss": 0.1187, + "mean_token_accuracy": 0.9581058174371719, + "num_tokens": 1986735.0, + "step": 692 + }, + { + "entropy": 0.11867827735841274, + "epoch": 0.8482252141982864, + "grad_norm": 0.09867977350950241, + "learning_rate": 3.083842133633076e-05, + "loss": 0.1226, + "mean_token_accuracy": 0.9552352279424667, + "num_tokens": 1989503.0, + "step": 693 + }, + { + "entropy": 0.11490399949252605, + "epoch": 0.8494492044063647, + "grad_norm": 0.09465477615594864, + "learning_rate": 3.0669197289717624e-05, + "loss": 0.1182, + "mean_token_accuracy": 0.9564501196146011, + "num_tokens": 1992352.0, + "step": 694 + }, + { + "entropy": 0.12860754877328873, + "epoch": 0.8506731946144431, + "grad_norm": 0.11305855214595795, + "learning_rate": 3.050122146173805e-05, + "loss": 0.123, + "mean_token_accuracy": 0.9518560320138931, + "num_tokens": 1995352.0, + "step": 695 + }, + { + "entropy": 0.10911222733557224, + "epoch": 0.8518971848225214, + "grad_norm": 0.08079565316438675, + "learning_rate": 3.0334496495382525e-05, + "loss": 0.1098, + "mean_token_accuracy": 0.9573355317115784, + "num_tokens": 1998280.0, + "step": 696 + }, + { + "entropy": 0.1165240965783596, + "epoch": 0.8531211750305998, + "grad_norm": 0.08598501235246658, + "learning_rate": 3.0169025013960052e-05, + "loss": 0.1124, + "mean_token_accuracy": 0.956530749797821, + "num_tokens": 2001084.0, + "step": 697 + }, + { + "entropy": 0.11340131424367428, + "epoch": 0.8543451652386781, + "grad_norm": 0.176701620221138, + "learning_rate": 3.0004809621056902e-05, + "loss": 0.1179, + "mean_token_accuracy": 0.9546279609203339, + "num_tokens": 2004015.0, + "step": 698 + }, + { + "entropy": 0.11915582977235317, + "epoch": 0.8555691554467564, + "grad_norm": 0.1006980836391449, + "learning_rate": 2.9841852900495554e-05, + "loss": 0.1177, + "mean_token_accuracy": 0.9544068276882172, + "num_tokens": 2006886.0, + "step": 699 + }, + { + "entropy": 0.10795664228498936, + "epoch": 0.8567931456548348, + "grad_norm": 0.09037896990776062, + "learning_rate": 2.9680157416294108e-05, + "loss": 0.115, + "mean_token_accuracy": 0.9568637609481812, + "num_tokens": 2009875.0, + "step": 700 + }, + { + "entropy": 0.11119433119893074, + "epoch": 0.8580171358629131, + "grad_norm": 0.11969010531902313, + "learning_rate": 2.951972571262599e-05, + "loss": 0.1211, + "mean_token_accuracy": 0.9581701010465622, + "num_tokens": 2012708.0, + "step": 701 + }, + { + "entropy": 0.11344976350665092, + "epoch": 0.8592411260709915, + "grad_norm": 0.08981240540742874, + "learning_rate": 2.9360560313779783e-05, + "loss": 0.111, + "mean_token_accuracy": 0.9578855633735657, + "num_tokens": 2015620.0, + "step": 702 + }, + { + "entropy": 0.12157167680561543, + "epoch": 0.8604651162790697, + "grad_norm": 0.15222270786762238, + "learning_rate": 2.9202663724119623e-05, + "loss": 0.1294, + "mean_token_accuracy": 0.9513806402683258, + "num_tokens": 2018531.0, + "step": 703 + }, + { + "entropy": 0.10307926498353481, + "epoch": 0.8616891064871481, + "grad_norm": 0.09640531986951828, + "learning_rate": 2.9046038428045718e-05, + "loss": 0.1113, + "mean_token_accuracy": 0.9568430930376053, + "num_tokens": 2021531.0, + "step": 704 + }, + { + "entropy": 0.11375969462096691, + "epoch": 0.8629130966952264, + "grad_norm": 0.11228352785110474, + "learning_rate": 2.8890686889955386e-05, + "loss": 0.1245, + "mean_token_accuracy": 0.9545768797397614, + "num_tokens": 2024352.0, + "step": 705 + }, + { + "entropy": 0.11053463816642761, + "epoch": 0.8641370869033048, + "grad_norm": 0.15762768685817719, + "learning_rate": 2.873661155420411e-05, + "loss": 0.1295, + "mean_token_accuracy": 0.9571340829133987, + "num_tokens": 2027159.0, + "step": 706 + }, + { + "entropy": 0.12471511960029602, + "epoch": 0.8653610771113831, + "grad_norm": 0.1580287367105484, + "learning_rate": 2.8583814845067157e-05, + "loss": 0.1262, + "mean_token_accuracy": 0.949680358171463, + "num_tokens": 2030056.0, + "step": 707 + }, + { + "entropy": 0.11340729705989361, + "epoch": 0.8665850673194615, + "grad_norm": 0.08172990381717682, + "learning_rate": 2.8432299166701508e-05, + "loss": 0.1112, + "mean_token_accuracy": 0.9557855427265167, + "num_tokens": 2032806.0, + "step": 708 + }, + { + "entropy": 0.1142375934869051, + "epoch": 0.8678090575275398, + "grad_norm": 0.08502102643251419, + "learning_rate": 2.8282066903107896e-05, + "loss": 0.1095, + "mean_token_accuracy": 0.9587309509515762, + "num_tokens": 2035727.0, + "step": 709 + }, + { + "entropy": 0.11130358651280403, + "epoch": 0.8690330477356181, + "grad_norm": 0.10156004875898361, + "learning_rate": 2.8133120418093347e-05, + "loss": 0.123, + "mean_token_accuracy": 0.9569457918405533, + "num_tokens": 2038693.0, + "step": 710 + }, + { + "entropy": 0.1261342130601406, + "epoch": 0.8702570379436965, + "grad_norm": 0.10409277677536011, + "learning_rate": 2.798546205523405e-05, + "loss": 0.1142, + "mean_token_accuracy": 0.9579436182975769, + "num_tokens": 2041581.0, + "step": 711 + }, + { + "entropy": 0.11846861988306046, + "epoch": 0.8714810281517748, + "grad_norm": 0.1481771618127823, + "learning_rate": 2.7839094137838406e-05, + "loss": 0.1269, + "mean_token_accuracy": 0.9546496123075485, + "num_tokens": 2044679.0, + "step": 712 + }, + { + "entropy": 0.11766882613301277, + "epoch": 0.8727050183598531, + "grad_norm": 0.11407241970300674, + "learning_rate": 2.7694018968910464e-05, + "loss": 0.1194, + "mean_token_accuracy": 0.9543810039758682, + "num_tokens": 2047617.0, + "step": 713 + }, + { + "entropy": 0.11928965337574482, + "epoch": 0.8739290085679314, + "grad_norm": 0.11081739515066147, + "learning_rate": 2.755023883111374e-05, + "loss": 0.1209, + "mean_token_accuracy": 0.9552731812000275, + "num_tokens": 2050382.0, + "step": 714 + }, + { + "entropy": 0.12090586498379707, + "epoch": 0.8751529987760098, + "grad_norm": 0.12244018912315369, + "learning_rate": 2.74077559867353e-05, + "loss": 0.1247, + "mean_token_accuracy": 0.9528168886899948, + "num_tokens": 2053215.0, + "step": 715 + }, + { + "entropy": 0.11345584504306316, + "epoch": 0.8763769889840881, + "grad_norm": 0.0819413885474205, + "learning_rate": 2.7266572677650103e-05, + "loss": 0.112, + "mean_token_accuracy": 0.9579892754554749, + "num_tokens": 2055985.0, + "step": 716 + }, + { + "entropy": 0.1167902871966362, + "epoch": 0.8776009791921665, + "grad_norm": 0.09099327772855759, + "learning_rate": 2.7126691125285776e-05, + "loss": 0.1127, + "mean_token_accuracy": 0.9561265259981155, + "num_tokens": 2058887.0, + "step": 717 + }, + { + "entropy": 0.1067619789391756, + "epoch": 0.8788249694002448, + "grad_norm": 0.10232370346784592, + "learning_rate": 2.6988113530587688e-05, + "loss": 0.1166, + "mean_token_accuracy": 0.9552911520004272, + "num_tokens": 2061726.0, + "step": 718 + }, + { + "entropy": 0.11897000297904015, + "epoch": 0.8800489596083231, + "grad_norm": 0.09379458427429199, + "learning_rate": 2.6850842073984196e-05, + "loss": 0.1165, + "mean_token_accuracy": 0.9550831317901611, + "num_tokens": 2064471.0, + "step": 719 + }, + { + "entropy": 0.12165862321853638, + "epoch": 0.8812729498164015, + "grad_norm": 0.08941765874624252, + "learning_rate": 2.6714878915352504e-05, + "loss": 0.1175, + "mean_token_accuracy": 0.9547281861305237, + "num_tokens": 2067203.0, + "step": 720 + }, + { + "entropy": 0.12598843686282635, + "epoch": 0.8824969400244798, + "grad_norm": 0.1365310102701187, + "learning_rate": 2.6580226193984587e-05, + "loss": 0.1344, + "mean_token_accuracy": 0.9508580416440964, + "num_tokens": 2069948.0, + "step": 721 + }, + { + "entropy": 0.12104038707911968, + "epoch": 0.8837209302325582, + "grad_norm": 0.09977638721466064, + "learning_rate": 2.6446886028553475e-05, + "loss": 0.1266, + "mean_token_accuracy": 0.9558116942644119, + "num_tokens": 2072812.0, + "step": 722 + }, + { + "entropy": 0.11719655059278011, + "epoch": 0.8849449204406364, + "grad_norm": 0.1261460930109024, + "learning_rate": 2.6314860517080063e-05, + "loss": 0.1254, + "mean_token_accuracy": 0.9562242329120636, + "num_tokens": 2075635.0, + "step": 723 + }, + { + "entropy": 0.1069234162569046, + "epoch": 0.8861689106487148, + "grad_norm": 0.06599584966897964, + "learning_rate": 2.618415173689997e-05, + "loss": 0.1053, + "mean_token_accuracy": 0.9568182080984116, + "num_tokens": 2078556.0, + "step": 724 + }, + { + "entropy": 0.11076822690665722, + "epoch": 0.8873929008567931, + "grad_norm": 0.09865738451480865, + "learning_rate": 2.605476174463093e-05, + "loss": 0.1144, + "mean_token_accuracy": 0.9584460258483887, + "num_tokens": 2081421.0, + "step": 725 + }, + { + "entropy": 0.11669192090630531, + "epoch": 0.8886168910648715, + "grad_norm": 0.11038359254598618, + "learning_rate": 2.5926692576140405e-05, + "loss": 0.1159, + "mean_token_accuracy": 0.9582983553409576, + "num_tokens": 2084336.0, + "step": 726 + }, + { + "entropy": 0.11810577102005482, + "epoch": 0.8898408812729498, + "grad_norm": 0.08592569082975388, + "learning_rate": 2.579994624651353e-05, + "loss": 0.1091, + "mean_token_accuracy": 0.957501232624054, + "num_tokens": 2087409.0, + "step": 727 + }, + { + "entropy": 0.1150412131100893, + "epoch": 0.8910648714810282, + "grad_norm": 0.1141510158777237, + "learning_rate": 2.5674524750021463e-05, + "loss": 0.1168, + "mean_token_accuracy": 0.9552905261516571, + "num_tokens": 2090245.0, + "step": 728 + }, + { + "entropy": 0.12018120102584362, + "epoch": 0.8922888616891065, + "grad_norm": 0.08178415149450302, + "learning_rate": 2.555043006008992e-05, + "loss": 0.1134, + "mean_token_accuracy": 0.9573114514350891, + "num_tokens": 2093202.0, + "step": 729 + }, + { + "entropy": 0.13125368021428585, + "epoch": 0.8935128518971848, + "grad_norm": 0.13168644905090332, + "learning_rate": 2.542766412926825e-05, + "loss": 0.1283, + "mean_token_accuracy": 0.9570727199316025, + "num_tokens": 2096160.0, + "step": 730 + }, + { + "entropy": 0.11740885302424431, + "epoch": 0.8947368421052632, + "grad_norm": 0.088973768055439, + "learning_rate": 2.5306228889198598e-05, + "loss": 0.1137, + "mean_token_accuracy": 0.9569295197725296, + "num_tokens": 2098902.0, + "step": 731 + }, + { + "entropy": 0.12641332112252712, + "epoch": 0.8959608323133414, + "grad_norm": 0.09570562839508057, + "learning_rate": 2.518612625058554e-05, + "loss": 0.1222, + "mean_token_accuracy": 0.9525593966245651, + "num_tokens": 2101788.0, + "step": 732 + }, + { + "entropy": 0.12602835148572922, + "epoch": 0.8971848225214198, + "grad_norm": 0.08164916932582855, + "learning_rate": 2.5067358103166084e-05, + "loss": 0.1126, + "mean_token_accuracy": 0.9539099335670471, + "num_tokens": 2104560.0, + "step": 733 + }, + { + "entropy": 0.12724786810576916, + "epoch": 0.8984088127294981, + "grad_norm": 0.09449490904808044, + "learning_rate": 2.4949926315679843e-05, + "loss": 0.1152, + "mean_token_accuracy": 0.9560279250144958, + "num_tokens": 2107348.0, + "step": 734 + }, + { + "entropy": 0.11353614367544651, + "epoch": 0.8996328029375765, + "grad_norm": 0.08708532899618149, + "learning_rate": 2.4833832735839673e-05, + "loss": 0.1116, + "mean_token_accuracy": 0.9574238508939743, + "num_tokens": 2110158.0, + "step": 735 + }, + { + "entropy": 0.10623525828123093, + "epoch": 0.9008567931456548, + "grad_norm": 0.07666978985071182, + "learning_rate": 2.4719079190302656e-05, + "loss": 0.1082, + "mean_token_accuracy": 0.9632962048053741, + "num_tokens": 2112925.0, + "step": 736 + }, + { + "entropy": 0.12935458309948444, + "epoch": 0.9020807833537332, + "grad_norm": 0.1104234904050827, + "learning_rate": 2.460566748464124e-05, + "loss": 0.1211, + "mean_token_accuracy": 0.9554091840982437, + "num_tokens": 2115798.0, + "step": 737 + }, + { + "entropy": 0.11761978082358837, + "epoch": 0.9033047735618115, + "grad_norm": 0.08762868493795395, + "learning_rate": 2.4493599403314924e-05, + "loss": 0.1167, + "mean_token_accuracy": 0.9543918967247009, + "num_tokens": 2118575.0, + "step": 738 + }, + { + "entropy": 0.12938694655895233, + "epoch": 0.9045287637698899, + "grad_norm": 0.13027149438858032, + "learning_rate": 2.4382876709642148e-05, + "loss": 0.1217, + "mean_token_accuracy": 0.9524136632680893, + "num_tokens": 2121538.0, + "step": 739 + }, + { + "entropy": 0.12381874583661556, + "epoch": 0.9057527539779682, + "grad_norm": 0.09400711208581924, + "learning_rate": 2.4273501145772558e-05, + "loss": 0.1168, + "mean_token_accuracy": 0.9578346014022827, + "num_tokens": 2124375.0, + "step": 740 + }, + { + "entropy": 0.11949828825891018, + "epoch": 0.9069767441860465, + "grad_norm": 0.09050319343805313, + "learning_rate": 2.4165474432659588e-05, + "loss": 0.1185, + "mean_token_accuracy": 0.955832377076149, + "num_tokens": 2127247.0, + "step": 741 + }, + { + "entropy": 0.12360114604234695, + "epoch": 0.9082007343941249, + "grad_norm": 0.08105919510126114, + "learning_rate": 2.4058798270033353e-05, + "loss": 0.1126, + "mean_token_accuracy": 0.9540487974882126, + "num_tokens": 2130114.0, + "step": 742 + }, + { + "entropy": 0.12008361890912056, + "epoch": 0.9094247246022031, + "grad_norm": 0.08749601244926453, + "learning_rate": 2.395347433637398e-05, + "loss": 0.1093, + "mean_token_accuracy": 0.9564074575901031, + "num_tokens": 2133065.0, + "step": 743 + }, + { + "entropy": 0.11987698636949062, + "epoch": 0.9106487148102815, + "grad_norm": 0.08700191974639893, + "learning_rate": 2.384950428888512e-05, + "loss": 0.1124, + "mean_token_accuracy": 0.9571013897657394, + "num_tokens": 2135886.0, + "step": 744 + }, + { + "entropy": 0.11860467493534088, + "epoch": 0.9118727050183598, + "grad_norm": 0.11530648916959763, + "learning_rate": 2.3746889763467907e-05, + "loss": 0.1223, + "mean_token_accuracy": 0.9517072439193726, + "num_tokens": 2138734.0, + "step": 745 + }, + { + "entropy": 0.11237789876759052, + "epoch": 0.9130966952264382, + "grad_norm": 0.1200026273727417, + "learning_rate": 2.3645632374695246e-05, + "loss": 0.1187, + "mean_token_accuracy": 0.9562467336654663, + "num_tokens": 2141692.0, + "step": 746 + }, + { + "entropy": 0.11692007072269917, + "epoch": 0.9143206854345165, + "grad_norm": 0.10906612873077393, + "learning_rate": 2.354573371578632e-05, + "loss": 0.12, + "mean_token_accuracy": 0.9556914120912552, + "num_tokens": 2144596.0, + "step": 747 + }, + { + "entropy": 0.11260858923196793, + "epoch": 0.9155446756425949, + "grad_norm": 0.09712531417608261, + "learning_rate": 2.3447195358581635e-05, + "loss": 0.1187, + "mean_token_accuracy": 0.9540179818868637, + "num_tokens": 2147451.0, + "step": 748 + }, + { + "entropy": 0.11241147667169571, + "epoch": 0.9167686658506732, + "grad_norm": 0.09892810136079788, + "learning_rate": 2.3350018853518225e-05, + "loss": 0.1134, + "mean_token_accuracy": 0.9590259045362473, + "num_tokens": 2150290.0, + "step": 749 + }, + { + "entropy": 0.11137410812079906, + "epoch": 0.9179926560587516, + "grad_norm": 0.08404407650232315, + "learning_rate": 2.3254205729605218e-05, + "loss": 0.1127, + "mean_token_accuracy": 0.9532254785299301, + "num_tokens": 2153222.0, + "step": 750 + }, + { + "entropy": 0.12472932785749435, + "epoch": 0.9192166462668299, + "grad_norm": 0.11108457297086716, + "learning_rate": 2.31597574943999e-05, + "loss": 0.1228, + "mean_token_accuracy": 0.9548518508672714, + "num_tokens": 2156175.0, + "step": 751 + }, + { + "entropy": 0.12059284001588821, + "epoch": 0.9204406364749081, + "grad_norm": 0.10968618094921112, + "learning_rate": 2.3066675633983865e-05, + "loss": 0.1242, + "mean_token_accuracy": 0.9528764337301254, + "num_tokens": 2159136.0, + "step": 752 + }, + { + "entropy": 0.12065505422651768, + "epoch": 0.9216646266829865, + "grad_norm": 0.10751163214445114, + "learning_rate": 2.2974961612939698e-05, + "loss": 0.1208, + "mean_token_accuracy": 0.9567812979221344, + "num_tokens": 2161944.0, + "step": 753 + }, + { + "entropy": 0.12034500576555729, + "epoch": 0.9228886168910648, + "grad_norm": 0.13161377608776093, + "learning_rate": 2.2884616874327942e-05, + "loss": 0.1263, + "mean_token_accuracy": 0.95419642329216, + "num_tokens": 2164902.0, + "step": 754 + }, + { + "entropy": 0.10621046833693981, + "epoch": 0.9241126070991432, + "grad_norm": 0.07875524461269379, + "learning_rate": 2.2795642839664347e-05, + "loss": 0.1058, + "mean_token_accuracy": 0.9551303684711456, + "num_tokens": 2167864.0, + "step": 755 + }, + { + "entropy": 0.1130187138915062, + "epoch": 0.9253365973072215, + "grad_norm": 0.08278296887874603, + "learning_rate": 2.2708040908897536e-05, + "loss": 0.1119, + "mean_token_accuracy": 0.9569633156061172, + "num_tokens": 2170568.0, + "step": 756 + }, + { + "entropy": 0.12869110703468323, + "epoch": 0.9265605875152999, + "grad_norm": 0.1621813178062439, + "learning_rate": 2.2621812460386964e-05, + "loss": 0.1414, + "mean_token_accuracy": 0.947635143995285, + "num_tokens": 2173379.0, + "step": 757 + }, + { + "entropy": 0.12192260660231113, + "epoch": 0.9277845777233782, + "grad_norm": 0.1081606075167656, + "learning_rate": 2.2536958850881255e-05, + "loss": 0.1276, + "mean_token_accuracy": 0.9544932991266251, + "num_tokens": 2176310.0, + "step": 758 + }, + { + "entropy": 0.12152072601020336, + "epoch": 0.9290085679314566, + "grad_norm": 0.10435760766267776, + "learning_rate": 2.2453481415496802e-05, + "loss": 0.1211, + "mean_token_accuracy": 0.9515321105718613, + "num_tokens": 2179294.0, + "step": 759 + }, + { + "entropy": 0.1167523693293333, + "epoch": 0.9302325581395349, + "grad_norm": 0.07130236178636551, + "learning_rate": 2.237138146769681e-05, + "loss": 0.1072, + "mean_token_accuracy": 0.9584721028804779, + "num_tokens": 2182101.0, + "step": 760 + }, + { + "entropy": 0.11823557130992413, + "epoch": 0.9314565483476133, + "grad_norm": 0.07899076491594315, + "learning_rate": 2.229066029927063e-05, + "loss": 0.1128, + "mean_token_accuracy": 0.9541122913360596, + "num_tokens": 2184987.0, + "step": 761 + }, + { + "entropy": 0.12010268121957779, + "epoch": 0.9326805385556916, + "grad_norm": 0.0862560123205185, + "learning_rate": 2.2211319180313366e-05, + "loss": 0.1126, + "mean_token_accuracy": 0.956716775894165, + "num_tokens": 2187967.0, + "step": 762 + }, + { + "entropy": 0.11673757620155811, + "epoch": 0.9339045287637698, + "grad_norm": 0.10122431069612503, + "learning_rate": 2.2133359359206e-05, + "loss": 0.1177, + "mean_token_accuracy": 0.9515764117240906, + "num_tokens": 2190937.0, + "step": 763 + }, + { + "entropy": 0.11652708053588867, + "epoch": 0.9351285189718482, + "grad_norm": 0.08949444442987442, + "learning_rate": 2.2056782062595653e-05, + "loss": 0.1099, + "mean_token_accuracy": 0.9607353955507278, + "num_tokens": 2193703.0, + "step": 764 + }, + { + "entropy": 0.1190547738224268, + "epoch": 0.9363525091799265, + "grad_norm": 0.11255134642124176, + "learning_rate": 2.198158849537631e-05, + "loss": 0.1172, + "mean_token_accuracy": 0.9589087069034576, + "num_tokens": 2196497.0, + "step": 765 + }, + { + "entropy": 0.12217211723327637, + "epoch": 0.9375764993880049, + "grad_norm": 0.12183503806591034, + "learning_rate": 2.1907779840669927e-05, + "loss": 0.1181, + "mean_token_accuracy": 0.9560697823762894, + "num_tokens": 2199424.0, + "step": 766 + }, + { + "entropy": 0.12069625966250896, + "epoch": 0.9388004895960832, + "grad_norm": 0.08380690217018127, + "learning_rate": 2.183535725980769e-05, + "loss": 0.1144, + "mean_token_accuracy": 0.9523214101791382, + "num_tokens": 2202243.0, + "step": 767 + }, + { + "entropy": 0.11609174497425556, + "epoch": 0.9400244798041616, + "grad_norm": 0.08410578966140747, + "learning_rate": 2.1764321892311875e-05, + "loss": 0.1075, + "mean_token_accuracy": 0.9589401930570602, + "num_tokens": 2205056.0, + "step": 768 + }, + { + "entropy": 0.1135543342679739, + "epoch": 0.9412484700122399, + "grad_norm": 0.11362886428833008, + "learning_rate": 2.169467485587782e-05, + "loss": 0.1208, + "mean_token_accuracy": 0.9553671628236771, + "num_tokens": 2207868.0, + "step": 769 + }, + { + "entropy": 0.11551040783524513, + "epoch": 0.9424724602203183, + "grad_norm": 0.09017916023731232, + "learning_rate": 2.16264172463564e-05, + "loss": 0.1082, + "mean_token_accuracy": 0.9546223431825638, + "num_tokens": 2210825.0, + "step": 770 + }, + { + "entropy": 0.12548975460231304, + "epoch": 0.9436964504283966, + "grad_norm": 0.13604146242141724, + "learning_rate": 2.155955013773674e-05, + "loss": 0.1286, + "mean_token_accuracy": 0.9536134600639343, + "num_tokens": 2213711.0, + "step": 771 + }, + { + "entropy": 0.10890325903892517, + "epoch": 0.944920440636475, + "grad_norm": 0.08181123435497284, + "learning_rate": 2.149407458212935e-05, + "loss": 0.1107, + "mean_token_accuracy": 0.9582093060016632, + "num_tokens": 2216676.0, + "step": 772 + }, + { + "entropy": 0.1145523153245449, + "epoch": 0.9461444308445532, + "grad_norm": 0.06920690834522247, + "learning_rate": 2.1429991609749554e-05, + "loss": 0.1035, + "mean_token_accuracy": 0.9617591798305511, + "num_tokens": 2219492.0, + "step": 773 + }, + { + "entropy": 0.11076322570443153, + "epoch": 0.9473684210526315, + "grad_norm": 0.09461228549480438, + "learning_rate": 2.1367302228901282e-05, + "loss": 0.1167, + "mean_token_accuracy": 0.9590214490890503, + "num_tokens": 2222404.0, + "step": 774 + }, + { + "entropy": 0.11105098761618137, + "epoch": 0.9485924112607099, + "grad_norm": 0.08519443869590759, + "learning_rate": 2.13060074259612e-05, + "loss": 0.1076, + "mean_token_accuracy": 0.9583229869604111, + "num_tokens": 2225483.0, + "step": 775 + }, + { + "entropy": 0.11546324752271175, + "epoch": 0.9498164014687882, + "grad_norm": 0.09795887023210526, + "learning_rate": 2.124610816536322e-05, + "loss": 0.1136, + "mean_token_accuracy": 0.9561485797166824, + "num_tokens": 2228483.0, + "step": 776 + }, + { + "entropy": 0.1178339272737503, + "epoch": 0.9510403916768666, + "grad_norm": 0.1451365202665329, + "learning_rate": 2.1187605389583267e-05, + "loss": 0.127, + "mean_token_accuracy": 0.9523122310638428, + "num_tokens": 2231308.0, + "step": 777 + }, + { + "entropy": 0.11218477971851826, + "epoch": 0.9522643818849449, + "grad_norm": 0.09945015609264374, + "learning_rate": 2.1130500019124517e-05, + "loss": 0.113, + "mean_token_accuracy": 0.961069330573082, + "num_tokens": 2234305.0, + "step": 778 + }, + { + "entropy": 0.12483607232570648, + "epoch": 0.9534883720930233, + "grad_norm": 0.1199488490819931, + "learning_rate": 2.1074792952502866e-05, + "loss": 0.1189, + "mean_token_accuracy": 0.9530585408210754, + "num_tokens": 2237121.0, + "step": 779 + }, + { + "entropy": 0.11908796057105064, + "epoch": 0.9547123623011016, + "grad_norm": 0.1451587975025177, + "learning_rate": 2.1020485066232808e-05, + "loss": 0.1254, + "mean_token_accuracy": 0.9533159732818604, + "num_tokens": 2239991.0, + "step": 780 + }, + { + "entropy": 0.10797371156513691, + "epoch": 0.95593635250918, + "grad_norm": 0.09097123146057129, + "learning_rate": 2.096757721481365e-05, + "loss": 0.1132, + "mean_token_accuracy": 0.957582488656044, + "num_tokens": 2242904.0, + "step": 781 + }, + { + "entropy": 0.1175040528178215, + "epoch": 0.9571603427172583, + "grad_norm": 0.07779325544834137, + "learning_rate": 2.0916070230716063e-05, + "loss": 0.1136, + "mean_token_accuracy": 0.9561788141727448, + "num_tokens": 2245756.0, + "step": 782 + }, + { + "entropy": 0.10941893421113491, + "epoch": 0.9583843329253366, + "grad_norm": 0.09423915296792984, + "learning_rate": 2.086596492436895e-05, + "loss": 0.114, + "mean_token_accuracy": 0.9594079703092575, + "num_tokens": 2248771.0, + "step": 783 + }, + { + "entropy": 0.11998820677399635, + "epoch": 0.9596083231334149, + "grad_norm": 0.09895794838666916, + "learning_rate": 2.0817262084146775e-05, + "loss": 0.1137, + "mean_token_accuracy": 0.9520331472158432, + "num_tokens": 2251605.0, + "step": 784 + }, + { + "entropy": 0.12215744704008102, + "epoch": 0.9608323133414932, + "grad_norm": 0.0959494337439537, + "learning_rate": 2.0769962476357068e-05, + "loss": 0.1193, + "mean_token_accuracy": 0.952069103717804, + "num_tokens": 2254331.0, + "step": 785 + }, + { + "entropy": 0.11949954926967621, + "epoch": 0.9620563035495716, + "grad_norm": 0.0973983108997345, + "learning_rate": 2.0724066845228422e-05, + "loss": 0.1128, + "mean_token_accuracy": 0.957825556397438, + "num_tokens": 2257077.0, + "step": 786 + }, + { + "entropy": 0.1224739346653223, + "epoch": 0.9632802937576499, + "grad_norm": 0.10656946152448654, + "learning_rate": 2.0679575912898784e-05, + "loss": 0.1243, + "mean_token_accuracy": 0.9535775631666183, + "num_tokens": 2260137.0, + "step": 787 + }, + { + "entropy": 0.11643553711473942, + "epoch": 0.9645042839657283, + "grad_norm": 0.13314510881900787, + "learning_rate": 2.0636490379404056e-05, + "loss": 0.1194, + "mean_token_accuracy": 0.9564495980739594, + "num_tokens": 2263053.0, + "step": 788 + }, + { + "entropy": 0.11585861630737782, + "epoch": 0.9657282741738066, + "grad_norm": 0.11321521550416946, + "learning_rate": 2.0594810922667113e-05, + "loss": 0.1191, + "mean_token_accuracy": 0.9549570679664612, + "num_tokens": 2265975.0, + "step": 789 + }, + { + "entropy": 0.1279258131980896, + "epoch": 0.966952264381885, + "grad_norm": 0.13720889389514923, + "learning_rate": 2.055453819848713e-05, + "loss": 0.135, + "mean_token_accuracy": 0.9506154805421829, + "num_tokens": 2268865.0, + "step": 790 + }, + { + "entropy": 0.11935215257108212, + "epoch": 0.9681762545899633, + "grad_norm": 0.09593852609395981, + "learning_rate": 2.051567284052924e-05, + "loss": 0.116, + "mean_token_accuracy": 0.9535016715526581, + "num_tokens": 2271660.0, + "step": 791 + }, + { + "entropy": 0.11908247694373131, + "epoch": 0.9694002447980417, + "grad_norm": 0.09145094454288483, + "learning_rate": 2.0478215460314615e-05, + "loss": 0.1155, + "mean_token_accuracy": 0.9593386054039001, + "num_tokens": 2274364.0, + "step": 792 + }, + { + "entropy": 0.10798829421401024, + "epoch": 0.9706242350061199, + "grad_norm": 0.07891134172677994, + "learning_rate": 2.044216664721078e-05, + "loss": 0.1082, + "mean_token_accuracy": 0.9590171277523041, + "num_tokens": 2277247.0, + "step": 793 + }, + { + "entropy": 0.12024936452507973, + "epoch": 0.9718482252141983, + "grad_norm": 0.09279042482376099, + "learning_rate": 2.040752696842239e-05, + "loss": 0.1168, + "mean_token_accuracy": 0.9569914788007736, + "num_tokens": 2280002.0, + "step": 794 + }, + { + "entropy": 0.1129434984177351, + "epoch": 0.9730722154222766, + "grad_norm": 0.09484171867370605, + "learning_rate": 2.0374296968982278e-05, + "loss": 0.1125, + "mean_token_accuracy": 0.9551234692335129, + "num_tokens": 2282893.0, + "step": 795 + }, + { + "entropy": 0.11363891325891018, + "epoch": 0.9742962056303549, + "grad_norm": 0.11324533820152283, + "learning_rate": 2.03424771717429e-05, + "loss": 0.1267, + "mean_token_accuracy": 0.9605773538351059, + "num_tokens": 2285677.0, + "step": 796 + }, + { + "entropy": 0.1053408607840538, + "epoch": 0.9755201958384333, + "grad_norm": 0.07560600340366364, + "learning_rate": 2.0312068077368092e-05, + "loss": 0.1137, + "mean_token_accuracy": 0.9533551335334778, + "num_tokens": 2288347.0, + "step": 797 + }, + { + "entropy": 0.11777897365391254, + "epoch": 0.9767441860465116, + "grad_norm": 0.0965939462184906, + "learning_rate": 2.0283070164325195e-05, + "loss": 0.1144, + "mean_token_accuracy": 0.9538252204656601, + "num_tokens": 2291250.0, + "step": 798 + }, + { + "entropy": 0.10427995771169662, + "epoch": 0.97796817625459, + "grad_norm": 0.09167545288801193, + "learning_rate": 2.025548388887751e-05, + "loss": 0.1103, + "mean_token_accuracy": 0.9586018919944763, + "num_tokens": 2294206.0, + "step": 799 + }, + { + "entropy": 0.12098632752895355, + "epoch": 0.9791921664626683, + "grad_norm": 0.1108219251036644, + "learning_rate": 2.0229309685077186e-05, + "loss": 0.1221, + "mean_token_accuracy": 0.9526151269674301, + "num_tokens": 2297132.0, + "step": 800 + }, + { + "entropy": 0.11739395186305046, + "epoch": 0.9804161566707467, + "grad_norm": 0.06881573051214218, + "learning_rate": 2.0204547964758287e-05, + "loss": 0.1102, + "mean_token_accuracy": 0.9550208747386932, + "num_tokens": 2300067.0, + "step": 801 + }, + { + "entropy": 0.11589861661195755, + "epoch": 0.981640146878825, + "grad_norm": 0.08789421617984772, + "learning_rate": 2.0181199117530416e-05, + "loss": 0.1147, + "mean_token_accuracy": 0.9546492248773575, + "num_tokens": 2302876.0, + "step": 802 + }, + { + "entropy": 0.11370973475277424, + "epoch": 0.9828641370869033, + "grad_norm": 0.10175738483667374, + "learning_rate": 2.015926351077248e-05, + "loss": 0.1174, + "mean_token_accuracy": 0.9557800889015198, + "num_tokens": 2305810.0, + "step": 803 + }, + { + "entropy": 0.11606515012681484, + "epoch": 0.9840881272949816, + "grad_norm": 0.07569295912981033, + "learning_rate": 2.0138741489627025e-05, + "loss": 0.1091, + "mean_token_accuracy": 0.9588016122579575, + "num_tokens": 2308628.0, + "step": 804 + }, + { + "entropy": 0.11860406957566738, + "epoch": 0.98531211750306, + "grad_norm": 0.17748574912548065, + "learning_rate": 2.0119633376994714e-05, + "loss": 0.1204, + "mean_token_accuracy": 0.9567941278219223, + "num_tokens": 2311537.0, + "step": 805 + }, + { + "entropy": 0.12059890665113926, + "epoch": 0.9865361077111383, + "grad_norm": 0.07504194229841232, + "learning_rate": 2.0101939473529285e-05, + "loss": 0.1107, + "mean_token_accuracy": 0.9550109654664993, + "num_tokens": 2314439.0, + "step": 806 + }, + { + "entropy": 0.11730956472456455, + "epoch": 0.9877600979192166, + "grad_norm": 0.15471886098384857, + "learning_rate": 2.00856600576328e-05, + "loss": 0.133, + "mean_token_accuracy": 0.9500728696584702, + "num_tokens": 2317355.0, + "step": 807 + }, + { + "entropy": 0.10784157551825047, + "epoch": 0.988984088127295, + "grad_norm": 0.084711953997612, + "learning_rate": 2.00707953854513e-05, + "loss": 0.1087, + "mean_token_accuracy": 0.953977033495903, + "num_tokens": 2320389.0, + "step": 808 + }, + { + "entropy": 0.11833012104034424, + "epoch": 0.9902080783353733, + "grad_norm": 0.08518842607736588, + "learning_rate": 2.0057345690870743e-05, + "loss": 0.1134, + "mean_token_accuracy": 0.9544739276170731, + "num_tokens": 2323216.0, + "step": 809 + }, + { + "entropy": 0.11488179862499237, + "epoch": 0.9914320685434517, + "grad_norm": 0.10930296033620834, + "learning_rate": 2.0045311185513344e-05, + "loss": 0.1183, + "mean_token_accuracy": 0.9564723968505859, + "num_tokens": 2326174.0, + "step": 810 + }, + { + "entropy": 0.1235403548926115, + "epoch": 0.99265605875153, + "grad_norm": 0.12845449149608612, + "learning_rate": 2.0034692058734194e-05, + "loss": 0.1276, + "mean_token_accuracy": 0.9511407911777496, + "num_tokens": 2329057.0, + "step": 811 + }, + { + "entropy": 0.11454490199685097, + "epoch": 0.9938800489596084, + "grad_norm": 0.12085168808698654, + "learning_rate": 2.002548847761839e-05, + "loss": 0.1208, + "mean_token_accuracy": 0.9573224186897278, + "num_tokens": 2331936.0, + "step": 812 + }, + { + "entropy": 0.11223976872861385, + "epoch": 0.9951040391676866, + "grad_norm": 0.07855137437582016, + "learning_rate": 2.0017700586978264e-05, + "loss": 0.1105, + "mean_token_accuracy": 0.9556984305381775, + "num_tokens": 2334889.0, + "step": 813 + }, + { + "entropy": 0.1127720344811678, + "epoch": 0.996328029375765, + "grad_norm": 0.0951165184378624, + "learning_rate": 2.0011328509351244e-05, + "loss": 0.1149, + "mean_token_accuracy": 0.957403689622879, + "num_tokens": 2337726.0, + "step": 814 + }, + { + "entropy": 0.1144169270992279, + "epoch": 0.9975520195838433, + "grad_norm": 0.08693596720695496, + "learning_rate": 2.0006372344997826e-05, + "loss": 0.1141, + "mean_token_accuracy": 0.9580929130315781, + "num_tokens": 2340711.0, + "step": 815 + }, + { + "entropy": 0.12403703108429909, + "epoch": 0.9987760097919217, + "grad_norm": 0.08769191056489944, + "learning_rate": 2.0002832171900023e-05, + "loss": 0.123, + "mean_token_accuracy": 0.9589347094297409, + "num_tokens": 2343379.0, + "step": 816 + }, + { + "entropy": 0.12176699191331863, + "epoch": 1.0, + "grad_norm": 0.09838308393955231, + "learning_rate": 2.0000708045760164e-05, + "loss": 0.1175, + "mean_token_accuracy": 0.9576897472143173, + "num_tokens": 2346129.0, + "step": 817 + } + ], + "logging_steps": 1, + "max_steps": 817, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.1121486214219039e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}