diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10725 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.40358993244913505, + "eval_steps": 500, + "global_step": 60000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00033632494370761253, + "grad_norm": 1.274348497390747, + "learning_rate": 0.0001, + "loss": 9.9462, + "num_input_tokens_seen": 13107200, + "step": 50 + }, + { + "epoch": 0.0006726498874152251, + "grad_norm": 0.3978561758995056, + "learning_rate": 0.0002, + "loss": 8.361, + "num_input_tokens_seen": 26214400, + "step": 100 + }, + { + "epoch": 0.0010089748311228376, + "grad_norm": 0.41306379437446594, + "learning_rate": 0.0003, + "loss": 7.6593, + "num_input_tokens_seen": 39321600, + "step": 150 + }, + { + "epoch": 0.0013452997748304501, + "grad_norm": 0.3370315134525299, + "learning_rate": 0.0004, + "loss": 7.1018, + "num_input_tokens_seen": 52428800, + "step": 200 + }, + { + "epoch": 0.0016816247185380627, + "grad_norm": 0.6396230459213257, + "learning_rate": 0.0005, + "loss": 6.6331, + "num_input_tokens_seen": 65536000, + "step": 250 + }, + { + "epoch": 0.002017949662245675, + "grad_norm": 0.5036832690238953, + "learning_rate": 0.0006, + "loss": 6.2646, + "num_input_tokens_seen": 78643200, + "step": 300 + }, + { + "epoch": 0.002354274605953288, + "grad_norm": 0.4829367399215698, + "learning_rate": 0.0007, + "loss": 5.9656, + "num_input_tokens_seen": 91750400, + "step": 350 + }, + { + "epoch": 0.0026905995496609002, + "grad_norm": 0.6319091320037842, + "learning_rate": 0.0008, + "loss": 5.7098, + "num_input_tokens_seen": 104857600, + "step": 400 + }, + { + "epoch": 0.003026924493368513, + "grad_norm": 0.8607974648475647, + "learning_rate": 0.0009000000000000001, + "loss": 5.5012, + "num_input_tokens_seen": 117964800, + "step": 450 + }, + { + "epoch": 0.0033632494370761253, + "grad_norm": 0.4290812611579895, + "learning_rate": 0.001, + "loss": 5.3179, + "num_input_tokens_seen": 131072000, + "step": 500 + }, + { + "epoch": 0.0033632494370761253, + "eval_loss": 5.179312705993652, + "eval_runtime": 100.6159, + "eval_samples_per_second": 49.694, + "eval_steps_per_second": 12.423, + "num_input_tokens_seen": 131072000, + "step": 500 + }, + { + "epoch": 0.003699574380783738, + "grad_norm": 0.407632052898407, + "learning_rate": 0.001, + "loss": 5.1431, + "num_input_tokens_seen": 144179200, + "step": 550 + }, + { + "epoch": 0.00403589932449135, + "grad_norm": 0.31742918491363525, + "learning_rate": 0.001, + "loss": 4.9611, + "num_input_tokens_seen": 157286400, + "step": 600 + }, + { + "epoch": 0.004372224268198963, + "grad_norm": 0.3897629678249359, + "learning_rate": 0.001, + "loss": 4.8043, + "num_input_tokens_seen": 170393600, + "step": 650 + }, + { + "epoch": 0.004708549211906576, + "grad_norm": 0.41930389404296875, + "learning_rate": 0.001, + "loss": 4.6617, + "num_input_tokens_seen": 183500800, + "step": 700 + }, + { + "epoch": 0.005044874155614189, + "grad_norm": 0.4283705949783325, + "learning_rate": 0.001, + "loss": 4.5535, + "num_input_tokens_seen": 196608000, + "step": 750 + }, + { + "epoch": 0.0053811990993218005, + "grad_norm": 0.3931107521057129, + "learning_rate": 0.001, + "loss": 4.4556, + "num_input_tokens_seen": 209715200, + "step": 800 + }, + { + "epoch": 0.005717524043029413, + "grad_norm": 0.3258611857891083, + "learning_rate": 0.001, + "loss": 4.3808, + "num_input_tokens_seen": 222822400, + "step": 850 + }, + { + "epoch": 0.006053848986737026, + "grad_norm": 0.3562588095664978, + "learning_rate": 0.001, + "loss": 4.3136, + "num_input_tokens_seen": 235929600, + "step": 900 + }, + { + "epoch": 0.006390173930444639, + "grad_norm": 0.3626460134983063, + "learning_rate": 0.001, + "loss": 4.2557, + "num_input_tokens_seen": 249036800, + "step": 950 + }, + { + "epoch": 0.006726498874152251, + "grad_norm": 0.4001849293708801, + "learning_rate": 0.001, + "loss": 4.208, + "num_input_tokens_seen": 262144000, + "step": 1000 + }, + { + "epoch": 0.006726498874152251, + "eval_loss": 4.108783721923828, + "eval_runtime": 101.5418, + "eval_samples_per_second": 49.241, + "eval_steps_per_second": 12.31, + "num_input_tokens_seen": 262144000, + "step": 1000 + }, + { + "epoch": 0.007062823817859863, + "grad_norm": 0.36548155546188354, + "learning_rate": 0.001, + "loss": 4.1537, + "num_input_tokens_seen": 275251200, + "step": 1050 + }, + { + "epoch": 0.007399148761567476, + "grad_norm": 0.33794787526130676, + "learning_rate": 0.001, + "loss": 4.1062, + "num_input_tokens_seen": 288358400, + "step": 1100 + }, + { + "epoch": 0.007735473705275089, + "grad_norm": 0.3374481499195099, + "learning_rate": 0.001, + "loss": 4.0703, + "num_input_tokens_seen": 301465600, + "step": 1150 + }, + { + "epoch": 0.0080717986489827, + "grad_norm": 0.3061329126358032, + "learning_rate": 0.001, + "loss": 4.0253, + "num_input_tokens_seen": 314572800, + "step": 1200 + }, + { + "epoch": 0.008408123592690313, + "grad_norm": 0.3140158951282501, + "learning_rate": 0.001, + "loss": 4.0128, + "num_input_tokens_seen": 327680000, + "step": 1250 + }, + { + "epoch": 0.008744448536397926, + "grad_norm": 0.3002954125404358, + "learning_rate": 0.001, + "loss": 3.9762, + "num_input_tokens_seen": 340787200, + "step": 1300 + }, + { + "epoch": 0.009080773480105539, + "grad_norm": 0.2994467318058014, + "learning_rate": 0.001, + "loss": 3.9485, + "num_input_tokens_seen": 353894400, + "step": 1350 + }, + { + "epoch": 0.009417098423813152, + "grad_norm": 0.25649985671043396, + "learning_rate": 0.001, + "loss": 3.9298, + "num_input_tokens_seen": 367001600, + "step": 1400 + }, + { + "epoch": 0.009753423367520764, + "grad_norm": 0.2627107501029968, + "learning_rate": 0.001, + "loss": 3.904, + "num_input_tokens_seen": 380108800, + "step": 1450 + }, + { + "epoch": 0.010089748311228377, + "grad_norm": 0.29527419805526733, + "learning_rate": 0.001, + "loss": 3.8864, + "num_input_tokens_seen": 393216000, + "step": 1500 + }, + { + "epoch": 0.010089748311228377, + "eval_loss": 3.8075473308563232, + "eval_runtime": 99.5484, + "eval_samples_per_second": 50.227, + "eval_steps_per_second": 12.557, + "num_input_tokens_seen": 393216000, + "step": 1500 + }, + { + "epoch": 0.010426073254935988, + "grad_norm": 0.26328331232070923, + "learning_rate": 0.001, + "loss": 3.8701, + "num_input_tokens_seen": 406323200, + "step": 1550 + }, + { + "epoch": 0.010762398198643601, + "grad_norm": 0.2574864625930786, + "learning_rate": 0.001, + "loss": 3.8575, + "num_input_tokens_seen": 419430400, + "step": 1600 + }, + { + "epoch": 0.011098723142351214, + "grad_norm": 0.2445235550403595, + "learning_rate": 0.001, + "loss": 3.8277, + "num_input_tokens_seen": 432537600, + "step": 1650 + }, + { + "epoch": 0.011435048086058826, + "grad_norm": 0.2726516127586365, + "learning_rate": 0.001, + "loss": 3.8171, + "num_input_tokens_seen": 445644800, + "step": 1700 + }, + { + "epoch": 0.01177137302976644, + "grad_norm": 0.2658848166465759, + "learning_rate": 0.001, + "loss": 3.8026, + "num_input_tokens_seen": 458752000, + "step": 1750 + }, + { + "epoch": 0.012107697973474052, + "grad_norm": 0.30301713943481445, + "learning_rate": 0.001, + "loss": 3.7838, + "num_input_tokens_seen": 471859200, + "step": 1800 + }, + { + "epoch": 0.012444022917181665, + "grad_norm": 0.24884650111198425, + "learning_rate": 0.001, + "loss": 3.7685, + "num_input_tokens_seen": 484966400, + "step": 1850 + }, + { + "epoch": 0.012780347860889277, + "grad_norm": 0.2728760540485382, + "learning_rate": 0.001, + "loss": 3.759, + "num_input_tokens_seen": 498073600, + "step": 1900 + }, + { + "epoch": 0.013116672804596888, + "grad_norm": 0.24091999232769012, + "learning_rate": 0.001, + "loss": 3.7496, + "num_input_tokens_seen": 511180800, + "step": 1950 + }, + { + "epoch": 0.013452997748304501, + "grad_norm": 0.2559104561805725, + "learning_rate": 0.001, + "loss": 3.7289, + "num_input_tokens_seen": 524288000, + "step": 2000 + }, + { + "epoch": 0.013452997748304501, + "eval_loss": 3.654578447341919, + "eval_runtime": 99.6741, + "eval_samples_per_second": 50.164, + "eval_steps_per_second": 12.541, + "num_input_tokens_seen": 524288000, + "step": 2000 + }, + { + "epoch": 0.013789322692012114, + "grad_norm": 0.24713733792304993, + "learning_rate": 0.001, + "loss": 3.7259, + "num_input_tokens_seen": 537395200, + "step": 2050 + }, + { + "epoch": 0.014125647635719727, + "grad_norm": 0.2594991624355316, + "learning_rate": 0.001, + "loss": 3.7121, + "num_input_tokens_seen": 550502400, + "step": 2100 + }, + { + "epoch": 0.01446197257942734, + "grad_norm": 0.2594399154186249, + "learning_rate": 0.001, + "loss": 3.6975, + "num_input_tokens_seen": 563609600, + "step": 2150 + }, + { + "epoch": 0.014798297523134952, + "grad_norm": 0.21877750754356384, + "learning_rate": 0.001, + "loss": 3.6868, + "num_input_tokens_seen": 576716800, + "step": 2200 + }, + { + "epoch": 0.015134622466842565, + "grad_norm": 0.23657166957855225, + "learning_rate": 0.001, + "loss": 3.6853, + "num_input_tokens_seen": 589824000, + "step": 2250 + }, + { + "epoch": 0.015470947410550178, + "grad_norm": 0.24121630191802979, + "learning_rate": 0.001, + "loss": 3.6718, + "num_input_tokens_seen": 602931200, + "step": 2300 + }, + { + "epoch": 0.01580727235425779, + "grad_norm": 0.25701209902763367, + "learning_rate": 0.001, + "loss": 3.6658, + "num_input_tokens_seen": 616038400, + "step": 2350 + }, + { + "epoch": 0.0161435972979654, + "grad_norm": 0.22402645647525787, + "learning_rate": 0.001, + "loss": 3.6704, + "num_input_tokens_seen": 629145600, + "step": 2400 + }, + { + "epoch": 0.016479922241673016, + "grad_norm": 0.2358316332101822, + "learning_rate": 0.001, + "loss": 3.6464, + "num_input_tokens_seen": 642252800, + "step": 2450 + }, + { + "epoch": 0.016816247185380627, + "grad_norm": 0.23835696280002594, + "learning_rate": 0.001, + "loss": 3.6424, + "num_input_tokens_seen": 655360000, + "step": 2500 + }, + { + "epoch": 0.016816247185380627, + "eval_loss": 3.560713768005371, + "eval_runtime": 100.1074, + "eval_samples_per_second": 49.946, + "eval_steps_per_second": 12.487, + "num_input_tokens_seen": 655360000, + "step": 2500 + }, + { + "epoch": 0.01715257212908824, + "grad_norm": 0.20949696004390717, + "learning_rate": 0.001, + "loss": 3.6346, + "num_input_tokens_seen": 668467200, + "step": 2550 + }, + { + "epoch": 0.017488897072795852, + "grad_norm": 0.21765078604221344, + "learning_rate": 0.001, + "loss": 3.6123, + "num_input_tokens_seen": 681574400, + "step": 2600 + }, + { + "epoch": 0.017825222016503463, + "grad_norm": 0.21167752146720886, + "learning_rate": 0.001, + "loss": 3.6173, + "num_input_tokens_seen": 694681600, + "step": 2650 + }, + { + "epoch": 0.018161546960211078, + "grad_norm": 0.22060410678386688, + "learning_rate": 0.001, + "loss": 3.62, + "num_input_tokens_seen": 707788800, + "step": 2700 + }, + { + "epoch": 0.01849787190391869, + "grad_norm": 0.21778976917266846, + "learning_rate": 0.001, + "loss": 3.6009, + "num_input_tokens_seen": 720896000, + "step": 2750 + }, + { + "epoch": 0.018834196847626303, + "grad_norm": 0.21130047738552094, + "learning_rate": 0.001, + "loss": 3.5882, + "num_input_tokens_seen": 734003200, + "step": 2800 + }, + { + "epoch": 0.019170521791333914, + "grad_norm": 0.20137132704257965, + "learning_rate": 0.001, + "loss": 3.5923, + "num_input_tokens_seen": 747110400, + "step": 2850 + }, + { + "epoch": 0.01950684673504153, + "grad_norm": 0.24937641620635986, + "learning_rate": 0.001, + "loss": 3.5872, + "num_input_tokens_seen": 760217600, + "step": 2900 + }, + { + "epoch": 0.01984317167874914, + "grad_norm": 0.20992155373096466, + "learning_rate": 0.001, + "loss": 3.5786, + "num_input_tokens_seen": 773324800, + "step": 2950 + }, + { + "epoch": 0.020179496622456754, + "grad_norm": 0.24723300337791443, + "learning_rate": 0.001, + "loss": 3.5846, + "num_input_tokens_seen": 786432000, + "step": 3000 + }, + { + "epoch": 0.020179496622456754, + "eval_loss": 3.502939462661743, + "eval_runtime": 99.6836, + "eval_samples_per_second": 50.159, + "eval_steps_per_second": 12.54, + "num_input_tokens_seen": 786432000, + "step": 3000 + }, + { + "epoch": 0.020515821566164365, + "grad_norm": 0.22368234395980835, + "learning_rate": 0.001, + "loss": 3.5744, + "num_input_tokens_seen": 799539200, + "step": 3050 + }, + { + "epoch": 0.020852146509871976, + "grad_norm": 0.20934642851352692, + "learning_rate": 0.001, + "loss": 3.5666, + "num_input_tokens_seen": 812646400, + "step": 3100 + }, + { + "epoch": 0.02118847145357959, + "grad_norm": 0.1929185688495636, + "learning_rate": 0.001, + "loss": 3.5624, + "num_input_tokens_seen": 825753600, + "step": 3150 + }, + { + "epoch": 0.021524796397287202, + "grad_norm": 0.20416973531246185, + "learning_rate": 0.001, + "loss": 3.5491, + "num_input_tokens_seen": 838860800, + "step": 3200 + }, + { + "epoch": 0.021861121340994816, + "grad_norm": 0.20873814821243286, + "learning_rate": 0.001, + "loss": 3.5509, + "num_input_tokens_seen": 851968000, + "step": 3250 + }, + { + "epoch": 0.022197446284702427, + "grad_norm": 0.22235794365406036, + "learning_rate": 0.001, + "loss": 3.5453, + "num_input_tokens_seen": 865075200, + "step": 3300 + }, + { + "epoch": 0.022533771228410042, + "grad_norm": 0.2174178808927536, + "learning_rate": 0.001, + "loss": 3.5378, + "num_input_tokens_seen": 878182400, + "step": 3350 + }, + { + "epoch": 0.022870096172117653, + "grad_norm": 0.2016495019197464, + "learning_rate": 0.001, + "loss": 3.5352, + "num_input_tokens_seen": 891289600, + "step": 3400 + }, + { + "epoch": 0.023206421115825267, + "grad_norm": 0.1864960640668869, + "learning_rate": 0.001, + "loss": 3.524, + "num_input_tokens_seen": 904396800, + "step": 3450 + }, + { + "epoch": 0.02354274605953288, + "grad_norm": 0.19614097476005554, + "learning_rate": 0.001, + "loss": 3.528, + "num_input_tokens_seen": 917504000, + "step": 3500 + }, + { + "epoch": 0.02354274605953288, + "eval_loss": 3.4472692012786865, + "eval_runtime": 101.0365, + "eval_samples_per_second": 49.487, + "eval_steps_per_second": 12.372, + "num_input_tokens_seen": 917504000, + "step": 3500 + }, + { + "epoch": 0.02387907100324049, + "grad_norm": 0.18501116335391998, + "learning_rate": 0.001, + "loss": 3.53, + "num_input_tokens_seen": 930611200, + "step": 3550 + }, + { + "epoch": 0.024215395946948104, + "grad_norm": 0.1863412857055664, + "learning_rate": 0.001, + "loss": 3.5145, + "num_input_tokens_seen": 943718400, + "step": 3600 + }, + { + "epoch": 0.024551720890655715, + "grad_norm": 0.1979917287826538, + "learning_rate": 0.001, + "loss": 3.5143, + "num_input_tokens_seen": 956825600, + "step": 3650 + }, + { + "epoch": 0.02488804583436333, + "grad_norm": 0.1991748809814453, + "learning_rate": 0.001, + "loss": 3.5065, + "num_input_tokens_seen": 969932800, + "step": 3700 + }, + { + "epoch": 0.02522437077807094, + "grad_norm": 0.19475233554840088, + "learning_rate": 0.001, + "loss": 3.5011, + "num_input_tokens_seen": 983040000, + "step": 3750 + }, + { + "epoch": 0.025560695721778555, + "grad_norm": 0.195469468832016, + "learning_rate": 0.001, + "loss": 3.506, + "num_input_tokens_seen": 996147200, + "step": 3800 + }, + { + "epoch": 0.025897020665486166, + "grad_norm": 0.19666293263435364, + "learning_rate": 0.001, + "loss": 3.4936, + "num_input_tokens_seen": 1009254400, + "step": 3850 + }, + { + "epoch": 0.026233345609193777, + "grad_norm": 0.20198987424373627, + "learning_rate": 0.001, + "loss": 3.4873, + "num_input_tokens_seen": 1022361600, + "step": 3900 + }, + { + "epoch": 0.02656967055290139, + "grad_norm": 0.18537157773971558, + "learning_rate": 0.001, + "loss": 3.4939, + "num_input_tokens_seen": 1035468800, + "step": 3950 + }, + { + "epoch": 0.026905995496609002, + "grad_norm": 0.18743236362934113, + "learning_rate": 0.001, + "loss": 3.4784, + "num_input_tokens_seen": 1048576000, + "step": 4000 + }, + { + "epoch": 0.026905995496609002, + "eval_loss": 3.4036922454833984, + "eval_runtime": 99.4277, + "eval_samples_per_second": 50.288, + "eval_steps_per_second": 12.572, + "num_input_tokens_seen": 1048576000, + "step": 4000 + }, + { + "epoch": 0.027242320440316617, + "grad_norm": 0.18962019681930542, + "learning_rate": 0.001, + "loss": 3.4848, + "num_input_tokens_seen": 1061683200, + "step": 4050 + }, + { + "epoch": 0.027578645384024228, + "grad_norm": 0.19415706396102905, + "learning_rate": 0.001, + "loss": 3.4934, + "num_input_tokens_seen": 1074790400, + "step": 4100 + }, + { + "epoch": 0.027914970327731842, + "grad_norm": 0.17727437615394592, + "learning_rate": 0.001, + "loss": 3.4835, + "num_input_tokens_seen": 1087897600, + "step": 4150 + }, + { + "epoch": 0.028251295271439453, + "grad_norm": 0.18400466442108154, + "learning_rate": 0.001, + "loss": 3.4843, + "num_input_tokens_seen": 1101004800, + "step": 4200 + }, + { + "epoch": 0.028587620215147068, + "grad_norm": 0.19114799797534943, + "learning_rate": 0.001, + "loss": 3.4713, + "num_input_tokens_seen": 1114112000, + "step": 4250 + }, + { + "epoch": 0.02892394515885468, + "grad_norm": 0.18681153655052185, + "learning_rate": 0.001, + "loss": 3.4601, + "num_input_tokens_seen": 1127219200, + "step": 4300 + }, + { + "epoch": 0.02926027010256229, + "grad_norm": 0.20739078521728516, + "learning_rate": 0.001, + "loss": 3.4628, + "num_input_tokens_seen": 1140326400, + "step": 4350 + }, + { + "epoch": 0.029596595046269904, + "grad_norm": 0.18018484115600586, + "learning_rate": 0.001, + "loss": 3.4521, + "num_input_tokens_seen": 1153433600, + "step": 4400 + }, + { + "epoch": 0.029932919989977515, + "grad_norm": 0.18144090473651886, + "learning_rate": 0.001, + "loss": 3.4536, + "num_input_tokens_seen": 1166540800, + "step": 4450 + }, + { + "epoch": 0.03026924493368513, + "grad_norm": 0.17444822192192078, + "learning_rate": 0.001, + "loss": 3.4509, + "num_input_tokens_seen": 1179648000, + "step": 4500 + }, + { + "epoch": 0.03026924493368513, + "eval_loss": 3.3682761192321777, + "eval_runtime": 100.0421, + "eval_samples_per_second": 49.979, + "eval_steps_per_second": 12.495, + "num_input_tokens_seen": 1179648000, + "step": 4500 + }, + { + "epoch": 0.03060556987739274, + "grad_norm": 0.21026909351348877, + "learning_rate": 0.001, + "loss": 3.4493, + "num_input_tokens_seen": 1192755200, + "step": 4550 + }, + { + "epoch": 0.030941894821100355, + "grad_norm": 0.1758560836315155, + "learning_rate": 0.001, + "loss": 3.4416, + "num_input_tokens_seen": 1205862400, + "step": 4600 + }, + { + "epoch": 0.03127821976480797, + "grad_norm": 0.1979188174009323, + "learning_rate": 0.001, + "loss": 3.4353, + "num_input_tokens_seen": 1218969600, + "step": 4650 + }, + { + "epoch": 0.03161454470851558, + "grad_norm": 0.17621161043643951, + "learning_rate": 0.001, + "loss": 3.435, + "num_input_tokens_seen": 1232076800, + "step": 4700 + }, + { + "epoch": 0.03195086965222319, + "grad_norm": 0.18691854178905487, + "learning_rate": 0.001, + "loss": 3.4367, + "num_input_tokens_seen": 1245184000, + "step": 4750 + }, + { + "epoch": 0.0322871945959308, + "grad_norm": 0.17202869057655334, + "learning_rate": 0.001, + "loss": 3.4426, + "num_input_tokens_seen": 1258291200, + "step": 4800 + }, + { + "epoch": 0.032623519539638414, + "grad_norm": 0.17097769677639008, + "learning_rate": 0.001, + "loss": 3.439, + "num_input_tokens_seen": 1271398400, + "step": 4850 + }, + { + "epoch": 0.03295984448334603, + "grad_norm": 0.1845879703760147, + "learning_rate": 0.001, + "loss": 3.4263, + "num_input_tokens_seen": 1284505600, + "step": 4900 + }, + { + "epoch": 0.03329616942705364, + "grad_norm": 0.18462544679641724, + "learning_rate": 0.001, + "loss": 3.4273, + "num_input_tokens_seen": 1297612800, + "step": 4950 + }, + { + "epoch": 0.033632494370761254, + "grad_norm": 0.18930740654468536, + "learning_rate": 0.001, + "loss": 3.4252, + "num_input_tokens_seen": 1310720000, + "step": 5000 + }, + { + "epoch": 0.033632494370761254, + "eval_loss": 3.3413195610046387, + "eval_runtime": 101.3099, + "eval_samples_per_second": 49.354, + "eval_steps_per_second": 12.338, + "num_input_tokens_seen": 1310720000, + "step": 5000 + }, + { + "epoch": 0.033968819314468865, + "grad_norm": 0.1741451472043991, + "learning_rate": 0.001, + "loss": 3.4216, + "num_input_tokens_seen": 1323827200, + "step": 5050 + }, + { + "epoch": 0.03430514425817648, + "grad_norm": 0.18628782033920288, + "learning_rate": 0.001, + "loss": 3.418, + "num_input_tokens_seen": 1336934400, + "step": 5100 + }, + { + "epoch": 0.034641469201884094, + "grad_norm": 0.17286072671413422, + "learning_rate": 0.001, + "loss": 3.4126, + "num_input_tokens_seen": 1350041600, + "step": 5150 + }, + { + "epoch": 0.034977794145591705, + "grad_norm": 0.18905507028102875, + "learning_rate": 0.001, + "loss": 3.4169, + "num_input_tokens_seen": 1363148800, + "step": 5200 + }, + { + "epoch": 0.035314119089299316, + "grad_norm": 0.17675209045410156, + "learning_rate": 0.001, + "loss": 3.4229, + "num_input_tokens_seen": 1376256000, + "step": 5250 + }, + { + "epoch": 0.03565044403300693, + "grad_norm": 0.19063080847263336, + "learning_rate": 0.001, + "loss": 3.4099, + "num_input_tokens_seen": 1389363200, + "step": 5300 + }, + { + "epoch": 0.035986768976714545, + "grad_norm": 0.1875571757555008, + "learning_rate": 0.001, + "loss": 3.4185, + "num_input_tokens_seen": 1402470400, + "step": 5350 + }, + { + "epoch": 0.036323093920422156, + "grad_norm": 1.0721184015274048, + "learning_rate": 0.001, + "loss": 3.4298, + "num_input_tokens_seen": 1415577600, + "step": 5400 + }, + { + "epoch": 0.03665941886412977, + "grad_norm": 0.1907675564289093, + "learning_rate": 0.001, + "loss": 3.4268, + "num_input_tokens_seen": 1428684800, + "step": 5450 + }, + { + "epoch": 0.03699574380783738, + "grad_norm": 0.18285758793354034, + "learning_rate": 0.001, + "loss": 3.4036, + "num_input_tokens_seen": 1441792000, + "step": 5500 + }, + { + "epoch": 0.03699574380783738, + "eval_loss": 3.31872820854187, + "eval_runtime": 101.3489, + "eval_samples_per_second": 49.335, + "eval_steps_per_second": 12.334, + "num_input_tokens_seen": 1441792000, + "step": 5500 + }, + { + "epoch": 0.037332068751544996, + "grad_norm": 0.18330691754817963, + "learning_rate": 0.001, + "loss": 3.3926, + "num_input_tokens_seen": 1454899200, + "step": 5550 + }, + { + "epoch": 0.03766839369525261, + "grad_norm": 0.16875725984573364, + "learning_rate": 0.001, + "loss": 3.3975, + "num_input_tokens_seen": 1468006400, + "step": 5600 + }, + { + "epoch": 0.03800471863896022, + "grad_norm": 0.18510381877422333, + "learning_rate": 0.001, + "loss": 3.3964, + "num_input_tokens_seen": 1481113600, + "step": 5650 + }, + { + "epoch": 0.03834104358266783, + "grad_norm": 0.16602838039398193, + "learning_rate": 0.001, + "loss": 3.3819, + "num_input_tokens_seen": 1494220800, + "step": 5700 + }, + { + "epoch": 0.03867736852637544, + "grad_norm": 0.16771391034126282, + "learning_rate": 0.001, + "loss": 3.3968, + "num_input_tokens_seen": 1507328000, + "step": 5750 + }, + { + "epoch": 0.03901369347008306, + "grad_norm": 0.16801221668720245, + "learning_rate": 0.001, + "loss": 3.3921, + "num_input_tokens_seen": 1520435200, + "step": 5800 + }, + { + "epoch": 0.03935001841379067, + "grad_norm": 0.16846245527267456, + "learning_rate": 0.001, + "loss": 3.3841, + "num_input_tokens_seen": 1533542400, + "step": 5850 + }, + { + "epoch": 0.03968634335749828, + "grad_norm": 0.17359821498394012, + "learning_rate": 0.001, + "loss": 3.372, + "num_input_tokens_seen": 1546649600, + "step": 5900 + }, + { + "epoch": 0.04002266830120589, + "grad_norm": 0.1578226536512375, + "learning_rate": 0.001, + "loss": 3.3826, + "num_input_tokens_seen": 1559756800, + "step": 5950 + }, + { + "epoch": 0.04035899324491351, + "grad_norm": 0.17228208482265472, + "learning_rate": 0.001, + "loss": 3.3953, + "num_input_tokens_seen": 1572864000, + "step": 6000 + }, + { + "epoch": 0.04035899324491351, + "eval_loss": 3.293433666229248, + "eval_runtime": 100.6763, + "eval_samples_per_second": 49.664, + "eval_steps_per_second": 12.416, + "num_input_tokens_seen": 1572864000, + "step": 6000 + }, + { + "epoch": 0.04069531818862112, + "grad_norm": 0.18091177940368652, + "learning_rate": 0.001, + "loss": 3.3783, + "num_input_tokens_seen": 1585971200, + "step": 6050 + }, + { + "epoch": 0.04103164313232873, + "grad_norm": 0.17565099895000458, + "learning_rate": 0.001, + "loss": 3.3773, + "num_input_tokens_seen": 1599078400, + "step": 6100 + }, + { + "epoch": 0.04136796807603634, + "grad_norm": 0.1635759323835373, + "learning_rate": 0.001, + "loss": 3.3823, + "num_input_tokens_seen": 1612185600, + "step": 6150 + }, + { + "epoch": 0.04170429301974395, + "grad_norm": 0.19144974648952484, + "learning_rate": 0.001, + "loss": 3.3665, + "num_input_tokens_seen": 1625292800, + "step": 6200 + }, + { + "epoch": 0.04204061796345157, + "grad_norm": 0.1741226762533188, + "learning_rate": 0.001, + "loss": 3.3637, + "num_input_tokens_seen": 1638400000, + "step": 6250 + }, + { + "epoch": 0.04237694290715918, + "grad_norm": 0.17072845995426178, + "learning_rate": 0.001, + "loss": 3.3627, + "num_input_tokens_seen": 1651507200, + "step": 6300 + }, + { + "epoch": 0.04271326785086679, + "grad_norm": 0.16942182183265686, + "learning_rate": 0.001, + "loss": 3.3729, + "num_input_tokens_seen": 1664614400, + "step": 6350 + }, + { + "epoch": 0.043049592794574404, + "grad_norm": 0.16412265598773956, + "learning_rate": 0.001, + "loss": 3.3616, + "num_input_tokens_seen": 1677721600, + "step": 6400 + }, + { + "epoch": 0.04338591773828202, + "grad_norm": 0.17044900357723236, + "learning_rate": 0.001, + "loss": 3.3574, + "num_input_tokens_seen": 1690828800, + "step": 6450 + }, + { + "epoch": 0.04372224268198963, + "grad_norm": 0.18034328520298004, + "learning_rate": 0.001, + "loss": 3.3625, + "num_input_tokens_seen": 1703936000, + "step": 6500 + }, + { + "epoch": 0.04372224268198963, + "eval_loss": 3.2744550704956055, + "eval_runtime": 101.1801, + "eval_samples_per_second": 49.417, + "eval_steps_per_second": 12.354, + "num_input_tokens_seen": 1703936000, + "step": 6500 + }, + { + "epoch": 0.044058567625697244, + "grad_norm": 0.169066920876503, + "learning_rate": 0.001, + "loss": 3.3569, + "num_input_tokens_seen": 1717043200, + "step": 6550 + }, + { + "epoch": 0.044394892569404855, + "grad_norm": 0.1789105087518692, + "learning_rate": 0.001, + "loss": 3.3636, + "num_input_tokens_seen": 1730150400, + "step": 6600 + }, + { + "epoch": 0.044731217513112466, + "grad_norm": 0.2083519697189331, + "learning_rate": 0.001, + "loss": 3.3565, + "num_input_tokens_seen": 1743257600, + "step": 6650 + }, + { + "epoch": 0.045067542456820084, + "grad_norm": 0.16989745199680328, + "learning_rate": 0.001, + "loss": 3.3562, + "num_input_tokens_seen": 1756364800, + "step": 6700 + }, + { + "epoch": 0.045403867400527695, + "grad_norm": 0.16275504231452942, + "learning_rate": 0.001, + "loss": 3.3625, + "num_input_tokens_seen": 1769472000, + "step": 6750 + }, + { + "epoch": 0.045740192344235306, + "grad_norm": 0.17771874368190765, + "learning_rate": 0.001, + "loss": 3.3477, + "num_input_tokens_seen": 1782579200, + "step": 6800 + }, + { + "epoch": 0.04607651728794292, + "grad_norm": 0.1635473072528839, + "learning_rate": 0.001, + "loss": 3.3513, + "num_input_tokens_seen": 1795686400, + "step": 6850 + }, + { + "epoch": 0.046412842231650535, + "grad_norm": 0.17198441922664642, + "learning_rate": 0.001, + "loss": 3.3461, + "num_input_tokens_seen": 1808793600, + "step": 6900 + }, + { + "epoch": 0.046749167175358146, + "grad_norm": 0.174327552318573, + "learning_rate": 0.001, + "loss": 3.3519, + "num_input_tokens_seen": 1821900800, + "step": 6950 + }, + { + "epoch": 0.04708549211906576, + "grad_norm": 0.17880085110664368, + "learning_rate": 0.001, + "loss": 3.3387, + "num_input_tokens_seen": 1835008000, + "step": 7000 + }, + { + "epoch": 0.04708549211906576, + "eval_loss": 3.2563092708587646, + "eval_runtime": 100.9949, + "eval_samples_per_second": 49.507, + "eval_steps_per_second": 12.377, + "num_input_tokens_seen": 1835008000, + "step": 7000 + }, + { + "epoch": 0.04742181706277337, + "grad_norm": 0.18561717867851257, + "learning_rate": 0.001, + "loss": 3.3436, + "num_input_tokens_seen": 1848115200, + "step": 7050 + }, + { + "epoch": 0.04775814200648098, + "grad_norm": 0.17194584012031555, + "learning_rate": 0.001, + "loss": 3.3391, + "num_input_tokens_seen": 1861222400, + "step": 7100 + }, + { + "epoch": 0.0480944669501886, + "grad_norm": 0.16629651188850403, + "learning_rate": 0.001, + "loss": 3.3419, + "num_input_tokens_seen": 1874329600, + "step": 7150 + }, + { + "epoch": 0.04843079189389621, + "grad_norm": 0.1665981113910675, + "learning_rate": 0.001, + "loss": 3.3422, + "num_input_tokens_seen": 1887436800, + "step": 7200 + }, + { + "epoch": 0.04876711683760382, + "grad_norm": 0.17213182151317596, + "learning_rate": 0.001, + "loss": 3.3389, + "num_input_tokens_seen": 1900544000, + "step": 7250 + }, + { + "epoch": 0.04910344178131143, + "grad_norm": 0.18480969965457916, + "learning_rate": 0.001, + "loss": 3.3257, + "num_input_tokens_seen": 1913651200, + "step": 7300 + }, + { + "epoch": 0.04943976672501905, + "grad_norm": 0.17105132341384888, + "learning_rate": 0.001, + "loss": 3.3339, + "num_input_tokens_seen": 1926758400, + "step": 7350 + }, + { + "epoch": 0.04977609166872666, + "grad_norm": 0.17547503113746643, + "learning_rate": 0.001, + "loss": 3.3364, + "num_input_tokens_seen": 1939865600, + "step": 7400 + }, + { + "epoch": 0.05011241661243427, + "grad_norm": 0.16320562362670898, + "learning_rate": 0.001, + "loss": 3.3349, + "num_input_tokens_seen": 1952972800, + "step": 7450 + }, + { + "epoch": 0.05044874155614188, + "grad_norm": 0.16704347729682922, + "learning_rate": 0.001, + "loss": 3.3459, + "num_input_tokens_seen": 1966080000, + "step": 7500 + }, + { + "epoch": 0.05044874155614188, + "eval_loss": 3.241454601287842, + "eval_runtime": 100.6752, + "eval_samples_per_second": 49.665, + "eval_steps_per_second": 12.416, + "num_input_tokens_seen": 1966080000, + "step": 7500 + }, + { + "epoch": 0.05078506649984949, + "grad_norm": 0.18855977058410645, + "learning_rate": 0.001, + "loss": 3.3298, + "num_input_tokens_seen": 1979187200, + "step": 7550 + }, + { + "epoch": 0.05112139144355711, + "grad_norm": 0.16146792471408844, + "learning_rate": 0.001, + "loss": 3.324, + "num_input_tokens_seen": 1992294400, + "step": 7600 + }, + { + "epoch": 0.05145771638726472, + "grad_norm": 0.1644527018070221, + "learning_rate": 0.001, + "loss": 3.3306, + "num_input_tokens_seen": 2005401600, + "step": 7650 + }, + { + "epoch": 0.05179404133097233, + "grad_norm": 0.17106670141220093, + "learning_rate": 0.001, + "loss": 3.33, + "num_input_tokens_seen": 2018508800, + "step": 7700 + }, + { + "epoch": 0.05213036627467994, + "grad_norm": 0.1606895476579666, + "learning_rate": 0.001, + "loss": 3.3188, + "num_input_tokens_seen": 2031616000, + "step": 7750 + }, + { + "epoch": 0.052466691218387554, + "grad_norm": 0.16948160529136658, + "learning_rate": 0.001, + "loss": 3.3306, + "num_input_tokens_seen": 2044723200, + "step": 7800 + }, + { + "epoch": 0.05280301616209517, + "grad_norm": 0.20683230459690094, + "learning_rate": 0.001, + "loss": 3.3203, + "num_input_tokens_seen": 2057830400, + "step": 7850 + }, + { + "epoch": 0.05313934110580278, + "grad_norm": 0.161922886967659, + "learning_rate": 0.001, + "loss": 3.3161, + "num_input_tokens_seen": 2070937600, + "step": 7900 + }, + { + "epoch": 0.053475666049510394, + "grad_norm": 0.1616695076227188, + "learning_rate": 0.001, + "loss": 3.3245, + "num_input_tokens_seen": 2084044800, + "step": 7950 + }, + { + "epoch": 0.053811990993218005, + "grad_norm": 0.1723030060529709, + "learning_rate": 0.001, + "loss": 3.3143, + "num_input_tokens_seen": 2097152000, + "step": 8000 + }, + { + "epoch": 0.053811990993218005, + "eval_loss": 3.2274885177612305, + "eval_runtime": 100.3755, + "eval_samples_per_second": 49.813, + "eval_steps_per_second": 12.453, + "num_input_tokens_seen": 2097152000, + "step": 8000 + }, + { + "epoch": 0.05414831593692562, + "grad_norm": 0.16236628592014313, + "learning_rate": 0.001, + "loss": 3.3214, + "num_input_tokens_seen": 2110259200, + "step": 8050 + }, + { + "epoch": 0.054484640880633234, + "grad_norm": 0.1676984280347824, + "learning_rate": 0.001, + "loss": 3.3152, + "num_input_tokens_seen": 2123366400, + "step": 8100 + }, + { + "epoch": 0.054820965824340845, + "grad_norm": 0.16020448505878448, + "learning_rate": 0.001, + "loss": 3.3134, + "num_input_tokens_seen": 2136473600, + "step": 8150 + }, + { + "epoch": 0.055157290768048456, + "grad_norm": 0.1649223119020462, + "learning_rate": 0.001, + "loss": 3.3121, + "num_input_tokens_seen": 2149580800, + "step": 8200 + }, + { + "epoch": 0.05549361571175607, + "grad_norm": 0.1627037674188614, + "learning_rate": 0.001, + "loss": 3.3092, + "num_input_tokens_seen": 2162688000, + "step": 8250 + }, + { + "epoch": 0.055829940655463685, + "grad_norm": 0.17913097143173218, + "learning_rate": 0.001, + "loss": 3.3032, + "num_input_tokens_seen": 2175795200, + "step": 8300 + }, + { + "epoch": 0.056166265599171296, + "grad_norm": 0.18965736031532288, + "learning_rate": 0.001, + "loss": 3.3075, + "num_input_tokens_seen": 2188902400, + "step": 8350 + }, + { + "epoch": 0.05650259054287891, + "grad_norm": 0.16027510166168213, + "learning_rate": 0.001, + "loss": 3.3086, + "num_input_tokens_seen": 2202009600, + "step": 8400 + }, + { + "epoch": 0.05683891548658652, + "grad_norm": 0.16940778493881226, + "learning_rate": 0.001, + "loss": 3.2849, + "num_input_tokens_seen": 2215116800, + "step": 8450 + }, + { + "epoch": 0.057175240430294136, + "grad_norm": 0.17754122614860535, + "learning_rate": 0.001, + "loss": 3.2975, + "num_input_tokens_seen": 2228224000, + "step": 8500 + }, + { + "epoch": 0.057175240430294136, + "eval_loss": 3.2149288654327393, + "eval_runtime": 101.5641, + "eval_samples_per_second": 49.23, + "eval_steps_per_second": 12.307, + "num_input_tokens_seen": 2228224000, + "step": 8500 + }, + { + "epoch": 0.05751156537400175, + "grad_norm": 0.1716330349445343, + "learning_rate": 0.001, + "loss": 3.3091, + "num_input_tokens_seen": 2241331200, + "step": 8550 + }, + { + "epoch": 0.05784789031770936, + "grad_norm": 0.16466470062732697, + "learning_rate": 0.001, + "loss": 3.2934, + "num_input_tokens_seen": 2254438400, + "step": 8600 + }, + { + "epoch": 0.05818421526141697, + "grad_norm": 0.1640830636024475, + "learning_rate": 0.001, + "loss": 3.2986, + "num_input_tokens_seen": 2267545600, + "step": 8650 + }, + { + "epoch": 0.05852054020512458, + "grad_norm": 0.16982024908065796, + "learning_rate": 0.001, + "loss": 3.3016, + "num_input_tokens_seen": 2280652800, + "step": 8700 + }, + { + "epoch": 0.0588568651488322, + "grad_norm": 0.1577749252319336, + "learning_rate": 0.001, + "loss": 3.2961, + "num_input_tokens_seen": 2293760000, + "step": 8750 + }, + { + "epoch": 0.05919319009253981, + "grad_norm": 0.1626594513654709, + "learning_rate": 0.001, + "loss": 3.293, + "num_input_tokens_seen": 2306867200, + "step": 8800 + }, + { + "epoch": 0.05952951503624742, + "grad_norm": 0.18469755351543427, + "learning_rate": 0.001, + "loss": 3.2919, + "num_input_tokens_seen": 2319974400, + "step": 8850 + }, + { + "epoch": 0.05986583997995503, + "grad_norm": 0.17915847897529602, + "learning_rate": 0.001, + "loss": 3.2914, + "num_input_tokens_seen": 2333081600, + "step": 8900 + }, + { + "epoch": 0.06020216492366265, + "grad_norm": 0.17483194172382355, + "learning_rate": 0.001, + "loss": 3.2855, + "num_input_tokens_seen": 2346188800, + "step": 8950 + }, + { + "epoch": 0.06053848986737026, + "grad_norm": 0.16408763825893402, + "learning_rate": 0.001, + "loss": 3.2817, + "num_input_tokens_seen": 2359296000, + "step": 9000 + }, + { + "epoch": 0.06053848986737026, + "eval_loss": 3.201568603515625, + "eval_runtime": 100.2925, + "eval_samples_per_second": 49.854, + "eval_steps_per_second": 12.464, + "num_input_tokens_seen": 2359296000, + "step": 9000 + }, + { + "epoch": 0.06087481481107787, + "grad_norm": 0.15520979464054108, + "learning_rate": 0.001, + "loss": 3.2917, + "num_input_tokens_seen": 2372403200, + "step": 9050 + }, + { + "epoch": 0.06121113975478548, + "grad_norm": 0.19632326066493988, + "learning_rate": 0.001, + "loss": 3.3164, + "num_input_tokens_seen": 2385510400, + "step": 9100 + }, + { + "epoch": 0.06154746469849309, + "grad_norm": 0.17335627973079681, + "learning_rate": 0.001, + "loss": 3.3025, + "num_input_tokens_seen": 2398617600, + "step": 9150 + }, + { + "epoch": 0.06188378964220071, + "grad_norm": 0.18116877973079681, + "learning_rate": 0.001, + "loss": 3.2857, + "num_input_tokens_seen": 2411724800, + "step": 9200 + }, + { + "epoch": 0.06222011458590832, + "grad_norm": 0.17199201881885529, + "learning_rate": 0.001, + "loss": 3.2901, + "num_input_tokens_seen": 2424832000, + "step": 9250 + }, + { + "epoch": 0.06255643952961594, + "grad_norm": 0.163723424077034, + "learning_rate": 0.001, + "loss": 3.2865, + "num_input_tokens_seen": 2437939200, + "step": 9300 + }, + { + "epoch": 0.06289276447332355, + "grad_norm": 0.17228147387504578, + "learning_rate": 0.001, + "loss": 3.2884, + "num_input_tokens_seen": 2451046400, + "step": 9350 + }, + { + "epoch": 0.06322908941703116, + "grad_norm": 0.1656276136636734, + "learning_rate": 0.001, + "loss": 3.2871, + "num_input_tokens_seen": 2464153600, + "step": 9400 + }, + { + "epoch": 0.06356541436073877, + "grad_norm": 0.16867949068546295, + "learning_rate": 0.001, + "loss": 3.2817, + "num_input_tokens_seen": 2477260800, + "step": 9450 + }, + { + "epoch": 0.06390173930444638, + "grad_norm": 0.17453493177890778, + "learning_rate": 0.001, + "loss": 3.2876, + "num_input_tokens_seen": 2490368000, + "step": 9500 + }, + { + "epoch": 0.06390173930444638, + "eval_loss": 3.190683364868164, + "eval_runtime": 100.5146, + "eval_samples_per_second": 49.744, + "eval_steps_per_second": 12.436, + "num_input_tokens_seen": 2490368000, + "step": 9500 + }, + { + "epoch": 0.064238064248154, + "grad_norm": 0.6719958186149597, + "learning_rate": 0.001, + "loss": 3.299, + "num_input_tokens_seen": 2503475200, + "step": 9550 + }, + { + "epoch": 0.0645743891918616, + "grad_norm": 0.19327396154403687, + "learning_rate": 0.001, + "loss": 3.2907, + "num_input_tokens_seen": 2516582400, + "step": 9600 + }, + { + "epoch": 0.06491071413556922, + "grad_norm": 0.17817369103431702, + "learning_rate": 0.001, + "loss": 3.2801, + "num_input_tokens_seen": 2529689600, + "step": 9650 + }, + { + "epoch": 0.06524703907927683, + "grad_norm": 0.16956672072410583, + "learning_rate": 0.001, + "loss": 3.2761, + "num_input_tokens_seen": 2542796800, + "step": 9700 + }, + { + "epoch": 0.06558336402298445, + "grad_norm": 0.1854093372821808, + "learning_rate": 0.001, + "loss": 3.27, + "num_input_tokens_seen": 2555904000, + "step": 9750 + }, + { + "epoch": 0.06591968896669206, + "grad_norm": 0.15702186524868011, + "learning_rate": 0.001, + "loss": 3.2792, + "num_input_tokens_seen": 2569011200, + "step": 9800 + }, + { + "epoch": 0.06625601391039967, + "grad_norm": 0.16380488872528076, + "learning_rate": 0.001, + "loss": 3.2672, + "num_input_tokens_seen": 2582118400, + "step": 9850 + }, + { + "epoch": 0.06659233885410729, + "grad_norm": 0.15908506512641907, + "learning_rate": 0.001, + "loss": 3.2665, + "num_input_tokens_seen": 2595225600, + "step": 9900 + }, + { + "epoch": 0.0669286637978149, + "grad_norm": 0.1654980629682541, + "learning_rate": 0.001, + "loss": 3.2726, + "num_input_tokens_seen": 2608332800, + "step": 9950 + }, + { + "epoch": 0.06726498874152251, + "grad_norm": 0.1780249923467636, + "learning_rate": 0.001, + "loss": 3.2632, + "num_input_tokens_seen": 2621440000, + "step": 10000 + }, + { + "epoch": 0.06726498874152251, + "eval_loss": 3.1775221824645996, + "eval_runtime": 100.6676, + "eval_samples_per_second": 49.668, + "eval_steps_per_second": 12.417, + "num_input_tokens_seen": 2621440000, + "step": 10000 + }, + { + "epoch": 0.06760131368523012, + "grad_norm": 0.1572471708059311, + "learning_rate": 0.001, + "loss": 3.2652, + "num_input_tokens_seen": 2634547200, + "step": 10050 + }, + { + "epoch": 0.06793763862893773, + "grad_norm": 0.1655690222978592, + "learning_rate": 0.001, + "loss": 3.2753, + "num_input_tokens_seen": 2647654400, + "step": 10100 + }, + { + "epoch": 0.06827396357264534, + "grad_norm": 0.18156391382217407, + "learning_rate": 0.001, + "loss": 3.2717, + "num_input_tokens_seen": 2660761600, + "step": 10150 + }, + { + "epoch": 0.06861028851635297, + "grad_norm": 0.1684606373310089, + "learning_rate": 0.001, + "loss": 3.269, + "num_input_tokens_seen": 2673868800, + "step": 10200 + }, + { + "epoch": 0.06894661346006058, + "grad_norm": 0.18199962377548218, + "learning_rate": 0.001, + "loss": 3.2591, + "num_input_tokens_seen": 2686976000, + "step": 10250 + }, + { + "epoch": 0.06928293840376819, + "grad_norm": 0.1662759929895401, + "learning_rate": 0.001, + "loss": 3.2674, + "num_input_tokens_seen": 2700083200, + "step": 10300 + }, + { + "epoch": 0.0696192633474758, + "grad_norm": 0.16799511015415192, + "learning_rate": 0.001, + "loss": 3.2788, + "num_input_tokens_seen": 2713190400, + "step": 10350 + }, + { + "epoch": 0.06995558829118341, + "grad_norm": 0.17926375567913055, + "learning_rate": 0.001, + "loss": 3.2742, + "num_input_tokens_seen": 2726297600, + "step": 10400 + }, + { + "epoch": 0.07029191323489102, + "grad_norm": 0.18057045340538025, + "learning_rate": 0.001, + "loss": 3.2662, + "num_input_tokens_seen": 2739404800, + "step": 10450 + }, + { + "epoch": 0.07062823817859863, + "grad_norm": 0.17588871717453003, + "learning_rate": 0.001, + "loss": 3.2577, + "num_input_tokens_seen": 2752512000, + "step": 10500 + }, + { + "epoch": 0.07062823817859863, + "eval_loss": 3.168182849884033, + "eval_runtime": 100.5761, + "eval_samples_per_second": 49.714, + "eval_steps_per_second": 12.428, + "num_input_tokens_seen": 2752512000, + "step": 10500 + }, + { + "epoch": 0.07096456312230624, + "grad_norm": 0.1731673628091812, + "learning_rate": 0.001, + "loss": 3.2565, + "num_input_tokens_seen": 2765619200, + "step": 10550 + }, + { + "epoch": 0.07130088806601385, + "grad_norm": 0.16532014310359955, + "learning_rate": 0.001, + "loss": 3.2511, + "num_input_tokens_seen": 2778726400, + "step": 10600 + }, + { + "epoch": 0.07163721300972148, + "grad_norm": 0.17818772792816162, + "learning_rate": 0.001, + "loss": 3.2541, + "num_input_tokens_seen": 2791833600, + "step": 10650 + }, + { + "epoch": 0.07197353795342909, + "grad_norm": 0.16863703727722168, + "learning_rate": 0.001, + "loss": 3.2485, + "num_input_tokens_seen": 2804940800, + "step": 10700 + }, + { + "epoch": 0.0723098628971367, + "grad_norm": 0.17316773533821106, + "learning_rate": 0.001, + "loss": 3.2583, + "num_input_tokens_seen": 2818048000, + "step": 10750 + }, + { + "epoch": 0.07264618784084431, + "grad_norm": 0.16366828978061676, + "learning_rate": 0.001, + "loss": 3.2502, + "num_input_tokens_seen": 2831155200, + "step": 10800 + }, + { + "epoch": 0.07298251278455192, + "grad_norm": 0.16141986846923828, + "learning_rate": 0.001, + "loss": 3.2537, + "num_input_tokens_seen": 2844262400, + "step": 10850 + }, + { + "epoch": 0.07331883772825953, + "grad_norm": 0.16185277700424194, + "learning_rate": 0.001, + "loss": 3.2453, + "num_input_tokens_seen": 2857369600, + "step": 10900 + }, + { + "epoch": 0.07365516267196714, + "grad_norm": 0.15637634694576263, + "learning_rate": 0.001, + "loss": 3.2562, + "num_input_tokens_seen": 2870476800, + "step": 10950 + }, + { + "epoch": 0.07399148761567476, + "grad_norm": 0.16142712533473969, + "learning_rate": 0.001, + "loss": 3.2427, + "num_input_tokens_seen": 2883584000, + "step": 11000 + }, + { + "epoch": 0.07399148761567476, + "eval_loss": 3.1591553688049316, + "eval_runtime": 100.7501, + "eval_samples_per_second": 49.628, + "eval_steps_per_second": 12.407, + "num_input_tokens_seen": 2883584000, + "step": 11000 + }, + { + "epoch": 0.07432781255938237, + "grad_norm": 0.16285482048988342, + "learning_rate": 0.001, + "loss": 3.2556, + "num_input_tokens_seen": 2896691200, + "step": 11050 + }, + { + "epoch": 0.07466413750308999, + "grad_norm": 0.1800818145275116, + "learning_rate": 0.001, + "loss": 3.2498, + "num_input_tokens_seen": 2909798400, + "step": 11100 + }, + { + "epoch": 0.0750004624467976, + "grad_norm": 0.1587436944246292, + "learning_rate": 0.001, + "loss": 3.2454, + "num_input_tokens_seen": 2922905600, + "step": 11150 + }, + { + "epoch": 0.07533678739050521, + "grad_norm": 0.17776361107826233, + "learning_rate": 0.001, + "loss": 3.2671, + "num_input_tokens_seen": 2936012800, + "step": 11200 + }, + { + "epoch": 0.07567311233421282, + "grad_norm": 0.16090282797813416, + "learning_rate": 0.001, + "loss": 3.2582, + "num_input_tokens_seen": 2949120000, + "step": 11250 + }, + { + "epoch": 0.07600943727792044, + "grad_norm": 0.1685740053653717, + "learning_rate": 0.001, + "loss": 3.2485, + "num_input_tokens_seen": 2962227200, + "step": 11300 + }, + { + "epoch": 0.07634576222162805, + "grad_norm": 0.16622695326805115, + "learning_rate": 0.001, + "loss": 3.2517, + "num_input_tokens_seen": 2975334400, + "step": 11350 + }, + { + "epoch": 0.07668208716533566, + "grad_norm": 0.2576703131198883, + "learning_rate": 0.001, + "loss": 3.2536, + "num_input_tokens_seen": 2988441600, + "step": 11400 + }, + { + "epoch": 0.07701841210904327, + "grad_norm": 0.16928231716156006, + "learning_rate": 0.001, + "loss": 3.234, + "num_input_tokens_seen": 3001548800, + "step": 11450 + }, + { + "epoch": 0.07735473705275088, + "grad_norm": 0.16732951998710632, + "learning_rate": 0.001, + "loss": 3.2421, + "num_input_tokens_seen": 3014656000, + "step": 11500 + }, + { + "epoch": 0.07735473705275088, + "eval_loss": 3.1493282318115234, + "eval_runtime": 100.6579, + "eval_samples_per_second": 49.673, + "eval_steps_per_second": 12.418, + "num_input_tokens_seen": 3014656000, + "step": 11500 + }, + { + "epoch": 0.0776910619964585, + "grad_norm": 0.1627015471458435, + "learning_rate": 0.001, + "loss": 3.2449, + "num_input_tokens_seen": 3027763200, + "step": 11550 + }, + { + "epoch": 0.07802738694016612, + "grad_norm": 0.1591007262468338, + "learning_rate": 0.001, + "loss": 3.2405, + "num_input_tokens_seen": 3040870400, + "step": 11600 + }, + { + "epoch": 0.07836371188387373, + "grad_norm": 0.16861042380332947, + "learning_rate": 0.001, + "loss": 3.2371, + "num_input_tokens_seen": 3053977600, + "step": 11650 + }, + { + "epoch": 0.07870003682758134, + "grad_norm": 0.17942191660404205, + "learning_rate": 0.001, + "loss": 3.2401, + "num_input_tokens_seen": 3067084800, + "step": 11700 + }, + { + "epoch": 0.07903636177128895, + "grad_norm": 0.19918367266654968, + "learning_rate": 0.001, + "loss": 3.2522, + "num_input_tokens_seen": 3080192000, + "step": 11750 + }, + { + "epoch": 0.07937268671499656, + "grad_norm": 0.20974946022033691, + "learning_rate": 0.001, + "loss": 3.2476, + "num_input_tokens_seen": 3093299200, + "step": 11800 + }, + { + "epoch": 0.07970901165870417, + "grad_norm": 0.17063277959823608, + "learning_rate": 0.001, + "loss": 3.2461, + "num_input_tokens_seen": 3106406400, + "step": 11850 + }, + { + "epoch": 0.08004533660241178, + "grad_norm": 0.17285390198230743, + "learning_rate": 0.001, + "loss": 3.2389, + "num_input_tokens_seen": 3119513600, + "step": 11900 + }, + { + "epoch": 0.08038166154611939, + "grad_norm": 0.16399264335632324, + "learning_rate": 0.001, + "loss": 3.2354, + "num_input_tokens_seen": 3132620800, + "step": 11950 + }, + { + "epoch": 0.08071798648982702, + "grad_norm": 0.17166489362716675, + "learning_rate": 0.001, + "loss": 3.2393, + "num_input_tokens_seen": 3145728000, + "step": 12000 + }, + { + "epoch": 0.08071798648982702, + "eval_loss": 3.1431546211242676, + "eval_runtime": 100.4686, + "eval_samples_per_second": 49.767, + "eval_steps_per_second": 12.442, + "num_input_tokens_seen": 3145728000, + "step": 12000 + }, + { + "epoch": 0.08105431143353463, + "grad_norm": 0.16976477205753326, + "learning_rate": 0.001, + "loss": 3.2367, + "num_input_tokens_seen": 3158835200, + "step": 12050 + }, + { + "epoch": 0.08139063637724224, + "grad_norm": 0.17778240144252777, + "learning_rate": 0.001, + "loss": 3.2346, + "num_input_tokens_seen": 3171942400, + "step": 12100 + }, + { + "epoch": 0.08172696132094985, + "grad_norm": 0.17096461355686188, + "learning_rate": 0.001, + "loss": 3.235, + "num_input_tokens_seen": 3185049600, + "step": 12150 + }, + { + "epoch": 0.08206328626465746, + "grad_norm": 0.16154351830482483, + "learning_rate": 0.001, + "loss": 3.2263, + "num_input_tokens_seen": 3198156800, + "step": 12200 + }, + { + "epoch": 0.08239961120836507, + "grad_norm": 0.23045915365219116, + "learning_rate": 0.001, + "loss": 3.2295, + "num_input_tokens_seen": 3211264000, + "step": 12250 + }, + { + "epoch": 0.08273593615207268, + "grad_norm": 0.17755016684532166, + "learning_rate": 0.001, + "loss": 3.2465, + "num_input_tokens_seen": 3224371200, + "step": 12300 + }, + { + "epoch": 0.0830722610957803, + "grad_norm": 0.17216768860816956, + "learning_rate": 0.001, + "loss": 3.2353, + "num_input_tokens_seen": 3237478400, + "step": 12350 + }, + { + "epoch": 0.0834085860394879, + "grad_norm": 0.166086345911026, + "learning_rate": 0.001, + "loss": 3.231, + "num_input_tokens_seen": 3250585600, + "step": 12400 + }, + { + "epoch": 0.08374491098319553, + "grad_norm": 0.1681985855102539, + "learning_rate": 0.001, + "loss": 3.2343, + "num_input_tokens_seen": 3263692800, + "step": 12450 + }, + { + "epoch": 0.08408123592690314, + "grad_norm": 0.1611029952764511, + "learning_rate": 0.001, + "loss": 3.2386, + "num_input_tokens_seen": 3276800000, + "step": 12500 + }, + { + "epoch": 0.08408123592690314, + "eval_loss": 3.135470390319824, + "eval_runtime": 99.9587, + "eval_samples_per_second": 50.021, + "eval_steps_per_second": 12.505, + "num_input_tokens_seen": 3276800000, + "step": 12500 + }, + { + "epoch": 0.08441756087061075, + "grad_norm": 0.16823448240756989, + "learning_rate": 0.001, + "loss": 3.2315, + "num_input_tokens_seen": 3289907200, + "step": 12550 + }, + { + "epoch": 0.08475388581431836, + "grad_norm": 0.17325358092784882, + "learning_rate": 0.001, + "loss": 3.2258, + "num_input_tokens_seen": 3303014400, + "step": 12600 + }, + { + "epoch": 0.08509021075802597, + "grad_norm": 0.16828718781471252, + "learning_rate": 0.001, + "loss": 3.2251, + "num_input_tokens_seen": 3316121600, + "step": 12650 + }, + { + "epoch": 0.08542653570173359, + "grad_norm": 0.3836762309074402, + "learning_rate": 0.001, + "loss": 3.2279, + "num_input_tokens_seen": 3329228800, + "step": 12700 + }, + { + "epoch": 0.0857628606454412, + "grad_norm": 0.17255236208438873, + "learning_rate": 0.001, + "loss": 3.221, + "num_input_tokens_seen": 3342336000, + "step": 12750 + }, + { + "epoch": 0.08609918558914881, + "grad_norm": 0.2381184846162796, + "learning_rate": 0.001, + "loss": 3.2228, + "num_input_tokens_seen": 3355443200, + "step": 12800 + }, + { + "epoch": 0.08643551053285642, + "grad_norm": 0.3065573573112488, + "learning_rate": 0.001, + "loss": 3.2251, + "num_input_tokens_seen": 3368550400, + "step": 12850 + }, + { + "epoch": 0.08677183547656404, + "grad_norm": 0.1801990419626236, + "learning_rate": 0.001, + "loss": 3.2488, + "num_input_tokens_seen": 3381657600, + "step": 12900 + }, + { + "epoch": 0.08710816042027165, + "grad_norm": 0.17388571798801422, + "learning_rate": 0.001, + "loss": 3.2309, + "num_input_tokens_seen": 3394764800, + "step": 12950 + }, + { + "epoch": 0.08744448536397927, + "grad_norm": 0.16619688272476196, + "learning_rate": 0.001, + "loss": 3.2158, + "num_input_tokens_seen": 3407872000, + "step": 13000 + }, + { + "epoch": 0.08744448536397927, + "eval_loss": 3.128695011138916, + "eval_runtime": 101.514, + "eval_samples_per_second": 49.254, + "eval_steps_per_second": 12.314, + "num_input_tokens_seen": 3407872000, + "step": 13000 + }, + { + "epoch": 0.08778081030768688, + "grad_norm": 0.16921883821487427, + "learning_rate": 0.001, + "loss": 3.2157, + "num_input_tokens_seen": 3420979200, + "step": 13050 + }, + { + "epoch": 0.08811713525139449, + "grad_norm": 0.16760320961475372, + "learning_rate": 0.001, + "loss": 3.2057, + "num_input_tokens_seen": 3434086400, + "step": 13100 + }, + { + "epoch": 0.0884534601951021, + "grad_norm": 0.16922198235988617, + "learning_rate": 0.001, + "loss": 3.2281, + "num_input_tokens_seen": 3447193600, + "step": 13150 + }, + { + "epoch": 0.08878978513880971, + "grad_norm": 0.1857660859823227, + "learning_rate": 0.001, + "loss": 3.2107, + "num_input_tokens_seen": 3460300800, + "step": 13200 + }, + { + "epoch": 0.08912611008251732, + "grad_norm": 0.1746143400669098, + "learning_rate": 0.001, + "loss": 3.217, + "num_input_tokens_seen": 3473408000, + "step": 13250 + }, + { + "epoch": 0.08946243502622493, + "grad_norm": 0.16841556131839752, + "learning_rate": 0.001, + "loss": 3.2241, + "num_input_tokens_seen": 3486515200, + "step": 13300 + }, + { + "epoch": 0.08979875996993256, + "grad_norm": 0.1724822223186493, + "learning_rate": 0.001, + "loss": 3.2158, + "num_input_tokens_seen": 3499622400, + "step": 13350 + }, + { + "epoch": 0.09013508491364017, + "grad_norm": 0.17045529186725616, + "learning_rate": 0.001, + "loss": 3.2135, + "num_input_tokens_seen": 3512729600, + "step": 13400 + }, + { + "epoch": 0.09047140985734778, + "grad_norm": 0.18234893679618835, + "learning_rate": 0.001, + "loss": 3.2109, + "num_input_tokens_seen": 3525836800, + "step": 13450 + }, + { + "epoch": 0.09080773480105539, + "grad_norm": 0.16932611167430878, + "learning_rate": 0.001, + "loss": 3.2117, + "num_input_tokens_seen": 3538944000, + "step": 13500 + }, + { + "epoch": 0.09080773480105539, + "eval_loss": 3.1214168071746826, + "eval_runtime": 100.8175, + "eval_samples_per_second": 49.595, + "eval_steps_per_second": 12.399, + "num_input_tokens_seen": 3538944000, + "step": 13500 + }, + { + "epoch": 0.091144059744763, + "grad_norm": 0.17800532281398773, + "learning_rate": 0.001, + "loss": 3.2005, + "num_input_tokens_seen": 3552051200, + "step": 13550 + }, + { + "epoch": 0.09148038468847061, + "grad_norm": 0.15552346408367157, + "learning_rate": 0.001, + "loss": 3.2165, + "num_input_tokens_seen": 3565158400, + "step": 13600 + }, + { + "epoch": 0.09181670963217822, + "grad_norm": 0.1732388734817505, + "learning_rate": 0.001, + "loss": 3.2159, + "num_input_tokens_seen": 3578265600, + "step": 13650 + }, + { + "epoch": 0.09215303457588583, + "grad_norm": 0.17064529657363892, + "learning_rate": 0.001, + "loss": 3.2213, + "num_input_tokens_seen": 3591372800, + "step": 13700 + }, + { + "epoch": 0.09248935951959344, + "grad_norm": 0.18150164186954498, + "learning_rate": 0.001, + "loss": 3.2207, + "num_input_tokens_seen": 3604480000, + "step": 13750 + }, + { + "epoch": 0.09282568446330107, + "grad_norm": 0.16305723786354065, + "learning_rate": 0.001, + "loss": 3.2112, + "num_input_tokens_seen": 3617587200, + "step": 13800 + }, + { + "epoch": 0.09316200940700868, + "grad_norm": 0.17140090465545654, + "learning_rate": 0.001, + "loss": 3.2064, + "num_input_tokens_seen": 3630694400, + "step": 13850 + }, + { + "epoch": 0.09349833435071629, + "grad_norm": 0.3770304024219513, + "learning_rate": 0.001, + "loss": 3.2129, + "num_input_tokens_seen": 3643801600, + "step": 13900 + }, + { + "epoch": 0.0938346592944239, + "grad_norm": 0.15605700016021729, + "learning_rate": 0.001, + "loss": 3.2194, + "num_input_tokens_seen": 3656908800, + "step": 13950 + }, + { + "epoch": 0.09417098423813151, + "grad_norm": 0.18392467498779297, + "learning_rate": 0.001, + "loss": 3.2057, + "num_input_tokens_seen": 3670016000, + "step": 14000 + }, + { + "epoch": 0.09417098423813151, + "eval_loss": 3.1151933670043945, + "eval_runtime": 100.6439, + "eval_samples_per_second": 49.68, + "eval_steps_per_second": 12.42, + "num_input_tokens_seen": 3670016000, + "step": 14000 + }, + { + "epoch": 0.09450730918183912, + "grad_norm": 0.17042067646980286, + "learning_rate": 0.001, + "loss": 3.2079, + "num_input_tokens_seen": 3683123200, + "step": 14050 + }, + { + "epoch": 0.09484363412554674, + "grad_norm": 0.1771795153617859, + "learning_rate": 0.001, + "loss": 3.2074, + "num_input_tokens_seen": 3696230400, + "step": 14100 + }, + { + "epoch": 0.09517995906925435, + "grad_norm": 0.18254883587360382, + "learning_rate": 0.001, + "loss": 3.1999, + "num_input_tokens_seen": 3709337600, + "step": 14150 + }, + { + "epoch": 0.09551628401296196, + "grad_norm": 0.17174501717090607, + "learning_rate": 0.001, + "loss": 3.2066, + "num_input_tokens_seen": 3722444800, + "step": 14200 + }, + { + "epoch": 0.09585260895666958, + "grad_norm": 0.15733762085437775, + "learning_rate": 0.001, + "loss": 3.1931, + "num_input_tokens_seen": 3735552000, + "step": 14250 + }, + { + "epoch": 0.0961889339003772, + "grad_norm": 0.17221161723136902, + "learning_rate": 0.001, + "loss": 3.2055, + "num_input_tokens_seen": 3748659200, + "step": 14300 + }, + { + "epoch": 0.0965252588440848, + "grad_norm": 0.2117476761341095, + "learning_rate": 0.001, + "loss": 3.2046, + "num_input_tokens_seen": 3761766400, + "step": 14350 + }, + { + "epoch": 0.09686158378779242, + "grad_norm": 0.19019798934459686, + "learning_rate": 0.001, + "loss": 3.2086, + "num_input_tokens_seen": 3774873600, + "step": 14400 + }, + { + "epoch": 0.09719790873150003, + "grad_norm": 0.1791025549173355, + "learning_rate": 0.001, + "loss": 3.2002, + "num_input_tokens_seen": 3787980800, + "step": 14450 + }, + { + "epoch": 0.09753423367520764, + "grad_norm": 0.1800592541694641, + "learning_rate": 0.001, + "loss": 3.2121, + "num_input_tokens_seen": 3801088000, + "step": 14500 + }, + { + "epoch": 0.09753423367520764, + "eval_loss": 3.1071391105651855, + "eval_runtime": 101.3171, + "eval_samples_per_second": 49.35, + "eval_steps_per_second": 12.337, + "num_input_tokens_seen": 3801088000, + "step": 14500 + }, + { + "epoch": 0.09787055861891525, + "grad_norm": 0.15765570104122162, + "learning_rate": 0.001, + "loss": 3.2046, + "num_input_tokens_seen": 3814195200, + "step": 14550 + }, + { + "epoch": 0.09820688356262286, + "grad_norm": 0.16369874775409698, + "learning_rate": 0.001, + "loss": 3.2059, + "num_input_tokens_seen": 3827302400, + "step": 14600 + }, + { + "epoch": 0.09854320850633047, + "grad_norm": 0.16802681982517242, + "learning_rate": 0.001, + "loss": 3.1967, + "num_input_tokens_seen": 3840409600, + "step": 14650 + }, + { + "epoch": 0.0988795334500381, + "grad_norm": 0.1722741425037384, + "learning_rate": 0.001, + "loss": 3.2189, + "num_input_tokens_seen": 3853516800, + "step": 14700 + }, + { + "epoch": 0.0992158583937457, + "grad_norm": 0.17442888021469116, + "learning_rate": 0.001, + "loss": 3.2079, + "num_input_tokens_seen": 3866624000, + "step": 14750 + }, + { + "epoch": 0.09955218333745332, + "grad_norm": 0.18931840360164642, + "learning_rate": 0.001, + "loss": 3.1978, + "num_input_tokens_seen": 3879731200, + "step": 14800 + }, + { + "epoch": 0.09988850828116093, + "grad_norm": 0.17893177270889282, + "learning_rate": 0.001, + "loss": 3.202, + "num_input_tokens_seen": 3892838400, + "step": 14850 + }, + { + "epoch": 0.10022483322486854, + "grad_norm": 0.18453757464885712, + "learning_rate": 0.001, + "loss": 3.2004, + "num_input_tokens_seen": 3905945600, + "step": 14900 + }, + { + "epoch": 0.10056115816857615, + "grad_norm": 0.17419569194316864, + "learning_rate": 0.001, + "loss": 3.1936, + "num_input_tokens_seen": 3919052800, + "step": 14950 + }, + { + "epoch": 0.10089748311228376, + "grad_norm": 0.1765667200088501, + "learning_rate": 0.001, + "loss": 3.2015, + "num_input_tokens_seen": 3932160000, + "step": 15000 + }, + { + "epoch": 0.10089748311228376, + "eval_loss": 3.1014840602874756, + "eval_runtime": 100.8594, + "eval_samples_per_second": 49.574, + "eval_steps_per_second": 12.393, + "num_input_tokens_seen": 3932160000, + "step": 15000 + }, + { + "epoch": 0.10123380805599137, + "grad_norm": 0.18003995716571808, + "learning_rate": 0.001, + "loss": 3.2003, + "num_input_tokens_seen": 3945267200, + "step": 15050 + }, + { + "epoch": 0.10157013299969898, + "grad_norm": 0.16458339989185333, + "learning_rate": 0.001, + "loss": 3.1917, + "num_input_tokens_seen": 3958374400, + "step": 15100 + }, + { + "epoch": 0.1019064579434066, + "grad_norm": 0.6094233393669128, + "learning_rate": 0.001, + "loss": 3.2255, + "num_input_tokens_seen": 3971481600, + "step": 15150 + }, + { + "epoch": 0.10224278288711422, + "grad_norm": 0.20225363969802856, + "learning_rate": 0.001, + "loss": 3.2114, + "num_input_tokens_seen": 3984588800, + "step": 15200 + }, + { + "epoch": 0.10257910783082183, + "grad_norm": 0.19048044085502625, + "learning_rate": 0.001, + "loss": 3.2135, + "num_input_tokens_seen": 3997696000, + "step": 15250 + }, + { + "epoch": 0.10291543277452944, + "grad_norm": 0.17177866399288177, + "learning_rate": 0.001, + "loss": 3.1954, + "num_input_tokens_seen": 4010803200, + "step": 15300 + }, + { + "epoch": 0.10325175771823705, + "grad_norm": 0.17647191882133484, + "learning_rate": 0.001, + "loss": 3.2036, + "num_input_tokens_seen": 4023910400, + "step": 15350 + }, + { + "epoch": 0.10358808266194466, + "grad_norm": 0.16163323819637299, + "learning_rate": 0.001, + "loss": 3.1913, + "num_input_tokens_seen": 4037017600, + "step": 15400 + }, + { + "epoch": 0.10392440760565227, + "grad_norm": 0.18218201398849487, + "learning_rate": 0.001, + "loss": 3.1963, + "num_input_tokens_seen": 4050124800, + "step": 15450 + }, + { + "epoch": 0.10426073254935989, + "grad_norm": 0.17650413513183594, + "learning_rate": 0.001, + "loss": 3.1925, + "num_input_tokens_seen": 4063232000, + "step": 15500 + }, + { + "epoch": 0.10426073254935989, + "eval_loss": 3.0995914936065674, + "eval_runtime": 99.8658, + "eval_samples_per_second": 50.067, + "eval_steps_per_second": 12.517, + "num_input_tokens_seen": 4063232000, + "step": 15500 + }, + { + "epoch": 0.1045970574930675, + "grad_norm": 0.1616327166557312, + "learning_rate": 0.001, + "loss": 3.1924, + "num_input_tokens_seen": 4076339200, + "step": 15550 + }, + { + "epoch": 0.10493338243677511, + "grad_norm": 0.16149432957172394, + "learning_rate": 0.001, + "loss": 3.189, + "num_input_tokens_seen": 4089446400, + "step": 15600 + }, + { + "epoch": 0.10526970738048273, + "grad_norm": 0.2035779058933258, + "learning_rate": 0.001, + "loss": 3.1927, + "num_input_tokens_seen": 4102553600, + "step": 15650 + }, + { + "epoch": 0.10560603232419034, + "grad_norm": 0.16653041541576385, + "learning_rate": 0.001, + "loss": 3.1874, + "num_input_tokens_seen": 4115660800, + "step": 15700 + }, + { + "epoch": 0.10594235726789795, + "grad_norm": 0.16677066683769226, + "learning_rate": 0.001, + "loss": 3.1831, + "num_input_tokens_seen": 4128768000, + "step": 15750 + }, + { + "epoch": 0.10627868221160557, + "grad_norm": 0.17420975863933563, + "learning_rate": 0.001, + "loss": 3.1933, + "num_input_tokens_seen": 4141875200, + "step": 15800 + }, + { + "epoch": 0.10661500715531318, + "grad_norm": 0.16593104600906372, + "learning_rate": 0.001, + "loss": 3.1869, + "num_input_tokens_seen": 4154982400, + "step": 15850 + }, + { + "epoch": 0.10695133209902079, + "grad_norm": 0.18399874866008759, + "learning_rate": 0.001, + "loss": 3.1894, + "num_input_tokens_seen": 4168089600, + "step": 15900 + }, + { + "epoch": 0.1072876570427284, + "grad_norm": 0.15823860466480255, + "learning_rate": 0.001, + "loss": 3.1887, + "num_input_tokens_seen": 4181196800, + "step": 15950 + }, + { + "epoch": 0.10762398198643601, + "grad_norm": 0.18964843451976776, + "learning_rate": 0.001, + "loss": 3.1796, + "num_input_tokens_seen": 4194304000, + "step": 16000 + }, + { + "epoch": 0.10762398198643601, + "eval_loss": 3.0902183055877686, + "eval_runtime": 100.8295, + "eval_samples_per_second": 49.589, + "eval_steps_per_second": 12.397, + "num_input_tokens_seen": 4194304000, + "step": 16000 + }, + { + "epoch": 0.10796030693014362, + "grad_norm": 0.1692574918270111, + "learning_rate": 0.001, + "loss": 3.1892, + "num_input_tokens_seen": 4207411200, + "step": 16050 + }, + { + "epoch": 0.10829663187385125, + "grad_norm": 0.162678524851799, + "learning_rate": 0.001, + "loss": 3.1849, + "num_input_tokens_seen": 4220518400, + "step": 16100 + }, + { + "epoch": 0.10863295681755886, + "grad_norm": 0.16249045729637146, + "learning_rate": 0.001, + "loss": 3.1849, + "num_input_tokens_seen": 4233625600, + "step": 16150 + }, + { + "epoch": 0.10896928176126647, + "grad_norm": 0.17242908477783203, + "learning_rate": 0.001, + "loss": 3.1791, + "num_input_tokens_seen": 4246732800, + "step": 16200 + }, + { + "epoch": 0.10930560670497408, + "grad_norm": 0.15996769070625305, + "learning_rate": 0.001, + "loss": 3.1812, + "num_input_tokens_seen": 4259840000, + "step": 16250 + }, + { + "epoch": 0.10964193164868169, + "grad_norm": 0.1693849265575409, + "learning_rate": 0.001, + "loss": 3.1762, + "num_input_tokens_seen": 4272947200, + "step": 16300 + }, + { + "epoch": 0.1099782565923893, + "grad_norm": 0.1593247950077057, + "learning_rate": 0.001, + "loss": 3.1806, + "num_input_tokens_seen": 4286054400, + "step": 16350 + }, + { + "epoch": 0.11031458153609691, + "grad_norm": 0.16207775473594666, + "learning_rate": 0.001, + "loss": 3.175, + "num_input_tokens_seen": 4299161600, + "step": 16400 + }, + { + "epoch": 0.11065090647980452, + "grad_norm": 0.17720963060855865, + "learning_rate": 0.001, + "loss": 3.1834, + "num_input_tokens_seen": 4312268800, + "step": 16450 + }, + { + "epoch": 0.11098723142351213, + "grad_norm": 0.1996976137161255, + "learning_rate": 0.001, + "loss": 3.211, + "num_input_tokens_seen": 4325376000, + "step": 16500 + }, + { + "epoch": 0.11098723142351213, + "eval_loss": 3.098728895187378, + "eval_runtime": 101.226, + "eval_samples_per_second": 49.394, + "eval_steps_per_second": 12.349, + "num_input_tokens_seen": 4325376000, + "step": 16500 + }, + { + "epoch": 0.11132355636721976, + "grad_norm": 0.1731133908033371, + "learning_rate": 0.001, + "loss": 3.1828, + "num_input_tokens_seen": 4338483200, + "step": 16550 + }, + { + "epoch": 0.11165988131092737, + "grad_norm": 0.21048209071159363, + "learning_rate": 0.001, + "loss": 3.1843, + "num_input_tokens_seen": 4351590400, + "step": 16600 + }, + { + "epoch": 0.11199620625463498, + "grad_norm": 0.18280939757823944, + "learning_rate": 0.001, + "loss": 3.1904, + "num_input_tokens_seen": 4364697600, + "step": 16650 + }, + { + "epoch": 0.11233253119834259, + "grad_norm": 0.15612006187438965, + "learning_rate": 0.001, + "loss": 3.1795, + "num_input_tokens_seen": 4377804800, + "step": 16700 + }, + { + "epoch": 0.1126688561420502, + "grad_norm": 0.17242297530174255, + "learning_rate": 0.001, + "loss": 3.1727, + "num_input_tokens_seen": 4390912000, + "step": 16750 + }, + { + "epoch": 0.11300518108575781, + "grad_norm": 0.170341357588768, + "learning_rate": 0.001, + "loss": 3.1828, + "num_input_tokens_seen": 4404019200, + "step": 16800 + }, + { + "epoch": 0.11334150602946542, + "grad_norm": 0.17627349495887756, + "learning_rate": 0.001, + "loss": 3.1946, + "num_input_tokens_seen": 4417126400, + "step": 16850 + }, + { + "epoch": 0.11367783097317304, + "grad_norm": 0.19702504575252533, + "learning_rate": 0.001, + "loss": 3.1737, + "num_input_tokens_seen": 4430233600, + "step": 16900 + }, + { + "epoch": 0.11401415591688065, + "grad_norm": 0.170149028301239, + "learning_rate": 0.001, + "loss": 3.188, + "num_input_tokens_seen": 4443340800, + "step": 16950 + }, + { + "epoch": 0.11435048086058827, + "grad_norm": 0.1967497169971466, + "learning_rate": 0.001, + "loss": 3.1778, + "num_input_tokens_seen": 4456448000, + "step": 17000 + }, + { + "epoch": 0.11435048086058827, + "eval_loss": 3.084319829940796, + "eval_runtime": 101.2745, + "eval_samples_per_second": 49.371, + "eval_steps_per_second": 12.343, + "num_input_tokens_seen": 4456448000, + "step": 17000 + }, + { + "epoch": 0.11468680580429588, + "grad_norm": 0.17489473521709442, + "learning_rate": 0.001, + "loss": 3.1781, + "num_input_tokens_seen": 4469555200, + "step": 17050 + }, + { + "epoch": 0.1150231307480035, + "grad_norm": 0.17033468186855316, + "learning_rate": 0.001, + "loss": 3.1787, + "num_input_tokens_seen": 4482662400, + "step": 17100 + }, + { + "epoch": 0.1153594556917111, + "grad_norm": 0.16838806867599487, + "learning_rate": 0.001, + "loss": 3.1819, + "num_input_tokens_seen": 4495769600, + "step": 17150 + }, + { + "epoch": 0.11569578063541872, + "grad_norm": 0.18173356354236603, + "learning_rate": 0.001, + "loss": 3.1663, + "num_input_tokens_seen": 4508876800, + "step": 17200 + }, + { + "epoch": 0.11603210557912633, + "grad_norm": 0.17072565853595734, + "learning_rate": 0.001, + "loss": 3.1777, + "num_input_tokens_seen": 4521984000, + "step": 17250 + }, + { + "epoch": 0.11636843052283394, + "grad_norm": 0.17745070159435272, + "learning_rate": 0.001, + "loss": 3.1708, + "num_input_tokens_seen": 4535091200, + "step": 17300 + }, + { + "epoch": 0.11670475546654155, + "grad_norm": 0.16486075520515442, + "learning_rate": 0.001, + "loss": 3.1698, + "num_input_tokens_seen": 4548198400, + "step": 17350 + }, + { + "epoch": 0.11704108041024916, + "grad_norm": 0.1572778970003128, + "learning_rate": 0.001, + "loss": 3.1742, + "num_input_tokens_seen": 4561305600, + "step": 17400 + }, + { + "epoch": 0.11737740535395678, + "grad_norm": 0.17188695073127747, + "learning_rate": 0.001, + "loss": 3.1779, + "num_input_tokens_seen": 4574412800, + "step": 17450 + }, + { + "epoch": 0.1177137302976644, + "grad_norm": 0.16766607761383057, + "learning_rate": 0.001, + "loss": 3.1717, + "num_input_tokens_seen": 4587520000, + "step": 17500 + }, + { + "epoch": 0.1177137302976644, + "eval_loss": 3.075218677520752, + "eval_runtime": 101.5328, + "eval_samples_per_second": 49.245, + "eval_steps_per_second": 12.311, + "num_input_tokens_seen": 4587520000, + "step": 17500 + }, + { + "epoch": 0.118050055241372, + "grad_norm": 0.16463638842105865, + "learning_rate": 0.001, + "loss": 3.1698, + "num_input_tokens_seen": 4600627200, + "step": 17550 + }, + { + "epoch": 0.11838638018507962, + "grad_norm": 0.4281676709651947, + "learning_rate": 0.001, + "loss": 3.169, + "num_input_tokens_seen": 4613734400, + "step": 17600 + }, + { + "epoch": 0.11872270512878723, + "grad_norm": 0.18109829723834991, + "learning_rate": 0.001, + "loss": 3.166, + "num_input_tokens_seen": 4626841600, + "step": 17650 + }, + { + "epoch": 0.11905903007249484, + "grad_norm": 0.16371768712997437, + "learning_rate": 0.001, + "loss": 3.1694, + "num_input_tokens_seen": 4639948800, + "step": 17700 + }, + { + "epoch": 0.11939535501620245, + "grad_norm": 0.18475505709648132, + "learning_rate": 0.001, + "loss": 3.174, + "num_input_tokens_seen": 4653056000, + "step": 17750 + }, + { + "epoch": 0.11973167995991006, + "grad_norm": 0.20489992201328278, + "learning_rate": 0.001, + "loss": 3.1644, + "num_input_tokens_seen": 4666163200, + "step": 17800 + }, + { + "epoch": 0.12006800490361767, + "grad_norm": 0.1695111244916916, + "learning_rate": 0.001, + "loss": 3.1699, + "num_input_tokens_seen": 4679270400, + "step": 17850 + }, + { + "epoch": 0.1204043298473253, + "grad_norm": 0.16501003503799438, + "learning_rate": 0.001, + "loss": 3.1549, + "num_input_tokens_seen": 4692377600, + "step": 17900 + }, + { + "epoch": 0.12074065479103291, + "grad_norm": 0.16232050955295563, + "learning_rate": 0.001, + "loss": 3.1762, + "num_input_tokens_seen": 4705484800, + "step": 17950 + }, + { + "epoch": 0.12107697973474052, + "grad_norm": 0.17002490162849426, + "learning_rate": 0.001, + "loss": 3.1597, + "num_input_tokens_seen": 4718592000, + "step": 18000 + }, + { + "epoch": 0.12107697973474052, + "eval_loss": 3.069894552230835, + "eval_runtime": 100.7483, + "eval_samples_per_second": 49.629, + "eval_steps_per_second": 12.407, + "num_input_tokens_seen": 4718592000, + "step": 18000 + }, + { + "epoch": 0.12141330467844813, + "grad_norm": 0.1668955534696579, + "learning_rate": 0.001, + "loss": 3.1641, + "num_input_tokens_seen": 4731699200, + "step": 18050 + }, + { + "epoch": 0.12174962962215574, + "grad_norm": 0.17743679881095886, + "learning_rate": 0.001, + "loss": 3.1717, + "num_input_tokens_seen": 4744806400, + "step": 18100 + }, + { + "epoch": 0.12208595456586335, + "grad_norm": 0.17474418878555298, + "learning_rate": 0.001, + "loss": 3.1745, + "num_input_tokens_seen": 4757913600, + "step": 18150 + }, + { + "epoch": 0.12242227950957096, + "grad_norm": 0.18446923792362213, + "learning_rate": 0.001, + "loss": 3.1526, + "num_input_tokens_seen": 4771020800, + "step": 18200 + }, + { + "epoch": 0.12275860445327857, + "grad_norm": 0.19560950994491577, + "learning_rate": 0.001, + "loss": 3.1736, + "num_input_tokens_seen": 4784128000, + "step": 18250 + }, + { + "epoch": 0.12309492939698619, + "grad_norm": 0.17012590169906616, + "learning_rate": 0.001, + "loss": 3.1743, + "num_input_tokens_seen": 4797235200, + "step": 18300 + }, + { + "epoch": 0.12343125434069381, + "grad_norm": 0.17102253437042236, + "learning_rate": 0.001, + "loss": 3.169, + "num_input_tokens_seen": 4810342400, + "step": 18350 + }, + { + "epoch": 0.12376757928440142, + "grad_norm": 0.16899776458740234, + "learning_rate": 0.001, + "loss": 3.1657, + "num_input_tokens_seen": 4823449600, + "step": 18400 + }, + { + "epoch": 0.12410390422810903, + "grad_norm": 0.18831774592399597, + "learning_rate": 0.001, + "loss": 3.1664, + "num_input_tokens_seen": 4836556800, + "step": 18450 + }, + { + "epoch": 0.12444022917181664, + "grad_norm": 0.2649637460708618, + "learning_rate": 0.001, + "loss": 3.183, + "num_input_tokens_seen": 4849664000, + "step": 18500 + }, + { + "epoch": 0.12444022917181664, + "eval_loss": 3.088426351547241, + "eval_runtime": 100.5331, + "eval_samples_per_second": 49.735, + "eval_steps_per_second": 12.434, + "num_input_tokens_seen": 4849664000, + "step": 18500 + }, + { + "epoch": 0.12477655411552425, + "grad_norm": 0.2588728368282318, + "learning_rate": 0.001, + "loss": 3.2022, + "num_input_tokens_seen": 4862771200, + "step": 18550 + }, + { + "epoch": 0.12511287905923188, + "grad_norm": 0.1861683577299118, + "learning_rate": 0.001, + "loss": 3.1779, + "num_input_tokens_seen": 4875878400, + "step": 18600 + }, + { + "epoch": 0.1254492040029395, + "grad_norm": 0.1803797483444214, + "learning_rate": 0.001, + "loss": 3.1761, + "num_input_tokens_seen": 4888985600, + "step": 18650 + }, + { + "epoch": 0.1257855289466471, + "grad_norm": 0.20752698183059692, + "learning_rate": 0.001, + "loss": 3.1867, + "num_input_tokens_seen": 4902092800, + "step": 18700 + }, + { + "epoch": 0.1261218538903547, + "grad_norm": 0.18387116491794586, + "learning_rate": 0.001, + "loss": 3.1667, + "num_input_tokens_seen": 4915200000, + "step": 18750 + }, + { + "epoch": 0.12645817883406232, + "grad_norm": 0.3406733572483063, + "learning_rate": 0.001, + "loss": 3.1706, + "num_input_tokens_seen": 4928307200, + "step": 18800 + }, + { + "epoch": 0.12679450377776993, + "grad_norm": 0.17068707942962646, + "learning_rate": 0.001, + "loss": 3.1715, + "num_input_tokens_seen": 4941414400, + "step": 18850 + }, + { + "epoch": 0.12713082872147755, + "grad_norm": 0.18792368471622467, + "learning_rate": 0.001, + "loss": 3.1711, + "num_input_tokens_seen": 4954521600, + "step": 18900 + }, + { + "epoch": 0.12746715366518516, + "grad_norm": 0.18366409838199615, + "learning_rate": 0.001, + "loss": 3.1724, + "num_input_tokens_seen": 4967628800, + "step": 18950 + }, + { + "epoch": 0.12780347860889277, + "grad_norm": 0.15937770903110504, + "learning_rate": 0.001, + "loss": 3.1541, + "num_input_tokens_seen": 4980736000, + "step": 19000 + }, + { + "epoch": 0.12780347860889277, + "eval_loss": 3.06679630279541, + "eval_runtime": 100.9386, + "eval_samples_per_second": 49.535, + "eval_steps_per_second": 12.384, + "num_input_tokens_seen": 4980736000, + "step": 19000 + }, + { + "epoch": 0.12813980355260038, + "grad_norm": 0.17235547304153442, + "learning_rate": 0.001, + "loss": 3.1644, + "num_input_tokens_seen": 4993843200, + "step": 19050 + }, + { + "epoch": 0.128476128496308, + "grad_norm": 0.17877432703971863, + "learning_rate": 0.001, + "loss": 3.1613, + "num_input_tokens_seen": 5006950400, + "step": 19100 + }, + { + "epoch": 0.1288124534400156, + "grad_norm": 0.20647567510604858, + "learning_rate": 0.001, + "loss": 3.157, + "num_input_tokens_seen": 5020057600, + "step": 19150 + }, + { + "epoch": 0.1291487783837232, + "grad_norm": 0.17320992052555084, + "learning_rate": 0.001, + "loss": 3.1597, + "num_input_tokens_seen": 5033164800, + "step": 19200 + }, + { + "epoch": 0.12948510332743082, + "grad_norm": 0.1975463479757309, + "learning_rate": 0.001, + "loss": 3.1588, + "num_input_tokens_seen": 5046272000, + "step": 19250 + }, + { + "epoch": 0.12982142827113843, + "grad_norm": 0.2234167754650116, + "learning_rate": 0.001, + "loss": 3.1752, + "num_input_tokens_seen": 5059379200, + "step": 19300 + }, + { + "epoch": 0.13015775321484604, + "grad_norm": 0.16287492215633392, + "learning_rate": 0.001, + "loss": 3.1559, + "num_input_tokens_seen": 5072486400, + "step": 19350 + }, + { + "epoch": 0.13049407815855366, + "grad_norm": 0.21676546335220337, + "learning_rate": 0.001, + "loss": 3.1556, + "num_input_tokens_seen": 5085593600, + "step": 19400 + }, + { + "epoch": 0.1308304031022613, + "grad_norm": 0.19858922064304352, + "learning_rate": 0.001, + "loss": 3.1617, + "num_input_tokens_seen": 5098700800, + "step": 19450 + }, + { + "epoch": 0.1311667280459689, + "grad_norm": 0.17459186911582947, + "learning_rate": 0.001, + "loss": 3.1499, + "num_input_tokens_seen": 5111808000, + "step": 19500 + }, + { + "epoch": 0.1311667280459689, + "eval_loss": 3.0653886795043945, + "eval_runtime": 101.0357, + "eval_samples_per_second": 49.487, + "eval_steps_per_second": 12.372, + "num_input_tokens_seen": 5111808000, + "step": 19500 + }, + { + "epoch": 0.13150305298967652, + "grad_norm": 0.17277012765407562, + "learning_rate": 0.001, + "loss": 3.1641, + "num_input_tokens_seen": 5124915200, + "step": 19550 + }, + { + "epoch": 0.13183937793338413, + "grad_norm": 0.1894499808549881, + "learning_rate": 0.001, + "loss": 3.1465, + "num_input_tokens_seen": 5138022400, + "step": 19600 + }, + { + "epoch": 0.13217570287709174, + "grad_norm": 0.17682771384716034, + "learning_rate": 0.001, + "loss": 3.1573, + "num_input_tokens_seen": 5151129600, + "step": 19650 + }, + { + "epoch": 0.13251202782079935, + "grad_norm": 0.17981217801570892, + "learning_rate": 0.001, + "loss": 3.1484, + "num_input_tokens_seen": 5164236800, + "step": 19700 + }, + { + "epoch": 0.13284835276450696, + "grad_norm": 0.19152362644672394, + "learning_rate": 0.001, + "loss": 3.1597, + "num_input_tokens_seen": 5177344000, + "step": 19750 + }, + { + "epoch": 0.13318467770821457, + "grad_norm": 0.17498181760311127, + "learning_rate": 0.001, + "loss": 3.1604, + "num_input_tokens_seen": 5190451200, + "step": 19800 + }, + { + "epoch": 0.13352100265192218, + "grad_norm": 0.17143672704696655, + "learning_rate": 0.001, + "loss": 3.1601, + "num_input_tokens_seen": 5203558400, + "step": 19850 + }, + { + "epoch": 0.1338573275956298, + "grad_norm": 0.19592677056789398, + "learning_rate": 0.001, + "loss": 3.1568, + "num_input_tokens_seen": 5216665600, + "step": 19900 + }, + { + "epoch": 0.1341936525393374, + "grad_norm": 0.1618625670671463, + "learning_rate": 0.001, + "loss": 3.1584, + "num_input_tokens_seen": 5229772800, + "step": 19950 + }, + { + "epoch": 0.13452997748304502, + "grad_norm": 0.18170061707496643, + "learning_rate": 0.001, + "loss": 3.1499, + "num_input_tokens_seen": 5242880000, + "step": 20000 + }, + { + "epoch": 0.13452997748304502, + "eval_loss": 3.056318759918213, + "eval_runtime": 100.9422, + "eval_samples_per_second": 49.533, + "eval_steps_per_second": 12.383, + "num_input_tokens_seen": 5242880000, + "step": 20000 + }, + { + "epoch": 0.13486630242675263, + "grad_norm": 0.1726132184267044, + "learning_rate": 0.001, + "loss": 3.1545, + "num_input_tokens_seen": 5255987200, + "step": 20050 + }, + { + "epoch": 0.13520262737046024, + "grad_norm": 0.17690512537956238, + "learning_rate": 0.001, + "loss": 3.1526, + "num_input_tokens_seen": 5269094400, + "step": 20100 + }, + { + "epoch": 0.13553895231416785, + "grad_norm": 0.20317280292510986, + "learning_rate": 0.001, + "loss": 3.15, + "num_input_tokens_seen": 5282201600, + "step": 20150 + }, + { + "epoch": 0.13587527725787546, + "grad_norm": 0.16687753796577454, + "learning_rate": 0.001, + "loss": 3.144, + "num_input_tokens_seen": 5295308800, + "step": 20200 + }, + { + "epoch": 0.13621160220158307, + "grad_norm": 0.16671325266361237, + "learning_rate": 0.001, + "loss": 3.1425, + "num_input_tokens_seen": 5308416000, + "step": 20250 + }, + { + "epoch": 0.13654792714529068, + "grad_norm": 0.16259203851222992, + "learning_rate": 0.001, + "loss": 3.1386, + "num_input_tokens_seen": 5321523200, + "step": 20300 + }, + { + "epoch": 0.1368842520889983, + "grad_norm": 0.15738168358802795, + "learning_rate": 0.001, + "loss": 3.148, + "num_input_tokens_seen": 5334630400, + "step": 20350 + }, + { + "epoch": 0.13722057703270593, + "grad_norm": 0.17398318648338318, + "learning_rate": 0.001, + "loss": 3.1484, + "num_input_tokens_seen": 5347737600, + "step": 20400 + }, + { + "epoch": 0.13755690197641354, + "grad_norm": 0.177555114030838, + "learning_rate": 0.001, + "loss": 3.1507, + "num_input_tokens_seen": 5360844800, + "step": 20450 + }, + { + "epoch": 0.13789322692012115, + "grad_norm": 0.16208910942077637, + "learning_rate": 0.001, + "loss": 3.1462, + "num_input_tokens_seen": 5373952000, + "step": 20500 + }, + { + "epoch": 0.13789322692012115, + "eval_loss": 3.0525152683258057, + "eval_runtime": 101.4785, + "eval_samples_per_second": 49.272, + "eval_steps_per_second": 12.318, + "num_input_tokens_seen": 5373952000, + "step": 20500 + }, + { + "epoch": 0.13822955186382876, + "grad_norm": 0.1615586280822754, + "learning_rate": 0.001, + "loss": 3.1469, + "num_input_tokens_seen": 5387059200, + "step": 20550 + }, + { + "epoch": 0.13856587680753638, + "grad_norm": 0.17003317177295685, + "learning_rate": 0.001, + "loss": 3.15, + "num_input_tokens_seen": 5400166400, + "step": 20600 + }, + { + "epoch": 0.138902201751244, + "grad_norm": 0.16299164295196533, + "learning_rate": 0.001, + "loss": 3.1522, + "num_input_tokens_seen": 5413273600, + "step": 20650 + }, + { + "epoch": 0.1392385266949516, + "grad_norm": 0.18732890486717224, + "learning_rate": 0.001, + "loss": 3.1462, + "num_input_tokens_seen": 5426380800, + "step": 20700 + }, + { + "epoch": 0.1395748516386592, + "grad_norm": 0.23970580101013184, + "learning_rate": 0.001, + "loss": 3.1832, + "num_input_tokens_seen": 5439488000, + "step": 20750 + }, + { + "epoch": 0.13991117658236682, + "grad_norm": 0.1701073795557022, + "learning_rate": 0.001, + "loss": 3.1713, + "num_input_tokens_seen": 5452595200, + "step": 20800 + }, + { + "epoch": 0.14024750152607443, + "grad_norm": 0.18976852297782898, + "learning_rate": 0.001, + "loss": 3.1636, + "num_input_tokens_seen": 5465702400, + "step": 20850 + }, + { + "epoch": 0.14058382646978204, + "grad_norm": 0.17788629233837128, + "learning_rate": 0.001, + "loss": 3.1712, + "num_input_tokens_seen": 5478809600, + "step": 20900 + }, + { + "epoch": 0.14092015141348965, + "grad_norm": 0.20413383841514587, + "learning_rate": 0.001, + "loss": 3.1604, + "num_input_tokens_seen": 5491916800, + "step": 20950 + }, + { + "epoch": 0.14125647635719726, + "grad_norm": 0.1921602487564087, + "learning_rate": 0.001, + "loss": 3.15, + "num_input_tokens_seen": 5505024000, + "step": 21000 + }, + { + "epoch": 0.14125647635719726, + "eval_loss": 3.053833484649658, + "eval_runtime": 101.2468, + "eval_samples_per_second": 49.384, + "eval_steps_per_second": 12.346, + "num_input_tokens_seen": 5505024000, + "step": 21000 + }, + { + "epoch": 0.14159280130090487, + "grad_norm": 0.18453796207904816, + "learning_rate": 0.001, + "loss": 3.1542, + "num_input_tokens_seen": 5518131200, + "step": 21050 + }, + { + "epoch": 0.14192912624461249, + "grad_norm": 0.29931920766830444, + "learning_rate": 0.001, + "loss": 3.1633, + "num_input_tokens_seen": 5531238400, + "step": 21100 + }, + { + "epoch": 0.1422654511883201, + "grad_norm": 0.19636057317256927, + "learning_rate": 0.001, + "loss": 3.1544, + "num_input_tokens_seen": 5544345600, + "step": 21150 + }, + { + "epoch": 0.1426017761320277, + "grad_norm": 0.3110333979129791, + "learning_rate": 0.001, + "loss": 3.186, + "num_input_tokens_seen": 5557452800, + "step": 21200 + }, + { + "epoch": 0.14293810107573532, + "grad_norm": 0.21632343530654907, + "learning_rate": 0.001, + "loss": 3.1759, + "num_input_tokens_seen": 5570560000, + "step": 21250 + }, + { + "epoch": 0.14327442601944296, + "grad_norm": 0.23088929057121277, + "learning_rate": 0.001, + "loss": 3.1683, + "num_input_tokens_seen": 5583667200, + "step": 21300 + }, + { + "epoch": 0.14361075096315057, + "grad_norm": 0.19326886534690857, + "learning_rate": 0.001, + "loss": 3.1535, + "num_input_tokens_seen": 5596774400, + "step": 21350 + }, + { + "epoch": 0.14394707590685818, + "grad_norm": 0.3554578125476837, + "learning_rate": 0.001, + "loss": 3.1606, + "num_input_tokens_seen": 5609881600, + "step": 21400 + }, + { + "epoch": 0.1442834008505658, + "grad_norm": 0.18607909977436066, + "learning_rate": 0.001, + "loss": 3.159, + "num_input_tokens_seen": 5622988800, + "step": 21450 + }, + { + "epoch": 0.1446197257942734, + "grad_norm": 0.2276984453201294, + "learning_rate": 0.001, + "loss": 3.1544, + "num_input_tokens_seen": 5636096000, + "step": 21500 + }, + { + "epoch": 0.1446197257942734, + "eval_loss": 3.051608085632324, + "eval_runtime": 101.9015, + "eval_samples_per_second": 49.067, + "eval_steps_per_second": 12.267, + "num_input_tokens_seen": 5636096000, + "step": 21500 + }, + { + "epoch": 0.144956050737981, + "grad_norm": 0.17796960473060608, + "learning_rate": 0.001, + "loss": 3.1488, + "num_input_tokens_seen": 5649203200, + "step": 21550 + }, + { + "epoch": 0.14529237568168862, + "grad_norm": 0.1721925139427185, + "learning_rate": 0.001, + "loss": 3.1465, + "num_input_tokens_seen": 5662310400, + "step": 21600 + }, + { + "epoch": 0.14562870062539623, + "grad_norm": 0.1779259443283081, + "learning_rate": 0.001, + "loss": 3.1479, + "num_input_tokens_seen": 5675417600, + "step": 21650 + }, + { + "epoch": 0.14596502556910385, + "grad_norm": 0.1955435425043106, + "learning_rate": 0.001, + "loss": 3.147, + "num_input_tokens_seen": 5688524800, + "step": 21700 + }, + { + "epoch": 0.14630135051281146, + "grad_norm": 0.2717543840408325, + "learning_rate": 0.001, + "loss": 3.1456, + "num_input_tokens_seen": 5701632000, + "step": 21750 + }, + { + "epoch": 0.14663767545651907, + "grad_norm": 0.22944161295890808, + "learning_rate": 0.001, + "loss": 3.1596, + "num_input_tokens_seen": 5714739200, + "step": 21800 + }, + { + "epoch": 0.14697400040022668, + "grad_norm": 0.18696273863315582, + "learning_rate": 0.001, + "loss": 3.1417, + "num_input_tokens_seen": 5727846400, + "step": 21850 + }, + { + "epoch": 0.1473103253439343, + "grad_norm": 0.17678014934062958, + "learning_rate": 0.001, + "loss": 3.1454, + "num_input_tokens_seen": 5740953600, + "step": 21900 + }, + { + "epoch": 0.1476466502876419, + "grad_norm": 0.16891658306121826, + "learning_rate": 0.001, + "loss": 3.142, + "num_input_tokens_seen": 5754060800, + "step": 21950 + }, + { + "epoch": 0.1479829752313495, + "grad_norm": 0.20900680124759674, + "learning_rate": 0.001, + "loss": 3.1475, + "num_input_tokens_seen": 5767168000, + "step": 22000 + }, + { + "epoch": 0.1479829752313495, + "eval_loss": 3.0481514930725098, + "eval_runtime": 100.7385, + "eval_samples_per_second": 49.633, + "eval_steps_per_second": 12.408, + "num_input_tokens_seen": 5767168000, + "step": 22000 + }, + { + "epoch": 0.14831930017505712, + "grad_norm": 0.17889799177646637, + "learning_rate": 0.001, + "loss": 3.1407, + "num_input_tokens_seen": 5780275200, + "step": 22050 + }, + { + "epoch": 0.14865562511876473, + "grad_norm": 1.7358074188232422, + "learning_rate": 0.001, + "loss": 3.1454, + "num_input_tokens_seen": 5793382400, + "step": 22100 + }, + { + "epoch": 0.14899195006247234, + "grad_norm": 0.17746248841285706, + "learning_rate": 0.001, + "loss": 3.154, + "num_input_tokens_seen": 5806489600, + "step": 22150 + }, + { + "epoch": 0.14932827500617998, + "grad_norm": 0.185111865401268, + "learning_rate": 0.001, + "loss": 3.1522, + "num_input_tokens_seen": 5819596800, + "step": 22200 + }, + { + "epoch": 0.1496645999498876, + "grad_norm": 0.23882286250591278, + "learning_rate": 0.001, + "loss": 3.1379, + "num_input_tokens_seen": 5832704000, + "step": 22250 + }, + { + "epoch": 0.1500009248935952, + "grad_norm": 0.19389848411083221, + "learning_rate": 0.001, + "loss": 3.1459, + "num_input_tokens_seen": 5845811200, + "step": 22300 + }, + { + "epoch": 0.15033724983730282, + "grad_norm": 0.18890796601772308, + "learning_rate": 0.001, + "loss": 3.1401, + "num_input_tokens_seen": 5858918400, + "step": 22350 + }, + { + "epoch": 0.15067357478101043, + "grad_norm": 0.17061029374599457, + "learning_rate": 0.001, + "loss": 3.1555, + "num_input_tokens_seen": 5872025600, + "step": 22400 + }, + { + "epoch": 0.15100989972471804, + "grad_norm": 0.18124708533287048, + "learning_rate": 0.001, + "loss": 3.1499, + "num_input_tokens_seen": 5885132800, + "step": 22450 + }, + { + "epoch": 0.15134622466842565, + "grad_norm": 0.21192225813865662, + "learning_rate": 0.001, + "loss": 3.1364, + "num_input_tokens_seen": 5898240000, + "step": 22500 + }, + { + "epoch": 0.15134622466842565, + "eval_loss": 3.042147636413574, + "eval_runtime": 100.7091, + "eval_samples_per_second": 49.648, + "eval_steps_per_second": 12.412, + "num_input_tokens_seen": 5898240000, + "step": 22500 + }, + { + "epoch": 0.15168254961213326, + "grad_norm": 0.16922616958618164, + "learning_rate": 0.001, + "loss": 3.149, + "num_input_tokens_seen": 5911347200, + "step": 22550 + }, + { + "epoch": 0.15201887455584087, + "grad_norm": 0.1636754721403122, + "learning_rate": 0.001, + "loss": 3.1379, + "num_input_tokens_seen": 5924454400, + "step": 22600 + }, + { + "epoch": 0.15235519949954848, + "grad_norm": 0.167410746216774, + "learning_rate": 0.001, + "loss": 3.1465, + "num_input_tokens_seen": 5937561600, + "step": 22650 + }, + { + "epoch": 0.1526915244432561, + "grad_norm": 0.18413004279136658, + "learning_rate": 0.001, + "loss": 3.131, + "num_input_tokens_seen": 5950668800, + "step": 22700 + }, + { + "epoch": 0.1530278493869637, + "grad_norm": 0.20482878386974335, + "learning_rate": 0.001, + "loss": 3.1395, + "num_input_tokens_seen": 5963776000, + "step": 22750 + }, + { + "epoch": 0.15336417433067132, + "grad_norm": 0.19031567871570587, + "learning_rate": 0.001, + "loss": 3.1369, + "num_input_tokens_seen": 5976883200, + "step": 22800 + }, + { + "epoch": 0.15370049927437893, + "grad_norm": 0.21622490882873535, + "learning_rate": 0.001, + "loss": 3.1448, + "num_input_tokens_seen": 5989990400, + "step": 22850 + }, + { + "epoch": 0.15403682421808654, + "grad_norm": 0.19105197489261627, + "learning_rate": 0.001, + "loss": 3.1456, + "num_input_tokens_seen": 6003097600, + "step": 22900 + }, + { + "epoch": 0.15437314916179415, + "grad_norm": 0.26653292775154114, + "learning_rate": 0.001, + "loss": 3.1405, + "num_input_tokens_seen": 6016204800, + "step": 22950 + }, + { + "epoch": 0.15470947410550176, + "grad_norm": 0.30645105242729187, + "learning_rate": 0.001, + "loss": 3.1564, + "num_input_tokens_seen": 6029312000, + "step": 23000 + }, + { + "epoch": 0.15470947410550176, + "eval_loss": 3.072319507598877, + "eval_runtime": 100.6281, + "eval_samples_per_second": 49.688, + "eval_steps_per_second": 12.422, + "num_input_tokens_seen": 6029312000, + "step": 23000 + }, + { + "epoch": 0.15504579904920937, + "grad_norm": 0.18146337568759918, + "learning_rate": 0.001, + "loss": 3.1774, + "num_input_tokens_seen": 6042419200, + "step": 23050 + }, + { + "epoch": 0.155382123992917, + "grad_norm": 0.19545282423496246, + "learning_rate": 0.001, + "loss": 3.1384, + "num_input_tokens_seen": 6055526400, + "step": 23100 + }, + { + "epoch": 0.15571844893662462, + "grad_norm": 0.17861327528953552, + "learning_rate": 0.001, + "loss": 3.1459, + "num_input_tokens_seen": 6068633600, + "step": 23150 + }, + { + "epoch": 0.15605477388033223, + "grad_norm": 0.17025263607501984, + "learning_rate": 0.001, + "loss": 3.1369, + "num_input_tokens_seen": 6081740800, + "step": 23200 + }, + { + "epoch": 0.15639109882403984, + "grad_norm": 0.17162847518920898, + "learning_rate": 0.001, + "loss": 3.1479, + "num_input_tokens_seen": 6094848000, + "step": 23250 + }, + { + "epoch": 0.15672742376774745, + "grad_norm": 0.19068972766399384, + "learning_rate": 0.001, + "loss": 3.1418, + "num_input_tokens_seen": 6107955200, + "step": 23300 + }, + { + "epoch": 0.15706374871145506, + "grad_norm": 0.20177774131298065, + "learning_rate": 0.001, + "loss": 3.1435, + "num_input_tokens_seen": 6121062400, + "step": 23350 + }, + { + "epoch": 0.15740007365516268, + "grad_norm": 0.235867440700531, + "learning_rate": 0.001, + "loss": 3.1466, + "num_input_tokens_seen": 6134169600, + "step": 23400 + }, + { + "epoch": 0.1577363985988703, + "grad_norm": 0.17313481867313385, + "learning_rate": 0.001, + "loss": 3.1509, + "num_input_tokens_seen": 6147276800, + "step": 23450 + }, + { + "epoch": 0.1580727235425779, + "grad_norm": 0.20208750665187836, + "learning_rate": 0.001, + "loss": 3.1312, + "num_input_tokens_seen": 6160384000, + "step": 23500 + }, + { + "epoch": 0.1580727235425779, + "eval_loss": 3.0457797050476074, + "eval_runtime": 99.8915, + "eval_samples_per_second": 50.054, + "eval_steps_per_second": 12.514, + "num_input_tokens_seen": 6160384000, + "step": 23500 + }, + { + "epoch": 0.1584090484862855, + "grad_norm": 0.1771639734506607, + "learning_rate": 0.001, + "loss": 3.1407, + "num_input_tokens_seen": 6173491200, + "step": 23550 + }, + { + "epoch": 0.15874537342999312, + "grad_norm": 0.20403757691383362, + "learning_rate": 0.001, + "loss": 3.1404, + "num_input_tokens_seen": 6186598400, + "step": 23600 + }, + { + "epoch": 0.15908169837370073, + "grad_norm": 0.19679167866706848, + "learning_rate": 0.001, + "loss": 3.1417, + "num_input_tokens_seen": 6199705600, + "step": 23650 + }, + { + "epoch": 0.15941802331740834, + "grad_norm": 0.18299609422683716, + "learning_rate": 0.001, + "loss": 3.14, + "num_input_tokens_seen": 6212812800, + "step": 23700 + }, + { + "epoch": 0.15975434826111595, + "grad_norm": 0.16773872077465057, + "learning_rate": 0.001, + "loss": 3.1326, + "num_input_tokens_seen": 6225920000, + "step": 23750 + }, + { + "epoch": 0.16009067320482356, + "grad_norm": 0.18539400398731232, + "learning_rate": 0.001, + "loss": 3.1294, + "num_input_tokens_seen": 6239027200, + "step": 23800 + }, + { + "epoch": 0.16042699814853117, + "grad_norm": 0.19088850915431976, + "learning_rate": 0.001, + "loss": 3.1297, + "num_input_tokens_seen": 6252134400, + "step": 23850 + }, + { + "epoch": 0.16076332309223879, + "grad_norm": 0.17390431463718414, + "learning_rate": 0.001, + "loss": 3.1364, + "num_input_tokens_seen": 6265241600, + "step": 23900 + }, + { + "epoch": 0.1610996480359464, + "grad_norm": 0.35694462060928345, + "learning_rate": 0.001, + "loss": 3.1519, + "num_input_tokens_seen": 6278348800, + "step": 23950 + }, + { + "epoch": 0.16143597297965404, + "grad_norm": 0.17942555248737335, + "learning_rate": 0.001, + "loss": 3.132, + "num_input_tokens_seen": 6291456000, + "step": 24000 + }, + { + "epoch": 0.16143597297965404, + "eval_loss": 3.0352118015289307, + "eval_runtime": 101.1582, + "eval_samples_per_second": 49.428, + "eval_steps_per_second": 12.357, + "num_input_tokens_seen": 6291456000, + "step": 24000 + }, + { + "epoch": 0.16177229792336165, + "grad_norm": 0.1748410165309906, + "learning_rate": 0.001, + "loss": 3.1385, + "num_input_tokens_seen": 6304563200, + "step": 24050 + }, + { + "epoch": 0.16210862286706926, + "grad_norm": 0.19787083566188812, + "learning_rate": 0.001, + "loss": 3.1241, + "num_input_tokens_seen": 6317670400, + "step": 24100 + }, + { + "epoch": 0.16244494781077687, + "grad_norm": 0.16369663178920746, + "learning_rate": 0.001, + "loss": 3.1402, + "num_input_tokens_seen": 6330777600, + "step": 24150 + }, + { + "epoch": 0.16278127275448448, + "grad_norm": 0.17019188404083252, + "learning_rate": 0.001, + "loss": 3.1293, + "num_input_tokens_seen": 6343884800, + "step": 24200 + }, + { + "epoch": 0.1631175976981921, + "grad_norm": 0.1666271686553955, + "learning_rate": 0.001, + "loss": 3.1218, + "num_input_tokens_seen": 6356992000, + "step": 24250 + }, + { + "epoch": 0.1634539226418997, + "grad_norm": 0.1591423600912094, + "learning_rate": 0.001, + "loss": 3.1319, + "num_input_tokens_seen": 6370099200, + "step": 24300 + }, + { + "epoch": 0.1637902475856073, + "grad_norm": 0.1642790287733078, + "learning_rate": 0.001, + "loss": 3.1319, + "num_input_tokens_seen": 6383206400, + "step": 24350 + }, + { + "epoch": 0.16412657252931492, + "grad_norm": 0.17917323112487793, + "learning_rate": 0.001, + "loss": 3.1204, + "num_input_tokens_seen": 6396313600, + "step": 24400 + }, + { + "epoch": 0.16446289747302253, + "grad_norm": 0.2367531955242157, + "learning_rate": 0.001, + "loss": 3.125, + "num_input_tokens_seen": 6409420800, + "step": 24450 + }, + { + "epoch": 0.16479922241673015, + "grad_norm": 0.175731360912323, + "learning_rate": 0.001, + "loss": 3.1358, + "num_input_tokens_seen": 6422528000, + "step": 24500 + }, + { + "epoch": 0.16479922241673015, + "eval_loss": 3.032782793045044, + "eval_runtime": 100.7748, + "eval_samples_per_second": 49.616, + "eval_steps_per_second": 12.404, + "num_input_tokens_seen": 6422528000, + "step": 24500 + }, + { + "epoch": 0.16513554736043776, + "grad_norm": 0.16778303682804108, + "learning_rate": 0.001, + "loss": 3.1382, + "num_input_tokens_seen": 6435635200, + "step": 24550 + }, + { + "epoch": 0.16547187230414537, + "grad_norm": 0.19019022583961487, + "learning_rate": 0.001, + "loss": 3.1328, + "num_input_tokens_seen": 6448742400, + "step": 24600 + }, + { + "epoch": 0.16580819724785298, + "grad_norm": 0.18815970420837402, + "learning_rate": 0.001, + "loss": 3.1274, + "num_input_tokens_seen": 6461849600, + "step": 24650 + }, + { + "epoch": 0.1661445221915606, + "grad_norm": 0.18299463391304016, + "learning_rate": 0.001, + "loss": 3.1258, + "num_input_tokens_seen": 6474956800, + "step": 24700 + }, + { + "epoch": 0.1664808471352682, + "grad_norm": 0.20152020454406738, + "learning_rate": 0.001, + "loss": 3.1245, + "num_input_tokens_seen": 6488064000, + "step": 24750 + }, + { + "epoch": 0.1668171720789758, + "grad_norm": 0.1800755113363266, + "learning_rate": 0.001, + "loss": 3.1332, + "num_input_tokens_seen": 6501171200, + "step": 24800 + }, + { + "epoch": 0.16715349702268342, + "grad_norm": 0.33742496371269226, + "learning_rate": 0.001, + "loss": 3.1264, + "num_input_tokens_seen": 6514278400, + "step": 24850 + }, + { + "epoch": 0.16748982196639106, + "grad_norm": 0.20022252202033997, + "learning_rate": 0.001, + "loss": 3.1414, + "num_input_tokens_seen": 6527385600, + "step": 24900 + }, + { + "epoch": 0.16782614691009867, + "grad_norm": 0.19905851781368256, + "learning_rate": 0.001, + "loss": 3.1327, + "num_input_tokens_seen": 6540492800, + "step": 24950 + }, + { + "epoch": 0.16816247185380628, + "grad_norm": 0.26300039887428284, + "learning_rate": 0.001, + "loss": 3.1231, + "num_input_tokens_seen": 6553600000, + "step": 25000 + }, + { + "epoch": 0.16816247185380628, + "eval_loss": 3.0353240966796875, + "eval_runtime": 100.6, + "eval_samples_per_second": 49.702, + "eval_steps_per_second": 12.425, + "num_input_tokens_seen": 6553600000, + "step": 25000 + }, + { + "epoch": 0.1684987967975139, + "grad_norm": 1.1301233768463135, + "learning_rate": 0.001, + "loss": 3.1419, + "num_input_tokens_seen": 6566707200, + "step": 25050 + }, + { + "epoch": 0.1688351217412215, + "grad_norm": 0.18019410967826843, + "learning_rate": 0.001, + "loss": 3.1514, + "num_input_tokens_seen": 6579814400, + "step": 25100 + }, + { + "epoch": 0.16917144668492912, + "grad_norm": 0.17898766696453094, + "learning_rate": 0.001, + "loss": 3.1337, + "num_input_tokens_seen": 6592921600, + "step": 25150 + }, + { + "epoch": 0.16950777162863673, + "grad_norm": 0.18393439054489136, + "learning_rate": 0.001, + "loss": 3.1313, + "num_input_tokens_seen": 6606028800, + "step": 25200 + }, + { + "epoch": 0.16984409657234434, + "grad_norm": 0.16713738441467285, + "learning_rate": 0.001, + "loss": 3.129, + "num_input_tokens_seen": 6619136000, + "step": 25250 + }, + { + "epoch": 0.17018042151605195, + "grad_norm": 0.17655207216739655, + "learning_rate": 0.001, + "loss": 3.1245, + "num_input_tokens_seen": 6632243200, + "step": 25300 + }, + { + "epoch": 0.17051674645975956, + "grad_norm": 0.20735637843608856, + "learning_rate": 0.001, + "loss": 3.1344, + "num_input_tokens_seen": 6645350400, + "step": 25350 + }, + { + "epoch": 0.17085307140346717, + "grad_norm": 0.21318195760250092, + "learning_rate": 0.001, + "loss": 3.1336, + "num_input_tokens_seen": 6658457600, + "step": 25400 + }, + { + "epoch": 0.17118939634717478, + "grad_norm": 0.1637289971113205, + "learning_rate": 0.001, + "loss": 3.1306, + "num_input_tokens_seen": 6671564800, + "step": 25450 + }, + { + "epoch": 0.1715257212908824, + "grad_norm": 0.1866239458322525, + "learning_rate": 0.001, + "loss": 3.1248, + "num_input_tokens_seen": 6684672000, + "step": 25500 + }, + { + "epoch": 0.1715257212908824, + "eval_loss": 3.025984048843384, + "eval_runtime": 100.8165, + "eval_samples_per_second": 49.595, + "eval_steps_per_second": 12.399, + "num_input_tokens_seen": 6684672000, + "step": 25500 + }, + { + "epoch": 0.17186204623459, + "grad_norm": 0.19653931260108948, + "learning_rate": 0.001, + "loss": 3.1346, + "num_input_tokens_seen": 6697779200, + "step": 25550 + }, + { + "epoch": 0.17219837117829762, + "grad_norm": 0.18339622020721436, + "learning_rate": 0.001, + "loss": 3.12, + "num_input_tokens_seen": 6710886400, + "step": 25600 + }, + { + "epoch": 0.17253469612200523, + "grad_norm": 0.3683246970176697, + "learning_rate": 0.001, + "loss": 3.1338, + "num_input_tokens_seen": 6723993600, + "step": 25650 + }, + { + "epoch": 0.17287102106571284, + "grad_norm": 0.17096757888793945, + "learning_rate": 0.001, + "loss": 3.1283, + "num_input_tokens_seen": 6737100800, + "step": 25700 + }, + { + "epoch": 0.17320734600942045, + "grad_norm": 0.16841623187065125, + "learning_rate": 0.001, + "loss": 3.1236, + "num_input_tokens_seen": 6750208000, + "step": 25750 + }, + { + "epoch": 0.1735436709531281, + "grad_norm": 0.18569235503673553, + "learning_rate": 0.001, + "loss": 3.1273, + "num_input_tokens_seen": 6763315200, + "step": 25800 + }, + { + "epoch": 0.1738799958968357, + "grad_norm": 0.18508999049663544, + "learning_rate": 0.001, + "loss": 3.1246, + "num_input_tokens_seen": 6776422400, + "step": 25850 + }, + { + "epoch": 0.1742163208405433, + "grad_norm": 0.189519464969635, + "learning_rate": 0.001, + "loss": 3.1239, + "num_input_tokens_seen": 6789529600, + "step": 25900 + }, + { + "epoch": 0.17455264578425092, + "grad_norm": 0.1591208577156067, + "learning_rate": 0.001, + "loss": 3.1198, + "num_input_tokens_seen": 6802636800, + "step": 25950 + }, + { + "epoch": 0.17488897072795853, + "grad_norm": 0.199269101023674, + "learning_rate": 0.001, + "loss": 3.118, + "num_input_tokens_seen": 6815744000, + "step": 26000 + }, + { + "epoch": 0.17488897072795853, + "eval_loss": 3.019541025161743, + "eval_runtime": 101.0314, + "eval_samples_per_second": 49.49, + "eval_steps_per_second": 12.372, + "num_input_tokens_seen": 6815744000, + "step": 26000 + }, + { + "epoch": 0.17522529567166614, + "grad_norm": 0.17492325603961945, + "learning_rate": 0.001, + "loss": 3.1093, + "num_input_tokens_seen": 6828851200, + "step": 26050 + }, + { + "epoch": 0.17556162061537375, + "grad_norm": 0.17826683819293976, + "learning_rate": 0.001, + "loss": 3.1285, + "num_input_tokens_seen": 6841958400, + "step": 26100 + }, + { + "epoch": 0.17589794555908136, + "grad_norm": 0.17716389894485474, + "learning_rate": 0.001, + "loss": 3.1141, + "num_input_tokens_seen": 6855065600, + "step": 26150 + }, + { + "epoch": 0.17623427050278898, + "grad_norm": 0.18649104237556458, + "learning_rate": 0.001, + "loss": 3.1128, + "num_input_tokens_seen": 6868172800, + "step": 26200 + }, + { + "epoch": 0.1765705954464966, + "grad_norm": 0.4175710678100586, + "learning_rate": 0.001, + "loss": 3.1283, + "num_input_tokens_seen": 6881280000, + "step": 26250 + }, + { + "epoch": 0.1769069203902042, + "grad_norm": 0.2275037169456482, + "learning_rate": 0.001, + "loss": 3.148, + "num_input_tokens_seen": 6894387200, + "step": 26300 + }, + { + "epoch": 0.1772432453339118, + "grad_norm": 0.42409747838974, + "learning_rate": 0.001, + "loss": 3.1338, + "num_input_tokens_seen": 6907494400, + "step": 26350 + }, + { + "epoch": 0.17757957027761942, + "grad_norm": 0.23025737702846527, + "learning_rate": 0.001, + "loss": 3.1451, + "num_input_tokens_seen": 6920601600, + "step": 26400 + }, + { + "epoch": 0.17791589522132703, + "grad_norm": 0.20386695861816406, + "learning_rate": 0.001, + "loss": 3.1396, + "num_input_tokens_seen": 6933708800, + "step": 26450 + }, + { + "epoch": 0.17825222016503464, + "grad_norm": 0.20394697785377502, + "learning_rate": 0.001, + "loss": 3.1308, + "num_input_tokens_seen": 6946816000, + "step": 26500 + }, + { + "epoch": 0.17825222016503464, + "eval_loss": 3.02968430519104, + "eval_runtime": 100.5354, + "eval_samples_per_second": 49.734, + "eval_steps_per_second": 12.433, + "num_input_tokens_seen": 6946816000, + "step": 26500 + }, + { + "epoch": 0.17858854510874225, + "grad_norm": 0.19505399465560913, + "learning_rate": 0.001, + "loss": 3.1229, + "num_input_tokens_seen": 6959923200, + "step": 26550 + }, + { + "epoch": 0.17892487005244986, + "grad_norm": 0.17783001065254211, + "learning_rate": 0.001, + "loss": 3.108, + "num_input_tokens_seen": 6973030400, + "step": 26600 + }, + { + "epoch": 0.17926119499615747, + "grad_norm": 0.16989105939865112, + "learning_rate": 0.001, + "loss": 3.1252, + "num_input_tokens_seen": 6986137600, + "step": 26650 + }, + { + "epoch": 0.1795975199398651, + "grad_norm": 0.1939496099948883, + "learning_rate": 0.001, + "loss": 3.1234, + "num_input_tokens_seen": 6999244800, + "step": 26700 + }, + { + "epoch": 0.17993384488357272, + "grad_norm": 0.1649375557899475, + "learning_rate": 0.001, + "loss": 3.1156, + "num_input_tokens_seen": 7012352000, + "step": 26750 + }, + { + "epoch": 0.18027016982728034, + "grad_norm": 0.1829315572977066, + "learning_rate": 0.001, + "loss": 3.1209, + "num_input_tokens_seen": 7025459200, + "step": 26800 + }, + { + "epoch": 0.18060649477098795, + "grad_norm": 0.182273730635643, + "learning_rate": 0.001, + "loss": 3.1277, + "num_input_tokens_seen": 7038566400, + "step": 26850 + }, + { + "epoch": 0.18094281971469556, + "grad_norm": 0.1677001416683197, + "learning_rate": 0.001, + "loss": 3.1132, + "num_input_tokens_seen": 7051673600, + "step": 26900 + }, + { + "epoch": 0.18127914465840317, + "grad_norm": 0.2254924178123474, + "learning_rate": 0.001, + "loss": 3.1254, + "num_input_tokens_seen": 7064780800, + "step": 26950 + }, + { + "epoch": 0.18161546960211078, + "grad_norm": 0.2008505016565323, + "learning_rate": 0.001, + "loss": 3.1286, + "num_input_tokens_seen": 7077888000, + "step": 27000 + }, + { + "epoch": 0.18161546960211078, + "eval_loss": 3.0181450843811035, + "eval_runtime": 100.7921, + "eval_samples_per_second": 49.607, + "eval_steps_per_second": 12.402, + "num_input_tokens_seen": 7077888000, + "step": 27000 + }, + { + "epoch": 0.1819517945458184, + "grad_norm": 0.172062486410141, + "learning_rate": 0.001, + "loss": 3.1125, + "num_input_tokens_seen": 7090995200, + "step": 27050 + }, + { + "epoch": 0.182288119489526, + "grad_norm": 0.17802311480045319, + "learning_rate": 0.001, + "loss": 3.1145, + "num_input_tokens_seen": 7104102400, + "step": 27100 + }, + { + "epoch": 0.1826244444332336, + "grad_norm": 1.872890591621399, + "learning_rate": 0.001, + "loss": 3.1203, + "num_input_tokens_seen": 7117209600, + "step": 27150 + }, + { + "epoch": 0.18296076937694122, + "grad_norm": 0.1898074597120285, + "learning_rate": 0.001, + "loss": 3.1246, + "num_input_tokens_seen": 7130316800, + "step": 27200 + }, + { + "epoch": 0.18329709432064883, + "grad_norm": 0.19125746190547943, + "learning_rate": 0.001, + "loss": 3.1138, + "num_input_tokens_seen": 7143424000, + "step": 27250 + }, + { + "epoch": 0.18363341926435645, + "grad_norm": 0.17721381783485413, + "learning_rate": 0.001, + "loss": 3.1181, + "num_input_tokens_seen": 7156531200, + "step": 27300 + }, + { + "epoch": 0.18396974420806406, + "grad_norm": 0.26583337783813477, + "learning_rate": 0.001, + "loss": 3.1067, + "num_input_tokens_seen": 7169638400, + "step": 27350 + }, + { + "epoch": 0.18430606915177167, + "grad_norm": 0.18157972395420074, + "learning_rate": 0.001, + "loss": 3.1266, + "num_input_tokens_seen": 7182745600, + "step": 27400 + }, + { + "epoch": 0.18464239409547928, + "grad_norm": 0.17585282027721405, + "learning_rate": 0.001, + "loss": 3.1124, + "num_input_tokens_seen": 7195852800, + "step": 27450 + }, + { + "epoch": 0.1849787190391869, + "grad_norm": 0.23974797129631042, + "learning_rate": 0.001, + "loss": 3.1231, + "num_input_tokens_seen": 7208960000, + "step": 27500 + }, + { + "epoch": 0.1849787190391869, + "eval_loss": 3.023569345474243, + "eval_runtime": 100.613, + "eval_samples_per_second": 49.695, + "eval_steps_per_second": 12.424, + "num_input_tokens_seen": 7208960000, + "step": 27500 + }, + { + "epoch": 0.1853150439828945, + "grad_norm": 0.2258712202310562, + "learning_rate": 0.001, + "loss": 3.1415, + "num_input_tokens_seen": 7222067200, + "step": 27550 + }, + { + "epoch": 0.18565136892660214, + "grad_norm": 0.19764864444732666, + "learning_rate": 0.001, + "loss": 3.1207, + "num_input_tokens_seen": 7235174400, + "step": 27600 + }, + { + "epoch": 0.18598769387030975, + "grad_norm": 0.20053404569625854, + "learning_rate": 0.001, + "loss": 3.1252, + "num_input_tokens_seen": 7248281600, + "step": 27650 + }, + { + "epoch": 0.18632401881401736, + "grad_norm": 0.20705857872962952, + "learning_rate": 0.001, + "loss": 3.1296, + "num_input_tokens_seen": 7261388800, + "step": 27700 + }, + { + "epoch": 0.18666034375772497, + "grad_norm": 0.17856301367282867, + "learning_rate": 0.001, + "loss": 3.1141, + "num_input_tokens_seen": 7274496000, + "step": 27750 + }, + { + "epoch": 0.18699666870143258, + "grad_norm": 0.28354203701019287, + "learning_rate": 0.001, + "loss": 3.1263, + "num_input_tokens_seen": 7287603200, + "step": 27800 + }, + { + "epoch": 0.1873329936451402, + "grad_norm": 0.25223788619041443, + "learning_rate": 0.001, + "loss": 3.122, + "num_input_tokens_seen": 7300710400, + "step": 27850 + }, + { + "epoch": 0.1876693185888478, + "grad_norm": 0.6653568148612976, + "learning_rate": 0.001, + "loss": 3.1434, + "num_input_tokens_seen": 7313817600, + "step": 27900 + }, + { + "epoch": 0.18800564353255542, + "grad_norm": 0.44238439202308655, + "learning_rate": 0.001, + "loss": 3.1306, + "num_input_tokens_seen": 7326924800, + "step": 27950 + }, + { + "epoch": 0.18834196847626303, + "grad_norm": 0.2601284980773926, + "learning_rate": 0.001, + "loss": 3.1399, + "num_input_tokens_seen": 7340032000, + "step": 28000 + }, + { + "epoch": 0.18834196847626303, + "eval_loss": 3.0279700756073, + "eval_runtime": 101.2218, + "eval_samples_per_second": 49.396, + "eval_steps_per_second": 12.349, + "num_input_tokens_seen": 7340032000, + "step": 28000 + }, + { + "epoch": 0.18867829341997064, + "grad_norm": 0.22075016796588898, + "learning_rate": 0.001, + "loss": 3.1316, + "num_input_tokens_seen": 7353139200, + "step": 28050 + }, + { + "epoch": 0.18901461836367825, + "grad_norm": 0.25096815824508667, + "learning_rate": 0.001, + "loss": 3.1188, + "num_input_tokens_seen": 7366246400, + "step": 28100 + }, + { + "epoch": 0.18935094330738586, + "grad_norm": 0.1836758553981781, + "learning_rate": 0.001, + "loss": 3.1159, + "num_input_tokens_seen": 7379353600, + "step": 28150 + }, + { + "epoch": 0.18968726825109347, + "grad_norm": 0.24736745655536652, + "learning_rate": 0.001, + "loss": 3.1068, + "num_input_tokens_seen": 7392460800, + "step": 28200 + }, + { + "epoch": 0.19002359319480108, + "grad_norm": 0.18398351967334747, + "learning_rate": 0.001, + "loss": 3.1122, + "num_input_tokens_seen": 7405568000, + "step": 28250 + }, + { + "epoch": 0.1903599181385087, + "grad_norm": 0.2027016431093216, + "learning_rate": 0.001, + "loss": 3.1086, + "num_input_tokens_seen": 7418675200, + "step": 28300 + }, + { + "epoch": 0.1906962430822163, + "grad_norm": 0.18662536144256592, + "learning_rate": 0.001, + "loss": 3.1165, + "num_input_tokens_seen": 7431782400, + "step": 28350 + }, + { + "epoch": 0.19103256802592392, + "grad_norm": 0.1824251115322113, + "learning_rate": 0.001, + "loss": 3.1179, + "num_input_tokens_seen": 7444889600, + "step": 28400 + }, + { + "epoch": 0.19136889296963153, + "grad_norm": 0.17664241790771484, + "learning_rate": 0.001, + "loss": 3.1114, + "num_input_tokens_seen": 7457996800, + "step": 28450 + }, + { + "epoch": 0.19170521791333917, + "grad_norm": 0.17245933413505554, + "learning_rate": 0.001, + "loss": 3.1113, + "num_input_tokens_seen": 7471104000, + "step": 28500 + }, + { + "epoch": 0.19170521791333917, + "eval_loss": 3.0132832527160645, + "eval_runtime": 100.8874, + "eval_samples_per_second": 49.56, + "eval_steps_per_second": 12.39, + "num_input_tokens_seen": 7471104000, + "step": 28500 + }, + { + "epoch": 0.19204154285704678, + "grad_norm": 0.20906178653240204, + "learning_rate": 0.001, + "loss": 3.1229, + "num_input_tokens_seen": 7484211200, + "step": 28550 + }, + { + "epoch": 0.1923778678007544, + "grad_norm": 0.19940024614334106, + "learning_rate": 0.001, + "loss": 3.1214, + "num_input_tokens_seen": 7497318400, + "step": 28600 + }, + { + "epoch": 0.192714192744462, + "grad_norm": 0.20571599900722504, + "learning_rate": 0.001, + "loss": 3.1124, + "num_input_tokens_seen": 7510425600, + "step": 28650 + }, + { + "epoch": 0.1930505176881696, + "grad_norm": 0.20708389580249786, + "learning_rate": 0.001, + "loss": 3.1336, + "num_input_tokens_seen": 7523532800, + "step": 28700 + }, + { + "epoch": 0.19338684263187722, + "grad_norm": 0.22445373237133026, + "learning_rate": 0.001, + "loss": 3.1157, + "num_input_tokens_seen": 7536640000, + "step": 28750 + }, + { + "epoch": 0.19372316757558483, + "grad_norm": 0.19508902728557587, + "learning_rate": 0.001, + "loss": 3.1248, + "num_input_tokens_seen": 7549747200, + "step": 28800 + }, + { + "epoch": 0.19405949251929244, + "grad_norm": 0.17587284743785858, + "learning_rate": 0.001, + "loss": 3.126, + "num_input_tokens_seen": 7562854400, + "step": 28850 + }, + { + "epoch": 0.19439581746300005, + "grad_norm": 0.18851327896118164, + "learning_rate": 0.001, + "loss": 3.1128, + "num_input_tokens_seen": 7575961600, + "step": 28900 + }, + { + "epoch": 0.19473214240670766, + "grad_norm": 0.1922932267189026, + "learning_rate": 0.001, + "loss": 3.1244, + "num_input_tokens_seen": 7589068800, + "step": 28950 + }, + { + "epoch": 0.19506846735041528, + "grad_norm": 0.21472705900669098, + "learning_rate": 0.001, + "loss": 3.1287, + "num_input_tokens_seen": 7602176000, + "step": 29000 + }, + { + "epoch": 0.19506846735041528, + "eval_loss": 3.0183699131011963, + "eval_runtime": 100.5473, + "eval_samples_per_second": 49.728, + "eval_steps_per_second": 12.432, + "num_input_tokens_seen": 7602176000, + "step": 29000 + }, + { + "epoch": 0.1954047922941229, + "grad_norm": 0.4608127474784851, + "learning_rate": 0.001, + "loss": 3.1279, + "num_input_tokens_seen": 7615283200, + "step": 29050 + }, + { + "epoch": 0.1957411172378305, + "grad_norm": 0.19692400097846985, + "learning_rate": 0.001, + "loss": 3.1227, + "num_input_tokens_seen": 7628390400, + "step": 29100 + }, + { + "epoch": 0.1960774421815381, + "grad_norm": 0.17576786875724792, + "learning_rate": 0.001, + "loss": 3.1082, + "num_input_tokens_seen": 7641497600, + "step": 29150 + }, + { + "epoch": 0.19641376712524572, + "grad_norm": 0.3715643584728241, + "learning_rate": 0.001, + "loss": 3.1136, + "num_input_tokens_seen": 7654604800, + "step": 29200 + }, + { + "epoch": 0.19675009206895333, + "grad_norm": 0.19605587422847748, + "learning_rate": 0.001, + "loss": 3.1233, + "num_input_tokens_seen": 7667712000, + "step": 29250 + }, + { + "epoch": 0.19708641701266094, + "grad_norm": 0.18321235477924347, + "learning_rate": 0.001, + "loss": 3.1127, + "num_input_tokens_seen": 7680819200, + "step": 29300 + }, + { + "epoch": 0.19742274195636855, + "grad_norm": 0.19760318100452423, + "learning_rate": 0.001, + "loss": 3.1074, + "num_input_tokens_seen": 7693926400, + "step": 29350 + }, + { + "epoch": 0.1977590669000762, + "grad_norm": 0.2162732481956482, + "learning_rate": 0.001, + "loss": 3.1126, + "num_input_tokens_seen": 7707033600, + "step": 29400 + }, + { + "epoch": 0.1980953918437838, + "grad_norm": 0.20082025229930878, + "learning_rate": 0.001, + "loss": 3.1111, + "num_input_tokens_seen": 7720140800, + "step": 29450 + }, + { + "epoch": 0.1984317167874914, + "grad_norm": 0.1623256504535675, + "learning_rate": 0.001, + "loss": 3.108, + "num_input_tokens_seen": 7733248000, + "step": 29500 + }, + { + "epoch": 0.1984317167874914, + "eval_loss": 3.0064518451690674, + "eval_runtime": 101.1359, + "eval_samples_per_second": 49.438, + "eval_steps_per_second": 12.36, + "num_input_tokens_seen": 7733248000, + "step": 29500 + }, + { + "epoch": 0.19876804173119902, + "grad_norm": 0.22083748877048492, + "learning_rate": 0.001, + "loss": 3.1171, + "num_input_tokens_seen": 7746355200, + "step": 29550 + }, + { + "epoch": 0.19910436667490664, + "grad_norm": 0.23460319638252258, + "learning_rate": 0.001, + "loss": 3.1199, + "num_input_tokens_seen": 7759462400, + "step": 29600 + }, + { + "epoch": 0.19944069161861425, + "grad_norm": 0.22132454812526703, + "learning_rate": 0.001, + "loss": 3.1058, + "num_input_tokens_seen": 7772569600, + "step": 29650 + }, + { + "epoch": 0.19977701656232186, + "grad_norm": 0.18013770878314972, + "learning_rate": 0.001, + "loss": 3.1193, + "num_input_tokens_seen": 7785676800, + "step": 29700 + }, + { + "epoch": 0.20011334150602947, + "grad_norm": 0.17522746324539185, + "learning_rate": 0.001, + "loss": 3.1017, + "num_input_tokens_seen": 7798784000, + "step": 29750 + }, + { + "epoch": 0.20044966644973708, + "grad_norm": 0.18989771604537964, + "learning_rate": 0.001, + "loss": 3.1029, + "num_input_tokens_seen": 7811891200, + "step": 29800 + }, + { + "epoch": 0.2007859913934447, + "grad_norm": 0.38332825899124146, + "learning_rate": 0.001, + "loss": 3.11, + "num_input_tokens_seen": 7824998400, + "step": 29850 + }, + { + "epoch": 0.2011223163371523, + "grad_norm": 0.1822918802499771, + "learning_rate": 0.001, + "loss": 3.1018, + "num_input_tokens_seen": 7838105600, + "step": 29900 + }, + { + "epoch": 0.2014586412808599, + "grad_norm": 0.1707374006509781, + "learning_rate": 0.001, + "loss": 3.1077, + "num_input_tokens_seen": 7851212800, + "step": 29950 + }, + { + "epoch": 0.20179496622456752, + "grad_norm": 0.32529768347740173, + "learning_rate": 0.001, + "loss": 3.1074, + "num_input_tokens_seen": 7864320000, + "step": 30000 + }, + { + "epoch": 0.20179496622456752, + "eval_loss": 3.0052828788757324, + "eval_runtime": 101.5526, + "eval_samples_per_second": 49.236, + "eval_steps_per_second": 12.309, + "num_input_tokens_seen": 7864320000, + "step": 30000 + }, + { + "epoch": 0.20213129116827513, + "grad_norm": 0.3476233184337616, + "learning_rate": 0.001, + "loss": 3.1145, + "num_input_tokens_seen": 7877427200, + "step": 30050 + }, + { + "epoch": 0.20246761611198275, + "grad_norm": 0.18626414239406586, + "learning_rate": 0.001, + "loss": 3.1066, + "num_input_tokens_seen": 7890534400, + "step": 30100 + }, + { + "epoch": 0.20280394105569036, + "grad_norm": 0.254221647977829, + "learning_rate": 0.001, + "loss": 3.1092, + "num_input_tokens_seen": 7903641600, + "step": 30150 + }, + { + "epoch": 0.20314026599939797, + "grad_norm": 0.22347959876060486, + "learning_rate": 0.001, + "loss": 3.1168, + "num_input_tokens_seen": 7916748800, + "step": 30200 + }, + { + "epoch": 0.20347659094310558, + "grad_norm": 0.22947949171066284, + "learning_rate": 0.001, + "loss": 3.1181, + "num_input_tokens_seen": 7929856000, + "step": 30250 + }, + { + "epoch": 0.2038129158868132, + "grad_norm": 0.18353967368602753, + "learning_rate": 0.001, + "loss": 3.1154, + "num_input_tokens_seen": 7942963200, + "step": 30300 + }, + { + "epoch": 0.20414924083052083, + "grad_norm": 0.1840088963508606, + "learning_rate": 0.001, + "loss": 3.1145, + "num_input_tokens_seen": 7956070400, + "step": 30350 + }, + { + "epoch": 0.20448556577422844, + "grad_norm": 0.21340833604335785, + "learning_rate": 0.001, + "loss": 3.1175, + "num_input_tokens_seen": 7969177600, + "step": 30400 + }, + { + "epoch": 0.20482189071793605, + "grad_norm": 0.22054563462734222, + "learning_rate": 0.001, + "loss": 3.1082, + "num_input_tokens_seen": 7982284800, + "step": 30450 + }, + { + "epoch": 0.20515821566164366, + "grad_norm": 0.23745377361774445, + "learning_rate": 0.001, + "loss": 3.1155, + "num_input_tokens_seen": 7995392000, + "step": 30500 + }, + { + "epoch": 0.20515821566164366, + "eval_loss": 3.0057761669158936, + "eval_runtime": 111.9719, + "eval_samples_per_second": 44.654, + "eval_steps_per_second": 11.164, + "num_input_tokens_seen": 7995392000, + "step": 30500 + }, + { + "epoch": 0.20549454060535127, + "grad_norm": 0.18189583718776703, + "learning_rate": 0.001, + "loss": 3.119, + "num_input_tokens_seen": 8008499200, + "step": 30550 + }, + { + "epoch": 0.20583086554905888, + "grad_norm": 0.23735597729682922, + "learning_rate": 0.001, + "loss": 3.1114, + "num_input_tokens_seen": 8021606400, + "step": 30600 + }, + { + "epoch": 0.2061671904927665, + "grad_norm": 0.6922980546951294, + "learning_rate": 0.001, + "loss": 3.1155, + "num_input_tokens_seen": 8034713600, + "step": 30650 + }, + { + "epoch": 0.2065035154364741, + "grad_norm": 0.277959406375885, + "learning_rate": 0.001, + "loss": 3.113, + "num_input_tokens_seen": 8047820800, + "step": 30700 + }, + { + "epoch": 0.20683984038018172, + "grad_norm": 0.20879347622394562, + "learning_rate": 0.001, + "loss": 3.1052, + "num_input_tokens_seen": 8060928000, + "step": 30750 + }, + { + "epoch": 0.20717616532388933, + "grad_norm": 0.2380591332912445, + "learning_rate": 0.001, + "loss": 3.0957, + "num_input_tokens_seen": 8074035200, + "step": 30800 + }, + { + "epoch": 0.20751249026759694, + "grad_norm": 0.19781485199928284, + "learning_rate": 0.001, + "loss": 3.1043, + "num_input_tokens_seen": 8087142400, + "step": 30850 + }, + { + "epoch": 0.20784881521130455, + "grad_norm": 0.20070037245750427, + "learning_rate": 0.001, + "loss": 3.1064, + "num_input_tokens_seen": 8100249600, + "step": 30900 + }, + { + "epoch": 0.20818514015501216, + "grad_norm": 0.1823301464319229, + "learning_rate": 0.001, + "loss": 3.0957, + "num_input_tokens_seen": 8113356800, + "step": 30950 + }, + { + "epoch": 0.20852146509871977, + "grad_norm": 0.21749699115753174, + "learning_rate": 0.001, + "loss": 3.0952, + "num_input_tokens_seen": 8126464000, + "step": 31000 + }, + { + "epoch": 0.20852146509871977, + "eval_loss": 3.003377676010132, + "eval_runtime": 101.0747, + "eval_samples_per_second": 49.468, + "eval_steps_per_second": 12.367, + "num_input_tokens_seen": 8126464000, + "step": 31000 + }, + { + "epoch": 0.20885779004242738, + "grad_norm": 0.189363494515419, + "learning_rate": 0.001, + "loss": 3.0965, + "num_input_tokens_seen": 8139571200, + "step": 31050 + }, + { + "epoch": 0.209194114986135, + "grad_norm": 0.22209693491458893, + "learning_rate": 0.001, + "loss": 3.0999, + "num_input_tokens_seen": 8152678400, + "step": 31100 + }, + { + "epoch": 0.2095304399298426, + "grad_norm": 0.2229391485452652, + "learning_rate": 0.001, + "loss": 3.1051, + "num_input_tokens_seen": 8165785600, + "step": 31150 + }, + { + "epoch": 0.20986676487355022, + "grad_norm": 0.29246941208839417, + "learning_rate": 0.001, + "loss": 3.1091, + "num_input_tokens_seen": 8178892800, + "step": 31200 + }, + { + "epoch": 0.21020308981725785, + "grad_norm": 0.20801013708114624, + "learning_rate": 0.001, + "loss": 3.1203, + "num_input_tokens_seen": 8192000000, + "step": 31250 + }, + { + "epoch": 0.21053941476096547, + "grad_norm": 0.20411519706249237, + "learning_rate": 0.001, + "loss": 3.0988, + "num_input_tokens_seen": 8205107200, + "step": 31300 + }, + { + "epoch": 0.21087573970467308, + "grad_norm": 0.20220200717449188, + "learning_rate": 0.001, + "loss": 3.0999, + "num_input_tokens_seen": 8218214400, + "step": 31350 + }, + { + "epoch": 0.2112120646483807, + "grad_norm": 0.2107418328523636, + "learning_rate": 0.001, + "loss": 3.1052, + "num_input_tokens_seen": 8231321600, + "step": 31400 + }, + { + "epoch": 0.2115483895920883, + "grad_norm": 0.2079913169145584, + "learning_rate": 0.001, + "loss": 3.1095, + "num_input_tokens_seen": 8244428800, + "step": 31450 + }, + { + "epoch": 0.2118847145357959, + "grad_norm": 0.20990462601184845, + "learning_rate": 0.001, + "loss": 3.1095, + "num_input_tokens_seen": 8257536000, + "step": 31500 + }, + { + "epoch": 0.2118847145357959, + "eval_loss": 3.002542018890381, + "eval_runtime": 102.5542, + "eval_samples_per_second": 48.755, + "eval_steps_per_second": 12.189, + "num_input_tokens_seen": 8257536000, + "step": 31500 + }, + { + "epoch": 0.21222103947950352, + "grad_norm": 0.3199199140071869, + "learning_rate": 0.001, + "loss": 3.1193, + "num_input_tokens_seen": 8270643200, + "step": 31550 + }, + { + "epoch": 0.21255736442321113, + "grad_norm": 0.3326248824596405, + "learning_rate": 0.001, + "loss": 3.1398, + "num_input_tokens_seen": 8283750400, + "step": 31600 + }, + { + "epoch": 0.21289368936691874, + "grad_norm": 0.22599756717681885, + "learning_rate": 0.001, + "loss": 3.1192, + "num_input_tokens_seen": 8296857600, + "step": 31650 + }, + { + "epoch": 0.21323001431062635, + "grad_norm": 0.24016565084457397, + "learning_rate": 0.001, + "loss": 3.1208, + "num_input_tokens_seen": 8309964800, + "step": 31700 + }, + { + "epoch": 0.21356633925433396, + "grad_norm": 0.2414240539073944, + "learning_rate": 0.001, + "loss": 3.1188, + "num_input_tokens_seen": 8323072000, + "step": 31750 + }, + { + "epoch": 0.21390266419804158, + "grad_norm": 0.21480241417884827, + "learning_rate": 0.001, + "loss": 3.1183, + "num_input_tokens_seen": 8336179200, + "step": 31800 + }, + { + "epoch": 0.2142389891417492, + "grad_norm": 0.23109237849712372, + "learning_rate": 0.001, + "loss": 3.117, + "num_input_tokens_seen": 8349286400, + "step": 31850 + }, + { + "epoch": 0.2145753140854568, + "grad_norm": 1.8171563148498535, + "learning_rate": 0.001, + "loss": 3.1133, + "num_input_tokens_seen": 8362393600, + "step": 31900 + }, + { + "epoch": 0.2149116390291644, + "grad_norm": 0.19770418107509613, + "learning_rate": 0.001, + "loss": 3.1004, + "num_input_tokens_seen": 8375500800, + "step": 31950 + }, + { + "epoch": 0.21524796397287202, + "grad_norm": 0.17420834302902222, + "learning_rate": 0.001, + "loss": 3.1201, + "num_input_tokens_seen": 8388608000, + "step": 32000 + }, + { + "epoch": 0.21524796397287202, + "eval_loss": 2.999030113220215, + "eval_runtime": 101.4799, + "eval_samples_per_second": 49.271, + "eval_steps_per_second": 12.318, + "num_input_tokens_seen": 8388608000, + "step": 32000 + }, + { + "epoch": 0.21558428891657963, + "grad_norm": 0.17642393708229065, + "learning_rate": 0.001, + "loss": 3.0933, + "num_input_tokens_seen": 8401715200, + "step": 32050 + }, + { + "epoch": 0.21592061386028724, + "grad_norm": 0.17682494223117828, + "learning_rate": 0.001, + "loss": 3.0944, + "num_input_tokens_seen": 8414822400, + "step": 32100 + }, + { + "epoch": 0.21625693880399488, + "grad_norm": 0.18188636004924774, + "learning_rate": 0.001, + "loss": 3.0952, + "num_input_tokens_seen": 8427929600, + "step": 32150 + }, + { + "epoch": 0.2165932637477025, + "grad_norm": 0.19842028617858887, + "learning_rate": 0.001, + "loss": 3.0952, + "num_input_tokens_seen": 8441036800, + "step": 32200 + }, + { + "epoch": 0.2169295886914101, + "grad_norm": 0.5376595854759216, + "learning_rate": 0.001, + "loss": 3.0995, + "num_input_tokens_seen": 8454144000, + "step": 32250 + }, + { + "epoch": 0.2172659136351177, + "grad_norm": 0.1966828554868698, + "learning_rate": 0.001, + "loss": 3.0998, + "num_input_tokens_seen": 8467251200, + "step": 32300 + }, + { + "epoch": 0.21760223857882532, + "grad_norm": 0.16826917231082916, + "learning_rate": 0.001, + "loss": 3.1058, + "num_input_tokens_seen": 8480358400, + "step": 32350 + }, + { + "epoch": 0.21793856352253294, + "grad_norm": 0.17534971237182617, + "learning_rate": 0.001, + "loss": 3.0989, + "num_input_tokens_seen": 8493465600, + "step": 32400 + }, + { + "epoch": 0.21827488846624055, + "grad_norm": 0.18534857034683228, + "learning_rate": 0.001, + "loss": 3.1006, + "num_input_tokens_seen": 8506572800, + "step": 32450 + }, + { + "epoch": 0.21861121340994816, + "grad_norm": 0.23653754591941833, + "learning_rate": 0.001, + "loss": 3.0979, + "num_input_tokens_seen": 8519680000, + "step": 32500 + }, + { + "epoch": 0.21861121340994816, + "eval_loss": 2.9992752075195312, + "eval_runtime": 104.1376, + "eval_samples_per_second": 48.013, + "eval_steps_per_second": 12.003, + "num_input_tokens_seen": 8519680000, + "step": 32500 + }, + { + "epoch": 0.21894753835365577, + "grad_norm": 0.235910564661026, + "learning_rate": 0.001, + "loss": 3.0965, + "num_input_tokens_seen": 8532787200, + "step": 32550 + }, + { + "epoch": 0.21928386329736338, + "grad_norm": 0.17582012712955475, + "learning_rate": 0.001, + "loss": 3.0979, + "num_input_tokens_seen": 8545894400, + "step": 32600 + }, + { + "epoch": 0.219620188241071, + "grad_norm": 0.16854169964790344, + "learning_rate": 0.001, + "loss": 3.0962, + "num_input_tokens_seen": 8559001600, + "step": 32650 + }, + { + "epoch": 0.2199565131847786, + "grad_norm": 0.20170539617538452, + "learning_rate": 0.001, + "loss": 3.0967, + "num_input_tokens_seen": 8572108800, + "step": 32700 + }, + { + "epoch": 0.2202928381284862, + "grad_norm": 0.15898527204990387, + "learning_rate": 0.001, + "loss": 3.0846, + "num_input_tokens_seen": 8585216000, + "step": 32750 + }, + { + "epoch": 0.22062916307219382, + "grad_norm": 0.18423962593078613, + "learning_rate": 0.001, + "loss": 3.0873, + "num_input_tokens_seen": 8598323200, + "step": 32800 + }, + { + "epoch": 0.22096548801590143, + "grad_norm": 0.22025519609451294, + "learning_rate": 0.001, + "loss": 3.1034, + "num_input_tokens_seen": 8611430400, + "step": 32850 + }, + { + "epoch": 0.22130181295960905, + "grad_norm": 0.22972916066646576, + "learning_rate": 0.001, + "loss": 3.1018, + "num_input_tokens_seen": 8624537600, + "step": 32900 + }, + { + "epoch": 0.22163813790331666, + "grad_norm": 0.3072693347930908, + "learning_rate": 0.001, + "loss": 3.1044, + "num_input_tokens_seen": 8637644800, + "step": 32950 + }, + { + "epoch": 0.22197446284702427, + "grad_norm": 0.16734054684638977, + "learning_rate": 0.001, + "loss": 3.1079, + "num_input_tokens_seen": 8650752000, + "step": 33000 + }, + { + "epoch": 0.22197446284702427, + "eval_loss": 2.9946696758270264, + "eval_runtime": 101.2832, + "eval_samples_per_second": 49.367, + "eval_steps_per_second": 12.342, + "num_input_tokens_seen": 8650752000, + "step": 33000 + }, + { + "epoch": 0.2223107877907319, + "grad_norm": 0.19366054236888885, + "learning_rate": 0.001, + "loss": 3.1123, + "num_input_tokens_seen": 8663859200, + "step": 33050 + }, + { + "epoch": 0.22264711273443952, + "grad_norm": 0.1908022165298462, + "learning_rate": 0.001, + "loss": 3.0921, + "num_input_tokens_seen": 8676966400, + "step": 33100 + }, + { + "epoch": 0.22298343767814713, + "grad_norm": 0.1756322979927063, + "learning_rate": 0.001, + "loss": 3.0898, + "num_input_tokens_seen": 8690073600, + "step": 33150 + }, + { + "epoch": 0.22331976262185474, + "grad_norm": 0.17791526019573212, + "learning_rate": 0.001, + "loss": 3.099, + "num_input_tokens_seen": 8703180800, + "step": 33200 + }, + { + "epoch": 0.22365608756556235, + "grad_norm": 0.1831691414117813, + "learning_rate": 0.001, + "loss": 3.0961, + "num_input_tokens_seen": 8716288000, + "step": 33250 + }, + { + "epoch": 0.22399241250926996, + "grad_norm": 0.21115051209926605, + "learning_rate": 0.001, + "loss": 3.092, + "num_input_tokens_seen": 8729395200, + "step": 33300 + }, + { + "epoch": 0.22432873745297757, + "grad_norm": 0.2059226632118225, + "learning_rate": 0.001, + "loss": 3.0982, + "num_input_tokens_seen": 8742502400, + "step": 33350 + }, + { + "epoch": 0.22466506239668518, + "grad_norm": 0.18022479116916656, + "learning_rate": 0.001, + "loss": 3.0853, + "num_input_tokens_seen": 8755609600, + "step": 33400 + }, + { + "epoch": 0.2250013873403928, + "grad_norm": 0.18534015119075775, + "learning_rate": 0.001, + "loss": 3.0872, + "num_input_tokens_seen": 8768716800, + "step": 33450 + }, + { + "epoch": 0.2253377122841004, + "grad_norm": 0.1856871247291565, + "learning_rate": 0.001, + "loss": 3.0888, + "num_input_tokens_seen": 8781824000, + "step": 33500 + }, + { + "epoch": 0.2253377122841004, + "eval_loss": 2.989856243133545, + "eval_runtime": 100.552, + "eval_samples_per_second": 49.725, + "eval_steps_per_second": 12.431, + "num_input_tokens_seen": 8781824000, + "step": 33500 + }, + { + "epoch": 0.22567403722780802, + "grad_norm": 0.1858453005552292, + "learning_rate": 0.001, + "loss": 3.0859, + "num_input_tokens_seen": 8794931200, + "step": 33550 + }, + { + "epoch": 0.22601036217151563, + "grad_norm": 0.15424181520938873, + "learning_rate": 0.001, + "loss": 3.094, + "num_input_tokens_seen": 8808038400, + "step": 33600 + }, + { + "epoch": 0.22634668711522324, + "grad_norm": 0.1684613823890686, + "learning_rate": 0.001, + "loss": 3.0924, + "num_input_tokens_seen": 8821145600, + "step": 33650 + }, + { + "epoch": 0.22668301205893085, + "grad_norm": 0.20759907364845276, + "learning_rate": 0.001, + "loss": 3.0967, + "num_input_tokens_seen": 8834252800, + "step": 33700 + }, + { + "epoch": 0.22701933700263846, + "grad_norm": 0.20460882782936096, + "learning_rate": 0.001, + "loss": 3.0938, + "num_input_tokens_seen": 8847360000, + "step": 33750 + }, + { + "epoch": 0.22735566194634607, + "grad_norm": 1.1036453247070312, + "learning_rate": 0.001, + "loss": 3.092, + "num_input_tokens_seen": 8860467200, + "step": 33800 + }, + { + "epoch": 0.22769198689005368, + "grad_norm": 0.23920981585979462, + "learning_rate": 0.001, + "loss": 3.1014, + "num_input_tokens_seen": 8873574400, + "step": 33850 + }, + { + "epoch": 0.2280283118337613, + "grad_norm": 0.44639232754707336, + "learning_rate": 0.001, + "loss": 3.0925, + "num_input_tokens_seen": 8886681600, + "step": 33900 + }, + { + "epoch": 0.22836463677746893, + "grad_norm": 0.20524434745311737, + "learning_rate": 0.001, + "loss": 3.0998, + "num_input_tokens_seen": 8899788800, + "step": 33950 + }, + { + "epoch": 0.22870096172117654, + "grad_norm": 0.3326469957828522, + "learning_rate": 0.001, + "loss": 3.1028, + "num_input_tokens_seen": 8912896000, + "step": 34000 + }, + { + "epoch": 0.22870096172117654, + "eval_loss": 2.99273681640625, + "eval_runtime": 101.3229, + "eval_samples_per_second": 49.347, + "eval_steps_per_second": 12.337, + "num_input_tokens_seen": 8912896000, + "step": 34000 + }, + { + "epoch": 0.22903728666488415, + "grad_norm": 0.1811443269252777, + "learning_rate": 0.001, + "loss": 3.0963, + "num_input_tokens_seen": 8926003200, + "step": 34050 + }, + { + "epoch": 0.22937361160859177, + "grad_norm": 0.1995471566915512, + "learning_rate": 0.001, + "loss": 3.0929, + "num_input_tokens_seen": 8939110400, + "step": 34100 + }, + { + "epoch": 0.22970993655229938, + "grad_norm": 0.18768733739852905, + "learning_rate": 0.001, + "loss": 3.088, + "num_input_tokens_seen": 8952217600, + "step": 34150 + }, + { + "epoch": 0.230046261496007, + "grad_norm": 0.23258289694786072, + "learning_rate": 0.001, + "loss": 3.0952, + "num_input_tokens_seen": 8965324800, + "step": 34200 + }, + { + "epoch": 0.2303825864397146, + "grad_norm": 0.2201758772134781, + "learning_rate": 0.001, + "loss": 3.105, + "num_input_tokens_seen": 8978432000, + "step": 34250 + }, + { + "epoch": 0.2307189113834222, + "grad_norm": 0.3385378420352936, + "learning_rate": 0.001, + "loss": 3.1242, + "num_input_tokens_seen": 8991539200, + "step": 34300 + }, + { + "epoch": 0.23105523632712982, + "grad_norm": 0.40026524662971497, + "learning_rate": 0.001, + "loss": 3.1236, + "num_input_tokens_seen": 9004646400, + "step": 34350 + }, + { + "epoch": 0.23139156127083743, + "grad_norm": 0.9256707429885864, + "learning_rate": 0.001, + "loss": 3.1417, + "num_input_tokens_seen": 9017753600, + "step": 34400 + }, + { + "epoch": 0.23172788621454504, + "grad_norm": 0.2774488627910614, + "learning_rate": 0.001, + "loss": 3.1258, + "num_input_tokens_seen": 9030860800, + "step": 34450 + }, + { + "epoch": 0.23206421115825265, + "grad_norm": 0.3596802353858948, + "learning_rate": 0.001, + "loss": 3.1182, + "num_input_tokens_seen": 9043968000, + "step": 34500 + }, + { + "epoch": 0.23206421115825265, + "eval_loss": 3.0026750564575195, + "eval_runtime": 101.1458, + "eval_samples_per_second": 49.434, + "eval_steps_per_second": 12.358, + "num_input_tokens_seen": 9043968000, + "step": 34500 + }, + { + "epoch": 0.23240053610196026, + "grad_norm": 0.23315957188606262, + "learning_rate": 0.001, + "loss": 3.0983, + "num_input_tokens_seen": 9057075200, + "step": 34550 + }, + { + "epoch": 0.23273686104566788, + "grad_norm": 0.21506045758724213, + "learning_rate": 0.001, + "loss": 3.1006, + "num_input_tokens_seen": 9070182400, + "step": 34600 + }, + { + "epoch": 0.2330731859893755, + "grad_norm": 0.23909543454647064, + "learning_rate": 0.001, + "loss": 3.109, + "num_input_tokens_seen": 9083289600, + "step": 34650 + }, + { + "epoch": 0.2334095109330831, + "grad_norm": 0.31270143389701843, + "learning_rate": 0.001, + "loss": 3.1062, + "num_input_tokens_seen": 9096396800, + "step": 34700 + }, + { + "epoch": 0.2337458358767907, + "grad_norm": 0.2879350483417511, + "learning_rate": 0.001, + "loss": 3.1065, + "num_input_tokens_seen": 9109504000, + "step": 34750 + }, + { + "epoch": 0.23408216082049832, + "grad_norm": 0.1994767040014267, + "learning_rate": 0.001, + "loss": 3.0984, + "num_input_tokens_seen": 9122611200, + "step": 34800 + }, + { + "epoch": 0.23441848576420596, + "grad_norm": 0.19194720685482025, + "learning_rate": 0.001, + "loss": 3.1003, + "num_input_tokens_seen": 9135718400, + "step": 34850 + }, + { + "epoch": 0.23475481070791357, + "grad_norm": 0.22253084182739258, + "learning_rate": 0.001, + "loss": 3.0969, + "num_input_tokens_seen": 9148825600, + "step": 34900 + }, + { + "epoch": 0.23509113565162118, + "grad_norm": 0.2180721014738083, + "learning_rate": 0.001, + "loss": 3.0986, + "num_input_tokens_seen": 9161932800, + "step": 34950 + }, + { + "epoch": 0.2354274605953288, + "grad_norm": 0.1867762804031372, + "learning_rate": 0.001, + "loss": 3.0831, + "num_input_tokens_seen": 9175040000, + "step": 35000 + }, + { + "epoch": 0.2354274605953288, + "eval_loss": 2.987546920776367, + "eval_runtime": 100.7704, + "eval_samples_per_second": 49.618, + "eval_steps_per_second": 12.404, + "num_input_tokens_seen": 9175040000, + "step": 35000 + }, + { + "epoch": 0.2357637855390364, + "grad_norm": 0.22034546732902527, + "learning_rate": 0.001, + "loss": 3.0963, + "num_input_tokens_seen": 9188147200, + "step": 35050 + }, + { + "epoch": 0.236100110482744, + "grad_norm": 0.20113885402679443, + "learning_rate": 0.001, + "loss": 3.0948, + "num_input_tokens_seen": 9201254400, + "step": 35100 + }, + { + "epoch": 0.23643643542645162, + "grad_norm": 0.3071548342704773, + "learning_rate": 0.001, + "loss": 3.0991, + "num_input_tokens_seen": 9214361600, + "step": 35150 + }, + { + "epoch": 0.23677276037015924, + "grad_norm": 0.19438254833221436, + "learning_rate": 0.001, + "loss": 3.0931, + "num_input_tokens_seen": 9227468800, + "step": 35200 + }, + { + "epoch": 0.23710908531386685, + "grad_norm": 0.19387036561965942, + "learning_rate": 0.001, + "loss": 3.0945, + "num_input_tokens_seen": 9240576000, + "step": 35250 + }, + { + "epoch": 0.23744541025757446, + "grad_norm": 0.33751723170280457, + "learning_rate": 0.001, + "loss": 3.0853, + "num_input_tokens_seen": 9253683200, + "step": 35300 + }, + { + "epoch": 0.23778173520128207, + "grad_norm": 0.20809979736804962, + "learning_rate": 0.001, + "loss": 3.0886, + "num_input_tokens_seen": 9266790400, + "step": 35350 + }, + { + "epoch": 0.23811806014498968, + "grad_norm": 0.22419853508472443, + "learning_rate": 0.001, + "loss": 3.0903, + "num_input_tokens_seen": 9279897600, + "step": 35400 + }, + { + "epoch": 0.2384543850886973, + "grad_norm": 0.38772207498550415, + "learning_rate": 0.001, + "loss": 3.0917, + "num_input_tokens_seen": 9293004800, + "step": 35450 + }, + { + "epoch": 0.2387907100324049, + "grad_norm": 0.26076874136924744, + "learning_rate": 0.001, + "loss": 3.1019, + "num_input_tokens_seen": 9306112000, + "step": 35500 + }, + { + "epoch": 0.2387907100324049, + "eval_loss": 2.9896371364593506, + "eval_runtime": 101.1154, + "eval_samples_per_second": 49.448, + "eval_steps_per_second": 12.362, + "num_input_tokens_seen": 9306112000, + "step": 35500 + }, + { + "epoch": 0.2391270349761125, + "grad_norm": 0.2551634907722473, + "learning_rate": 0.001, + "loss": 3.0983, + "num_input_tokens_seen": 9319219200, + "step": 35550 + }, + { + "epoch": 0.23946335991982012, + "grad_norm": 0.25666406750679016, + "learning_rate": 0.001, + "loss": 3.0973, + "num_input_tokens_seen": 9332326400, + "step": 35600 + }, + { + "epoch": 0.23979968486352773, + "grad_norm": 0.21999171376228333, + "learning_rate": 0.001, + "loss": 3.1023, + "num_input_tokens_seen": 9345433600, + "step": 35650 + }, + { + "epoch": 0.24013600980723535, + "grad_norm": 0.24575746059417725, + "learning_rate": 0.001, + "loss": 3.0954, + "num_input_tokens_seen": 9358540800, + "step": 35700 + }, + { + "epoch": 0.24047233475094298, + "grad_norm": 0.23686733841896057, + "learning_rate": 0.001, + "loss": 3.0939, + "num_input_tokens_seen": 9371648000, + "step": 35750 + }, + { + "epoch": 0.2408086596946506, + "grad_norm": 0.2265947312116623, + "learning_rate": 0.001, + "loss": 3.0972, + "num_input_tokens_seen": 9384755200, + "step": 35800 + }, + { + "epoch": 0.2411449846383582, + "grad_norm": 0.19846303761005402, + "learning_rate": 0.001, + "loss": 3.0879, + "num_input_tokens_seen": 9397862400, + "step": 35850 + }, + { + "epoch": 0.24148130958206582, + "grad_norm": 0.3813537657260895, + "learning_rate": 0.001, + "loss": 3.091, + "num_input_tokens_seen": 9410969600, + "step": 35900 + }, + { + "epoch": 0.24181763452577343, + "grad_norm": 0.21791686117649078, + "learning_rate": 0.001, + "loss": 3.1126, + "num_input_tokens_seen": 9424076800, + "step": 35950 + }, + { + "epoch": 0.24215395946948104, + "grad_norm": 0.1958397924900055, + "learning_rate": 0.001, + "loss": 3.0993, + "num_input_tokens_seen": 9437184000, + "step": 36000 + }, + { + "epoch": 0.24215395946948104, + "eval_loss": 2.9876413345336914, + "eval_runtime": 100.7038, + "eval_samples_per_second": 49.651, + "eval_steps_per_second": 12.413, + "num_input_tokens_seen": 9437184000, + "step": 36000 + }, + { + "epoch": 0.24249028441318865, + "grad_norm": 0.18913355469703674, + "learning_rate": 0.001, + "loss": 3.0824, + "num_input_tokens_seen": 9450291200, + "step": 36050 + }, + { + "epoch": 0.24282660935689626, + "grad_norm": 0.17502999305725098, + "learning_rate": 0.001, + "loss": 3.0947, + "num_input_tokens_seen": 9463398400, + "step": 36100 + }, + { + "epoch": 0.24316293430060387, + "grad_norm": 0.1844000667333603, + "learning_rate": 0.001, + "loss": 3.1031, + "num_input_tokens_seen": 9476505600, + "step": 36150 + }, + { + "epoch": 0.24349925924431148, + "grad_norm": 0.21123170852661133, + "learning_rate": 0.001, + "loss": 3.0917, + "num_input_tokens_seen": 9489612800, + "step": 36200 + }, + { + "epoch": 0.2438355841880191, + "grad_norm": 0.20432326197624207, + "learning_rate": 0.001, + "loss": 3.0777, + "num_input_tokens_seen": 9502720000, + "step": 36250 + }, + { + "epoch": 0.2441719091317267, + "grad_norm": 0.1782015562057495, + "learning_rate": 0.001, + "loss": 3.0825, + "num_input_tokens_seen": 9515827200, + "step": 36300 + }, + { + "epoch": 0.24450823407543432, + "grad_norm": 0.17421406507492065, + "learning_rate": 0.001, + "loss": 3.0936, + "num_input_tokens_seen": 9528934400, + "step": 36350 + }, + { + "epoch": 0.24484455901914193, + "grad_norm": 0.20186392962932587, + "learning_rate": 0.001, + "loss": 3.0873, + "num_input_tokens_seen": 9542041600, + "step": 36400 + }, + { + "epoch": 0.24518088396284954, + "grad_norm": 0.8788098692893982, + "learning_rate": 0.001, + "loss": 3.0859, + "num_input_tokens_seen": 9555148800, + "step": 36450 + }, + { + "epoch": 0.24551720890655715, + "grad_norm": 0.21201574802398682, + "learning_rate": 0.001, + "loss": 3.0801, + "num_input_tokens_seen": 9568256000, + "step": 36500 + }, + { + "epoch": 0.24551720890655715, + "eval_loss": 2.9814889430999756, + "eval_runtime": 102.1027, + "eval_samples_per_second": 48.97, + "eval_steps_per_second": 12.243, + "num_input_tokens_seen": 9568256000, + "step": 36500 + }, + { + "epoch": 0.24585353385026476, + "grad_norm": 0.19073808193206787, + "learning_rate": 0.001, + "loss": 3.0944, + "num_input_tokens_seen": 9581363200, + "step": 36550 + }, + { + "epoch": 0.24618985879397237, + "grad_norm": 0.17367486655712128, + "learning_rate": 0.001, + "loss": 3.0892, + "num_input_tokens_seen": 9594470400, + "step": 36600 + }, + { + "epoch": 0.24652618373768, + "grad_norm": 0.24065230786800385, + "learning_rate": 0.001, + "loss": 3.092, + "num_input_tokens_seen": 9607577600, + "step": 36650 + }, + { + "epoch": 0.24686250868138762, + "grad_norm": 0.18443045020103455, + "learning_rate": 0.001, + "loss": 3.0863, + "num_input_tokens_seen": 9620684800, + "step": 36700 + }, + { + "epoch": 0.24719883362509523, + "grad_norm": 0.2121111899614334, + "learning_rate": 0.001, + "loss": 3.0902, + "num_input_tokens_seen": 9633792000, + "step": 36750 + }, + { + "epoch": 0.24753515856880284, + "grad_norm": 0.17981579899787903, + "learning_rate": 0.001, + "loss": 3.0888, + "num_input_tokens_seen": 9646899200, + "step": 36800 + }, + { + "epoch": 0.24787148351251045, + "grad_norm": 0.24683868885040283, + "learning_rate": 0.001, + "loss": 3.0895, + "num_input_tokens_seen": 9660006400, + "step": 36850 + }, + { + "epoch": 0.24820780845621807, + "grad_norm": 0.17905527353286743, + "learning_rate": 0.001, + "loss": 3.0771, + "num_input_tokens_seen": 9673113600, + "step": 36900 + }, + { + "epoch": 0.24854413339992568, + "grad_norm": 0.4657650589942932, + "learning_rate": 0.001, + "loss": 3.093, + "num_input_tokens_seen": 9686220800, + "step": 36950 + }, + { + "epoch": 0.2488804583436333, + "grad_norm": 0.2079911082983017, + "learning_rate": 0.001, + "loss": 3.0913, + "num_input_tokens_seen": 9699328000, + "step": 37000 + }, + { + "epoch": 0.2488804583436333, + "eval_loss": 2.984112024307251, + "eval_runtime": 100.7836, + "eval_samples_per_second": 49.611, + "eval_steps_per_second": 12.403, + "num_input_tokens_seen": 9699328000, + "step": 37000 + }, + { + "epoch": 0.2492167832873409, + "grad_norm": 0.18588006496429443, + "learning_rate": 0.001, + "loss": 3.086, + "num_input_tokens_seen": 9712435200, + "step": 37050 + }, + { + "epoch": 0.2495531082310485, + "grad_norm": 0.500970721244812, + "learning_rate": 0.001, + "loss": 3.0786, + "num_input_tokens_seen": 9725542400, + "step": 37100 + }, + { + "epoch": 0.24988943317475612, + "grad_norm": 0.20005236566066742, + "learning_rate": 0.001, + "loss": 3.0797, + "num_input_tokens_seen": 9738649600, + "step": 37150 + }, + { + "epoch": 0.25022575811846376, + "grad_norm": 0.1864924281835556, + "learning_rate": 0.001, + "loss": 3.076, + "num_input_tokens_seen": 9751756800, + "step": 37200 + }, + { + "epoch": 0.25056208306217137, + "grad_norm": 0.19927112758159637, + "learning_rate": 0.001, + "loss": 3.083, + "num_input_tokens_seen": 9764864000, + "step": 37250 + }, + { + "epoch": 0.250898408005879, + "grad_norm": 0.18902507424354553, + "learning_rate": 0.001, + "loss": 3.0885, + "num_input_tokens_seen": 9777971200, + "step": 37300 + }, + { + "epoch": 0.2512347329495866, + "grad_norm": 0.19465987384319305, + "learning_rate": 0.001, + "loss": 3.0826, + "num_input_tokens_seen": 9791078400, + "step": 37350 + }, + { + "epoch": 0.2515710578932942, + "grad_norm": 0.2374599725008011, + "learning_rate": 0.001, + "loss": 3.0796, + "num_input_tokens_seen": 9804185600, + "step": 37400 + }, + { + "epoch": 0.2519073828370018, + "grad_norm": 0.201645627617836, + "learning_rate": 0.001, + "loss": 3.0856, + "num_input_tokens_seen": 9817292800, + "step": 37450 + }, + { + "epoch": 0.2522437077807094, + "grad_norm": 0.5505014061927795, + "learning_rate": 0.001, + "loss": 3.1105, + "num_input_tokens_seen": 9830400000, + "step": 37500 + }, + { + "epoch": 0.2522437077807094, + "eval_loss": 2.9955086708068848, + "eval_runtime": 51.2641, + "eval_samples_per_second": 97.534, + "eval_steps_per_second": 24.384, + "num_input_tokens_seen": 9830400000, + "step": 37500 + }, + { + "epoch": 0.25258003272441704, + "grad_norm": 0.5572711229324341, + "learning_rate": 0.001, + "loss": 3.0967, + "num_input_tokens_seen": 9843507200, + "step": 37550 + }, + { + "epoch": 0.25291635766812465, + "grad_norm": 0.25361862778663635, + "learning_rate": 0.001, + "loss": 3.1296, + "num_input_tokens_seen": 9856614400, + "step": 37600 + }, + { + "epoch": 0.25325268261183226, + "grad_norm": 0.24185167253017426, + "learning_rate": 0.001, + "loss": 3.1003, + "num_input_tokens_seen": 9869721600, + "step": 37650 + }, + { + "epoch": 0.25358900755553987, + "grad_norm": 0.2068016678094864, + "learning_rate": 0.001, + "loss": 3.088, + "num_input_tokens_seen": 9882828800, + "step": 37700 + }, + { + "epoch": 0.2539253324992475, + "grad_norm": 0.2029482126235962, + "learning_rate": 0.001, + "loss": 3.0878, + "num_input_tokens_seen": 9895936000, + "step": 37750 + }, + { + "epoch": 0.2542616574429551, + "grad_norm": 0.22508949041366577, + "learning_rate": 0.001, + "loss": 3.0992, + "num_input_tokens_seen": 9909043200, + "step": 37800 + }, + { + "epoch": 0.2545979823866627, + "grad_norm": 0.19577881693840027, + "learning_rate": 0.001, + "loss": 3.0866, + "num_input_tokens_seen": 9922150400, + "step": 37850 + }, + { + "epoch": 0.2549343073303703, + "grad_norm": 0.815874457359314, + "learning_rate": 0.001, + "loss": 3.0858, + "num_input_tokens_seen": 9935257600, + "step": 37900 + }, + { + "epoch": 0.2552706322740779, + "grad_norm": 0.20485574007034302, + "learning_rate": 0.001, + "loss": 3.0943, + "num_input_tokens_seen": 9948364800, + "step": 37950 + }, + { + "epoch": 0.25560695721778554, + "grad_norm": 0.23158146440982819, + "learning_rate": 0.001, + "loss": 3.0926, + "num_input_tokens_seen": 9961472000, + "step": 38000 + }, + { + "epoch": 0.25560695721778554, + "eval_loss": 2.9854347705841064, + "eval_runtime": 51.4615, + "eval_samples_per_second": 97.16, + "eval_steps_per_second": 24.29, + "num_input_tokens_seen": 9961472000, + "step": 38000 + }, + { + "epoch": 0.25594328216149315, + "grad_norm": 0.32355332374572754, + "learning_rate": 0.001, + "loss": 3.0965, + "num_input_tokens_seen": 9974579200, + "step": 38050 + }, + { + "epoch": 0.25627960710520076, + "grad_norm": 0.20291878283023834, + "learning_rate": 0.001, + "loss": 3.0803, + "num_input_tokens_seen": 9987686400, + "step": 38100 + }, + { + "epoch": 0.25661593204890837, + "grad_norm": 0.17536096274852753, + "learning_rate": 0.001, + "loss": 3.0765, + "num_input_tokens_seen": 10000793600, + "step": 38150 + }, + { + "epoch": 0.256952256992616, + "grad_norm": 0.17826804518699646, + "learning_rate": 0.001, + "loss": 3.0735, + "num_input_tokens_seen": 10013900800, + "step": 38200 + }, + { + "epoch": 0.2572885819363236, + "grad_norm": 0.20115964114665985, + "learning_rate": 0.001, + "loss": 3.0813, + "num_input_tokens_seen": 10027008000, + "step": 38250 + }, + { + "epoch": 0.2576249068800312, + "grad_norm": 0.23634804785251617, + "learning_rate": 0.001, + "loss": 3.0821, + "num_input_tokens_seen": 10040115200, + "step": 38300 + }, + { + "epoch": 0.2579612318237388, + "grad_norm": 0.31893596053123474, + "learning_rate": 0.001, + "loss": 3.096, + "num_input_tokens_seen": 10053222400, + "step": 38350 + }, + { + "epoch": 0.2582975567674464, + "grad_norm": 0.21891412138938904, + "learning_rate": 0.001, + "loss": 3.0904, + "num_input_tokens_seen": 10066329600, + "step": 38400 + }, + { + "epoch": 0.25863388171115403, + "grad_norm": 0.21848681569099426, + "learning_rate": 0.001, + "loss": 3.0793, + "num_input_tokens_seen": 10079436800, + "step": 38450 + }, + { + "epoch": 0.25897020665486165, + "grad_norm": 0.2052360624074936, + "learning_rate": 0.001, + "loss": 3.0802, + "num_input_tokens_seen": 10092544000, + "step": 38500 + }, + { + "epoch": 0.25897020665486165, + "eval_loss": 2.9803249835968018, + "eval_runtime": 51.5379, + "eval_samples_per_second": 97.016, + "eval_steps_per_second": 24.254, + "num_input_tokens_seen": 10092544000, + "step": 38500 + }, + { + "epoch": 0.25930653159856926, + "grad_norm": 0.23162005841732025, + "learning_rate": 0.001, + "loss": 3.0876, + "num_input_tokens_seen": 10105651200, + "step": 38550 + }, + { + "epoch": 0.25964285654227687, + "grad_norm": 0.23110276460647583, + "learning_rate": 0.001, + "loss": 3.0745, + "num_input_tokens_seen": 10118758400, + "step": 38600 + }, + { + "epoch": 0.2599791814859845, + "grad_norm": 0.22557710111141205, + "learning_rate": 0.001, + "loss": 3.0716, + "num_input_tokens_seen": 10131865600, + "step": 38650 + }, + { + "epoch": 0.2603155064296921, + "grad_norm": 0.19009199738502502, + "learning_rate": 0.001, + "loss": 3.0765, + "num_input_tokens_seen": 10144972800, + "step": 38700 + }, + { + "epoch": 0.2606518313733997, + "grad_norm": 0.2352983057498932, + "learning_rate": 0.001, + "loss": 3.0834, + "num_input_tokens_seen": 10158080000, + "step": 38750 + }, + { + "epoch": 0.2609881563171073, + "grad_norm": 0.1986854076385498, + "learning_rate": 0.001, + "loss": 3.084, + "num_input_tokens_seen": 10171187200, + "step": 38800 + }, + { + "epoch": 0.2613244812608149, + "grad_norm": 1.6988344192504883, + "learning_rate": 0.001, + "loss": 3.0994, + "num_input_tokens_seen": 10184294400, + "step": 38850 + }, + { + "epoch": 0.2616608062045226, + "grad_norm": 0.2794356346130371, + "learning_rate": 0.001, + "loss": 3.1265, + "num_input_tokens_seen": 10197401600, + "step": 38900 + }, + { + "epoch": 0.2619971311482302, + "grad_norm": 0.2307969629764557, + "learning_rate": 0.001, + "loss": 3.1015, + "num_input_tokens_seen": 10210508800, + "step": 38950 + }, + { + "epoch": 0.2623334560919378, + "grad_norm": 0.29501858353614807, + "learning_rate": 0.001, + "loss": 3.0881, + "num_input_tokens_seen": 10223616000, + "step": 39000 + }, + { + "epoch": 0.2623334560919378, + "eval_loss": 2.9856879711151123, + "eval_runtime": 51.7163, + "eval_samples_per_second": 96.681, + "eval_steps_per_second": 24.17, + "num_input_tokens_seen": 10223616000, + "step": 39000 + }, + { + "epoch": 0.2626697810356454, + "grad_norm": 0.288418710231781, + "learning_rate": 0.001, + "loss": 3.0901, + "num_input_tokens_seen": 10236723200, + "step": 39050 + }, + { + "epoch": 0.26300610597935303, + "grad_norm": 0.2238176465034485, + "learning_rate": 0.001, + "loss": 3.0869, + "num_input_tokens_seen": 10249830400, + "step": 39100 + }, + { + "epoch": 0.26334243092306064, + "grad_norm": 0.21013687551021576, + "learning_rate": 0.001, + "loss": 3.0821, + "num_input_tokens_seen": 10262937600, + "step": 39150 + }, + { + "epoch": 0.26367875586676826, + "grad_norm": 0.20650598406791687, + "learning_rate": 0.001, + "loss": 3.0924, + "num_input_tokens_seen": 10276044800, + "step": 39200 + }, + { + "epoch": 0.26401508081047587, + "grad_norm": 1.1471627950668335, + "learning_rate": 0.001, + "loss": 3.0719, + "num_input_tokens_seen": 10289152000, + "step": 39250 + }, + { + "epoch": 0.2643514057541835, + "grad_norm": 0.2899995446205139, + "learning_rate": 0.001, + "loss": 3.0901, + "num_input_tokens_seen": 10302259200, + "step": 39300 + }, + { + "epoch": 0.2646877306978911, + "grad_norm": 0.25812703371047974, + "learning_rate": 0.001, + "loss": 3.088, + "num_input_tokens_seen": 10315366400, + "step": 39350 + }, + { + "epoch": 0.2650240556415987, + "grad_norm": 0.36547353863716125, + "learning_rate": 0.001, + "loss": 3.0782, + "num_input_tokens_seen": 10328473600, + "step": 39400 + }, + { + "epoch": 0.2653603805853063, + "grad_norm": 0.41187551617622375, + "learning_rate": 0.001, + "loss": 3.0991, + "num_input_tokens_seen": 10341580800, + "step": 39450 + }, + { + "epoch": 0.2656967055290139, + "grad_norm": 0.2279098927974701, + "learning_rate": 0.001, + "loss": 3.083, + "num_input_tokens_seen": 10354688000, + "step": 39500 + }, + { + "epoch": 0.2656967055290139, + "eval_loss": 2.9809067249298096, + "eval_runtime": 51.2151, + "eval_samples_per_second": 97.627, + "eval_steps_per_second": 24.407, + "num_input_tokens_seen": 10354688000, + "step": 39500 + }, + { + "epoch": 0.26603303047272153, + "grad_norm": 0.2444402426481247, + "learning_rate": 0.001, + "loss": 3.0887, + "num_input_tokens_seen": 10367795200, + "step": 39550 + }, + { + "epoch": 0.26636935541642914, + "grad_norm": 0.24601753056049347, + "learning_rate": 0.001, + "loss": 3.0802, + "num_input_tokens_seen": 10380902400, + "step": 39600 + }, + { + "epoch": 0.26670568036013675, + "grad_norm": 0.21487103402614594, + "learning_rate": 0.001, + "loss": 3.0777, + "num_input_tokens_seen": 10394009600, + "step": 39650 + }, + { + "epoch": 0.26704200530384437, + "grad_norm": 0.21092857420444489, + "learning_rate": 0.001, + "loss": 3.083, + "num_input_tokens_seen": 10407116800, + "step": 39700 + }, + { + "epoch": 0.267378330247552, + "grad_norm": 0.21346789598464966, + "learning_rate": 0.001, + "loss": 3.0946, + "num_input_tokens_seen": 10420224000, + "step": 39750 + }, + { + "epoch": 0.2677146551912596, + "grad_norm": 0.3449369966983795, + "learning_rate": 0.001, + "loss": 3.0827, + "num_input_tokens_seen": 10433331200, + "step": 39800 + }, + { + "epoch": 0.2680509801349672, + "grad_norm": 0.33765944838523865, + "learning_rate": 0.001, + "loss": 3.0886, + "num_input_tokens_seen": 10446438400, + "step": 39850 + }, + { + "epoch": 0.2683873050786748, + "grad_norm": 0.2361810952425003, + "learning_rate": 0.001, + "loss": 3.0902, + "num_input_tokens_seen": 10459545600, + "step": 39900 + }, + { + "epoch": 0.2687236300223824, + "grad_norm": 0.2562389373779297, + "learning_rate": 0.001, + "loss": 3.0958, + "num_input_tokens_seen": 10472652800, + "step": 39950 + }, + { + "epoch": 0.26905995496609003, + "grad_norm": 0.20619696378707886, + "learning_rate": 0.001, + "loss": 3.0904, + "num_input_tokens_seen": 10485760000, + "step": 40000 + }, + { + "epoch": 0.26905995496609003, + "eval_loss": 2.9785397052764893, + "eval_runtime": 51.4486, + "eval_samples_per_second": 97.184, + "eval_steps_per_second": 24.296, + "num_input_tokens_seen": 10485760000, + "step": 40000 + }, + { + "epoch": 0.26939627990979764, + "grad_norm": 0.775861382484436, + "learning_rate": 0.001, + "loss": 3.0856, + "num_input_tokens_seen": 10498867200, + "step": 40050 + }, + { + "epoch": 0.26973260485350525, + "grad_norm": 0.2422339916229248, + "learning_rate": 0.001, + "loss": 3.0831, + "num_input_tokens_seen": 10511974400, + "step": 40100 + }, + { + "epoch": 0.27006892979721286, + "grad_norm": 0.21582826972007751, + "learning_rate": 0.001, + "loss": 3.0735, + "num_input_tokens_seen": 10525081600, + "step": 40150 + }, + { + "epoch": 0.2704052547409205, + "grad_norm": 0.17632398009300232, + "learning_rate": 0.001, + "loss": 3.0878, + "num_input_tokens_seen": 10538188800, + "step": 40200 + }, + { + "epoch": 0.2707415796846281, + "grad_norm": 0.2152569591999054, + "learning_rate": 0.001, + "loss": 3.0766, + "num_input_tokens_seen": 10551296000, + "step": 40250 + }, + { + "epoch": 0.2710779046283357, + "grad_norm": 0.19657547771930695, + "learning_rate": 0.001, + "loss": 3.0674, + "num_input_tokens_seen": 10564403200, + "step": 40300 + }, + { + "epoch": 0.2714142295720433, + "grad_norm": 0.21468690037727356, + "learning_rate": 0.001, + "loss": 3.0746, + "num_input_tokens_seen": 10577510400, + "step": 40350 + }, + { + "epoch": 0.2717505545157509, + "grad_norm": 0.19693616032600403, + "learning_rate": 0.001, + "loss": 3.0795, + "num_input_tokens_seen": 10590617600, + "step": 40400 + }, + { + "epoch": 0.27208687945945853, + "grad_norm": 0.21241678297519684, + "learning_rate": 0.001, + "loss": 3.0843, + "num_input_tokens_seen": 10603724800, + "step": 40450 + }, + { + "epoch": 0.27242320440316614, + "grad_norm": 0.2053707093000412, + "learning_rate": 0.001, + "loss": 3.0857, + "num_input_tokens_seen": 10616832000, + "step": 40500 + }, + { + "epoch": 0.27242320440316614, + "eval_loss": 2.9741692543029785, + "eval_runtime": 51.4287, + "eval_samples_per_second": 97.222, + "eval_steps_per_second": 24.305, + "num_input_tokens_seen": 10616832000, + "step": 40500 + }, + { + "epoch": 0.27275952934687375, + "grad_norm": 0.18715134263038635, + "learning_rate": 0.001, + "loss": 3.0699, + "num_input_tokens_seen": 10629939200, + "step": 40550 + }, + { + "epoch": 0.27309585429058136, + "grad_norm": 0.20241226255893707, + "learning_rate": 0.001, + "loss": 3.0794, + "num_input_tokens_seen": 10643046400, + "step": 40600 + }, + { + "epoch": 0.273432179234289, + "grad_norm": 0.19430743157863617, + "learning_rate": 0.001, + "loss": 3.0768, + "num_input_tokens_seen": 10656153600, + "step": 40650 + }, + { + "epoch": 0.2737685041779966, + "grad_norm": 0.18806929886341095, + "learning_rate": 0.001, + "loss": 3.0778, + "num_input_tokens_seen": 10669260800, + "step": 40700 + }, + { + "epoch": 0.27410482912170425, + "grad_norm": 0.20148856937885284, + "learning_rate": 0.001, + "loss": 3.0768, + "num_input_tokens_seen": 10682368000, + "step": 40750 + }, + { + "epoch": 0.27444115406541186, + "grad_norm": 0.17226669192314148, + "learning_rate": 0.001, + "loss": 3.0621, + "num_input_tokens_seen": 10695475200, + "step": 40800 + }, + { + "epoch": 0.2747774790091195, + "grad_norm": 0.18090015649795532, + "learning_rate": 0.001, + "loss": 3.0693, + "num_input_tokens_seen": 10708582400, + "step": 40850 + }, + { + "epoch": 0.2751138039528271, + "grad_norm": 0.177758127450943, + "learning_rate": 0.001, + "loss": 3.0719, + "num_input_tokens_seen": 10721689600, + "step": 40900 + }, + { + "epoch": 0.2754501288965347, + "grad_norm": 0.18323731422424316, + "learning_rate": 0.001, + "loss": 3.0662, + "num_input_tokens_seen": 10734796800, + "step": 40950 + }, + { + "epoch": 0.2757864538402423, + "grad_norm": 0.1883731186389923, + "learning_rate": 0.001, + "loss": 3.0675, + "num_input_tokens_seen": 10747904000, + "step": 41000 + }, + { + "epoch": 0.2757864538402423, + "eval_loss": 2.9687845706939697, + "eval_runtime": 51.0712, + "eval_samples_per_second": 97.902, + "eval_steps_per_second": 24.476, + "num_input_tokens_seen": 10747904000, + "step": 41000 + }, + { + "epoch": 0.2761227787839499, + "grad_norm": 0.26252472400665283, + "learning_rate": 0.001, + "loss": 3.0668, + "num_input_tokens_seen": 10761011200, + "step": 41050 + }, + { + "epoch": 0.27645910372765753, + "grad_norm": 0.2921096086502075, + "learning_rate": 0.001, + "loss": 3.0817, + "num_input_tokens_seen": 10774118400, + "step": 41100 + }, + { + "epoch": 0.27679542867136514, + "grad_norm": 0.2732321619987488, + "learning_rate": 0.001, + "loss": 3.0918, + "num_input_tokens_seen": 10787225600, + "step": 41150 + }, + { + "epoch": 0.27713175361507275, + "grad_norm": 0.23650842905044556, + "learning_rate": 0.001, + "loss": 3.0767, + "num_input_tokens_seen": 10800332800, + "step": 41200 + }, + { + "epoch": 0.27746807855878036, + "grad_norm": 0.36582860350608826, + "learning_rate": 0.001, + "loss": 3.097, + "num_input_tokens_seen": 10813440000, + "step": 41250 + }, + { + "epoch": 0.277804403502488, + "grad_norm": 0.20505377650260925, + "learning_rate": 0.001, + "loss": 3.0776, + "num_input_tokens_seen": 10826547200, + "step": 41300 + }, + { + "epoch": 0.2781407284461956, + "grad_norm": 0.22332334518432617, + "learning_rate": 0.001, + "loss": 3.0822, + "num_input_tokens_seen": 10839654400, + "step": 41350 + }, + { + "epoch": 0.2784770533899032, + "grad_norm": 0.20054738223552704, + "learning_rate": 0.001, + "loss": 3.0756, + "num_input_tokens_seen": 10852761600, + "step": 41400 + }, + { + "epoch": 0.2788133783336108, + "grad_norm": 0.1935984194278717, + "learning_rate": 0.001, + "loss": 3.0772, + "num_input_tokens_seen": 10865868800, + "step": 41450 + }, + { + "epoch": 0.2791497032773184, + "grad_norm": 0.21076564490795135, + "learning_rate": 0.001, + "loss": 3.0733, + "num_input_tokens_seen": 10878976000, + "step": 41500 + }, + { + "epoch": 0.2791497032773184, + "eval_loss": 2.969437837600708, + "eval_runtime": 51.3634, + "eval_samples_per_second": 97.346, + "eval_steps_per_second": 24.336, + "num_input_tokens_seen": 10878976000, + "step": 41500 + }, + { + "epoch": 0.27948602822102603, + "grad_norm": 0.19656164944171906, + "learning_rate": 0.001, + "loss": 3.0705, + "num_input_tokens_seen": 10892083200, + "step": 41550 + }, + { + "epoch": 0.27982235316473364, + "grad_norm": 0.20582802593708038, + "learning_rate": 0.001, + "loss": 3.0847, + "num_input_tokens_seen": 10905190400, + "step": 41600 + }, + { + "epoch": 0.28015867810844125, + "grad_norm": 0.1748686581850052, + "learning_rate": 0.001, + "loss": 3.0612, + "num_input_tokens_seen": 10918297600, + "step": 41650 + }, + { + "epoch": 0.28049500305214886, + "grad_norm": 0.1908082365989685, + "learning_rate": 0.001, + "loss": 3.0673, + "num_input_tokens_seen": 10931404800, + "step": 41700 + }, + { + "epoch": 0.2808313279958565, + "grad_norm": 0.20851461589336395, + "learning_rate": 0.001, + "loss": 3.0724, + "num_input_tokens_seen": 10944512000, + "step": 41750 + }, + { + "epoch": 0.2811676529395641, + "grad_norm": 0.18114647269248962, + "learning_rate": 0.001, + "loss": 3.069, + "num_input_tokens_seen": 10957619200, + "step": 41800 + }, + { + "epoch": 0.2815039778832717, + "grad_norm": 0.19079278409481049, + "learning_rate": 0.001, + "loss": 3.0658, + "num_input_tokens_seen": 10970726400, + "step": 41850 + }, + { + "epoch": 0.2818403028269793, + "grad_norm": 0.16630959510803223, + "learning_rate": 0.001, + "loss": 3.0589, + "num_input_tokens_seen": 10983833600, + "step": 41900 + }, + { + "epoch": 0.2821766277706869, + "grad_norm": 0.1777983158826828, + "learning_rate": 0.001, + "loss": 3.0642, + "num_input_tokens_seen": 10996940800, + "step": 41950 + }, + { + "epoch": 0.2825129527143945, + "grad_norm": 0.36513733863830566, + "learning_rate": 0.001, + "loss": 3.0685, + "num_input_tokens_seen": 11010048000, + "step": 42000 + }, + { + "epoch": 0.2825129527143945, + "eval_loss": 2.9688751697540283, + "eval_runtime": 51.3924, + "eval_samples_per_second": 97.291, + "eval_steps_per_second": 24.323, + "num_input_tokens_seen": 11010048000, + "step": 42000 + }, + { + "epoch": 0.28284927765810214, + "grad_norm": 0.36728134751319885, + "learning_rate": 0.001, + "loss": 3.1317, + "num_input_tokens_seen": 11023155200, + "step": 42050 + }, + { + "epoch": 0.28318560260180975, + "grad_norm": 0.27042749524116516, + "learning_rate": 0.001, + "loss": 3.0815, + "num_input_tokens_seen": 11036262400, + "step": 42100 + }, + { + "epoch": 0.28352192754551736, + "grad_norm": 0.18322569131851196, + "learning_rate": 0.001, + "loss": 3.1027, + "num_input_tokens_seen": 11049369600, + "step": 42150 + }, + { + "epoch": 0.28385825248922497, + "grad_norm": 0.21849602460861206, + "learning_rate": 0.001, + "loss": 3.071, + "num_input_tokens_seen": 11062476800, + "step": 42200 + }, + { + "epoch": 0.2841945774329326, + "grad_norm": 0.38253432512283325, + "learning_rate": 0.001, + "loss": 3.0898, + "num_input_tokens_seen": 11075584000, + "step": 42250 + }, + { + "epoch": 0.2845309023766402, + "grad_norm": 0.21137705445289612, + "learning_rate": 0.001, + "loss": 3.0813, + "num_input_tokens_seen": 11088691200, + "step": 42300 + }, + { + "epoch": 0.2848672273203478, + "grad_norm": 1.2625150680541992, + "learning_rate": 0.001, + "loss": 3.0799, + "num_input_tokens_seen": 11101798400, + "step": 42350 + }, + { + "epoch": 0.2852035522640554, + "grad_norm": 0.2312895804643631, + "learning_rate": 0.001, + "loss": 3.0877, + "num_input_tokens_seen": 11114905600, + "step": 42400 + }, + { + "epoch": 0.285539877207763, + "grad_norm": 0.2823241353034973, + "learning_rate": 0.001, + "loss": 3.0733, + "num_input_tokens_seen": 11128012800, + "step": 42450 + }, + { + "epoch": 0.28587620215147064, + "grad_norm": 0.21215762197971344, + "learning_rate": 0.001, + "loss": 3.0798, + "num_input_tokens_seen": 11141120000, + "step": 42500 + }, + { + "epoch": 0.28587620215147064, + "eval_loss": 2.9728448390960693, + "eval_runtime": 51.3574, + "eval_samples_per_second": 97.357, + "eval_steps_per_second": 24.339, + "num_input_tokens_seen": 11141120000, + "step": 42500 + }, + { + "epoch": 0.2862125270951783, + "grad_norm": 0.216337651014328, + "learning_rate": 0.001, + "loss": 3.0786, + "num_input_tokens_seen": 11154227200, + "step": 42550 + }, + { + "epoch": 0.2865488520388859, + "grad_norm": 0.2741137444972992, + "learning_rate": 0.001, + "loss": 3.0814, + "num_input_tokens_seen": 11167334400, + "step": 42600 + }, + { + "epoch": 0.2868851769825935, + "grad_norm": 0.22545816004276276, + "learning_rate": 0.001, + "loss": 3.0808, + "num_input_tokens_seen": 11180441600, + "step": 42650 + }, + { + "epoch": 0.28722150192630114, + "grad_norm": 0.2085258960723877, + "learning_rate": 0.001, + "loss": 3.0696, + "num_input_tokens_seen": 11193548800, + "step": 42700 + }, + { + "epoch": 0.28755782687000875, + "grad_norm": 0.25214484333992004, + "learning_rate": 0.001, + "loss": 3.0787, + "num_input_tokens_seen": 11206656000, + "step": 42750 + }, + { + "epoch": 0.28789415181371636, + "grad_norm": 0.23247800767421722, + "learning_rate": 0.001, + "loss": 3.0718, + "num_input_tokens_seen": 11219763200, + "step": 42800 + }, + { + "epoch": 0.28823047675742397, + "grad_norm": 0.22879773378372192, + "learning_rate": 0.001, + "loss": 3.0709, + "num_input_tokens_seen": 11232870400, + "step": 42850 + }, + { + "epoch": 0.2885668017011316, + "grad_norm": 0.19782567024230957, + "learning_rate": 0.001, + "loss": 3.07, + "num_input_tokens_seen": 11245977600, + "step": 42900 + }, + { + "epoch": 0.2889031266448392, + "grad_norm": 0.2164357751607895, + "learning_rate": 0.001, + "loss": 3.0614, + "num_input_tokens_seen": 11259084800, + "step": 42950 + }, + { + "epoch": 0.2892394515885468, + "grad_norm": 0.2364019900560379, + "learning_rate": 0.001, + "loss": 3.071, + "num_input_tokens_seen": 11272192000, + "step": 43000 + }, + { + "epoch": 0.2892394515885468, + "eval_loss": 2.969633102416992, + "eval_runtime": 51.5112, + "eval_samples_per_second": 97.066, + "eval_steps_per_second": 24.267, + "num_input_tokens_seen": 11272192000, + "step": 43000 + }, + { + "epoch": 0.2895757765322544, + "grad_norm": 0.2350464016199112, + "learning_rate": 0.001, + "loss": 3.0636, + "num_input_tokens_seen": 11285299200, + "step": 43050 + }, + { + "epoch": 0.289912101475962, + "grad_norm": 0.2345239371061325, + "learning_rate": 0.001, + "loss": 3.0764, + "num_input_tokens_seen": 11298406400, + "step": 43100 + }, + { + "epoch": 0.29024842641966964, + "grad_norm": 0.2316889762878418, + "learning_rate": 0.001, + "loss": 3.0799, + "num_input_tokens_seen": 11311513600, + "step": 43150 + }, + { + "epoch": 0.29058475136337725, + "grad_norm": 0.17912918329238892, + "learning_rate": 0.001, + "loss": 3.0822, + "num_input_tokens_seen": 11324620800, + "step": 43200 + }, + { + "epoch": 0.29092107630708486, + "grad_norm": 0.18596945703029633, + "learning_rate": 0.001, + "loss": 3.0599, + "num_input_tokens_seen": 11337728000, + "step": 43250 + }, + { + "epoch": 0.29125740125079247, + "grad_norm": 0.20373962819576263, + "learning_rate": 0.001, + "loss": 3.0668, + "num_input_tokens_seen": 11350835200, + "step": 43300 + }, + { + "epoch": 0.2915937261945001, + "grad_norm": 0.17628611624240875, + "learning_rate": 0.001, + "loss": 3.0642, + "num_input_tokens_seen": 11363942400, + "step": 43350 + }, + { + "epoch": 0.2919300511382077, + "grad_norm": 0.17134816944599152, + "learning_rate": 0.001, + "loss": 3.0711, + "num_input_tokens_seen": 11377049600, + "step": 43400 + }, + { + "epoch": 0.2922663760819153, + "grad_norm": 0.18842625617980957, + "learning_rate": 0.001, + "loss": 3.0656, + "num_input_tokens_seen": 11390156800, + "step": 43450 + }, + { + "epoch": 0.2926027010256229, + "grad_norm": 0.20406392216682434, + "learning_rate": 0.001, + "loss": 3.0664, + "num_input_tokens_seen": 11403264000, + "step": 43500 + }, + { + "epoch": 0.2926027010256229, + "eval_loss": 2.967707872390747, + "eval_runtime": 50.7844, + "eval_samples_per_second": 98.455, + "eval_steps_per_second": 24.614, + "num_input_tokens_seen": 11403264000, + "step": 43500 + }, + { + "epoch": 0.2929390259693305, + "grad_norm": 0.1807146519422531, + "learning_rate": 0.001, + "loss": 3.0722, + "num_input_tokens_seen": 11416371200, + "step": 43550 + }, + { + "epoch": 0.29327535091303814, + "grad_norm": 0.18754515051841736, + "learning_rate": 0.001, + "loss": 3.0705, + "num_input_tokens_seen": 11429478400, + "step": 43600 + }, + { + "epoch": 0.29361167585674575, + "grad_norm": 0.21045394241809845, + "learning_rate": 0.001, + "loss": 3.0727, + "num_input_tokens_seen": 11442585600, + "step": 43650 + }, + { + "epoch": 0.29394800080045336, + "grad_norm": 0.2657870054244995, + "learning_rate": 0.001, + "loss": 3.0692, + "num_input_tokens_seen": 11455692800, + "step": 43700 + }, + { + "epoch": 0.29428432574416097, + "grad_norm": 0.18931645154953003, + "learning_rate": 0.001, + "loss": 3.0773, + "num_input_tokens_seen": 11468800000, + "step": 43750 + }, + { + "epoch": 0.2946206506878686, + "grad_norm": 0.18956629931926727, + "learning_rate": 0.001, + "loss": 3.0638, + "num_input_tokens_seen": 11481907200, + "step": 43800 + }, + { + "epoch": 0.2949569756315762, + "grad_norm": 0.18412917852401733, + "learning_rate": 0.001, + "loss": 3.0673, + "num_input_tokens_seen": 11495014400, + "step": 43850 + }, + { + "epoch": 0.2952933005752838, + "grad_norm": 0.18386723101139069, + "learning_rate": 0.001, + "loss": 3.0532, + "num_input_tokens_seen": 11508121600, + "step": 43900 + }, + { + "epoch": 0.2956296255189914, + "grad_norm": 1.7288094758987427, + "learning_rate": 0.001, + "loss": 3.0686, + "num_input_tokens_seen": 11521228800, + "step": 43950 + }, + { + "epoch": 0.295965950462699, + "grad_norm": 1.273023009300232, + "learning_rate": 0.001, + "loss": 3.0844, + "num_input_tokens_seen": 11534336000, + "step": 44000 + }, + { + "epoch": 0.295965950462699, + "eval_loss": 2.988032102584839, + "eval_runtime": 51.218, + "eval_samples_per_second": 97.622, + "eval_steps_per_second": 24.405, + "num_input_tokens_seen": 11534336000, + "step": 44000 + }, + { + "epoch": 0.29630227540640663, + "grad_norm": 0.22384904325008392, + "learning_rate": 0.001, + "loss": 3.0741, + "num_input_tokens_seen": 11547443200, + "step": 44050 + }, + { + "epoch": 0.29663860035011425, + "grad_norm": 0.2000838816165924, + "learning_rate": 0.001, + "loss": 3.0681, + "num_input_tokens_seen": 11560550400, + "step": 44100 + }, + { + "epoch": 0.29697492529382186, + "grad_norm": 0.5984578728675842, + "learning_rate": 0.001, + "loss": 3.0652, + "num_input_tokens_seen": 11573657600, + "step": 44150 + }, + { + "epoch": 0.29731125023752947, + "grad_norm": 0.20970633625984192, + "learning_rate": 0.001, + "loss": 3.0799, + "num_input_tokens_seen": 11586764800, + "step": 44200 + }, + { + "epoch": 0.2976475751812371, + "grad_norm": 0.18818268179893494, + "learning_rate": 0.001, + "loss": 3.0719, + "num_input_tokens_seen": 11599872000, + "step": 44250 + }, + { + "epoch": 0.2979839001249447, + "grad_norm": 0.2058591991662979, + "learning_rate": 0.001, + "loss": 3.074, + "num_input_tokens_seen": 11612979200, + "step": 44300 + }, + { + "epoch": 0.29832022506865236, + "grad_norm": 0.40354540944099426, + "learning_rate": 0.001, + "loss": 3.0809, + "num_input_tokens_seen": 11626086400, + "step": 44350 + }, + { + "epoch": 0.29865655001235997, + "grad_norm": 0.1923748105764389, + "learning_rate": 0.001, + "loss": 3.0769, + "num_input_tokens_seen": 11639193600, + "step": 44400 + }, + { + "epoch": 0.2989928749560676, + "grad_norm": 0.24034352600574493, + "learning_rate": 0.001, + "loss": 3.0648, + "num_input_tokens_seen": 11652300800, + "step": 44450 + }, + { + "epoch": 0.2993291998997752, + "grad_norm": 0.2124001830816269, + "learning_rate": 0.001, + "loss": 3.0591, + "num_input_tokens_seen": 11665408000, + "step": 44500 + }, + { + "epoch": 0.2993291998997752, + "eval_loss": 2.9622106552124023, + "eval_runtime": 51.3129, + "eval_samples_per_second": 97.441, + "eval_steps_per_second": 24.36, + "num_input_tokens_seen": 11665408000, + "step": 44500 + }, + { + "epoch": 0.2996655248434828, + "grad_norm": 0.2701764702796936, + "learning_rate": 0.001, + "loss": 3.0739, + "num_input_tokens_seen": 11678515200, + "step": 44550 + }, + { + "epoch": 0.3000018497871904, + "grad_norm": 0.20468254387378693, + "learning_rate": 0.001, + "loss": 3.0724, + "num_input_tokens_seen": 11691622400, + "step": 44600 + }, + { + "epoch": 0.300338174730898, + "grad_norm": 0.18791192770004272, + "learning_rate": 0.001, + "loss": 3.0705, + "num_input_tokens_seen": 11704729600, + "step": 44650 + }, + { + "epoch": 0.30067449967460563, + "grad_norm": 0.21384365856647491, + "learning_rate": 0.001, + "loss": 3.0719, + "num_input_tokens_seen": 11717836800, + "step": 44700 + }, + { + "epoch": 0.30101082461831324, + "grad_norm": 0.20965896546840668, + "learning_rate": 0.001, + "loss": 3.0575, + "num_input_tokens_seen": 11730944000, + "step": 44750 + }, + { + "epoch": 0.30134714956202086, + "grad_norm": 0.4128492772579193, + "learning_rate": 0.001, + "loss": 3.0948, + "num_input_tokens_seen": 11744051200, + "step": 44800 + }, + { + "epoch": 0.30168347450572847, + "grad_norm": 0.27056023478507996, + "learning_rate": 0.001, + "loss": 3.0807, + "num_input_tokens_seen": 11757158400, + "step": 44850 + }, + { + "epoch": 0.3020197994494361, + "grad_norm": 0.1866568773984909, + "learning_rate": 0.001, + "loss": 3.0737, + "num_input_tokens_seen": 11770265600, + "step": 44900 + }, + { + "epoch": 0.3023561243931437, + "grad_norm": 0.21440774202346802, + "learning_rate": 0.001, + "loss": 3.0804, + "num_input_tokens_seen": 11783372800, + "step": 44950 + }, + { + "epoch": 0.3026924493368513, + "grad_norm": 0.30685120820999146, + "learning_rate": 0.001, + "loss": 3.0603, + "num_input_tokens_seen": 11796480000, + "step": 45000 + }, + { + "epoch": 0.3026924493368513, + "eval_loss": 2.966905355453491, + "eval_runtime": 51.6659, + "eval_samples_per_second": 96.776, + "eval_steps_per_second": 24.194, + "num_input_tokens_seen": 11796480000, + "step": 45000 + }, + { + "epoch": 0.3030287742805589, + "grad_norm": 0.22337579727172852, + "learning_rate": 0.001, + "loss": 3.0839, + "num_input_tokens_seen": 11809587200, + "step": 45050 + }, + { + "epoch": 0.3033650992242665, + "grad_norm": 0.23798304796218872, + "learning_rate": 0.001, + "loss": 3.0729, + "num_input_tokens_seen": 11822694400, + "step": 45100 + }, + { + "epoch": 0.30370142416797413, + "grad_norm": 0.1956755667924881, + "learning_rate": 0.001, + "loss": 3.071, + "num_input_tokens_seen": 11835801600, + "step": 45150 + }, + { + "epoch": 0.30403774911168174, + "grad_norm": 0.20110267400741577, + "learning_rate": 0.001, + "loss": 3.0662, + "num_input_tokens_seen": 11848908800, + "step": 45200 + }, + { + "epoch": 0.30437407405538935, + "grad_norm": 0.20845186710357666, + "learning_rate": 0.001, + "loss": 3.0627, + "num_input_tokens_seen": 11862016000, + "step": 45250 + }, + { + "epoch": 0.30471039899909697, + "grad_norm": 0.17928333580493927, + "learning_rate": 0.001, + "loss": 3.0543, + "num_input_tokens_seen": 11875123200, + "step": 45300 + }, + { + "epoch": 0.3050467239428046, + "grad_norm": 0.17617329955101013, + "learning_rate": 0.001, + "loss": 3.0614, + "num_input_tokens_seen": 11888230400, + "step": 45350 + }, + { + "epoch": 0.3053830488865122, + "grad_norm": 0.2610914409160614, + "learning_rate": 0.001, + "loss": 3.069, + "num_input_tokens_seen": 11901337600, + "step": 45400 + }, + { + "epoch": 0.3057193738302198, + "grad_norm": 0.3862092196941376, + "learning_rate": 0.001, + "loss": 3.0944, + "num_input_tokens_seen": 11914444800, + "step": 45450 + }, + { + "epoch": 0.3060556987739274, + "grad_norm": 0.40467047691345215, + "learning_rate": 0.001, + "loss": 3.0714, + "num_input_tokens_seen": 11927552000, + "step": 45500 + }, + { + "epoch": 0.3060556987739274, + "eval_loss": 2.965502977371216, + "eval_runtime": 51.4918, + "eval_samples_per_second": 97.103, + "eval_steps_per_second": 24.276, + "num_input_tokens_seen": 11927552000, + "step": 45500 + }, + { + "epoch": 0.306392023717635, + "grad_norm": 0.19686584174633026, + "learning_rate": 0.001, + "loss": 3.0673, + "num_input_tokens_seen": 11940659200, + "step": 45550 + }, + { + "epoch": 0.30672834866134263, + "grad_norm": 0.24866420030593872, + "learning_rate": 0.001, + "loss": 3.0773, + "num_input_tokens_seen": 11953766400, + "step": 45600 + }, + { + "epoch": 0.30706467360505024, + "grad_norm": 0.208501935005188, + "learning_rate": 0.001, + "loss": 3.0751, + "num_input_tokens_seen": 11966873600, + "step": 45650 + }, + { + "epoch": 0.30740099854875785, + "grad_norm": 0.20565010607242584, + "learning_rate": 0.001, + "loss": 3.0709, + "num_input_tokens_seen": 11979980800, + "step": 45700 + }, + { + "epoch": 0.30773732349246546, + "grad_norm": 0.21517585217952728, + "learning_rate": 0.001, + "loss": 3.0765, + "num_input_tokens_seen": 11993088000, + "step": 45750 + }, + { + "epoch": 0.3080736484361731, + "grad_norm": 0.2618497610092163, + "learning_rate": 0.001, + "loss": 3.0549, + "num_input_tokens_seen": 12006195200, + "step": 45800 + }, + { + "epoch": 0.3084099733798807, + "grad_norm": 0.24163542687892914, + "learning_rate": 0.001, + "loss": 3.0704, + "num_input_tokens_seen": 12019302400, + "step": 45850 + }, + { + "epoch": 0.3087462983235883, + "grad_norm": 0.2038656324148178, + "learning_rate": 0.001, + "loss": 3.0631, + "num_input_tokens_seen": 12032409600, + "step": 45900 + }, + { + "epoch": 0.3090826232672959, + "grad_norm": 0.20691357553005219, + "learning_rate": 0.001, + "loss": 3.0723, + "num_input_tokens_seen": 12045516800, + "step": 45950 + }, + { + "epoch": 0.3094189482110035, + "grad_norm": 0.18757498264312744, + "learning_rate": 0.001, + "loss": 3.0602, + "num_input_tokens_seen": 12058624000, + "step": 46000 + }, + { + "epoch": 0.3094189482110035, + "eval_loss": 2.9599878787994385, + "eval_runtime": 53.3376, + "eval_samples_per_second": 93.742, + "eval_steps_per_second": 23.436, + "num_input_tokens_seen": 12058624000, + "step": 46000 + }, + { + "epoch": 0.30975527315471113, + "grad_norm": 0.1939045637845993, + "learning_rate": 0.001, + "loss": 3.0701, + "num_input_tokens_seen": 12071731200, + "step": 46050 + }, + { + "epoch": 0.31009159809841874, + "grad_norm": 0.22705882787704468, + "learning_rate": 0.001, + "loss": 3.0574, + "num_input_tokens_seen": 12084838400, + "step": 46100 + }, + { + "epoch": 0.3104279230421264, + "grad_norm": 0.2075151801109314, + "learning_rate": 0.001, + "loss": 3.067, + "num_input_tokens_seen": 12097945600, + "step": 46150 + }, + { + "epoch": 0.310764247985834, + "grad_norm": 0.4686187207698822, + "learning_rate": 0.001, + "loss": 3.0656, + "num_input_tokens_seen": 12111052800, + "step": 46200 + }, + { + "epoch": 0.31110057292954163, + "grad_norm": 0.1929931640625, + "learning_rate": 0.001, + "loss": 3.0631, + "num_input_tokens_seen": 12124160000, + "step": 46250 + }, + { + "epoch": 0.31143689787324924, + "grad_norm": 0.18403789401054382, + "learning_rate": 0.001, + "loss": 3.0579, + "num_input_tokens_seen": 12137267200, + "step": 46300 + }, + { + "epoch": 0.31177322281695685, + "grad_norm": 0.18552987277507782, + "learning_rate": 0.001, + "loss": 3.0625, + "num_input_tokens_seen": 12150374400, + "step": 46350 + }, + { + "epoch": 0.31210954776066446, + "grad_norm": 0.24002918601036072, + "learning_rate": 0.001, + "loss": 3.0629, + "num_input_tokens_seen": 12163481600, + "step": 46400 + }, + { + "epoch": 0.3124458727043721, + "grad_norm": 0.17444545030593872, + "learning_rate": 0.001, + "loss": 3.055, + "num_input_tokens_seen": 12176588800, + "step": 46450 + }, + { + "epoch": 0.3127821976480797, + "grad_norm": 0.1911567747592926, + "learning_rate": 0.001, + "loss": 3.067, + "num_input_tokens_seen": 12189696000, + "step": 46500 + }, + { + "epoch": 0.3127821976480797, + "eval_loss": 2.9570987224578857, + "eval_runtime": 52.5377, + "eval_samples_per_second": 95.17, + "eval_steps_per_second": 23.792, + "num_input_tokens_seen": 12189696000, + "step": 46500 + }, + { + "epoch": 0.3131185225917873, + "grad_norm": 0.18903926014900208, + "learning_rate": 0.001, + "loss": 3.0499, + "num_input_tokens_seen": 12202803200, + "step": 46550 + }, + { + "epoch": 0.3134548475354949, + "grad_norm": 0.35490429401397705, + "learning_rate": 0.001, + "loss": 3.0573, + "num_input_tokens_seen": 12215910400, + "step": 46600 + }, + { + "epoch": 0.3137911724792025, + "grad_norm": 0.2066306322813034, + "learning_rate": 0.001, + "loss": 3.0613, + "num_input_tokens_seen": 12229017600, + "step": 46650 + }, + { + "epoch": 0.31412749742291013, + "grad_norm": 0.3016819357872009, + "learning_rate": 0.001, + "loss": 3.068, + "num_input_tokens_seen": 12242124800, + "step": 46700 + }, + { + "epoch": 0.31446382236661774, + "grad_norm": 0.22070977091789246, + "learning_rate": 0.001, + "loss": 3.0929, + "num_input_tokens_seen": 12255232000, + "step": 46750 + }, + { + "epoch": 0.31480014731032535, + "grad_norm": 0.21311117708683014, + "learning_rate": 0.001, + "loss": 3.0739, + "num_input_tokens_seen": 12268339200, + "step": 46800 + }, + { + "epoch": 0.31513647225403296, + "grad_norm": 0.22895431518554688, + "learning_rate": 0.001, + "loss": 3.0726, + "num_input_tokens_seen": 12281446400, + "step": 46850 + }, + { + "epoch": 0.3154727971977406, + "grad_norm": 0.304040789604187, + "learning_rate": 0.001, + "loss": 3.0561, + "num_input_tokens_seen": 12294553600, + "step": 46900 + }, + { + "epoch": 0.3158091221414482, + "grad_norm": 0.18291215598583221, + "learning_rate": 0.001, + "loss": 3.0671, + "num_input_tokens_seen": 12307660800, + "step": 46950 + }, + { + "epoch": 0.3161454470851558, + "grad_norm": 0.19144318997859955, + "learning_rate": 0.001, + "loss": 3.0676, + "num_input_tokens_seen": 12320768000, + "step": 47000 + }, + { + "epoch": 0.3161454470851558, + "eval_loss": 2.9561285972595215, + "eval_runtime": 53.2725, + "eval_samples_per_second": 93.857, + "eval_steps_per_second": 23.464, + "num_input_tokens_seen": 12320768000, + "step": 47000 + }, + { + "epoch": 0.3164817720288634, + "grad_norm": 0.1988057643175125, + "learning_rate": 0.001, + "loss": 3.0652, + "num_input_tokens_seen": 12333875200, + "step": 47050 + }, + { + "epoch": 0.316818096972571, + "grad_norm": 0.19520634412765503, + "learning_rate": 0.001, + "loss": 3.0641, + "num_input_tokens_seen": 12346982400, + "step": 47100 + }, + { + "epoch": 0.31715442191627863, + "grad_norm": 0.21420574188232422, + "learning_rate": 0.001, + "loss": 3.0665, + "num_input_tokens_seen": 12360089600, + "step": 47150 + }, + { + "epoch": 0.31749074685998624, + "grad_norm": 0.18173083662986755, + "learning_rate": 0.001, + "loss": 3.0565, + "num_input_tokens_seen": 12373196800, + "step": 47200 + }, + { + "epoch": 0.31782707180369385, + "grad_norm": 0.1746867150068283, + "learning_rate": 0.001, + "loss": 3.0739, + "num_input_tokens_seen": 12386304000, + "step": 47250 + }, + { + "epoch": 0.31816339674740146, + "grad_norm": 0.16941632330417633, + "learning_rate": 0.001, + "loss": 3.0514, + "num_input_tokens_seen": 12399411200, + "step": 47300 + }, + { + "epoch": 0.3184997216911091, + "grad_norm": 0.19572339951992035, + "learning_rate": 0.001, + "loss": 3.0512, + "num_input_tokens_seen": 12412518400, + "step": 47350 + }, + { + "epoch": 0.3188360466348167, + "grad_norm": 0.19083815813064575, + "learning_rate": 0.001, + "loss": 3.0482, + "num_input_tokens_seen": 12425625600, + "step": 47400 + }, + { + "epoch": 0.3191723715785243, + "grad_norm": 0.1741664558649063, + "learning_rate": 0.001, + "loss": 3.0544, + "num_input_tokens_seen": 12438732800, + "step": 47450 + }, + { + "epoch": 0.3195086965222319, + "grad_norm": 0.1787065714597702, + "learning_rate": 0.001, + "loss": 3.0544, + "num_input_tokens_seen": 12451840000, + "step": 47500 + }, + { + "epoch": 0.3195086965222319, + "eval_loss": 2.953366756439209, + "eval_runtime": 53.4179, + "eval_samples_per_second": 93.602, + "eval_steps_per_second": 23.4, + "num_input_tokens_seen": 12451840000, + "step": 47500 + }, + { + "epoch": 0.3198450214659395, + "grad_norm": 0.1822129189968109, + "learning_rate": 0.001, + "loss": 3.0606, + "num_input_tokens_seen": 12464947200, + "step": 47550 + }, + { + "epoch": 0.3201813464096471, + "grad_norm": 0.20426377654075623, + "learning_rate": 0.001, + "loss": 3.0586, + "num_input_tokens_seen": 12478054400, + "step": 47600 + }, + { + "epoch": 0.32051767135335474, + "grad_norm": 0.2057754248380661, + "learning_rate": 0.001, + "loss": 3.0604, + "num_input_tokens_seen": 12491161600, + "step": 47650 + }, + { + "epoch": 0.32085399629706235, + "grad_norm": 0.19302618503570557, + "learning_rate": 0.001, + "loss": 3.0578, + "num_input_tokens_seen": 12504268800, + "step": 47700 + }, + { + "epoch": 0.32119032124076996, + "grad_norm": 0.4289242625236511, + "learning_rate": 0.001, + "loss": 3.0563, + "num_input_tokens_seen": 12517376000, + "step": 47750 + }, + { + "epoch": 0.32152664618447757, + "grad_norm": 0.6544061899185181, + "learning_rate": 0.001, + "loss": 3.072, + "num_input_tokens_seen": 12530483200, + "step": 47800 + }, + { + "epoch": 0.3218629711281852, + "grad_norm": 0.27349674701690674, + "learning_rate": 0.001, + "loss": 3.077, + "num_input_tokens_seen": 12543590400, + "step": 47850 + }, + { + "epoch": 0.3221992960718928, + "grad_norm": 0.21640093624591827, + "learning_rate": 0.001, + "loss": 3.0665, + "num_input_tokens_seen": 12556697600, + "step": 47900 + }, + { + "epoch": 0.32253562101560046, + "grad_norm": 0.19193384051322937, + "learning_rate": 0.001, + "loss": 3.0542, + "num_input_tokens_seen": 12569804800, + "step": 47950 + }, + { + "epoch": 0.32287194595930807, + "grad_norm": 0.21732182800769806, + "learning_rate": 0.001, + "loss": 3.0489, + "num_input_tokens_seen": 12582912000, + "step": 48000 + }, + { + "epoch": 0.32287194595930807, + "eval_loss": 2.95477294921875, + "eval_runtime": 53.2298, + "eval_samples_per_second": 93.932, + "eval_steps_per_second": 23.483, + "num_input_tokens_seen": 12582912000, + "step": 48000 + }, + { + "epoch": 0.3232082709030157, + "grad_norm": 0.1941204071044922, + "learning_rate": 0.001, + "loss": 3.0582, + "num_input_tokens_seen": 12596019200, + "step": 48050 + }, + { + "epoch": 0.3235445958467233, + "grad_norm": 4.390545845031738, + "learning_rate": 0.001, + "loss": 3.0713, + "num_input_tokens_seen": 12609126400, + "step": 48100 + }, + { + "epoch": 0.3238809207904309, + "grad_norm": 0.2377273291349411, + "learning_rate": 0.001, + "loss": 3.0792, + "num_input_tokens_seen": 12622233600, + "step": 48150 + }, + { + "epoch": 0.3242172457341385, + "grad_norm": 0.20397226512432098, + "learning_rate": 0.001, + "loss": 3.0718, + "num_input_tokens_seen": 12635340800, + "step": 48200 + }, + { + "epoch": 0.3245535706778461, + "grad_norm": 0.21039831638336182, + "learning_rate": 0.001, + "loss": 3.0669, + "num_input_tokens_seen": 12648448000, + "step": 48250 + }, + { + "epoch": 0.32488989562155374, + "grad_norm": 0.18443848192691803, + "learning_rate": 0.001, + "loss": 3.0723, + "num_input_tokens_seen": 12661555200, + "step": 48300 + }, + { + "epoch": 0.32522622056526135, + "grad_norm": 0.1816088706254959, + "learning_rate": 0.001, + "loss": 3.0516, + "num_input_tokens_seen": 12674662400, + "step": 48350 + }, + { + "epoch": 0.32556254550896896, + "grad_norm": 0.17938339710235596, + "learning_rate": 0.001, + "loss": 3.0567, + "num_input_tokens_seen": 12687769600, + "step": 48400 + }, + { + "epoch": 0.32589887045267657, + "grad_norm": 0.2365075796842575, + "learning_rate": 0.001, + "loss": 3.0673, + "num_input_tokens_seen": 12700876800, + "step": 48450 + }, + { + "epoch": 0.3262351953963842, + "grad_norm": 0.24168556928634644, + "learning_rate": 0.001, + "loss": 3.072, + "num_input_tokens_seen": 12713984000, + "step": 48500 + }, + { + "epoch": 0.3262351953963842, + "eval_loss": 2.967775583267212, + "eval_runtime": 53.2002, + "eval_samples_per_second": 93.985, + "eval_steps_per_second": 23.496, + "num_input_tokens_seen": 12713984000, + "step": 48500 + }, + { + "epoch": 0.3265715203400918, + "grad_norm": 0.3560108542442322, + "learning_rate": 0.001, + "loss": 3.0714, + "num_input_tokens_seen": 12727091200, + "step": 48550 + }, + { + "epoch": 0.3269078452837994, + "grad_norm": 2.40458345413208, + "learning_rate": 0.001, + "loss": 3.0729, + "num_input_tokens_seen": 12740198400, + "step": 48600 + }, + { + "epoch": 0.327244170227507, + "grad_norm": 0.3201708495616913, + "learning_rate": 0.001, + "loss": 3.0719, + "num_input_tokens_seen": 12753305600, + "step": 48650 + }, + { + "epoch": 0.3275804951712146, + "grad_norm": 0.2094539850950241, + "learning_rate": 0.001, + "loss": 3.0559, + "num_input_tokens_seen": 12766412800, + "step": 48700 + }, + { + "epoch": 0.32791682011492224, + "grad_norm": 0.2323814332485199, + "learning_rate": 0.001, + "loss": 3.0652, + "num_input_tokens_seen": 12779520000, + "step": 48750 + }, + { + "epoch": 0.32825314505862985, + "grad_norm": 0.20684729516506195, + "learning_rate": 0.001, + "loss": 3.0631, + "num_input_tokens_seen": 12792627200, + "step": 48800 + }, + { + "epoch": 0.32858947000233746, + "grad_norm": 0.19242416322231293, + "learning_rate": 0.001, + "loss": 3.0578, + "num_input_tokens_seen": 12805734400, + "step": 48850 + }, + { + "epoch": 0.32892579494604507, + "grad_norm": 0.1994556188583374, + "learning_rate": 0.001, + "loss": 3.0615, + "num_input_tokens_seen": 12818841600, + "step": 48900 + }, + { + "epoch": 0.3292621198897527, + "grad_norm": 0.19869546592235565, + "learning_rate": 0.001, + "loss": 3.0647, + "num_input_tokens_seen": 12831948800, + "step": 48950 + }, + { + "epoch": 0.3295984448334603, + "grad_norm": 0.21512825787067413, + "learning_rate": 0.001, + "loss": 3.0473, + "num_input_tokens_seen": 12845056000, + "step": 49000 + }, + { + "epoch": 0.3295984448334603, + "eval_loss": 2.952073335647583, + "eval_runtime": 52.8116, + "eval_samples_per_second": 94.676, + "eval_steps_per_second": 23.669, + "num_input_tokens_seen": 12845056000, + "step": 49000 + }, + { + "epoch": 0.3299347697771679, + "grad_norm": 0.22994808852672577, + "learning_rate": 0.001, + "loss": 3.0594, + "num_input_tokens_seen": 12858163200, + "step": 49050 + }, + { + "epoch": 0.3302710947208755, + "grad_norm": 0.45408371090888977, + "learning_rate": 0.001, + "loss": 3.0777, + "num_input_tokens_seen": 12871270400, + "step": 49100 + }, + { + "epoch": 0.3306074196645831, + "grad_norm": 0.2698614001274109, + "learning_rate": 0.001, + "loss": 3.0689, + "num_input_tokens_seen": 12884377600, + "step": 49150 + }, + { + "epoch": 0.33094374460829074, + "grad_norm": 0.22741588950157166, + "learning_rate": 0.001, + "loss": 3.0746, + "num_input_tokens_seen": 12897484800, + "step": 49200 + }, + { + "epoch": 0.33128006955199835, + "grad_norm": 0.3616434335708618, + "learning_rate": 0.001, + "loss": 3.0813, + "num_input_tokens_seen": 12910592000, + "step": 49250 + }, + { + "epoch": 0.33161639449570596, + "grad_norm": 0.2551349401473999, + "learning_rate": 0.001, + "loss": 3.0749, + "num_input_tokens_seen": 12923699200, + "step": 49300 + }, + { + "epoch": 0.33195271943941357, + "grad_norm": 0.24054627120494843, + "learning_rate": 0.001, + "loss": 3.0664, + "num_input_tokens_seen": 12936806400, + "step": 49350 + }, + { + "epoch": 0.3322890443831212, + "grad_norm": 0.25859707593917847, + "learning_rate": 0.001, + "loss": 3.0673, + "num_input_tokens_seen": 12949913600, + "step": 49400 + }, + { + "epoch": 0.3326253693268288, + "grad_norm": 0.2500990629196167, + "learning_rate": 0.001, + "loss": 3.0575, + "num_input_tokens_seen": 12963020800, + "step": 49450 + }, + { + "epoch": 0.3329616942705364, + "grad_norm": 1.1027246713638306, + "learning_rate": 0.001, + "loss": 3.0573, + "num_input_tokens_seen": 12976128000, + "step": 49500 + }, + { + "epoch": 0.3329616942705364, + "eval_loss": 2.976292133331299, + "eval_runtime": 53.0133, + "eval_samples_per_second": 94.316, + "eval_steps_per_second": 23.579, + "num_input_tokens_seen": 12976128000, + "step": 49500 + }, + { + "epoch": 0.333298019214244, + "grad_norm": 0.3605392873287201, + "learning_rate": 0.001, + "loss": 3.0741, + "num_input_tokens_seen": 12989235200, + "step": 49550 + }, + { + "epoch": 0.3336343441579516, + "grad_norm": 0.3265780210494995, + "learning_rate": 0.001, + "loss": 3.0707, + "num_input_tokens_seen": 13002342400, + "step": 49600 + }, + { + "epoch": 0.33397066910165923, + "grad_norm": 0.4178712069988251, + "learning_rate": 0.001, + "loss": 3.0681, + "num_input_tokens_seen": 13015449600, + "step": 49650 + }, + { + "epoch": 0.33430699404536685, + "grad_norm": 0.2647295892238617, + "learning_rate": 0.001, + "loss": 3.0684, + "num_input_tokens_seen": 13028556800, + "step": 49700 + }, + { + "epoch": 0.3346433189890745, + "grad_norm": 0.20664212107658386, + "learning_rate": 0.001, + "loss": 3.0649, + "num_input_tokens_seen": 13041664000, + "step": 49750 + }, + { + "epoch": 0.3349796439327821, + "grad_norm": 0.45491111278533936, + "learning_rate": 0.001, + "loss": 3.0667, + "num_input_tokens_seen": 13054771200, + "step": 49800 + }, + { + "epoch": 0.33531596887648973, + "grad_norm": 0.27275514602661133, + "learning_rate": 0.001, + "loss": 3.0717, + "num_input_tokens_seen": 13067878400, + "step": 49850 + }, + { + "epoch": 0.33565229382019734, + "grad_norm": 0.24294881522655487, + "learning_rate": 0.001, + "loss": 3.0717, + "num_input_tokens_seen": 13080985600, + "step": 49900 + }, + { + "epoch": 0.33598861876390496, + "grad_norm": 0.2790290415287018, + "learning_rate": 0.001, + "loss": 3.0687, + "num_input_tokens_seen": 13094092800, + "step": 49950 + }, + { + "epoch": 0.33632494370761257, + "grad_norm": 0.32556888461112976, + "learning_rate": 0.001, + "loss": 3.0805, + "num_input_tokens_seen": 13107200000, + "step": 50000 + }, + { + "epoch": 0.33632494370761257, + "eval_loss": 2.958136796951294, + "eval_runtime": 53.5587, + "eval_samples_per_second": 93.355, + "eval_steps_per_second": 23.339, + "num_input_tokens_seen": 13107200000, + "step": 50000 + }, + { + "epoch": 0.3366612686513202, + "grad_norm": 0.2707626223564148, + "learning_rate": 0.001, + "loss": 3.0719, + "num_input_tokens_seen": 13120307200, + "step": 50050 + }, + { + "epoch": 0.3369975935950278, + "grad_norm": 0.28657791018486023, + "learning_rate": 0.001, + "loss": 3.0713, + "num_input_tokens_seen": 13133414400, + "step": 50100 + }, + { + "epoch": 0.3373339185387354, + "grad_norm": 0.22508807480335236, + "learning_rate": 0.001, + "loss": 3.0626, + "num_input_tokens_seen": 13146521600, + "step": 50150 + }, + { + "epoch": 0.337670243482443, + "grad_norm": 0.3013211786746979, + "learning_rate": 0.001, + "loss": 3.0871, + "num_input_tokens_seen": 13159628800, + "step": 50200 + }, + { + "epoch": 0.3380065684261506, + "grad_norm": 0.4010023772716522, + "learning_rate": 0.001, + "loss": 3.0624, + "num_input_tokens_seen": 13172736000, + "step": 50250 + }, + { + "epoch": 0.33834289336985823, + "grad_norm": 0.23215509951114655, + "learning_rate": 0.001, + "loss": 3.058, + "num_input_tokens_seen": 13185843200, + "step": 50300 + }, + { + "epoch": 0.33867921831356584, + "grad_norm": 0.3135644495487213, + "learning_rate": 0.001, + "loss": 3.0668, + "num_input_tokens_seen": 13198950400, + "step": 50350 + }, + { + "epoch": 0.33901554325727346, + "grad_norm": 0.8496716618537903, + "learning_rate": 0.001, + "loss": 3.0558, + "num_input_tokens_seen": 13212057600, + "step": 50400 + }, + { + "epoch": 0.33935186820098107, + "grad_norm": 0.2706848084926605, + "learning_rate": 0.001, + "loss": 3.0656, + "num_input_tokens_seen": 13225164800, + "step": 50450 + }, + { + "epoch": 0.3396881931446887, + "grad_norm": 0.24779066443443298, + "learning_rate": 0.001, + "loss": 3.073, + "num_input_tokens_seen": 13238272000, + "step": 50500 + }, + { + "epoch": 0.3396881931446887, + "eval_loss": 2.9553372859954834, + "eval_runtime": 53.3111, + "eval_samples_per_second": 93.789, + "eval_steps_per_second": 23.447, + "num_input_tokens_seen": 13238272000, + "step": 50500 + }, + { + "epoch": 0.3400245180883963, + "grad_norm": 0.2277699112892151, + "learning_rate": 0.001, + "loss": 3.0516, + "num_input_tokens_seen": 13251379200, + "step": 50550 + }, + { + "epoch": 0.3403608430321039, + "grad_norm": 0.2331630438566208, + "learning_rate": 0.001, + "loss": 3.0685, + "num_input_tokens_seen": 13264486400, + "step": 50600 + }, + { + "epoch": 0.3406971679758115, + "grad_norm": 0.2572009563446045, + "learning_rate": 0.001, + "loss": 3.0628, + "num_input_tokens_seen": 13277593600, + "step": 50650 + }, + { + "epoch": 0.3410334929195191, + "grad_norm": 0.35386911034584045, + "learning_rate": 0.001, + "loss": 3.0554, + "num_input_tokens_seen": 13290700800, + "step": 50700 + }, + { + "epoch": 0.34136981786322673, + "grad_norm": 0.19923870265483856, + "learning_rate": 0.001, + "loss": 3.0579, + "num_input_tokens_seen": 13303808000, + "step": 50750 + }, + { + "epoch": 0.34170614280693434, + "grad_norm": 0.22291727364063263, + "learning_rate": 0.001, + "loss": 3.0653, + "num_input_tokens_seen": 13316915200, + "step": 50800 + }, + { + "epoch": 0.34204246775064195, + "grad_norm": 0.2049180120229721, + "learning_rate": 0.001, + "loss": 3.0511, + "num_input_tokens_seen": 13330022400, + "step": 50850 + }, + { + "epoch": 0.34237879269434957, + "grad_norm": 0.2586233615875244, + "learning_rate": 0.001, + "loss": 3.0589, + "num_input_tokens_seen": 13343129600, + "step": 50900 + }, + { + "epoch": 0.3427151176380572, + "grad_norm": 0.2137664556503296, + "learning_rate": 0.001, + "loss": 3.0522, + "num_input_tokens_seen": 13356236800, + "step": 50950 + }, + { + "epoch": 0.3430514425817648, + "grad_norm": 0.21726618707180023, + "learning_rate": 0.001, + "loss": 3.054, + "num_input_tokens_seen": 13369344000, + "step": 51000 + }, + { + "epoch": 0.3430514425817648, + "eval_loss": 2.948333740234375, + "eval_runtime": 53.0951, + "eval_samples_per_second": 94.171, + "eval_steps_per_second": 23.543, + "num_input_tokens_seen": 13369344000, + "step": 51000 + }, + { + "epoch": 0.3433877675254724, + "grad_norm": 0.19399498403072357, + "learning_rate": 0.001, + "loss": 3.0583, + "num_input_tokens_seen": 13382451200, + "step": 51050 + }, + { + "epoch": 0.34372409246918, + "grad_norm": 0.19893072545528412, + "learning_rate": 0.001, + "loss": 3.0505, + "num_input_tokens_seen": 13395558400, + "step": 51100 + }, + { + "epoch": 0.3440604174128876, + "grad_norm": 0.17791305482387543, + "learning_rate": 0.001, + "loss": 3.0504, + "num_input_tokens_seen": 13408665600, + "step": 51150 + }, + { + "epoch": 0.34439674235659523, + "grad_norm": 0.7631425261497498, + "learning_rate": 0.001, + "loss": 3.0483, + "num_input_tokens_seen": 13421772800, + "step": 51200 + }, + { + "epoch": 0.34473306730030284, + "grad_norm": 0.22620978951454163, + "learning_rate": 0.001, + "loss": 3.0512, + "num_input_tokens_seen": 13434880000, + "step": 51250 + }, + { + "epoch": 0.34506939224401045, + "grad_norm": 0.219919815659523, + "learning_rate": 0.001, + "loss": 3.0415, + "num_input_tokens_seen": 13447987200, + "step": 51300 + }, + { + "epoch": 0.34540571718771806, + "grad_norm": 0.21654649078845978, + "learning_rate": 0.001, + "loss": 3.062, + "num_input_tokens_seen": 13461094400, + "step": 51350 + }, + { + "epoch": 0.3457420421314257, + "grad_norm": 0.2439095377922058, + "learning_rate": 0.001, + "loss": 3.0478, + "num_input_tokens_seen": 13474201600, + "step": 51400 + }, + { + "epoch": 0.3460783670751333, + "grad_norm": 0.19535380601882935, + "learning_rate": 0.001, + "loss": 3.0444, + "num_input_tokens_seen": 13487308800, + "step": 51450 + }, + { + "epoch": 0.3464146920188409, + "grad_norm": 0.1964534968137741, + "learning_rate": 0.001, + "loss": 3.049, + "num_input_tokens_seen": 13500416000, + "step": 51500 + }, + { + "epoch": 0.3464146920188409, + "eval_loss": 2.945749044418335, + "eval_runtime": 53.0447, + "eval_samples_per_second": 94.26, + "eval_steps_per_second": 23.565, + "num_input_tokens_seen": 13500416000, + "step": 51500 + }, + { + "epoch": 0.3467510169625485, + "grad_norm": 0.2085062563419342, + "learning_rate": 0.001, + "loss": 3.0582, + "num_input_tokens_seen": 13513523200, + "step": 51550 + }, + { + "epoch": 0.3470873419062562, + "grad_norm": 0.1903097778558731, + "learning_rate": 0.001, + "loss": 3.0488, + "num_input_tokens_seen": 13526630400, + "step": 51600 + }, + { + "epoch": 0.3474236668499638, + "grad_norm": 0.20101405680179596, + "learning_rate": 0.001, + "loss": 3.0573, + "num_input_tokens_seen": 13539737600, + "step": 51650 + }, + { + "epoch": 0.3477599917936714, + "grad_norm": 0.6418889164924622, + "learning_rate": 0.001, + "loss": 3.0513, + "num_input_tokens_seen": 13552844800, + "step": 51700 + }, + { + "epoch": 0.348096316737379, + "grad_norm": 0.22524093091487885, + "learning_rate": 0.001, + "loss": 3.0567, + "num_input_tokens_seen": 13565952000, + "step": 51750 + }, + { + "epoch": 0.3484326416810866, + "grad_norm": 0.21830599009990692, + "learning_rate": 0.001, + "loss": 3.0538, + "num_input_tokens_seen": 13579059200, + "step": 51800 + }, + { + "epoch": 0.34876896662479423, + "grad_norm": 0.6111611127853394, + "learning_rate": 0.001, + "loss": 3.0581, + "num_input_tokens_seen": 13592166400, + "step": 51850 + }, + { + "epoch": 0.34910529156850184, + "grad_norm": 0.3782864511013031, + "learning_rate": 0.001, + "loss": 3.0694, + "num_input_tokens_seen": 13605273600, + "step": 51900 + }, + { + "epoch": 0.34944161651220945, + "grad_norm": 0.23944802582263947, + "learning_rate": 0.001, + "loss": 3.0683, + "num_input_tokens_seen": 13618380800, + "step": 51950 + }, + { + "epoch": 0.34977794145591706, + "grad_norm": 0.20257577300071716, + "learning_rate": 0.001, + "loss": 3.0509, + "num_input_tokens_seen": 13631488000, + "step": 52000 + }, + { + "epoch": 0.34977794145591706, + "eval_loss": 2.94769287109375, + "eval_runtime": 53.1351, + "eval_samples_per_second": 94.1, + "eval_steps_per_second": 23.525, + "num_input_tokens_seen": 13631488000, + "step": 52000 + }, + { + "epoch": 0.3501142663996247, + "grad_norm": 0.22132734954357147, + "learning_rate": 0.001, + "loss": 3.0564, + "num_input_tokens_seen": 13644595200, + "step": 52050 + }, + { + "epoch": 0.3504505913433323, + "grad_norm": 0.19554653763771057, + "learning_rate": 0.001, + "loss": 3.0457, + "num_input_tokens_seen": 13657702400, + "step": 52100 + }, + { + "epoch": 0.3507869162870399, + "grad_norm": 0.23935073614120483, + "learning_rate": 0.001, + "loss": 3.0465, + "num_input_tokens_seen": 13670809600, + "step": 52150 + }, + { + "epoch": 0.3511232412307475, + "grad_norm": 0.2895826995372772, + "learning_rate": 0.001, + "loss": 3.0509, + "num_input_tokens_seen": 13683916800, + "step": 52200 + }, + { + "epoch": 0.3514595661744551, + "grad_norm": 0.24599236249923706, + "learning_rate": 0.001, + "loss": 3.0385, + "num_input_tokens_seen": 13697024000, + "step": 52250 + }, + { + "epoch": 0.35179589111816273, + "grad_norm": 0.19500850141048431, + "learning_rate": 0.001, + "loss": 3.0523, + "num_input_tokens_seen": 13710131200, + "step": 52300 + }, + { + "epoch": 0.35213221606187034, + "grad_norm": 0.20790818333625793, + "learning_rate": 0.001, + "loss": 3.0547, + "num_input_tokens_seen": 13723238400, + "step": 52350 + }, + { + "epoch": 0.35246854100557795, + "grad_norm": 0.18653196096420288, + "learning_rate": 0.001, + "loss": 3.0545, + "num_input_tokens_seen": 13736345600, + "step": 52400 + }, + { + "epoch": 0.35280486594928556, + "grad_norm": 0.22097791731357574, + "learning_rate": 0.001, + "loss": 3.0573, + "num_input_tokens_seen": 13749452800, + "step": 52450 + }, + { + "epoch": 0.3531411908929932, + "grad_norm": 0.22931267321109772, + "learning_rate": 0.001, + "loss": 3.0478, + "num_input_tokens_seen": 13762560000, + "step": 52500 + }, + { + "epoch": 0.3531411908929932, + "eval_loss": 2.9459915161132812, + "eval_runtime": 52.6495, + "eval_samples_per_second": 94.968, + "eval_steps_per_second": 23.742, + "num_input_tokens_seen": 13762560000, + "step": 52500 + }, + { + "epoch": 0.3534775158367008, + "grad_norm": 0.31109049916267395, + "learning_rate": 0.001, + "loss": 3.0462, + "num_input_tokens_seen": 13775667200, + "step": 52550 + }, + { + "epoch": 0.3538138407804084, + "grad_norm": 1.7297276258468628, + "learning_rate": 0.001, + "loss": 3.0629, + "num_input_tokens_seen": 13788774400, + "step": 52600 + }, + { + "epoch": 0.354150165724116, + "grad_norm": 0.4056268334388733, + "learning_rate": 0.001, + "loss": 3.0763, + "num_input_tokens_seen": 13801881600, + "step": 52650 + }, + { + "epoch": 0.3544864906678236, + "grad_norm": 0.3694227635860443, + "learning_rate": 0.001, + "loss": 3.099, + "num_input_tokens_seen": 13814988800, + "step": 52700 + }, + { + "epoch": 0.35482281561153123, + "grad_norm": 0.2708556056022644, + "learning_rate": 0.001, + "loss": 3.0985, + "num_input_tokens_seen": 13828096000, + "step": 52750 + }, + { + "epoch": 0.35515914055523884, + "grad_norm": 0.27150145173072815, + "learning_rate": 0.001, + "loss": 3.0694, + "num_input_tokens_seen": 13841203200, + "step": 52800 + }, + { + "epoch": 0.35549546549894645, + "grad_norm": 0.2626855969429016, + "learning_rate": 0.001, + "loss": 3.0642, + "num_input_tokens_seen": 13854310400, + "step": 52850 + }, + { + "epoch": 0.35583179044265406, + "grad_norm": 0.20539118349552155, + "learning_rate": 0.001, + "loss": 3.059, + "num_input_tokens_seen": 13867417600, + "step": 52900 + }, + { + "epoch": 0.35616811538636167, + "grad_norm": 0.21489828824996948, + "learning_rate": 0.001, + "loss": 3.054, + "num_input_tokens_seen": 13880524800, + "step": 52950 + }, + { + "epoch": 0.3565044403300693, + "grad_norm": 0.263488233089447, + "learning_rate": 0.001, + "loss": 3.044, + "num_input_tokens_seen": 13893632000, + "step": 53000 + }, + { + "epoch": 0.3565044403300693, + "eval_loss": 2.9570043087005615, + "eval_runtime": 53.2194, + "eval_samples_per_second": 93.951, + "eval_steps_per_second": 23.488, + "num_input_tokens_seen": 13893632000, + "step": 53000 + }, + { + "epoch": 0.3568407652737769, + "grad_norm": 0.3147699236869812, + "learning_rate": 0.001, + "loss": 3.0557, + "num_input_tokens_seen": 13906739200, + "step": 53050 + }, + { + "epoch": 0.3571770902174845, + "grad_norm": 0.22110533714294434, + "learning_rate": 0.001, + "loss": 3.0515, + "num_input_tokens_seen": 13919846400, + "step": 53100 + }, + { + "epoch": 0.3575134151611921, + "grad_norm": 0.23334212601184845, + "learning_rate": 0.001, + "loss": 3.0523, + "num_input_tokens_seen": 13932953600, + "step": 53150 + }, + { + "epoch": 0.3578497401048997, + "grad_norm": 0.200640469789505, + "learning_rate": 0.001, + "loss": 3.0621, + "num_input_tokens_seen": 13946060800, + "step": 53200 + }, + { + "epoch": 0.35818606504860734, + "grad_norm": 0.20875929296016693, + "learning_rate": 0.001, + "loss": 3.0591, + "num_input_tokens_seen": 13959168000, + "step": 53250 + }, + { + "epoch": 0.35852238999231495, + "grad_norm": 0.19065573811531067, + "learning_rate": 0.001, + "loss": 3.0591, + "num_input_tokens_seen": 13972275200, + "step": 53300 + }, + { + "epoch": 0.35885871493602256, + "grad_norm": 0.18688392639160156, + "learning_rate": 0.001, + "loss": 3.0475, + "num_input_tokens_seen": 13985382400, + "step": 53350 + }, + { + "epoch": 0.3591950398797302, + "grad_norm": 0.1864282786846161, + "learning_rate": 0.001, + "loss": 3.0485, + "num_input_tokens_seen": 13998489600, + "step": 53400 + }, + { + "epoch": 0.35953136482343784, + "grad_norm": 0.20456114411354065, + "learning_rate": 0.001, + "loss": 3.0529, + "num_input_tokens_seen": 14011596800, + "step": 53450 + }, + { + "epoch": 0.35986768976714545, + "grad_norm": 0.24362069368362427, + "learning_rate": 0.001, + "loss": 3.0444, + "num_input_tokens_seen": 14024704000, + "step": 53500 + }, + { + "epoch": 0.35986768976714545, + "eval_loss": 2.943416118621826, + "eval_runtime": 53.1574, + "eval_samples_per_second": 94.06, + "eval_steps_per_second": 23.515, + "num_input_tokens_seen": 14024704000, + "step": 53500 + }, + { + "epoch": 0.36020401471085306, + "grad_norm": 0.19701169431209564, + "learning_rate": 0.001, + "loss": 3.0513, + "num_input_tokens_seen": 14037811200, + "step": 53550 + }, + { + "epoch": 0.36054033965456067, + "grad_norm": 0.1785692274570465, + "learning_rate": 0.001, + "loss": 3.0541, + "num_input_tokens_seen": 14050918400, + "step": 53600 + }, + { + "epoch": 0.3608766645982683, + "grad_norm": 0.1865462064743042, + "learning_rate": 0.001, + "loss": 3.0367, + "num_input_tokens_seen": 14064025600, + "step": 53650 + }, + { + "epoch": 0.3612129895419759, + "grad_norm": 0.4129047095775604, + "learning_rate": 0.001, + "loss": 3.043, + "num_input_tokens_seen": 14077132800, + "step": 53700 + }, + { + "epoch": 0.3615493144856835, + "grad_norm": 0.21066440641880035, + "learning_rate": 0.001, + "loss": 3.0585, + "num_input_tokens_seen": 14090240000, + "step": 53750 + }, + { + "epoch": 0.3618856394293911, + "grad_norm": 0.6820788383483887, + "learning_rate": 0.001, + "loss": 3.0534, + "num_input_tokens_seen": 14103347200, + "step": 53800 + }, + { + "epoch": 0.3622219643730987, + "grad_norm": 0.9664424657821655, + "learning_rate": 0.001, + "loss": 3.069, + "num_input_tokens_seen": 14116454400, + "step": 53850 + }, + { + "epoch": 0.36255828931680634, + "grad_norm": 0.35416921973228455, + "learning_rate": 0.001, + "loss": 3.0629, + "num_input_tokens_seen": 14129561600, + "step": 53900 + }, + { + "epoch": 0.36289461426051395, + "grad_norm": 0.3159606158733368, + "learning_rate": 0.001, + "loss": 3.0722, + "num_input_tokens_seen": 14142668800, + "step": 53950 + }, + { + "epoch": 0.36323093920422156, + "grad_norm": 0.2518790662288666, + "learning_rate": 0.001, + "loss": 3.071, + "num_input_tokens_seen": 14155776000, + "step": 54000 + }, + { + "epoch": 0.36323093920422156, + "eval_loss": 2.9483964443206787, + "eval_runtime": 53.2042, + "eval_samples_per_second": 93.978, + "eval_steps_per_second": 23.494, + "num_input_tokens_seen": 14155776000, + "step": 54000 + }, + { + "epoch": 0.36356726414792917, + "grad_norm": 0.2197147160768509, + "learning_rate": 0.0009998286624877785, + "loss": 3.0502, + "num_input_tokens_seen": 14168883200, + "step": 54050 + }, + { + "epoch": 0.3639035890916368, + "grad_norm": 0.22259306907653809, + "learning_rate": 0.0009993147673772868, + "loss": 3.0433, + "num_input_tokens_seen": 14181990400, + "step": 54100 + }, + { + "epoch": 0.3642399140353444, + "grad_norm": 0.19341766834259033, + "learning_rate": 0.000998458666866564, + "loss": 3.0486, + "num_input_tokens_seen": 14195097600, + "step": 54150 + }, + { + "epoch": 0.364576238979052, + "grad_norm": 0.2313617616891861, + "learning_rate": 0.0009972609476841367, + "loss": 3.0446, + "num_input_tokens_seen": 14208204800, + "step": 54200 + }, + { + "epoch": 0.3649125639227596, + "grad_norm": 0.1925128698348999, + "learning_rate": 0.0009957224306869053, + "loss": 3.0528, + "num_input_tokens_seen": 14221312000, + "step": 54250 + }, + { + "epoch": 0.3652488888664672, + "grad_norm": 0.2100643515586853, + "learning_rate": 0.0009938441702975688, + "loss": 3.0453, + "num_input_tokens_seen": 14234419200, + "step": 54300 + }, + { + "epoch": 0.36558521381017484, + "grad_norm": 0.46658360958099365, + "learning_rate": 0.0009916274537819774, + "loss": 3.0464, + "num_input_tokens_seen": 14247526400, + "step": 54350 + }, + { + "epoch": 0.36592153875388245, + "grad_norm": 0.19623732566833496, + "learning_rate": 0.0009890738003669028, + "loss": 3.0427, + "num_input_tokens_seen": 14260633600, + "step": 54400 + }, + { + "epoch": 0.36625786369759006, + "grad_norm": 0.24941138923168182, + "learning_rate": 0.0009861849601988384, + "loss": 3.0528, + "num_input_tokens_seen": 14273740800, + "step": 54450 + }, + { + "epoch": 0.36659418864129767, + "grad_norm": 0.22141198813915253, + "learning_rate": 0.0009829629131445341, + "loss": 3.0523, + "num_input_tokens_seen": 14286848000, + "step": 54500 + }, + { + "epoch": 0.36659418864129767, + "eval_loss": 2.9419288635253906, + "eval_runtime": 53.6937, + "eval_samples_per_second": 93.121, + "eval_steps_per_second": 23.28, + "num_input_tokens_seen": 14286848000, + "step": 54500 + }, + { + "epoch": 0.3669305135850053, + "grad_norm": 0.2028401494026184, + "learning_rate": 0.0009794098674340967, + "loss": 3.0403, + "num_input_tokens_seen": 14299955200, + "step": 54550 + }, + { + "epoch": 0.3672668385287129, + "grad_norm": 0.20509253442287445, + "learning_rate": 0.0009755282581475768, + "loss": 3.0543, + "num_input_tokens_seen": 14313062400, + "step": 54600 + }, + { + "epoch": 0.3676031634724205, + "grad_norm": 1.2793521881103516, + "learning_rate": 0.0009713207455460893, + "loss": 3.0718, + "num_input_tokens_seen": 14326169600, + "step": 54650 + }, + { + "epoch": 0.3679394884161281, + "grad_norm": 1.1210218667984009, + "learning_rate": 0.0009667902132486009, + "loss": 3.0706, + "num_input_tokens_seen": 14339276800, + "step": 54700 + }, + { + "epoch": 0.3682758133598357, + "grad_norm": 0.5492864847183228, + "learning_rate": 0.0009619397662556434, + "loss": 3.0793, + "num_input_tokens_seen": 14352384000, + "step": 54750 + }, + { + "epoch": 0.36861213830354334, + "grad_norm": 0.34732338786125183, + "learning_rate": 0.0009567727288213005, + "loss": 3.0662, + "num_input_tokens_seen": 14365491200, + "step": 54800 + }, + { + "epoch": 0.36894846324725095, + "grad_norm": 0.2698073983192444, + "learning_rate": 0.0009512926421749304, + "loss": 3.0682, + "num_input_tokens_seen": 14378598400, + "step": 54850 + }, + { + "epoch": 0.36928478819095856, + "grad_norm": 0.593543529510498, + "learning_rate": 0.0009455032620941839, + "loss": 3.0507, + "num_input_tokens_seen": 14391705600, + "step": 54900 + }, + { + "epoch": 0.36962111313466617, + "grad_norm": 0.28389155864715576, + "learning_rate": 0.0009394085563309827, + "loss": 3.0593, + "num_input_tokens_seen": 14404812800, + "step": 54950 + }, + { + "epoch": 0.3699574380783738, + "grad_norm": 0.2569947838783264, + "learning_rate": 0.0009330127018922195, + "loss": 3.0524, + "num_input_tokens_seen": 14417920000, + "step": 55000 + }, + { + "epoch": 0.3699574380783738, + "eval_loss": 2.9468750953674316, + "eval_runtime": 52.9661, + "eval_samples_per_second": 94.4, + "eval_steps_per_second": 23.6, + "num_input_tokens_seen": 14417920000, + "step": 55000 + }, + { + "epoch": 0.3702937630220814, + "grad_norm": 0.2545956075191498, + "learning_rate": 0.0009263200821770461, + "loss": 3.0397, + "num_input_tokens_seen": 14431027200, + "step": 55050 + }, + { + "epoch": 0.370630087965789, + "grad_norm": 0.26363736391067505, + "learning_rate": 0.0009193352839727121, + "loss": 3.0554, + "num_input_tokens_seen": 14444134400, + "step": 55100 + }, + { + "epoch": 0.3709664129094966, + "grad_norm": 0.2228112667798996, + "learning_rate": 0.0009120630943110077, + "loss": 3.0482, + "num_input_tokens_seen": 14457241600, + "step": 55150 + }, + { + "epoch": 0.3713027378532043, + "grad_norm": 0.2184106856584549, + "learning_rate": 0.0009045084971874737, + "loss": 3.0368, + "num_input_tokens_seen": 14470348800, + "step": 55200 + }, + { + "epoch": 0.3716390627969119, + "grad_norm": 0.5658212900161743, + "learning_rate": 0.0008966766701456176, + "loss": 3.0541, + "num_input_tokens_seen": 14483456000, + "step": 55250 + }, + { + "epoch": 0.3719753877406195, + "grad_norm": 0.31839439272880554, + "learning_rate": 0.0008885729807284854, + "loss": 3.0516, + "num_input_tokens_seen": 14496563200, + "step": 55300 + }, + { + "epoch": 0.3723117126843271, + "grad_norm": 0.2521055042743683, + "learning_rate": 0.0008802029828000156, + "loss": 3.049, + "num_input_tokens_seen": 14509670400, + "step": 55350 + }, + { + "epoch": 0.3726480376280347, + "grad_norm": 0.23797062039375305, + "learning_rate": 0.0008715724127386971, + "loss": 3.0393, + "num_input_tokens_seen": 14522777600, + "step": 55400 + }, + { + "epoch": 0.37298436257174233, + "grad_norm": 0.26673102378845215, + "learning_rate": 0.0008626871855061438, + "loss": 3.0535, + "num_input_tokens_seen": 14535884800, + "step": 55450 + }, + { + "epoch": 0.37332068751544994, + "grad_norm": 0.37754055857658386, + "learning_rate": 0.0008535533905932737, + "loss": 3.0432, + "num_input_tokens_seen": 14548992000, + "step": 55500 + }, + { + "epoch": 0.37332068751544994, + "eval_loss": 2.9362170696258545, + "eval_runtime": 53.4795, + "eval_samples_per_second": 93.494, + "eval_steps_per_second": 23.373, + "num_input_tokens_seen": 14548992000, + "step": 55500 + }, + { + "epoch": 0.37365701245915756, + "grad_norm": 0.2160724252462387, + "learning_rate": 0.000844177287846877, + "loss": 3.0378, + "num_input_tokens_seen": 14562099200, + "step": 55550 + }, + { + "epoch": 0.37399333740286517, + "grad_norm": 0.22323860228061676, + "learning_rate": 0.0008345653031794292, + "loss": 3.0419, + "num_input_tokens_seen": 14575206400, + "step": 55600 + }, + { + "epoch": 0.3743296623465728, + "grad_norm": 0.19688346982002258, + "learning_rate": 0.0008247240241650918, + "loss": 3.0297, + "num_input_tokens_seen": 14588313600, + "step": 55650 + }, + { + "epoch": 0.3746659872902804, + "grad_norm": 0.1972673088312149, + "learning_rate": 0.0008146601955249188, + "loss": 3.0405, + "num_input_tokens_seen": 14601420800, + "step": 55700 + }, + { + "epoch": 0.375002312233988, + "grad_norm": 0.44073277711868286, + "learning_rate": 0.0008043807145043603, + "loss": 3.0343, + "num_input_tokens_seen": 14614528000, + "step": 55750 + }, + { + "epoch": 0.3753386371776956, + "grad_norm": 0.22042399644851685, + "learning_rate": 0.0007938926261462366, + "loss": 3.0337, + "num_input_tokens_seen": 14627635200, + "step": 55800 + }, + { + "epoch": 0.3756749621214032, + "grad_norm": 0.2954588234424591, + "learning_rate": 0.0007832031184624164, + "loss": 3.0334, + "num_input_tokens_seen": 14640742400, + "step": 55850 + }, + { + "epoch": 0.37601128706511083, + "grad_norm": 0.5062097907066345, + "learning_rate": 0.0007723195175075137, + "loss": 3.0385, + "num_input_tokens_seen": 14653849600, + "step": 55900 + }, + { + "epoch": 0.37634761200881844, + "grad_norm": 0.30344095826148987, + "learning_rate": 0.0007612492823579744, + "loss": 3.04, + "num_input_tokens_seen": 14666956800, + "step": 55950 + }, + { + "epoch": 0.37668393695252606, + "grad_norm": 0.21088473498821259, + "learning_rate": 0.00075, + "loss": 3.0364, + "num_input_tokens_seen": 14680064000, + "step": 56000 + }, + { + "epoch": 0.37668393695252606, + "eval_loss": 2.9313743114471436, + "eval_runtime": 53.142, + "eval_samples_per_second": 94.088, + "eval_steps_per_second": 23.522, + "num_input_tokens_seen": 14680064000, + "step": 56000 + }, + { + "epoch": 0.37702026189623367, + "grad_norm": 0.2067674696445465, + "learning_rate": 0.0007385793801298042, + "loss": 3.05, + "num_input_tokens_seen": 14693171200, + "step": 56050 + }, + { + "epoch": 0.3773565868399413, + "grad_norm": 0.20803235471248627, + "learning_rate": 0.0007269952498697733, + "loss": 3.0451, + "num_input_tokens_seen": 14706278400, + "step": 56100 + }, + { + "epoch": 0.3776929117836489, + "grad_norm": 0.2035783976316452, + "learning_rate": 0.0007152555484041476, + "loss": 3.0281, + "num_input_tokens_seen": 14719385600, + "step": 56150 + }, + { + "epoch": 0.3780292367273565, + "grad_norm": 0.21911849081516266, + "learning_rate": 0.0007033683215379002, + "loss": 3.0312, + "num_input_tokens_seen": 14732492800, + "step": 56200 + }, + { + "epoch": 0.3783655616710641, + "grad_norm": 0.2263978123664856, + "learning_rate": 0.000691341716182545, + "loss": 3.0237, + "num_input_tokens_seen": 14745600000, + "step": 56250 + }, + { + "epoch": 0.3787018866147717, + "grad_norm": 0.20394045114517212, + "learning_rate": 0.0006791839747726501, + "loss": 3.0271, + "num_input_tokens_seen": 14758707200, + "step": 56300 + }, + { + "epoch": 0.37903821155847933, + "grad_norm": 0.1954122930765152, + "learning_rate": 0.0006669034296168854, + "loss": 3.0368, + "num_input_tokens_seen": 14771814400, + "step": 56350 + }, + { + "epoch": 0.37937453650218694, + "grad_norm": 0.2434541881084442, + "learning_rate": 0.0006545084971874737, + "loss": 3.0268, + "num_input_tokens_seen": 14784921600, + "step": 56400 + }, + { + "epoch": 0.37971086144589455, + "grad_norm": 0.19820261001586914, + "learning_rate": 0.0006420076723519614, + "loss": 3.0193, + "num_input_tokens_seen": 14798028800, + "step": 56450 + }, + { + "epoch": 0.38004718638960217, + "grad_norm": 0.18117697536945343, + "learning_rate": 0.0006294095225512603, + "loss": 3.0241, + "num_input_tokens_seen": 14811136000, + "step": 56500 + }, + { + "epoch": 0.38004718638960217, + "eval_loss": 2.920185089111328, + "eval_runtime": 53.8805, + "eval_samples_per_second": 92.798, + "eval_steps_per_second": 23.199, + "num_input_tokens_seen": 14811136000, + "step": 56500 + }, + { + "epoch": 0.3803835113333098, + "grad_norm": 0.20303522050380707, + "learning_rate": 0.0006167226819279528, + "loss": 3.0133, + "num_input_tokens_seen": 14824243200, + "step": 56550 + }, + { + "epoch": 0.3807198362770174, + "grad_norm": 0.19498929381370544, + "learning_rate": 0.0006039558454088796, + "loss": 3.0241, + "num_input_tokens_seen": 14837350400, + "step": 56600 + }, + { + "epoch": 0.381056161220725, + "grad_norm": 0.21773076057434082, + "learning_rate": 0.0005911177627460738, + "loss": 3.0235, + "num_input_tokens_seen": 14850457600, + "step": 56650 + }, + { + "epoch": 0.3813924861644326, + "grad_norm": 0.19796748459339142, + "learning_rate": 0.0005782172325201155, + "loss": 3.019, + "num_input_tokens_seen": 14863564800, + "step": 56700 + }, + { + "epoch": 0.3817288111081402, + "grad_norm": 0.18569409847259521, + "learning_rate": 0.000565263096110026, + "loss": 3.0189, + "num_input_tokens_seen": 14876672000, + "step": 56750 + }, + { + "epoch": 0.38206513605184783, + "grad_norm": 0.27358362078666687, + "learning_rate": 0.0005522642316338268, + "loss": 3.0107, + "num_input_tokens_seen": 14889779200, + "step": 56800 + }, + { + "epoch": 0.38240146099555544, + "grad_norm": 0.2143600583076477, + "learning_rate": 0.0005392295478639225, + "loss": 3.0139, + "num_input_tokens_seen": 14902886400, + "step": 56850 + }, + { + "epoch": 0.38273778593926305, + "grad_norm": 0.18786349892616272, + "learning_rate": 0.000526167978121472, + "loss": 3.0187, + "num_input_tokens_seen": 14915993600, + "step": 56900 + }, + { + "epoch": 0.38307411088297066, + "grad_norm": 0.1809261441230774, + "learning_rate": 0.0005130884741539367, + "loss": 3.0197, + "num_input_tokens_seen": 14929100800, + "step": 56950 + }, + { + "epoch": 0.38341043582667833, + "grad_norm": 0.1926116794347763, + "learning_rate": 0.0005, + "loss": 3.0101, + "num_input_tokens_seen": 14942208000, + "step": 57000 + }, + { + "epoch": 0.38341043582667833, + "eval_loss": 2.912503242492676, + "eval_runtime": 52.7455, + "eval_samples_per_second": 94.795, + "eval_steps_per_second": 23.699, + "num_input_tokens_seen": 14942208000, + "step": 57000 + }, + { + "epoch": 0.38374676077038594, + "grad_norm": 0.255500853061676, + "learning_rate": 0.0004869115258460635, + "loss": 3.0102, + "num_input_tokens_seen": 14955315200, + "step": 57050 + }, + { + "epoch": 0.38408308571409355, + "grad_norm": 0.18287675082683563, + "learning_rate": 0.0004738320218785281, + "loss": 3.0074, + "num_input_tokens_seen": 14968422400, + "step": 57100 + }, + { + "epoch": 0.38441941065780116, + "grad_norm": 0.1864452064037323, + "learning_rate": 0.0004607704521360776, + "loss": 3.0181, + "num_input_tokens_seen": 14981529600, + "step": 57150 + }, + { + "epoch": 0.3847557356015088, + "grad_norm": 0.17273065447807312, + "learning_rate": 0.00044773576836617336, + "loss": 3.0077, + "num_input_tokens_seen": 14994636800, + "step": 57200 + }, + { + "epoch": 0.3850920605452164, + "grad_norm": 0.17590677738189697, + "learning_rate": 0.00043473690388997434, + "loss": 3.0118, + "num_input_tokens_seen": 15007744000, + "step": 57250 + }, + { + "epoch": 0.385428385488924, + "grad_norm": 0.16380582749843597, + "learning_rate": 0.0004217827674798845, + "loss": 3.0074, + "num_input_tokens_seen": 15020851200, + "step": 57300 + }, + { + "epoch": 0.3857647104326316, + "grad_norm": 0.19464251399040222, + "learning_rate": 0.00040888223725392626, + "loss": 3.0126, + "num_input_tokens_seen": 15033958400, + "step": 57350 + }, + { + "epoch": 0.3861010353763392, + "grad_norm": 0.17150136828422546, + "learning_rate": 0.0003960441545911204, + "loss": 3.0049, + "num_input_tokens_seen": 15047065600, + "step": 57400 + }, + { + "epoch": 0.38643736032004683, + "grad_norm": 0.1877928376197815, + "learning_rate": 0.00038327731807204744, + "loss": 3.0089, + "num_input_tokens_seen": 15060172800, + "step": 57450 + }, + { + "epoch": 0.38677368526375444, + "grad_norm": 0.2605326771736145, + "learning_rate": 0.0003705904774487396, + "loss": 3.0115, + "num_input_tokens_seen": 15073280000, + "step": 57500 + }, + { + "epoch": 0.38677368526375444, + "eval_loss": 2.9029135704040527, + "eval_runtime": 53.9097, + "eval_samples_per_second": 92.748, + "eval_steps_per_second": 23.187, + "num_input_tokens_seen": 15073280000, + "step": 57500 + }, + { + "epoch": 0.38711001020746205, + "grad_norm": 0.21006393432617188, + "learning_rate": 0.0003579923276480387, + "loss": 3.0044, + "num_input_tokens_seen": 15086387200, + "step": 57550 + }, + { + "epoch": 0.38744633515116966, + "grad_norm": 0.1743878722190857, + "learning_rate": 0.00034549150281252633, + "loss": 3.0114, + "num_input_tokens_seen": 15099494400, + "step": 57600 + }, + { + "epoch": 0.3877826600948773, + "grad_norm": 0.16699257493019104, + "learning_rate": 0.00033309657038311456, + "loss": 3.0041, + "num_input_tokens_seen": 15112601600, + "step": 57650 + }, + { + "epoch": 0.3881189850385849, + "grad_norm": 0.17115868628025055, + "learning_rate": 0.00032081602522734986, + "loss": 3.0051, + "num_input_tokens_seen": 15125708800, + "step": 57700 + }, + { + "epoch": 0.3884553099822925, + "grad_norm": 0.16885310411453247, + "learning_rate": 0.0003086582838174551, + "loss": 2.9969, + "num_input_tokens_seen": 15138816000, + "step": 57750 + }, + { + "epoch": 0.3887916349260001, + "grad_norm": 0.17101123929023743, + "learning_rate": 0.0002966316784621, + "loss": 2.9947, + "num_input_tokens_seen": 15151923200, + "step": 57800 + }, + { + "epoch": 0.3891279598697077, + "grad_norm": 0.1529199331998825, + "learning_rate": 0.0002847444515958523, + "loss": 3.0019, + "num_input_tokens_seen": 15165030400, + "step": 57850 + }, + { + "epoch": 0.38946428481341533, + "grad_norm": 0.16087768971920013, + "learning_rate": 0.00027300475013022663, + "loss": 2.9947, + "num_input_tokens_seen": 15178137600, + "step": 57900 + }, + { + "epoch": 0.38980060975712294, + "grad_norm": 0.16023555397987366, + "learning_rate": 0.00026142061987019576, + "loss": 3.0022, + "num_input_tokens_seen": 15191244800, + "step": 57950 + }, + { + "epoch": 0.39013693470083055, + "grad_norm": 0.16161410510540009, + "learning_rate": 0.0002500000000000001, + "loss": 2.9931, + "num_input_tokens_seen": 15204352000, + "step": 58000 + }, + { + "epoch": 0.39013693470083055, + "eval_loss": 2.8950610160827637, + "eval_runtime": 53.5434, + "eval_samples_per_second": 93.382, + "eval_steps_per_second": 23.346, + "num_input_tokens_seen": 15204352000, + "step": 58000 + }, + { + "epoch": 0.39047325964453816, + "grad_norm": 0.1577194780111313, + "learning_rate": 0.00023875071764202561, + "loss": 2.9866, + "num_input_tokens_seen": 15217459200, + "step": 58050 + }, + { + "epoch": 0.3908095845882458, + "grad_norm": 0.1869671791791916, + "learning_rate": 0.00022768048249248646, + "loss": 2.9973, + "num_input_tokens_seen": 15230566400, + "step": 58100 + }, + { + "epoch": 0.3911459095319534, + "grad_norm": 0.1568073183298111, + "learning_rate": 0.0002167968815375837, + "loss": 3.0012, + "num_input_tokens_seen": 15243673600, + "step": 58150 + }, + { + "epoch": 0.391482234475661, + "grad_norm": 0.15343065559864044, + "learning_rate": 0.00020610737385376348, + "loss": 2.988, + "num_input_tokens_seen": 15256780800, + "step": 58200 + }, + { + "epoch": 0.3918185594193686, + "grad_norm": 0.22413235902786255, + "learning_rate": 0.00019561928549563967, + "loss": 2.993, + "num_input_tokens_seen": 15269888000, + "step": 58250 + }, + { + "epoch": 0.3921548843630762, + "grad_norm": 0.1807044893503189, + "learning_rate": 0.00018533980447508135, + "loss": 2.9905, + "num_input_tokens_seen": 15282995200, + "step": 58300 + }, + { + "epoch": 0.39249120930678383, + "grad_norm": 0.1571112871170044, + "learning_rate": 0.00017527597583490823, + "loss": 2.9983, + "num_input_tokens_seen": 15296102400, + "step": 58350 + }, + { + "epoch": 0.39282753425049144, + "grad_norm": 0.16821637749671936, + "learning_rate": 0.00016543469682057105, + "loss": 2.9966, + "num_input_tokens_seen": 15309209600, + "step": 58400 + }, + { + "epoch": 0.39316385919419905, + "grad_norm": 0.1497010737657547, + "learning_rate": 0.00015582271215312294, + "loss": 2.9814, + "num_input_tokens_seen": 15322316800, + "step": 58450 + }, + { + "epoch": 0.39350018413790666, + "grad_norm": 0.15679225325584412, + "learning_rate": 0.00014644660940672628, + "loss": 2.9876, + "num_input_tokens_seen": 15335424000, + "step": 58500 + }, + { + "epoch": 0.39350018413790666, + "eval_loss": 2.8887994289398193, + "eval_runtime": 53.8449, + "eval_samples_per_second": 92.859, + "eval_steps_per_second": 23.215, + "num_input_tokens_seen": 15335424000, + "step": 58500 + }, + { + "epoch": 0.39383650908161427, + "grad_norm": 0.15169823169708252, + "learning_rate": 0.0001373128144938563, + "loss": 2.9875, + "num_input_tokens_seen": 15348531200, + "step": 58550 + }, + { + "epoch": 0.3941728340253219, + "grad_norm": 0.1635347604751587, + "learning_rate": 0.00012842758726130281, + "loss": 2.9898, + "num_input_tokens_seen": 15361638400, + "step": 58600 + }, + { + "epoch": 0.3945091589690295, + "grad_norm": 0.15156348049640656, + "learning_rate": 0.00011979701719998454, + "loss": 2.9977, + "num_input_tokens_seen": 15374745600, + "step": 58650 + }, + { + "epoch": 0.3948454839127371, + "grad_norm": 0.15710316598415375, + "learning_rate": 0.00011142701927151455, + "loss": 2.981, + "num_input_tokens_seen": 15387852800, + "step": 58700 + }, + { + "epoch": 0.3951818088564447, + "grad_norm": 0.2838917374610901, + "learning_rate": 0.00010332332985438247, + "loss": 2.9909, + "num_input_tokens_seen": 15400960000, + "step": 58750 + }, + { + "epoch": 0.3955181338001524, + "grad_norm": 0.1509639173746109, + "learning_rate": 9.549150281252633e-05, + "loss": 2.9851, + "num_input_tokens_seen": 15414067200, + "step": 58800 + }, + { + "epoch": 0.39585445874386, + "grad_norm": 0.1501421183347702, + "learning_rate": 8.793690568899215e-05, + "loss": 2.9931, + "num_input_tokens_seen": 15427174400, + "step": 58850 + }, + { + "epoch": 0.3961907836875676, + "grad_norm": 0.14904147386550903, + "learning_rate": 8.066471602728804e-05, + "loss": 2.9862, + "num_input_tokens_seen": 15440281600, + "step": 58900 + }, + { + "epoch": 0.3965271086312752, + "grad_norm": 0.15182824432849884, + "learning_rate": 7.367991782295391e-05, + "loss": 2.9882, + "num_input_tokens_seen": 15453388800, + "step": 58950 + }, + { + "epoch": 0.3968634335749828, + "grad_norm": 0.14710576832294464, + "learning_rate": 6.698729810778065e-05, + "loss": 2.9856, + "num_input_tokens_seen": 15466496000, + "step": 59000 + }, + { + "epoch": 0.3968634335749828, + "eval_loss": 2.8845956325531006, + "eval_runtime": 53.5429, + "eval_samples_per_second": 93.383, + "eval_steps_per_second": 23.346, + "num_input_tokens_seen": 15466496000, + "step": 59000 + }, + { + "epoch": 0.39719975851869044, + "grad_norm": 0.14572475850582123, + "learning_rate": 6.059144366901737e-05, + "loss": 2.9861, + "num_input_tokens_seen": 15479603200, + "step": 59050 + }, + { + "epoch": 0.39753608346239805, + "grad_norm": 0.5027282238006592, + "learning_rate": 5.449673790581611e-05, + "loss": 2.9773, + "num_input_tokens_seen": 15492710400, + "step": 59100 + }, + { + "epoch": 0.39787240840610566, + "grad_norm": 0.192597895860672, + "learning_rate": 4.87073578250698e-05, + "loss": 2.9874, + "num_input_tokens_seen": 15505817600, + "step": 59150 + }, + { + "epoch": 0.39820873334981327, + "grad_norm": 0.15083667635917664, + "learning_rate": 4.322727117869951e-05, + "loss": 2.987, + "num_input_tokens_seen": 15518924800, + "step": 59200 + }, + { + "epoch": 0.3985450582935209, + "grad_norm": 0.14701534807682037, + "learning_rate": 3.806023374435663e-05, + "loss": 2.9858, + "num_input_tokens_seen": 15532032000, + "step": 59250 + }, + { + "epoch": 0.3988813832372285, + "grad_norm": 0.145115464925766, + "learning_rate": 3.3209786751399184e-05, + "loss": 2.9926, + "num_input_tokens_seen": 15545139200, + "step": 59300 + }, + { + "epoch": 0.3992177081809361, + "grad_norm": 0.15828457474708557, + "learning_rate": 2.8679254453910786e-05, + "loss": 2.9803, + "num_input_tokens_seen": 15558246400, + "step": 59350 + }, + { + "epoch": 0.3995540331246437, + "grad_norm": 0.14400678873062134, + "learning_rate": 2.4471741852423235e-05, + "loss": 2.9701, + "num_input_tokens_seen": 15571353600, + "step": 59400 + }, + { + "epoch": 0.3998903580683513, + "grad_norm": 0.14925344288349152, + "learning_rate": 2.0590132565903473e-05, + "loss": 2.989, + "num_input_tokens_seen": 15584460800, + "step": 59450 + }, + { + "epoch": 0.40022668301205894, + "grad_norm": 0.14081260561943054, + "learning_rate": 1.70370868554659e-05, + "loss": 2.9824, + "num_input_tokens_seen": 15597568000, + "step": 59500 + }, + { + "epoch": 0.40022668301205894, + "eval_loss": 2.882228136062622, + "eval_runtime": 53.7595, + "eval_samples_per_second": 93.007, + "eval_steps_per_second": 23.252, + "num_input_tokens_seen": 15597568000, + "step": 59500 + }, + { + "epoch": 0.40056300795576655, + "grad_norm": 0.13585136830806732, + "learning_rate": 1.3815039801161721e-05, + "loss": 2.9883, + "num_input_tokens_seen": 15610675200, + "step": 59550 + }, + { + "epoch": 0.40089933289947416, + "grad_norm": 0.1438748985528946, + "learning_rate": 1.0926199633097156e-05, + "loss": 2.9781, + "num_input_tokens_seen": 15623782400, + "step": 59600 + }, + { + "epoch": 0.40123565784318177, + "grad_norm": 0.3345394730567932, + "learning_rate": 8.372546218022748e-06, + "loss": 2.9869, + "num_input_tokens_seen": 15636889600, + "step": 59650 + }, + { + "epoch": 0.4015719827868894, + "grad_norm": 0.14581316709518433, + "learning_rate": 6.15582970243117e-06, + "loss": 2.9882, + "num_input_tokens_seen": 15649996800, + "step": 59700 + }, + { + "epoch": 0.401908307730597, + "grad_norm": 0.1409323662519455, + "learning_rate": 4.277569313094809e-06, + "loss": 2.9833, + "num_input_tokens_seen": 15663104000, + "step": 59750 + }, + { + "epoch": 0.4022446326743046, + "grad_norm": 0.1412041187286377, + "learning_rate": 2.739052315863355e-06, + "loss": 2.9835, + "num_input_tokens_seen": 15676211200, + "step": 59800 + }, + { + "epoch": 0.4025809576180122, + "grad_norm": 0.14011850953102112, + "learning_rate": 1.541333133436018e-06, + "loss": 2.9819, + "num_input_tokens_seen": 15689318400, + "step": 59850 + }, + { + "epoch": 0.4029172825617198, + "grad_norm": 0.14772015810012817, + "learning_rate": 6.852326227130834e-07, + "loss": 2.9855, + "num_input_tokens_seen": 15702425600, + "step": 59900 + }, + { + "epoch": 0.40325360750542744, + "grad_norm": 0.14281156659126282, + "learning_rate": 1.7133751222137007e-07, + "loss": 2.978, + "num_input_tokens_seen": 15715532800, + "step": 59950 + }, + { + "epoch": 0.40358993244913505, + "grad_norm": 0.14420129358768463, + "learning_rate": 0.0, + "loss": 2.9789, + "num_input_tokens_seen": 15728640000, + "step": 60000 + }, + { + "epoch": 0.40358993244913505, + "eval_loss": 2.8818726539611816, + "eval_runtime": 53.5982, + "eval_samples_per_second": 93.287, + "eval_steps_per_second": 23.322, + "num_input_tokens_seen": 15728640000, + "step": 60000 + }, + { + "epoch": 0.40358993244913505, + "num_input_tokens_seen": 15728640000, + "step": 60000, + "total_flos": 4.2075647115264e+18, + "train_loss": 0.7609903035481771, + "train_runtime": 21458.0072, + "train_samples_per_second": 715.817, + "train_steps_per_second": 2.796, + "train_tokens_per_second": 732996.306 + } + ], + "logging_steps": 50, + "max_steps": 60000, + "num_input_tokens_seen": 15728640000, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.2075647115264e+18, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +}