{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4708549211906576, "eval_steps": 500, "global_step": 70000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00033632494370761253, "grad_norm": 1.274348497390747, "learning_rate": 0.0001, "loss": 9.9462, "num_input_tokens_seen": 13107200, "step": 50 }, { "epoch": 0.0006726498874152251, "grad_norm": 0.3978561758995056, "learning_rate": 0.0002, "loss": 8.361, "num_input_tokens_seen": 26214400, "step": 100 }, { "epoch": 0.0010089748311228376, "grad_norm": 0.41306379437446594, "learning_rate": 0.0003, "loss": 7.6593, "num_input_tokens_seen": 39321600, "step": 150 }, { "epoch": 0.0013452997748304501, "grad_norm": 0.3370315134525299, "learning_rate": 0.0004, "loss": 7.1018, "num_input_tokens_seen": 52428800, "step": 200 }, { "epoch": 0.0016816247185380627, "grad_norm": 0.6396230459213257, "learning_rate": 0.0005, "loss": 6.6331, "num_input_tokens_seen": 65536000, "step": 250 }, { "epoch": 0.002017949662245675, "grad_norm": 0.5036832690238953, "learning_rate": 0.0006, "loss": 6.2646, "num_input_tokens_seen": 78643200, "step": 300 }, { "epoch": 0.002354274605953288, "grad_norm": 0.4829367399215698, "learning_rate": 0.0007, "loss": 5.9656, "num_input_tokens_seen": 91750400, "step": 350 }, { "epoch": 0.0026905995496609002, "grad_norm": 0.6319091320037842, "learning_rate": 0.0008, "loss": 5.7098, "num_input_tokens_seen": 104857600, "step": 400 }, { "epoch": 0.003026924493368513, "grad_norm": 0.8607974648475647, "learning_rate": 0.0009000000000000001, "loss": 5.5012, "num_input_tokens_seen": 117964800, "step": 450 }, { "epoch": 0.0033632494370761253, "grad_norm": 0.4290812611579895, "learning_rate": 0.001, "loss": 5.3179, "num_input_tokens_seen": 131072000, "step": 500 }, { "epoch": 0.0033632494370761253, "eval_loss": 5.179312705993652, "eval_runtime": 100.6159, "eval_samples_per_second": 49.694, "eval_steps_per_second": 12.423, "num_input_tokens_seen": 131072000, "step": 500 }, { "epoch": 0.003699574380783738, "grad_norm": 0.407632052898407, "learning_rate": 0.001, "loss": 5.1431, "num_input_tokens_seen": 144179200, "step": 550 }, { "epoch": 0.00403589932449135, "grad_norm": 0.31742918491363525, "learning_rate": 0.001, "loss": 4.9611, "num_input_tokens_seen": 157286400, "step": 600 }, { "epoch": 0.004372224268198963, "grad_norm": 0.3897629678249359, "learning_rate": 0.001, "loss": 4.8043, "num_input_tokens_seen": 170393600, "step": 650 }, { "epoch": 0.004708549211906576, "grad_norm": 0.41930389404296875, "learning_rate": 0.001, "loss": 4.6617, "num_input_tokens_seen": 183500800, "step": 700 }, { "epoch": 0.005044874155614189, "grad_norm": 0.4283705949783325, "learning_rate": 0.001, "loss": 4.5535, "num_input_tokens_seen": 196608000, "step": 750 }, { "epoch": 0.0053811990993218005, "grad_norm": 0.3931107521057129, "learning_rate": 0.001, "loss": 4.4556, "num_input_tokens_seen": 209715200, "step": 800 }, { "epoch": 0.005717524043029413, "grad_norm": 0.3258611857891083, "learning_rate": 0.001, "loss": 4.3808, "num_input_tokens_seen": 222822400, "step": 850 }, { "epoch": 0.006053848986737026, "grad_norm": 0.3562588095664978, "learning_rate": 0.001, "loss": 4.3136, "num_input_tokens_seen": 235929600, "step": 900 }, { "epoch": 0.006390173930444639, "grad_norm": 0.3626460134983063, "learning_rate": 0.001, "loss": 4.2557, "num_input_tokens_seen": 249036800, "step": 950 }, { "epoch": 0.006726498874152251, "grad_norm": 0.4001849293708801, "learning_rate": 0.001, "loss": 4.208, "num_input_tokens_seen": 262144000, "step": 1000 }, { "epoch": 0.006726498874152251, "eval_loss": 4.108783721923828, "eval_runtime": 101.5418, "eval_samples_per_second": 49.241, "eval_steps_per_second": 12.31, "num_input_tokens_seen": 262144000, "step": 1000 }, { "epoch": 0.007062823817859863, "grad_norm": 0.36548155546188354, "learning_rate": 0.001, "loss": 4.1537, "num_input_tokens_seen": 275251200, "step": 1050 }, { "epoch": 0.007399148761567476, "grad_norm": 0.33794787526130676, "learning_rate": 0.001, "loss": 4.1062, "num_input_tokens_seen": 288358400, "step": 1100 }, { "epoch": 0.007735473705275089, "grad_norm": 0.3374481499195099, "learning_rate": 0.001, "loss": 4.0703, "num_input_tokens_seen": 301465600, "step": 1150 }, { "epoch": 0.0080717986489827, "grad_norm": 0.3061329126358032, "learning_rate": 0.001, "loss": 4.0253, "num_input_tokens_seen": 314572800, "step": 1200 }, { "epoch": 0.008408123592690313, "grad_norm": 0.3140158951282501, "learning_rate": 0.001, "loss": 4.0128, "num_input_tokens_seen": 327680000, "step": 1250 }, { "epoch": 0.008744448536397926, "grad_norm": 0.3002954125404358, "learning_rate": 0.001, "loss": 3.9762, "num_input_tokens_seen": 340787200, "step": 1300 }, { "epoch": 0.009080773480105539, "grad_norm": 0.2994467318058014, "learning_rate": 0.001, "loss": 3.9485, "num_input_tokens_seen": 353894400, "step": 1350 }, { "epoch": 0.009417098423813152, "grad_norm": 0.25649985671043396, "learning_rate": 0.001, "loss": 3.9298, "num_input_tokens_seen": 367001600, "step": 1400 }, { "epoch": 0.009753423367520764, "grad_norm": 0.2627107501029968, "learning_rate": 0.001, "loss": 3.904, "num_input_tokens_seen": 380108800, "step": 1450 }, { "epoch": 0.010089748311228377, "grad_norm": 0.29527419805526733, "learning_rate": 0.001, "loss": 3.8864, "num_input_tokens_seen": 393216000, "step": 1500 }, { "epoch": 0.010089748311228377, "eval_loss": 3.8075473308563232, "eval_runtime": 99.5484, "eval_samples_per_second": 50.227, "eval_steps_per_second": 12.557, "num_input_tokens_seen": 393216000, "step": 1500 }, { "epoch": 0.010426073254935988, "grad_norm": 0.26328331232070923, "learning_rate": 0.001, "loss": 3.8701, "num_input_tokens_seen": 406323200, "step": 1550 }, { "epoch": 0.010762398198643601, "grad_norm": 0.2574864625930786, "learning_rate": 0.001, "loss": 3.8575, "num_input_tokens_seen": 419430400, "step": 1600 }, { "epoch": 0.011098723142351214, "grad_norm": 0.2445235550403595, "learning_rate": 0.001, "loss": 3.8277, "num_input_tokens_seen": 432537600, "step": 1650 }, { "epoch": 0.011435048086058826, "grad_norm": 0.2726516127586365, "learning_rate": 0.001, "loss": 3.8171, "num_input_tokens_seen": 445644800, "step": 1700 }, { "epoch": 0.01177137302976644, "grad_norm": 0.2658848166465759, "learning_rate": 0.001, "loss": 3.8026, "num_input_tokens_seen": 458752000, "step": 1750 }, { "epoch": 0.012107697973474052, "grad_norm": 0.30301713943481445, "learning_rate": 0.001, "loss": 3.7838, "num_input_tokens_seen": 471859200, "step": 1800 }, { "epoch": 0.012444022917181665, "grad_norm": 0.24884650111198425, "learning_rate": 0.001, "loss": 3.7685, "num_input_tokens_seen": 484966400, "step": 1850 }, { "epoch": 0.012780347860889277, "grad_norm": 0.2728760540485382, "learning_rate": 0.001, "loss": 3.759, "num_input_tokens_seen": 498073600, "step": 1900 }, { "epoch": 0.013116672804596888, "grad_norm": 0.24091999232769012, "learning_rate": 0.001, "loss": 3.7496, "num_input_tokens_seen": 511180800, "step": 1950 }, { "epoch": 0.013452997748304501, "grad_norm": 0.2559104561805725, "learning_rate": 0.001, "loss": 3.7289, "num_input_tokens_seen": 524288000, "step": 2000 }, { "epoch": 0.013452997748304501, "eval_loss": 3.654578447341919, "eval_runtime": 99.6741, "eval_samples_per_second": 50.164, "eval_steps_per_second": 12.541, "num_input_tokens_seen": 524288000, "step": 2000 }, { "epoch": 0.013789322692012114, "grad_norm": 0.24713733792304993, "learning_rate": 0.001, "loss": 3.7259, "num_input_tokens_seen": 537395200, "step": 2050 }, { "epoch": 0.014125647635719727, "grad_norm": 0.2594991624355316, "learning_rate": 0.001, "loss": 3.7121, "num_input_tokens_seen": 550502400, "step": 2100 }, { "epoch": 0.01446197257942734, "grad_norm": 0.2594399154186249, "learning_rate": 0.001, "loss": 3.6975, "num_input_tokens_seen": 563609600, "step": 2150 }, { "epoch": 0.014798297523134952, "grad_norm": 0.21877750754356384, "learning_rate": 0.001, "loss": 3.6868, "num_input_tokens_seen": 576716800, "step": 2200 }, { "epoch": 0.015134622466842565, "grad_norm": 0.23657166957855225, "learning_rate": 0.001, "loss": 3.6853, "num_input_tokens_seen": 589824000, "step": 2250 }, { "epoch": 0.015470947410550178, "grad_norm": 0.24121630191802979, "learning_rate": 0.001, "loss": 3.6718, "num_input_tokens_seen": 602931200, "step": 2300 }, { "epoch": 0.01580727235425779, "grad_norm": 0.25701209902763367, "learning_rate": 0.001, "loss": 3.6658, "num_input_tokens_seen": 616038400, "step": 2350 }, { "epoch": 0.0161435972979654, "grad_norm": 0.22402645647525787, "learning_rate": 0.001, "loss": 3.6704, "num_input_tokens_seen": 629145600, "step": 2400 }, { "epoch": 0.016479922241673016, "grad_norm": 0.2358316332101822, "learning_rate": 0.001, "loss": 3.6464, "num_input_tokens_seen": 642252800, "step": 2450 }, { "epoch": 0.016816247185380627, "grad_norm": 0.23835696280002594, "learning_rate": 0.001, "loss": 3.6424, "num_input_tokens_seen": 655360000, "step": 2500 }, { "epoch": 0.016816247185380627, "eval_loss": 3.560713768005371, "eval_runtime": 100.1074, "eval_samples_per_second": 49.946, "eval_steps_per_second": 12.487, "num_input_tokens_seen": 655360000, "step": 2500 }, { "epoch": 0.01715257212908824, "grad_norm": 0.20949696004390717, "learning_rate": 0.001, "loss": 3.6346, "num_input_tokens_seen": 668467200, "step": 2550 }, { "epoch": 0.017488897072795852, "grad_norm": 0.21765078604221344, "learning_rate": 0.001, "loss": 3.6123, "num_input_tokens_seen": 681574400, "step": 2600 }, { "epoch": 0.017825222016503463, "grad_norm": 0.21167752146720886, "learning_rate": 0.001, "loss": 3.6173, "num_input_tokens_seen": 694681600, "step": 2650 }, { "epoch": 0.018161546960211078, "grad_norm": 0.22060410678386688, "learning_rate": 0.001, "loss": 3.62, "num_input_tokens_seen": 707788800, "step": 2700 }, { "epoch": 0.01849787190391869, "grad_norm": 0.21778976917266846, "learning_rate": 0.001, "loss": 3.6009, "num_input_tokens_seen": 720896000, "step": 2750 }, { "epoch": 0.018834196847626303, "grad_norm": 0.21130047738552094, "learning_rate": 0.001, "loss": 3.5882, "num_input_tokens_seen": 734003200, "step": 2800 }, { "epoch": 0.019170521791333914, "grad_norm": 0.20137132704257965, "learning_rate": 0.001, "loss": 3.5923, "num_input_tokens_seen": 747110400, "step": 2850 }, { "epoch": 0.01950684673504153, "grad_norm": 0.24937641620635986, "learning_rate": 0.001, "loss": 3.5872, "num_input_tokens_seen": 760217600, "step": 2900 }, { "epoch": 0.01984317167874914, "grad_norm": 0.20992155373096466, "learning_rate": 0.001, "loss": 3.5786, "num_input_tokens_seen": 773324800, "step": 2950 }, { "epoch": 0.020179496622456754, "grad_norm": 0.24723300337791443, "learning_rate": 0.001, "loss": 3.5846, "num_input_tokens_seen": 786432000, "step": 3000 }, { "epoch": 0.020179496622456754, "eval_loss": 3.502939462661743, "eval_runtime": 99.6836, "eval_samples_per_second": 50.159, "eval_steps_per_second": 12.54, "num_input_tokens_seen": 786432000, "step": 3000 }, { "epoch": 0.020515821566164365, "grad_norm": 0.22368234395980835, "learning_rate": 0.001, "loss": 3.5744, "num_input_tokens_seen": 799539200, "step": 3050 }, { "epoch": 0.020852146509871976, "grad_norm": 0.20934642851352692, "learning_rate": 0.001, "loss": 3.5666, "num_input_tokens_seen": 812646400, "step": 3100 }, { "epoch": 0.02118847145357959, "grad_norm": 0.1929185688495636, "learning_rate": 0.001, "loss": 3.5624, "num_input_tokens_seen": 825753600, "step": 3150 }, { "epoch": 0.021524796397287202, "grad_norm": 0.20416973531246185, "learning_rate": 0.001, "loss": 3.5491, "num_input_tokens_seen": 838860800, "step": 3200 }, { "epoch": 0.021861121340994816, "grad_norm": 0.20873814821243286, "learning_rate": 0.001, "loss": 3.5509, "num_input_tokens_seen": 851968000, "step": 3250 }, { "epoch": 0.022197446284702427, "grad_norm": 0.22235794365406036, "learning_rate": 0.001, "loss": 3.5453, "num_input_tokens_seen": 865075200, "step": 3300 }, { "epoch": 0.022533771228410042, "grad_norm": 0.2174178808927536, "learning_rate": 0.001, "loss": 3.5378, "num_input_tokens_seen": 878182400, "step": 3350 }, { "epoch": 0.022870096172117653, "grad_norm": 0.2016495019197464, "learning_rate": 0.001, "loss": 3.5352, "num_input_tokens_seen": 891289600, "step": 3400 }, { "epoch": 0.023206421115825267, "grad_norm": 0.1864960640668869, "learning_rate": 0.001, "loss": 3.524, "num_input_tokens_seen": 904396800, "step": 3450 }, { "epoch": 0.02354274605953288, "grad_norm": 0.19614097476005554, "learning_rate": 0.001, "loss": 3.528, "num_input_tokens_seen": 917504000, "step": 3500 }, { "epoch": 0.02354274605953288, "eval_loss": 3.4472692012786865, "eval_runtime": 101.0365, "eval_samples_per_second": 49.487, "eval_steps_per_second": 12.372, "num_input_tokens_seen": 917504000, "step": 3500 }, { "epoch": 0.02387907100324049, "grad_norm": 0.18501116335391998, "learning_rate": 0.001, "loss": 3.53, "num_input_tokens_seen": 930611200, "step": 3550 }, { "epoch": 0.024215395946948104, "grad_norm": 0.1863412857055664, "learning_rate": 0.001, "loss": 3.5145, "num_input_tokens_seen": 943718400, "step": 3600 }, { "epoch": 0.024551720890655715, "grad_norm": 0.1979917287826538, "learning_rate": 0.001, "loss": 3.5143, "num_input_tokens_seen": 956825600, "step": 3650 }, { "epoch": 0.02488804583436333, "grad_norm": 0.1991748809814453, "learning_rate": 0.001, "loss": 3.5065, "num_input_tokens_seen": 969932800, "step": 3700 }, { "epoch": 0.02522437077807094, "grad_norm": 0.19475233554840088, "learning_rate": 0.001, "loss": 3.5011, "num_input_tokens_seen": 983040000, "step": 3750 }, { "epoch": 0.025560695721778555, "grad_norm": 0.195469468832016, "learning_rate": 0.001, "loss": 3.506, "num_input_tokens_seen": 996147200, "step": 3800 }, { "epoch": 0.025897020665486166, "grad_norm": 0.19666293263435364, "learning_rate": 0.001, "loss": 3.4936, "num_input_tokens_seen": 1009254400, "step": 3850 }, { "epoch": 0.026233345609193777, "grad_norm": 0.20198987424373627, "learning_rate": 0.001, "loss": 3.4873, "num_input_tokens_seen": 1022361600, "step": 3900 }, { "epoch": 0.02656967055290139, "grad_norm": 0.18537157773971558, "learning_rate": 0.001, "loss": 3.4939, "num_input_tokens_seen": 1035468800, "step": 3950 }, { "epoch": 0.026905995496609002, "grad_norm": 0.18743236362934113, "learning_rate": 0.001, "loss": 3.4784, "num_input_tokens_seen": 1048576000, "step": 4000 }, { "epoch": 0.026905995496609002, "eval_loss": 3.4036922454833984, "eval_runtime": 99.4277, "eval_samples_per_second": 50.288, "eval_steps_per_second": 12.572, "num_input_tokens_seen": 1048576000, "step": 4000 }, { "epoch": 0.027242320440316617, "grad_norm": 0.18962019681930542, "learning_rate": 0.001, "loss": 3.4848, "num_input_tokens_seen": 1061683200, "step": 4050 }, { "epoch": 0.027578645384024228, "grad_norm": 0.19415706396102905, "learning_rate": 0.001, "loss": 3.4934, "num_input_tokens_seen": 1074790400, "step": 4100 }, { "epoch": 0.027914970327731842, "grad_norm": 0.17727437615394592, "learning_rate": 0.001, "loss": 3.4835, "num_input_tokens_seen": 1087897600, "step": 4150 }, { "epoch": 0.028251295271439453, "grad_norm": 0.18400466442108154, "learning_rate": 0.001, "loss": 3.4843, "num_input_tokens_seen": 1101004800, "step": 4200 }, { "epoch": 0.028587620215147068, "grad_norm": 0.19114799797534943, "learning_rate": 0.001, "loss": 3.4713, "num_input_tokens_seen": 1114112000, "step": 4250 }, { "epoch": 0.02892394515885468, "grad_norm": 0.18681153655052185, "learning_rate": 0.001, "loss": 3.4601, "num_input_tokens_seen": 1127219200, "step": 4300 }, { "epoch": 0.02926027010256229, "grad_norm": 0.20739078521728516, "learning_rate": 0.001, "loss": 3.4628, "num_input_tokens_seen": 1140326400, "step": 4350 }, { "epoch": 0.029596595046269904, "grad_norm": 0.18018484115600586, "learning_rate": 0.001, "loss": 3.4521, "num_input_tokens_seen": 1153433600, "step": 4400 }, { "epoch": 0.029932919989977515, "grad_norm": 0.18144090473651886, "learning_rate": 0.001, "loss": 3.4536, "num_input_tokens_seen": 1166540800, "step": 4450 }, { "epoch": 0.03026924493368513, "grad_norm": 0.17444822192192078, "learning_rate": 0.001, "loss": 3.4509, "num_input_tokens_seen": 1179648000, "step": 4500 }, { "epoch": 0.03026924493368513, "eval_loss": 3.3682761192321777, "eval_runtime": 100.0421, "eval_samples_per_second": 49.979, "eval_steps_per_second": 12.495, "num_input_tokens_seen": 1179648000, "step": 4500 }, { "epoch": 0.03060556987739274, "grad_norm": 0.21026909351348877, "learning_rate": 0.001, "loss": 3.4493, "num_input_tokens_seen": 1192755200, "step": 4550 }, { "epoch": 0.030941894821100355, "grad_norm": 0.1758560836315155, "learning_rate": 0.001, "loss": 3.4416, "num_input_tokens_seen": 1205862400, "step": 4600 }, { "epoch": 0.03127821976480797, "grad_norm": 0.1979188174009323, "learning_rate": 0.001, "loss": 3.4353, "num_input_tokens_seen": 1218969600, "step": 4650 }, { "epoch": 0.03161454470851558, "grad_norm": 0.17621161043643951, "learning_rate": 0.001, "loss": 3.435, "num_input_tokens_seen": 1232076800, "step": 4700 }, { "epoch": 0.03195086965222319, "grad_norm": 0.18691854178905487, "learning_rate": 0.001, "loss": 3.4367, "num_input_tokens_seen": 1245184000, "step": 4750 }, { "epoch": 0.0322871945959308, "grad_norm": 0.17202869057655334, "learning_rate": 0.001, "loss": 3.4426, "num_input_tokens_seen": 1258291200, "step": 4800 }, { "epoch": 0.032623519539638414, "grad_norm": 0.17097769677639008, "learning_rate": 0.001, "loss": 3.439, "num_input_tokens_seen": 1271398400, "step": 4850 }, { "epoch": 0.03295984448334603, "grad_norm": 0.1845879703760147, "learning_rate": 0.001, "loss": 3.4263, "num_input_tokens_seen": 1284505600, "step": 4900 }, { "epoch": 0.03329616942705364, "grad_norm": 0.18462544679641724, "learning_rate": 0.001, "loss": 3.4273, "num_input_tokens_seen": 1297612800, "step": 4950 }, { "epoch": 0.033632494370761254, "grad_norm": 0.18930740654468536, "learning_rate": 0.001, "loss": 3.4252, "num_input_tokens_seen": 1310720000, "step": 5000 }, { "epoch": 0.033632494370761254, "eval_loss": 3.3413195610046387, "eval_runtime": 101.3099, "eval_samples_per_second": 49.354, "eval_steps_per_second": 12.338, "num_input_tokens_seen": 1310720000, "step": 5000 }, { "epoch": 0.033968819314468865, "grad_norm": 0.1741451472043991, "learning_rate": 0.001, "loss": 3.4216, "num_input_tokens_seen": 1323827200, "step": 5050 }, { "epoch": 0.03430514425817648, "grad_norm": 0.18628782033920288, "learning_rate": 0.001, "loss": 3.418, "num_input_tokens_seen": 1336934400, "step": 5100 }, { "epoch": 0.034641469201884094, "grad_norm": 0.17286072671413422, "learning_rate": 0.001, "loss": 3.4126, "num_input_tokens_seen": 1350041600, "step": 5150 }, { "epoch": 0.034977794145591705, "grad_norm": 0.18905507028102875, "learning_rate": 0.001, "loss": 3.4169, "num_input_tokens_seen": 1363148800, "step": 5200 }, { "epoch": 0.035314119089299316, "grad_norm": 0.17675209045410156, "learning_rate": 0.001, "loss": 3.4229, "num_input_tokens_seen": 1376256000, "step": 5250 }, { "epoch": 0.03565044403300693, "grad_norm": 0.19063080847263336, "learning_rate": 0.001, "loss": 3.4099, "num_input_tokens_seen": 1389363200, "step": 5300 }, { "epoch": 0.035986768976714545, "grad_norm": 0.1875571757555008, "learning_rate": 0.001, "loss": 3.4185, "num_input_tokens_seen": 1402470400, "step": 5350 }, { "epoch": 0.036323093920422156, "grad_norm": 1.0721184015274048, "learning_rate": 0.001, "loss": 3.4298, "num_input_tokens_seen": 1415577600, "step": 5400 }, { "epoch": 0.03665941886412977, "grad_norm": 0.1907675564289093, "learning_rate": 0.001, "loss": 3.4268, "num_input_tokens_seen": 1428684800, "step": 5450 }, { "epoch": 0.03699574380783738, "grad_norm": 0.18285758793354034, "learning_rate": 0.001, "loss": 3.4036, "num_input_tokens_seen": 1441792000, "step": 5500 }, { "epoch": 0.03699574380783738, "eval_loss": 3.31872820854187, "eval_runtime": 101.3489, "eval_samples_per_second": 49.335, "eval_steps_per_second": 12.334, "num_input_tokens_seen": 1441792000, "step": 5500 }, { "epoch": 0.037332068751544996, "grad_norm": 0.18330691754817963, "learning_rate": 0.001, "loss": 3.3926, "num_input_tokens_seen": 1454899200, "step": 5550 }, { "epoch": 0.03766839369525261, "grad_norm": 0.16875725984573364, "learning_rate": 0.001, "loss": 3.3975, "num_input_tokens_seen": 1468006400, "step": 5600 }, { "epoch": 0.03800471863896022, "grad_norm": 0.18510381877422333, "learning_rate": 0.001, "loss": 3.3964, "num_input_tokens_seen": 1481113600, "step": 5650 }, { "epoch": 0.03834104358266783, "grad_norm": 0.16602838039398193, "learning_rate": 0.001, "loss": 3.3819, "num_input_tokens_seen": 1494220800, "step": 5700 }, { "epoch": 0.03867736852637544, "grad_norm": 0.16771391034126282, "learning_rate": 0.001, "loss": 3.3968, "num_input_tokens_seen": 1507328000, "step": 5750 }, { "epoch": 0.03901369347008306, "grad_norm": 0.16801221668720245, "learning_rate": 0.001, "loss": 3.3921, "num_input_tokens_seen": 1520435200, "step": 5800 }, { "epoch": 0.03935001841379067, "grad_norm": 0.16846245527267456, "learning_rate": 0.001, "loss": 3.3841, "num_input_tokens_seen": 1533542400, "step": 5850 }, { "epoch": 0.03968634335749828, "grad_norm": 0.17359821498394012, "learning_rate": 0.001, "loss": 3.372, "num_input_tokens_seen": 1546649600, "step": 5900 }, { "epoch": 0.04002266830120589, "grad_norm": 0.1578226536512375, "learning_rate": 0.001, "loss": 3.3826, "num_input_tokens_seen": 1559756800, "step": 5950 }, { "epoch": 0.04035899324491351, "grad_norm": 0.17228208482265472, "learning_rate": 0.001, "loss": 3.3953, "num_input_tokens_seen": 1572864000, "step": 6000 }, { "epoch": 0.04035899324491351, "eval_loss": 3.293433666229248, "eval_runtime": 100.6763, "eval_samples_per_second": 49.664, "eval_steps_per_second": 12.416, "num_input_tokens_seen": 1572864000, "step": 6000 }, { "epoch": 0.04069531818862112, "grad_norm": 0.18091177940368652, "learning_rate": 0.001, "loss": 3.3783, "num_input_tokens_seen": 1585971200, "step": 6050 }, { "epoch": 0.04103164313232873, "grad_norm": 0.17565099895000458, "learning_rate": 0.001, "loss": 3.3773, "num_input_tokens_seen": 1599078400, "step": 6100 }, { "epoch": 0.04136796807603634, "grad_norm": 0.1635759323835373, "learning_rate": 0.001, "loss": 3.3823, "num_input_tokens_seen": 1612185600, "step": 6150 }, { "epoch": 0.04170429301974395, "grad_norm": 0.19144974648952484, "learning_rate": 0.001, "loss": 3.3665, "num_input_tokens_seen": 1625292800, "step": 6200 }, { "epoch": 0.04204061796345157, "grad_norm": 0.1741226762533188, "learning_rate": 0.001, "loss": 3.3637, "num_input_tokens_seen": 1638400000, "step": 6250 }, { "epoch": 0.04237694290715918, "grad_norm": 0.17072845995426178, "learning_rate": 0.001, "loss": 3.3627, "num_input_tokens_seen": 1651507200, "step": 6300 }, { "epoch": 0.04271326785086679, "grad_norm": 0.16942182183265686, "learning_rate": 0.001, "loss": 3.3729, "num_input_tokens_seen": 1664614400, "step": 6350 }, { "epoch": 0.043049592794574404, "grad_norm": 0.16412265598773956, "learning_rate": 0.001, "loss": 3.3616, "num_input_tokens_seen": 1677721600, "step": 6400 }, { "epoch": 0.04338591773828202, "grad_norm": 0.17044900357723236, "learning_rate": 0.001, "loss": 3.3574, "num_input_tokens_seen": 1690828800, "step": 6450 }, { "epoch": 0.04372224268198963, "grad_norm": 0.18034328520298004, "learning_rate": 0.001, "loss": 3.3625, "num_input_tokens_seen": 1703936000, "step": 6500 }, { "epoch": 0.04372224268198963, "eval_loss": 3.2744550704956055, "eval_runtime": 101.1801, "eval_samples_per_second": 49.417, "eval_steps_per_second": 12.354, "num_input_tokens_seen": 1703936000, "step": 6500 }, { "epoch": 0.044058567625697244, "grad_norm": 0.169066920876503, "learning_rate": 0.001, "loss": 3.3569, "num_input_tokens_seen": 1717043200, "step": 6550 }, { "epoch": 0.044394892569404855, "grad_norm": 0.1789105087518692, "learning_rate": 0.001, "loss": 3.3636, "num_input_tokens_seen": 1730150400, "step": 6600 }, { "epoch": 0.044731217513112466, "grad_norm": 0.2083519697189331, "learning_rate": 0.001, "loss": 3.3565, "num_input_tokens_seen": 1743257600, "step": 6650 }, { "epoch": 0.045067542456820084, "grad_norm": 0.16989745199680328, "learning_rate": 0.001, "loss": 3.3562, "num_input_tokens_seen": 1756364800, "step": 6700 }, { "epoch": 0.045403867400527695, "grad_norm": 0.16275504231452942, "learning_rate": 0.001, "loss": 3.3625, "num_input_tokens_seen": 1769472000, "step": 6750 }, { "epoch": 0.045740192344235306, "grad_norm": 0.17771874368190765, "learning_rate": 0.001, "loss": 3.3477, "num_input_tokens_seen": 1782579200, "step": 6800 }, { "epoch": 0.04607651728794292, "grad_norm": 0.1635473072528839, "learning_rate": 0.001, "loss": 3.3513, "num_input_tokens_seen": 1795686400, "step": 6850 }, { "epoch": 0.046412842231650535, "grad_norm": 0.17198441922664642, "learning_rate": 0.001, "loss": 3.3461, "num_input_tokens_seen": 1808793600, "step": 6900 }, { "epoch": 0.046749167175358146, "grad_norm": 0.174327552318573, "learning_rate": 0.001, "loss": 3.3519, "num_input_tokens_seen": 1821900800, "step": 6950 }, { "epoch": 0.04708549211906576, "grad_norm": 0.17880085110664368, "learning_rate": 0.001, "loss": 3.3387, "num_input_tokens_seen": 1835008000, "step": 7000 }, { "epoch": 0.04708549211906576, "eval_loss": 3.2563092708587646, "eval_runtime": 100.9949, "eval_samples_per_second": 49.507, "eval_steps_per_second": 12.377, "num_input_tokens_seen": 1835008000, "step": 7000 }, { "epoch": 0.04742181706277337, "grad_norm": 0.18561717867851257, "learning_rate": 0.001, "loss": 3.3436, "num_input_tokens_seen": 1848115200, "step": 7050 }, { "epoch": 0.04775814200648098, "grad_norm": 0.17194584012031555, "learning_rate": 0.001, "loss": 3.3391, "num_input_tokens_seen": 1861222400, "step": 7100 }, { "epoch": 0.0480944669501886, "grad_norm": 0.16629651188850403, "learning_rate": 0.001, "loss": 3.3419, "num_input_tokens_seen": 1874329600, "step": 7150 }, { "epoch": 0.04843079189389621, "grad_norm": 0.1665981113910675, "learning_rate": 0.001, "loss": 3.3422, "num_input_tokens_seen": 1887436800, "step": 7200 }, { "epoch": 0.04876711683760382, "grad_norm": 0.17213182151317596, "learning_rate": 0.001, "loss": 3.3389, "num_input_tokens_seen": 1900544000, "step": 7250 }, { "epoch": 0.04910344178131143, "grad_norm": 0.18480969965457916, "learning_rate": 0.001, "loss": 3.3257, "num_input_tokens_seen": 1913651200, "step": 7300 }, { "epoch": 0.04943976672501905, "grad_norm": 0.17105132341384888, "learning_rate": 0.001, "loss": 3.3339, "num_input_tokens_seen": 1926758400, "step": 7350 }, { "epoch": 0.04977609166872666, "grad_norm": 0.17547503113746643, "learning_rate": 0.001, "loss": 3.3364, "num_input_tokens_seen": 1939865600, "step": 7400 }, { "epoch": 0.05011241661243427, "grad_norm": 0.16320562362670898, "learning_rate": 0.001, "loss": 3.3349, "num_input_tokens_seen": 1952972800, "step": 7450 }, { "epoch": 0.05044874155614188, "grad_norm": 0.16704347729682922, "learning_rate": 0.001, "loss": 3.3459, "num_input_tokens_seen": 1966080000, "step": 7500 }, { "epoch": 0.05044874155614188, "eval_loss": 3.241454601287842, "eval_runtime": 100.6752, "eval_samples_per_second": 49.665, "eval_steps_per_second": 12.416, "num_input_tokens_seen": 1966080000, "step": 7500 }, { "epoch": 0.05078506649984949, "grad_norm": 0.18855977058410645, "learning_rate": 0.001, "loss": 3.3298, "num_input_tokens_seen": 1979187200, "step": 7550 }, { "epoch": 0.05112139144355711, "grad_norm": 0.16146792471408844, "learning_rate": 0.001, "loss": 3.324, "num_input_tokens_seen": 1992294400, "step": 7600 }, { "epoch": 0.05145771638726472, "grad_norm": 0.1644527018070221, "learning_rate": 0.001, "loss": 3.3306, "num_input_tokens_seen": 2005401600, "step": 7650 }, { "epoch": 0.05179404133097233, "grad_norm": 0.17106670141220093, "learning_rate": 0.001, "loss": 3.33, "num_input_tokens_seen": 2018508800, "step": 7700 }, { "epoch": 0.05213036627467994, "grad_norm": 0.1606895476579666, "learning_rate": 0.001, "loss": 3.3188, "num_input_tokens_seen": 2031616000, "step": 7750 }, { "epoch": 0.052466691218387554, "grad_norm": 0.16948160529136658, "learning_rate": 0.001, "loss": 3.3306, "num_input_tokens_seen": 2044723200, "step": 7800 }, { "epoch": 0.05280301616209517, "grad_norm": 0.20683230459690094, "learning_rate": 0.001, "loss": 3.3203, "num_input_tokens_seen": 2057830400, "step": 7850 }, { "epoch": 0.05313934110580278, "grad_norm": 0.161922886967659, "learning_rate": 0.001, "loss": 3.3161, "num_input_tokens_seen": 2070937600, "step": 7900 }, { "epoch": 0.053475666049510394, "grad_norm": 0.1616695076227188, "learning_rate": 0.001, "loss": 3.3245, "num_input_tokens_seen": 2084044800, "step": 7950 }, { "epoch": 0.053811990993218005, "grad_norm": 0.1723030060529709, "learning_rate": 0.001, "loss": 3.3143, "num_input_tokens_seen": 2097152000, "step": 8000 }, { "epoch": 0.053811990993218005, "eval_loss": 3.2274885177612305, "eval_runtime": 100.3755, "eval_samples_per_second": 49.813, "eval_steps_per_second": 12.453, "num_input_tokens_seen": 2097152000, "step": 8000 }, { "epoch": 0.05414831593692562, "grad_norm": 0.16236628592014313, "learning_rate": 0.001, "loss": 3.3214, "num_input_tokens_seen": 2110259200, "step": 8050 }, { "epoch": 0.054484640880633234, "grad_norm": 0.1676984280347824, "learning_rate": 0.001, "loss": 3.3152, "num_input_tokens_seen": 2123366400, "step": 8100 }, { "epoch": 0.054820965824340845, "grad_norm": 0.16020448505878448, "learning_rate": 0.001, "loss": 3.3134, "num_input_tokens_seen": 2136473600, "step": 8150 }, { "epoch": 0.055157290768048456, "grad_norm": 0.1649223119020462, "learning_rate": 0.001, "loss": 3.3121, "num_input_tokens_seen": 2149580800, "step": 8200 }, { "epoch": 0.05549361571175607, "grad_norm": 0.1627037674188614, "learning_rate": 0.001, "loss": 3.3092, "num_input_tokens_seen": 2162688000, "step": 8250 }, { "epoch": 0.055829940655463685, "grad_norm": 0.17913097143173218, "learning_rate": 0.001, "loss": 3.3032, "num_input_tokens_seen": 2175795200, "step": 8300 }, { "epoch": 0.056166265599171296, "grad_norm": 0.18965736031532288, "learning_rate": 0.001, "loss": 3.3075, "num_input_tokens_seen": 2188902400, "step": 8350 }, { "epoch": 0.05650259054287891, "grad_norm": 0.16027510166168213, "learning_rate": 0.001, "loss": 3.3086, "num_input_tokens_seen": 2202009600, "step": 8400 }, { "epoch": 0.05683891548658652, "grad_norm": 0.16940778493881226, "learning_rate": 0.001, "loss": 3.2849, "num_input_tokens_seen": 2215116800, "step": 8450 }, { "epoch": 0.057175240430294136, "grad_norm": 0.17754122614860535, "learning_rate": 0.001, "loss": 3.2975, "num_input_tokens_seen": 2228224000, "step": 8500 }, { "epoch": 0.057175240430294136, "eval_loss": 3.2149288654327393, "eval_runtime": 101.5641, "eval_samples_per_second": 49.23, "eval_steps_per_second": 12.307, "num_input_tokens_seen": 2228224000, "step": 8500 }, { "epoch": 0.05751156537400175, "grad_norm": 0.1716330349445343, "learning_rate": 0.001, "loss": 3.3091, "num_input_tokens_seen": 2241331200, "step": 8550 }, { "epoch": 0.05784789031770936, "grad_norm": 0.16466470062732697, "learning_rate": 0.001, "loss": 3.2934, "num_input_tokens_seen": 2254438400, "step": 8600 }, { "epoch": 0.05818421526141697, "grad_norm": 0.1640830636024475, "learning_rate": 0.001, "loss": 3.2986, "num_input_tokens_seen": 2267545600, "step": 8650 }, { "epoch": 0.05852054020512458, "grad_norm": 0.16982024908065796, "learning_rate": 0.001, "loss": 3.3016, "num_input_tokens_seen": 2280652800, "step": 8700 }, { "epoch": 0.0588568651488322, "grad_norm": 0.1577749252319336, "learning_rate": 0.001, "loss": 3.2961, "num_input_tokens_seen": 2293760000, "step": 8750 }, { "epoch": 0.05919319009253981, "grad_norm": 0.1626594513654709, "learning_rate": 0.001, "loss": 3.293, "num_input_tokens_seen": 2306867200, "step": 8800 }, { "epoch": 0.05952951503624742, "grad_norm": 0.18469755351543427, "learning_rate": 0.001, "loss": 3.2919, "num_input_tokens_seen": 2319974400, "step": 8850 }, { "epoch": 0.05986583997995503, "grad_norm": 0.17915847897529602, "learning_rate": 0.001, "loss": 3.2914, "num_input_tokens_seen": 2333081600, "step": 8900 }, { "epoch": 0.06020216492366265, "grad_norm": 0.17483194172382355, "learning_rate": 0.001, "loss": 3.2855, "num_input_tokens_seen": 2346188800, "step": 8950 }, { "epoch": 0.06053848986737026, "grad_norm": 0.16408763825893402, "learning_rate": 0.001, "loss": 3.2817, "num_input_tokens_seen": 2359296000, "step": 9000 }, { "epoch": 0.06053848986737026, "eval_loss": 3.201568603515625, "eval_runtime": 100.2925, "eval_samples_per_second": 49.854, "eval_steps_per_second": 12.464, "num_input_tokens_seen": 2359296000, "step": 9000 }, { "epoch": 0.06087481481107787, "grad_norm": 0.15520979464054108, "learning_rate": 0.001, "loss": 3.2917, "num_input_tokens_seen": 2372403200, "step": 9050 }, { "epoch": 0.06121113975478548, "grad_norm": 0.19632326066493988, "learning_rate": 0.001, "loss": 3.3164, "num_input_tokens_seen": 2385510400, "step": 9100 }, { "epoch": 0.06154746469849309, "grad_norm": 0.17335627973079681, "learning_rate": 0.001, "loss": 3.3025, "num_input_tokens_seen": 2398617600, "step": 9150 }, { "epoch": 0.06188378964220071, "grad_norm": 0.18116877973079681, "learning_rate": 0.001, "loss": 3.2857, "num_input_tokens_seen": 2411724800, "step": 9200 }, { "epoch": 0.06222011458590832, "grad_norm": 0.17199201881885529, "learning_rate": 0.001, "loss": 3.2901, "num_input_tokens_seen": 2424832000, "step": 9250 }, { "epoch": 0.06255643952961594, "grad_norm": 0.163723424077034, "learning_rate": 0.001, "loss": 3.2865, "num_input_tokens_seen": 2437939200, "step": 9300 }, { "epoch": 0.06289276447332355, "grad_norm": 0.17228147387504578, "learning_rate": 0.001, "loss": 3.2884, "num_input_tokens_seen": 2451046400, "step": 9350 }, { "epoch": 0.06322908941703116, "grad_norm": 0.1656276136636734, "learning_rate": 0.001, "loss": 3.2871, "num_input_tokens_seen": 2464153600, "step": 9400 }, { "epoch": 0.06356541436073877, "grad_norm": 0.16867949068546295, "learning_rate": 0.001, "loss": 3.2817, "num_input_tokens_seen": 2477260800, "step": 9450 }, { "epoch": 0.06390173930444638, "grad_norm": 0.17453493177890778, "learning_rate": 0.001, "loss": 3.2876, "num_input_tokens_seen": 2490368000, "step": 9500 }, { "epoch": 0.06390173930444638, "eval_loss": 3.190683364868164, "eval_runtime": 100.5146, "eval_samples_per_second": 49.744, "eval_steps_per_second": 12.436, "num_input_tokens_seen": 2490368000, "step": 9500 }, { "epoch": 0.064238064248154, "grad_norm": 0.6719958186149597, "learning_rate": 0.001, "loss": 3.299, "num_input_tokens_seen": 2503475200, "step": 9550 }, { "epoch": 0.0645743891918616, "grad_norm": 0.19327396154403687, "learning_rate": 0.001, "loss": 3.2907, "num_input_tokens_seen": 2516582400, "step": 9600 }, { "epoch": 0.06491071413556922, "grad_norm": 0.17817369103431702, "learning_rate": 0.001, "loss": 3.2801, "num_input_tokens_seen": 2529689600, "step": 9650 }, { "epoch": 0.06524703907927683, "grad_norm": 0.16956672072410583, "learning_rate": 0.001, "loss": 3.2761, "num_input_tokens_seen": 2542796800, "step": 9700 }, { "epoch": 0.06558336402298445, "grad_norm": 0.1854093372821808, "learning_rate": 0.001, "loss": 3.27, "num_input_tokens_seen": 2555904000, "step": 9750 }, { "epoch": 0.06591968896669206, "grad_norm": 0.15702186524868011, "learning_rate": 0.001, "loss": 3.2792, "num_input_tokens_seen": 2569011200, "step": 9800 }, { "epoch": 0.06625601391039967, "grad_norm": 0.16380488872528076, "learning_rate": 0.001, "loss": 3.2672, "num_input_tokens_seen": 2582118400, "step": 9850 }, { "epoch": 0.06659233885410729, "grad_norm": 0.15908506512641907, "learning_rate": 0.001, "loss": 3.2665, "num_input_tokens_seen": 2595225600, "step": 9900 }, { "epoch": 0.0669286637978149, "grad_norm": 0.1654980629682541, "learning_rate": 0.001, "loss": 3.2726, "num_input_tokens_seen": 2608332800, "step": 9950 }, { "epoch": 0.06726498874152251, "grad_norm": 0.1780249923467636, "learning_rate": 0.001, "loss": 3.2632, "num_input_tokens_seen": 2621440000, "step": 10000 }, { "epoch": 0.06726498874152251, "eval_loss": 3.1775221824645996, "eval_runtime": 100.6676, "eval_samples_per_second": 49.668, "eval_steps_per_second": 12.417, "num_input_tokens_seen": 2621440000, "step": 10000 }, { "epoch": 0.06760131368523012, "grad_norm": 0.1572471708059311, "learning_rate": 0.001, "loss": 3.2652, "num_input_tokens_seen": 2634547200, "step": 10050 }, { "epoch": 0.06793763862893773, "grad_norm": 0.1655690222978592, "learning_rate": 0.001, "loss": 3.2753, "num_input_tokens_seen": 2647654400, "step": 10100 }, { "epoch": 0.06827396357264534, "grad_norm": 0.18156391382217407, "learning_rate": 0.001, "loss": 3.2717, "num_input_tokens_seen": 2660761600, "step": 10150 }, { "epoch": 0.06861028851635297, "grad_norm": 0.1684606373310089, "learning_rate": 0.001, "loss": 3.269, "num_input_tokens_seen": 2673868800, "step": 10200 }, { "epoch": 0.06894661346006058, "grad_norm": 0.18199962377548218, "learning_rate": 0.001, "loss": 3.2591, "num_input_tokens_seen": 2686976000, "step": 10250 }, { "epoch": 0.06928293840376819, "grad_norm": 0.1662759929895401, "learning_rate": 0.001, "loss": 3.2674, "num_input_tokens_seen": 2700083200, "step": 10300 }, { "epoch": 0.0696192633474758, "grad_norm": 0.16799511015415192, "learning_rate": 0.001, "loss": 3.2788, "num_input_tokens_seen": 2713190400, "step": 10350 }, { "epoch": 0.06995558829118341, "grad_norm": 0.17926375567913055, "learning_rate": 0.001, "loss": 3.2742, "num_input_tokens_seen": 2726297600, "step": 10400 }, { "epoch": 0.07029191323489102, "grad_norm": 0.18057045340538025, "learning_rate": 0.001, "loss": 3.2662, "num_input_tokens_seen": 2739404800, "step": 10450 }, { "epoch": 0.07062823817859863, "grad_norm": 0.17588871717453003, "learning_rate": 0.001, "loss": 3.2577, "num_input_tokens_seen": 2752512000, "step": 10500 }, { "epoch": 0.07062823817859863, "eval_loss": 3.168182849884033, "eval_runtime": 100.5761, "eval_samples_per_second": 49.714, "eval_steps_per_second": 12.428, "num_input_tokens_seen": 2752512000, "step": 10500 }, { "epoch": 0.07096456312230624, "grad_norm": 0.1731673628091812, "learning_rate": 0.001, "loss": 3.2565, "num_input_tokens_seen": 2765619200, "step": 10550 }, { "epoch": 0.07130088806601385, "grad_norm": 0.16532014310359955, "learning_rate": 0.001, "loss": 3.2511, "num_input_tokens_seen": 2778726400, "step": 10600 }, { "epoch": 0.07163721300972148, "grad_norm": 0.17818772792816162, "learning_rate": 0.001, "loss": 3.2541, "num_input_tokens_seen": 2791833600, "step": 10650 }, { "epoch": 0.07197353795342909, "grad_norm": 0.16863703727722168, "learning_rate": 0.001, "loss": 3.2485, "num_input_tokens_seen": 2804940800, "step": 10700 }, { "epoch": 0.0723098628971367, "grad_norm": 0.17316773533821106, "learning_rate": 0.001, "loss": 3.2583, "num_input_tokens_seen": 2818048000, "step": 10750 }, { "epoch": 0.07264618784084431, "grad_norm": 0.16366828978061676, "learning_rate": 0.001, "loss": 3.2502, "num_input_tokens_seen": 2831155200, "step": 10800 }, { "epoch": 0.07298251278455192, "grad_norm": 0.16141986846923828, "learning_rate": 0.001, "loss": 3.2537, "num_input_tokens_seen": 2844262400, "step": 10850 }, { "epoch": 0.07331883772825953, "grad_norm": 0.16185277700424194, "learning_rate": 0.001, "loss": 3.2453, "num_input_tokens_seen": 2857369600, "step": 10900 }, { "epoch": 0.07365516267196714, "grad_norm": 0.15637634694576263, "learning_rate": 0.001, "loss": 3.2562, "num_input_tokens_seen": 2870476800, "step": 10950 }, { "epoch": 0.07399148761567476, "grad_norm": 0.16142712533473969, "learning_rate": 0.001, "loss": 3.2427, "num_input_tokens_seen": 2883584000, "step": 11000 }, { "epoch": 0.07399148761567476, "eval_loss": 3.1591553688049316, "eval_runtime": 100.7501, "eval_samples_per_second": 49.628, "eval_steps_per_second": 12.407, "num_input_tokens_seen": 2883584000, "step": 11000 }, { "epoch": 0.07432781255938237, "grad_norm": 0.16285482048988342, "learning_rate": 0.001, "loss": 3.2556, "num_input_tokens_seen": 2896691200, "step": 11050 }, { "epoch": 0.07466413750308999, "grad_norm": 0.1800818145275116, "learning_rate": 0.001, "loss": 3.2498, "num_input_tokens_seen": 2909798400, "step": 11100 }, { "epoch": 0.0750004624467976, "grad_norm": 0.1587436944246292, "learning_rate": 0.001, "loss": 3.2454, "num_input_tokens_seen": 2922905600, "step": 11150 }, { "epoch": 0.07533678739050521, "grad_norm": 0.17776361107826233, "learning_rate": 0.001, "loss": 3.2671, "num_input_tokens_seen": 2936012800, "step": 11200 }, { "epoch": 0.07567311233421282, "grad_norm": 0.16090282797813416, "learning_rate": 0.001, "loss": 3.2582, "num_input_tokens_seen": 2949120000, "step": 11250 }, { "epoch": 0.07600943727792044, "grad_norm": 0.1685740053653717, "learning_rate": 0.001, "loss": 3.2485, "num_input_tokens_seen": 2962227200, "step": 11300 }, { "epoch": 0.07634576222162805, "grad_norm": 0.16622695326805115, "learning_rate": 0.001, "loss": 3.2517, "num_input_tokens_seen": 2975334400, "step": 11350 }, { "epoch": 0.07668208716533566, "grad_norm": 0.2576703131198883, "learning_rate": 0.001, "loss": 3.2536, "num_input_tokens_seen": 2988441600, "step": 11400 }, { "epoch": 0.07701841210904327, "grad_norm": 0.16928231716156006, "learning_rate": 0.001, "loss": 3.234, "num_input_tokens_seen": 3001548800, "step": 11450 }, { "epoch": 0.07735473705275088, "grad_norm": 0.16732951998710632, "learning_rate": 0.001, "loss": 3.2421, "num_input_tokens_seen": 3014656000, "step": 11500 }, { "epoch": 0.07735473705275088, "eval_loss": 3.1493282318115234, "eval_runtime": 100.6579, "eval_samples_per_second": 49.673, "eval_steps_per_second": 12.418, "num_input_tokens_seen": 3014656000, "step": 11500 }, { "epoch": 0.0776910619964585, "grad_norm": 0.1627015471458435, "learning_rate": 0.001, "loss": 3.2449, "num_input_tokens_seen": 3027763200, "step": 11550 }, { "epoch": 0.07802738694016612, "grad_norm": 0.1591007262468338, "learning_rate": 0.001, "loss": 3.2405, "num_input_tokens_seen": 3040870400, "step": 11600 }, { "epoch": 0.07836371188387373, "grad_norm": 0.16861042380332947, "learning_rate": 0.001, "loss": 3.2371, "num_input_tokens_seen": 3053977600, "step": 11650 }, { "epoch": 0.07870003682758134, "grad_norm": 0.17942191660404205, "learning_rate": 0.001, "loss": 3.2401, "num_input_tokens_seen": 3067084800, "step": 11700 }, { "epoch": 0.07903636177128895, "grad_norm": 0.19918367266654968, "learning_rate": 0.001, "loss": 3.2522, "num_input_tokens_seen": 3080192000, "step": 11750 }, { "epoch": 0.07937268671499656, "grad_norm": 0.20974946022033691, "learning_rate": 0.001, "loss": 3.2476, "num_input_tokens_seen": 3093299200, "step": 11800 }, { "epoch": 0.07970901165870417, "grad_norm": 0.17063277959823608, "learning_rate": 0.001, "loss": 3.2461, "num_input_tokens_seen": 3106406400, "step": 11850 }, { "epoch": 0.08004533660241178, "grad_norm": 0.17285390198230743, "learning_rate": 0.001, "loss": 3.2389, "num_input_tokens_seen": 3119513600, "step": 11900 }, { "epoch": 0.08038166154611939, "grad_norm": 0.16399264335632324, "learning_rate": 0.001, "loss": 3.2354, "num_input_tokens_seen": 3132620800, "step": 11950 }, { "epoch": 0.08071798648982702, "grad_norm": 0.17166489362716675, "learning_rate": 0.001, "loss": 3.2393, "num_input_tokens_seen": 3145728000, "step": 12000 }, { "epoch": 0.08071798648982702, "eval_loss": 3.1431546211242676, "eval_runtime": 100.4686, "eval_samples_per_second": 49.767, "eval_steps_per_second": 12.442, "num_input_tokens_seen": 3145728000, "step": 12000 }, { "epoch": 0.08105431143353463, "grad_norm": 0.16976477205753326, "learning_rate": 0.001, "loss": 3.2367, "num_input_tokens_seen": 3158835200, "step": 12050 }, { "epoch": 0.08139063637724224, "grad_norm": 0.17778240144252777, "learning_rate": 0.001, "loss": 3.2346, "num_input_tokens_seen": 3171942400, "step": 12100 }, { "epoch": 0.08172696132094985, "grad_norm": 0.17096461355686188, "learning_rate": 0.001, "loss": 3.235, "num_input_tokens_seen": 3185049600, "step": 12150 }, { "epoch": 0.08206328626465746, "grad_norm": 0.16154351830482483, "learning_rate": 0.001, "loss": 3.2263, "num_input_tokens_seen": 3198156800, "step": 12200 }, { "epoch": 0.08239961120836507, "grad_norm": 0.23045915365219116, "learning_rate": 0.001, "loss": 3.2295, "num_input_tokens_seen": 3211264000, "step": 12250 }, { "epoch": 0.08273593615207268, "grad_norm": 0.17755016684532166, "learning_rate": 0.001, "loss": 3.2465, "num_input_tokens_seen": 3224371200, "step": 12300 }, { "epoch": 0.0830722610957803, "grad_norm": 0.17216768860816956, "learning_rate": 0.001, "loss": 3.2353, "num_input_tokens_seen": 3237478400, "step": 12350 }, { "epoch": 0.0834085860394879, "grad_norm": 0.166086345911026, "learning_rate": 0.001, "loss": 3.231, "num_input_tokens_seen": 3250585600, "step": 12400 }, { "epoch": 0.08374491098319553, "grad_norm": 0.1681985855102539, "learning_rate": 0.001, "loss": 3.2343, "num_input_tokens_seen": 3263692800, "step": 12450 }, { "epoch": 0.08408123592690314, "grad_norm": 0.1611029952764511, "learning_rate": 0.001, "loss": 3.2386, "num_input_tokens_seen": 3276800000, "step": 12500 }, { "epoch": 0.08408123592690314, "eval_loss": 3.135470390319824, "eval_runtime": 99.9587, "eval_samples_per_second": 50.021, "eval_steps_per_second": 12.505, "num_input_tokens_seen": 3276800000, "step": 12500 }, { "epoch": 0.08441756087061075, "grad_norm": 0.16823448240756989, "learning_rate": 0.001, "loss": 3.2315, "num_input_tokens_seen": 3289907200, "step": 12550 }, { "epoch": 0.08475388581431836, "grad_norm": 0.17325358092784882, "learning_rate": 0.001, "loss": 3.2258, "num_input_tokens_seen": 3303014400, "step": 12600 }, { "epoch": 0.08509021075802597, "grad_norm": 0.16828718781471252, "learning_rate": 0.001, "loss": 3.2251, "num_input_tokens_seen": 3316121600, "step": 12650 }, { "epoch": 0.08542653570173359, "grad_norm": 0.3836762309074402, "learning_rate": 0.001, "loss": 3.2279, "num_input_tokens_seen": 3329228800, "step": 12700 }, { "epoch": 0.0857628606454412, "grad_norm": 0.17255236208438873, "learning_rate": 0.001, "loss": 3.221, "num_input_tokens_seen": 3342336000, "step": 12750 }, { "epoch": 0.08609918558914881, "grad_norm": 0.2381184846162796, "learning_rate": 0.001, "loss": 3.2228, "num_input_tokens_seen": 3355443200, "step": 12800 }, { "epoch": 0.08643551053285642, "grad_norm": 0.3065573573112488, "learning_rate": 0.001, "loss": 3.2251, "num_input_tokens_seen": 3368550400, "step": 12850 }, { "epoch": 0.08677183547656404, "grad_norm": 0.1801990419626236, "learning_rate": 0.001, "loss": 3.2488, "num_input_tokens_seen": 3381657600, "step": 12900 }, { "epoch": 0.08710816042027165, "grad_norm": 0.17388571798801422, "learning_rate": 0.001, "loss": 3.2309, "num_input_tokens_seen": 3394764800, "step": 12950 }, { "epoch": 0.08744448536397927, "grad_norm": 0.16619688272476196, "learning_rate": 0.001, "loss": 3.2158, "num_input_tokens_seen": 3407872000, "step": 13000 }, { "epoch": 0.08744448536397927, "eval_loss": 3.128695011138916, "eval_runtime": 101.514, "eval_samples_per_second": 49.254, "eval_steps_per_second": 12.314, "num_input_tokens_seen": 3407872000, "step": 13000 }, { "epoch": 0.08778081030768688, "grad_norm": 0.16921883821487427, "learning_rate": 0.001, "loss": 3.2157, "num_input_tokens_seen": 3420979200, "step": 13050 }, { "epoch": 0.08811713525139449, "grad_norm": 0.16760320961475372, "learning_rate": 0.001, "loss": 3.2057, "num_input_tokens_seen": 3434086400, "step": 13100 }, { "epoch": 0.0884534601951021, "grad_norm": 0.16922198235988617, "learning_rate": 0.001, "loss": 3.2281, "num_input_tokens_seen": 3447193600, "step": 13150 }, { "epoch": 0.08878978513880971, "grad_norm": 0.1857660859823227, "learning_rate": 0.001, "loss": 3.2107, "num_input_tokens_seen": 3460300800, "step": 13200 }, { "epoch": 0.08912611008251732, "grad_norm": 0.1746143400669098, "learning_rate": 0.001, "loss": 3.217, "num_input_tokens_seen": 3473408000, "step": 13250 }, { "epoch": 0.08946243502622493, "grad_norm": 0.16841556131839752, "learning_rate": 0.001, "loss": 3.2241, "num_input_tokens_seen": 3486515200, "step": 13300 }, { "epoch": 0.08979875996993256, "grad_norm": 0.1724822223186493, "learning_rate": 0.001, "loss": 3.2158, "num_input_tokens_seen": 3499622400, "step": 13350 }, { "epoch": 0.09013508491364017, "grad_norm": 0.17045529186725616, "learning_rate": 0.001, "loss": 3.2135, "num_input_tokens_seen": 3512729600, "step": 13400 }, { "epoch": 0.09047140985734778, "grad_norm": 0.18234893679618835, "learning_rate": 0.001, "loss": 3.2109, "num_input_tokens_seen": 3525836800, "step": 13450 }, { "epoch": 0.09080773480105539, "grad_norm": 0.16932611167430878, "learning_rate": 0.001, "loss": 3.2117, "num_input_tokens_seen": 3538944000, "step": 13500 }, { "epoch": 0.09080773480105539, "eval_loss": 3.1214168071746826, "eval_runtime": 100.8175, "eval_samples_per_second": 49.595, "eval_steps_per_second": 12.399, "num_input_tokens_seen": 3538944000, "step": 13500 }, { "epoch": 0.091144059744763, "grad_norm": 0.17800532281398773, "learning_rate": 0.001, "loss": 3.2005, "num_input_tokens_seen": 3552051200, "step": 13550 }, { "epoch": 0.09148038468847061, "grad_norm": 0.15552346408367157, "learning_rate": 0.001, "loss": 3.2165, "num_input_tokens_seen": 3565158400, "step": 13600 }, { "epoch": 0.09181670963217822, "grad_norm": 0.1732388734817505, "learning_rate": 0.001, "loss": 3.2159, "num_input_tokens_seen": 3578265600, "step": 13650 }, { "epoch": 0.09215303457588583, "grad_norm": 0.17064529657363892, "learning_rate": 0.001, "loss": 3.2213, "num_input_tokens_seen": 3591372800, "step": 13700 }, { "epoch": 0.09248935951959344, "grad_norm": 0.18150164186954498, "learning_rate": 0.001, "loss": 3.2207, "num_input_tokens_seen": 3604480000, "step": 13750 }, { "epoch": 0.09282568446330107, "grad_norm": 0.16305723786354065, "learning_rate": 0.001, "loss": 3.2112, "num_input_tokens_seen": 3617587200, "step": 13800 }, { "epoch": 0.09316200940700868, "grad_norm": 0.17140090465545654, "learning_rate": 0.001, "loss": 3.2064, "num_input_tokens_seen": 3630694400, "step": 13850 }, { "epoch": 0.09349833435071629, "grad_norm": 0.3770304024219513, "learning_rate": 0.001, "loss": 3.2129, "num_input_tokens_seen": 3643801600, "step": 13900 }, { "epoch": 0.0938346592944239, "grad_norm": 0.15605700016021729, "learning_rate": 0.001, "loss": 3.2194, "num_input_tokens_seen": 3656908800, "step": 13950 }, { "epoch": 0.09417098423813151, "grad_norm": 0.18392467498779297, "learning_rate": 0.001, "loss": 3.2057, "num_input_tokens_seen": 3670016000, "step": 14000 }, { "epoch": 0.09417098423813151, "eval_loss": 3.1151933670043945, "eval_runtime": 100.6439, "eval_samples_per_second": 49.68, "eval_steps_per_second": 12.42, "num_input_tokens_seen": 3670016000, "step": 14000 }, { "epoch": 0.09450730918183912, "grad_norm": 0.17042067646980286, "learning_rate": 0.001, "loss": 3.2079, "num_input_tokens_seen": 3683123200, "step": 14050 }, { "epoch": 0.09484363412554674, "grad_norm": 0.1771795153617859, "learning_rate": 0.001, "loss": 3.2074, "num_input_tokens_seen": 3696230400, "step": 14100 }, { "epoch": 0.09517995906925435, "grad_norm": 0.18254883587360382, "learning_rate": 0.001, "loss": 3.1999, "num_input_tokens_seen": 3709337600, "step": 14150 }, { "epoch": 0.09551628401296196, "grad_norm": 0.17174501717090607, "learning_rate": 0.001, "loss": 3.2066, "num_input_tokens_seen": 3722444800, "step": 14200 }, { "epoch": 0.09585260895666958, "grad_norm": 0.15733762085437775, "learning_rate": 0.001, "loss": 3.1931, "num_input_tokens_seen": 3735552000, "step": 14250 }, { "epoch": 0.0961889339003772, "grad_norm": 0.17221161723136902, "learning_rate": 0.001, "loss": 3.2055, "num_input_tokens_seen": 3748659200, "step": 14300 }, { "epoch": 0.0965252588440848, "grad_norm": 0.2117476761341095, "learning_rate": 0.001, "loss": 3.2046, "num_input_tokens_seen": 3761766400, "step": 14350 }, { "epoch": 0.09686158378779242, "grad_norm": 0.19019798934459686, "learning_rate": 0.001, "loss": 3.2086, "num_input_tokens_seen": 3774873600, "step": 14400 }, { "epoch": 0.09719790873150003, "grad_norm": 0.1791025549173355, "learning_rate": 0.001, "loss": 3.2002, "num_input_tokens_seen": 3787980800, "step": 14450 }, { "epoch": 0.09753423367520764, "grad_norm": 0.1800592541694641, "learning_rate": 0.001, "loss": 3.2121, "num_input_tokens_seen": 3801088000, "step": 14500 }, { "epoch": 0.09753423367520764, "eval_loss": 3.1071391105651855, "eval_runtime": 101.3171, "eval_samples_per_second": 49.35, "eval_steps_per_second": 12.337, "num_input_tokens_seen": 3801088000, "step": 14500 }, { "epoch": 0.09787055861891525, "grad_norm": 0.15765570104122162, "learning_rate": 0.001, "loss": 3.2046, "num_input_tokens_seen": 3814195200, "step": 14550 }, { "epoch": 0.09820688356262286, "grad_norm": 0.16369874775409698, "learning_rate": 0.001, "loss": 3.2059, "num_input_tokens_seen": 3827302400, "step": 14600 }, { "epoch": 0.09854320850633047, "grad_norm": 0.16802681982517242, "learning_rate": 0.001, "loss": 3.1967, "num_input_tokens_seen": 3840409600, "step": 14650 }, { "epoch": 0.0988795334500381, "grad_norm": 0.1722741425037384, "learning_rate": 0.001, "loss": 3.2189, "num_input_tokens_seen": 3853516800, "step": 14700 }, { "epoch": 0.0992158583937457, "grad_norm": 0.17442888021469116, "learning_rate": 0.001, "loss": 3.2079, "num_input_tokens_seen": 3866624000, "step": 14750 }, { "epoch": 0.09955218333745332, "grad_norm": 0.18931840360164642, "learning_rate": 0.001, "loss": 3.1978, "num_input_tokens_seen": 3879731200, "step": 14800 }, { "epoch": 0.09988850828116093, "grad_norm": 0.17893177270889282, "learning_rate": 0.001, "loss": 3.202, "num_input_tokens_seen": 3892838400, "step": 14850 }, { "epoch": 0.10022483322486854, "grad_norm": 0.18453757464885712, "learning_rate": 0.001, "loss": 3.2004, "num_input_tokens_seen": 3905945600, "step": 14900 }, { "epoch": 0.10056115816857615, "grad_norm": 0.17419569194316864, "learning_rate": 0.001, "loss": 3.1936, "num_input_tokens_seen": 3919052800, "step": 14950 }, { "epoch": 0.10089748311228376, "grad_norm": 0.1765667200088501, "learning_rate": 0.001, "loss": 3.2015, "num_input_tokens_seen": 3932160000, "step": 15000 }, { "epoch": 0.10089748311228376, "eval_loss": 3.1014840602874756, "eval_runtime": 100.8594, "eval_samples_per_second": 49.574, "eval_steps_per_second": 12.393, "num_input_tokens_seen": 3932160000, "step": 15000 }, { "epoch": 0.10123380805599137, "grad_norm": 0.18003995716571808, "learning_rate": 0.001, "loss": 3.2003, "num_input_tokens_seen": 3945267200, "step": 15050 }, { "epoch": 0.10157013299969898, "grad_norm": 0.16458339989185333, "learning_rate": 0.001, "loss": 3.1917, "num_input_tokens_seen": 3958374400, "step": 15100 }, { "epoch": 0.1019064579434066, "grad_norm": 0.6094233393669128, "learning_rate": 0.001, "loss": 3.2255, "num_input_tokens_seen": 3971481600, "step": 15150 }, { "epoch": 0.10224278288711422, "grad_norm": 0.20225363969802856, "learning_rate": 0.001, "loss": 3.2114, "num_input_tokens_seen": 3984588800, "step": 15200 }, { "epoch": 0.10257910783082183, "grad_norm": 0.19048044085502625, "learning_rate": 0.001, "loss": 3.2135, "num_input_tokens_seen": 3997696000, "step": 15250 }, { "epoch": 0.10291543277452944, "grad_norm": 0.17177866399288177, "learning_rate": 0.001, "loss": 3.1954, "num_input_tokens_seen": 4010803200, "step": 15300 }, { "epoch": 0.10325175771823705, "grad_norm": 0.17647191882133484, "learning_rate": 0.001, "loss": 3.2036, "num_input_tokens_seen": 4023910400, "step": 15350 }, { "epoch": 0.10358808266194466, "grad_norm": 0.16163323819637299, "learning_rate": 0.001, "loss": 3.1913, "num_input_tokens_seen": 4037017600, "step": 15400 }, { "epoch": 0.10392440760565227, "grad_norm": 0.18218201398849487, "learning_rate": 0.001, "loss": 3.1963, "num_input_tokens_seen": 4050124800, "step": 15450 }, { "epoch": 0.10426073254935989, "grad_norm": 0.17650413513183594, "learning_rate": 0.001, "loss": 3.1925, "num_input_tokens_seen": 4063232000, "step": 15500 }, { "epoch": 0.10426073254935989, "eval_loss": 3.0995914936065674, "eval_runtime": 99.8658, "eval_samples_per_second": 50.067, "eval_steps_per_second": 12.517, "num_input_tokens_seen": 4063232000, "step": 15500 }, { "epoch": 0.1045970574930675, "grad_norm": 0.1616327166557312, "learning_rate": 0.001, "loss": 3.1924, "num_input_tokens_seen": 4076339200, "step": 15550 }, { "epoch": 0.10493338243677511, "grad_norm": 0.16149432957172394, "learning_rate": 0.001, "loss": 3.189, "num_input_tokens_seen": 4089446400, "step": 15600 }, { "epoch": 0.10526970738048273, "grad_norm": 0.2035779058933258, "learning_rate": 0.001, "loss": 3.1927, "num_input_tokens_seen": 4102553600, "step": 15650 }, { "epoch": 0.10560603232419034, "grad_norm": 0.16653041541576385, "learning_rate": 0.001, "loss": 3.1874, "num_input_tokens_seen": 4115660800, "step": 15700 }, { "epoch": 0.10594235726789795, "grad_norm": 0.16677066683769226, "learning_rate": 0.001, "loss": 3.1831, "num_input_tokens_seen": 4128768000, "step": 15750 }, { "epoch": 0.10627868221160557, "grad_norm": 0.17420975863933563, "learning_rate": 0.001, "loss": 3.1933, "num_input_tokens_seen": 4141875200, "step": 15800 }, { "epoch": 0.10661500715531318, "grad_norm": 0.16593104600906372, "learning_rate": 0.001, "loss": 3.1869, "num_input_tokens_seen": 4154982400, "step": 15850 }, { "epoch": 0.10695133209902079, "grad_norm": 0.18399874866008759, "learning_rate": 0.001, "loss": 3.1894, "num_input_tokens_seen": 4168089600, "step": 15900 }, { "epoch": 0.1072876570427284, "grad_norm": 0.15823860466480255, "learning_rate": 0.001, "loss": 3.1887, "num_input_tokens_seen": 4181196800, "step": 15950 }, { "epoch": 0.10762398198643601, "grad_norm": 0.18964843451976776, "learning_rate": 0.001, "loss": 3.1796, "num_input_tokens_seen": 4194304000, "step": 16000 }, { "epoch": 0.10762398198643601, "eval_loss": 3.0902183055877686, "eval_runtime": 100.8295, "eval_samples_per_second": 49.589, "eval_steps_per_second": 12.397, "num_input_tokens_seen": 4194304000, "step": 16000 }, { "epoch": 0.10796030693014362, "grad_norm": 0.1692574918270111, "learning_rate": 0.001, "loss": 3.1892, "num_input_tokens_seen": 4207411200, "step": 16050 }, { "epoch": 0.10829663187385125, "grad_norm": 0.162678524851799, "learning_rate": 0.001, "loss": 3.1849, "num_input_tokens_seen": 4220518400, "step": 16100 }, { "epoch": 0.10863295681755886, "grad_norm": 0.16249045729637146, "learning_rate": 0.001, "loss": 3.1849, "num_input_tokens_seen": 4233625600, "step": 16150 }, { "epoch": 0.10896928176126647, "grad_norm": 0.17242908477783203, "learning_rate": 0.001, "loss": 3.1791, "num_input_tokens_seen": 4246732800, "step": 16200 }, { "epoch": 0.10930560670497408, "grad_norm": 0.15996769070625305, "learning_rate": 0.001, "loss": 3.1812, "num_input_tokens_seen": 4259840000, "step": 16250 }, { "epoch": 0.10964193164868169, "grad_norm": 0.1693849265575409, "learning_rate": 0.001, "loss": 3.1762, "num_input_tokens_seen": 4272947200, "step": 16300 }, { "epoch": 0.1099782565923893, "grad_norm": 0.1593247950077057, "learning_rate": 0.001, "loss": 3.1806, "num_input_tokens_seen": 4286054400, "step": 16350 }, { "epoch": 0.11031458153609691, "grad_norm": 0.16207775473594666, "learning_rate": 0.001, "loss": 3.175, "num_input_tokens_seen": 4299161600, "step": 16400 }, { "epoch": 0.11065090647980452, "grad_norm": 0.17720963060855865, "learning_rate": 0.001, "loss": 3.1834, "num_input_tokens_seen": 4312268800, "step": 16450 }, { "epoch": 0.11098723142351213, "grad_norm": 0.1996976137161255, "learning_rate": 0.001, "loss": 3.211, "num_input_tokens_seen": 4325376000, "step": 16500 }, { "epoch": 0.11098723142351213, "eval_loss": 3.098728895187378, "eval_runtime": 101.226, "eval_samples_per_second": 49.394, "eval_steps_per_second": 12.349, "num_input_tokens_seen": 4325376000, "step": 16500 }, { "epoch": 0.11132355636721976, "grad_norm": 0.1731133908033371, "learning_rate": 0.001, "loss": 3.1828, "num_input_tokens_seen": 4338483200, "step": 16550 }, { "epoch": 0.11165988131092737, "grad_norm": 0.21048209071159363, "learning_rate": 0.001, "loss": 3.1843, "num_input_tokens_seen": 4351590400, "step": 16600 }, { "epoch": 0.11199620625463498, "grad_norm": 0.18280939757823944, "learning_rate": 0.001, "loss": 3.1904, "num_input_tokens_seen": 4364697600, "step": 16650 }, { "epoch": 0.11233253119834259, "grad_norm": 0.15612006187438965, "learning_rate": 0.001, "loss": 3.1795, "num_input_tokens_seen": 4377804800, "step": 16700 }, { "epoch": 0.1126688561420502, "grad_norm": 0.17242297530174255, "learning_rate": 0.001, "loss": 3.1727, "num_input_tokens_seen": 4390912000, "step": 16750 }, { "epoch": 0.11300518108575781, "grad_norm": 0.170341357588768, "learning_rate": 0.001, "loss": 3.1828, "num_input_tokens_seen": 4404019200, "step": 16800 }, { "epoch": 0.11334150602946542, "grad_norm": 0.17627349495887756, "learning_rate": 0.001, "loss": 3.1946, "num_input_tokens_seen": 4417126400, "step": 16850 }, { "epoch": 0.11367783097317304, "grad_norm": 0.19702504575252533, "learning_rate": 0.001, "loss": 3.1737, "num_input_tokens_seen": 4430233600, "step": 16900 }, { "epoch": 0.11401415591688065, "grad_norm": 0.170149028301239, "learning_rate": 0.001, "loss": 3.188, "num_input_tokens_seen": 4443340800, "step": 16950 }, { "epoch": 0.11435048086058827, "grad_norm": 0.1967497169971466, "learning_rate": 0.001, "loss": 3.1778, "num_input_tokens_seen": 4456448000, "step": 17000 }, { "epoch": 0.11435048086058827, "eval_loss": 3.084319829940796, "eval_runtime": 101.2745, "eval_samples_per_second": 49.371, "eval_steps_per_second": 12.343, "num_input_tokens_seen": 4456448000, "step": 17000 }, { "epoch": 0.11468680580429588, "grad_norm": 0.17489473521709442, "learning_rate": 0.001, "loss": 3.1781, "num_input_tokens_seen": 4469555200, "step": 17050 }, { "epoch": 0.1150231307480035, "grad_norm": 0.17033468186855316, "learning_rate": 0.001, "loss": 3.1787, "num_input_tokens_seen": 4482662400, "step": 17100 }, { "epoch": 0.1153594556917111, "grad_norm": 0.16838806867599487, "learning_rate": 0.001, "loss": 3.1819, "num_input_tokens_seen": 4495769600, "step": 17150 }, { "epoch": 0.11569578063541872, "grad_norm": 0.18173356354236603, "learning_rate": 0.001, "loss": 3.1663, "num_input_tokens_seen": 4508876800, "step": 17200 }, { "epoch": 0.11603210557912633, "grad_norm": 0.17072565853595734, "learning_rate": 0.001, "loss": 3.1777, "num_input_tokens_seen": 4521984000, "step": 17250 }, { "epoch": 0.11636843052283394, "grad_norm": 0.17745070159435272, "learning_rate": 0.001, "loss": 3.1708, "num_input_tokens_seen": 4535091200, "step": 17300 }, { "epoch": 0.11670475546654155, "grad_norm": 0.16486075520515442, "learning_rate": 0.001, "loss": 3.1698, "num_input_tokens_seen": 4548198400, "step": 17350 }, { "epoch": 0.11704108041024916, "grad_norm": 0.1572778970003128, "learning_rate": 0.001, "loss": 3.1742, "num_input_tokens_seen": 4561305600, "step": 17400 }, { "epoch": 0.11737740535395678, "grad_norm": 0.17188695073127747, "learning_rate": 0.001, "loss": 3.1779, "num_input_tokens_seen": 4574412800, "step": 17450 }, { "epoch": 0.1177137302976644, "grad_norm": 0.16766607761383057, "learning_rate": 0.001, "loss": 3.1717, "num_input_tokens_seen": 4587520000, "step": 17500 }, { "epoch": 0.1177137302976644, "eval_loss": 3.075218677520752, "eval_runtime": 101.5328, "eval_samples_per_second": 49.245, "eval_steps_per_second": 12.311, "num_input_tokens_seen": 4587520000, "step": 17500 }, { "epoch": 0.118050055241372, "grad_norm": 0.16463638842105865, "learning_rate": 0.001, "loss": 3.1698, "num_input_tokens_seen": 4600627200, "step": 17550 }, { "epoch": 0.11838638018507962, "grad_norm": 0.4281676709651947, "learning_rate": 0.001, "loss": 3.169, "num_input_tokens_seen": 4613734400, "step": 17600 }, { "epoch": 0.11872270512878723, "grad_norm": 0.18109829723834991, "learning_rate": 0.001, "loss": 3.166, "num_input_tokens_seen": 4626841600, "step": 17650 }, { "epoch": 0.11905903007249484, "grad_norm": 0.16371768712997437, "learning_rate": 0.001, "loss": 3.1694, "num_input_tokens_seen": 4639948800, "step": 17700 }, { "epoch": 0.11939535501620245, "grad_norm": 0.18475505709648132, "learning_rate": 0.001, "loss": 3.174, "num_input_tokens_seen": 4653056000, "step": 17750 }, { "epoch": 0.11973167995991006, "grad_norm": 0.20489992201328278, "learning_rate": 0.001, "loss": 3.1644, "num_input_tokens_seen": 4666163200, "step": 17800 }, { "epoch": 0.12006800490361767, "grad_norm": 0.1695111244916916, "learning_rate": 0.001, "loss": 3.1699, "num_input_tokens_seen": 4679270400, "step": 17850 }, { "epoch": 0.1204043298473253, "grad_norm": 0.16501003503799438, "learning_rate": 0.001, "loss": 3.1549, "num_input_tokens_seen": 4692377600, "step": 17900 }, { "epoch": 0.12074065479103291, "grad_norm": 0.16232050955295563, "learning_rate": 0.001, "loss": 3.1762, "num_input_tokens_seen": 4705484800, "step": 17950 }, { "epoch": 0.12107697973474052, "grad_norm": 0.17002490162849426, "learning_rate": 0.001, "loss": 3.1597, "num_input_tokens_seen": 4718592000, "step": 18000 }, { "epoch": 0.12107697973474052, "eval_loss": 3.069894552230835, "eval_runtime": 100.7483, "eval_samples_per_second": 49.629, "eval_steps_per_second": 12.407, "num_input_tokens_seen": 4718592000, "step": 18000 }, { "epoch": 0.12141330467844813, "grad_norm": 0.1668955534696579, "learning_rate": 0.001, "loss": 3.1641, "num_input_tokens_seen": 4731699200, "step": 18050 }, { "epoch": 0.12174962962215574, "grad_norm": 0.17743679881095886, "learning_rate": 0.001, "loss": 3.1717, "num_input_tokens_seen": 4744806400, "step": 18100 }, { "epoch": 0.12208595456586335, "grad_norm": 0.17474418878555298, "learning_rate": 0.001, "loss": 3.1745, "num_input_tokens_seen": 4757913600, "step": 18150 }, { "epoch": 0.12242227950957096, "grad_norm": 0.18446923792362213, "learning_rate": 0.001, "loss": 3.1526, "num_input_tokens_seen": 4771020800, "step": 18200 }, { "epoch": 0.12275860445327857, "grad_norm": 0.19560950994491577, "learning_rate": 0.001, "loss": 3.1736, "num_input_tokens_seen": 4784128000, "step": 18250 }, { "epoch": 0.12309492939698619, "grad_norm": 0.17012590169906616, "learning_rate": 0.001, "loss": 3.1743, "num_input_tokens_seen": 4797235200, "step": 18300 }, { "epoch": 0.12343125434069381, "grad_norm": 0.17102253437042236, "learning_rate": 0.001, "loss": 3.169, "num_input_tokens_seen": 4810342400, "step": 18350 }, { "epoch": 0.12376757928440142, "grad_norm": 0.16899776458740234, "learning_rate": 0.001, "loss": 3.1657, "num_input_tokens_seen": 4823449600, "step": 18400 }, { "epoch": 0.12410390422810903, "grad_norm": 0.18831774592399597, "learning_rate": 0.001, "loss": 3.1664, "num_input_tokens_seen": 4836556800, "step": 18450 }, { "epoch": 0.12444022917181664, "grad_norm": 0.2649637460708618, "learning_rate": 0.001, "loss": 3.183, "num_input_tokens_seen": 4849664000, "step": 18500 }, { "epoch": 0.12444022917181664, "eval_loss": 3.088426351547241, "eval_runtime": 100.5331, "eval_samples_per_second": 49.735, "eval_steps_per_second": 12.434, "num_input_tokens_seen": 4849664000, "step": 18500 }, { "epoch": 0.12477655411552425, "grad_norm": 0.2588728368282318, "learning_rate": 0.001, "loss": 3.2022, "num_input_tokens_seen": 4862771200, "step": 18550 }, { "epoch": 0.12511287905923188, "grad_norm": 0.1861683577299118, "learning_rate": 0.001, "loss": 3.1779, "num_input_tokens_seen": 4875878400, "step": 18600 }, { "epoch": 0.1254492040029395, "grad_norm": 0.1803797483444214, "learning_rate": 0.001, "loss": 3.1761, "num_input_tokens_seen": 4888985600, "step": 18650 }, { "epoch": 0.1257855289466471, "grad_norm": 0.20752698183059692, "learning_rate": 0.001, "loss": 3.1867, "num_input_tokens_seen": 4902092800, "step": 18700 }, { "epoch": 0.1261218538903547, "grad_norm": 0.18387116491794586, "learning_rate": 0.001, "loss": 3.1667, "num_input_tokens_seen": 4915200000, "step": 18750 }, { "epoch": 0.12645817883406232, "grad_norm": 0.3406733572483063, "learning_rate": 0.001, "loss": 3.1706, "num_input_tokens_seen": 4928307200, "step": 18800 }, { "epoch": 0.12679450377776993, "grad_norm": 0.17068707942962646, "learning_rate": 0.001, "loss": 3.1715, "num_input_tokens_seen": 4941414400, "step": 18850 }, { "epoch": 0.12713082872147755, "grad_norm": 0.18792368471622467, "learning_rate": 0.001, "loss": 3.1711, "num_input_tokens_seen": 4954521600, "step": 18900 }, { "epoch": 0.12746715366518516, "grad_norm": 0.18366409838199615, "learning_rate": 0.001, "loss": 3.1724, "num_input_tokens_seen": 4967628800, "step": 18950 }, { "epoch": 0.12780347860889277, "grad_norm": 0.15937770903110504, "learning_rate": 0.001, "loss": 3.1541, "num_input_tokens_seen": 4980736000, "step": 19000 }, { "epoch": 0.12780347860889277, "eval_loss": 3.06679630279541, "eval_runtime": 100.9386, "eval_samples_per_second": 49.535, "eval_steps_per_second": 12.384, "num_input_tokens_seen": 4980736000, "step": 19000 }, { "epoch": 0.12813980355260038, "grad_norm": 0.17235547304153442, "learning_rate": 0.001, "loss": 3.1644, "num_input_tokens_seen": 4993843200, "step": 19050 }, { "epoch": 0.128476128496308, "grad_norm": 0.17877432703971863, "learning_rate": 0.001, "loss": 3.1613, "num_input_tokens_seen": 5006950400, "step": 19100 }, { "epoch": 0.1288124534400156, "grad_norm": 0.20647567510604858, "learning_rate": 0.001, "loss": 3.157, "num_input_tokens_seen": 5020057600, "step": 19150 }, { "epoch": 0.1291487783837232, "grad_norm": 0.17320992052555084, "learning_rate": 0.001, "loss": 3.1597, "num_input_tokens_seen": 5033164800, "step": 19200 }, { "epoch": 0.12948510332743082, "grad_norm": 0.1975463479757309, "learning_rate": 0.001, "loss": 3.1588, "num_input_tokens_seen": 5046272000, "step": 19250 }, { "epoch": 0.12982142827113843, "grad_norm": 0.2234167754650116, "learning_rate": 0.001, "loss": 3.1752, "num_input_tokens_seen": 5059379200, "step": 19300 }, { "epoch": 0.13015775321484604, "grad_norm": 0.16287492215633392, "learning_rate": 0.001, "loss": 3.1559, "num_input_tokens_seen": 5072486400, "step": 19350 }, { "epoch": 0.13049407815855366, "grad_norm": 0.21676546335220337, "learning_rate": 0.001, "loss": 3.1556, "num_input_tokens_seen": 5085593600, "step": 19400 }, { "epoch": 0.1308304031022613, "grad_norm": 0.19858922064304352, "learning_rate": 0.001, "loss": 3.1617, "num_input_tokens_seen": 5098700800, "step": 19450 }, { "epoch": 0.1311667280459689, "grad_norm": 0.17459186911582947, "learning_rate": 0.001, "loss": 3.1499, "num_input_tokens_seen": 5111808000, "step": 19500 }, { "epoch": 0.1311667280459689, "eval_loss": 3.0653886795043945, "eval_runtime": 101.0357, "eval_samples_per_second": 49.487, "eval_steps_per_second": 12.372, "num_input_tokens_seen": 5111808000, "step": 19500 }, { "epoch": 0.13150305298967652, "grad_norm": 0.17277012765407562, "learning_rate": 0.001, "loss": 3.1641, "num_input_tokens_seen": 5124915200, "step": 19550 }, { "epoch": 0.13183937793338413, "grad_norm": 0.1894499808549881, "learning_rate": 0.001, "loss": 3.1465, "num_input_tokens_seen": 5138022400, "step": 19600 }, { "epoch": 0.13217570287709174, "grad_norm": 0.17682771384716034, "learning_rate": 0.001, "loss": 3.1573, "num_input_tokens_seen": 5151129600, "step": 19650 }, { "epoch": 0.13251202782079935, "grad_norm": 0.17981217801570892, "learning_rate": 0.001, "loss": 3.1484, "num_input_tokens_seen": 5164236800, "step": 19700 }, { "epoch": 0.13284835276450696, "grad_norm": 0.19152362644672394, "learning_rate": 0.001, "loss": 3.1597, "num_input_tokens_seen": 5177344000, "step": 19750 }, { "epoch": 0.13318467770821457, "grad_norm": 0.17498181760311127, "learning_rate": 0.001, "loss": 3.1604, "num_input_tokens_seen": 5190451200, "step": 19800 }, { "epoch": 0.13352100265192218, "grad_norm": 0.17143672704696655, "learning_rate": 0.001, "loss": 3.1601, "num_input_tokens_seen": 5203558400, "step": 19850 }, { "epoch": 0.1338573275956298, "grad_norm": 0.19592677056789398, "learning_rate": 0.001, "loss": 3.1568, "num_input_tokens_seen": 5216665600, "step": 19900 }, { "epoch": 0.1341936525393374, "grad_norm": 0.1618625670671463, "learning_rate": 0.001, "loss": 3.1584, "num_input_tokens_seen": 5229772800, "step": 19950 }, { "epoch": 0.13452997748304502, "grad_norm": 0.18170061707496643, "learning_rate": 0.001, "loss": 3.1499, "num_input_tokens_seen": 5242880000, "step": 20000 }, { "epoch": 0.13452997748304502, "eval_loss": 3.056318759918213, "eval_runtime": 100.9422, "eval_samples_per_second": 49.533, "eval_steps_per_second": 12.383, "num_input_tokens_seen": 5242880000, "step": 20000 }, { "epoch": 0.13486630242675263, "grad_norm": 0.1726132184267044, "learning_rate": 0.001, "loss": 3.1545, "num_input_tokens_seen": 5255987200, "step": 20050 }, { "epoch": 0.13520262737046024, "grad_norm": 0.17690512537956238, "learning_rate": 0.001, "loss": 3.1526, "num_input_tokens_seen": 5269094400, "step": 20100 }, { "epoch": 0.13553895231416785, "grad_norm": 0.20317280292510986, "learning_rate": 0.001, "loss": 3.15, "num_input_tokens_seen": 5282201600, "step": 20150 }, { "epoch": 0.13587527725787546, "grad_norm": 0.16687753796577454, "learning_rate": 0.001, "loss": 3.144, "num_input_tokens_seen": 5295308800, "step": 20200 }, { "epoch": 0.13621160220158307, "grad_norm": 0.16671325266361237, "learning_rate": 0.001, "loss": 3.1425, "num_input_tokens_seen": 5308416000, "step": 20250 }, { "epoch": 0.13654792714529068, "grad_norm": 0.16259203851222992, "learning_rate": 0.001, "loss": 3.1386, "num_input_tokens_seen": 5321523200, "step": 20300 }, { "epoch": 0.1368842520889983, "grad_norm": 0.15738168358802795, "learning_rate": 0.001, "loss": 3.148, "num_input_tokens_seen": 5334630400, "step": 20350 }, { "epoch": 0.13722057703270593, "grad_norm": 0.17398318648338318, "learning_rate": 0.001, "loss": 3.1484, "num_input_tokens_seen": 5347737600, "step": 20400 }, { "epoch": 0.13755690197641354, "grad_norm": 0.177555114030838, "learning_rate": 0.001, "loss": 3.1507, "num_input_tokens_seen": 5360844800, "step": 20450 }, { "epoch": 0.13789322692012115, "grad_norm": 0.16208910942077637, "learning_rate": 0.001, "loss": 3.1462, "num_input_tokens_seen": 5373952000, "step": 20500 }, { "epoch": 0.13789322692012115, "eval_loss": 3.0525152683258057, "eval_runtime": 101.4785, "eval_samples_per_second": 49.272, "eval_steps_per_second": 12.318, "num_input_tokens_seen": 5373952000, "step": 20500 }, { "epoch": 0.13822955186382876, "grad_norm": 0.1615586280822754, "learning_rate": 0.001, "loss": 3.1469, "num_input_tokens_seen": 5387059200, "step": 20550 }, { "epoch": 0.13856587680753638, "grad_norm": 0.17003317177295685, "learning_rate": 0.001, "loss": 3.15, "num_input_tokens_seen": 5400166400, "step": 20600 }, { "epoch": 0.138902201751244, "grad_norm": 0.16299164295196533, "learning_rate": 0.001, "loss": 3.1522, "num_input_tokens_seen": 5413273600, "step": 20650 }, { "epoch": 0.1392385266949516, "grad_norm": 0.18732890486717224, "learning_rate": 0.001, "loss": 3.1462, "num_input_tokens_seen": 5426380800, "step": 20700 }, { "epoch": 0.1395748516386592, "grad_norm": 0.23970580101013184, "learning_rate": 0.001, "loss": 3.1832, "num_input_tokens_seen": 5439488000, "step": 20750 }, { "epoch": 0.13991117658236682, "grad_norm": 0.1701073795557022, "learning_rate": 0.001, "loss": 3.1713, "num_input_tokens_seen": 5452595200, "step": 20800 }, { "epoch": 0.14024750152607443, "grad_norm": 0.18976852297782898, "learning_rate": 0.001, "loss": 3.1636, "num_input_tokens_seen": 5465702400, "step": 20850 }, { "epoch": 0.14058382646978204, "grad_norm": 0.17788629233837128, "learning_rate": 0.001, "loss": 3.1712, "num_input_tokens_seen": 5478809600, "step": 20900 }, { "epoch": 0.14092015141348965, "grad_norm": 0.20413383841514587, "learning_rate": 0.001, "loss": 3.1604, "num_input_tokens_seen": 5491916800, "step": 20950 }, { "epoch": 0.14125647635719726, "grad_norm": 0.1921602487564087, "learning_rate": 0.001, "loss": 3.15, "num_input_tokens_seen": 5505024000, "step": 21000 }, { "epoch": 0.14125647635719726, "eval_loss": 3.053833484649658, "eval_runtime": 101.2468, "eval_samples_per_second": 49.384, "eval_steps_per_second": 12.346, "num_input_tokens_seen": 5505024000, "step": 21000 }, { "epoch": 0.14159280130090487, "grad_norm": 0.18453796207904816, "learning_rate": 0.001, "loss": 3.1542, "num_input_tokens_seen": 5518131200, "step": 21050 }, { "epoch": 0.14192912624461249, "grad_norm": 0.29931920766830444, "learning_rate": 0.001, "loss": 3.1633, "num_input_tokens_seen": 5531238400, "step": 21100 }, { "epoch": 0.1422654511883201, "grad_norm": 0.19636057317256927, "learning_rate": 0.001, "loss": 3.1544, "num_input_tokens_seen": 5544345600, "step": 21150 }, { "epoch": 0.1426017761320277, "grad_norm": 0.3110333979129791, "learning_rate": 0.001, "loss": 3.186, "num_input_tokens_seen": 5557452800, "step": 21200 }, { "epoch": 0.14293810107573532, "grad_norm": 0.21632343530654907, "learning_rate": 0.001, "loss": 3.1759, "num_input_tokens_seen": 5570560000, "step": 21250 }, { "epoch": 0.14327442601944296, "grad_norm": 0.23088929057121277, "learning_rate": 0.001, "loss": 3.1683, "num_input_tokens_seen": 5583667200, "step": 21300 }, { "epoch": 0.14361075096315057, "grad_norm": 0.19326886534690857, "learning_rate": 0.001, "loss": 3.1535, "num_input_tokens_seen": 5596774400, "step": 21350 }, { "epoch": 0.14394707590685818, "grad_norm": 0.3554578125476837, "learning_rate": 0.001, "loss": 3.1606, "num_input_tokens_seen": 5609881600, "step": 21400 }, { "epoch": 0.1442834008505658, "grad_norm": 0.18607909977436066, "learning_rate": 0.001, "loss": 3.159, "num_input_tokens_seen": 5622988800, "step": 21450 }, { "epoch": 0.1446197257942734, "grad_norm": 0.2276984453201294, "learning_rate": 0.001, "loss": 3.1544, "num_input_tokens_seen": 5636096000, "step": 21500 }, { "epoch": 0.1446197257942734, "eval_loss": 3.051608085632324, "eval_runtime": 101.9015, "eval_samples_per_second": 49.067, "eval_steps_per_second": 12.267, "num_input_tokens_seen": 5636096000, "step": 21500 }, { "epoch": 0.144956050737981, "grad_norm": 0.17796960473060608, "learning_rate": 0.001, "loss": 3.1488, "num_input_tokens_seen": 5649203200, "step": 21550 }, { "epoch": 0.14529237568168862, "grad_norm": 0.1721925139427185, "learning_rate": 0.001, "loss": 3.1465, "num_input_tokens_seen": 5662310400, "step": 21600 }, { "epoch": 0.14562870062539623, "grad_norm": 0.1779259443283081, "learning_rate": 0.001, "loss": 3.1479, "num_input_tokens_seen": 5675417600, "step": 21650 }, { "epoch": 0.14596502556910385, "grad_norm": 0.1955435425043106, "learning_rate": 0.001, "loss": 3.147, "num_input_tokens_seen": 5688524800, "step": 21700 }, { "epoch": 0.14630135051281146, "grad_norm": 0.2717543840408325, "learning_rate": 0.001, "loss": 3.1456, "num_input_tokens_seen": 5701632000, "step": 21750 }, { "epoch": 0.14663767545651907, "grad_norm": 0.22944161295890808, "learning_rate": 0.001, "loss": 3.1596, "num_input_tokens_seen": 5714739200, "step": 21800 }, { "epoch": 0.14697400040022668, "grad_norm": 0.18696273863315582, "learning_rate": 0.001, "loss": 3.1417, "num_input_tokens_seen": 5727846400, "step": 21850 }, { "epoch": 0.1473103253439343, "grad_norm": 0.17678014934062958, "learning_rate": 0.001, "loss": 3.1454, "num_input_tokens_seen": 5740953600, "step": 21900 }, { "epoch": 0.1476466502876419, "grad_norm": 0.16891658306121826, "learning_rate": 0.001, "loss": 3.142, "num_input_tokens_seen": 5754060800, "step": 21950 }, { "epoch": 0.1479829752313495, "grad_norm": 0.20900680124759674, "learning_rate": 0.001, "loss": 3.1475, "num_input_tokens_seen": 5767168000, "step": 22000 }, { "epoch": 0.1479829752313495, "eval_loss": 3.0481514930725098, "eval_runtime": 100.7385, "eval_samples_per_second": 49.633, "eval_steps_per_second": 12.408, "num_input_tokens_seen": 5767168000, "step": 22000 }, { "epoch": 0.14831930017505712, "grad_norm": 0.17889799177646637, "learning_rate": 0.001, "loss": 3.1407, "num_input_tokens_seen": 5780275200, "step": 22050 }, { "epoch": 0.14865562511876473, "grad_norm": 1.7358074188232422, "learning_rate": 0.001, "loss": 3.1454, "num_input_tokens_seen": 5793382400, "step": 22100 }, { "epoch": 0.14899195006247234, "grad_norm": 0.17746248841285706, "learning_rate": 0.001, "loss": 3.154, "num_input_tokens_seen": 5806489600, "step": 22150 }, { "epoch": 0.14932827500617998, "grad_norm": 0.185111865401268, "learning_rate": 0.001, "loss": 3.1522, "num_input_tokens_seen": 5819596800, "step": 22200 }, { "epoch": 0.1496645999498876, "grad_norm": 0.23882286250591278, "learning_rate": 0.001, "loss": 3.1379, "num_input_tokens_seen": 5832704000, "step": 22250 }, { "epoch": 0.1500009248935952, "grad_norm": 0.19389848411083221, "learning_rate": 0.001, "loss": 3.1459, "num_input_tokens_seen": 5845811200, "step": 22300 }, { "epoch": 0.15033724983730282, "grad_norm": 0.18890796601772308, "learning_rate": 0.001, "loss": 3.1401, "num_input_tokens_seen": 5858918400, "step": 22350 }, { "epoch": 0.15067357478101043, "grad_norm": 0.17061029374599457, "learning_rate": 0.001, "loss": 3.1555, "num_input_tokens_seen": 5872025600, "step": 22400 }, { "epoch": 0.15100989972471804, "grad_norm": 0.18124708533287048, "learning_rate": 0.001, "loss": 3.1499, "num_input_tokens_seen": 5885132800, "step": 22450 }, { "epoch": 0.15134622466842565, "grad_norm": 0.21192225813865662, "learning_rate": 0.001, "loss": 3.1364, "num_input_tokens_seen": 5898240000, "step": 22500 }, { "epoch": 0.15134622466842565, "eval_loss": 3.042147636413574, "eval_runtime": 100.7091, "eval_samples_per_second": 49.648, "eval_steps_per_second": 12.412, "num_input_tokens_seen": 5898240000, "step": 22500 }, { "epoch": 0.15168254961213326, "grad_norm": 0.16922616958618164, "learning_rate": 0.001, "loss": 3.149, "num_input_tokens_seen": 5911347200, "step": 22550 }, { "epoch": 0.15201887455584087, "grad_norm": 0.1636754721403122, "learning_rate": 0.001, "loss": 3.1379, "num_input_tokens_seen": 5924454400, "step": 22600 }, { "epoch": 0.15235519949954848, "grad_norm": 0.167410746216774, "learning_rate": 0.001, "loss": 3.1465, "num_input_tokens_seen": 5937561600, "step": 22650 }, { "epoch": 0.1526915244432561, "grad_norm": 0.18413004279136658, "learning_rate": 0.001, "loss": 3.131, "num_input_tokens_seen": 5950668800, "step": 22700 }, { "epoch": 0.1530278493869637, "grad_norm": 0.20482878386974335, "learning_rate": 0.001, "loss": 3.1395, "num_input_tokens_seen": 5963776000, "step": 22750 }, { "epoch": 0.15336417433067132, "grad_norm": 0.19031567871570587, "learning_rate": 0.001, "loss": 3.1369, "num_input_tokens_seen": 5976883200, "step": 22800 }, { "epoch": 0.15370049927437893, "grad_norm": 0.21622490882873535, "learning_rate": 0.001, "loss": 3.1448, "num_input_tokens_seen": 5989990400, "step": 22850 }, { "epoch": 0.15403682421808654, "grad_norm": 0.19105197489261627, "learning_rate": 0.001, "loss": 3.1456, "num_input_tokens_seen": 6003097600, "step": 22900 }, { "epoch": 0.15437314916179415, "grad_norm": 0.26653292775154114, "learning_rate": 0.001, "loss": 3.1405, "num_input_tokens_seen": 6016204800, "step": 22950 }, { "epoch": 0.15470947410550176, "grad_norm": 0.30645105242729187, "learning_rate": 0.001, "loss": 3.1564, "num_input_tokens_seen": 6029312000, "step": 23000 }, { "epoch": 0.15470947410550176, "eval_loss": 3.072319507598877, "eval_runtime": 100.6281, "eval_samples_per_second": 49.688, "eval_steps_per_second": 12.422, "num_input_tokens_seen": 6029312000, "step": 23000 }, { "epoch": 0.15504579904920937, "grad_norm": 0.18146337568759918, "learning_rate": 0.001, "loss": 3.1774, "num_input_tokens_seen": 6042419200, "step": 23050 }, { "epoch": 0.155382123992917, "grad_norm": 0.19545282423496246, "learning_rate": 0.001, "loss": 3.1384, "num_input_tokens_seen": 6055526400, "step": 23100 }, { "epoch": 0.15571844893662462, "grad_norm": 0.17861327528953552, "learning_rate": 0.001, "loss": 3.1459, "num_input_tokens_seen": 6068633600, "step": 23150 }, { "epoch": 0.15605477388033223, "grad_norm": 0.17025263607501984, "learning_rate": 0.001, "loss": 3.1369, "num_input_tokens_seen": 6081740800, "step": 23200 }, { "epoch": 0.15639109882403984, "grad_norm": 0.17162847518920898, "learning_rate": 0.001, "loss": 3.1479, "num_input_tokens_seen": 6094848000, "step": 23250 }, { "epoch": 0.15672742376774745, "grad_norm": 0.19068972766399384, "learning_rate": 0.001, "loss": 3.1418, "num_input_tokens_seen": 6107955200, "step": 23300 }, { "epoch": 0.15706374871145506, "grad_norm": 0.20177774131298065, "learning_rate": 0.001, "loss": 3.1435, "num_input_tokens_seen": 6121062400, "step": 23350 }, { "epoch": 0.15740007365516268, "grad_norm": 0.235867440700531, "learning_rate": 0.001, "loss": 3.1466, "num_input_tokens_seen": 6134169600, "step": 23400 }, { "epoch": 0.1577363985988703, "grad_norm": 0.17313481867313385, "learning_rate": 0.001, "loss": 3.1509, "num_input_tokens_seen": 6147276800, "step": 23450 }, { "epoch": 0.1580727235425779, "grad_norm": 0.20208750665187836, "learning_rate": 0.001, "loss": 3.1312, "num_input_tokens_seen": 6160384000, "step": 23500 }, { "epoch": 0.1580727235425779, "eval_loss": 3.0457797050476074, "eval_runtime": 99.8915, "eval_samples_per_second": 50.054, "eval_steps_per_second": 12.514, "num_input_tokens_seen": 6160384000, "step": 23500 }, { "epoch": 0.1584090484862855, "grad_norm": 0.1771639734506607, "learning_rate": 0.001, "loss": 3.1407, "num_input_tokens_seen": 6173491200, "step": 23550 }, { "epoch": 0.15874537342999312, "grad_norm": 0.20403757691383362, "learning_rate": 0.001, "loss": 3.1404, "num_input_tokens_seen": 6186598400, "step": 23600 }, { "epoch": 0.15908169837370073, "grad_norm": 0.19679167866706848, "learning_rate": 0.001, "loss": 3.1417, "num_input_tokens_seen": 6199705600, "step": 23650 }, { "epoch": 0.15941802331740834, "grad_norm": 0.18299609422683716, "learning_rate": 0.001, "loss": 3.14, "num_input_tokens_seen": 6212812800, "step": 23700 }, { "epoch": 0.15975434826111595, "grad_norm": 0.16773872077465057, "learning_rate": 0.001, "loss": 3.1326, "num_input_tokens_seen": 6225920000, "step": 23750 }, { "epoch": 0.16009067320482356, "grad_norm": 0.18539400398731232, "learning_rate": 0.001, "loss": 3.1294, "num_input_tokens_seen": 6239027200, "step": 23800 }, { "epoch": 0.16042699814853117, "grad_norm": 0.19088850915431976, "learning_rate": 0.001, "loss": 3.1297, "num_input_tokens_seen": 6252134400, "step": 23850 }, { "epoch": 0.16076332309223879, "grad_norm": 0.17390431463718414, "learning_rate": 0.001, "loss": 3.1364, "num_input_tokens_seen": 6265241600, "step": 23900 }, { "epoch": 0.1610996480359464, "grad_norm": 0.35694462060928345, "learning_rate": 0.001, "loss": 3.1519, "num_input_tokens_seen": 6278348800, "step": 23950 }, { "epoch": 0.16143597297965404, "grad_norm": 0.17942555248737335, "learning_rate": 0.001, "loss": 3.132, "num_input_tokens_seen": 6291456000, "step": 24000 }, { "epoch": 0.16143597297965404, "eval_loss": 3.0352118015289307, "eval_runtime": 101.1582, "eval_samples_per_second": 49.428, "eval_steps_per_second": 12.357, "num_input_tokens_seen": 6291456000, "step": 24000 }, { "epoch": 0.16177229792336165, "grad_norm": 0.1748410165309906, "learning_rate": 0.001, "loss": 3.1385, "num_input_tokens_seen": 6304563200, "step": 24050 }, { "epoch": 0.16210862286706926, "grad_norm": 0.19787083566188812, "learning_rate": 0.001, "loss": 3.1241, "num_input_tokens_seen": 6317670400, "step": 24100 }, { "epoch": 0.16244494781077687, "grad_norm": 0.16369663178920746, "learning_rate": 0.001, "loss": 3.1402, "num_input_tokens_seen": 6330777600, "step": 24150 }, { "epoch": 0.16278127275448448, "grad_norm": 0.17019188404083252, "learning_rate": 0.001, "loss": 3.1293, "num_input_tokens_seen": 6343884800, "step": 24200 }, { "epoch": 0.1631175976981921, "grad_norm": 0.1666271686553955, "learning_rate": 0.001, "loss": 3.1218, "num_input_tokens_seen": 6356992000, "step": 24250 }, { "epoch": 0.1634539226418997, "grad_norm": 0.1591423600912094, "learning_rate": 0.001, "loss": 3.1319, "num_input_tokens_seen": 6370099200, "step": 24300 }, { "epoch": 0.1637902475856073, "grad_norm": 0.1642790287733078, "learning_rate": 0.001, "loss": 3.1319, "num_input_tokens_seen": 6383206400, "step": 24350 }, { "epoch": 0.16412657252931492, "grad_norm": 0.17917323112487793, "learning_rate": 0.001, "loss": 3.1204, "num_input_tokens_seen": 6396313600, "step": 24400 }, { "epoch": 0.16446289747302253, "grad_norm": 0.2367531955242157, "learning_rate": 0.001, "loss": 3.125, "num_input_tokens_seen": 6409420800, "step": 24450 }, { "epoch": 0.16479922241673015, "grad_norm": 0.175731360912323, "learning_rate": 0.001, "loss": 3.1358, "num_input_tokens_seen": 6422528000, "step": 24500 }, { "epoch": 0.16479922241673015, "eval_loss": 3.032782793045044, "eval_runtime": 100.7748, "eval_samples_per_second": 49.616, "eval_steps_per_second": 12.404, "num_input_tokens_seen": 6422528000, "step": 24500 }, { "epoch": 0.16513554736043776, "grad_norm": 0.16778303682804108, "learning_rate": 0.001, "loss": 3.1382, "num_input_tokens_seen": 6435635200, "step": 24550 }, { "epoch": 0.16547187230414537, "grad_norm": 0.19019022583961487, "learning_rate": 0.001, "loss": 3.1328, "num_input_tokens_seen": 6448742400, "step": 24600 }, { "epoch": 0.16580819724785298, "grad_norm": 0.18815970420837402, "learning_rate": 0.001, "loss": 3.1274, "num_input_tokens_seen": 6461849600, "step": 24650 }, { "epoch": 0.1661445221915606, "grad_norm": 0.18299463391304016, "learning_rate": 0.001, "loss": 3.1258, "num_input_tokens_seen": 6474956800, "step": 24700 }, { "epoch": 0.1664808471352682, "grad_norm": 0.20152020454406738, "learning_rate": 0.001, "loss": 3.1245, "num_input_tokens_seen": 6488064000, "step": 24750 }, { "epoch": 0.1668171720789758, "grad_norm": 0.1800755113363266, "learning_rate": 0.001, "loss": 3.1332, "num_input_tokens_seen": 6501171200, "step": 24800 }, { "epoch": 0.16715349702268342, "grad_norm": 0.33742496371269226, "learning_rate": 0.001, "loss": 3.1264, "num_input_tokens_seen": 6514278400, "step": 24850 }, { "epoch": 0.16748982196639106, "grad_norm": 0.20022252202033997, "learning_rate": 0.001, "loss": 3.1414, "num_input_tokens_seen": 6527385600, "step": 24900 }, { "epoch": 0.16782614691009867, "grad_norm": 0.19905851781368256, "learning_rate": 0.001, "loss": 3.1327, "num_input_tokens_seen": 6540492800, "step": 24950 }, { "epoch": 0.16816247185380628, "grad_norm": 0.26300039887428284, "learning_rate": 0.001, "loss": 3.1231, "num_input_tokens_seen": 6553600000, "step": 25000 }, { "epoch": 0.16816247185380628, "eval_loss": 3.0353240966796875, "eval_runtime": 100.6, "eval_samples_per_second": 49.702, "eval_steps_per_second": 12.425, "num_input_tokens_seen": 6553600000, "step": 25000 }, { "epoch": 0.1684987967975139, "grad_norm": 1.1301233768463135, "learning_rate": 0.001, "loss": 3.1419, "num_input_tokens_seen": 6566707200, "step": 25050 }, { "epoch": 0.1688351217412215, "grad_norm": 0.18019410967826843, "learning_rate": 0.001, "loss": 3.1514, "num_input_tokens_seen": 6579814400, "step": 25100 }, { "epoch": 0.16917144668492912, "grad_norm": 0.17898766696453094, "learning_rate": 0.001, "loss": 3.1337, "num_input_tokens_seen": 6592921600, "step": 25150 }, { "epoch": 0.16950777162863673, "grad_norm": 0.18393439054489136, "learning_rate": 0.001, "loss": 3.1313, "num_input_tokens_seen": 6606028800, "step": 25200 }, { "epoch": 0.16984409657234434, "grad_norm": 0.16713738441467285, "learning_rate": 0.001, "loss": 3.129, "num_input_tokens_seen": 6619136000, "step": 25250 }, { "epoch": 0.17018042151605195, "grad_norm": 0.17655207216739655, "learning_rate": 0.001, "loss": 3.1245, "num_input_tokens_seen": 6632243200, "step": 25300 }, { "epoch": 0.17051674645975956, "grad_norm": 0.20735637843608856, "learning_rate": 0.001, "loss": 3.1344, "num_input_tokens_seen": 6645350400, "step": 25350 }, { "epoch": 0.17085307140346717, "grad_norm": 0.21318195760250092, "learning_rate": 0.001, "loss": 3.1336, "num_input_tokens_seen": 6658457600, "step": 25400 }, { "epoch": 0.17118939634717478, "grad_norm": 0.1637289971113205, "learning_rate": 0.001, "loss": 3.1306, "num_input_tokens_seen": 6671564800, "step": 25450 }, { "epoch": 0.1715257212908824, "grad_norm": 0.1866239458322525, "learning_rate": 0.001, "loss": 3.1248, "num_input_tokens_seen": 6684672000, "step": 25500 }, { "epoch": 0.1715257212908824, "eval_loss": 3.025984048843384, "eval_runtime": 100.8165, "eval_samples_per_second": 49.595, "eval_steps_per_second": 12.399, "num_input_tokens_seen": 6684672000, "step": 25500 }, { "epoch": 0.17186204623459, "grad_norm": 0.19653931260108948, "learning_rate": 0.001, "loss": 3.1346, "num_input_tokens_seen": 6697779200, "step": 25550 }, { "epoch": 0.17219837117829762, "grad_norm": 0.18339622020721436, "learning_rate": 0.001, "loss": 3.12, "num_input_tokens_seen": 6710886400, "step": 25600 }, { "epoch": 0.17253469612200523, "grad_norm": 0.3683246970176697, "learning_rate": 0.001, "loss": 3.1338, "num_input_tokens_seen": 6723993600, "step": 25650 }, { "epoch": 0.17287102106571284, "grad_norm": 0.17096757888793945, "learning_rate": 0.001, "loss": 3.1283, "num_input_tokens_seen": 6737100800, "step": 25700 }, { "epoch": 0.17320734600942045, "grad_norm": 0.16841623187065125, "learning_rate": 0.001, "loss": 3.1236, "num_input_tokens_seen": 6750208000, "step": 25750 }, { "epoch": 0.1735436709531281, "grad_norm": 0.18569235503673553, "learning_rate": 0.001, "loss": 3.1273, "num_input_tokens_seen": 6763315200, "step": 25800 }, { "epoch": 0.1738799958968357, "grad_norm": 0.18508999049663544, "learning_rate": 0.001, "loss": 3.1246, "num_input_tokens_seen": 6776422400, "step": 25850 }, { "epoch": 0.1742163208405433, "grad_norm": 0.189519464969635, "learning_rate": 0.001, "loss": 3.1239, "num_input_tokens_seen": 6789529600, "step": 25900 }, { "epoch": 0.17455264578425092, "grad_norm": 0.1591208577156067, "learning_rate": 0.001, "loss": 3.1198, "num_input_tokens_seen": 6802636800, "step": 25950 }, { "epoch": 0.17488897072795853, "grad_norm": 0.199269101023674, "learning_rate": 0.001, "loss": 3.118, "num_input_tokens_seen": 6815744000, "step": 26000 }, { "epoch": 0.17488897072795853, "eval_loss": 3.019541025161743, "eval_runtime": 101.0314, "eval_samples_per_second": 49.49, "eval_steps_per_second": 12.372, "num_input_tokens_seen": 6815744000, "step": 26000 }, { "epoch": 0.17522529567166614, "grad_norm": 0.17492325603961945, "learning_rate": 0.001, "loss": 3.1093, "num_input_tokens_seen": 6828851200, "step": 26050 }, { "epoch": 0.17556162061537375, "grad_norm": 0.17826683819293976, "learning_rate": 0.001, "loss": 3.1285, "num_input_tokens_seen": 6841958400, "step": 26100 }, { "epoch": 0.17589794555908136, "grad_norm": 0.17716389894485474, "learning_rate": 0.001, "loss": 3.1141, "num_input_tokens_seen": 6855065600, "step": 26150 }, { "epoch": 0.17623427050278898, "grad_norm": 0.18649104237556458, "learning_rate": 0.001, "loss": 3.1128, "num_input_tokens_seen": 6868172800, "step": 26200 }, { "epoch": 0.1765705954464966, "grad_norm": 0.4175710678100586, "learning_rate": 0.001, "loss": 3.1283, "num_input_tokens_seen": 6881280000, "step": 26250 }, { "epoch": 0.1769069203902042, "grad_norm": 0.2275037169456482, "learning_rate": 0.001, "loss": 3.148, "num_input_tokens_seen": 6894387200, "step": 26300 }, { "epoch": 0.1772432453339118, "grad_norm": 0.42409747838974, "learning_rate": 0.001, "loss": 3.1338, "num_input_tokens_seen": 6907494400, "step": 26350 }, { "epoch": 0.17757957027761942, "grad_norm": 0.23025737702846527, "learning_rate": 0.001, "loss": 3.1451, "num_input_tokens_seen": 6920601600, "step": 26400 }, { "epoch": 0.17791589522132703, "grad_norm": 0.20386695861816406, "learning_rate": 0.001, "loss": 3.1396, "num_input_tokens_seen": 6933708800, "step": 26450 }, { "epoch": 0.17825222016503464, "grad_norm": 0.20394697785377502, "learning_rate": 0.001, "loss": 3.1308, "num_input_tokens_seen": 6946816000, "step": 26500 }, { "epoch": 0.17825222016503464, "eval_loss": 3.02968430519104, "eval_runtime": 100.5354, "eval_samples_per_second": 49.734, "eval_steps_per_second": 12.433, "num_input_tokens_seen": 6946816000, "step": 26500 }, { "epoch": 0.17858854510874225, "grad_norm": 0.19505399465560913, "learning_rate": 0.001, "loss": 3.1229, "num_input_tokens_seen": 6959923200, "step": 26550 }, { "epoch": 0.17892487005244986, "grad_norm": 0.17783001065254211, "learning_rate": 0.001, "loss": 3.108, "num_input_tokens_seen": 6973030400, "step": 26600 }, { "epoch": 0.17926119499615747, "grad_norm": 0.16989105939865112, "learning_rate": 0.001, "loss": 3.1252, "num_input_tokens_seen": 6986137600, "step": 26650 }, { "epoch": 0.1795975199398651, "grad_norm": 0.1939496099948883, "learning_rate": 0.001, "loss": 3.1234, "num_input_tokens_seen": 6999244800, "step": 26700 }, { "epoch": 0.17993384488357272, "grad_norm": 0.1649375557899475, "learning_rate": 0.001, "loss": 3.1156, "num_input_tokens_seen": 7012352000, "step": 26750 }, { "epoch": 0.18027016982728034, "grad_norm": 0.1829315572977066, "learning_rate": 0.001, "loss": 3.1209, "num_input_tokens_seen": 7025459200, "step": 26800 }, { "epoch": 0.18060649477098795, "grad_norm": 0.182273730635643, "learning_rate": 0.001, "loss": 3.1277, "num_input_tokens_seen": 7038566400, "step": 26850 }, { "epoch": 0.18094281971469556, "grad_norm": 0.1677001416683197, "learning_rate": 0.001, "loss": 3.1132, "num_input_tokens_seen": 7051673600, "step": 26900 }, { "epoch": 0.18127914465840317, "grad_norm": 0.2254924178123474, "learning_rate": 0.001, "loss": 3.1254, "num_input_tokens_seen": 7064780800, "step": 26950 }, { "epoch": 0.18161546960211078, "grad_norm": 0.2008505016565323, "learning_rate": 0.001, "loss": 3.1286, "num_input_tokens_seen": 7077888000, "step": 27000 }, { "epoch": 0.18161546960211078, "eval_loss": 3.0181450843811035, "eval_runtime": 100.7921, "eval_samples_per_second": 49.607, "eval_steps_per_second": 12.402, "num_input_tokens_seen": 7077888000, "step": 27000 }, { "epoch": 0.1819517945458184, "grad_norm": 0.172062486410141, "learning_rate": 0.001, "loss": 3.1125, "num_input_tokens_seen": 7090995200, "step": 27050 }, { "epoch": 0.182288119489526, "grad_norm": 0.17802311480045319, "learning_rate": 0.001, "loss": 3.1145, "num_input_tokens_seen": 7104102400, "step": 27100 }, { "epoch": 0.1826244444332336, "grad_norm": 1.872890591621399, "learning_rate": 0.001, "loss": 3.1203, "num_input_tokens_seen": 7117209600, "step": 27150 }, { "epoch": 0.18296076937694122, "grad_norm": 0.1898074597120285, "learning_rate": 0.001, "loss": 3.1246, "num_input_tokens_seen": 7130316800, "step": 27200 }, { "epoch": 0.18329709432064883, "grad_norm": 0.19125746190547943, "learning_rate": 0.001, "loss": 3.1138, "num_input_tokens_seen": 7143424000, "step": 27250 }, { "epoch": 0.18363341926435645, "grad_norm": 0.17721381783485413, "learning_rate": 0.001, "loss": 3.1181, "num_input_tokens_seen": 7156531200, "step": 27300 }, { "epoch": 0.18396974420806406, "grad_norm": 0.26583337783813477, "learning_rate": 0.001, "loss": 3.1067, "num_input_tokens_seen": 7169638400, "step": 27350 }, { "epoch": 0.18430606915177167, "grad_norm": 0.18157972395420074, "learning_rate": 0.001, "loss": 3.1266, "num_input_tokens_seen": 7182745600, "step": 27400 }, { "epoch": 0.18464239409547928, "grad_norm": 0.17585282027721405, "learning_rate": 0.001, "loss": 3.1124, "num_input_tokens_seen": 7195852800, "step": 27450 }, { "epoch": 0.1849787190391869, "grad_norm": 0.23974797129631042, "learning_rate": 0.001, "loss": 3.1231, "num_input_tokens_seen": 7208960000, "step": 27500 }, { "epoch": 0.1849787190391869, "eval_loss": 3.023569345474243, "eval_runtime": 100.613, "eval_samples_per_second": 49.695, "eval_steps_per_second": 12.424, "num_input_tokens_seen": 7208960000, "step": 27500 }, { "epoch": 0.1853150439828945, "grad_norm": 0.2258712202310562, "learning_rate": 0.001, "loss": 3.1415, "num_input_tokens_seen": 7222067200, "step": 27550 }, { "epoch": 0.18565136892660214, "grad_norm": 0.19764864444732666, "learning_rate": 0.001, "loss": 3.1207, "num_input_tokens_seen": 7235174400, "step": 27600 }, { "epoch": 0.18598769387030975, "grad_norm": 0.20053404569625854, "learning_rate": 0.001, "loss": 3.1252, "num_input_tokens_seen": 7248281600, "step": 27650 }, { "epoch": 0.18632401881401736, "grad_norm": 0.20705857872962952, "learning_rate": 0.001, "loss": 3.1296, "num_input_tokens_seen": 7261388800, "step": 27700 }, { "epoch": 0.18666034375772497, "grad_norm": 0.17856301367282867, "learning_rate": 0.001, "loss": 3.1141, "num_input_tokens_seen": 7274496000, "step": 27750 }, { "epoch": 0.18699666870143258, "grad_norm": 0.28354203701019287, "learning_rate": 0.001, "loss": 3.1263, "num_input_tokens_seen": 7287603200, "step": 27800 }, { "epoch": 0.1873329936451402, "grad_norm": 0.25223788619041443, "learning_rate": 0.001, "loss": 3.122, "num_input_tokens_seen": 7300710400, "step": 27850 }, { "epoch": 0.1876693185888478, "grad_norm": 0.6653568148612976, "learning_rate": 0.001, "loss": 3.1434, "num_input_tokens_seen": 7313817600, "step": 27900 }, { "epoch": 0.18800564353255542, "grad_norm": 0.44238439202308655, "learning_rate": 0.001, "loss": 3.1306, "num_input_tokens_seen": 7326924800, "step": 27950 }, { "epoch": 0.18834196847626303, "grad_norm": 0.2601284980773926, "learning_rate": 0.001, "loss": 3.1399, "num_input_tokens_seen": 7340032000, "step": 28000 }, { "epoch": 0.18834196847626303, "eval_loss": 3.0279700756073, "eval_runtime": 101.2218, "eval_samples_per_second": 49.396, "eval_steps_per_second": 12.349, "num_input_tokens_seen": 7340032000, "step": 28000 }, { "epoch": 0.18867829341997064, "grad_norm": 0.22075016796588898, "learning_rate": 0.001, "loss": 3.1316, "num_input_tokens_seen": 7353139200, "step": 28050 }, { "epoch": 0.18901461836367825, "grad_norm": 0.25096815824508667, "learning_rate": 0.001, "loss": 3.1188, "num_input_tokens_seen": 7366246400, "step": 28100 }, { "epoch": 0.18935094330738586, "grad_norm": 0.1836758553981781, "learning_rate": 0.001, "loss": 3.1159, "num_input_tokens_seen": 7379353600, "step": 28150 }, { "epoch": 0.18968726825109347, "grad_norm": 0.24736745655536652, "learning_rate": 0.001, "loss": 3.1068, "num_input_tokens_seen": 7392460800, "step": 28200 }, { "epoch": 0.19002359319480108, "grad_norm": 0.18398351967334747, "learning_rate": 0.001, "loss": 3.1122, "num_input_tokens_seen": 7405568000, "step": 28250 }, { "epoch": 0.1903599181385087, "grad_norm": 0.2027016431093216, "learning_rate": 0.001, "loss": 3.1086, "num_input_tokens_seen": 7418675200, "step": 28300 }, { "epoch": 0.1906962430822163, "grad_norm": 0.18662536144256592, "learning_rate": 0.001, "loss": 3.1165, "num_input_tokens_seen": 7431782400, "step": 28350 }, { "epoch": 0.19103256802592392, "grad_norm": 0.1824251115322113, "learning_rate": 0.001, "loss": 3.1179, "num_input_tokens_seen": 7444889600, "step": 28400 }, { "epoch": 0.19136889296963153, "grad_norm": 0.17664241790771484, "learning_rate": 0.001, "loss": 3.1114, "num_input_tokens_seen": 7457996800, "step": 28450 }, { "epoch": 0.19170521791333917, "grad_norm": 0.17245933413505554, "learning_rate": 0.001, "loss": 3.1113, "num_input_tokens_seen": 7471104000, "step": 28500 }, { "epoch": 0.19170521791333917, "eval_loss": 3.0132832527160645, "eval_runtime": 100.8874, "eval_samples_per_second": 49.56, "eval_steps_per_second": 12.39, "num_input_tokens_seen": 7471104000, "step": 28500 }, { "epoch": 0.19204154285704678, "grad_norm": 0.20906178653240204, "learning_rate": 0.001, "loss": 3.1229, "num_input_tokens_seen": 7484211200, "step": 28550 }, { "epoch": 0.1923778678007544, "grad_norm": 0.19940024614334106, "learning_rate": 0.001, "loss": 3.1214, "num_input_tokens_seen": 7497318400, "step": 28600 }, { "epoch": 0.192714192744462, "grad_norm": 0.20571599900722504, "learning_rate": 0.001, "loss": 3.1124, "num_input_tokens_seen": 7510425600, "step": 28650 }, { "epoch": 0.1930505176881696, "grad_norm": 0.20708389580249786, "learning_rate": 0.001, "loss": 3.1336, "num_input_tokens_seen": 7523532800, "step": 28700 }, { "epoch": 0.19338684263187722, "grad_norm": 0.22445373237133026, "learning_rate": 0.001, "loss": 3.1157, "num_input_tokens_seen": 7536640000, "step": 28750 }, { "epoch": 0.19372316757558483, "grad_norm": 0.19508902728557587, "learning_rate": 0.001, "loss": 3.1248, "num_input_tokens_seen": 7549747200, "step": 28800 }, { "epoch": 0.19405949251929244, "grad_norm": 0.17587284743785858, "learning_rate": 0.001, "loss": 3.126, "num_input_tokens_seen": 7562854400, "step": 28850 }, { "epoch": 0.19439581746300005, "grad_norm": 0.18851327896118164, "learning_rate": 0.001, "loss": 3.1128, "num_input_tokens_seen": 7575961600, "step": 28900 }, { "epoch": 0.19473214240670766, "grad_norm": 0.1922932267189026, "learning_rate": 0.001, "loss": 3.1244, "num_input_tokens_seen": 7589068800, "step": 28950 }, { "epoch": 0.19506846735041528, "grad_norm": 0.21472705900669098, "learning_rate": 0.001, "loss": 3.1287, "num_input_tokens_seen": 7602176000, "step": 29000 }, { "epoch": 0.19506846735041528, "eval_loss": 3.0183699131011963, "eval_runtime": 100.5473, "eval_samples_per_second": 49.728, "eval_steps_per_second": 12.432, "num_input_tokens_seen": 7602176000, "step": 29000 }, { "epoch": 0.1954047922941229, "grad_norm": 0.4608127474784851, "learning_rate": 0.001, "loss": 3.1279, "num_input_tokens_seen": 7615283200, "step": 29050 }, { "epoch": 0.1957411172378305, "grad_norm": 0.19692400097846985, "learning_rate": 0.001, "loss": 3.1227, "num_input_tokens_seen": 7628390400, "step": 29100 }, { "epoch": 0.1960774421815381, "grad_norm": 0.17576786875724792, "learning_rate": 0.001, "loss": 3.1082, "num_input_tokens_seen": 7641497600, "step": 29150 }, { "epoch": 0.19641376712524572, "grad_norm": 0.3715643584728241, "learning_rate": 0.001, "loss": 3.1136, "num_input_tokens_seen": 7654604800, "step": 29200 }, { "epoch": 0.19675009206895333, "grad_norm": 0.19605587422847748, "learning_rate": 0.001, "loss": 3.1233, "num_input_tokens_seen": 7667712000, "step": 29250 }, { "epoch": 0.19708641701266094, "grad_norm": 0.18321235477924347, "learning_rate": 0.001, "loss": 3.1127, "num_input_tokens_seen": 7680819200, "step": 29300 }, { "epoch": 0.19742274195636855, "grad_norm": 0.19760318100452423, "learning_rate": 0.001, "loss": 3.1074, "num_input_tokens_seen": 7693926400, "step": 29350 }, { "epoch": 0.1977590669000762, "grad_norm": 0.2162732481956482, "learning_rate": 0.001, "loss": 3.1126, "num_input_tokens_seen": 7707033600, "step": 29400 }, { "epoch": 0.1980953918437838, "grad_norm": 0.20082025229930878, "learning_rate": 0.001, "loss": 3.1111, "num_input_tokens_seen": 7720140800, "step": 29450 }, { "epoch": 0.1984317167874914, "grad_norm": 0.1623256504535675, "learning_rate": 0.001, "loss": 3.108, "num_input_tokens_seen": 7733248000, "step": 29500 }, { "epoch": 0.1984317167874914, "eval_loss": 3.0064518451690674, "eval_runtime": 101.1359, "eval_samples_per_second": 49.438, "eval_steps_per_second": 12.36, "num_input_tokens_seen": 7733248000, "step": 29500 }, { "epoch": 0.19876804173119902, "grad_norm": 0.22083748877048492, "learning_rate": 0.001, "loss": 3.1171, "num_input_tokens_seen": 7746355200, "step": 29550 }, { "epoch": 0.19910436667490664, "grad_norm": 0.23460319638252258, "learning_rate": 0.001, "loss": 3.1199, "num_input_tokens_seen": 7759462400, "step": 29600 }, { "epoch": 0.19944069161861425, "grad_norm": 0.22132454812526703, "learning_rate": 0.001, "loss": 3.1058, "num_input_tokens_seen": 7772569600, "step": 29650 }, { "epoch": 0.19977701656232186, "grad_norm": 0.18013770878314972, "learning_rate": 0.001, "loss": 3.1193, "num_input_tokens_seen": 7785676800, "step": 29700 }, { "epoch": 0.20011334150602947, "grad_norm": 0.17522746324539185, "learning_rate": 0.001, "loss": 3.1017, "num_input_tokens_seen": 7798784000, "step": 29750 }, { "epoch": 0.20044966644973708, "grad_norm": 0.18989771604537964, "learning_rate": 0.001, "loss": 3.1029, "num_input_tokens_seen": 7811891200, "step": 29800 }, { "epoch": 0.2007859913934447, "grad_norm": 0.38332825899124146, "learning_rate": 0.001, "loss": 3.11, "num_input_tokens_seen": 7824998400, "step": 29850 }, { "epoch": 0.2011223163371523, "grad_norm": 0.1822918802499771, "learning_rate": 0.001, "loss": 3.1018, "num_input_tokens_seen": 7838105600, "step": 29900 }, { "epoch": 0.2014586412808599, "grad_norm": 0.1707374006509781, "learning_rate": 0.001, "loss": 3.1077, "num_input_tokens_seen": 7851212800, "step": 29950 }, { "epoch": 0.20179496622456752, "grad_norm": 0.32529768347740173, "learning_rate": 0.001, "loss": 3.1074, "num_input_tokens_seen": 7864320000, "step": 30000 }, { "epoch": 0.20179496622456752, "eval_loss": 3.0052828788757324, "eval_runtime": 101.5526, "eval_samples_per_second": 49.236, "eval_steps_per_second": 12.309, "num_input_tokens_seen": 7864320000, "step": 30000 }, { "epoch": 0.20213129116827513, "grad_norm": 0.3476233184337616, "learning_rate": 0.001, "loss": 3.1145, "num_input_tokens_seen": 7877427200, "step": 30050 }, { "epoch": 0.20246761611198275, "grad_norm": 0.18626414239406586, "learning_rate": 0.001, "loss": 3.1066, "num_input_tokens_seen": 7890534400, "step": 30100 }, { "epoch": 0.20280394105569036, "grad_norm": 0.254221647977829, "learning_rate": 0.001, "loss": 3.1092, "num_input_tokens_seen": 7903641600, "step": 30150 }, { "epoch": 0.20314026599939797, "grad_norm": 0.22347959876060486, "learning_rate": 0.001, "loss": 3.1168, "num_input_tokens_seen": 7916748800, "step": 30200 }, { "epoch": 0.20347659094310558, "grad_norm": 0.22947949171066284, "learning_rate": 0.001, "loss": 3.1181, "num_input_tokens_seen": 7929856000, "step": 30250 }, { "epoch": 0.2038129158868132, "grad_norm": 0.18353967368602753, "learning_rate": 0.001, "loss": 3.1154, "num_input_tokens_seen": 7942963200, "step": 30300 }, { "epoch": 0.20414924083052083, "grad_norm": 0.1840088963508606, "learning_rate": 0.001, "loss": 3.1145, "num_input_tokens_seen": 7956070400, "step": 30350 }, { "epoch": 0.20448556577422844, "grad_norm": 0.21340833604335785, "learning_rate": 0.001, "loss": 3.1175, "num_input_tokens_seen": 7969177600, "step": 30400 }, { "epoch": 0.20482189071793605, "grad_norm": 0.22054563462734222, "learning_rate": 0.001, "loss": 3.1082, "num_input_tokens_seen": 7982284800, "step": 30450 }, { "epoch": 0.20515821566164366, "grad_norm": 0.23745377361774445, "learning_rate": 0.001, "loss": 3.1155, "num_input_tokens_seen": 7995392000, "step": 30500 }, { "epoch": 0.20515821566164366, "eval_loss": 3.0057761669158936, "eval_runtime": 111.9719, "eval_samples_per_second": 44.654, "eval_steps_per_second": 11.164, "num_input_tokens_seen": 7995392000, "step": 30500 }, { "epoch": 0.20549454060535127, "grad_norm": 0.18189583718776703, "learning_rate": 0.001, "loss": 3.119, "num_input_tokens_seen": 8008499200, "step": 30550 }, { "epoch": 0.20583086554905888, "grad_norm": 0.23735597729682922, "learning_rate": 0.001, "loss": 3.1114, "num_input_tokens_seen": 8021606400, "step": 30600 }, { "epoch": 0.2061671904927665, "grad_norm": 0.6922980546951294, "learning_rate": 0.001, "loss": 3.1155, "num_input_tokens_seen": 8034713600, "step": 30650 }, { "epoch": 0.2065035154364741, "grad_norm": 0.277959406375885, "learning_rate": 0.001, "loss": 3.113, "num_input_tokens_seen": 8047820800, "step": 30700 }, { "epoch": 0.20683984038018172, "grad_norm": 0.20879347622394562, "learning_rate": 0.001, "loss": 3.1052, "num_input_tokens_seen": 8060928000, "step": 30750 }, { "epoch": 0.20717616532388933, "grad_norm": 0.2380591332912445, "learning_rate": 0.001, "loss": 3.0957, "num_input_tokens_seen": 8074035200, "step": 30800 }, { "epoch": 0.20751249026759694, "grad_norm": 0.19781485199928284, "learning_rate": 0.001, "loss": 3.1043, "num_input_tokens_seen": 8087142400, "step": 30850 }, { "epoch": 0.20784881521130455, "grad_norm": 0.20070037245750427, "learning_rate": 0.001, "loss": 3.1064, "num_input_tokens_seen": 8100249600, "step": 30900 }, { "epoch": 0.20818514015501216, "grad_norm": 0.1823301464319229, "learning_rate": 0.001, "loss": 3.0957, "num_input_tokens_seen": 8113356800, "step": 30950 }, { "epoch": 0.20852146509871977, "grad_norm": 0.21749699115753174, "learning_rate": 0.001, "loss": 3.0952, "num_input_tokens_seen": 8126464000, "step": 31000 }, { "epoch": 0.20852146509871977, "eval_loss": 3.003377676010132, "eval_runtime": 101.0747, "eval_samples_per_second": 49.468, "eval_steps_per_second": 12.367, "num_input_tokens_seen": 8126464000, "step": 31000 }, { "epoch": 0.20885779004242738, "grad_norm": 0.189363494515419, "learning_rate": 0.001, "loss": 3.0965, "num_input_tokens_seen": 8139571200, "step": 31050 }, { "epoch": 0.209194114986135, "grad_norm": 0.22209693491458893, "learning_rate": 0.001, "loss": 3.0999, "num_input_tokens_seen": 8152678400, "step": 31100 }, { "epoch": 0.2095304399298426, "grad_norm": 0.2229391485452652, "learning_rate": 0.001, "loss": 3.1051, "num_input_tokens_seen": 8165785600, "step": 31150 }, { "epoch": 0.20986676487355022, "grad_norm": 0.29246941208839417, "learning_rate": 0.001, "loss": 3.1091, "num_input_tokens_seen": 8178892800, "step": 31200 }, { "epoch": 0.21020308981725785, "grad_norm": 0.20801013708114624, "learning_rate": 0.001, "loss": 3.1203, "num_input_tokens_seen": 8192000000, "step": 31250 }, { "epoch": 0.21053941476096547, "grad_norm": 0.20411519706249237, "learning_rate": 0.001, "loss": 3.0988, "num_input_tokens_seen": 8205107200, "step": 31300 }, { "epoch": 0.21087573970467308, "grad_norm": 0.20220200717449188, "learning_rate": 0.001, "loss": 3.0999, "num_input_tokens_seen": 8218214400, "step": 31350 }, { "epoch": 0.2112120646483807, "grad_norm": 0.2107418328523636, "learning_rate": 0.001, "loss": 3.1052, "num_input_tokens_seen": 8231321600, "step": 31400 }, { "epoch": 0.2115483895920883, "grad_norm": 0.2079913169145584, "learning_rate": 0.001, "loss": 3.1095, "num_input_tokens_seen": 8244428800, "step": 31450 }, { "epoch": 0.2118847145357959, "grad_norm": 0.20990462601184845, "learning_rate": 0.001, "loss": 3.1095, "num_input_tokens_seen": 8257536000, "step": 31500 }, { "epoch": 0.2118847145357959, "eval_loss": 3.002542018890381, "eval_runtime": 102.5542, "eval_samples_per_second": 48.755, "eval_steps_per_second": 12.189, "num_input_tokens_seen": 8257536000, "step": 31500 }, { "epoch": 0.21222103947950352, "grad_norm": 0.3199199140071869, "learning_rate": 0.001, "loss": 3.1193, "num_input_tokens_seen": 8270643200, "step": 31550 }, { "epoch": 0.21255736442321113, "grad_norm": 0.3326248824596405, "learning_rate": 0.001, "loss": 3.1398, "num_input_tokens_seen": 8283750400, "step": 31600 }, { "epoch": 0.21289368936691874, "grad_norm": 0.22599756717681885, "learning_rate": 0.001, "loss": 3.1192, "num_input_tokens_seen": 8296857600, "step": 31650 }, { "epoch": 0.21323001431062635, "grad_norm": 0.24016565084457397, "learning_rate": 0.001, "loss": 3.1208, "num_input_tokens_seen": 8309964800, "step": 31700 }, { "epoch": 0.21356633925433396, "grad_norm": 0.2414240539073944, "learning_rate": 0.001, "loss": 3.1188, "num_input_tokens_seen": 8323072000, "step": 31750 }, { "epoch": 0.21390266419804158, "grad_norm": 0.21480241417884827, "learning_rate": 0.001, "loss": 3.1183, "num_input_tokens_seen": 8336179200, "step": 31800 }, { "epoch": 0.2142389891417492, "grad_norm": 0.23109237849712372, "learning_rate": 0.001, "loss": 3.117, "num_input_tokens_seen": 8349286400, "step": 31850 }, { "epoch": 0.2145753140854568, "grad_norm": 1.8171563148498535, "learning_rate": 0.001, "loss": 3.1133, "num_input_tokens_seen": 8362393600, "step": 31900 }, { "epoch": 0.2149116390291644, "grad_norm": 0.19770418107509613, "learning_rate": 0.001, "loss": 3.1004, "num_input_tokens_seen": 8375500800, "step": 31950 }, { "epoch": 0.21524796397287202, "grad_norm": 0.17420834302902222, "learning_rate": 0.001, "loss": 3.1201, "num_input_tokens_seen": 8388608000, "step": 32000 }, { "epoch": 0.21524796397287202, "eval_loss": 2.999030113220215, "eval_runtime": 101.4799, "eval_samples_per_second": 49.271, "eval_steps_per_second": 12.318, "num_input_tokens_seen": 8388608000, "step": 32000 }, { "epoch": 0.21558428891657963, "grad_norm": 0.17642393708229065, "learning_rate": 0.001, "loss": 3.0933, "num_input_tokens_seen": 8401715200, "step": 32050 }, { "epoch": 0.21592061386028724, "grad_norm": 0.17682494223117828, "learning_rate": 0.001, "loss": 3.0944, "num_input_tokens_seen": 8414822400, "step": 32100 }, { "epoch": 0.21625693880399488, "grad_norm": 0.18188636004924774, "learning_rate": 0.001, "loss": 3.0952, "num_input_tokens_seen": 8427929600, "step": 32150 }, { "epoch": 0.2165932637477025, "grad_norm": 0.19842028617858887, "learning_rate": 0.001, "loss": 3.0952, "num_input_tokens_seen": 8441036800, "step": 32200 }, { "epoch": 0.2169295886914101, "grad_norm": 0.5376595854759216, "learning_rate": 0.001, "loss": 3.0995, "num_input_tokens_seen": 8454144000, "step": 32250 }, { "epoch": 0.2172659136351177, "grad_norm": 0.1966828554868698, "learning_rate": 0.001, "loss": 3.0998, "num_input_tokens_seen": 8467251200, "step": 32300 }, { "epoch": 0.21760223857882532, "grad_norm": 0.16826917231082916, "learning_rate": 0.001, "loss": 3.1058, "num_input_tokens_seen": 8480358400, "step": 32350 }, { "epoch": 0.21793856352253294, "grad_norm": 0.17534971237182617, "learning_rate": 0.001, "loss": 3.0989, "num_input_tokens_seen": 8493465600, "step": 32400 }, { "epoch": 0.21827488846624055, "grad_norm": 0.18534857034683228, "learning_rate": 0.001, "loss": 3.1006, "num_input_tokens_seen": 8506572800, "step": 32450 }, { "epoch": 0.21861121340994816, "grad_norm": 0.23653754591941833, "learning_rate": 0.001, "loss": 3.0979, "num_input_tokens_seen": 8519680000, "step": 32500 }, { "epoch": 0.21861121340994816, "eval_loss": 2.9992752075195312, "eval_runtime": 104.1376, "eval_samples_per_second": 48.013, "eval_steps_per_second": 12.003, "num_input_tokens_seen": 8519680000, "step": 32500 }, { "epoch": 0.21894753835365577, "grad_norm": 0.235910564661026, "learning_rate": 0.001, "loss": 3.0965, "num_input_tokens_seen": 8532787200, "step": 32550 }, { "epoch": 0.21928386329736338, "grad_norm": 0.17582012712955475, "learning_rate": 0.001, "loss": 3.0979, "num_input_tokens_seen": 8545894400, "step": 32600 }, { "epoch": 0.219620188241071, "grad_norm": 0.16854169964790344, "learning_rate": 0.001, "loss": 3.0962, "num_input_tokens_seen": 8559001600, "step": 32650 }, { "epoch": 0.2199565131847786, "grad_norm": 0.20170539617538452, "learning_rate": 0.001, "loss": 3.0967, "num_input_tokens_seen": 8572108800, "step": 32700 }, { "epoch": 0.2202928381284862, "grad_norm": 0.15898527204990387, "learning_rate": 0.001, "loss": 3.0846, "num_input_tokens_seen": 8585216000, "step": 32750 }, { "epoch": 0.22062916307219382, "grad_norm": 0.18423962593078613, "learning_rate": 0.001, "loss": 3.0873, "num_input_tokens_seen": 8598323200, "step": 32800 }, { "epoch": 0.22096548801590143, "grad_norm": 0.22025519609451294, "learning_rate": 0.001, "loss": 3.1034, "num_input_tokens_seen": 8611430400, "step": 32850 }, { "epoch": 0.22130181295960905, "grad_norm": 0.22972916066646576, "learning_rate": 0.001, "loss": 3.1018, "num_input_tokens_seen": 8624537600, "step": 32900 }, { "epoch": 0.22163813790331666, "grad_norm": 0.3072693347930908, "learning_rate": 0.001, "loss": 3.1044, "num_input_tokens_seen": 8637644800, "step": 32950 }, { "epoch": 0.22197446284702427, "grad_norm": 0.16734054684638977, "learning_rate": 0.001, "loss": 3.1079, "num_input_tokens_seen": 8650752000, "step": 33000 }, { "epoch": 0.22197446284702427, "eval_loss": 2.9946696758270264, "eval_runtime": 101.2832, "eval_samples_per_second": 49.367, "eval_steps_per_second": 12.342, "num_input_tokens_seen": 8650752000, "step": 33000 }, { "epoch": 0.2223107877907319, "grad_norm": 0.19366054236888885, "learning_rate": 0.001, "loss": 3.1123, "num_input_tokens_seen": 8663859200, "step": 33050 }, { "epoch": 0.22264711273443952, "grad_norm": 0.1908022165298462, "learning_rate": 0.001, "loss": 3.0921, "num_input_tokens_seen": 8676966400, "step": 33100 }, { "epoch": 0.22298343767814713, "grad_norm": 0.1756322979927063, "learning_rate": 0.001, "loss": 3.0898, "num_input_tokens_seen": 8690073600, "step": 33150 }, { "epoch": 0.22331976262185474, "grad_norm": 0.17791526019573212, "learning_rate": 0.001, "loss": 3.099, "num_input_tokens_seen": 8703180800, "step": 33200 }, { "epoch": 0.22365608756556235, "grad_norm": 0.1831691414117813, "learning_rate": 0.001, "loss": 3.0961, "num_input_tokens_seen": 8716288000, "step": 33250 }, { "epoch": 0.22399241250926996, "grad_norm": 0.21115051209926605, "learning_rate": 0.001, "loss": 3.092, "num_input_tokens_seen": 8729395200, "step": 33300 }, { "epoch": 0.22432873745297757, "grad_norm": 0.2059226632118225, "learning_rate": 0.001, "loss": 3.0982, "num_input_tokens_seen": 8742502400, "step": 33350 }, { "epoch": 0.22466506239668518, "grad_norm": 0.18022479116916656, "learning_rate": 0.001, "loss": 3.0853, "num_input_tokens_seen": 8755609600, "step": 33400 }, { "epoch": 0.2250013873403928, "grad_norm": 0.18534015119075775, "learning_rate": 0.001, "loss": 3.0872, "num_input_tokens_seen": 8768716800, "step": 33450 }, { "epoch": 0.2253377122841004, "grad_norm": 0.1856871247291565, "learning_rate": 0.001, "loss": 3.0888, "num_input_tokens_seen": 8781824000, "step": 33500 }, { "epoch": 0.2253377122841004, "eval_loss": 2.989856243133545, "eval_runtime": 100.552, "eval_samples_per_second": 49.725, "eval_steps_per_second": 12.431, "num_input_tokens_seen": 8781824000, "step": 33500 }, { "epoch": 0.22567403722780802, "grad_norm": 0.1858453005552292, "learning_rate": 0.001, "loss": 3.0859, "num_input_tokens_seen": 8794931200, "step": 33550 }, { "epoch": 0.22601036217151563, "grad_norm": 0.15424181520938873, "learning_rate": 0.001, "loss": 3.094, "num_input_tokens_seen": 8808038400, "step": 33600 }, { "epoch": 0.22634668711522324, "grad_norm": 0.1684613823890686, "learning_rate": 0.001, "loss": 3.0924, "num_input_tokens_seen": 8821145600, "step": 33650 }, { "epoch": 0.22668301205893085, "grad_norm": 0.20759907364845276, "learning_rate": 0.001, "loss": 3.0967, "num_input_tokens_seen": 8834252800, "step": 33700 }, { "epoch": 0.22701933700263846, "grad_norm": 0.20460882782936096, "learning_rate": 0.001, "loss": 3.0938, "num_input_tokens_seen": 8847360000, "step": 33750 }, { "epoch": 0.22735566194634607, "grad_norm": 1.1036453247070312, "learning_rate": 0.001, "loss": 3.092, "num_input_tokens_seen": 8860467200, "step": 33800 }, { "epoch": 0.22769198689005368, "grad_norm": 0.23920981585979462, "learning_rate": 0.001, "loss": 3.1014, "num_input_tokens_seen": 8873574400, "step": 33850 }, { "epoch": 0.2280283118337613, "grad_norm": 0.44639232754707336, "learning_rate": 0.001, "loss": 3.0925, "num_input_tokens_seen": 8886681600, "step": 33900 }, { "epoch": 0.22836463677746893, "grad_norm": 0.20524434745311737, "learning_rate": 0.001, "loss": 3.0998, "num_input_tokens_seen": 8899788800, "step": 33950 }, { "epoch": 0.22870096172117654, "grad_norm": 0.3326469957828522, "learning_rate": 0.001, "loss": 3.1028, "num_input_tokens_seen": 8912896000, "step": 34000 }, { "epoch": 0.22870096172117654, "eval_loss": 2.99273681640625, "eval_runtime": 101.3229, "eval_samples_per_second": 49.347, "eval_steps_per_second": 12.337, "num_input_tokens_seen": 8912896000, "step": 34000 }, { "epoch": 0.22903728666488415, "grad_norm": 0.1811443269252777, "learning_rate": 0.001, "loss": 3.0963, "num_input_tokens_seen": 8926003200, "step": 34050 }, { "epoch": 0.22937361160859177, "grad_norm": 0.1995471566915512, "learning_rate": 0.001, "loss": 3.0929, "num_input_tokens_seen": 8939110400, "step": 34100 }, { "epoch": 0.22970993655229938, "grad_norm": 0.18768733739852905, "learning_rate": 0.001, "loss": 3.088, "num_input_tokens_seen": 8952217600, "step": 34150 }, { "epoch": 0.230046261496007, "grad_norm": 0.23258289694786072, "learning_rate": 0.001, "loss": 3.0952, "num_input_tokens_seen": 8965324800, "step": 34200 }, { "epoch": 0.2303825864397146, "grad_norm": 0.2201758772134781, "learning_rate": 0.001, "loss": 3.105, "num_input_tokens_seen": 8978432000, "step": 34250 }, { "epoch": 0.2307189113834222, "grad_norm": 0.3385378420352936, "learning_rate": 0.001, "loss": 3.1242, "num_input_tokens_seen": 8991539200, "step": 34300 }, { "epoch": 0.23105523632712982, "grad_norm": 0.40026524662971497, "learning_rate": 0.001, "loss": 3.1236, "num_input_tokens_seen": 9004646400, "step": 34350 }, { "epoch": 0.23139156127083743, "grad_norm": 0.9256707429885864, "learning_rate": 0.001, "loss": 3.1417, "num_input_tokens_seen": 9017753600, "step": 34400 }, { "epoch": 0.23172788621454504, "grad_norm": 0.2774488627910614, "learning_rate": 0.001, "loss": 3.1258, "num_input_tokens_seen": 9030860800, "step": 34450 }, { "epoch": 0.23206421115825265, "grad_norm": 0.3596802353858948, "learning_rate": 0.001, "loss": 3.1182, "num_input_tokens_seen": 9043968000, "step": 34500 }, { "epoch": 0.23206421115825265, "eval_loss": 3.0026750564575195, "eval_runtime": 101.1458, "eval_samples_per_second": 49.434, "eval_steps_per_second": 12.358, "num_input_tokens_seen": 9043968000, "step": 34500 }, { "epoch": 0.23240053610196026, "grad_norm": 0.23315957188606262, "learning_rate": 0.001, "loss": 3.0983, "num_input_tokens_seen": 9057075200, "step": 34550 }, { "epoch": 0.23273686104566788, "grad_norm": 0.21506045758724213, "learning_rate": 0.001, "loss": 3.1006, "num_input_tokens_seen": 9070182400, "step": 34600 }, { "epoch": 0.2330731859893755, "grad_norm": 0.23909543454647064, "learning_rate": 0.001, "loss": 3.109, "num_input_tokens_seen": 9083289600, "step": 34650 }, { "epoch": 0.2334095109330831, "grad_norm": 0.31270143389701843, "learning_rate": 0.001, "loss": 3.1062, "num_input_tokens_seen": 9096396800, "step": 34700 }, { "epoch": 0.2337458358767907, "grad_norm": 0.2879350483417511, "learning_rate": 0.001, "loss": 3.1065, "num_input_tokens_seen": 9109504000, "step": 34750 }, { "epoch": 0.23408216082049832, "grad_norm": 0.1994767040014267, "learning_rate": 0.001, "loss": 3.0984, "num_input_tokens_seen": 9122611200, "step": 34800 }, { "epoch": 0.23441848576420596, "grad_norm": 0.19194720685482025, "learning_rate": 0.001, "loss": 3.1003, "num_input_tokens_seen": 9135718400, "step": 34850 }, { "epoch": 0.23475481070791357, "grad_norm": 0.22253084182739258, "learning_rate": 0.001, "loss": 3.0969, "num_input_tokens_seen": 9148825600, "step": 34900 }, { "epoch": 0.23509113565162118, "grad_norm": 0.2180721014738083, "learning_rate": 0.001, "loss": 3.0986, "num_input_tokens_seen": 9161932800, "step": 34950 }, { "epoch": 0.2354274605953288, "grad_norm": 0.1867762804031372, "learning_rate": 0.001, "loss": 3.0831, "num_input_tokens_seen": 9175040000, "step": 35000 }, { "epoch": 0.2354274605953288, "eval_loss": 2.987546920776367, "eval_runtime": 100.7704, "eval_samples_per_second": 49.618, "eval_steps_per_second": 12.404, "num_input_tokens_seen": 9175040000, "step": 35000 }, { "epoch": 0.2357637855390364, "grad_norm": 0.22034546732902527, "learning_rate": 0.001, "loss": 3.0963, "num_input_tokens_seen": 9188147200, "step": 35050 }, { "epoch": 0.236100110482744, "grad_norm": 0.20113885402679443, "learning_rate": 0.001, "loss": 3.0948, "num_input_tokens_seen": 9201254400, "step": 35100 }, { "epoch": 0.23643643542645162, "grad_norm": 0.3071548342704773, "learning_rate": 0.001, "loss": 3.0991, "num_input_tokens_seen": 9214361600, "step": 35150 }, { "epoch": 0.23677276037015924, "grad_norm": 0.19438254833221436, "learning_rate": 0.001, "loss": 3.0931, "num_input_tokens_seen": 9227468800, "step": 35200 }, { "epoch": 0.23710908531386685, "grad_norm": 0.19387036561965942, "learning_rate": 0.001, "loss": 3.0945, "num_input_tokens_seen": 9240576000, "step": 35250 }, { "epoch": 0.23744541025757446, "grad_norm": 0.33751723170280457, "learning_rate": 0.001, "loss": 3.0853, "num_input_tokens_seen": 9253683200, "step": 35300 }, { "epoch": 0.23778173520128207, "grad_norm": 0.20809979736804962, "learning_rate": 0.001, "loss": 3.0886, "num_input_tokens_seen": 9266790400, "step": 35350 }, { "epoch": 0.23811806014498968, "grad_norm": 0.22419853508472443, "learning_rate": 0.001, "loss": 3.0903, "num_input_tokens_seen": 9279897600, "step": 35400 }, { "epoch": 0.2384543850886973, "grad_norm": 0.38772207498550415, "learning_rate": 0.001, "loss": 3.0917, "num_input_tokens_seen": 9293004800, "step": 35450 }, { "epoch": 0.2387907100324049, "grad_norm": 0.26076874136924744, "learning_rate": 0.001, "loss": 3.1019, "num_input_tokens_seen": 9306112000, "step": 35500 }, { "epoch": 0.2387907100324049, "eval_loss": 2.9896371364593506, "eval_runtime": 101.1154, "eval_samples_per_second": 49.448, "eval_steps_per_second": 12.362, "num_input_tokens_seen": 9306112000, "step": 35500 }, { "epoch": 0.2391270349761125, "grad_norm": 0.2551634907722473, "learning_rate": 0.001, "loss": 3.0983, "num_input_tokens_seen": 9319219200, "step": 35550 }, { "epoch": 0.23946335991982012, "grad_norm": 0.25666406750679016, "learning_rate": 0.001, "loss": 3.0973, "num_input_tokens_seen": 9332326400, "step": 35600 }, { "epoch": 0.23979968486352773, "grad_norm": 0.21999171376228333, "learning_rate": 0.001, "loss": 3.1023, "num_input_tokens_seen": 9345433600, "step": 35650 }, { "epoch": 0.24013600980723535, "grad_norm": 0.24575746059417725, "learning_rate": 0.001, "loss": 3.0954, "num_input_tokens_seen": 9358540800, "step": 35700 }, { "epoch": 0.24047233475094298, "grad_norm": 0.23686733841896057, "learning_rate": 0.001, "loss": 3.0939, "num_input_tokens_seen": 9371648000, "step": 35750 }, { "epoch": 0.2408086596946506, "grad_norm": 0.2265947312116623, "learning_rate": 0.001, "loss": 3.0972, "num_input_tokens_seen": 9384755200, "step": 35800 }, { "epoch": 0.2411449846383582, "grad_norm": 0.19846303761005402, "learning_rate": 0.001, "loss": 3.0879, "num_input_tokens_seen": 9397862400, "step": 35850 }, { "epoch": 0.24148130958206582, "grad_norm": 0.3813537657260895, "learning_rate": 0.001, "loss": 3.091, "num_input_tokens_seen": 9410969600, "step": 35900 }, { "epoch": 0.24181763452577343, "grad_norm": 0.21791686117649078, "learning_rate": 0.001, "loss": 3.1126, "num_input_tokens_seen": 9424076800, "step": 35950 }, { "epoch": 0.24215395946948104, "grad_norm": 0.1958397924900055, "learning_rate": 0.001, "loss": 3.0993, "num_input_tokens_seen": 9437184000, "step": 36000 }, { "epoch": 0.24215395946948104, "eval_loss": 2.9876413345336914, "eval_runtime": 100.7038, "eval_samples_per_second": 49.651, "eval_steps_per_second": 12.413, "num_input_tokens_seen": 9437184000, "step": 36000 }, { "epoch": 0.24249028441318865, "grad_norm": 0.18913355469703674, "learning_rate": 0.001, "loss": 3.0824, "num_input_tokens_seen": 9450291200, "step": 36050 }, { "epoch": 0.24282660935689626, "grad_norm": 0.17502999305725098, "learning_rate": 0.001, "loss": 3.0947, "num_input_tokens_seen": 9463398400, "step": 36100 }, { "epoch": 0.24316293430060387, "grad_norm": 0.1844000667333603, "learning_rate": 0.001, "loss": 3.1031, "num_input_tokens_seen": 9476505600, "step": 36150 }, { "epoch": 0.24349925924431148, "grad_norm": 0.21123170852661133, "learning_rate": 0.001, "loss": 3.0917, "num_input_tokens_seen": 9489612800, "step": 36200 }, { "epoch": 0.2438355841880191, "grad_norm": 0.20432326197624207, "learning_rate": 0.001, "loss": 3.0777, "num_input_tokens_seen": 9502720000, "step": 36250 }, { "epoch": 0.2441719091317267, "grad_norm": 0.1782015562057495, "learning_rate": 0.001, "loss": 3.0825, "num_input_tokens_seen": 9515827200, "step": 36300 }, { "epoch": 0.24450823407543432, "grad_norm": 0.17421406507492065, "learning_rate": 0.001, "loss": 3.0936, "num_input_tokens_seen": 9528934400, "step": 36350 }, { "epoch": 0.24484455901914193, "grad_norm": 0.20186392962932587, "learning_rate": 0.001, "loss": 3.0873, "num_input_tokens_seen": 9542041600, "step": 36400 }, { "epoch": 0.24518088396284954, "grad_norm": 0.8788098692893982, "learning_rate": 0.001, "loss": 3.0859, "num_input_tokens_seen": 9555148800, "step": 36450 }, { "epoch": 0.24551720890655715, "grad_norm": 0.21201574802398682, "learning_rate": 0.001, "loss": 3.0801, "num_input_tokens_seen": 9568256000, "step": 36500 }, { "epoch": 0.24551720890655715, "eval_loss": 2.9814889430999756, "eval_runtime": 102.1027, "eval_samples_per_second": 48.97, "eval_steps_per_second": 12.243, "num_input_tokens_seen": 9568256000, "step": 36500 }, { "epoch": 0.24585353385026476, "grad_norm": 0.19073808193206787, "learning_rate": 0.001, "loss": 3.0944, "num_input_tokens_seen": 9581363200, "step": 36550 }, { "epoch": 0.24618985879397237, "grad_norm": 0.17367486655712128, "learning_rate": 0.001, "loss": 3.0892, "num_input_tokens_seen": 9594470400, "step": 36600 }, { "epoch": 0.24652618373768, "grad_norm": 0.24065230786800385, "learning_rate": 0.001, "loss": 3.092, "num_input_tokens_seen": 9607577600, "step": 36650 }, { "epoch": 0.24686250868138762, "grad_norm": 0.18443045020103455, "learning_rate": 0.001, "loss": 3.0863, "num_input_tokens_seen": 9620684800, "step": 36700 }, { "epoch": 0.24719883362509523, "grad_norm": 0.2121111899614334, "learning_rate": 0.001, "loss": 3.0902, "num_input_tokens_seen": 9633792000, "step": 36750 }, { "epoch": 0.24753515856880284, "grad_norm": 0.17981579899787903, "learning_rate": 0.001, "loss": 3.0888, "num_input_tokens_seen": 9646899200, "step": 36800 }, { "epoch": 0.24787148351251045, "grad_norm": 0.24683868885040283, "learning_rate": 0.001, "loss": 3.0895, "num_input_tokens_seen": 9660006400, "step": 36850 }, { "epoch": 0.24820780845621807, "grad_norm": 0.17905527353286743, "learning_rate": 0.001, "loss": 3.0771, "num_input_tokens_seen": 9673113600, "step": 36900 }, { "epoch": 0.24854413339992568, "grad_norm": 0.4657650589942932, "learning_rate": 0.001, "loss": 3.093, "num_input_tokens_seen": 9686220800, "step": 36950 }, { "epoch": 0.2488804583436333, "grad_norm": 0.2079911082983017, "learning_rate": 0.001, "loss": 3.0913, "num_input_tokens_seen": 9699328000, "step": 37000 }, { "epoch": 0.2488804583436333, "eval_loss": 2.984112024307251, "eval_runtime": 100.7836, "eval_samples_per_second": 49.611, "eval_steps_per_second": 12.403, "num_input_tokens_seen": 9699328000, "step": 37000 }, { "epoch": 0.2492167832873409, "grad_norm": 0.18588006496429443, "learning_rate": 0.001, "loss": 3.086, "num_input_tokens_seen": 9712435200, "step": 37050 }, { "epoch": 0.2495531082310485, "grad_norm": 0.500970721244812, "learning_rate": 0.001, "loss": 3.0786, "num_input_tokens_seen": 9725542400, "step": 37100 }, { "epoch": 0.24988943317475612, "grad_norm": 0.20005236566066742, "learning_rate": 0.001, "loss": 3.0797, "num_input_tokens_seen": 9738649600, "step": 37150 }, { "epoch": 0.25022575811846376, "grad_norm": 0.1864924281835556, "learning_rate": 0.001, "loss": 3.076, "num_input_tokens_seen": 9751756800, "step": 37200 }, { "epoch": 0.25056208306217137, "grad_norm": 0.19927112758159637, "learning_rate": 0.001, "loss": 3.083, "num_input_tokens_seen": 9764864000, "step": 37250 }, { "epoch": 0.250898408005879, "grad_norm": 0.18902507424354553, "learning_rate": 0.001, "loss": 3.0885, "num_input_tokens_seen": 9777971200, "step": 37300 }, { "epoch": 0.2512347329495866, "grad_norm": 0.19465987384319305, "learning_rate": 0.001, "loss": 3.0826, "num_input_tokens_seen": 9791078400, "step": 37350 }, { "epoch": 0.2515710578932942, "grad_norm": 0.2374599725008011, "learning_rate": 0.001, "loss": 3.0796, "num_input_tokens_seen": 9804185600, "step": 37400 }, { "epoch": 0.2519073828370018, "grad_norm": 0.201645627617836, "learning_rate": 0.001, "loss": 3.0856, "num_input_tokens_seen": 9817292800, "step": 37450 }, { "epoch": 0.2522437077807094, "grad_norm": 0.5505014061927795, "learning_rate": 0.001, "loss": 3.1105, "num_input_tokens_seen": 9830400000, "step": 37500 }, { "epoch": 0.2522437077807094, "eval_loss": 2.9955086708068848, "eval_runtime": 51.2641, "eval_samples_per_second": 97.534, "eval_steps_per_second": 24.384, "num_input_tokens_seen": 9830400000, "step": 37500 }, { "epoch": 0.25258003272441704, "grad_norm": 0.5572711229324341, "learning_rate": 0.001, "loss": 3.0967, "num_input_tokens_seen": 9843507200, "step": 37550 }, { "epoch": 0.25291635766812465, "grad_norm": 0.25361862778663635, "learning_rate": 0.001, "loss": 3.1296, "num_input_tokens_seen": 9856614400, "step": 37600 }, { "epoch": 0.25325268261183226, "grad_norm": 0.24185167253017426, "learning_rate": 0.001, "loss": 3.1003, "num_input_tokens_seen": 9869721600, "step": 37650 }, { "epoch": 0.25358900755553987, "grad_norm": 0.2068016678094864, "learning_rate": 0.001, "loss": 3.088, "num_input_tokens_seen": 9882828800, "step": 37700 }, { "epoch": 0.2539253324992475, "grad_norm": 0.2029482126235962, "learning_rate": 0.001, "loss": 3.0878, "num_input_tokens_seen": 9895936000, "step": 37750 }, { "epoch": 0.2542616574429551, "grad_norm": 0.22508949041366577, "learning_rate": 0.001, "loss": 3.0992, "num_input_tokens_seen": 9909043200, "step": 37800 }, { "epoch": 0.2545979823866627, "grad_norm": 0.19577881693840027, "learning_rate": 0.001, "loss": 3.0866, "num_input_tokens_seen": 9922150400, "step": 37850 }, { "epoch": 0.2549343073303703, "grad_norm": 0.815874457359314, "learning_rate": 0.001, "loss": 3.0858, "num_input_tokens_seen": 9935257600, "step": 37900 }, { "epoch": 0.2552706322740779, "grad_norm": 0.20485574007034302, "learning_rate": 0.001, "loss": 3.0943, "num_input_tokens_seen": 9948364800, "step": 37950 }, { "epoch": 0.25560695721778554, "grad_norm": 0.23158146440982819, "learning_rate": 0.001, "loss": 3.0926, "num_input_tokens_seen": 9961472000, "step": 38000 }, { "epoch": 0.25560695721778554, "eval_loss": 2.9854347705841064, "eval_runtime": 51.4615, "eval_samples_per_second": 97.16, "eval_steps_per_second": 24.29, "num_input_tokens_seen": 9961472000, "step": 38000 }, { "epoch": 0.25594328216149315, "grad_norm": 0.32355332374572754, "learning_rate": 0.001, "loss": 3.0965, "num_input_tokens_seen": 9974579200, "step": 38050 }, { "epoch": 0.25627960710520076, "grad_norm": 0.20291878283023834, "learning_rate": 0.001, "loss": 3.0803, "num_input_tokens_seen": 9987686400, "step": 38100 }, { "epoch": 0.25661593204890837, "grad_norm": 0.17536096274852753, "learning_rate": 0.001, "loss": 3.0765, "num_input_tokens_seen": 10000793600, "step": 38150 }, { "epoch": 0.256952256992616, "grad_norm": 0.17826804518699646, "learning_rate": 0.001, "loss": 3.0735, "num_input_tokens_seen": 10013900800, "step": 38200 }, { "epoch": 0.2572885819363236, "grad_norm": 0.20115964114665985, "learning_rate": 0.001, "loss": 3.0813, "num_input_tokens_seen": 10027008000, "step": 38250 }, { "epoch": 0.2576249068800312, "grad_norm": 0.23634804785251617, "learning_rate": 0.001, "loss": 3.0821, "num_input_tokens_seen": 10040115200, "step": 38300 }, { "epoch": 0.2579612318237388, "grad_norm": 0.31893596053123474, "learning_rate": 0.001, "loss": 3.096, "num_input_tokens_seen": 10053222400, "step": 38350 }, { "epoch": 0.2582975567674464, "grad_norm": 0.21891412138938904, "learning_rate": 0.001, "loss": 3.0904, "num_input_tokens_seen": 10066329600, "step": 38400 }, { "epoch": 0.25863388171115403, "grad_norm": 0.21848681569099426, "learning_rate": 0.001, "loss": 3.0793, "num_input_tokens_seen": 10079436800, "step": 38450 }, { "epoch": 0.25897020665486165, "grad_norm": 0.2052360624074936, "learning_rate": 0.001, "loss": 3.0802, "num_input_tokens_seen": 10092544000, "step": 38500 }, { "epoch": 0.25897020665486165, "eval_loss": 2.9803249835968018, "eval_runtime": 51.5379, "eval_samples_per_second": 97.016, "eval_steps_per_second": 24.254, "num_input_tokens_seen": 10092544000, "step": 38500 }, { "epoch": 0.25930653159856926, "grad_norm": 0.23162005841732025, "learning_rate": 0.001, "loss": 3.0876, "num_input_tokens_seen": 10105651200, "step": 38550 }, { "epoch": 0.25964285654227687, "grad_norm": 0.23110276460647583, "learning_rate": 0.001, "loss": 3.0745, "num_input_tokens_seen": 10118758400, "step": 38600 }, { "epoch": 0.2599791814859845, "grad_norm": 0.22557710111141205, "learning_rate": 0.001, "loss": 3.0716, "num_input_tokens_seen": 10131865600, "step": 38650 }, { "epoch": 0.2603155064296921, "grad_norm": 0.19009199738502502, "learning_rate": 0.001, "loss": 3.0765, "num_input_tokens_seen": 10144972800, "step": 38700 }, { "epoch": 0.2606518313733997, "grad_norm": 0.2352983057498932, "learning_rate": 0.001, "loss": 3.0834, "num_input_tokens_seen": 10158080000, "step": 38750 }, { "epoch": 0.2609881563171073, "grad_norm": 0.1986854076385498, "learning_rate": 0.001, "loss": 3.084, "num_input_tokens_seen": 10171187200, "step": 38800 }, { "epoch": 0.2613244812608149, "grad_norm": 1.6988344192504883, "learning_rate": 0.001, "loss": 3.0994, "num_input_tokens_seen": 10184294400, "step": 38850 }, { "epoch": 0.2616608062045226, "grad_norm": 0.2794356346130371, "learning_rate": 0.001, "loss": 3.1265, "num_input_tokens_seen": 10197401600, "step": 38900 }, { "epoch": 0.2619971311482302, "grad_norm": 0.2307969629764557, "learning_rate": 0.001, "loss": 3.1015, "num_input_tokens_seen": 10210508800, "step": 38950 }, { "epoch": 0.2623334560919378, "grad_norm": 0.29501858353614807, "learning_rate": 0.001, "loss": 3.0881, "num_input_tokens_seen": 10223616000, "step": 39000 }, { "epoch": 0.2623334560919378, "eval_loss": 2.9856879711151123, "eval_runtime": 51.7163, "eval_samples_per_second": 96.681, "eval_steps_per_second": 24.17, "num_input_tokens_seen": 10223616000, "step": 39000 }, { "epoch": 0.2626697810356454, "grad_norm": 0.288418710231781, "learning_rate": 0.001, "loss": 3.0901, "num_input_tokens_seen": 10236723200, "step": 39050 }, { "epoch": 0.26300610597935303, "grad_norm": 0.2238176465034485, "learning_rate": 0.001, "loss": 3.0869, "num_input_tokens_seen": 10249830400, "step": 39100 }, { "epoch": 0.26334243092306064, "grad_norm": 0.21013687551021576, "learning_rate": 0.001, "loss": 3.0821, "num_input_tokens_seen": 10262937600, "step": 39150 }, { "epoch": 0.26367875586676826, "grad_norm": 0.20650598406791687, "learning_rate": 0.001, "loss": 3.0924, "num_input_tokens_seen": 10276044800, "step": 39200 }, { "epoch": 0.26401508081047587, "grad_norm": 1.1471627950668335, "learning_rate": 0.001, "loss": 3.0719, "num_input_tokens_seen": 10289152000, "step": 39250 }, { "epoch": 0.2643514057541835, "grad_norm": 0.2899995446205139, "learning_rate": 0.001, "loss": 3.0901, "num_input_tokens_seen": 10302259200, "step": 39300 }, { "epoch": 0.2646877306978911, "grad_norm": 0.25812703371047974, "learning_rate": 0.001, "loss": 3.088, "num_input_tokens_seen": 10315366400, "step": 39350 }, { "epoch": 0.2650240556415987, "grad_norm": 0.36547353863716125, "learning_rate": 0.001, "loss": 3.0782, "num_input_tokens_seen": 10328473600, "step": 39400 }, { "epoch": 0.2653603805853063, "grad_norm": 0.41187551617622375, "learning_rate": 0.001, "loss": 3.0991, "num_input_tokens_seen": 10341580800, "step": 39450 }, { "epoch": 0.2656967055290139, "grad_norm": 0.2279098927974701, "learning_rate": 0.001, "loss": 3.083, "num_input_tokens_seen": 10354688000, "step": 39500 }, { "epoch": 0.2656967055290139, "eval_loss": 2.9809067249298096, "eval_runtime": 51.2151, "eval_samples_per_second": 97.627, "eval_steps_per_second": 24.407, "num_input_tokens_seen": 10354688000, "step": 39500 }, { "epoch": 0.26603303047272153, "grad_norm": 0.2444402426481247, "learning_rate": 0.001, "loss": 3.0887, "num_input_tokens_seen": 10367795200, "step": 39550 }, { "epoch": 0.26636935541642914, "grad_norm": 0.24601753056049347, "learning_rate": 0.001, "loss": 3.0802, "num_input_tokens_seen": 10380902400, "step": 39600 }, { "epoch": 0.26670568036013675, "grad_norm": 0.21487103402614594, "learning_rate": 0.001, "loss": 3.0777, "num_input_tokens_seen": 10394009600, "step": 39650 }, { "epoch": 0.26704200530384437, "grad_norm": 0.21092857420444489, "learning_rate": 0.001, "loss": 3.083, "num_input_tokens_seen": 10407116800, "step": 39700 }, { "epoch": 0.267378330247552, "grad_norm": 0.21346789598464966, "learning_rate": 0.001, "loss": 3.0946, "num_input_tokens_seen": 10420224000, "step": 39750 }, { "epoch": 0.2677146551912596, "grad_norm": 0.3449369966983795, "learning_rate": 0.001, "loss": 3.0827, "num_input_tokens_seen": 10433331200, "step": 39800 }, { "epoch": 0.2680509801349672, "grad_norm": 0.33765944838523865, "learning_rate": 0.001, "loss": 3.0886, "num_input_tokens_seen": 10446438400, "step": 39850 }, { "epoch": 0.2683873050786748, "grad_norm": 0.2361810952425003, "learning_rate": 0.001, "loss": 3.0902, "num_input_tokens_seen": 10459545600, "step": 39900 }, { "epoch": 0.2687236300223824, "grad_norm": 0.2562389373779297, "learning_rate": 0.001, "loss": 3.0958, "num_input_tokens_seen": 10472652800, "step": 39950 }, { "epoch": 0.26905995496609003, "grad_norm": 0.20619696378707886, "learning_rate": 0.001, "loss": 3.0904, "num_input_tokens_seen": 10485760000, "step": 40000 }, { "epoch": 0.26905995496609003, "eval_loss": 2.9785397052764893, "eval_runtime": 51.4486, "eval_samples_per_second": 97.184, "eval_steps_per_second": 24.296, "num_input_tokens_seen": 10485760000, "step": 40000 }, { "epoch": 0.26939627990979764, "grad_norm": 0.775861382484436, "learning_rate": 0.001, "loss": 3.0856, "num_input_tokens_seen": 10498867200, "step": 40050 }, { "epoch": 0.26973260485350525, "grad_norm": 0.2422339916229248, "learning_rate": 0.001, "loss": 3.0831, "num_input_tokens_seen": 10511974400, "step": 40100 }, { "epoch": 0.27006892979721286, "grad_norm": 0.21582826972007751, "learning_rate": 0.001, "loss": 3.0735, "num_input_tokens_seen": 10525081600, "step": 40150 }, { "epoch": 0.2704052547409205, "grad_norm": 0.17632398009300232, "learning_rate": 0.001, "loss": 3.0878, "num_input_tokens_seen": 10538188800, "step": 40200 }, { "epoch": 0.2707415796846281, "grad_norm": 0.2152569591999054, "learning_rate": 0.001, "loss": 3.0766, "num_input_tokens_seen": 10551296000, "step": 40250 }, { "epoch": 0.2710779046283357, "grad_norm": 0.19657547771930695, "learning_rate": 0.001, "loss": 3.0674, "num_input_tokens_seen": 10564403200, "step": 40300 }, { "epoch": 0.2714142295720433, "grad_norm": 0.21468690037727356, "learning_rate": 0.001, "loss": 3.0746, "num_input_tokens_seen": 10577510400, "step": 40350 }, { "epoch": 0.2717505545157509, "grad_norm": 0.19693616032600403, "learning_rate": 0.001, "loss": 3.0795, "num_input_tokens_seen": 10590617600, "step": 40400 }, { "epoch": 0.27208687945945853, "grad_norm": 0.21241678297519684, "learning_rate": 0.001, "loss": 3.0843, "num_input_tokens_seen": 10603724800, "step": 40450 }, { "epoch": 0.27242320440316614, "grad_norm": 0.2053707093000412, "learning_rate": 0.001, "loss": 3.0857, "num_input_tokens_seen": 10616832000, "step": 40500 }, { "epoch": 0.27242320440316614, "eval_loss": 2.9741692543029785, "eval_runtime": 51.4287, "eval_samples_per_second": 97.222, "eval_steps_per_second": 24.305, "num_input_tokens_seen": 10616832000, "step": 40500 }, { "epoch": 0.27275952934687375, "grad_norm": 0.18715134263038635, "learning_rate": 0.001, "loss": 3.0699, "num_input_tokens_seen": 10629939200, "step": 40550 }, { "epoch": 0.27309585429058136, "grad_norm": 0.20241226255893707, "learning_rate": 0.001, "loss": 3.0794, "num_input_tokens_seen": 10643046400, "step": 40600 }, { "epoch": 0.273432179234289, "grad_norm": 0.19430743157863617, "learning_rate": 0.001, "loss": 3.0768, "num_input_tokens_seen": 10656153600, "step": 40650 }, { "epoch": 0.2737685041779966, "grad_norm": 0.18806929886341095, "learning_rate": 0.001, "loss": 3.0778, "num_input_tokens_seen": 10669260800, "step": 40700 }, { "epoch": 0.27410482912170425, "grad_norm": 0.20148856937885284, "learning_rate": 0.001, "loss": 3.0768, "num_input_tokens_seen": 10682368000, "step": 40750 }, { "epoch": 0.27444115406541186, "grad_norm": 0.17226669192314148, "learning_rate": 0.001, "loss": 3.0621, "num_input_tokens_seen": 10695475200, "step": 40800 }, { "epoch": 0.2747774790091195, "grad_norm": 0.18090015649795532, "learning_rate": 0.001, "loss": 3.0693, "num_input_tokens_seen": 10708582400, "step": 40850 }, { "epoch": 0.2751138039528271, "grad_norm": 0.177758127450943, "learning_rate": 0.001, "loss": 3.0719, "num_input_tokens_seen": 10721689600, "step": 40900 }, { "epoch": 0.2754501288965347, "grad_norm": 0.18323731422424316, "learning_rate": 0.001, "loss": 3.0662, "num_input_tokens_seen": 10734796800, "step": 40950 }, { "epoch": 0.2757864538402423, "grad_norm": 0.1883731186389923, "learning_rate": 0.001, "loss": 3.0675, "num_input_tokens_seen": 10747904000, "step": 41000 }, { "epoch": 0.2757864538402423, "eval_loss": 2.9687845706939697, "eval_runtime": 51.0712, "eval_samples_per_second": 97.902, "eval_steps_per_second": 24.476, "num_input_tokens_seen": 10747904000, "step": 41000 }, { "epoch": 0.2761227787839499, "grad_norm": 0.26252472400665283, "learning_rate": 0.001, "loss": 3.0668, "num_input_tokens_seen": 10761011200, "step": 41050 }, { "epoch": 0.27645910372765753, "grad_norm": 0.2921096086502075, "learning_rate": 0.001, "loss": 3.0817, "num_input_tokens_seen": 10774118400, "step": 41100 }, { "epoch": 0.27679542867136514, "grad_norm": 0.2732321619987488, "learning_rate": 0.001, "loss": 3.0918, "num_input_tokens_seen": 10787225600, "step": 41150 }, { "epoch": 0.27713175361507275, "grad_norm": 0.23650842905044556, "learning_rate": 0.001, "loss": 3.0767, "num_input_tokens_seen": 10800332800, "step": 41200 }, { "epoch": 0.27746807855878036, "grad_norm": 0.36582860350608826, "learning_rate": 0.001, "loss": 3.097, "num_input_tokens_seen": 10813440000, "step": 41250 }, { "epoch": 0.277804403502488, "grad_norm": 0.20505377650260925, "learning_rate": 0.001, "loss": 3.0776, "num_input_tokens_seen": 10826547200, "step": 41300 }, { "epoch": 0.2781407284461956, "grad_norm": 0.22332334518432617, "learning_rate": 0.001, "loss": 3.0822, "num_input_tokens_seen": 10839654400, "step": 41350 }, { "epoch": 0.2784770533899032, "grad_norm": 0.20054738223552704, "learning_rate": 0.001, "loss": 3.0756, "num_input_tokens_seen": 10852761600, "step": 41400 }, { "epoch": 0.2788133783336108, "grad_norm": 0.1935984194278717, "learning_rate": 0.001, "loss": 3.0772, "num_input_tokens_seen": 10865868800, "step": 41450 }, { "epoch": 0.2791497032773184, "grad_norm": 0.21076564490795135, "learning_rate": 0.001, "loss": 3.0733, "num_input_tokens_seen": 10878976000, "step": 41500 }, { "epoch": 0.2791497032773184, "eval_loss": 2.969437837600708, "eval_runtime": 51.3634, "eval_samples_per_second": 97.346, "eval_steps_per_second": 24.336, "num_input_tokens_seen": 10878976000, "step": 41500 }, { "epoch": 0.27948602822102603, "grad_norm": 0.19656164944171906, "learning_rate": 0.001, "loss": 3.0705, "num_input_tokens_seen": 10892083200, "step": 41550 }, { "epoch": 0.27982235316473364, "grad_norm": 0.20582802593708038, "learning_rate": 0.001, "loss": 3.0847, "num_input_tokens_seen": 10905190400, "step": 41600 }, { "epoch": 0.28015867810844125, "grad_norm": 0.1748686581850052, "learning_rate": 0.001, "loss": 3.0612, "num_input_tokens_seen": 10918297600, "step": 41650 }, { "epoch": 0.28049500305214886, "grad_norm": 0.1908082365989685, "learning_rate": 0.001, "loss": 3.0673, "num_input_tokens_seen": 10931404800, "step": 41700 }, { "epoch": 0.2808313279958565, "grad_norm": 0.20851461589336395, "learning_rate": 0.001, "loss": 3.0724, "num_input_tokens_seen": 10944512000, "step": 41750 }, { "epoch": 0.2811676529395641, "grad_norm": 0.18114647269248962, "learning_rate": 0.001, "loss": 3.069, "num_input_tokens_seen": 10957619200, "step": 41800 }, { "epoch": 0.2815039778832717, "grad_norm": 0.19079278409481049, "learning_rate": 0.001, "loss": 3.0658, "num_input_tokens_seen": 10970726400, "step": 41850 }, { "epoch": 0.2818403028269793, "grad_norm": 0.16630959510803223, "learning_rate": 0.001, "loss": 3.0589, "num_input_tokens_seen": 10983833600, "step": 41900 }, { "epoch": 0.2821766277706869, "grad_norm": 0.1777983158826828, "learning_rate": 0.001, "loss": 3.0642, "num_input_tokens_seen": 10996940800, "step": 41950 }, { "epoch": 0.2825129527143945, "grad_norm": 0.36513733863830566, "learning_rate": 0.001, "loss": 3.0685, "num_input_tokens_seen": 11010048000, "step": 42000 }, { "epoch": 0.2825129527143945, "eval_loss": 2.9688751697540283, "eval_runtime": 51.3924, "eval_samples_per_second": 97.291, "eval_steps_per_second": 24.323, "num_input_tokens_seen": 11010048000, "step": 42000 }, { "epoch": 0.28284927765810214, "grad_norm": 0.36728134751319885, "learning_rate": 0.001, "loss": 3.1317, "num_input_tokens_seen": 11023155200, "step": 42050 }, { "epoch": 0.28318560260180975, "grad_norm": 0.27042749524116516, "learning_rate": 0.001, "loss": 3.0815, "num_input_tokens_seen": 11036262400, "step": 42100 }, { "epoch": 0.28352192754551736, "grad_norm": 0.18322569131851196, "learning_rate": 0.001, "loss": 3.1027, "num_input_tokens_seen": 11049369600, "step": 42150 }, { "epoch": 0.28385825248922497, "grad_norm": 0.21849602460861206, "learning_rate": 0.001, "loss": 3.071, "num_input_tokens_seen": 11062476800, "step": 42200 }, { "epoch": 0.2841945774329326, "grad_norm": 0.38253432512283325, "learning_rate": 0.001, "loss": 3.0898, "num_input_tokens_seen": 11075584000, "step": 42250 }, { "epoch": 0.2845309023766402, "grad_norm": 0.21137705445289612, "learning_rate": 0.001, "loss": 3.0813, "num_input_tokens_seen": 11088691200, "step": 42300 }, { "epoch": 0.2848672273203478, "grad_norm": 1.2625150680541992, "learning_rate": 0.001, "loss": 3.0799, "num_input_tokens_seen": 11101798400, "step": 42350 }, { "epoch": 0.2852035522640554, "grad_norm": 0.2312895804643631, "learning_rate": 0.001, "loss": 3.0877, "num_input_tokens_seen": 11114905600, "step": 42400 }, { "epoch": 0.285539877207763, "grad_norm": 0.2823241353034973, "learning_rate": 0.001, "loss": 3.0733, "num_input_tokens_seen": 11128012800, "step": 42450 }, { "epoch": 0.28587620215147064, "grad_norm": 0.21215762197971344, "learning_rate": 0.001, "loss": 3.0798, "num_input_tokens_seen": 11141120000, "step": 42500 }, { "epoch": 0.28587620215147064, "eval_loss": 2.9728448390960693, "eval_runtime": 51.3574, "eval_samples_per_second": 97.357, "eval_steps_per_second": 24.339, "num_input_tokens_seen": 11141120000, "step": 42500 }, { "epoch": 0.2862125270951783, "grad_norm": 0.216337651014328, "learning_rate": 0.001, "loss": 3.0786, "num_input_tokens_seen": 11154227200, "step": 42550 }, { "epoch": 0.2865488520388859, "grad_norm": 0.2741137444972992, "learning_rate": 0.001, "loss": 3.0814, "num_input_tokens_seen": 11167334400, "step": 42600 }, { "epoch": 0.2868851769825935, "grad_norm": 0.22545816004276276, "learning_rate": 0.001, "loss": 3.0808, "num_input_tokens_seen": 11180441600, "step": 42650 }, { "epoch": 0.28722150192630114, "grad_norm": 0.2085258960723877, "learning_rate": 0.001, "loss": 3.0696, "num_input_tokens_seen": 11193548800, "step": 42700 }, { "epoch": 0.28755782687000875, "grad_norm": 0.25214484333992004, "learning_rate": 0.001, "loss": 3.0787, "num_input_tokens_seen": 11206656000, "step": 42750 }, { "epoch": 0.28789415181371636, "grad_norm": 0.23247800767421722, "learning_rate": 0.001, "loss": 3.0718, "num_input_tokens_seen": 11219763200, "step": 42800 }, { "epoch": 0.28823047675742397, "grad_norm": 0.22879773378372192, "learning_rate": 0.001, "loss": 3.0709, "num_input_tokens_seen": 11232870400, "step": 42850 }, { "epoch": 0.2885668017011316, "grad_norm": 0.19782567024230957, "learning_rate": 0.001, "loss": 3.07, "num_input_tokens_seen": 11245977600, "step": 42900 }, { "epoch": 0.2889031266448392, "grad_norm": 0.2164357751607895, "learning_rate": 0.001, "loss": 3.0614, "num_input_tokens_seen": 11259084800, "step": 42950 }, { "epoch": 0.2892394515885468, "grad_norm": 0.2364019900560379, "learning_rate": 0.001, "loss": 3.071, "num_input_tokens_seen": 11272192000, "step": 43000 }, { "epoch": 0.2892394515885468, "eval_loss": 2.969633102416992, "eval_runtime": 51.5112, "eval_samples_per_second": 97.066, "eval_steps_per_second": 24.267, "num_input_tokens_seen": 11272192000, "step": 43000 }, { "epoch": 0.2895757765322544, "grad_norm": 0.2350464016199112, "learning_rate": 0.001, "loss": 3.0636, "num_input_tokens_seen": 11285299200, "step": 43050 }, { "epoch": 0.289912101475962, "grad_norm": 0.2345239371061325, "learning_rate": 0.001, "loss": 3.0764, "num_input_tokens_seen": 11298406400, "step": 43100 }, { "epoch": 0.29024842641966964, "grad_norm": 0.2316889762878418, "learning_rate": 0.001, "loss": 3.0799, "num_input_tokens_seen": 11311513600, "step": 43150 }, { "epoch": 0.29058475136337725, "grad_norm": 0.17912918329238892, "learning_rate": 0.001, "loss": 3.0822, "num_input_tokens_seen": 11324620800, "step": 43200 }, { "epoch": 0.29092107630708486, "grad_norm": 0.18596945703029633, "learning_rate": 0.001, "loss": 3.0599, "num_input_tokens_seen": 11337728000, "step": 43250 }, { "epoch": 0.29125740125079247, "grad_norm": 0.20373962819576263, "learning_rate": 0.001, "loss": 3.0668, "num_input_tokens_seen": 11350835200, "step": 43300 }, { "epoch": 0.2915937261945001, "grad_norm": 0.17628611624240875, "learning_rate": 0.001, "loss": 3.0642, "num_input_tokens_seen": 11363942400, "step": 43350 }, { "epoch": 0.2919300511382077, "grad_norm": 0.17134816944599152, "learning_rate": 0.001, "loss": 3.0711, "num_input_tokens_seen": 11377049600, "step": 43400 }, { "epoch": 0.2922663760819153, "grad_norm": 0.18842625617980957, "learning_rate": 0.001, "loss": 3.0656, "num_input_tokens_seen": 11390156800, "step": 43450 }, { "epoch": 0.2926027010256229, "grad_norm": 0.20406392216682434, "learning_rate": 0.001, "loss": 3.0664, "num_input_tokens_seen": 11403264000, "step": 43500 }, { "epoch": 0.2926027010256229, "eval_loss": 2.967707872390747, "eval_runtime": 50.7844, "eval_samples_per_second": 98.455, "eval_steps_per_second": 24.614, "num_input_tokens_seen": 11403264000, "step": 43500 }, { "epoch": 0.2929390259693305, "grad_norm": 0.1807146519422531, "learning_rate": 0.001, "loss": 3.0722, "num_input_tokens_seen": 11416371200, "step": 43550 }, { "epoch": 0.29327535091303814, "grad_norm": 0.18754515051841736, "learning_rate": 0.001, "loss": 3.0705, "num_input_tokens_seen": 11429478400, "step": 43600 }, { "epoch": 0.29361167585674575, "grad_norm": 0.21045394241809845, "learning_rate": 0.001, "loss": 3.0727, "num_input_tokens_seen": 11442585600, "step": 43650 }, { "epoch": 0.29394800080045336, "grad_norm": 0.2657870054244995, "learning_rate": 0.001, "loss": 3.0692, "num_input_tokens_seen": 11455692800, "step": 43700 }, { "epoch": 0.29428432574416097, "grad_norm": 0.18931645154953003, "learning_rate": 0.001, "loss": 3.0773, "num_input_tokens_seen": 11468800000, "step": 43750 }, { "epoch": 0.2946206506878686, "grad_norm": 0.18956629931926727, "learning_rate": 0.001, "loss": 3.0638, "num_input_tokens_seen": 11481907200, "step": 43800 }, { "epoch": 0.2949569756315762, "grad_norm": 0.18412917852401733, "learning_rate": 0.001, "loss": 3.0673, "num_input_tokens_seen": 11495014400, "step": 43850 }, { "epoch": 0.2952933005752838, "grad_norm": 0.18386723101139069, "learning_rate": 0.001, "loss": 3.0532, "num_input_tokens_seen": 11508121600, "step": 43900 }, { "epoch": 0.2956296255189914, "grad_norm": 1.7288094758987427, "learning_rate": 0.001, "loss": 3.0686, "num_input_tokens_seen": 11521228800, "step": 43950 }, { "epoch": 0.295965950462699, "grad_norm": 1.273023009300232, "learning_rate": 0.001, "loss": 3.0844, "num_input_tokens_seen": 11534336000, "step": 44000 }, { "epoch": 0.295965950462699, "eval_loss": 2.988032102584839, "eval_runtime": 51.218, "eval_samples_per_second": 97.622, "eval_steps_per_second": 24.405, "num_input_tokens_seen": 11534336000, "step": 44000 }, { "epoch": 0.29630227540640663, "grad_norm": 0.22384904325008392, "learning_rate": 0.001, "loss": 3.0741, "num_input_tokens_seen": 11547443200, "step": 44050 }, { "epoch": 0.29663860035011425, "grad_norm": 0.2000838816165924, "learning_rate": 0.001, "loss": 3.0681, "num_input_tokens_seen": 11560550400, "step": 44100 }, { "epoch": 0.29697492529382186, "grad_norm": 0.5984578728675842, "learning_rate": 0.001, "loss": 3.0652, "num_input_tokens_seen": 11573657600, "step": 44150 }, { "epoch": 0.29731125023752947, "grad_norm": 0.20970633625984192, "learning_rate": 0.001, "loss": 3.0799, "num_input_tokens_seen": 11586764800, "step": 44200 }, { "epoch": 0.2976475751812371, "grad_norm": 0.18818268179893494, "learning_rate": 0.001, "loss": 3.0719, "num_input_tokens_seen": 11599872000, "step": 44250 }, { "epoch": 0.2979839001249447, "grad_norm": 0.2058591991662979, "learning_rate": 0.001, "loss": 3.074, "num_input_tokens_seen": 11612979200, "step": 44300 }, { "epoch": 0.29832022506865236, "grad_norm": 0.40354540944099426, "learning_rate": 0.001, "loss": 3.0809, "num_input_tokens_seen": 11626086400, "step": 44350 }, { "epoch": 0.29865655001235997, "grad_norm": 0.1923748105764389, "learning_rate": 0.001, "loss": 3.0769, "num_input_tokens_seen": 11639193600, "step": 44400 }, { "epoch": 0.2989928749560676, "grad_norm": 0.24034352600574493, "learning_rate": 0.001, "loss": 3.0648, "num_input_tokens_seen": 11652300800, "step": 44450 }, { "epoch": 0.2993291998997752, "grad_norm": 0.2124001830816269, "learning_rate": 0.001, "loss": 3.0591, "num_input_tokens_seen": 11665408000, "step": 44500 }, { "epoch": 0.2993291998997752, "eval_loss": 2.9622106552124023, "eval_runtime": 51.3129, "eval_samples_per_second": 97.441, "eval_steps_per_second": 24.36, "num_input_tokens_seen": 11665408000, "step": 44500 }, { "epoch": 0.2996655248434828, "grad_norm": 0.2701764702796936, "learning_rate": 0.001, "loss": 3.0739, "num_input_tokens_seen": 11678515200, "step": 44550 }, { "epoch": 0.3000018497871904, "grad_norm": 0.20468254387378693, "learning_rate": 0.001, "loss": 3.0724, "num_input_tokens_seen": 11691622400, "step": 44600 }, { "epoch": 0.300338174730898, "grad_norm": 0.18791192770004272, "learning_rate": 0.001, "loss": 3.0705, "num_input_tokens_seen": 11704729600, "step": 44650 }, { "epoch": 0.30067449967460563, "grad_norm": 0.21384365856647491, "learning_rate": 0.001, "loss": 3.0719, "num_input_tokens_seen": 11717836800, "step": 44700 }, { "epoch": 0.30101082461831324, "grad_norm": 0.20965896546840668, "learning_rate": 0.001, "loss": 3.0575, "num_input_tokens_seen": 11730944000, "step": 44750 }, { "epoch": 0.30134714956202086, "grad_norm": 0.4128492772579193, "learning_rate": 0.001, "loss": 3.0948, "num_input_tokens_seen": 11744051200, "step": 44800 }, { "epoch": 0.30168347450572847, "grad_norm": 0.27056023478507996, "learning_rate": 0.001, "loss": 3.0807, "num_input_tokens_seen": 11757158400, "step": 44850 }, { "epoch": 0.3020197994494361, "grad_norm": 0.1866568773984909, "learning_rate": 0.001, "loss": 3.0737, "num_input_tokens_seen": 11770265600, "step": 44900 }, { "epoch": 0.3023561243931437, "grad_norm": 0.21440774202346802, "learning_rate": 0.001, "loss": 3.0804, "num_input_tokens_seen": 11783372800, "step": 44950 }, { "epoch": 0.3026924493368513, "grad_norm": 0.30685120820999146, "learning_rate": 0.001, "loss": 3.0603, "num_input_tokens_seen": 11796480000, "step": 45000 }, { "epoch": 0.3026924493368513, "eval_loss": 2.966905355453491, "eval_runtime": 51.6659, "eval_samples_per_second": 96.776, "eval_steps_per_second": 24.194, "num_input_tokens_seen": 11796480000, "step": 45000 }, { "epoch": 0.3030287742805589, "grad_norm": 0.22337579727172852, "learning_rate": 0.001, "loss": 3.0839, "num_input_tokens_seen": 11809587200, "step": 45050 }, { "epoch": 0.3033650992242665, "grad_norm": 0.23798304796218872, "learning_rate": 0.001, "loss": 3.0729, "num_input_tokens_seen": 11822694400, "step": 45100 }, { "epoch": 0.30370142416797413, "grad_norm": 0.1956755667924881, "learning_rate": 0.001, "loss": 3.071, "num_input_tokens_seen": 11835801600, "step": 45150 }, { "epoch": 0.30403774911168174, "grad_norm": 0.20110267400741577, "learning_rate": 0.001, "loss": 3.0662, "num_input_tokens_seen": 11848908800, "step": 45200 }, { "epoch": 0.30437407405538935, "grad_norm": 0.20845186710357666, "learning_rate": 0.001, "loss": 3.0627, "num_input_tokens_seen": 11862016000, "step": 45250 }, { "epoch": 0.30471039899909697, "grad_norm": 0.17928333580493927, "learning_rate": 0.001, "loss": 3.0543, "num_input_tokens_seen": 11875123200, "step": 45300 }, { "epoch": 0.3050467239428046, "grad_norm": 0.17617329955101013, "learning_rate": 0.001, "loss": 3.0614, "num_input_tokens_seen": 11888230400, "step": 45350 }, { "epoch": 0.3053830488865122, "grad_norm": 0.2610914409160614, "learning_rate": 0.001, "loss": 3.069, "num_input_tokens_seen": 11901337600, "step": 45400 }, { "epoch": 0.3057193738302198, "grad_norm": 0.3862092196941376, "learning_rate": 0.001, "loss": 3.0944, "num_input_tokens_seen": 11914444800, "step": 45450 }, { "epoch": 0.3060556987739274, "grad_norm": 0.40467047691345215, "learning_rate": 0.001, "loss": 3.0714, "num_input_tokens_seen": 11927552000, "step": 45500 }, { "epoch": 0.3060556987739274, "eval_loss": 2.965502977371216, "eval_runtime": 51.4918, "eval_samples_per_second": 97.103, "eval_steps_per_second": 24.276, "num_input_tokens_seen": 11927552000, "step": 45500 }, { "epoch": 0.306392023717635, "grad_norm": 0.19686584174633026, "learning_rate": 0.001, "loss": 3.0673, "num_input_tokens_seen": 11940659200, "step": 45550 }, { "epoch": 0.30672834866134263, "grad_norm": 0.24866420030593872, "learning_rate": 0.001, "loss": 3.0773, "num_input_tokens_seen": 11953766400, "step": 45600 }, { "epoch": 0.30706467360505024, "grad_norm": 0.208501935005188, "learning_rate": 0.001, "loss": 3.0751, "num_input_tokens_seen": 11966873600, "step": 45650 }, { "epoch": 0.30740099854875785, "grad_norm": 0.20565010607242584, "learning_rate": 0.001, "loss": 3.0709, "num_input_tokens_seen": 11979980800, "step": 45700 }, { "epoch": 0.30773732349246546, "grad_norm": 0.21517585217952728, "learning_rate": 0.001, "loss": 3.0765, "num_input_tokens_seen": 11993088000, "step": 45750 }, { "epoch": 0.3080736484361731, "grad_norm": 0.2618497610092163, "learning_rate": 0.001, "loss": 3.0549, "num_input_tokens_seen": 12006195200, "step": 45800 }, { "epoch": 0.3084099733798807, "grad_norm": 0.24163542687892914, "learning_rate": 0.001, "loss": 3.0704, "num_input_tokens_seen": 12019302400, "step": 45850 }, { "epoch": 0.3087462983235883, "grad_norm": 0.2038656324148178, "learning_rate": 0.001, "loss": 3.0631, "num_input_tokens_seen": 12032409600, "step": 45900 }, { "epoch": 0.3090826232672959, "grad_norm": 0.20691357553005219, "learning_rate": 0.001, "loss": 3.0723, "num_input_tokens_seen": 12045516800, "step": 45950 }, { "epoch": 0.3094189482110035, "grad_norm": 0.18757498264312744, "learning_rate": 0.001, "loss": 3.0602, "num_input_tokens_seen": 12058624000, "step": 46000 }, { "epoch": 0.3094189482110035, "eval_loss": 2.9599878787994385, "eval_runtime": 53.3376, "eval_samples_per_second": 93.742, "eval_steps_per_second": 23.436, "num_input_tokens_seen": 12058624000, "step": 46000 }, { "epoch": 0.30975527315471113, "grad_norm": 0.1939045637845993, "learning_rate": 0.001, "loss": 3.0701, "num_input_tokens_seen": 12071731200, "step": 46050 }, { "epoch": 0.31009159809841874, "grad_norm": 0.22705882787704468, "learning_rate": 0.001, "loss": 3.0574, "num_input_tokens_seen": 12084838400, "step": 46100 }, { "epoch": 0.3104279230421264, "grad_norm": 0.2075151801109314, "learning_rate": 0.001, "loss": 3.067, "num_input_tokens_seen": 12097945600, "step": 46150 }, { "epoch": 0.310764247985834, "grad_norm": 0.4686187207698822, "learning_rate": 0.001, "loss": 3.0656, "num_input_tokens_seen": 12111052800, "step": 46200 }, { "epoch": 0.31110057292954163, "grad_norm": 0.1929931640625, "learning_rate": 0.001, "loss": 3.0631, "num_input_tokens_seen": 12124160000, "step": 46250 }, { "epoch": 0.31143689787324924, "grad_norm": 0.18403789401054382, "learning_rate": 0.001, "loss": 3.0579, "num_input_tokens_seen": 12137267200, "step": 46300 }, { "epoch": 0.31177322281695685, "grad_norm": 0.18552987277507782, "learning_rate": 0.001, "loss": 3.0625, "num_input_tokens_seen": 12150374400, "step": 46350 }, { "epoch": 0.31210954776066446, "grad_norm": 0.24002918601036072, "learning_rate": 0.001, "loss": 3.0629, "num_input_tokens_seen": 12163481600, "step": 46400 }, { "epoch": 0.3124458727043721, "grad_norm": 0.17444545030593872, "learning_rate": 0.001, "loss": 3.055, "num_input_tokens_seen": 12176588800, "step": 46450 }, { "epoch": 0.3127821976480797, "grad_norm": 0.1911567747592926, "learning_rate": 0.001, "loss": 3.067, "num_input_tokens_seen": 12189696000, "step": 46500 }, { "epoch": 0.3127821976480797, "eval_loss": 2.9570987224578857, "eval_runtime": 52.5377, "eval_samples_per_second": 95.17, "eval_steps_per_second": 23.792, "num_input_tokens_seen": 12189696000, "step": 46500 }, { "epoch": 0.3131185225917873, "grad_norm": 0.18903926014900208, "learning_rate": 0.001, "loss": 3.0499, "num_input_tokens_seen": 12202803200, "step": 46550 }, { "epoch": 0.3134548475354949, "grad_norm": 0.35490429401397705, "learning_rate": 0.001, "loss": 3.0573, "num_input_tokens_seen": 12215910400, "step": 46600 }, { "epoch": 0.3137911724792025, "grad_norm": 0.2066306322813034, "learning_rate": 0.001, "loss": 3.0613, "num_input_tokens_seen": 12229017600, "step": 46650 }, { "epoch": 0.31412749742291013, "grad_norm": 0.3016819357872009, "learning_rate": 0.001, "loss": 3.068, "num_input_tokens_seen": 12242124800, "step": 46700 }, { "epoch": 0.31446382236661774, "grad_norm": 0.22070977091789246, "learning_rate": 0.001, "loss": 3.0929, "num_input_tokens_seen": 12255232000, "step": 46750 }, { "epoch": 0.31480014731032535, "grad_norm": 0.21311117708683014, "learning_rate": 0.001, "loss": 3.0739, "num_input_tokens_seen": 12268339200, "step": 46800 }, { "epoch": 0.31513647225403296, "grad_norm": 0.22895431518554688, "learning_rate": 0.001, "loss": 3.0726, "num_input_tokens_seen": 12281446400, "step": 46850 }, { "epoch": 0.3154727971977406, "grad_norm": 0.304040789604187, "learning_rate": 0.001, "loss": 3.0561, "num_input_tokens_seen": 12294553600, "step": 46900 }, { "epoch": 0.3158091221414482, "grad_norm": 0.18291215598583221, "learning_rate": 0.001, "loss": 3.0671, "num_input_tokens_seen": 12307660800, "step": 46950 }, { "epoch": 0.3161454470851558, "grad_norm": 0.19144318997859955, "learning_rate": 0.001, "loss": 3.0676, "num_input_tokens_seen": 12320768000, "step": 47000 }, { "epoch": 0.3161454470851558, "eval_loss": 2.9561285972595215, "eval_runtime": 53.2725, "eval_samples_per_second": 93.857, "eval_steps_per_second": 23.464, "num_input_tokens_seen": 12320768000, "step": 47000 }, { "epoch": 0.3164817720288634, "grad_norm": 0.1988057643175125, "learning_rate": 0.001, "loss": 3.0652, "num_input_tokens_seen": 12333875200, "step": 47050 }, { "epoch": 0.316818096972571, "grad_norm": 0.19520634412765503, "learning_rate": 0.001, "loss": 3.0641, "num_input_tokens_seen": 12346982400, "step": 47100 }, { "epoch": 0.31715442191627863, "grad_norm": 0.21420574188232422, "learning_rate": 0.001, "loss": 3.0665, "num_input_tokens_seen": 12360089600, "step": 47150 }, { "epoch": 0.31749074685998624, "grad_norm": 0.18173083662986755, "learning_rate": 0.001, "loss": 3.0565, "num_input_tokens_seen": 12373196800, "step": 47200 }, { "epoch": 0.31782707180369385, "grad_norm": 0.1746867150068283, "learning_rate": 0.001, "loss": 3.0739, "num_input_tokens_seen": 12386304000, "step": 47250 }, { "epoch": 0.31816339674740146, "grad_norm": 0.16941632330417633, "learning_rate": 0.001, "loss": 3.0514, "num_input_tokens_seen": 12399411200, "step": 47300 }, { "epoch": 0.3184997216911091, "grad_norm": 0.19572339951992035, "learning_rate": 0.001, "loss": 3.0512, "num_input_tokens_seen": 12412518400, "step": 47350 }, { "epoch": 0.3188360466348167, "grad_norm": 0.19083815813064575, "learning_rate": 0.001, "loss": 3.0482, "num_input_tokens_seen": 12425625600, "step": 47400 }, { "epoch": 0.3191723715785243, "grad_norm": 0.1741664558649063, "learning_rate": 0.001, "loss": 3.0544, "num_input_tokens_seen": 12438732800, "step": 47450 }, { "epoch": 0.3195086965222319, "grad_norm": 0.1787065714597702, "learning_rate": 0.001, "loss": 3.0544, "num_input_tokens_seen": 12451840000, "step": 47500 }, { "epoch": 0.3195086965222319, "eval_loss": 2.953366756439209, "eval_runtime": 53.4179, "eval_samples_per_second": 93.602, "eval_steps_per_second": 23.4, "num_input_tokens_seen": 12451840000, "step": 47500 }, { "epoch": 0.3198450214659395, "grad_norm": 0.1822129189968109, "learning_rate": 0.001, "loss": 3.0606, "num_input_tokens_seen": 12464947200, "step": 47550 }, { "epoch": 0.3201813464096471, "grad_norm": 0.20426377654075623, "learning_rate": 0.001, "loss": 3.0586, "num_input_tokens_seen": 12478054400, "step": 47600 }, { "epoch": 0.32051767135335474, "grad_norm": 0.2057754248380661, "learning_rate": 0.001, "loss": 3.0604, "num_input_tokens_seen": 12491161600, "step": 47650 }, { "epoch": 0.32085399629706235, "grad_norm": 0.19302618503570557, "learning_rate": 0.001, "loss": 3.0578, "num_input_tokens_seen": 12504268800, "step": 47700 }, { "epoch": 0.32119032124076996, "grad_norm": 0.4289242625236511, "learning_rate": 0.001, "loss": 3.0563, "num_input_tokens_seen": 12517376000, "step": 47750 }, { "epoch": 0.32152664618447757, "grad_norm": 0.6544061899185181, "learning_rate": 0.001, "loss": 3.072, "num_input_tokens_seen": 12530483200, "step": 47800 }, { "epoch": 0.3218629711281852, "grad_norm": 0.27349674701690674, "learning_rate": 0.001, "loss": 3.077, "num_input_tokens_seen": 12543590400, "step": 47850 }, { "epoch": 0.3221992960718928, "grad_norm": 0.21640093624591827, "learning_rate": 0.001, "loss": 3.0665, "num_input_tokens_seen": 12556697600, "step": 47900 }, { "epoch": 0.32253562101560046, "grad_norm": 0.19193384051322937, "learning_rate": 0.001, "loss": 3.0542, "num_input_tokens_seen": 12569804800, "step": 47950 }, { "epoch": 0.32287194595930807, "grad_norm": 0.21732182800769806, "learning_rate": 0.001, "loss": 3.0489, "num_input_tokens_seen": 12582912000, "step": 48000 }, { "epoch": 0.32287194595930807, "eval_loss": 2.95477294921875, "eval_runtime": 53.2298, "eval_samples_per_second": 93.932, "eval_steps_per_second": 23.483, "num_input_tokens_seen": 12582912000, "step": 48000 }, { "epoch": 0.3232082709030157, "grad_norm": 0.1941204071044922, "learning_rate": 0.001, "loss": 3.0582, "num_input_tokens_seen": 12596019200, "step": 48050 }, { "epoch": 0.3235445958467233, "grad_norm": 4.390545845031738, "learning_rate": 0.001, "loss": 3.0713, "num_input_tokens_seen": 12609126400, "step": 48100 }, { "epoch": 0.3238809207904309, "grad_norm": 0.2377273291349411, "learning_rate": 0.001, "loss": 3.0792, "num_input_tokens_seen": 12622233600, "step": 48150 }, { "epoch": 0.3242172457341385, "grad_norm": 0.20397226512432098, "learning_rate": 0.001, "loss": 3.0718, "num_input_tokens_seen": 12635340800, "step": 48200 }, { "epoch": 0.3245535706778461, "grad_norm": 0.21039831638336182, "learning_rate": 0.001, "loss": 3.0669, "num_input_tokens_seen": 12648448000, "step": 48250 }, { "epoch": 0.32488989562155374, "grad_norm": 0.18443848192691803, "learning_rate": 0.001, "loss": 3.0723, "num_input_tokens_seen": 12661555200, "step": 48300 }, { "epoch": 0.32522622056526135, "grad_norm": 0.1816088706254959, "learning_rate": 0.001, "loss": 3.0516, "num_input_tokens_seen": 12674662400, "step": 48350 }, { "epoch": 0.32556254550896896, "grad_norm": 0.17938339710235596, "learning_rate": 0.001, "loss": 3.0567, "num_input_tokens_seen": 12687769600, "step": 48400 }, { "epoch": 0.32589887045267657, "grad_norm": 0.2365075796842575, "learning_rate": 0.001, "loss": 3.0673, "num_input_tokens_seen": 12700876800, "step": 48450 }, { "epoch": 0.3262351953963842, "grad_norm": 0.24168556928634644, "learning_rate": 0.001, "loss": 3.072, "num_input_tokens_seen": 12713984000, "step": 48500 }, { "epoch": 0.3262351953963842, "eval_loss": 2.967775583267212, "eval_runtime": 53.2002, "eval_samples_per_second": 93.985, "eval_steps_per_second": 23.496, "num_input_tokens_seen": 12713984000, "step": 48500 }, { "epoch": 0.3265715203400918, "grad_norm": 0.3560108542442322, "learning_rate": 0.001, "loss": 3.0714, "num_input_tokens_seen": 12727091200, "step": 48550 }, { "epoch": 0.3269078452837994, "grad_norm": 2.40458345413208, "learning_rate": 0.001, "loss": 3.0729, "num_input_tokens_seen": 12740198400, "step": 48600 }, { "epoch": 0.327244170227507, "grad_norm": 0.3201708495616913, "learning_rate": 0.001, "loss": 3.0719, "num_input_tokens_seen": 12753305600, "step": 48650 }, { "epoch": 0.3275804951712146, "grad_norm": 0.2094539850950241, "learning_rate": 0.001, "loss": 3.0559, "num_input_tokens_seen": 12766412800, "step": 48700 }, { "epoch": 0.32791682011492224, "grad_norm": 0.2323814332485199, "learning_rate": 0.001, "loss": 3.0652, "num_input_tokens_seen": 12779520000, "step": 48750 }, { "epoch": 0.32825314505862985, "grad_norm": 0.20684729516506195, "learning_rate": 0.001, "loss": 3.0631, "num_input_tokens_seen": 12792627200, "step": 48800 }, { "epoch": 0.32858947000233746, "grad_norm": 0.19242416322231293, "learning_rate": 0.001, "loss": 3.0578, "num_input_tokens_seen": 12805734400, "step": 48850 }, { "epoch": 0.32892579494604507, "grad_norm": 0.1994556188583374, "learning_rate": 0.001, "loss": 3.0615, "num_input_tokens_seen": 12818841600, "step": 48900 }, { "epoch": 0.3292621198897527, "grad_norm": 0.19869546592235565, "learning_rate": 0.001, "loss": 3.0647, "num_input_tokens_seen": 12831948800, "step": 48950 }, { "epoch": 0.3295984448334603, "grad_norm": 0.21512825787067413, "learning_rate": 0.001, "loss": 3.0473, "num_input_tokens_seen": 12845056000, "step": 49000 }, { "epoch": 0.3295984448334603, "eval_loss": 2.952073335647583, "eval_runtime": 52.8116, "eval_samples_per_second": 94.676, "eval_steps_per_second": 23.669, "num_input_tokens_seen": 12845056000, "step": 49000 }, { "epoch": 0.3299347697771679, "grad_norm": 0.22994808852672577, "learning_rate": 0.001, "loss": 3.0594, "num_input_tokens_seen": 12858163200, "step": 49050 }, { "epoch": 0.3302710947208755, "grad_norm": 0.45408371090888977, "learning_rate": 0.001, "loss": 3.0777, "num_input_tokens_seen": 12871270400, "step": 49100 }, { "epoch": 0.3306074196645831, "grad_norm": 0.2698614001274109, "learning_rate": 0.001, "loss": 3.0689, "num_input_tokens_seen": 12884377600, "step": 49150 }, { "epoch": 0.33094374460829074, "grad_norm": 0.22741588950157166, "learning_rate": 0.001, "loss": 3.0746, "num_input_tokens_seen": 12897484800, "step": 49200 }, { "epoch": 0.33128006955199835, "grad_norm": 0.3616434335708618, "learning_rate": 0.001, "loss": 3.0813, "num_input_tokens_seen": 12910592000, "step": 49250 }, { "epoch": 0.33161639449570596, "grad_norm": 0.2551349401473999, "learning_rate": 0.001, "loss": 3.0749, "num_input_tokens_seen": 12923699200, "step": 49300 }, { "epoch": 0.33195271943941357, "grad_norm": 0.24054627120494843, "learning_rate": 0.001, "loss": 3.0664, "num_input_tokens_seen": 12936806400, "step": 49350 }, { "epoch": 0.3322890443831212, "grad_norm": 0.25859707593917847, "learning_rate": 0.001, "loss": 3.0673, "num_input_tokens_seen": 12949913600, "step": 49400 }, { "epoch": 0.3326253693268288, "grad_norm": 0.2500990629196167, "learning_rate": 0.001, "loss": 3.0575, "num_input_tokens_seen": 12963020800, "step": 49450 }, { "epoch": 0.3329616942705364, "grad_norm": 1.1027246713638306, "learning_rate": 0.001, "loss": 3.0573, "num_input_tokens_seen": 12976128000, "step": 49500 }, { "epoch": 0.3329616942705364, "eval_loss": 2.976292133331299, "eval_runtime": 53.0133, "eval_samples_per_second": 94.316, "eval_steps_per_second": 23.579, "num_input_tokens_seen": 12976128000, "step": 49500 }, { "epoch": 0.333298019214244, "grad_norm": 0.3605392873287201, "learning_rate": 0.001, "loss": 3.0741, "num_input_tokens_seen": 12989235200, "step": 49550 }, { "epoch": 0.3336343441579516, "grad_norm": 0.3265780210494995, "learning_rate": 0.001, "loss": 3.0707, "num_input_tokens_seen": 13002342400, "step": 49600 }, { "epoch": 0.33397066910165923, "grad_norm": 0.4178712069988251, "learning_rate": 0.001, "loss": 3.0681, "num_input_tokens_seen": 13015449600, "step": 49650 }, { "epoch": 0.33430699404536685, "grad_norm": 0.2647295892238617, "learning_rate": 0.001, "loss": 3.0684, "num_input_tokens_seen": 13028556800, "step": 49700 }, { "epoch": 0.3346433189890745, "grad_norm": 0.20664212107658386, "learning_rate": 0.001, "loss": 3.0649, "num_input_tokens_seen": 13041664000, "step": 49750 }, { "epoch": 0.3349796439327821, "grad_norm": 0.45491111278533936, "learning_rate": 0.001, "loss": 3.0667, "num_input_tokens_seen": 13054771200, "step": 49800 }, { "epoch": 0.33531596887648973, "grad_norm": 0.27275514602661133, "learning_rate": 0.001, "loss": 3.0717, "num_input_tokens_seen": 13067878400, "step": 49850 }, { "epoch": 0.33565229382019734, "grad_norm": 0.24294881522655487, "learning_rate": 0.001, "loss": 3.0717, "num_input_tokens_seen": 13080985600, "step": 49900 }, { "epoch": 0.33598861876390496, "grad_norm": 0.2790290415287018, "learning_rate": 0.001, "loss": 3.0687, "num_input_tokens_seen": 13094092800, "step": 49950 }, { "epoch": 0.33632494370761257, "grad_norm": 0.32556888461112976, "learning_rate": 0.001, "loss": 3.0805, "num_input_tokens_seen": 13107200000, "step": 50000 }, { "epoch": 0.33632494370761257, "eval_loss": 2.958136796951294, "eval_runtime": 53.5587, "eval_samples_per_second": 93.355, "eval_steps_per_second": 23.339, "num_input_tokens_seen": 13107200000, "step": 50000 }, { "epoch": 0.3366612686513202, "grad_norm": 0.2707626223564148, "learning_rate": 0.001, "loss": 3.0719, "num_input_tokens_seen": 13120307200, "step": 50050 }, { "epoch": 0.3369975935950278, "grad_norm": 0.28657791018486023, "learning_rate": 0.001, "loss": 3.0713, "num_input_tokens_seen": 13133414400, "step": 50100 }, { "epoch": 0.3373339185387354, "grad_norm": 0.22508807480335236, "learning_rate": 0.001, "loss": 3.0626, "num_input_tokens_seen": 13146521600, "step": 50150 }, { "epoch": 0.337670243482443, "grad_norm": 0.3013211786746979, "learning_rate": 0.001, "loss": 3.0871, "num_input_tokens_seen": 13159628800, "step": 50200 }, { "epoch": 0.3380065684261506, "grad_norm": 0.4010023772716522, "learning_rate": 0.001, "loss": 3.0624, "num_input_tokens_seen": 13172736000, "step": 50250 }, { "epoch": 0.33834289336985823, "grad_norm": 0.23215509951114655, "learning_rate": 0.001, "loss": 3.058, "num_input_tokens_seen": 13185843200, "step": 50300 }, { "epoch": 0.33867921831356584, "grad_norm": 0.3135644495487213, "learning_rate": 0.001, "loss": 3.0668, "num_input_tokens_seen": 13198950400, "step": 50350 }, { "epoch": 0.33901554325727346, "grad_norm": 0.8496716618537903, "learning_rate": 0.001, "loss": 3.0558, "num_input_tokens_seen": 13212057600, "step": 50400 }, { "epoch": 0.33935186820098107, "grad_norm": 0.2706848084926605, "learning_rate": 0.001, "loss": 3.0656, "num_input_tokens_seen": 13225164800, "step": 50450 }, { "epoch": 0.3396881931446887, "grad_norm": 0.24779066443443298, "learning_rate": 0.001, "loss": 3.073, "num_input_tokens_seen": 13238272000, "step": 50500 }, { "epoch": 0.3396881931446887, "eval_loss": 2.9553372859954834, "eval_runtime": 53.3111, "eval_samples_per_second": 93.789, "eval_steps_per_second": 23.447, "num_input_tokens_seen": 13238272000, "step": 50500 }, { "epoch": 0.3400245180883963, "grad_norm": 0.2277699112892151, "learning_rate": 0.001, "loss": 3.0516, "num_input_tokens_seen": 13251379200, "step": 50550 }, { "epoch": 0.3403608430321039, "grad_norm": 0.2331630438566208, "learning_rate": 0.001, "loss": 3.0685, "num_input_tokens_seen": 13264486400, "step": 50600 }, { "epoch": 0.3406971679758115, "grad_norm": 0.2572009563446045, "learning_rate": 0.001, "loss": 3.0628, "num_input_tokens_seen": 13277593600, "step": 50650 }, { "epoch": 0.3410334929195191, "grad_norm": 0.35386911034584045, "learning_rate": 0.001, "loss": 3.0554, "num_input_tokens_seen": 13290700800, "step": 50700 }, { "epoch": 0.34136981786322673, "grad_norm": 0.19923870265483856, "learning_rate": 0.001, "loss": 3.0579, "num_input_tokens_seen": 13303808000, "step": 50750 }, { "epoch": 0.34170614280693434, "grad_norm": 0.22291727364063263, "learning_rate": 0.001, "loss": 3.0653, "num_input_tokens_seen": 13316915200, "step": 50800 }, { "epoch": 0.34204246775064195, "grad_norm": 0.2049180120229721, "learning_rate": 0.001, "loss": 3.0511, "num_input_tokens_seen": 13330022400, "step": 50850 }, { "epoch": 0.34237879269434957, "grad_norm": 0.2586233615875244, "learning_rate": 0.001, "loss": 3.0589, "num_input_tokens_seen": 13343129600, "step": 50900 }, { "epoch": 0.3427151176380572, "grad_norm": 0.2137664556503296, "learning_rate": 0.001, "loss": 3.0522, "num_input_tokens_seen": 13356236800, "step": 50950 }, { "epoch": 0.3430514425817648, "grad_norm": 0.21726618707180023, "learning_rate": 0.001, "loss": 3.054, "num_input_tokens_seen": 13369344000, "step": 51000 }, { "epoch": 0.3430514425817648, "eval_loss": 2.948333740234375, "eval_runtime": 53.0951, "eval_samples_per_second": 94.171, "eval_steps_per_second": 23.543, "num_input_tokens_seen": 13369344000, "step": 51000 }, { "epoch": 0.3433877675254724, "grad_norm": 0.19399498403072357, "learning_rate": 0.001, "loss": 3.0583, "num_input_tokens_seen": 13382451200, "step": 51050 }, { "epoch": 0.34372409246918, "grad_norm": 0.19893072545528412, "learning_rate": 0.001, "loss": 3.0505, "num_input_tokens_seen": 13395558400, "step": 51100 }, { "epoch": 0.3440604174128876, "grad_norm": 0.17791305482387543, "learning_rate": 0.001, "loss": 3.0504, "num_input_tokens_seen": 13408665600, "step": 51150 }, { "epoch": 0.34439674235659523, "grad_norm": 0.7631425261497498, "learning_rate": 0.001, "loss": 3.0483, "num_input_tokens_seen": 13421772800, "step": 51200 }, { "epoch": 0.34473306730030284, "grad_norm": 0.22620978951454163, "learning_rate": 0.001, "loss": 3.0512, "num_input_tokens_seen": 13434880000, "step": 51250 }, { "epoch": 0.34506939224401045, "grad_norm": 0.219919815659523, "learning_rate": 0.001, "loss": 3.0415, "num_input_tokens_seen": 13447987200, "step": 51300 }, { "epoch": 0.34540571718771806, "grad_norm": 0.21654649078845978, "learning_rate": 0.001, "loss": 3.062, "num_input_tokens_seen": 13461094400, "step": 51350 }, { "epoch": 0.3457420421314257, "grad_norm": 0.2439095377922058, "learning_rate": 0.001, "loss": 3.0478, "num_input_tokens_seen": 13474201600, "step": 51400 }, { "epoch": 0.3460783670751333, "grad_norm": 0.19535380601882935, "learning_rate": 0.001, "loss": 3.0444, "num_input_tokens_seen": 13487308800, "step": 51450 }, { "epoch": 0.3464146920188409, "grad_norm": 0.1964534968137741, "learning_rate": 0.001, "loss": 3.049, "num_input_tokens_seen": 13500416000, "step": 51500 }, { "epoch": 0.3464146920188409, "eval_loss": 2.945749044418335, "eval_runtime": 53.0447, "eval_samples_per_second": 94.26, "eval_steps_per_second": 23.565, "num_input_tokens_seen": 13500416000, "step": 51500 }, { "epoch": 0.3467510169625485, "grad_norm": 0.2085062563419342, "learning_rate": 0.001, "loss": 3.0582, "num_input_tokens_seen": 13513523200, "step": 51550 }, { "epoch": 0.3470873419062562, "grad_norm": 0.1903097778558731, "learning_rate": 0.001, "loss": 3.0488, "num_input_tokens_seen": 13526630400, "step": 51600 }, { "epoch": 0.3474236668499638, "grad_norm": 0.20101405680179596, "learning_rate": 0.001, "loss": 3.0573, "num_input_tokens_seen": 13539737600, "step": 51650 }, { "epoch": 0.3477599917936714, "grad_norm": 0.6418889164924622, "learning_rate": 0.001, "loss": 3.0513, "num_input_tokens_seen": 13552844800, "step": 51700 }, { "epoch": 0.348096316737379, "grad_norm": 0.22524093091487885, "learning_rate": 0.001, "loss": 3.0567, "num_input_tokens_seen": 13565952000, "step": 51750 }, { "epoch": 0.3484326416810866, "grad_norm": 0.21830599009990692, "learning_rate": 0.001, "loss": 3.0538, "num_input_tokens_seen": 13579059200, "step": 51800 }, { "epoch": 0.34876896662479423, "grad_norm": 0.6111611127853394, "learning_rate": 0.001, "loss": 3.0581, "num_input_tokens_seen": 13592166400, "step": 51850 }, { "epoch": 0.34910529156850184, "grad_norm": 0.3782864511013031, "learning_rate": 0.001, "loss": 3.0694, "num_input_tokens_seen": 13605273600, "step": 51900 }, { "epoch": 0.34944161651220945, "grad_norm": 0.23944802582263947, "learning_rate": 0.001, "loss": 3.0683, "num_input_tokens_seen": 13618380800, "step": 51950 }, { "epoch": 0.34977794145591706, "grad_norm": 0.20257577300071716, "learning_rate": 0.001, "loss": 3.0509, "num_input_tokens_seen": 13631488000, "step": 52000 }, { "epoch": 0.34977794145591706, "eval_loss": 2.94769287109375, "eval_runtime": 53.1351, "eval_samples_per_second": 94.1, "eval_steps_per_second": 23.525, "num_input_tokens_seen": 13631488000, "step": 52000 }, { "epoch": 0.3501142663996247, "grad_norm": 0.22132734954357147, "learning_rate": 0.001, "loss": 3.0564, "num_input_tokens_seen": 13644595200, "step": 52050 }, { "epoch": 0.3504505913433323, "grad_norm": 0.19554653763771057, "learning_rate": 0.001, "loss": 3.0457, "num_input_tokens_seen": 13657702400, "step": 52100 }, { "epoch": 0.3507869162870399, "grad_norm": 0.23935073614120483, "learning_rate": 0.001, "loss": 3.0465, "num_input_tokens_seen": 13670809600, "step": 52150 }, { "epoch": 0.3511232412307475, "grad_norm": 0.2895826995372772, "learning_rate": 0.001, "loss": 3.0509, "num_input_tokens_seen": 13683916800, "step": 52200 }, { "epoch": 0.3514595661744551, "grad_norm": 0.24599236249923706, "learning_rate": 0.001, "loss": 3.0385, "num_input_tokens_seen": 13697024000, "step": 52250 }, { "epoch": 0.35179589111816273, "grad_norm": 0.19500850141048431, "learning_rate": 0.001, "loss": 3.0523, "num_input_tokens_seen": 13710131200, "step": 52300 }, { "epoch": 0.35213221606187034, "grad_norm": 0.20790818333625793, "learning_rate": 0.001, "loss": 3.0547, "num_input_tokens_seen": 13723238400, "step": 52350 }, { "epoch": 0.35246854100557795, "grad_norm": 0.18653196096420288, "learning_rate": 0.001, "loss": 3.0545, "num_input_tokens_seen": 13736345600, "step": 52400 }, { "epoch": 0.35280486594928556, "grad_norm": 0.22097791731357574, "learning_rate": 0.001, "loss": 3.0573, "num_input_tokens_seen": 13749452800, "step": 52450 }, { "epoch": 0.3531411908929932, "grad_norm": 0.22931267321109772, "learning_rate": 0.001, "loss": 3.0478, "num_input_tokens_seen": 13762560000, "step": 52500 }, { "epoch": 0.3531411908929932, "eval_loss": 2.9459915161132812, "eval_runtime": 52.6495, "eval_samples_per_second": 94.968, "eval_steps_per_second": 23.742, "num_input_tokens_seen": 13762560000, "step": 52500 }, { "epoch": 0.3534775158367008, "grad_norm": 0.31109049916267395, "learning_rate": 0.001, "loss": 3.0462, "num_input_tokens_seen": 13775667200, "step": 52550 }, { "epoch": 0.3538138407804084, "grad_norm": 1.7297276258468628, "learning_rate": 0.001, "loss": 3.0629, "num_input_tokens_seen": 13788774400, "step": 52600 }, { "epoch": 0.354150165724116, "grad_norm": 0.4056268334388733, "learning_rate": 0.001, "loss": 3.0763, "num_input_tokens_seen": 13801881600, "step": 52650 }, { "epoch": 0.3544864906678236, "grad_norm": 0.3694227635860443, "learning_rate": 0.001, "loss": 3.099, "num_input_tokens_seen": 13814988800, "step": 52700 }, { "epoch": 0.35482281561153123, "grad_norm": 0.2708556056022644, "learning_rate": 0.001, "loss": 3.0985, "num_input_tokens_seen": 13828096000, "step": 52750 }, { "epoch": 0.35515914055523884, "grad_norm": 0.27150145173072815, "learning_rate": 0.001, "loss": 3.0694, "num_input_tokens_seen": 13841203200, "step": 52800 }, { "epoch": 0.35549546549894645, "grad_norm": 0.2626855969429016, "learning_rate": 0.001, "loss": 3.0642, "num_input_tokens_seen": 13854310400, "step": 52850 }, { "epoch": 0.35583179044265406, "grad_norm": 0.20539118349552155, "learning_rate": 0.001, "loss": 3.059, "num_input_tokens_seen": 13867417600, "step": 52900 }, { "epoch": 0.35616811538636167, "grad_norm": 0.21489828824996948, "learning_rate": 0.001, "loss": 3.054, "num_input_tokens_seen": 13880524800, "step": 52950 }, { "epoch": 0.3565044403300693, "grad_norm": 0.263488233089447, "learning_rate": 0.001, "loss": 3.044, "num_input_tokens_seen": 13893632000, "step": 53000 }, { "epoch": 0.3565044403300693, "eval_loss": 2.9570043087005615, "eval_runtime": 53.2194, "eval_samples_per_second": 93.951, "eval_steps_per_second": 23.488, "num_input_tokens_seen": 13893632000, "step": 53000 }, { "epoch": 0.3568407652737769, "grad_norm": 0.3147699236869812, "learning_rate": 0.001, "loss": 3.0557, "num_input_tokens_seen": 13906739200, "step": 53050 }, { "epoch": 0.3571770902174845, "grad_norm": 0.22110533714294434, "learning_rate": 0.001, "loss": 3.0515, "num_input_tokens_seen": 13919846400, "step": 53100 }, { "epoch": 0.3575134151611921, "grad_norm": 0.23334212601184845, "learning_rate": 0.001, "loss": 3.0523, "num_input_tokens_seen": 13932953600, "step": 53150 }, { "epoch": 0.3578497401048997, "grad_norm": 0.200640469789505, "learning_rate": 0.001, "loss": 3.0621, "num_input_tokens_seen": 13946060800, "step": 53200 }, { "epoch": 0.35818606504860734, "grad_norm": 0.20875929296016693, "learning_rate": 0.001, "loss": 3.0591, "num_input_tokens_seen": 13959168000, "step": 53250 }, { "epoch": 0.35852238999231495, "grad_norm": 0.19065573811531067, "learning_rate": 0.001, "loss": 3.0591, "num_input_tokens_seen": 13972275200, "step": 53300 }, { "epoch": 0.35885871493602256, "grad_norm": 0.18688392639160156, "learning_rate": 0.001, "loss": 3.0475, "num_input_tokens_seen": 13985382400, "step": 53350 }, { "epoch": 0.3591950398797302, "grad_norm": 0.1864282786846161, "learning_rate": 0.001, "loss": 3.0485, "num_input_tokens_seen": 13998489600, "step": 53400 }, { "epoch": 0.35953136482343784, "grad_norm": 0.20456114411354065, "learning_rate": 0.001, "loss": 3.0529, "num_input_tokens_seen": 14011596800, "step": 53450 }, { "epoch": 0.35986768976714545, "grad_norm": 0.24362069368362427, "learning_rate": 0.001, "loss": 3.0444, "num_input_tokens_seen": 14024704000, "step": 53500 }, { "epoch": 0.35986768976714545, "eval_loss": 2.943416118621826, "eval_runtime": 53.1574, "eval_samples_per_second": 94.06, "eval_steps_per_second": 23.515, "num_input_tokens_seen": 14024704000, "step": 53500 }, { "epoch": 0.36020401471085306, "grad_norm": 0.19701169431209564, "learning_rate": 0.001, "loss": 3.0513, "num_input_tokens_seen": 14037811200, "step": 53550 }, { "epoch": 0.36054033965456067, "grad_norm": 0.1785692274570465, "learning_rate": 0.001, "loss": 3.0541, "num_input_tokens_seen": 14050918400, "step": 53600 }, { "epoch": 0.3608766645982683, "grad_norm": 0.1865462064743042, "learning_rate": 0.001, "loss": 3.0367, "num_input_tokens_seen": 14064025600, "step": 53650 }, { "epoch": 0.3612129895419759, "grad_norm": 0.4129047095775604, "learning_rate": 0.001, "loss": 3.043, "num_input_tokens_seen": 14077132800, "step": 53700 }, { "epoch": 0.3615493144856835, "grad_norm": 0.21066440641880035, "learning_rate": 0.001, "loss": 3.0585, "num_input_tokens_seen": 14090240000, "step": 53750 }, { "epoch": 0.3618856394293911, "grad_norm": 0.6820788383483887, "learning_rate": 0.001, "loss": 3.0534, "num_input_tokens_seen": 14103347200, "step": 53800 }, { "epoch": 0.3622219643730987, "grad_norm": 0.9664424657821655, "learning_rate": 0.001, "loss": 3.069, "num_input_tokens_seen": 14116454400, "step": 53850 }, { "epoch": 0.36255828931680634, "grad_norm": 0.35416921973228455, "learning_rate": 0.001, "loss": 3.0629, "num_input_tokens_seen": 14129561600, "step": 53900 }, { "epoch": 0.36289461426051395, "grad_norm": 0.3159606158733368, "learning_rate": 0.001, "loss": 3.0722, "num_input_tokens_seen": 14142668800, "step": 53950 }, { "epoch": 0.36323093920422156, "grad_norm": 0.2518790662288666, "learning_rate": 0.001, "loss": 3.071, "num_input_tokens_seen": 14155776000, "step": 54000 }, { "epoch": 0.36323093920422156, "eval_loss": 2.9483964443206787, "eval_runtime": 53.2042, "eval_samples_per_second": 93.978, "eval_steps_per_second": 23.494, "num_input_tokens_seen": 14155776000, "step": 54000 }, { "epoch": 0.36356726414792917, "grad_norm": 0.2197147160768509, "learning_rate": 0.0009998286624877785, "loss": 3.0502, "num_input_tokens_seen": 14168883200, "step": 54050 }, { "epoch": 0.3639035890916368, "grad_norm": 0.22259306907653809, "learning_rate": 0.0009993147673772868, "loss": 3.0433, "num_input_tokens_seen": 14181990400, "step": 54100 }, { "epoch": 0.3642399140353444, "grad_norm": 0.19341766834259033, "learning_rate": 0.000998458666866564, "loss": 3.0486, "num_input_tokens_seen": 14195097600, "step": 54150 }, { "epoch": 0.364576238979052, "grad_norm": 0.2313617616891861, "learning_rate": 0.0009972609476841367, "loss": 3.0446, "num_input_tokens_seen": 14208204800, "step": 54200 }, { "epoch": 0.3649125639227596, "grad_norm": 0.1925128698348999, "learning_rate": 0.0009957224306869053, "loss": 3.0528, "num_input_tokens_seen": 14221312000, "step": 54250 }, { "epoch": 0.3652488888664672, "grad_norm": 0.2100643515586853, "learning_rate": 0.0009938441702975688, "loss": 3.0453, "num_input_tokens_seen": 14234419200, "step": 54300 }, { "epoch": 0.36558521381017484, "grad_norm": 0.46658360958099365, "learning_rate": 0.0009916274537819774, "loss": 3.0464, "num_input_tokens_seen": 14247526400, "step": 54350 }, { "epoch": 0.36592153875388245, "grad_norm": 0.19623732566833496, "learning_rate": 0.0009890738003669028, "loss": 3.0427, "num_input_tokens_seen": 14260633600, "step": 54400 }, { "epoch": 0.36625786369759006, "grad_norm": 0.24941138923168182, "learning_rate": 0.0009861849601988384, "loss": 3.0528, "num_input_tokens_seen": 14273740800, "step": 54450 }, { "epoch": 0.36659418864129767, "grad_norm": 0.22141198813915253, "learning_rate": 0.0009829629131445341, "loss": 3.0523, "num_input_tokens_seen": 14286848000, "step": 54500 }, { "epoch": 0.36659418864129767, "eval_loss": 2.9419288635253906, "eval_runtime": 53.6937, "eval_samples_per_second": 93.121, "eval_steps_per_second": 23.28, "num_input_tokens_seen": 14286848000, "step": 54500 }, { "epoch": 0.3669305135850053, "grad_norm": 0.2028401494026184, "learning_rate": 0.0009794098674340967, "loss": 3.0403, "num_input_tokens_seen": 14299955200, "step": 54550 }, { "epoch": 0.3672668385287129, "grad_norm": 0.20509253442287445, "learning_rate": 0.0009755282581475768, "loss": 3.0543, "num_input_tokens_seen": 14313062400, "step": 54600 }, { "epoch": 0.3676031634724205, "grad_norm": 1.2793521881103516, "learning_rate": 0.0009713207455460893, "loss": 3.0718, "num_input_tokens_seen": 14326169600, "step": 54650 }, { "epoch": 0.3679394884161281, "grad_norm": 1.1210218667984009, "learning_rate": 0.0009667902132486009, "loss": 3.0706, "num_input_tokens_seen": 14339276800, "step": 54700 }, { "epoch": 0.3682758133598357, "grad_norm": 0.5492864847183228, "learning_rate": 0.0009619397662556434, "loss": 3.0793, "num_input_tokens_seen": 14352384000, "step": 54750 }, { "epoch": 0.36861213830354334, "grad_norm": 0.34732338786125183, "learning_rate": 0.0009567727288213005, "loss": 3.0662, "num_input_tokens_seen": 14365491200, "step": 54800 }, { "epoch": 0.36894846324725095, "grad_norm": 0.2698073983192444, "learning_rate": 0.0009512926421749304, "loss": 3.0682, "num_input_tokens_seen": 14378598400, "step": 54850 }, { "epoch": 0.36928478819095856, "grad_norm": 0.593543529510498, "learning_rate": 0.0009455032620941839, "loss": 3.0507, "num_input_tokens_seen": 14391705600, "step": 54900 }, { "epoch": 0.36962111313466617, "grad_norm": 0.28389155864715576, "learning_rate": 0.0009394085563309827, "loss": 3.0593, "num_input_tokens_seen": 14404812800, "step": 54950 }, { "epoch": 0.3699574380783738, "grad_norm": 0.2569947838783264, "learning_rate": 0.0009330127018922195, "loss": 3.0524, "num_input_tokens_seen": 14417920000, "step": 55000 }, { "epoch": 0.3699574380783738, "eval_loss": 2.9468750953674316, "eval_runtime": 52.9661, "eval_samples_per_second": 94.4, "eval_steps_per_second": 23.6, "num_input_tokens_seen": 14417920000, "step": 55000 }, { "epoch": 0.3702937630220814, "grad_norm": 0.2545956075191498, "learning_rate": 0.0009263200821770461, "loss": 3.0397, "num_input_tokens_seen": 14431027200, "step": 55050 }, { "epoch": 0.370630087965789, "grad_norm": 0.26363736391067505, "learning_rate": 0.0009193352839727121, "loss": 3.0554, "num_input_tokens_seen": 14444134400, "step": 55100 }, { "epoch": 0.3709664129094966, "grad_norm": 0.2228112667798996, "learning_rate": 0.0009120630943110077, "loss": 3.0482, "num_input_tokens_seen": 14457241600, "step": 55150 }, { "epoch": 0.3713027378532043, "grad_norm": 0.2184106856584549, "learning_rate": 0.0009045084971874737, "loss": 3.0368, "num_input_tokens_seen": 14470348800, "step": 55200 }, { "epoch": 0.3716390627969119, "grad_norm": 0.5658212900161743, "learning_rate": 0.0008966766701456176, "loss": 3.0541, "num_input_tokens_seen": 14483456000, "step": 55250 }, { "epoch": 0.3719753877406195, "grad_norm": 0.31839439272880554, "learning_rate": 0.0008885729807284854, "loss": 3.0516, "num_input_tokens_seen": 14496563200, "step": 55300 }, { "epoch": 0.3723117126843271, "grad_norm": 0.2521055042743683, "learning_rate": 0.0008802029828000156, "loss": 3.049, "num_input_tokens_seen": 14509670400, "step": 55350 }, { "epoch": 0.3726480376280347, "grad_norm": 0.23797062039375305, "learning_rate": 0.0008715724127386971, "loss": 3.0393, "num_input_tokens_seen": 14522777600, "step": 55400 }, { "epoch": 0.37298436257174233, "grad_norm": 0.26673102378845215, "learning_rate": 0.0008626871855061438, "loss": 3.0535, "num_input_tokens_seen": 14535884800, "step": 55450 }, { "epoch": 0.37332068751544994, "grad_norm": 0.37754055857658386, "learning_rate": 0.0008535533905932737, "loss": 3.0432, "num_input_tokens_seen": 14548992000, "step": 55500 }, { "epoch": 0.37332068751544994, "eval_loss": 2.9362170696258545, "eval_runtime": 53.4795, "eval_samples_per_second": 93.494, "eval_steps_per_second": 23.373, "num_input_tokens_seen": 14548992000, "step": 55500 }, { "epoch": 0.37365701245915756, "grad_norm": 0.2160724252462387, "learning_rate": 0.000844177287846877, "loss": 3.0378, "num_input_tokens_seen": 14562099200, "step": 55550 }, { "epoch": 0.37399333740286517, "grad_norm": 0.22323860228061676, "learning_rate": 0.0008345653031794292, "loss": 3.0419, "num_input_tokens_seen": 14575206400, "step": 55600 }, { "epoch": 0.3743296623465728, "grad_norm": 0.19688346982002258, "learning_rate": 0.0008247240241650918, "loss": 3.0297, "num_input_tokens_seen": 14588313600, "step": 55650 }, { "epoch": 0.3746659872902804, "grad_norm": 0.1972673088312149, "learning_rate": 0.0008146601955249188, "loss": 3.0405, "num_input_tokens_seen": 14601420800, "step": 55700 }, { "epoch": 0.375002312233988, "grad_norm": 0.44073277711868286, "learning_rate": 0.0008043807145043603, "loss": 3.0343, "num_input_tokens_seen": 14614528000, "step": 55750 }, { "epoch": 0.3753386371776956, "grad_norm": 0.22042399644851685, "learning_rate": 0.0007938926261462366, "loss": 3.0337, "num_input_tokens_seen": 14627635200, "step": 55800 }, { "epoch": 0.3756749621214032, "grad_norm": 0.2954588234424591, "learning_rate": 0.0007832031184624164, "loss": 3.0334, "num_input_tokens_seen": 14640742400, "step": 55850 }, { "epoch": 0.37601128706511083, "grad_norm": 0.5062097907066345, "learning_rate": 0.0007723195175075137, "loss": 3.0385, "num_input_tokens_seen": 14653849600, "step": 55900 }, { "epoch": 0.37634761200881844, "grad_norm": 0.30344095826148987, "learning_rate": 0.0007612492823579744, "loss": 3.04, "num_input_tokens_seen": 14666956800, "step": 55950 }, { "epoch": 0.37668393695252606, "grad_norm": 0.21088473498821259, "learning_rate": 0.00075, "loss": 3.0364, "num_input_tokens_seen": 14680064000, "step": 56000 }, { "epoch": 0.37668393695252606, "eval_loss": 2.9313743114471436, "eval_runtime": 53.142, "eval_samples_per_second": 94.088, "eval_steps_per_second": 23.522, "num_input_tokens_seen": 14680064000, "step": 56000 }, { "epoch": 0.37702026189623367, "grad_norm": 0.2067674696445465, "learning_rate": 0.0007385793801298042, "loss": 3.05, "num_input_tokens_seen": 14693171200, "step": 56050 }, { "epoch": 0.3773565868399413, "grad_norm": 0.20803235471248627, "learning_rate": 0.0007269952498697733, "loss": 3.0451, "num_input_tokens_seen": 14706278400, "step": 56100 }, { "epoch": 0.3776929117836489, "grad_norm": 0.2035783976316452, "learning_rate": 0.0007152555484041476, "loss": 3.0281, "num_input_tokens_seen": 14719385600, "step": 56150 }, { "epoch": 0.3780292367273565, "grad_norm": 0.21911849081516266, "learning_rate": 0.0007033683215379002, "loss": 3.0312, "num_input_tokens_seen": 14732492800, "step": 56200 }, { "epoch": 0.3783655616710641, "grad_norm": 0.2263978123664856, "learning_rate": 0.000691341716182545, "loss": 3.0237, "num_input_tokens_seen": 14745600000, "step": 56250 }, { "epoch": 0.3787018866147717, "grad_norm": 0.20394045114517212, "learning_rate": 0.0006791839747726501, "loss": 3.0271, "num_input_tokens_seen": 14758707200, "step": 56300 }, { "epoch": 0.37903821155847933, "grad_norm": 0.1954122930765152, "learning_rate": 0.0006669034296168854, "loss": 3.0368, "num_input_tokens_seen": 14771814400, "step": 56350 }, { "epoch": 0.37937453650218694, "grad_norm": 0.2434541881084442, "learning_rate": 0.0006545084971874737, "loss": 3.0268, "num_input_tokens_seen": 14784921600, "step": 56400 }, { "epoch": 0.37971086144589455, "grad_norm": 0.19820261001586914, "learning_rate": 0.0006420076723519614, "loss": 3.0193, "num_input_tokens_seen": 14798028800, "step": 56450 }, { "epoch": 0.38004718638960217, "grad_norm": 0.18117697536945343, "learning_rate": 0.0006294095225512603, "loss": 3.0241, "num_input_tokens_seen": 14811136000, "step": 56500 }, { "epoch": 0.38004718638960217, "eval_loss": 2.920185089111328, "eval_runtime": 53.8805, "eval_samples_per_second": 92.798, "eval_steps_per_second": 23.199, "num_input_tokens_seen": 14811136000, "step": 56500 }, { "epoch": 0.3803835113333098, "grad_norm": 0.20303522050380707, "learning_rate": 0.0006167226819279528, "loss": 3.0133, "num_input_tokens_seen": 14824243200, "step": 56550 }, { "epoch": 0.3807198362770174, "grad_norm": 0.19498929381370544, "learning_rate": 0.0006039558454088796, "loss": 3.0241, "num_input_tokens_seen": 14837350400, "step": 56600 }, { "epoch": 0.381056161220725, "grad_norm": 0.21773076057434082, "learning_rate": 0.0005911177627460738, "loss": 3.0235, "num_input_tokens_seen": 14850457600, "step": 56650 }, { "epoch": 0.3813924861644326, "grad_norm": 0.19796748459339142, "learning_rate": 0.0005782172325201155, "loss": 3.019, "num_input_tokens_seen": 14863564800, "step": 56700 }, { "epoch": 0.3817288111081402, "grad_norm": 0.18569409847259521, "learning_rate": 0.000565263096110026, "loss": 3.0189, "num_input_tokens_seen": 14876672000, "step": 56750 }, { "epoch": 0.38206513605184783, "grad_norm": 0.27358362078666687, "learning_rate": 0.0005522642316338268, "loss": 3.0107, "num_input_tokens_seen": 14889779200, "step": 56800 }, { "epoch": 0.38240146099555544, "grad_norm": 0.2143600583076477, "learning_rate": 0.0005392295478639225, "loss": 3.0139, "num_input_tokens_seen": 14902886400, "step": 56850 }, { "epoch": 0.38273778593926305, "grad_norm": 0.18786349892616272, "learning_rate": 0.000526167978121472, "loss": 3.0187, "num_input_tokens_seen": 14915993600, "step": 56900 }, { "epoch": 0.38307411088297066, "grad_norm": 0.1809261441230774, "learning_rate": 0.0005130884741539367, "loss": 3.0197, "num_input_tokens_seen": 14929100800, "step": 56950 }, { "epoch": 0.38341043582667833, "grad_norm": 0.1926116794347763, "learning_rate": 0.0005, "loss": 3.0101, "num_input_tokens_seen": 14942208000, "step": 57000 }, { "epoch": 0.38341043582667833, "eval_loss": 2.912503242492676, "eval_runtime": 52.7455, "eval_samples_per_second": 94.795, "eval_steps_per_second": 23.699, "num_input_tokens_seen": 14942208000, "step": 57000 }, { "epoch": 0.38374676077038594, "grad_norm": 0.255500853061676, "learning_rate": 0.0004869115258460635, "loss": 3.0102, "num_input_tokens_seen": 14955315200, "step": 57050 }, { "epoch": 0.38408308571409355, "grad_norm": 0.18287675082683563, "learning_rate": 0.0004738320218785281, "loss": 3.0074, "num_input_tokens_seen": 14968422400, "step": 57100 }, { "epoch": 0.38441941065780116, "grad_norm": 0.1864452064037323, "learning_rate": 0.0004607704521360776, "loss": 3.0181, "num_input_tokens_seen": 14981529600, "step": 57150 }, { "epoch": 0.3847557356015088, "grad_norm": 0.17273065447807312, "learning_rate": 0.00044773576836617336, "loss": 3.0077, "num_input_tokens_seen": 14994636800, "step": 57200 }, { "epoch": 0.3850920605452164, "grad_norm": 0.17590677738189697, "learning_rate": 0.00043473690388997434, "loss": 3.0118, "num_input_tokens_seen": 15007744000, "step": 57250 }, { "epoch": 0.385428385488924, "grad_norm": 0.16380582749843597, "learning_rate": 0.0004217827674798845, "loss": 3.0074, "num_input_tokens_seen": 15020851200, "step": 57300 }, { "epoch": 0.3857647104326316, "grad_norm": 0.19464251399040222, "learning_rate": 0.00040888223725392626, "loss": 3.0126, "num_input_tokens_seen": 15033958400, "step": 57350 }, { "epoch": 0.3861010353763392, "grad_norm": 0.17150136828422546, "learning_rate": 0.0003960441545911204, "loss": 3.0049, "num_input_tokens_seen": 15047065600, "step": 57400 }, { "epoch": 0.38643736032004683, "grad_norm": 0.1877928376197815, "learning_rate": 0.00038327731807204744, "loss": 3.0089, "num_input_tokens_seen": 15060172800, "step": 57450 }, { "epoch": 0.38677368526375444, "grad_norm": 0.2605326771736145, "learning_rate": 0.0003705904774487396, "loss": 3.0115, "num_input_tokens_seen": 15073280000, "step": 57500 }, { "epoch": 0.38677368526375444, "eval_loss": 2.9029135704040527, "eval_runtime": 53.9097, "eval_samples_per_second": 92.748, "eval_steps_per_second": 23.187, "num_input_tokens_seen": 15073280000, "step": 57500 }, { "epoch": 0.38711001020746205, "grad_norm": 0.21006393432617188, "learning_rate": 0.0003579923276480387, "loss": 3.0044, "num_input_tokens_seen": 15086387200, "step": 57550 }, { "epoch": 0.38744633515116966, "grad_norm": 0.1743878722190857, "learning_rate": 0.00034549150281252633, "loss": 3.0114, "num_input_tokens_seen": 15099494400, "step": 57600 }, { "epoch": 0.3877826600948773, "grad_norm": 0.16699257493019104, "learning_rate": 0.00033309657038311456, "loss": 3.0041, "num_input_tokens_seen": 15112601600, "step": 57650 }, { "epoch": 0.3881189850385849, "grad_norm": 0.17115868628025055, "learning_rate": 0.00032081602522734986, "loss": 3.0051, "num_input_tokens_seen": 15125708800, "step": 57700 }, { "epoch": 0.3884553099822925, "grad_norm": 0.16885310411453247, "learning_rate": 0.0003086582838174551, "loss": 2.9969, "num_input_tokens_seen": 15138816000, "step": 57750 }, { "epoch": 0.3887916349260001, "grad_norm": 0.17101123929023743, "learning_rate": 0.0002966316784621, "loss": 2.9947, "num_input_tokens_seen": 15151923200, "step": 57800 }, { "epoch": 0.3891279598697077, "grad_norm": 0.1529199331998825, "learning_rate": 0.0002847444515958523, "loss": 3.0019, "num_input_tokens_seen": 15165030400, "step": 57850 }, { "epoch": 0.38946428481341533, "grad_norm": 0.16087768971920013, "learning_rate": 0.00027300475013022663, "loss": 2.9947, "num_input_tokens_seen": 15178137600, "step": 57900 }, { "epoch": 0.38980060975712294, "grad_norm": 0.16023555397987366, "learning_rate": 0.00026142061987019576, "loss": 3.0022, "num_input_tokens_seen": 15191244800, "step": 57950 }, { "epoch": 0.39013693470083055, "grad_norm": 0.16161410510540009, "learning_rate": 0.0002500000000000001, "loss": 2.9931, "num_input_tokens_seen": 15204352000, "step": 58000 }, { "epoch": 0.39013693470083055, "eval_loss": 2.8950610160827637, "eval_runtime": 53.5434, "eval_samples_per_second": 93.382, "eval_steps_per_second": 23.346, "num_input_tokens_seen": 15204352000, "step": 58000 }, { "epoch": 0.39047325964453816, "grad_norm": 0.1577194780111313, "learning_rate": 0.00023875071764202561, "loss": 2.9866, "num_input_tokens_seen": 15217459200, "step": 58050 }, { "epoch": 0.3908095845882458, "grad_norm": 0.1869671791791916, "learning_rate": 0.00022768048249248646, "loss": 2.9973, "num_input_tokens_seen": 15230566400, "step": 58100 }, { "epoch": 0.3911459095319534, "grad_norm": 0.1568073183298111, "learning_rate": 0.0002167968815375837, "loss": 3.0012, "num_input_tokens_seen": 15243673600, "step": 58150 }, { "epoch": 0.391482234475661, "grad_norm": 0.15343065559864044, "learning_rate": 0.00020610737385376348, "loss": 2.988, "num_input_tokens_seen": 15256780800, "step": 58200 }, { "epoch": 0.3918185594193686, "grad_norm": 0.22413235902786255, "learning_rate": 0.00019561928549563967, "loss": 2.993, "num_input_tokens_seen": 15269888000, "step": 58250 }, { "epoch": 0.3921548843630762, "grad_norm": 0.1807044893503189, "learning_rate": 0.00018533980447508135, "loss": 2.9905, "num_input_tokens_seen": 15282995200, "step": 58300 }, { "epoch": 0.39249120930678383, "grad_norm": 0.1571112871170044, "learning_rate": 0.00017527597583490823, "loss": 2.9983, "num_input_tokens_seen": 15296102400, "step": 58350 }, { "epoch": 0.39282753425049144, "grad_norm": 0.16821637749671936, "learning_rate": 0.00016543469682057105, "loss": 2.9966, "num_input_tokens_seen": 15309209600, "step": 58400 }, { "epoch": 0.39316385919419905, "grad_norm": 0.1497010737657547, "learning_rate": 0.00015582271215312294, "loss": 2.9814, "num_input_tokens_seen": 15322316800, "step": 58450 }, { "epoch": 0.39350018413790666, "grad_norm": 0.15679225325584412, "learning_rate": 0.00014644660940672628, "loss": 2.9876, "num_input_tokens_seen": 15335424000, "step": 58500 }, { "epoch": 0.39350018413790666, "eval_loss": 2.8887994289398193, "eval_runtime": 53.8449, "eval_samples_per_second": 92.859, "eval_steps_per_second": 23.215, "num_input_tokens_seen": 15335424000, "step": 58500 }, { "epoch": 0.39383650908161427, "grad_norm": 0.15169823169708252, "learning_rate": 0.0001373128144938563, "loss": 2.9875, "num_input_tokens_seen": 15348531200, "step": 58550 }, { "epoch": 0.3941728340253219, "grad_norm": 0.1635347604751587, "learning_rate": 0.00012842758726130281, "loss": 2.9898, "num_input_tokens_seen": 15361638400, "step": 58600 }, { "epoch": 0.3945091589690295, "grad_norm": 0.15156348049640656, "learning_rate": 0.00011979701719998454, "loss": 2.9977, "num_input_tokens_seen": 15374745600, "step": 58650 }, { "epoch": 0.3948454839127371, "grad_norm": 0.15710316598415375, "learning_rate": 0.00011142701927151455, "loss": 2.981, "num_input_tokens_seen": 15387852800, "step": 58700 }, { "epoch": 0.3951818088564447, "grad_norm": 0.2838917374610901, "learning_rate": 0.00010332332985438247, "loss": 2.9909, "num_input_tokens_seen": 15400960000, "step": 58750 }, { "epoch": 0.3955181338001524, "grad_norm": 0.1509639173746109, "learning_rate": 9.549150281252633e-05, "loss": 2.9851, "num_input_tokens_seen": 15414067200, "step": 58800 }, { "epoch": 0.39585445874386, "grad_norm": 0.1501421183347702, "learning_rate": 8.793690568899215e-05, "loss": 2.9931, "num_input_tokens_seen": 15427174400, "step": 58850 }, { "epoch": 0.3961907836875676, "grad_norm": 0.14904147386550903, "learning_rate": 8.066471602728804e-05, "loss": 2.9862, "num_input_tokens_seen": 15440281600, "step": 58900 }, { "epoch": 0.3965271086312752, "grad_norm": 0.15182824432849884, "learning_rate": 7.367991782295391e-05, "loss": 2.9882, "num_input_tokens_seen": 15453388800, "step": 58950 }, { "epoch": 0.3968634335749828, "grad_norm": 0.14710576832294464, "learning_rate": 6.698729810778065e-05, "loss": 2.9856, "num_input_tokens_seen": 15466496000, "step": 59000 }, { "epoch": 0.3968634335749828, "eval_loss": 2.8845956325531006, "eval_runtime": 53.5429, "eval_samples_per_second": 93.383, "eval_steps_per_second": 23.346, "num_input_tokens_seen": 15466496000, "step": 59000 }, { "epoch": 0.39719975851869044, "grad_norm": 0.14572475850582123, "learning_rate": 6.059144366901737e-05, "loss": 2.9861, "num_input_tokens_seen": 15479603200, "step": 59050 }, { "epoch": 0.39753608346239805, "grad_norm": 0.5027282238006592, "learning_rate": 5.449673790581611e-05, "loss": 2.9773, "num_input_tokens_seen": 15492710400, "step": 59100 }, { "epoch": 0.39787240840610566, "grad_norm": 0.192597895860672, "learning_rate": 4.87073578250698e-05, "loss": 2.9874, "num_input_tokens_seen": 15505817600, "step": 59150 }, { "epoch": 0.39820873334981327, "grad_norm": 0.15083667635917664, "learning_rate": 4.322727117869951e-05, "loss": 2.987, "num_input_tokens_seen": 15518924800, "step": 59200 }, { "epoch": 0.3985450582935209, "grad_norm": 0.14701534807682037, "learning_rate": 3.806023374435663e-05, "loss": 2.9858, "num_input_tokens_seen": 15532032000, "step": 59250 }, { "epoch": 0.3988813832372285, "grad_norm": 0.145115464925766, "learning_rate": 3.3209786751399184e-05, "loss": 2.9926, "num_input_tokens_seen": 15545139200, "step": 59300 }, { "epoch": 0.3992177081809361, "grad_norm": 0.15828457474708557, "learning_rate": 2.8679254453910786e-05, "loss": 2.9803, "num_input_tokens_seen": 15558246400, "step": 59350 }, { "epoch": 0.3995540331246437, "grad_norm": 0.14400678873062134, "learning_rate": 2.4471741852423235e-05, "loss": 2.9701, "num_input_tokens_seen": 15571353600, "step": 59400 }, { "epoch": 0.3998903580683513, "grad_norm": 0.14925344288349152, "learning_rate": 2.0590132565903473e-05, "loss": 2.989, "num_input_tokens_seen": 15584460800, "step": 59450 }, { "epoch": 0.40022668301205894, "grad_norm": 0.14081260561943054, "learning_rate": 1.70370868554659e-05, "loss": 2.9824, "num_input_tokens_seen": 15597568000, "step": 59500 }, { "epoch": 0.40022668301205894, "eval_loss": 2.882228136062622, "eval_runtime": 53.7595, "eval_samples_per_second": 93.007, "eval_steps_per_second": 23.252, "num_input_tokens_seen": 15597568000, "step": 59500 }, { "epoch": 0.40056300795576655, "grad_norm": 0.13585136830806732, "learning_rate": 1.3815039801161721e-05, "loss": 2.9883, "num_input_tokens_seen": 15610675200, "step": 59550 }, { "epoch": 0.40089933289947416, "grad_norm": 0.1438748985528946, "learning_rate": 1.0926199633097156e-05, "loss": 2.9781, "num_input_tokens_seen": 15623782400, "step": 59600 }, { "epoch": 0.40123565784318177, "grad_norm": 0.3345394730567932, "learning_rate": 8.372546218022748e-06, "loss": 2.9869, "num_input_tokens_seen": 15636889600, "step": 59650 }, { "epoch": 0.4015719827868894, "grad_norm": 0.14581316709518433, "learning_rate": 6.15582970243117e-06, "loss": 2.9882, "num_input_tokens_seen": 15649996800, "step": 59700 }, { "epoch": 0.401908307730597, "grad_norm": 0.1409323662519455, "learning_rate": 4.277569313094809e-06, "loss": 2.9833, "num_input_tokens_seen": 15663104000, "step": 59750 }, { "epoch": 0.4022446326743046, "grad_norm": 0.1412041187286377, "learning_rate": 2.739052315863355e-06, "loss": 2.9835, "num_input_tokens_seen": 15676211200, "step": 59800 }, { "epoch": 0.4025809576180122, "grad_norm": 0.14011850953102112, "learning_rate": 1.541333133436018e-06, "loss": 2.9819, "num_input_tokens_seen": 15689318400, "step": 59850 }, { "epoch": 0.4029172825617198, "grad_norm": 0.14772015810012817, "learning_rate": 6.852326227130834e-07, "loss": 2.9855, "num_input_tokens_seen": 15702425600, "step": 59900 }, { "epoch": 0.40325360750542744, "grad_norm": 0.14281156659126282, "learning_rate": 1.7133751222137007e-07, "loss": 2.978, "num_input_tokens_seen": 15715532800, "step": 59950 }, { "epoch": 0.40358993244913505, "grad_norm": 0.14420129358768463, "learning_rate": 0.0, "loss": 2.9789, "num_input_tokens_seen": 15728640000, "step": 60000 }, { "epoch": 0.40358993244913505, "eval_loss": 2.8818726539611816, "eval_runtime": 53.5982, "eval_samples_per_second": 93.287, "eval_steps_per_second": 23.322, "num_input_tokens_seen": 15728640000, "step": 60000 }, { "epoch": 0.40392625739284266, "grad_norm": 0.2130047082901001, "learning_rate": 0.0006867974850262581, "loss": 3.0074, "num_input_tokens_seen": 15741747200, "step": 60050 }, { "epoch": 0.40426258233655027, "grad_norm": 0.18596570193767548, "learning_rate": 0.000682235249939575, "loss": 2.9981, "num_input_tokens_seen": 15754854400, "step": 60100 }, { "epoch": 0.4045989072802579, "grad_norm": 0.2774942219257355, "learning_rate": 0.0006776554506402081, "loss": 3.0024, "num_input_tokens_seen": 15767961600, "step": 60150 }, { "epoch": 0.4049352322239655, "grad_norm": 0.19329522550106049, "learning_rate": 0.0006730585285387465, "loss": 3.0101, "num_input_tokens_seen": 15781068800, "step": 60200 }, { "epoch": 0.4052715571676731, "grad_norm": 0.21384254097938538, "learning_rate": 0.0006684449266961101, "loss": 3.0095, "num_input_tokens_seen": 15794176000, "step": 60250 }, { "epoch": 0.4056078821113807, "grad_norm": 0.3892166018486023, "learning_rate": 0.0006638150897808468, "loss": 3.0101, "num_input_tokens_seen": 15807283200, "step": 60300 }, { "epoch": 0.4059442070550883, "grad_norm": 0.27356287837028503, "learning_rate": 0.0006591694640262749, "loss": 3.0322, "num_input_tokens_seen": 15820390400, "step": 60350 }, { "epoch": 0.40628053199879594, "grad_norm": 0.20498153567314148, "learning_rate": 0.0006545084971874737, "loss": 3.0064, "num_input_tokens_seen": 15833497600, "step": 60400 }, { "epoch": 0.40661685694250355, "grad_norm": 0.19939659535884857, "learning_rate": 0.0006498326384981283, "loss": 3.0158, "num_input_tokens_seen": 15846604800, "step": 60450 }, { "epoch": 0.40695318188621116, "grad_norm": 0.24545226991176605, "learning_rate": 0.0006451423386272311, "loss": 3.0132, "num_input_tokens_seen": 15859712000, "step": 60500 }, { "epoch": 0.40695318188621116, "eval_loss": 2.914865255355835, "eval_runtime": 51.2039, "eval_samples_per_second": 97.649, "eval_steps_per_second": 24.412, "num_input_tokens_seen": 15859712000, "step": 60500 }, { "epoch": 0.40728950682991877, "grad_norm": 0.2364359349012375, "learning_rate": 0.0006404380496356461, "loss": 3.0102, "num_input_tokens_seen": 15872819200, "step": 60550 }, { "epoch": 0.4076258317736264, "grad_norm": 0.19283762574195862, "learning_rate": 0.0006357202249325371, "loss": 3.0132, "num_input_tokens_seen": 15885926400, "step": 60600 }, { "epoch": 0.40796215671733405, "grad_norm": 0.19770501554012299, "learning_rate": 0.0006309893192316686, "loss": 3.0106, "num_input_tokens_seen": 15899033600, "step": 60650 }, { "epoch": 0.40829848166104166, "grad_norm": 0.18395134806632996, "learning_rate": 0.000626245788507579, "loss": 3.005, "num_input_tokens_seen": 15912140800, "step": 60700 }, { "epoch": 0.40863480660474927, "grad_norm": 0.21380823850631714, "learning_rate": 0.000621490089951632, "loss": 3.0106, "num_input_tokens_seen": 15925248000, "step": 60750 }, { "epoch": 0.4089711315484569, "grad_norm": 0.17995478212833405, "learning_rate": 0.0006167226819279528, "loss": 3.0237, "num_input_tokens_seen": 15938355200, "step": 60800 }, { "epoch": 0.4093074564921645, "grad_norm": 0.31993716955184937, "learning_rate": 0.0006119440239292493, "loss": 3.0158, "num_input_tokens_seen": 15951462400, "step": 60850 }, { "epoch": 0.4096437814358721, "grad_norm": 0.19210565090179443, "learning_rate": 0.0006071545765325253, "loss": 3.0121, "num_input_tokens_seen": 15964569600, "step": 60900 }, { "epoch": 0.4099801063795797, "grad_norm": 0.4126472771167755, "learning_rate": 0.0006023548013546899, "loss": 3.0215, "num_input_tokens_seen": 15977676800, "step": 60950 }, { "epoch": 0.4103164313232873, "grad_norm": 0.26418012380599976, "learning_rate": 0.0005975451610080642, "loss": 3.0125, "num_input_tokens_seen": 15990784000, "step": 61000 }, { "epoch": 0.4103164313232873, "eval_loss": 2.913696765899658, "eval_runtime": 52.0924, "eval_samples_per_second": 95.983, "eval_steps_per_second": 23.996, "num_input_tokens_seen": 15990784000, "step": 61000 }, { "epoch": 0.41065275626699493, "grad_norm": 0.3535885810852051, "learning_rate": 0.0005927261190557954, "loss": 3.0102, "num_input_tokens_seen": 16003891200, "step": 61050 }, { "epoch": 0.41098908121070254, "grad_norm": 0.2633107304573059, "learning_rate": 0.0005878981399671774, "loss": 3.0424, "num_input_tokens_seen": 16016998400, "step": 61100 }, { "epoch": 0.41132540615441016, "grad_norm": 0.3054018020629883, "learning_rate": 0.0005830616890728827, "loss": 3.0233, "num_input_tokens_seen": 16030105600, "step": 61150 }, { "epoch": 0.41166173109811777, "grad_norm": 0.21453993022441864, "learning_rate": 0.0005782172325201155, "loss": 3.018, "num_input_tokens_seen": 16043212800, "step": 61200 }, { "epoch": 0.4119980560418254, "grad_norm": 0.27815598249435425, "learning_rate": 0.0005733652372276809, "loss": 3.0254, "num_input_tokens_seen": 16056320000, "step": 61250 }, { "epoch": 0.412334380985533, "grad_norm": 0.20687313377857208, "learning_rate": 0.0005685061708409841, "loss": 3.0165, "num_input_tokens_seen": 16069427200, "step": 61300 }, { "epoch": 0.4126707059292406, "grad_norm": 0.1985252946615219, "learning_rate": 0.0005636405016869566, "loss": 3.0164, "num_input_tokens_seen": 16082534400, "step": 61350 }, { "epoch": 0.4130070308729482, "grad_norm": 0.26703181862831116, "learning_rate": 0.0005587686987289189, "loss": 3.0001, "num_input_tokens_seen": 16095641600, "step": 61400 }, { "epoch": 0.4133433558166558, "grad_norm": 0.1948036104440689, "learning_rate": 0.0005538912315213797, "loss": 3.0058, "num_input_tokens_seen": 16108748800, "step": 61450 }, { "epoch": 0.41367968076036343, "grad_norm": 0.20653308928012848, "learning_rate": 0.0005490085701647804, "loss": 3.0115, "num_input_tokens_seen": 16121856000, "step": 61500 }, { "epoch": 0.41367968076036343, "eval_loss": 2.9048781394958496, "eval_runtime": 53.8207, "eval_samples_per_second": 92.901, "eval_steps_per_second": 23.225, "num_input_tokens_seen": 16121856000, "step": 61500 }, { "epoch": 0.41401600570407104, "grad_norm": 0.19605295360088348, "learning_rate": 0.0005441211852601849, "loss": 3.0225, "num_input_tokens_seen": 16134963200, "step": 61550 }, { "epoch": 0.41435233064777865, "grad_norm": 0.17526155710220337, "learning_rate": 0.0005392295478639225, "loss": 3.0117, "num_input_tokens_seen": 16148070400, "step": 61600 }, { "epoch": 0.41468865559148627, "grad_norm": 0.17657403647899628, "learning_rate": 0.0005343341294421868, "loss": 3.0107, "num_input_tokens_seen": 16161177600, "step": 61650 }, { "epoch": 0.4150249805351939, "grad_norm": 0.18658681213855743, "learning_rate": 0.0005294354018255945, "loss": 3.0085, "num_input_tokens_seen": 16174284800, "step": 61700 }, { "epoch": 0.4153613054789015, "grad_norm": 0.24781519174575806, "learning_rate": 0.0005245338371637091, "loss": 2.9939, "num_input_tokens_seen": 16187392000, "step": 61750 }, { "epoch": 0.4156976304226091, "grad_norm": 0.20824941992759705, "learning_rate": 0.0005196299078795343, "loss": 3.0038, "num_input_tokens_seen": 16200499200, "step": 61800 }, { "epoch": 0.4160339553663167, "grad_norm": 0.38262441754341125, "learning_rate": 0.0005147240866239817, "loss": 3.0141, "num_input_tokens_seen": 16213606400, "step": 61850 }, { "epoch": 0.4163702803100243, "grad_norm": 0.200628861784935, "learning_rate": 0.0005098168462303141, "loss": 3.0187, "num_input_tokens_seen": 16226713600, "step": 61900 }, { "epoch": 0.41670660525373193, "grad_norm": 0.18858259916305542, "learning_rate": 0.000504908659668575, "loss": 3.0049, "num_input_tokens_seen": 16239820800, "step": 61950 }, { "epoch": 0.41704293019743954, "grad_norm": 0.19025108218193054, "learning_rate": 0.0005, "loss": 3.0079, "num_input_tokens_seen": 16252928000, "step": 62000 }, { "epoch": 0.41704293019743954, "eval_loss": 2.9012608528137207, "eval_runtime": 52.7052, "eval_samples_per_second": 94.867, "eval_steps_per_second": 23.717, "num_input_tokens_seen": 16252928000, "step": 62000 }, { "epoch": 0.41737925514114715, "grad_norm": 0.19505389034748077, "learning_rate": 0.0004950913403314252, "loss": 2.9995, "num_input_tokens_seen": 16266035200, "step": 62050 }, { "epoch": 0.41771558008485477, "grad_norm": 0.18988089263439178, "learning_rate": 0.0004901831537696859, "loss": 3.0041, "num_input_tokens_seen": 16279142400, "step": 62100 }, { "epoch": 0.4180519050285624, "grad_norm": 0.19544407725334167, "learning_rate": 0.0004852759133760184, "loss": 3.0073, "num_input_tokens_seen": 16292249600, "step": 62150 }, { "epoch": 0.41838822997227, "grad_norm": 0.1884351521730423, "learning_rate": 0.00048037009212046586, "loss": 3.0035, "num_input_tokens_seen": 16305356800, "step": 62200 }, { "epoch": 0.4187245549159776, "grad_norm": 0.17927390336990356, "learning_rate": 0.000475466162836291, "loss": 2.9921, "num_input_tokens_seen": 16318464000, "step": 62250 }, { "epoch": 0.4190608798596852, "grad_norm": 0.18687283992767334, "learning_rate": 0.00047056459817440544, "loss": 3.0042, "num_input_tokens_seen": 16331571200, "step": 62300 }, { "epoch": 0.4193972048033928, "grad_norm": 0.18783149123191833, "learning_rate": 0.00046566587055781316, "loss": 3.0003, "num_input_tokens_seen": 16344678400, "step": 62350 }, { "epoch": 0.41973352974710043, "grad_norm": 0.18625770509243011, "learning_rate": 0.0004607704521360776, "loss": 3.0061, "num_input_tokens_seen": 16357785600, "step": 62400 }, { "epoch": 0.4200698546908081, "grad_norm": 0.20189669728279114, "learning_rate": 0.00045587881473981533, "loss": 2.9976, "num_input_tokens_seen": 16370892800, "step": 62450 }, { "epoch": 0.4204061796345157, "grad_norm": 0.19049198925495148, "learning_rate": 0.0004509914298352197, "loss": 3.0055, "num_input_tokens_seen": 16384000000, "step": 62500 }, { "epoch": 0.4204061796345157, "eval_loss": 2.896798849105835, "eval_runtime": 52.8908, "eval_samples_per_second": 94.534, "eval_steps_per_second": 23.634, "num_input_tokens_seen": 16384000000, "step": 62500 }, { "epoch": 0.4207425045782233, "grad_norm": 0.1667575091123581, "learning_rate": 0.00044610876847862033, "loss": 2.9929, "num_input_tokens_seen": 16397107200, "step": 62550 }, { "epoch": 0.42107882952193093, "grad_norm": 0.7176526188850403, "learning_rate": 0.00044123130127108126, "loss": 2.9918, "num_input_tokens_seen": 16410214400, "step": 62600 }, { "epoch": 0.42141515446563854, "grad_norm": 0.20578069984912872, "learning_rate": 0.00043635949831304343, "loss": 3.0037, "num_input_tokens_seen": 16423321600, "step": 62650 }, { "epoch": 0.42175147940934615, "grad_norm": 0.19712655246257782, "learning_rate": 0.0004314938291590161, "loss": 3.0142, "num_input_tokens_seen": 16436428800, "step": 62700 }, { "epoch": 0.42208780435305376, "grad_norm": 0.20189446210861206, "learning_rate": 0.00042663476277231917, "loss": 2.9983, "num_input_tokens_seen": 16449536000, "step": 62750 }, { "epoch": 0.4224241292967614, "grad_norm": 0.18463867902755737, "learning_rate": 0.0004217827674798845, "loss": 2.9971, "num_input_tokens_seen": 16462643200, "step": 62800 }, { "epoch": 0.422760454240469, "grad_norm": 0.17639389634132385, "learning_rate": 0.0004169383109271174, "loss": 3.0032, "num_input_tokens_seen": 16475750400, "step": 62850 }, { "epoch": 0.4230967791841766, "grad_norm": 0.1733781099319458, "learning_rate": 0.00041210186003282274, "loss": 2.9932, "num_input_tokens_seen": 16488857600, "step": 62900 }, { "epoch": 0.4234331041278842, "grad_norm": 0.17753124237060547, "learning_rate": 0.00040727388094420456, "loss": 3.0012, "num_input_tokens_seen": 16501964800, "step": 62950 }, { "epoch": 0.4237694290715918, "grad_norm": 0.180925652384758, "learning_rate": 0.00040245483899193594, "loss": 2.9823, "num_input_tokens_seen": 16515072000, "step": 63000 }, { "epoch": 0.4237694290715918, "eval_loss": 2.8929545879364014, "eval_runtime": 53.37, "eval_samples_per_second": 93.686, "eval_steps_per_second": 23.421, "num_input_tokens_seen": 16515072000, "step": 63000 }, { "epoch": 0.42410575401529943, "grad_norm": 0.15995506942272186, "learning_rate": 0.00039764519864531023, "loss": 2.9898, "num_input_tokens_seen": 16528179200, "step": 63050 }, { "epoch": 0.42444207895900704, "grad_norm": 0.16034817695617676, "learning_rate": 0.0003928454234674747, "loss": 2.9884, "num_input_tokens_seen": 16541286400, "step": 63100 }, { "epoch": 0.42477840390271465, "grad_norm": 0.17681469023227692, "learning_rate": 0.00038805597607075075, "loss": 2.9952, "num_input_tokens_seen": 16554393600, "step": 63150 }, { "epoch": 0.42511472884642226, "grad_norm": 0.18527273833751678, "learning_rate": 0.00038327731807204744, "loss": 2.9947, "num_input_tokens_seen": 16567500800, "step": 63200 }, { "epoch": 0.4254510537901299, "grad_norm": 0.16262546181678772, "learning_rate": 0.0003785099100483681, "loss": 2.9972, "num_input_tokens_seen": 16580608000, "step": 63250 }, { "epoch": 0.4257873787338375, "grad_norm": 0.1709870994091034, "learning_rate": 0.00037375421149242103, "loss": 2.999, "num_input_tokens_seen": 16593715200, "step": 63300 }, { "epoch": 0.4261237036775451, "grad_norm": 0.1716383844614029, "learning_rate": 0.0003690106807683313, "loss": 2.9964, "num_input_tokens_seen": 16606822400, "step": 63350 }, { "epoch": 0.4264600286212527, "grad_norm": 0.18682868778705597, "learning_rate": 0.0003642797750674629, "loss": 3.0037, "num_input_tokens_seen": 16619929600, "step": 63400 }, { "epoch": 0.4267963535649603, "grad_norm": 0.16003596782684326, "learning_rate": 0.00035956195036435405, "loss": 2.9893, "num_input_tokens_seen": 16633036800, "step": 63450 }, { "epoch": 0.42713267850866793, "grad_norm": 0.17876048386096954, "learning_rate": 0.0003548576613727689, "loss": 3.0004, "num_input_tokens_seen": 16646144000, "step": 63500 }, { "epoch": 0.42713267850866793, "eval_loss": 2.8903579711914062, "eval_runtime": 53.0482, "eval_samples_per_second": 94.254, "eval_steps_per_second": 23.563, "num_input_tokens_seen": 16646144000, "step": 63500 }, { "epoch": 0.42746900345237554, "grad_norm": 0.21229425072669983, "learning_rate": 0.00035016736150187165, "loss": 2.9925, "num_input_tokens_seen": 16659251200, "step": 63550 }, { "epoch": 0.42780532839608315, "grad_norm": 0.19477584958076477, "learning_rate": 0.00034549150281252633, "loss": 2.9892, "num_input_tokens_seen": 16672358400, "step": 63600 }, { "epoch": 0.42814165333979076, "grad_norm": 0.1866609901189804, "learning_rate": 0.0003408305359737252, "loss": 2.9913, "num_input_tokens_seen": 16685465600, "step": 63650 }, { "epoch": 0.4284779782834984, "grad_norm": 0.19487887620925903, "learning_rate": 0.0003361849102191533, "loss": 2.9875, "num_input_tokens_seen": 16698572800, "step": 63700 }, { "epoch": 0.428814303227206, "grad_norm": 0.15979841351509094, "learning_rate": 0.00033155507330389, "loss": 2.9894, "num_input_tokens_seen": 16711680000, "step": 63750 }, { "epoch": 0.4291506281709136, "grad_norm": 0.1749998778104782, "learning_rate": 0.0003269414714612534, "loss": 2.9945, "num_input_tokens_seen": 16724787200, "step": 63800 }, { "epoch": 0.4294869531146212, "grad_norm": 0.16839075088500977, "learning_rate": 0.00032234454935979205, "loss": 2.9989, "num_input_tokens_seen": 16737894400, "step": 63850 }, { "epoch": 0.4298232780583288, "grad_norm": 0.19226372241973877, "learning_rate": 0.0003177647500604252, "loss": 2.9854, "num_input_tokens_seen": 16751001600, "step": 63900 }, { "epoch": 0.43015960300203643, "grad_norm": 0.15530380606651306, "learning_rate": 0.0003132025149737419, "loss": 2.9903, "num_input_tokens_seen": 16764108800, "step": 63950 }, { "epoch": 0.43049592794574404, "grad_norm": 0.17773845791816711, "learning_rate": 0.0003086582838174551, "loss": 2.9839, "num_input_tokens_seen": 16777216000, "step": 64000 }, { "epoch": 0.43049592794574404, "eval_loss": 2.8860437870025635, "eval_runtime": 53.1514, "eval_samples_per_second": 94.071, "eval_steps_per_second": 23.518, "num_input_tokens_seen": 16777216000, "step": 64000 }, { "epoch": 0.43083225288945165, "grad_norm": 0.15883377194404602, "learning_rate": 0.000304132494574022, "loss": 2.9851, "num_input_tokens_seen": 16790323200, "step": 64050 }, { "epoch": 0.43116857783315926, "grad_norm": 0.176467627286911, "learning_rate": 0.00029962558344842963, "loss": 2.9865, "num_input_tokens_seen": 16803430400, "step": 64100 }, { "epoch": 0.43150490277686687, "grad_norm": 0.16392388939857483, "learning_rate": 0.00029513798482615227, "loss": 2.9788, "num_input_tokens_seen": 16816537600, "step": 64150 }, { "epoch": 0.4318412277205745, "grad_norm": 0.15614169836044312, "learning_rate": 0.0002906701312312861, "loss": 2.9769, "num_input_tokens_seen": 16829644800, "step": 64200 }, { "epoch": 0.43217755266428215, "grad_norm": 0.16225555539131165, "learning_rate": 0.00028622245328485907, "loss": 2.9881, "num_input_tokens_seen": 16842752000, "step": 64250 }, { "epoch": 0.43251387760798976, "grad_norm": 0.16419048607349396, "learning_rate": 0.0002817953796633289, "loss": 2.99, "num_input_tokens_seen": 16855859200, "step": 64300 }, { "epoch": 0.43285020255169737, "grad_norm": 0.16654469072818756, "learning_rate": 0.000277389337057266, "loss": 2.9919, "num_input_tokens_seen": 16868966400, "step": 64350 }, { "epoch": 0.433186527495405, "grad_norm": 0.1688661277294159, "learning_rate": 0.00027300475013022663, "loss": 2.9844, "num_input_tokens_seen": 16882073600, "step": 64400 }, { "epoch": 0.4335228524391126, "grad_norm": 0.162180095911026, "learning_rate": 0.000268642041477825, "loss": 2.9847, "num_input_tokens_seen": 16895180800, "step": 64450 }, { "epoch": 0.4338591773828202, "grad_norm": 0.18244421482086182, "learning_rate": 0.00026430163158700117, "loss": 2.9789, "num_input_tokens_seen": 16908288000, "step": 64500 }, { "epoch": 0.4338591773828202, "eval_loss": 2.8813860416412354, "eval_runtime": 53.1806, "eval_samples_per_second": 94.019, "eval_steps_per_second": 23.505, "num_input_tokens_seen": 16908288000, "step": 64500 }, { "epoch": 0.4341955023265278, "grad_norm": 0.15887753665447235, "learning_rate": 0.00025998393879549445, "loss": 2.9723, "num_input_tokens_seen": 16921395200, "step": 64550 }, { "epoch": 0.4345318272702354, "grad_norm": 0.17573221027851105, "learning_rate": 0.0002556893792515227, "loss": 2.99, "num_input_tokens_seen": 16934502400, "step": 64600 }, { "epoch": 0.43486815221394304, "grad_norm": 0.1790430247783661, "learning_rate": 0.0002514183668736727, "loss": 2.9887, "num_input_tokens_seen": 16947609600, "step": 64650 }, { "epoch": 0.43520447715765065, "grad_norm": 0.16031622886657715, "learning_rate": 0.0002471713133110078, "loss": 2.9835, "num_input_tokens_seen": 16960716800, "step": 64700 }, { "epoch": 0.43554080210135826, "grad_norm": 0.1702345311641693, "learning_rate": 0.0002429486279033892, "loss": 2.9862, "num_input_tokens_seen": 16973824000, "step": 64750 }, { "epoch": 0.43587712704506587, "grad_norm": 0.16080138087272644, "learning_rate": 0.00023875071764202561, "loss": 2.9785, "num_input_tokens_seen": 16986931200, "step": 64800 }, { "epoch": 0.4362134519887735, "grad_norm": 0.17694465816020966, "learning_rate": 0.0002345779871302453, "loss": 2.9962, "num_input_tokens_seen": 17000038400, "step": 64850 }, { "epoch": 0.4365497769324811, "grad_norm": 0.15310978889465332, "learning_rate": 0.00023043083854449987, "loss": 2.98, "num_input_tokens_seen": 17013145600, "step": 64900 }, { "epoch": 0.4368861018761887, "grad_norm": 0.15505504608154297, "learning_rate": 0.0002263096715956019, "loss": 2.9825, "num_input_tokens_seen": 17026252800, "step": 64950 }, { "epoch": 0.4372224268198963, "grad_norm": 0.15211448073387146, "learning_rate": 0.00022221488349019903, "loss": 2.9876, "num_input_tokens_seen": 17039360000, "step": 65000 }, { "epoch": 0.4372224268198963, "eval_loss": 2.8792829513549805, "eval_runtime": 53.0249, "eval_samples_per_second": 94.295, "eval_steps_per_second": 23.574, "num_input_tokens_seen": 17039360000, "step": 65000 }, { "epoch": 0.4375587517636039, "grad_norm": 0.16188842058181763, "learning_rate": 0.00021814686889249158, "loss": 2.9812, "num_input_tokens_seen": 17052467200, "step": 65050 }, { "epoch": 0.43789507670731154, "grad_norm": 0.14550812542438507, "learning_rate": 0.00021410601988619394, "loss": 2.9856, "num_input_tokens_seen": 17065574400, "step": 65100 }, { "epoch": 0.43823140165101915, "grad_norm": 0.1500539779663086, "learning_rate": 0.00021009272593674322, "loss": 2.9827, "num_input_tokens_seen": 17078681600, "step": 65150 }, { "epoch": 0.43856772659472676, "grad_norm": 0.1571357101202011, "learning_rate": 0.00020610737385376348, "loss": 2.9788, "num_input_tokens_seen": 17091788800, "step": 65200 }, { "epoch": 0.43890405153843437, "grad_norm": 0.1671544760465622, "learning_rate": 0.00020215034775378332, "loss": 2.9758, "num_input_tokens_seen": 17104896000, "step": 65250 }, { "epoch": 0.439240376482142, "grad_norm": 0.15525776147842407, "learning_rate": 0.0001982220290232143, "loss": 2.9823, "num_input_tokens_seen": 17118003200, "step": 65300 }, { "epoch": 0.4395767014258496, "grad_norm": 0.14799903333187103, "learning_rate": 0.00019432279628159188, "loss": 2.9781, "num_input_tokens_seen": 17131110400, "step": 65350 }, { "epoch": 0.4399130263695572, "grad_norm": 0.16087676584720612, "learning_rate": 0.00019045302534508295, "loss": 2.9805, "num_input_tokens_seen": 17144217600, "step": 65400 }, { "epoch": 0.4402493513132648, "grad_norm": 0.15892113745212555, "learning_rate": 0.0001866130891902653, "loss": 2.9823, "num_input_tokens_seen": 17157324800, "step": 65450 }, { "epoch": 0.4405856762569724, "grad_norm": 0.187602236866951, "learning_rate": 0.00018280335791817732, "loss": 2.9804, "num_input_tokens_seen": 17170432000, "step": 65500 }, { "epoch": 0.4405856762569724, "eval_loss": 2.875824451446533, "eval_runtime": 53.0867, "eval_samples_per_second": 94.186, "eval_steps_per_second": 23.546, "num_input_tokens_seen": 17170432000, "step": 65500 }, { "epoch": 0.44092200120068004, "grad_norm": 0.15579210221767426, "learning_rate": 0.0001790241987186485, "loss": 2.9734, "num_input_tokens_seen": 17183539200, "step": 65550 }, { "epoch": 0.44125832614438765, "grad_norm": 0.15250550210475922, "learning_rate": 0.00017527597583490823, "loss": 2.9787, "num_input_tokens_seen": 17196646400, "step": 65600 }, { "epoch": 0.44159465108809526, "grad_norm": 0.15954890847206116, "learning_rate": 0.00017155905052847938, "loss": 2.978, "num_input_tokens_seen": 17209753600, "step": 65650 }, { "epoch": 0.44193097603180287, "grad_norm": 0.15598754584789276, "learning_rate": 0.00016787378104435928, "loss": 2.9809, "num_input_tokens_seen": 17222860800, "step": 65700 }, { "epoch": 0.4422673009755105, "grad_norm": 0.14709477126598358, "learning_rate": 0.00016422052257649078, "loss": 2.9793, "num_input_tokens_seen": 17235968000, "step": 65750 }, { "epoch": 0.4426036259192181, "grad_norm": 0.15505217015743256, "learning_rate": 0.0001605996272335291, "loss": 2.9763, "num_input_tokens_seen": 17249075200, "step": 65800 }, { "epoch": 0.4429399508629257, "grad_norm": 0.14491549134254456, "learning_rate": 0.0001570114440049037, "loss": 2.9756, "num_input_tokens_seen": 17262182400, "step": 65850 }, { "epoch": 0.4432762758066333, "grad_norm": 0.1571652740240097, "learning_rate": 0.00015345631872718213, "loss": 2.977, "num_input_tokens_seen": 17275289600, "step": 65900 }, { "epoch": 0.4436126007503409, "grad_norm": 0.18299035727977753, "learning_rate": 0.00014993459405073824, "loss": 2.9788, "num_input_tokens_seen": 17288396800, "step": 65950 }, { "epoch": 0.44394892569404854, "grad_norm": 0.14829285442829132, "learning_rate": 0.00014644660940672628, "loss": 2.9851, "num_input_tokens_seen": 17301504000, "step": 66000 }, { "epoch": 0.44394892569404854, "eval_loss": 2.8729286193847656, "eval_runtime": 53.2839, "eval_samples_per_second": 93.837, "eval_steps_per_second": 23.459, "num_input_tokens_seen": 17301504000, "step": 66000 }, { "epoch": 0.4442852506377562, "grad_norm": 0.14435406029224396, "learning_rate": 0.0001429927009743659, "loss": 2.9718, "num_input_tokens_seen": 17314611200, "step": 66050 }, { "epoch": 0.4446215755814638, "grad_norm": 0.1603071242570877, "learning_rate": 0.0001395732016485406, "loss": 2.9731, "num_input_tokens_seen": 17327718400, "step": 66100 }, { "epoch": 0.4449579005251714, "grad_norm": 0.14310726523399353, "learning_rate": 0.00013618844100771256, "loss": 2.9665, "num_input_tokens_seen": 17340825600, "step": 66150 }, { "epoch": 0.44529422546887903, "grad_norm": 0.276594340801239, "learning_rate": 0.00013283874528215734, "loss": 2.9711, "num_input_tokens_seen": 17353932800, "step": 66200 }, { "epoch": 0.44563055041258665, "grad_norm": 0.1535540074110031, "learning_rate": 0.00012952443732252057, "loss": 2.9693, "num_input_tokens_seen": 17367040000, "step": 66250 }, { "epoch": 0.44596687535629426, "grad_norm": 0.15807458758354187, "learning_rate": 0.00012624583656870153, "loss": 2.9754, "num_input_tokens_seen": 17380147200, "step": 66300 }, { "epoch": 0.44630320030000187, "grad_norm": 0.14477893710136414, "learning_rate": 0.00012300325901906528, "loss": 2.9735, "num_input_tokens_seen": 17393254400, "step": 66350 }, { "epoch": 0.4466395252437095, "grad_norm": 0.14505073428153992, "learning_rate": 0.00011979701719998454, "loss": 2.9783, "num_input_tokens_seen": 17406361600, "step": 66400 }, { "epoch": 0.4469758501874171, "grad_norm": 0.15850161015987396, "learning_rate": 0.00011662742013571926, "loss": 2.967, "num_input_tokens_seen": 17419468800, "step": 66450 }, { "epoch": 0.4473121751311247, "grad_norm": 0.14653578400611877, "learning_rate": 0.00011349477331863151, "loss": 2.9651, "num_input_tokens_seen": 17432576000, "step": 66500 }, { "epoch": 0.4473121751311247, "eval_loss": 2.8710148334503174, "eval_runtime": 53.2889, "eval_samples_per_second": 93.828, "eval_steps_per_second": 23.457, "num_input_tokens_seen": 17432576000, "step": 66500 }, { "epoch": 0.4476485000748323, "grad_norm": 0.15636616945266724, "learning_rate": 0.00011039937867974164, "loss": 2.9758, "num_input_tokens_seen": 17445683200, "step": 66550 }, { "epoch": 0.4479848250185399, "grad_norm": 0.14427579939365387, "learning_rate": 0.00010734153455962764, "loss": 2.9594, "num_input_tokens_seen": 17458790400, "step": 66600 }, { "epoch": 0.44832114996224753, "grad_norm": 0.15148353576660156, "learning_rate": 0.00010432153567966984, "loss": 2.9684, "num_input_tokens_seen": 17471897600, "step": 66650 }, { "epoch": 0.44865747490595514, "grad_norm": 0.1541094332933426, "learning_rate": 0.0001013396731136465, "loss": 2.9685, "num_input_tokens_seen": 17485004800, "step": 66700 }, { "epoch": 0.44899379984966276, "grad_norm": 0.14267295598983765, "learning_rate": 9.839623425967759e-05, "loss": 2.9728, "num_input_tokens_seen": 17498112000, "step": 66750 }, { "epoch": 0.44933012479337037, "grad_norm": 0.1437918245792389, "learning_rate": 9.549150281252633e-05, "loss": 2.9752, "num_input_tokens_seen": 17511219200, "step": 66800 }, { "epoch": 0.449666449737078, "grad_norm": 0.1517232209444046, "learning_rate": 9.262575873625529e-05, "loss": 2.9729, "num_input_tokens_seen": 17524326400, "step": 66850 }, { "epoch": 0.4500027746807856, "grad_norm": 0.15286608040332794, "learning_rate": 8.979927823724321e-05, "loss": 2.9687, "num_input_tokens_seen": 17537433600, "step": 66900 }, { "epoch": 0.4503390996244932, "grad_norm": 0.14875057339668274, "learning_rate": 8.70123337375635e-05, "loss": 2.9758, "num_input_tokens_seen": 17550540800, "step": 66950 }, { "epoch": 0.4506754245682008, "grad_norm": 0.1493612825870514, "learning_rate": 8.426519384872733e-05, "loss": 2.9704, "num_input_tokens_seen": 17563648000, "step": 67000 }, { "epoch": 0.4506754245682008, "eval_loss": 2.869231939315796, "eval_runtime": 53.2491, "eval_samples_per_second": 93.898, "eval_steps_per_second": 23.475, "num_input_tokens_seen": 17563648000, "step": 67000 }, { "epoch": 0.4510117495119084, "grad_norm": 0.14675357937812805, "learning_rate": 8.155812334579532e-05, "loss": 2.9682, "num_input_tokens_seen": 17576755200, "step": 67050 }, { "epoch": 0.45134807445561603, "grad_norm": 0.14341385662555695, "learning_rate": 7.889138314185678e-05, "loss": 2.9749, "num_input_tokens_seen": 17589862400, "step": 67100 }, { "epoch": 0.45168439939932364, "grad_norm": 0.1442009061574936, "learning_rate": 7.626523026288279e-05, "loss": 2.9637, "num_input_tokens_seen": 17602969600, "step": 67150 }, { "epoch": 0.45202072434303125, "grad_norm": 0.14580078423023224, "learning_rate": 7.367991782295391e-05, "loss": 2.9636, "num_input_tokens_seen": 17616076800, "step": 67200 }, { "epoch": 0.45235704928673887, "grad_norm": 0.13888555765151978, "learning_rate": 7.1135694999864e-05, "loss": 2.9737, "num_input_tokens_seen": 17629184000, "step": 67250 }, { "epoch": 0.4526933742304465, "grad_norm": 0.14820803701877594, "learning_rate": 6.863280701110408e-05, "loss": 2.9778, "num_input_tokens_seen": 17642291200, "step": 67300 }, { "epoch": 0.4530296991741541, "grad_norm": 0.14933691918849945, "learning_rate": 6.617149509022808e-05, "loss": 2.9667, "num_input_tokens_seen": 17655398400, "step": 67350 }, { "epoch": 0.4533660241178617, "grad_norm": 0.14829853177070618, "learning_rate": 6.375199646360142e-05, "loss": 2.9691, "num_input_tokens_seen": 17668505600, "step": 67400 }, { "epoch": 0.4537023490615693, "grad_norm": 0.14731477200984955, "learning_rate": 6.137454432753797e-05, "loss": 2.9731, "num_input_tokens_seen": 17681612800, "step": 67450 }, { "epoch": 0.4540386740052769, "grad_norm": 0.14357906579971313, "learning_rate": 5.903936782582253e-05, "loss": 2.9785, "num_input_tokens_seen": 17694720000, "step": 67500 }, { "epoch": 0.4540386740052769, "eval_loss": 2.867840528488159, "eval_runtime": 53.8197, "eval_samples_per_second": 92.903, "eval_steps_per_second": 23.226, "num_input_tokens_seen": 17694720000, "step": 67500 }, { "epoch": 0.45437499894898453, "grad_norm": 0.1438903659582138, "learning_rate": 5.6746692027626835e-05, "loss": 2.9733, "num_input_tokens_seen": 17707827200, "step": 67550 }, { "epoch": 0.45471132389269214, "grad_norm": 0.14171506464481354, "learning_rate": 5.449673790581611e-05, "loss": 2.9637, "num_input_tokens_seen": 17720934400, "step": 67600 }, { "epoch": 0.45504764883639975, "grad_norm": 0.1645549088716507, "learning_rate": 5.2289722315651546e-05, "loss": 2.9668, "num_input_tokens_seen": 17734041600, "step": 67650 }, { "epoch": 0.45538397378010737, "grad_norm": 0.1390199065208435, "learning_rate": 5.0125857973889355e-05, "loss": 2.9762, "num_input_tokens_seen": 17747148800, "step": 67700 }, { "epoch": 0.455720298723815, "grad_norm": 0.14667369425296783, "learning_rate": 4.800535343827833e-05, "loss": 2.9724, "num_input_tokens_seen": 17760256000, "step": 67750 }, { "epoch": 0.4560566236675226, "grad_norm": 0.14203302562236786, "learning_rate": 4.592841308745932e-05, "loss": 2.9679, "num_input_tokens_seen": 17773363200, "step": 67800 }, { "epoch": 0.45639294861123025, "grad_norm": 0.1517883837223053, "learning_rate": 4.389523710126619e-05, "loss": 2.9723, "num_input_tokens_seen": 17786470400, "step": 67850 }, { "epoch": 0.45672927355493786, "grad_norm": 0.1438019722700119, "learning_rate": 4.190602144143207e-05, "loss": 2.973, "num_input_tokens_seen": 17799577600, "step": 67900 }, { "epoch": 0.4570655984986455, "grad_norm": 0.14281606674194336, "learning_rate": 3.9960957832702595e-05, "loss": 2.9733, "num_input_tokens_seen": 17812684800, "step": 67950 }, { "epoch": 0.4574019234423531, "grad_norm": 0.14911025762557983, "learning_rate": 3.806023374435663e-05, "loss": 2.9724, "num_input_tokens_seen": 17825792000, "step": 68000 }, { "epoch": 0.4574019234423531, "eval_loss": 2.8663442134857178, "eval_runtime": 53.8853, "eval_samples_per_second": 92.79, "eval_steps_per_second": 23.197, "num_input_tokens_seen": 17825792000, "step": 68000 }, { "epoch": 0.4577382483860607, "grad_norm": 0.14517797529697418, "learning_rate": 3.6204032372137984e-05, "loss": 2.9674, "num_input_tokens_seen": 17838899200, "step": 68050 }, { "epoch": 0.4580745733297683, "grad_norm": 0.14154207706451416, "learning_rate": 3.439253262059822e-05, "loss": 2.9627, "num_input_tokens_seen": 17852006400, "step": 68100 }, { "epoch": 0.4584108982734759, "grad_norm": 0.14251314103603363, "learning_rate": 3.2625909085853776e-05, "loss": 2.9681, "num_input_tokens_seen": 17865113600, "step": 68150 }, { "epoch": 0.45874722321718353, "grad_norm": 0.15670983493328094, "learning_rate": 3.0904332038757974e-05, "loss": 2.9708, "num_input_tokens_seen": 17878220800, "step": 68200 }, { "epoch": 0.45908354816089114, "grad_norm": 0.1453925371170044, "learning_rate": 2.9227967408489654e-05, "loss": 2.9686, "num_input_tokens_seen": 17891328000, "step": 68250 }, { "epoch": 0.45941987310459875, "grad_norm": 0.13307476043701172, "learning_rate": 2.7596976766560976e-05, "loss": 2.9595, "num_input_tokens_seen": 17904435200, "step": 68300 }, { "epoch": 0.45975619804830636, "grad_norm": 0.14958307147026062, "learning_rate": 2.6011517311244848e-05, "loss": 2.9661, "num_input_tokens_seen": 17917542400, "step": 68350 }, { "epoch": 0.460092522992014, "grad_norm": 0.14210085570812225, "learning_rate": 2.4471741852423235e-05, "loss": 2.9737, "num_input_tokens_seen": 17930649600, "step": 68400 }, { "epoch": 0.4604288479357216, "grad_norm": 0.15127155184745789, "learning_rate": 2.2977798796859794e-05, "loss": 2.9627, "num_input_tokens_seen": 17943756800, "step": 68450 }, { "epoch": 0.4607651728794292, "grad_norm": 0.14184921979904175, "learning_rate": 2.152983213389559e-05, "loss": 2.9732, "num_input_tokens_seen": 17956864000, "step": 68500 }, { "epoch": 0.4607651728794292, "eval_loss": 2.865307331085205, "eval_runtime": 53.2908, "eval_samples_per_second": 93.825, "eval_steps_per_second": 23.456, "num_input_tokens_seen": 17956864000, "step": 68500 }, { "epoch": 0.4611014978231368, "grad_norm": 0.14755961298942566, "learning_rate": 2.0127981421571295e-05, "loss": 2.9687, "num_input_tokens_seen": 17969971200, "step": 68550 }, { "epoch": 0.4614378227668444, "grad_norm": 0.1370965540409088, "learning_rate": 1.8772381773176416e-05, "loss": 2.9711, "num_input_tokens_seen": 17983078400, "step": 68600 }, { "epoch": 0.46177414771055203, "grad_norm": 0.14454130828380585, "learning_rate": 1.7463163844226305e-05, "loss": 2.9633, "num_input_tokens_seen": 17996185600, "step": 68650 }, { "epoch": 0.46211047265425964, "grad_norm": 0.13908445835113525, "learning_rate": 1.620045381987012e-05, "loss": 2.9662, "num_input_tokens_seen": 18009292800, "step": 68700 }, { "epoch": 0.46244679759796725, "grad_norm": 0.2359876185655594, "learning_rate": 1.4984373402728013e-05, "loss": 2.9671, "num_input_tokens_seen": 18022400000, "step": 68750 }, { "epoch": 0.46278312254167486, "grad_norm": 0.13809122145175934, "learning_rate": 1.3815039801161721e-05, "loss": 2.9684, "num_input_tokens_seen": 18035507200, "step": 68800 }, { "epoch": 0.4631194474853825, "grad_norm": 0.14375115931034088, "learning_rate": 1.26925657179775e-05, "loss": 2.9677, "num_input_tokens_seen": 18048614400, "step": 68850 }, { "epoch": 0.4634557724290901, "grad_norm": 0.14648525416851044, "learning_rate": 1.1617059339563806e-05, "loss": 2.9625, "num_input_tokens_seen": 18061721600, "step": 68900 }, { "epoch": 0.4637920973727977, "grad_norm": 0.1428016871213913, "learning_rate": 1.058862432546387e-05, "loss": 2.9717, "num_input_tokens_seen": 18074828800, "step": 68950 }, { "epoch": 0.4641284223165053, "grad_norm": 0.14518927037715912, "learning_rate": 9.607359798384786e-06, "loss": 2.9622, "num_input_tokens_seen": 18087936000, "step": 69000 }, { "epoch": 0.4641284223165053, "eval_loss": 2.8647797107696533, "eval_runtime": 53.1259, "eval_samples_per_second": 94.116, "eval_steps_per_second": 23.529, "num_input_tokens_seen": 18087936000, "step": 69000 }, { "epoch": 0.4644647472602129, "grad_norm": 0.1424110382795334, "learning_rate": 8.67336033464411e-06, "loss": 2.9591, "num_input_tokens_seen": 18101043200, "step": 69050 }, { "epoch": 0.46480107220392053, "grad_norm": 0.14686723053455353, "learning_rate": 7.786715955054202e-06, "loss": 2.9561, "num_input_tokens_seen": 18114150400, "step": 69100 }, { "epoch": 0.46513739714762814, "grad_norm": 0.13719068467617035, "learning_rate": 6.947512116245669e-06, "loss": 2.9629, "num_input_tokens_seen": 18127257600, "step": 69150 }, { "epoch": 0.46547372209133575, "grad_norm": 0.14337210357189178, "learning_rate": 6.15582970243117e-06, "loss": 2.9713, "num_input_tokens_seen": 18140364800, "step": 69200 }, { "epoch": 0.46581004703504336, "grad_norm": 0.18305008113384247, "learning_rate": 5.411745017609493e-06, "loss": 2.9659, "num_input_tokens_seen": 18153472000, "step": 69250 }, { "epoch": 0.466146371978751, "grad_norm": 0.137322798371315, "learning_rate": 4.715329778211374e-06, "loss": 2.9678, "num_input_tokens_seen": 18166579200, "step": 69300 }, { "epoch": 0.4664826969224586, "grad_norm": 0.13300293684005737, "learning_rate": 4.066651106186981e-06, "loss": 2.9647, "num_input_tokens_seen": 18179686400, "step": 69350 }, { "epoch": 0.4668190218661662, "grad_norm": 0.13357709348201752, "learning_rate": 3.4657715225368535e-06, "loss": 2.965, "num_input_tokens_seen": 18192793600, "step": 69400 }, { "epoch": 0.4671553468098738, "grad_norm": 0.13399702310562134, "learning_rate": 2.9127489412859033e-06, "loss": 2.9614, "num_input_tokens_seen": 18205900800, "step": 69450 }, { "epoch": 0.4674916717535814, "grad_norm": 0.13703274726867676, "learning_rate": 2.4076366639015913e-06, "loss": 2.964, "num_input_tokens_seen": 18219008000, "step": 69500 }, { "epoch": 0.4674916717535814, "eval_loss": 2.8645894527435303, "eval_runtime": 53.3524, "eval_samples_per_second": 93.716, "eval_steps_per_second": 23.429, "num_input_tokens_seen": 18219008000, "step": 69500 }, { "epoch": 0.46782799669728903, "grad_norm": 0.3837803900241852, "learning_rate": 1.950483374156431e-06, "loss": 2.9665, "num_input_tokens_seen": 18232115200, "step": 69550 }, { "epoch": 0.46816432164099664, "grad_norm": 0.13585589826107025, "learning_rate": 1.541333133436018e-06, "loss": 2.9579, "num_input_tokens_seen": 18245222400, "step": 69600 }, { "epoch": 0.4685006465847043, "grad_norm": 0.13347585499286652, "learning_rate": 1.18022537649215e-06, "loss": 2.9636, "num_input_tokens_seen": 18258329600, "step": 69650 }, { "epoch": 0.4688369715284119, "grad_norm": 0.13726544380187988, "learning_rate": 8.671949076420882e-07, "loss": 2.9626, "num_input_tokens_seen": 18271436800, "step": 69700 }, { "epoch": 0.4691732964721195, "grad_norm": 0.14254987239837646, "learning_rate": 6.022718974137975e-07, "loss": 2.9698, "num_input_tokens_seen": 18284544000, "step": 69750 }, { "epoch": 0.46950962141582714, "grad_norm": 0.1329219937324524, "learning_rate": 3.854818796385495e-07, "loss": 2.96, "num_input_tokens_seen": 18297651200, "step": 69800 }, { "epoch": 0.46984594635953475, "grad_norm": 0.1384582668542862, "learning_rate": 2.1684574898939157e-07, "loss": 2.9693, "num_input_tokens_seen": 18310758400, "step": 69850 }, { "epoch": 0.47018227130324236, "grad_norm": 0.14365264773368835, "learning_rate": 9.637975896759077e-08, "loss": 2.9686, "num_input_tokens_seen": 18323865600, "step": 69900 }, { "epoch": 0.47051859624694997, "grad_norm": 0.13613733649253845, "learning_rate": 2.4095520335998265e-08, "loss": 2.9607, "num_input_tokens_seen": 18336972800, "step": 69950 }, { "epoch": 0.4708549211906576, "grad_norm": 0.14377959072589874, "learning_rate": 0.0, "loss": 2.9684, "num_input_tokens_seen": 18350080000, "step": 70000 }, { "epoch": 0.4708549211906576, "eval_loss": 2.8644959926605225, "eval_runtime": 54.0337, "eval_samples_per_second": 92.535, "eval_steps_per_second": 23.134, "num_input_tokens_seen": 18350080000, "step": 70000 }, { "epoch": 0.4708549211906576, "num_input_tokens_seen": 18350080000, "step": 70000, "total_flos": 4.9088254967808e+18, "train_loss": 0.4265073311941964, "train_runtime": 14322.5496, "train_samples_per_second": 1251.174, "train_steps_per_second": 4.887, "train_tokens_per_second": 1281202.057 } ], "logging_steps": 50, "max_steps": 70000, "num_input_tokens_seen": 18350080000, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.9088254967808e+18, "train_batch_size": 64, "trial_name": null, "trial_params": null }