{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8566756864731733, "eval_steps": 500, "global_step": 39000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001098302162145094, "grad_norm": 1.8242720365524292, "learning_rate": 1e-05, "loss": 10.2008, "num_input_tokens_seen": 52428800, "step": 50 }, { "epoch": 0.002196604324290188, "grad_norm": 1.7588739395141602, "learning_rate": 2e-05, "loss": 9.6579, "num_input_tokens_seen": 104857600, "step": 100 }, { "epoch": 0.003294906486435282, "grad_norm": 1.4990028142929077, "learning_rate": 3e-05, "loss": 8.9277, "num_input_tokens_seen": 157286400, "step": 150 }, { "epoch": 0.004393208648580376, "grad_norm": 0.9206030368804932, "learning_rate": 4e-05, "loss": 8.0739, "num_input_tokens_seen": 209715200, "step": 200 }, { "epoch": 0.00549151081072547, "grad_norm": 0.4887239933013916, "learning_rate": 5e-05, "loss": 7.406, "num_input_tokens_seen": 262144000, "step": 250 }, { "epoch": 0.006589812972870564, "grad_norm": 0.7044657468795776, "learning_rate": 6e-05, "loss": 6.9708, "num_input_tokens_seen": 314572800, "step": 300 }, { "epoch": 0.007688115135015658, "grad_norm": 0.9420009255409241, "learning_rate": 7.000000000000001e-05, "loss": 6.6177, "num_input_tokens_seen": 367001600, "step": 350 }, { "epoch": 0.008786417297160752, "grad_norm": 1.4098254442214966, "learning_rate": 8e-05, "loss": 6.3285, "num_input_tokens_seen": 419430400, "step": 400 }, { "epoch": 0.009884719459305845, "grad_norm": 0.5596774220466614, "learning_rate": 8.999999999999999e-05, "loss": 6.0918, "num_input_tokens_seen": 471859200, "step": 450 }, { "epoch": 0.01098302162145094, "grad_norm": 0.5934723615646362, "learning_rate": 0.0001, "loss": 5.8926, "num_input_tokens_seen": 524288000, "step": 500 }, { "epoch": 0.01098302162145094, "eval_loss": 5.72648286819458, "eval_runtime": 65.255, "eval_samples_per_second": 76.622, "eval_steps_per_second": 19.156, "num_input_tokens_seen": 524288000, "step": 500 }, { "epoch": 0.012081323783596033, "grad_norm": 0.6180132627487183, "learning_rate": 0.00011, "loss": 5.713, "num_input_tokens_seen": 576716800, "step": 550 }, { "epoch": 0.013179625945741128, "grad_norm": 0.7194430232048035, "learning_rate": 0.00012, "loss": 5.5604, "num_input_tokens_seen": 629145600, "step": 600 }, { "epoch": 0.014277928107886221, "grad_norm": 0.7763974070549011, "learning_rate": 0.00013000000000000002, "loss": 5.4212, "num_input_tokens_seen": 681574400, "step": 650 }, { "epoch": 0.015376230270031316, "grad_norm": 0.7948254942893982, "learning_rate": 0.00014000000000000001, "loss": 5.2875, "num_input_tokens_seen": 734003200, "step": 700 }, { "epoch": 0.01647453243217641, "grad_norm": 0.7185749411582947, "learning_rate": 0.00015, "loss": 5.1765, "num_input_tokens_seen": 786432000, "step": 750 }, { "epoch": 0.017572834594321504, "grad_norm": 0.673218846321106, "learning_rate": 0.00016, "loss": 5.0599, "num_input_tokens_seen": 838860800, "step": 800 }, { "epoch": 0.018671136756466596, "grad_norm": 0.6499584317207336, "learning_rate": 0.00017, "loss": 4.9475, "num_input_tokens_seen": 891289600, "step": 850 }, { "epoch": 0.01976943891861169, "grad_norm": 0.9044798016548157, "learning_rate": 0.00017999999999999998, "loss": 4.8334, "num_input_tokens_seen": 943718400, "step": 900 }, { "epoch": 0.020867741080756785, "grad_norm": 0.886431872844696, "learning_rate": 0.00019, "loss": 4.723, "num_input_tokens_seen": 996147200, "step": 950 }, { "epoch": 0.02196604324290188, "grad_norm": 0.6721145510673523, "learning_rate": 0.0002, "loss": 4.6106, "num_input_tokens_seen": 1048576000, "step": 1000 }, { "epoch": 0.02196604324290188, "eval_loss": 4.456684589385986, "eval_runtime": 66.2606, "eval_samples_per_second": 75.46, "eval_steps_per_second": 18.865, "num_input_tokens_seen": 1048576000, "step": 1000 }, { "epoch": 0.02306434540504697, "grad_norm": 0.6067565083503723, "learning_rate": 0.00021, "loss": 4.5355, "num_input_tokens_seen": 1101004800, "step": 1050 }, { "epoch": 0.024162647567192067, "grad_norm": 0.6668316721916199, "learning_rate": 0.00022, "loss": 4.4383, "num_input_tokens_seen": 1153433600, "step": 1100 }, { "epoch": 0.02526094972933716, "grad_norm": 0.3714616000652313, "learning_rate": 0.00023, "loss": 4.3538, "num_input_tokens_seen": 1205862400, "step": 1150 }, { "epoch": 0.026359251891482256, "grad_norm": 0.439012348651886, "learning_rate": 0.00024, "loss": 4.2848, "num_input_tokens_seen": 1258291200, "step": 1200 }, { "epoch": 0.027457554053627348, "grad_norm": 0.5026286840438843, "learning_rate": 0.00025, "loss": 4.2181, "num_input_tokens_seen": 1310720000, "step": 1250 }, { "epoch": 0.028555856215772443, "grad_norm": 0.4865541160106659, "learning_rate": 0.00026000000000000003, "loss": 4.1495, "num_input_tokens_seen": 1363148800, "step": 1300 }, { "epoch": 0.029654158377917537, "grad_norm": 0.5259677767753601, "learning_rate": 0.00027, "loss": 4.0873, "num_input_tokens_seen": 1415577600, "step": 1350 }, { "epoch": 0.030752460540062632, "grad_norm": 0.4151704013347626, "learning_rate": 0.00028000000000000003, "loss": 4.0369, "num_input_tokens_seen": 1468006400, "step": 1400 }, { "epoch": 0.03185076270220773, "grad_norm": 0.5806245803833008, "learning_rate": 0.00029, "loss": 3.9881, "num_input_tokens_seen": 1520435200, "step": 1450 }, { "epoch": 0.03294906486435282, "grad_norm": 0.46140730381011963, "learning_rate": 0.0003, "loss": 3.9311, "num_input_tokens_seen": 1572864000, "step": 1500 }, { "epoch": 0.03294906486435282, "eval_loss": 3.8112432956695557, "eval_runtime": 65.8947, "eval_samples_per_second": 75.879, "eval_steps_per_second": 18.97, "num_input_tokens_seen": 1572864000, "step": 1500 }, { "epoch": 0.03404736702649791, "grad_norm": 0.4219188392162323, "learning_rate": 0.00031, "loss": 3.8972, "num_input_tokens_seen": 1625292800, "step": 1550 }, { "epoch": 0.03514566918864301, "grad_norm": 0.3506027162075043, "learning_rate": 0.00032, "loss": 3.8596, "num_input_tokens_seen": 1677721600, "step": 1600 }, { "epoch": 0.0362439713507881, "grad_norm": 0.5210819840431213, "learning_rate": 0.00033, "loss": 3.8182, "num_input_tokens_seen": 1730150400, "step": 1650 }, { "epoch": 0.03734227351293319, "grad_norm": 0.5830159783363342, "learning_rate": 0.00034, "loss": 3.7766, "num_input_tokens_seen": 1782579200, "step": 1700 }, { "epoch": 0.03844057567507829, "grad_norm": 0.4602348804473877, "learning_rate": 0.00035, "loss": 3.7362, "num_input_tokens_seen": 1835008000, "step": 1750 }, { "epoch": 0.03953887783722338, "grad_norm": 0.40075036883354187, "learning_rate": 0.00035999999999999997, "loss": 3.7136, "num_input_tokens_seen": 1887436800, "step": 1800 }, { "epoch": 0.04063717999936848, "grad_norm": 0.3893415629863739, "learning_rate": 0.00037, "loss": 3.6809, "num_input_tokens_seen": 1939865600, "step": 1850 }, { "epoch": 0.04173548216151357, "grad_norm": 0.2921469211578369, "learning_rate": 0.00038, "loss": 3.6565, "num_input_tokens_seen": 1992294400, "step": 1900 }, { "epoch": 0.04283378432365866, "grad_norm": 0.49007460474967957, "learning_rate": 0.00039000000000000005, "loss": 3.6215, "num_input_tokens_seen": 2044723200, "step": 1950 }, { "epoch": 0.04393208648580376, "grad_norm": 0.2980474531650543, "learning_rate": 0.0004, "loss": 3.591, "num_input_tokens_seen": 2097152000, "step": 2000 }, { "epoch": 0.04393208648580376, "eval_loss": 3.4769670963287354, "eval_runtime": 62.8853, "eval_samples_per_second": 79.51, "eval_steps_per_second": 19.877, "num_input_tokens_seen": 2097152000, "step": 2000 }, { "epoch": 0.04503038864794885, "grad_norm": 0.33002936840057373, "learning_rate": 0.00041, "loss": 3.5684, "num_input_tokens_seen": 2149580800, "step": 2050 }, { "epoch": 0.04612869081009394, "grad_norm": 0.43806758522987366, "learning_rate": 0.00042, "loss": 3.5436, "num_input_tokens_seen": 2202009600, "step": 2100 }, { "epoch": 0.04722699297223904, "grad_norm": 0.32842758297920227, "learning_rate": 0.00043, "loss": 3.5191, "num_input_tokens_seen": 2254438400, "step": 2150 }, { "epoch": 0.04832529513438413, "grad_norm": 0.3068505525588989, "learning_rate": 0.00044, "loss": 3.5009, "num_input_tokens_seen": 2306867200, "step": 2200 }, { "epoch": 0.049423597296529224, "grad_norm": 0.2950410544872284, "learning_rate": 0.00045000000000000004, "loss": 3.4796, "num_input_tokens_seen": 2359296000, "step": 2250 }, { "epoch": 0.05052189945867432, "grad_norm": 0.29731425642967224, "learning_rate": 0.00046, "loss": 3.4583, "num_input_tokens_seen": 2411724800, "step": 2300 }, { "epoch": 0.051620201620819414, "grad_norm": 0.2702693045139313, "learning_rate": 0.00047, "loss": 3.4385, "num_input_tokens_seen": 2464153600, "step": 2350 }, { "epoch": 0.05271850378296451, "grad_norm": 0.2418452948331833, "learning_rate": 0.00048, "loss": 3.4244, "num_input_tokens_seen": 2516582400, "step": 2400 }, { "epoch": 0.053816805945109604, "grad_norm": 0.28668686747550964, "learning_rate": 0.00049, "loss": 3.3977, "num_input_tokens_seen": 2569011200, "step": 2450 }, { "epoch": 0.054915108107254695, "grad_norm": 0.3115544319152832, "learning_rate": 0.0005, "loss": 3.3881, "num_input_tokens_seen": 2621440000, "step": 2500 }, { "epoch": 0.054915108107254695, "eval_loss": 3.2789928913116455, "eval_runtime": 62.6749, "eval_samples_per_second": 79.777, "eval_steps_per_second": 19.944, "num_input_tokens_seen": 2621440000, "step": 2500 }, { "epoch": 0.056013410269399794, "grad_norm": 0.32340022921562195, "learning_rate": 0.00051, "loss": 3.3667, "num_input_tokens_seen": 2673868800, "step": 2550 }, { "epoch": 0.057111712431544885, "grad_norm": 0.2612442970275879, "learning_rate": 0.0005200000000000001, "loss": 3.3612, "num_input_tokens_seen": 2726297600, "step": 2600 }, { "epoch": 0.05821001459368998, "grad_norm": 0.29934820532798767, "learning_rate": 0.0005300000000000001, "loss": 3.3386, "num_input_tokens_seen": 2778726400, "step": 2650 }, { "epoch": 0.059308316755835075, "grad_norm": 0.2737022042274475, "learning_rate": 0.00054, "loss": 3.3274, "num_input_tokens_seen": 2831155200, "step": 2700 }, { "epoch": 0.060406618917980166, "grad_norm": 0.2101408988237381, "learning_rate": 0.00055, "loss": 3.3153, "num_input_tokens_seen": 2883584000, "step": 2750 }, { "epoch": 0.061504921080125265, "grad_norm": 0.3240911066532135, "learning_rate": 0.0005600000000000001, "loss": 3.2978, "num_input_tokens_seen": 2936012800, "step": 2800 }, { "epoch": 0.06260322324227036, "grad_norm": 0.20592735707759857, "learning_rate": 0.00057, "loss": 3.2984, "num_input_tokens_seen": 2988441600, "step": 2850 }, { "epoch": 0.06370152540441545, "grad_norm": 0.263443261384964, "learning_rate": 0.00058, "loss": 3.2706, "num_input_tokens_seen": 3040870400, "step": 2900 }, { "epoch": 0.06479982756656054, "grad_norm": 0.24249990284442902, "learning_rate": 0.00059, "loss": 3.2673, "num_input_tokens_seen": 3093299200, "step": 2950 }, { "epoch": 0.06589812972870564, "grad_norm": 0.25961214303970337, "learning_rate": 0.0006, "loss": 3.2512, "num_input_tokens_seen": 3145728000, "step": 3000 }, { "epoch": 0.06589812972870564, "eval_loss": 3.150442600250244, "eval_runtime": 65.9549, "eval_samples_per_second": 75.809, "eval_steps_per_second": 18.952, "num_input_tokens_seen": 3145728000, "step": 3000 }, { "epoch": 0.06699643189085074, "grad_norm": 0.21884848177433014, "learning_rate": 0.00061, "loss": 3.2437, "num_input_tokens_seen": 3198156800, "step": 3050 }, { "epoch": 0.06809473405299582, "grad_norm": 0.2534893751144409, "learning_rate": 0.00062, "loss": 3.2366, "num_input_tokens_seen": 3250585600, "step": 3100 }, { "epoch": 0.06919303621514092, "grad_norm": 0.2408875823020935, "learning_rate": 0.00063, "loss": 3.2264, "num_input_tokens_seen": 3303014400, "step": 3150 }, { "epoch": 0.07029133837728602, "grad_norm": 0.22240856289863586, "learning_rate": 0.00064, "loss": 3.2102, "num_input_tokens_seen": 3355443200, "step": 3200 }, { "epoch": 0.0713896405394311, "grad_norm": 0.21527299284934998, "learning_rate": 0.0006500000000000001, "loss": 3.1985, "num_input_tokens_seen": 3407872000, "step": 3250 }, { "epoch": 0.0724879427015762, "grad_norm": 0.26642242074012756, "learning_rate": 0.00066, "loss": 3.1923, "num_input_tokens_seen": 3460300800, "step": 3300 }, { "epoch": 0.0735862448637213, "grad_norm": 0.22164040803909302, "learning_rate": 0.00067, "loss": 3.1848, "num_input_tokens_seen": 3512729600, "step": 3350 }, { "epoch": 0.07468454702586638, "grad_norm": 0.21594341099262238, "learning_rate": 0.00068, "loss": 3.1764, "num_input_tokens_seen": 3565158400, "step": 3400 }, { "epoch": 0.07578284918801148, "grad_norm": 0.1921539604663849, "learning_rate": 0.00069, "loss": 3.1643, "num_input_tokens_seen": 3617587200, "step": 3450 }, { "epoch": 0.07688115135015658, "grad_norm": 0.2266080528497696, "learning_rate": 0.0007, "loss": 3.1647, "num_input_tokens_seen": 3670016000, "step": 3500 }, { "epoch": 0.07688115135015658, "eval_loss": 3.061373472213745, "eval_runtime": 63.388, "eval_samples_per_second": 78.879, "eval_steps_per_second": 19.72, "num_input_tokens_seen": 3670016000, "step": 3500 }, { "epoch": 0.07797945351230168, "grad_norm": 0.19900226593017578, "learning_rate": 0.00071, "loss": 3.1557, "num_input_tokens_seen": 3722444800, "step": 3550 }, { "epoch": 0.07907775567444676, "grad_norm": 0.20299012959003448, "learning_rate": 0.0007199999999999999, "loss": 3.1503, "num_input_tokens_seen": 3774873600, "step": 3600 }, { "epoch": 0.08017605783659186, "grad_norm": 0.232399120926857, "learning_rate": 0.00073, "loss": 3.1387, "num_input_tokens_seen": 3827302400, "step": 3650 }, { "epoch": 0.08127435999873696, "grad_norm": 0.2127719670534134, "learning_rate": 0.00074, "loss": 3.1388, "num_input_tokens_seen": 3879731200, "step": 3700 }, { "epoch": 0.08237266216088204, "grad_norm": 0.22336533665657043, "learning_rate": 0.00075, "loss": 3.1247, "num_input_tokens_seen": 3932160000, "step": 3750 }, { "epoch": 0.08347096432302714, "grad_norm": 0.18270662426948547, "learning_rate": 0.00076, "loss": 3.1192, "num_input_tokens_seen": 3984588800, "step": 3800 }, { "epoch": 0.08456926648517224, "grad_norm": 0.16843897104263306, "learning_rate": 0.0007700000000000001, "loss": 3.1153, "num_input_tokens_seen": 4037017600, "step": 3850 }, { "epoch": 0.08566756864731732, "grad_norm": 0.19947747886180878, "learning_rate": 0.0007800000000000001, "loss": 3.1048, "num_input_tokens_seen": 4089446400, "step": 3900 }, { "epoch": 0.08676587080946242, "grad_norm": 0.17078733444213867, "learning_rate": 0.00079, "loss": 3.1014, "num_input_tokens_seen": 4141875200, "step": 3950 }, { "epoch": 0.08786417297160752, "grad_norm": 0.22091113030910492, "learning_rate": 0.0008, "loss": 3.0982, "num_input_tokens_seen": 4194304000, "step": 4000 }, { "epoch": 0.08786417297160752, "eval_loss": 2.9978296756744385, "eval_runtime": 65.6064, "eval_samples_per_second": 76.212, "eval_steps_per_second": 19.053, "num_input_tokens_seen": 4194304000, "step": 4000 }, { "epoch": 0.0889624751337526, "grad_norm": 0.1839856207370758, "learning_rate": 0.0008100000000000001, "loss": 3.0862, "num_input_tokens_seen": 4246732800, "step": 4050 }, { "epoch": 0.0900607772958977, "grad_norm": 0.17331145703792572, "learning_rate": 0.00082, "loss": 3.087, "num_input_tokens_seen": 4299161600, "step": 4100 }, { "epoch": 0.0911590794580428, "grad_norm": 0.18384258449077606, "learning_rate": 0.00083, "loss": 3.076, "num_input_tokens_seen": 4351590400, "step": 4150 }, { "epoch": 0.09225738162018789, "grad_norm": 0.17061170935630798, "learning_rate": 0.00084, "loss": 3.0693, "num_input_tokens_seen": 4404019200, "step": 4200 }, { "epoch": 0.09335568378233298, "grad_norm": 0.18157647550106049, "learning_rate": 0.00085, "loss": 3.0698, "num_input_tokens_seen": 4456448000, "step": 4250 }, { "epoch": 0.09445398594447808, "grad_norm": 0.15678547322750092, "learning_rate": 0.00086, "loss": 3.064, "num_input_tokens_seen": 4508876800, "step": 4300 }, { "epoch": 0.09555228810662317, "grad_norm": 0.19118325412273407, "learning_rate": 0.00087, "loss": 3.0541, "num_input_tokens_seen": 4561305600, "step": 4350 }, { "epoch": 0.09665059026876827, "grad_norm": 0.17620691657066345, "learning_rate": 0.00088, "loss": 3.0532, "num_input_tokens_seen": 4613734400, "step": 4400 }, { "epoch": 0.09774889243091336, "grad_norm": 0.17351101338863373, "learning_rate": 0.0008900000000000001, "loss": 3.0549, "num_input_tokens_seen": 4666163200, "step": 4450 }, { "epoch": 0.09884719459305845, "grad_norm": 0.15183581411838531, "learning_rate": 0.0009000000000000001, "loss": 3.0485, "num_input_tokens_seen": 4718592000, "step": 4500 }, { "epoch": 0.09884719459305845, "eval_loss": 2.9479379653930664, "eval_runtime": 66.5611, "eval_samples_per_second": 75.119, "eval_steps_per_second": 18.78, "num_input_tokens_seen": 4718592000, "step": 4500 }, { "epoch": 0.09994549675520355, "grad_norm": 0.1681961864233017, "learning_rate": 0.00091, "loss": 3.0395, "num_input_tokens_seen": 4771020800, "step": 4550 }, { "epoch": 0.10104379891734865, "grad_norm": 0.17382557690143585, "learning_rate": 0.00092, "loss": 3.0371, "num_input_tokens_seen": 4823449600, "step": 4600 }, { "epoch": 0.10214210107949374, "grad_norm": 0.14377906918525696, "learning_rate": 0.00093, "loss": 3.0377, "num_input_tokens_seen": 4875878400, "step": 4650 }, { "epoch": 0.10324040324163883, "grad_norm": 0.1590214967727661, "learning_rate": 0.00094, "loss": 3.0305, "num_input_tokens_seen": 4928307200, "step": 4700 }, { "epoch": 0.10433870540378393, "grad_norm": 0.15563353896141052, "learning_rate": 0.00095, "loss": 3.0254, "num_input_tokens_seen": 4980736000, "step": 4750 }, { "epoch": 0.10543700756592903, "grad_norm": 0.16002103686332703, "learning_rate": 0.00096, "loss": 3.0222, "num_input_tokens_seen": 5033164800, "step": 4800 }, { "epoch": 0.10653530972807411, "grad_norm": 0.1406039148569107, "learning_rate": 0.0009699999999999999, "loss": 3.0185, "num_input_tokens_seen": 5085593600, "step": 4850 }, { "epoch": 0.10763361189021921, "grad_norm": 0.14609627425670624, "learning_rate": 0.00098, "loss": 3.0177, "num_input_tokens_seen": 5138022400, "step": 4900 }, { "epoch": 0.1087319140523643, "grad_norm": 0.16061657667160034, "learning_rate": 0.00099, "loss": 3.0137, "num_input_tokens_seen": 5190451200, "step": 4950 }, { "epoch": 0.10983021621450939, "grad_norm": 0.18423974514007568, "learning_rate": 0.001, "loss": 3.016, "num_input_tokens_seen": 5242880000, "step": 5000 }, { "epoch": 0.10983021621450939, "eval_loss": 2.9132862091064453, "eval_runtime": 65.7163, "eval_samples_per_second": 76.085, "eval_steps_per_second": 19.021, "num_input_tokens_seen": 5242880000, "step": 5000 }, { "epoch": 0.11092851837665449, "grad_norm": 0.15302155911922455, "learning_rate": 0.001, "loss": 3.0037, "num_input_tokens_seen": 5295308800, "step": 5050 }, { "epoch": 0.11202682053879959, "grad_norm": 0.1474563181400299, "learning_rate": 0.001, "loss": 3.0063, "num_input_tokens_seen": 5347737600, "step": 5100 }, { "epoch": 0.11312512270094467, "grad_norm": 0.14318443834781647, "learning_rate": 0.001, "loss": 3.0011, "num_input_tokens_seen": 5400166400, "step": 5150 }, { "epoch": 0.11422342486308977, "grad_norm": 0.1521013379096985, "learning_rate": 0.001, "loss": 2.9946, "num_input_tokens_seen": 5452595200, "step": 5200 }, { "epoch": 0.11532172702523487, "grad_norm": 0.14434175193309784, "learning_rate": 0.001, "loss": 2.9909, "num_input_tokens_seen": 5505024000, "step": 5250 }, { "epoch": 0.11642002918737995, "grad_norm": 0.16284991800785065, "learning_rate": 0.001, "loss": 2.9846, "num_input_tokens_seen": 5557452800, "step": 5300 }, { "epoch": 0.11751833134952505, "grad_norm": 0.15281164646148682, "learning_rate": 0.001, "loss": 2.9843, "num_input_tokens_seen": 5609881600, "step": 5350 }, { "epoch": 0.11861663351167015, "grad_norm": 0.1227719634771347, "learning_rate": 0.001, "loss": 2.9778, "num_input_tokens_seen": 5662310400, "step": 5400 }, { "epoch": 0.11971493567381523, "grad_norm": 0.1346055269241333, "learning_rate": 0.001, "loss": 2.9745, "num_input_tokens_seen": 5714739200, "step": 5450 }, { "epoch": 0.12081323783596033, "grad_norm": 0.15828204154968262, "learning_rate": 0.001, "loss": 2.9723, "num_input_tokens_seen": 5767168000, "step": 5500 }, { "epoch": 0.12081323783596033, "eval_loss": 2.8801000118255615, "eval_runtime": 65.3935, "eval_samples_per_second": 76.46, "eval_steps_per_second": 19.115, "num_input_tokens_seen": 5767168000, "step": 5500 }, { "epoch": 0.12191153999810543, "grad_norm": 0.1391400694847107, "learning_rate": 0.001, "loss": 2.9609, "num_input_tokens_seen": 5819596800, "step": 5550 }, { "epoch": 0.12300984216025053, "grad_norm": 0.14347107708454132, "learning_rate": 0.001, "loss": 2.9697, "num_input_tokens_seen": 5872025600, "step": 5600 }, { "epoch": 0.12410814432239561, "grad_norm": 0.13779127597808838, "learning_rate": 0.001, "loss": 2.9609, "num_input_tokens_seen": 5924454400, "step": 5650 }, { "epoch": 0.1252064464845407, "grad_norm": 0.13017955422401428, "learning_rate": 0.001, "loss": 2.9545, "num_input_tokens_seen": 5976883200, "step": 5700 }, { "epoch": 0.1263047486466858, "grad_norm": 0.12697578966617584, "learning_rate": 0.001, "loss": 2.9563, "num_input_tokens_seen": 6029312000, "step": 5750 }, { "epoch": 0.1274030508088309, "grad_norm": 0.15175020694732666, "learning_rate": 0.001, "loss": 2.9502, "num_input_tokens_seen": 6081740800, "step": 5800 }, { "epoch": 0.12850135297097598, "grad_norm": 0.1209852397441864, "learning_rate": 0.001, "loss": 2.9516, "num_input_tokens_seen": 6134169600, "step": 5850 }, { "epoch": 0.12959965513312108, "grad_norm": 0.16521666944026947, "learning_rate": 0.001, "loss": 2.9528, "num_input_tokens_seen": 6186598400, "step": 5900 }, { "epoch": 0.13069795729526618, "grad_norm": 0.12271756678819656, "learning_rate": 0.001, "loss": 2.9382, "num_input_tokens_seen": 6239027200, "step": 5950 }, { "epoch": 0.13179625945741127, "grad_norm": 0.1376461535692215, "learning_rate": 0.001, "loss": 2.9464, "num_input_tokens_seen": 6291456000, "step": 6000 }, { "epoch": 0.13179625945741127, "eval_loss": 2.84769606590271, "eval_runtime": 65.8814, "eval_samples_per_second": 75.894, "eval_steps_per_second": 18.973, "num_input_tokens_seen": 6291456000, "step": 6000 }, { "epoch": 0.13289456161955637, "grad_norm": 0.11629872024059296, "learning_rate": 0.001, "loss": 2.9406, "num_input_tokens_seen": 6343884800, "step": 6050 }, { "epoch": 0.13399286378170147, "grad_norm": 0.13740529119968414, "learning_rate": 0.001, "loss": 2.9343, "num_input_tokens_seen": 6396313600, "step": 6100 }, { "epoch": 0.13509116594384657, "grad_norm": 0.11548039317131042, "learning_rate": 0.001, "loss": 2.9374, "num_input_tokens_seen": 6448742400, "step": 6150 }, { "epoch": 0.13618946810599164, "grad_norm": 0.11710146814584732, "learning_rate": 0.001, "loss": 2.9376, "num_input_tokens_seen": 6501171200, "step": 6200 }, { "epoch": 0.13728777026813674, "grad_norm": 0.11223472654819489, "learning_rate": 0.001, "loss": 2.9284, "num_input_tokens_seen": 6553600000, "step": 6250 }, { "epoch": 0.13838607243028184, "grad_norm": 0.12880656123161316, "learning_rate": 0.001, "loss": 2.9303, "num_input_tokens_seen": 6606028800, "step": 6300 }, { "epoch": 0.13948437459242694, "grad_norm": 0.11898139119148254, "learning_rate": 0.001, "loss": 2.9246, "num_input_tokens_seen": 6658457600, "step": 6350 }, { "epoch": 0.14058267675457203, "grad_norm": 0.11154898256063461, "learning_rate": 0.001, "loss": 2.9254, "num_input_tokens_seen": 6710886400, "step": 6400 }, { "epoch": 0.14168097891671713, "grad_norm": 0.12669232487678528, "learning_rate": 0.001, "loss": 2.9162, "num_input_tokens_seen": 6763315200, "step": 6450 }, { "epoch": 0.1427792810788622, "grad_norm": 0.12259842455387115, "learning_rate": 0.001, "loss": 2.9179, "num_input_tokens_seen": 6815744000, "step": 6500 }, { "epoch": 0.1427792810788622, "eval_loss": 2.8220207691192627, "eval_runtime": 65.2868, "eval_samples_per_second": 76.585, "eval_steps_per_second": 19.146, "num_input_tokens_seen": 6815744000, "step": 6500 }, { "epoch": 0.1438775832410073, "grad_norm": 0.13403092324733734, "learning_rate": 0.001, "loss": 2.9102, "num_input_tokens_seen": 6868172800, "step": 6550 }, { "epoch": 0.1449758854031524, "grad_norm": 0.13063696026802063, "learning_rate": 0.001, "loss": 2.9112, "num_input_tokens_seen": 6920601600, "step": 6600 }, { "epoch": 0.1460741875652975, "grad_norm": 0.11871635168790817, "learning_rate": 0.001, "loss": 2.9085, "num_input_tokens_seen": 6973030400, "step": 6650 }, { "epoch": 0.1471724897274426, "grad_norm": 0.11007633060216904, "learning_rate": 0.001, "loss": 2.9098, "num_input_tokens_seen": 7025459200, "step": 6700 }, { "epoch": 0.1482707918895877, "grad_norm": 0.10521857440471649, "learning_rate": 0.001, "loss": 2.9086, "num_input_tokens_seen": 7077888000, "step": 6750 }, { "epoch": 0.14936909405173276, "grad_norm": 0.11179310083389282, "learning_rate": 0.001, "loss": 2.9066, "num_input_tokens_seen": 7130316800, "step": 6800 }, { "epoch": 0.15046739621387786, "grad_norm": 0.1192353144288063, "learning_rate": 0.001, "loss": 2.9135, "num_input_tokens_seen": 7182745600, "step": 6850 }, { "epoch": 0.15156569837602296, "grad_norm": 0.11084350198507309, "learning_rate": 0.001, "loss": 2.9054, "num_input_tokens_seen": 7235174400, "step": 6900 }, { "epoch": 0.15266400053816806, "grad_norm": 0.11826325207948685, "learning_rate": 0.001, "loss": 2.9054, "num_input_tokens_seen": 7287603200, "step": 6950 }, { "epoch": 0.15376230270031316, "grad_norm": 0.12597590684890747, "learning_rate": 0.001, "loss": 2.8945, "num_input_tokens_seen": 7340032000, "step": 7000 }, { "epoch": 0.15376230270031316, "eval_loss": 2.802734851837158, "eval_runtime": 65.3332, "eval_samples_per_second": 76.531, "eval_steps_per_second": 19.133, "num_input_tokens_seen": 7340032000, "step": 7000 }, { "epoch": 0.15486060486245826, "grad_norm": 0.11222469806671143, "learning_rate": 0.001, "loss": 2.8997, "num_input_tokens_seen": 7392460800, "step": 7050 }, { "epoch": 0.15595890702460335, "grad_norm": 0.11488104611635208, "learning_rate": 0.001, "loss": 2.8965, "num_input_tokens_seen": 7444889600, "step": 7100 }, { "epoch": 0.15705720918674843, "grad_norm": 0.1285555213689804, "learning_rate": 0.001, "loss": 2.8909, "num_input_tokens_seen": 7497318400, "step": 7150 }, { "epoch": 0.15815551134889352, "grad_norm": 0.12659265100955963, "learning_rate": 0.001, "loss": 2.8833, "num_input_tokens_seen": 7549747200, "step": 7200 }, { "epoch": 0.15925381351103862, "grad_norm": 0.10823842883110046, "learning_rate": 0.001, "loss": 2.9031, "num_input_tokens_seen": 7602176000, "step": 7250 }, { "epoch": 0.16035211567318372, "grad_norm": 0.12597811222076416, "learning_rate": 0.001, "loss": 2.8831, "num_input_tokens_seen": 7654604800, "step": 7300 }, { "epoch": 0.16145041783532882, "grad_norm": 0.1285410374403, "learning_rate": 0.001, "loss": 2.8931, "num_input_tokens_seen": 7707033600, "step": 7350 }, { "epoch": 0.16254871999747392, "grad_norm": 0.11170299351215363, "learning_rate": 0.001, "loss": 2.8861, "num_input_tokens_seen": 7759462400, "step": 7400 }, { "epoch": 0.163647022159619, "grad_norm": 0.11146055907011032, "learning_rate": 0.001, "loss": 2.8756, "num_input_tokens_seen": 7811891200, "step": 7450 }, { "epoch": 0.1647453243217641, "grad_norm": 0.10750412940979004, "learning_rate": 0.001, "loss": 2.8808, "num_input_tokens_seen": 7864320000, "step": 7500 }, { "epoch": 0.1647453243217641, "eval_loss": 2.785506248474121, "eval_runtime": 65.0661, "eval_samples_per_second": 76.845, "eval_steps_per_second": 19.211, "num_input_tokens_seen": 7864320000, "step": 7500 }, { "epoch": 0.16584362648390918, "grad_norm": 0.11221355944871902, "learning_rate": 0.001, "loss": 2.8834, "num_input_tokens_seen": 7916748800, "step": 7550 }, { "epoch": 0.16694192864605428, "grad_norm": 0.1089220717549324, "learning_rate": 0.001, "loss": 2.8796, "num_input_tokens_seen": 7969177600, "step": 7600 }, { "epoch": 0.16804023080819938, "grad_norm": 0.11125486344099045, "learning_rate": 0.001, "loss": 2.8836, "num_input_tokens_seen": 8021606400, "step": 7650 }, { "epoch": 0.16913853297034448, "grad_norm": 0.12804660201072693, "learning_rate": 0.001, "loss": 2.8754, "num_input_tokens_seen": 8074035200, "step": 7700 }, { "epoch": 0.17023683513248955, "grad_norm": 0.11395713686943054, "learning_rate": 0.001, "loss": 2.8736, "num_input_tokens_seen": 8126464000, "step": 7750 }, { "epoch": 0.17133513729463465, "grad_norm": 0.1095738559961319, "learning_rate": 0.001, "loss": 2.8743, "num_input_tokens_seen": 8178892800, "step": 7800 }, { "epoch": 0.17243343945677975, "grad_norm": 0.10545111447572708, "learning_rate": 0.001, "loss": 2.8718, "num_input_tokens_seen": 8231321600, "step": 7850 }, { "epoch": 0.17353174161892485, "grad_norm": 0.13135021924972534, "learning_rate": 0.001, "loss": 2.8648, "num_input_tokens_seen": 8283750400, "step": 7900 }, { "epoch": 0.17463004378106994, "grad_norm": 0.12348899990320206, "learning_rate": 0.001, "loss": 2.8628, "num_input_tokens_seen": 8336179200, "step": 7950 }, { "epoch": 0.17572834594321504, "grad_norm": 0.10604492574930191, "learning_rate": 0.001, "loss": 2.8676, "num_input_tokens_seen": 8388608000, "step": 8000 }, { "epoch": 0.17572834594321504, "eval_loss": 2.7698919773101807, "eval_runtime": 65.5096, "eval_samples_per_second": 76.325, "eval_steps_per_second": 19.081, "num_input_tokens_seen": 8388608000, "step": 8000 }, { "epoch": 0.17682664810536014, "grad_norm": 0.12299258261919022, "learning_rate": 0.001, "loss": 2.8626, "num_input_tokens_seen": 8441036800, "step": 8050 }, { "epoch": 0.1779249502675052, "grad_norm": 0.11638012528419495, "learning_rate": 0.001, "loss": 2.864, "num_input_tokens_seen": 8493465600, "step": 8100 }, { "epoch": 0.1790232524296503, "grad_norm": 0.10978250205516815, "learning_rate": 0.001, "loss": 2.8589, "num_input_tokens_seen": 8545894400, "step": 8150 }, { "epoch": 0.1801215545917954, "grad_norm": 0.11229872703552246, "learning_rate": 0.001, "loss": 2.8671, "num_input_tokens_seen": 8598323200, "step": 8200 }, { "epoch": 0.1812198567539405, "grad_norm": 0.13177119195461273, "learning_rate": 0.001, "loss": 2.8524, "num_input_tokens_seen": 8650752000, "step": 8250 }, { "epoch": 0.1823181589160856, "grad_norm": 0.11021032929420471, "learning_rate": 0.001, "loss": 2.8552, "num_input_tokens_seen": 8703180800, "step": 8300 }, { "epoch": 0.1834164610782307, "grad_norm": 0.11381058394908905, "learning_rate": 0.001, "loss": 2.8529, "num_input_tokens_seen": 8755609600, "step": 8350 }, { "epoch": 0.18451476324037577, "grad_norm": 0.10889217257499695, "learning_rate": 0.001, "loss": 2.8581, "num_input_tokens_seen": 8808038400, "step": 8400 }, { "epoch": 0.18561306540252087, "grad_norm": 0.13519708812236786, "learning_rate": 0.001, "loss": 2.8518, "num_input_tokens_seen": 8860467200, "step": 8450 }, { "epoch": 0.18671136756466597, "grad_norm": 0.1265636533498764, "learning_rate": 0.001, "loss": 2.8452, "num_input_tokens_seen": 8912896000, "step": 8500 }, { "epoch": 0.18671136756466597, "eval_loss": 2.754452705383301, "eval_runtime": 65.4439, "eval_samples_per_second": 76.401, "eval_steps_per_second": 19.1, "num_input_tokens_seen": 8912896000, "step": 8500 }, { "epoch": 0.18780966972681107, "grad_norm": 0.12250006198883057, "learning_rate": 0.001, "loss": 2.8506, "num_input_tokens_seen": 8965324800, "step": 8550 }, { "epoch": 0.18890797188895617, "grad_norm": 0.1371607929468155, "learning_rate": 0.001, "loss": 2.8472, "num_input_tokens_seen": 9017753600, "step": 8600 }, { "epoch": 0.19000627405110126, "grad_norm": 0.11844755709171295, "learning_rate": 0.001, "loss": 2.8492, "num_input_tokens_seen": 9070182400, "step": 8650 }, { "epoch": 0.19110457621324634, "grad_norm": 0.38294216990470886, "learning_rate": 0.001, "loss": 6.3226, "num_input_tokens_seen": 9122611200, "step": 8700 }, { "epoch": 0.19220287837539143, "grad_norm": 0.44077590107917786, "learning_rate": 0.001, "loss": 6.7001, "num_input_tokens_seen": 9175040000, "step": 8750 }, { "epoch": 0.19330118053753653, "grad_norm": 0.4238772392272949, "learning_rate": 0.001, "loss": 5.8714, "num_input_tokens_seen": 9227468800, "step": 8800 }, { "epoch": 0.19439948269968163, "grad_norm": 0.2830688953399658, "learning_rate": 0.001, "loss": 4.8951, "num_input_tokens_seen": 9279897600, "step": 8850 }, { "epoch": 0.19549778486182673, "grad_norm": 0.2485039383172989, "learning_rate": 0.001, "loss": 3.928, "num_input_tokens_seen": 9332326400, "step": 8900 }, { "epoch": 0.19659608702397183, "grad_norm": 0.20515842735767365, "learning_rate": 0.001, "loss": 3.4277, "num_input_tokens_seen": 9384755200, "step": 8950 }, { "epoch": 0.1976943891861169, "grad_norm": 0.13605651259422302, "learning_rate": 0.001, "loss": 3.2263, "num_input_tokens_seen": 9437184000, "step": 9000 }, { "epoch": 0.1976943891861169, "eval_loss": 3.014314889907837, "eval_runtime": 65.8851, "eval_samples_per_second": 75.89, "eval_steps_per_second": 18.972, "num_input_tokens_seen": 9437184000, "step": 9000 }, { "epoch": 0.198792691348262, "grad_norm": 0.17666102945804596, "learning_rate": 0.001, "loss": 3.0728, "num_input_tokens_seen": 9489612800, "step": 9050 }, { "epoch": 0.1998909935104071, "grad_norm": 0.202484592795372, "learning_rate": 0.001, "loss": 2.9818, "num_input_tokens_seen": 9542041600, "step": 9100 }, { "epoch": 0.2009892956725522, "grad_norm": 0.15095236897468567, "learning_rate": 0.001, "loss": 2.9423, "num_input_tokens_seen": 9594470400, "step": 9150 }, { "epoch": 0.2020875978346973, "grad_norm": 0.13089850544929504, "learning_rate": 0.001, "loss": 2.9227, "num_input_tokens_seen": 9646899200, "step": 9200 }, { "epoch": 0.2031858999968424, "grad_norm": 0.14022304117679596, "learning_rate": 0.001, "loss": 2.8988, "num_input_tokens_seen": 9699328000, "step": 9250 }, { "epoch": 0.2042842021589875, "grad_norm": 0.13116785883903503, "learning_rate": 0.001, "loss": 2.8716, "num_input_tokens_seen": 9751756800, "step": 9300 }, { "epoch": 0.20538250432113256, "grad_norm": 0.1395471841096878, "learning_rate": 0.001, "loss": 2.8727, "num_input_tokens_seen": 9804185600, "step": 9350 }, { "epoch": 0.20648080648327766, "grad_norm": 0.1271878033876419, "learning_rate": 0.001, "loss": 2.864, "num_input_tokens_seen": 9856614400, "step": 9400 }, { "epoch": 0.20757910864542276, "grad_norm": 0.14148685336112976, "learning_rate": 0.001, "loss": 2.8604, "num_input_tokens_seen": 9909043200, "step": 9450 }, { "epoch": 0.20867741080756785, "grad_norm": 0.1292584091424942, "learning_rate": 0.001, "loss": 2.8547, "num_input_tokens_seen": 9961472000, "step": 9500 }, { "epoch": 0.20867741080756785, "eval_loss": 2.756131649017334, "eval_runtime": 65.0495, "eval_samples_per_second": 76.865, "eval_steps_per_second": 19.216, "num_input_tokens_seen": 9961472000, "step": 9500 }, { "epoch": 0.20977571296971295, "grad_norm": 0.10929372161626816, "learning_rate": 0.001, "loss": 2.8467, "num_input_tokens_seen": 10013900800, "step": 9550 }, { "epoch": 0.21087401513185805, "grad_norm": 0.1180899515748024, "learning_rate": 0.001, "loss": 2.8501, "num_input_tokens_seen": 10066329600, "step": 9600 }, { "epoch": 0.21197231729400312, "grad_norm": 0.12041448056697845, "learning_rate": 0.001, "loss": 2.8438, "num_input_tokens_seen": 10118758400, "step": 9650 }, { "epoch": 0.21307061945614822, "grad_norm": 0.13195224106311798, "learning_rate": 0.001, "loss": 2.8341, "num_input_tokens_seen": 10171187200, "step": 9700 }, { "epoch": 0.21416892161829332, "grad_norm": 0.11887054890394211, "learning_rate": 0.001, "loss": 2.8349, "num_input_tokens_seen": 10223616000, "step": 9750 }, { "epoch": 0.21526722378043842, "grad_norm": 0.1044996827840805, "learning_rate": 0.001, "loss": 2.8428, "num_input_tokens_seen": 10276044800, "step": 9800 }, { "epoch": 0.21636552594258351, "grad_norm": 0.11951665580272675, "learning_rate": 0.001, "loss": 2.8323, "num_input_tokens_seen": 10328473600, "step": 9850 }, { "epoch": 0.2174638281047286, "grad_norm": 0.11673793941736221, "learning_rate": 0.001, "loss": 2.8271, "num_input_tokens_seen": 10380902400, "step": 9900 }, { "epoch": 0.21856213026687368, "grad_norm": 0.1178969219326973, "learning_rate": 0.001, "loss": 2.8328, "num_input_tokens_seen": 10433331200, "step": 9950 }, { "epoch": 0.21966043242901878, "grad_norm": 0.11995361745357513, "learning_rate": 0.001, "loss": 2.8182, "num_input_tokens_seen": 10485760000, "step": 10000 }, { "epoch": 0.21966043242901878, "eval_loss": 2.732673168182373, "eval_runtime": 66.3377, "eval_samples_per_second": 75.372, "eval_steps_per_second": 18.843, "num_input_tokens_seen": 10485760000, "step": 10000 }, { "epoch": 0.22075873459116388, "grad_norm": 0.13463908433914185, "learning_rate": 0.001, "loss": 2.8242, "num_input_tokens_seen": 10538188800, "step": 10050 }, { "epoch": 0.22185703675330898, "grad_norm": 0.11778156459331512, "learning_rate": 0.001, "loss": 2.8234, "num_input_tokens_seen": 10590617600, "step": 10100 }, { "epoch": 0.22295533891545408, "grad_norm": 0.11393869668245316, "learning_rate": 0.001, "loss": 2.8204, "num_input_tokens_seen": 10643046400, "step": 10150 }, { "epoch": 0.22405364107759917, "grad_norm": 0.12454303354024887, "learning_rate": 0.001, "loss": 2.8185, "num_input_tokens_seen": 10695475200, "step": 10200 }, { "epoch": 0.22515194323974427, "grad_norm": 0.1148439347743988, "learning_rate": 0.001, "loss": 2.8219, "num_input_tokens_seen": 10747904000, "step": 10250 }, { "epoch": 0.22625024540188934, "grad_norm": 0.13888292014598846, "learning_rate": 0.001, "loss": 2.8157, "num_input_tokens_seen": 10800332800, "step": 10300 }, { "epoch": 0.22734854756403444, "grad_norm": 0.12242749333381653, "learning_rate": 0.001, "loss": 2.8165, "num_input_tokens_seen": 10852761600, "step": 10350 }, { "epoch": 0.22844684972617954, "grad_norm": 0.13651017844676971, "learning_rate": 0.001, "loss": 2.8165, "num_input_tokens_seen": 10905190400, "step": 10400 }, { "epoch": 0.22954515188832464, "grad_norm": 0.12349703162908554, "learning_rate": 0.001, "loss": 2.8126, "num_input_tokens_seen": 10957619200, "step": 10450 }, { "epoch": 0.23064345405046974, "grad_norm": 0.13448943197727203, "learning_rate": 0.001, "loss": 2.8162, "num_input_tokens_seen": 11010048000, "step": 10500 }, { "epoch": 0.23064345405046974, "eval_loss": 2.720102071762085, "eval_runtime": 65.0663, "eval_samples_per_second": 76.845, "eval_steps_per_second": 19.211, "num_input_tokens_seen": 11010048000, "step": 10500 }, { "epoch": 0.23174175621261484, "grad_norm": 0.1171165183186531, "learning_rate": 0.001, "loss": 2.817, "num_input_tokens_seen": 11062476800, "step": 10550 }, { "epoch": 0.2328400583747599, "grad_norm": 0.1417781263589859, "learning_rate": 0.001, "loss": 2.8159, "num_input_tokens_seen": 11114905600, "step": 10600 }, { "epoch": 0.233938360536905, "grad_norm": 0.13051685690879822, "learning_rate": 0.001, "loss": 2.8062, "num_input_tokens_seen": 11167334400, "step": 10650 }, { "epoch": 0.2350366626990501, "grad_norm": 0.12536808848381042, "learning_rate": 0.001, "loss": 2.8166, "num_input_tokens_seen": 11219763200, "step": 10700 }, { "epoch": 0.2361349648611952, "grad_norm": 0.11859289556741714, "learning_rate": 0.001, "loss": 2.8075, "num_input_tokens_seen": 11272192000, "step": 10750 }, { "epoch": 0.2372332670233403, "grad_norm": 0.14844287931919098, "learning_rate": 0.001, "loss": 2.8139, "num_input_tokens_seen": 11324620800, "step": 10800 }, { "epoch": 0.2383315691854854, "grad_norm": 0.12877844274044037, "learning_rate": 0.001, "loss": 2.8031, "num_input_tokens_seen": 11377049600, "step": 10850 }, { "epoch": 0.23942987134763047, "grad_norm": 0.13911722600460052, "learning_rate": 0.001, "loss": 2.7992, "num_input_tokens_seen": 11429478400, "step": 10900 }, { "epoch": 0.24052817350977557, "grad_norm": 0.156200110912323, "learning_rate": 0.001, "loss": 2.8059, "num_input_tokens_seen": 11481907200, "step": 10950 }, { "epoch": 0.24162647567192067, "grad_norm": 0.12990960478782654, "learning_rate": 0.001, "loss": 2.7984, "num_input_tokens_seen": 11534336000, "step": 11000 }, { "epoch": 0.24162647567192067, "eval_loss": 2.7103493213653564, "eval_runtime": 65.6611, "eval_samples_per_second": 76.149, "eval_steps_per_second": 19.037, "num_input_tokens_seen": 11534336000, "step": 11000 }, { "epoch": 0.24272477783406576, "grad_norm": 0.1190350204706192, "learning_rate": 0.001, "loss": 2.7994, "num_input_tokens_seen": 11586764800, "step": 11050 }, { "epoch": 0.24382307999621086, "grad_norm": 0.12825961410999298, "learning_rate": 0.001, "loss": 2.7992, "num_input_tokens_seen": 11639193600, "step": 11100 }, { "epoch": 0.24492138215835596, "grad_norm": 0.12561525404453278, "learning_rate": 0.001, "loss": 2.8009, "num_input_tokens_seen": 11691622400, "step": 11150 }, { "epoch": 0.24601968432050106, "grad_norm": 0.12596049904823303, "learning_rate": 0.001, "loss": 2.8002, "num_input_tokens_seen": 11744051200, "step": 11200 }, { "epoch": 0.24711798648264613, "grad_norm": 0.1415141373872757, "learning_rate": 0.001, "loss": 2.8004, "num_input_tokens_seen": 11796480000, "step": 11250 }, { "epoch": 0.24821628864479123, "grad_norm": 0.1359766125679016, "learning_rate": 0.001, "loss": 2.7988, "num_input_tokens_seen": 11848908800, "step": 11300 }, { "epoch": 0.24931459080693633, "grad_norm": 0.13459013402462006, "learning_rate": 0.001, "loss": 2.7991, "num_input_tokens_seen": 11901337600, "step": 11350 }, { "epoch": 0.2504128929690814, "grad_norm": 0.1344253420829773, "learning_rate": 0.001, "loss": 2.805, "num_input_tokens_seen": 11953766400, "step": 11400 }, { "epoch": 0.2515111951312265, "grad_norm": 0.13629016280174255, "learning_rate": 0.001, "loss": 2.7954, "num_input_tokens_seen": 12006195200, "step": 11450 }, { "epoch": 0.2526094972933716, "grad_norm": 0.12940892577171326, "learning_rate": 0.001, "loss": 2.8009, "num_input_tokens_seen": 12058624000, "step": 11500 }, { "epoch": 0.2526094972933716, "eval_loss": 2.7012581825256348, "eval_runtime": 65.7039, "eval_samples_per_second": 76.099, "eval_steps_per_second": 19.025, "num_input_tokens_seen": 12058624000, "step": 11500 }, { "epoch": 0.2537077994555167, "grad_norm": 0.15021966397762299, "learning_rate": 0.001, "loss": 2.7963, "num_input_tokens_seen": 12111052800, "step": 11550 }, { "epoch": 0.2548061016176618, "grad_norm": 0.12381847202777863, "learning_rate": 0.001, "loss": 2.7954, "num_input_tokens_seen": 12163481600, "step": 11600 }, { "epoch": 0.2559044037798069, "grad_norm": 0.14849607646465302, "learning_rate": 0.001, "loss": 2.7837, "num_input_tokens_seen": 12215910400, "step": 11650 }, { "epoch": 0.25700270594195196, "grad_norm": 0.1286240816116333, "learning_rate": 0.001, "loss": 2.7999, "num_input_tokens_seen": 12268339200, "step": 11700 }, { "epoch": 0.2581010081040971, "grad_norm": 0.11861539632081985, "learning_rate": 0.001, "loss": 2.7979, "num_input_tokens_seen": 12320768000, "step": 11750 }, { "epoch": 0.25919931026624216, "grad_norm": 0.11512617021799088, "learning_rate": 0.001, "loss": 2.7926, "num_input_tokens_seen": 12373196800, "step": 11800 }, { "epoch": 0.2602976124283873, "grad_norm": 0.13469178974628448, "learning_rate": 0.001, "loss": 2.7881, "num_input_tokens_seen": 12425625600, "step": 11850 }, { "epoch": 0.26139591459053235, "grad_norm": 0.15504290163516998, "learning_rate": 0.001, "loss": 2.7917, "num_input_tokens_seen": 12478054400, "step": 11900 }, { "epoch": 0.2624942167526775, "grad_norm": 0.1363905370235443, "learning_rate": 0.001, "loss": 2.7869, "num_input_tokens_seen": 12530483200, "step": 11950 }, { "epoch": 0.26359251891482255, "grad_norm": 0.11095720529556274, "learning_rate": 0.001, "loss": 2.7883, "num_input_tokens_seen": 12582912000, "step": 12000 }, { "epoch": 0.26359251891482255, "eval_loss": 2.6911227703094482, "eval_runtime": 65.4928, "eval_samples_per_second": 76.344, "eval_steps_per_second": 19.086, "num_input_tokens_seen": 12582912000, "step": 12000 }, { "epoch": 0.2646908210769676, "grad_norm": 0.1443321257829666, "learning_rate": 0.001, "loss": 2.7866, "num_input_tokens_seen": 12635340800, "step": 12050 }, { "epoch": 0.26578912323911275, "grad_norm": 0.12249191850423813, "learning_rate": 0.001, "loss": 2.8, "num_input_tokens_seen": 12687769600, "step": 12100 }, { "epoch": 0.2668874254012578, "grad_norm": 0.1505623608827591, "learning_rate": 0.001, "loss": 2.7934, "num_input_tokens_seen": 12740198400, "step": 12150 }, { "epoch": 0.26798572756340294, "grad_norm": 0.17367833852767944, "learning_rate": 0.001, "loss": 2.7905, "num_input_tokens_seen": 12792627200, "step": 12200 }, { "epoch": 0.269084029725548, "grad_norm": 0.12189670652151108, "learning_rate": 0.001, "loss": 2.7878, "num_input_tokens_seen": 12845056000, "step": 12250 }, { "epoch": 0.27018233188769314, "grad_norm": 0.12834201753139496, "learning_rate": 0.001, "loss": 2.7822, "num_input_tokens_seen": 12897484800, "step": 12300 }, { "epoch": 0.2712806340498382, "grad_norm": 0.1277332305908203, "learning_rate": 0.001, "loss": 2.7846, "num_input_tokens_seen": 12949913600, "step": 12350 }, { "epoch": 0.2723789362119833, "grad_norm": 0.14190761744976044, "learning_rate": 0.001, "loss": 2.7845, "num_input_tokens_seen": 13002342400, "step": 12400 }, { "epoch": 0.2734772383741284, "grad_norm": 0.14843693375587463, "learning_rate": 0.001, "loss": 2.7847, "num_input_tokens_seen": 13054771200, "step": 12450 }, { "epoch": 0.2745755405362735, "grad_norm": 0.14427120983600616, "learning_rate": 0.001, "loss": 2.78, "num_input_tokens_seen": 13107200000, "step": 12500 }, { "epoch": 0.2745755405362735, "eval_loss": 2.6847124099731445, "eval_runtime": 65.0448, "eval_samples_per_second": 76.87, "eval_steps_per_second": 19.218, "num_input_tokens_seen": 13107200000, "step": 12500 }, { "epoch": 0.2756738426984186, "grad_norm": 0.14408434927463531, "learning_rate": 0.001, "loss": 2.7794, "num_input_tokens_seen": 13159628800, "step": 12550 }, { "epoch": 0.2767721448605637, "grad_norm": 0.1557396501302719, "learning_rate": 0.001, "loss": 2.7754, "num_input_tokens_seen": 13212057600, "step": 12600 }, { "epoch": 0.27787044702270874, "grad_norm": 0.11494632810354233, "learning_rate": 0.001, "loss": 2.7839, "num_input_tokens_seen": 13264486400, "step": 12650 }, { "epoch": 0.27896874918485387, "grad_norm": 0.12402207404375076, "learning_rate": 0.001, "loss": 2.7773, "num_input_tokens_seen": 13316915200, "step": 12700 }, { "epoch": 0.28006705134699894, "grad_norm": 0.1308801770210266, "learning_rate": 0.001, "loss": 2.7864, "num_input_tokens_seen": 13369344000, "step": 12750 }, { "epoch": 0.28116535350914407, "grad_norm": 0.13596223294734955, "learning_rate": 0.001, "loss": 2.7763, "num_input_tokens_seen": 13421772800, "step": 12800 }, { "epoch": 0.28226365567128914, "grad_norm": 0.13256165385246277, "learning_rate": 0.001, "loss": 2.7762, "num_input_tokens_seen": 13474201600, "step": 12850 }, { "epoch": 0.28336195783343426, "grad_norm": 0.12955094873905182, "learning_rate": 0.001, "loss": 2.7823, "num_input_tokens_seen": 13526630400, "step": 12900 }, { "epoch": 0.28446025999557933, "grad_norm": 0.13506431877613068, "learning_rate": 0.001, "loss": 2.774, "num_input_tokens_seen": 13579059200, "step": 12950 }, { "epoch": 0.2855585621577244, "grad_norm": 0.14323291182518005, "learning_rate": 0.001, "loss": 2.7755, "num_input_tokens_seen": 13631488000, "step": 13000 }, { "epoch": 0.2855585621577244, "eval_loss": 2.6779518127441406, "eval_runtime": 66.0334, "eval_samples_per_second": 75.719, "eval_steps_per_second": 18.93, "num_input_tokens_seen": 13631488000, "step": 13000 }, { "epoch": 0.28665686431986953, "grad_norm": 0.13635839521884918, "learning_rate": 0.001, "loss": 2.7705, "num_input_tokens_seen": 13683916800, "step": 13050 }, { "epoch": 0.2877551664820146, "grad_norm": 0.1449163854122162, "learning_rate": 0.001, "loss": 2.775, "num_input_tokens_seen": 13736345600, "step": 13100 }, { "epoch": 0.2888534686441597, "grad_norm": 0.1385536640882492, "learning_rate": 0.001, "loss": 2.7705, "num_input_tokens_seen": 13788774400, "step": 13150 }, { "epoch": 0.2899517708063048, "grad_norm": 0.14647842943668365, "learning_rate": 0.001, "loss": 2.7709, "num_input_tokens_seen": 13841203200, "step": 13200 }, { "epoch": 0.2910500729684499, "grad_norm": 0.14193060994148254, "learning_rate": 0.001, "loss": 2.7753, "num_input_tokens_seen": 13893632000, "step": 13250 }, { "epoch": 0.292148375130595, "grad_norm": 0.15065765380859375, "learning_rate": 0.001, "loss": 2.7725, "num_input_tokens_seen": 13946060800, "step": 13300 }, { "epoch": 0.29324667729274007, "grad_norm": 0.1726570725440979, "learning_rate": 0.001, "loss": 2.7677, "num_input_tokens_seen": 13998489600, "step": 13350 }, { "epoch": 0.2943449794548852, "grad_norm": 0.13577735424041748, "learning_rate": 0.001, "loss": 2.7661, "num_input_tokens_seen": 14050918400, "step": 13400 }, { "epoch": 0.29544328161703026, "grad_norm": 0.1286347657442093, "learning_rate": 0.001, "loss": 2.7642, "num_input_tokens_seen": 14103347200, "step": 13450 }, { "epoch": 0.2965415837791754, "grad_norm": 0.12374001741409302, "learning_rate": 0.001, "loss": 2.7651, "num_input_tokens_seen": 14155776000, "step": 13500 }, { "epoch": 0.2965415837791754, "eval_loss": 2.6711983680725098, "eval_runtime": 65.6737, "eval_samples_per_second": 76.134, "eval_steps_per_second": 19.033, "num_input_tokens_seen": 14155776000, "step": 13500 }, { "epoch": 0.29763988594132046, "grad_norm": 0.1733749508857727, "learning_rate": 0.001, "loss": 2.765, "num_input_tokens_seen": 14208204800, "step": 13550 }, { "epoch": 0.29873818810346553, "grad_norm": 0.1459003984928131, "learning_rate": 0.001, "loss": 2.7683, "num_input_tokens_seen": 14260633600, "step": 13600 }, { "epoch": 0.29983649026561066, "grad_norm": 0.1527784913778305, "learning_rate": 0.001, "loss": 2.7678, "num_input_tokens_seen": 14313062400, "step": 13650 }, { "epoch": 0.3009347924277557, "grad_norm": 0.1344996690750122, "learning_rate": 0.001, "loss": 2.7613, "num_input_tokens_seen": 14365491200, "step": 13700 }, { "epoch": 0.30203309458990085, "grad_norm": 0.1291748583316803, "learning_rate": 0.001, "loss": 2.7682, "num_input_tokens_seen": 14417920000, "step": 13750 }, { "epoch": 0.3031313967520459, "grad_norm": 0.1352360099554062, "learning_rate": 0.001, "loss": 2.764, "num_input_tokens_seen": 14470348800, "step": 13800 }, { "epoch": 0.30422969891419105, "grad_norm": 0.13686618208885193, "learning_rate": 0.001, "loss": 2.7638, "num_input_tokens_seen": 14522777600, "step": 13850 }, { "epoch": 0.3053280010763361, "grad_norm": 0.15377116203308105, "learning_rate": 0.001, "loss": 2.7639, "num_input_tokens_seen": 14575206400, "step": 13900 }, { "epoch": 0.3064263032384812, "grad_norm": 0.13904446363449097, "learning_rate": 0.001, "loss": 2.7666, "num_input_tokens_seen": 14627635200, "step": 13950 }, { "epoch": 0.3075246054006263, "grad_norm": 0.12402611970901489, "learning_rate": 0.001, "loss": 2.759, "num_input_tokens_seen": 14680064000, "step": 14000 }, { "epoch": 0.3075246054006263, "eval_loss": 2.6654388904571533, "eval_runtime": 65.2775, "eval_samples_per_second": 76.596, "eval_steps_per_second": 19.149, "num_input_tokens_seen": 14680064000, "step": 14000 }, { "epoch": 0.3086229075627714, "grad_norm": 0.13326038420200348, "learning_rate": 0.001, "loss": 2.7622, "num_input_tokens_seen": 14732492800, "step": 14050 }, { "epoch": 0.3097212097249165, "grad_norm": 0.14305976033210754, "learning_rate": 0.001, "loss": 2.7597, "num_input_tokens_seen": 14784921600, "step": 14100 }, { "epoch": 0.3108195118870616, "grad_norm": 0.1182415783405304, "learning_rate": 0.001, "loss": 2.758, "num_input_tokens_seen": 14837350400, "step": 14150 }, { "epoch": 0.3119178140492067, "grad_norm": 0.12919387221336365, "learning_rate": 0.001, "loss": 2.759, "num_input_tokens_seen": 14889779200, "step": 14200 }, { "epoch": 0.3130161162113518, "grad_norm": 0.1420537382364273, "learning_rate": 0.001, "loss": 2.7519, "num_input_tokens_seen": 14942208000, "step": 14250 }, { "epoch": 0.31411441837349685, "grad_norm": 0.14349806308746338, "learning_rate": 0.001, "loss": 2.7653, "num_input_tokens_seen": 14994636800, "step": 14300 }, { "epoch": 0.315212720535642, "grad_norm": 0.16453324258327484, "learning_rate": 0.001, "loss": 2.7642, "num_input_tokens_seen": 15047065600, "step": 14350 }, { "epoch": 0.31631102269778705, "grad_norm": 0.11806487292051315, "learning_rate": 0.001, "loss": 2.7605, "num_input_tokens_seen": 15099494400, "step": 14400 }, { "epoch": 0.3174093248599322, "grad_norm": 0.12850746512413025, "learning_rate": 0.001, "loss": 2.7539, "num_input_tokens_seen": 15151923200, "step": 14450 }, { "epoch": 0.31850762702207724, "grad_norm": 0.1480904221534729, "learning_rate": 0.001, "loss": 2.7574, "num_input_tokens_seen": 15204352000, "step": 14500 }, { "epoch": 0.31850762702207724, "eval_loss": 2.6607398986816406, "eval_runtime": 65.6281, "eval_samples_per_second": 76.187, "eval_steps_per_second": 19.047, "num_input_tokens_seen": 15204352000, "step": 14500 }, { "epoch": 0.3196059291842223, "grad_norm": 0.13606210052967072, "learning_rate": 0.001, "loss": 2.763, "num_input_tokens_seen": 15256780800, "step": 14550 }, { "epoch": 0.32070423134636744, "grad_norm": 0.12546846270561218, "learning_rate": 0.001, "loss": 2.7556, "num_input_tokens_seen": 15309209600, "step": 14600 }, { "epoch": 0.3218025335085125, "grad_norm": 0.1267230361700058, "learning_rate": 0.001, "loss": 2.7617, "num_input_tokens_seen": 15361638400, "step": 14650 }, { "epoch": 0.32290083567065764, "grad_norm": 0.13812699913978577, "learning_rate": 0.001, "loss": 2.7533, "num_input_tokens_seen": 15414067200, "step": 14700 }, { "epoch": 0.3239991378328027, "grad_norm": 0.12577973306179047, "learning_rate": 0.001, "loss": 2.7519, "num_input_tokens_seen": 15466496000, "step": 14750 }, { "epoch": 0.32509743999494783, "grad_norm": 0.14296036958694458, "learning_rate": 0.001, "loss": 2.7479, "num_input_tokens_seen": 15518924800, "step": 14800 }, { "epoch": 0.3261957421570929, "grad_norm": 0.12737593054771423, "learning_rate": 0.001, "loss": 2.7546, "num_input_tokens_seen": 15571353600, "step": 14850 }, { "epoch": 0.327294044319238, "grad_norm": 0.1349722445011139, "learning_rate": 0.001, "loss": 2.7477, "num_input_tokens_seen": 15623782400, "step": 14900 }, { "epoch": 0.3283923464813831, "grad_norm": 0.12827487289905548, "learning_rate": 0.001, "loss": 2.7492, "num_input_tokens_seen": 15676211200, "step": 14950 }, { "epoch": 0.3294906486435282, "grad_norm": 0.13282813131809235, "learning_rate": 0.001, "loss": 2.7466, "num_input_tokens_seen": 15728640000, "step": 15000 }, { "epoch": 0.3294906486435282, "eval_loss": 2.6524744033813477, "eval_runtime": 65.8996, "eval_samples_per_second": 75.873, "eval_steps_per_second": 18.968, "num_input_tokens_seen": 15728640000, "step": 15000 }, { "epoch": 0.3305889508056733, "grad_norm": 0.11965218186378479, "learning_rate": 0.001, "loss": 2.7443, "num_input_tokens_seen": 15781068800, "step": 15050 }, { "epoch": 0.33168725296781837, "grad_norm": 0.14668309688568115, "learning_rate": 0.001, "loss": 2.7496, "num_input_tokens_seen": 15833497600, "step": 15100 }, { "epoch": 0.3327855551299635, "grad_norm": 0.12492749840021133, "learning_rate": 0.001, "loss": 2.7485, "num_input_tokens_seen": 15885926400, "step": 15150 }, { "epoch": 0.33388385729210857, "grad_norm": 0.1333470493555069, "learning_rate": 0.001, "loss": 2.7511, "num_input_tokens_seen": 15938355200, "step": 15200 }, { "epoch": 0.33498215945425364, "grad_norm": 0.14136457443237305, "learning_rate": 0.001, "loss": 2.74, "num_input_tokens_seen": 15990784000, "step": 15250 }, { "epoch": 0.33608046161639876, "grad_norm": 0.14975622296333313, "learning_rate": 0.001, "loss": 2.7543, "num_input_tokens_seen": 16043212800, "step": 15300 }, { "epoch": 0.33717876377854383, "grad_norm": 0.1193549856543541, "learning_rate": 0.001, "loss": 2.7497, "num_input_tokens_seen": 16095641600, "step": 15350 }, { "epoch": 0.33827706594068896, "grad_norm": 0.1429223120212555, "learning_rate": 0.001, "loss": 2.7463, "num_input_tokens_seen": 16148070400, "step": 15400 }, { "epoch": 0.33937536810283403, "grad_norm": 0.16827304661273956, "learning_rate": 0.001, "loss": 2.7415, "num_input_tokens_seen": 16200499200, "step": 15450 }, { "epoch": 0.3404736702649791, "grad_norm": 0.13952937722206116, "learning_rate": 0.001, "loss": 2.7388, "num_input_tokens_seen": 16252928000, "step": 15500 }, { "epoch": 0.3404736702649791, "eval_loss": 2.6472089290618896, "eval_runtime": 65.4943, "eval_samples_per_second": 76.343, "eval_steps_per_second": 19.086, "num_input_tokens_seen": 16252928000, "step": 15500 }, { "epoch": 0.3415719724271242, "grad_norm": 0.13359376788139343, "learning_rate": 0.001, "loss": 2.7522, "num_input_tokens_seen": 16305356800, "step": 15550 }, { "epoch": 0.3426702745892693, "grad_norm": 0.13101224601268768, "learning_rate": 0.001, "loss": 2.7483, "num_input_tokens_seen": 16357785600, "step": 15600 }, { "epoch": 0.3437685767514144, "grad_norm": 0.14006133377552032, "learning_rate": 0.001, "loss": 2.7439, "num_input_tokens_seen": 16410214400, "step": 15650 }, { "epoch": 0.3448668789135595, "grad_norm": 0.15062059462070465, "learning_rate": 0.001, "loss": 2.7454, "num_input_tokens_seen": 16462643200, "step": 15700 }, { "epoch": 0.3459651810757046, "grad_norm": 0.13822610676288605, "learning_rate": 0.001, "loss": 2.74, "num_input_tokens_seen": 16515072000, "step": 15750 }, { "epoch": 0.3470634832378497, "grad_norm": 0.1368207335472107, "learning_rate": 0.001, "loss": 2.745, "num_input_tokens_seen": 16567500800, "step": 15800 }, { "epoch": 0.34816178539999476, "grad_norm": 0.14573991298675537, "learning_rate": 0.001, "loss": 2.742, "num_input_tokens_seen": 16619929600, "step": 15850 }, { "epoch": 0.3492600875621399, "grad_norm": 12.025542259216309, "learning_rate": 0.001, "loss": 3.3278, "num_input_tokens_seen": 16672358400, "step": 15900 }, { "epoch": 0.35035838972428496, "grad_norm": 0.15699023008346558, "learning_rate": 0.001, "loss": 4.04, "num_input_tokens_seen": 16724787200, "step": 15950 }, { "epoch": 0.3514566918864301, "grad_norm": 0.13041897118091583, "learning_rate": 0.001, "loss": 2.8233, "num_input_tokens_seen": 16777216000, "step": 16000 }, { "epoch": 0.3514566918864301, "eval_loss": 2.689638614654541, "eval_runtime": 66.0949, "eval_samples_per_second": 75.649, "eval_steps_per_second": 18.912, "num_input_tokens_seen": 16777216000, "step": 16000 }, { "epoch": 0.35255499404857515, "grad_norm": 0.1446143537759781, "learning_rate": 0.001, "loss": 2.7837, "num_input_tokens_seen": 16829644800, "step": 16050 }, { "epoch": 0.3536532962107203, "grad_norm": 0.12466421723365784, "learning_rate": 0.001, "loss": 2.7808, "num_input_tokens_seen": 16882073600, "step": 16100 }, { "epoch": 0.35475159837286535, "grad_norm": 0.13154324889183044, "learning_rate": 0.001, "loss": 2.7608, "num_input_tokens_seen": 16934502400, "step": 16150 }, { "epoch": 0.3558499005350104, "grad_norm": 0.12929347157478333, "learning_rate": 0.001, "loss": 2.7599, "num_input_tokens_seen": 16986931200, "step": 16200 }, { "epoch": 0.35694820269715555, "grad_norm": 0.12805528938770294, "learning_rate": 0.001, "loss": 2.7562, "num_input_tokens_seen": 17039360000, "step": 16250 }, { "epoch": 0.3580465048593006, "grad_norm": 0.12885579466819763, "learning_rate": 0.001, "loss": 2.7498, "num_input_tokens_seen": 17091788800, "step": 16300 }, { "epoch": 0.35914480702144574, "grad_norm": 0.14422497153282166, "learning_rate": 0.001, "loss": 2.7518, "num_input_tokens_seen": 17144217600, "step": 16350 }, { "epoch": 0.3602431091835908, "grad_norm": 0.13284224271774292, "learning_rate": 0.001, "loss": 2.7453, "num_input_tokens_seen": 17196646400, "step": 16400 }, { "epoch": 0.3613414113457359, "grad_norm": 0.1408185362815857, "learning_rate": 0.001, "loss": 2.7422, "num_input_tokens_seen": 17249075200, "step": 16450 }, { "epoch": 0.362439713507881, "grad_norm": 0.1295713484287262, "learning_rate": 0.001, "loss": 2.7394, "num_input_tokens_seen": 17301504000, "step": 16500 }, { "epoch": 0.362439713507881, "eval_loss": 2.6431446075439453, "eval_runtime": 65.9239, "eval_samples_per_second": 75.845, "eval_steps_per_second": 18.961, "num_input_tokens_seen": 17301504000, "step": 16500 }, { "epoch": 0.3635380156700261, "grad_norm": 0.1245918869972229, "learning_rate": 0.001, "loss": 2.7434, "num_input_tokens_seen": 17353932800, "step": 16550 }, { "epoch": 0.3646363178321712, "grad_norm": 0.15865615010261536, "learning_rate": 0.001, "loss": 2.7378, "num_input_tokens_seen": 17406361600, "step": 16600 }, { "epoch": 0.3657346199943163, "grad_norm": 0.1391313523054123, "learning_rate": 0.001, "loss": 2.7415, "num_input_tokens_seen": 17458790400, "step": 16650 }, { "epoch": 0.3668329221564614, "grad_norm": 0.13604389131069183, "learning_rate": 0.001, "loss": 2.7394, "num_input_tokens_seen": 17511219200, "step": 16700 }, { "epoch": 0.3679312243186065, "grad_norm": 0.14926299452781677, "learning_rate": 0.001, "loss": 2.732, "num_input_tokens_seen": 17563648000, "step": 16750 }, { "epoch": 0.36902952648075155, "grad_norm": 0.12619628012180328, "learning_rate": 0.001, "loss": 2.7275, "num_input_tokens_seen": 17616076800, "step": 16800 }, { "epoch": 0.3701278286428967, "grad_norm": 0.1268402636051178, "learning_rate": 0.001, "loss": 2.7309, "num_input_tokens_seen": 17668505600, "step": 16850 }, { "epoch": 0.37122613080504174, "grad_norm": 0.1379624754190445, "learning_rate": 0.001, "loss": 2.7266, "num_input_tokens_seen": 17720934400, "step": 16900 }, { "epoch": 0.37232443296718687, "grad_norm": 0.1443478763103485, "learning_rate": 0.001, "loss": 2.7321, "num_input_tokens_seen": 17773363200, "step": 16950 }, { "epoch": 0.37342273512933194, "grad_norm": 0.15214091539382935, "learning_rate": 0.001, "loss": 2.7284, "num_input_tokens_seen": 17825792000, "step": 17000 }, { "epoch": 0.37342273512933194, "eval_loss": 2.63478946685791, "eval_runtime": 65.141, "eval_samples_per_second": 76.757, "eval_steps_per_second": 19.189, "num_input_tokens_seen": 17825792000, "step": 17000 }, { "epoch": 0.374521037291477, "grad_norm": 0.1361106038093567, "learning_rate": 0.001, "loss": 2.7342, "num_input_tokens_seen": 17878220800, "step": 17050 }, { "epoch": 0.37561933945362214, "grad_norm": 0.13839572668075562, "learning_rate": 0.001, "loss": 2.7259, "num_input_tokens_seen": 17930649600, "step": 17100 }, { "epoch": 0.3767176416157672, "grad_norm": 0.13055244088172913, "learning_rate": 0.001, "loss": 2.7306, "num_input_tokens_seen": 17983078400, "step": 17150 }, { "epoch": 0.37781594377791233, "grad_norm": 0.1444411724805832, "learning_rate": 0.001, "loss": 2.7315, "num_input_tokens_seen": 18035507200, "step": 17200 }, { "epoch": 0.3789142459400574, "grad_norm": 0.151028573513031, "learning_rate": 0.001, "loss": 2.7211, "num_input_tokens_seen": 18087936000, "step": 17250 }, { "epoch": 0.38001254810220253, "grad_norm": 0.15638011693954468, "learning_rate": 0.001, "loss": 2.7269, "num_input_tokens_seen": 18140364800, "step": 17300 }, { "epoch": 0.3811108502643476, "grad_norm": 0.1508658230304718, "learning_rate": 0.001, "loss": 2.7263, "num_input_tokens_seen": 18192793600, "step": 17350 }, { "epoch": 0.38220915242649267, "grad_norm": 0.13167701661586761, "learning_rate": 0.001, "loss": 2.7296, "num_input_tokens_seen": 18245222400, "step": 17400 }, { "epoch": 0.3833074545886378, "grad_norm": 0.14609253406524658, "learning_rate": 0.001, "loss": 2.7249, "num_input_tokens_seen": 18297651200, "step": 17450 }, { "epoch": 0.38440575675078287, "grad_norm": 0.13172782957553864, "learning_rate": 0.001, "loss": 2.7252, "num_input_tokens_seen": 18350080000, "step": 17500 }, { "epoch": 0.38440575675078287, "eval_loss": 2.630176544189453, "eval_runtime": 66.0667, "eval_samples_per_second": 75.681, "eval_steps_per_second": 18.92, "num_input_tokens_seen": 18350080000, "step": 17500 }, { "epoch": 0.385504058912928, "grad_norm": 0.149306520819664, "learning_rate": 0.001, "loss": 2.7245, "num_input_tokens_seen": 18402508800, "step": 17550 }, { "epoch": 0.38660236107507306, "grad_norm": 0.14191772043704987, "learning_rate": 0.001, "loss": 2.7204, "num_input_tokens_seen": 18454937600, "step": 17600 }, { "epoch": 0.3877006632372182, "grad_norm": 0.13731072843074799, "learning_rate": 0.001, "loss": 2.7243, "num_input_tokens_seen": 18507366400, "step": 17650 }, { "epoch": 0.38879896539936326, "grad_norm": 0.1466369777917862, "learning_rate": 0.001, "loss": 2.7262, "num_input_tokens_seen": 18559795200, "step": 17700 }, { "epoch": 0.38989726756150833, "grad_norm": 0.13290658593177795, "learning_rate": 0.001, "loss": 2.7314, "num_input_tokens_seen": 18612224000, "step": 17750 }, { "epoch": 0.39099556972365346, "grad_norm": 0.13785040378570557, "learning_rate": 0.001, "loss": 2.7252, "num_input_tokens_seen": 18664652800, "step": 17800 }, { "epoch": 0.39209387188579853, "grad_norm": 0.13384000957012177, "learning_rate": 0.001, "loss": 2.7321, "num_input_tokens_seen": 18717081600, "step": 17850 }, { "epoch": 0.39319217404794365, "grad_norm": 0.14927875995635986, "learning_rate": 0.001, "loss": 2.7236, "num_input_tokens_seen": 18769510400, "step": 17900 }, { "epoch": 0.3942904762100887, "grad_norm": 0.13494938611984253, "learning_rate": 0.001, "loss": 2.7234, "num_input_tokens_seen": 18821939200, "step": 17950 }, { "epoch": 0.3953887783722338, "grad_norm": 0.15054813027381897, "learning_rate": 0.001, "loss": 2.7236, "num_input_tokens_seen": 18874368000, "step": 18000 }, { "epoch": 0.3953887783722338, "eval_loss": 2.62626051902771, "eval_runtime": 65.3965, "eval_samples_per_second": 76.457, "eval_steps_per_second": 19.114, "num_input_tokens_seen": 18874368000, "step": 18000 }, { "epoch": 0.3964870805343789, "grad_norm": 0.1353403478860855, "learning_rate": 0.001, "loss": 2.724, "num_input_tokens_seen": 18926796800, "step": 18050 }, { "epoch": 0.397585382696524, "grad_norm": 0.15004459023475647, "learning_rate": 0.001, "loss": 2.717, "num_input_tokens_seen": 18979225600, "step": 18100 }, { "epoch": 0.3986836848586691, "grad_norm": 0.1293007880449295, "learning_rate": 0.001, "loss": 2.7187, "num_input_tokens_seen": 19031654400, "step": 18150 }, { "epoch": 0.3997819870208142, "grad_norm": 0.16373878717422485, "learning_rate": 0.001, "loss": 2.7217, "num_input_tokens_seen": 19084083200, "step": 18200 }, { "epoch": 0.4008802891829593, "grad_norm": 0.1529611349105835, "learning_rate": 0.001, "loss": 2.722, "num_input_tokens_seen": 19136512000, "step": 18250 }, { "epoch": 0.4019785913451044, "grad_norm": 0.14109951257705688, "learning_rate": 0.001, "loss": 2.7232, "num_input_tokens_seen": 19188940800, "step": 18300 }, { "epoch": 0.40307689350724946, "grad_norm": 0.13841493427753448, "learning_rate": 0.001, "loss": 2.7195, "num_input_tokens_seen": 19241369600, "step": 18350 }, { "epoch": 0.4041751956693946, "grad_norm": 0.13508476316928864, "learning_rate": 0.001, "loss": 2.7166, "num_input_tokens_seen": 19293798400, "step": 18400 }, { "epoch": 0.40527349783153965, "grad_norm": 0.1372646540403366, "learning_rate": 0.001, "loss": 2.7212, "num_input_tokens_seen": 19346227200, "step": 18450 }, { "epoch": 0.4063717999936848, "grad_norm": 0.1485033482313156, "learning_rate": 0.001, "loss": 2.7186, "num_input_tokens_seen": 19398656000, "step": 18500 }, { "epoch": 0.4063717999936848, "eval_loss": 2.622330904006958, "eval_runtime": 66.3601, "eval_samples_per_second": 75.346, "eval_steps_per_second": 18.837, "num_input_tokens_seen": 19398656000, "step": 18500 }, { "epoch": 0.40747010215582985, "grad_norm": 0.1484711617231369, "learning_rate": 0.001, "loss": 2.7235, "num_input_tokens_seen": 19451084800, "step": 18550 }, { "epoch": 0.408568404317975, "grad_norm": 0.141770601272583, "learning_rate": 0.001, "loss": 2.7225, "num_input_tokens_seen": 19503513600, "step": 18600 }, { "epoch": 0.40966670648012005, "grad_norm": 0.1213323250412941, "learning_rate": 0.001, "loss": 2.7212, "num_input_tokens_seen": 19555942400, "step": 18650 }, { "epoch": 0.4107650086422651, "grad_norm": 0.14149373769760132, "learning_rate": 0.001, "loss": 2.7181, "num_input_tokens_seen": 19608371200, "step": 18700 }, { "epoch": 0.41186331080441024, "grad_norm": 0.13964049518108368, "learning_rate": 0.001, "loss": 2.7147, "num_input_tokens_seen": 19660800000, "step": 18750 }, { "epoch": 0.4129616129665553, "grad_norm": 0.1384592205286026, "learning_rate": 0.001, "loss": 2.7141, "num_input_tokens_seen": 19713228800, "step": 18800 }, { "epoch": 0.41405991512870044, "grad_norm": 0.15027381479740143, "learning_rate": 0.001, "loss": 2.7185, "num_input_tokens_seen": 19765657600, "step": 18850 }, { "epoch": 0.4151582172908455, "grad_norm": 0.15221597254276276, "learning_rate": 0.001, "loss": 2.7206, "num_input_tokens_seen": 19818086400, "step": 18900 }, { "epoch": 0.4162565194529906, "grad_norm": 0.1272735893726349, "learning_rate": 0.001, "loss": 2.7183, "num_input_tokens_seen": 19870515200, "step": 18950 }, { "epoch": 0.4173548216151357, "grad_norm": 0.1258268654346466, "learning_rate": 0.001, "loss": 2.7117, "num_input_tokens_seen": 19922944000, "step": 19000 }, { "epoch": 0.4173548216151357, "eval_loss": 2.619187116622925, "eval_runtime": 65.7537, "eval_samples_per_second": 76.041, "eval_steps_per_second": 19.01, "num_input_tokens_seen": 19922944000, "step": 19000 }, { "epoch": 0.4184531237772808, "grad_norm": 0.12389284372329712, "learning_rate": 0.001, "loss": 2.7222, "num_input_tokens_seen": 19975372800, "step": 19050 }, { "epoch": 0.4195514259394259, "grad_norm": 0.14157339930534363, "learning_rate": 0.001, "loss": 2.7178, "num_input_tokens_seen": 20027801600, "step": 19100 }, { "epoch": 0.420649728101571, "grad_norm": 0.1490466445684433, "learning_rate": 0.001, "loss": 2.7185, "num_input_tokens_seen": 20080230400, "step": 19150 }, { "epoch": 0.4217480302637161, "grad_norm": 0.14112494885921478, "learning_rate": 0.001, "loss": 2.7166, "num_input_tokens_seen": 20132659200, "step": 19200 }, { "epoch": 0.42284633242586117, "grad_norm": 0.13986504077911377, "learning_rate": 0.001, "loss": 2.7201, "num_input_tokens_seen": 20185088000, "step": 19250 }, { "epoch": 0.42394463458800624, "grad_norm": 0.14087803661823273, "learning_rate": 0.001, "loss": 2.7175, "num_input_tokens_seen": 20237516800, "step": 19300 }, { "epoch": 0.42504293675015137, "grad_norm": 0.165438711643219, "learning_rate": 0.001, "loss": 2.7155, "num_input_tokens_seen": 20289945600, "step": 19350 }, { "epoch": 0.42614123891229644, "grad_norm": 0.132109135389328, "learning_rate": 0.001, "loss": 2.7116, "num_input_tokens_seen": 20342374400, "step": 19400 }, { "epoch": 0.42723954107444156, "grad_norm": 0.1372772753238678, "learning_rate": 0.001, "loss": 2.7137, "num_input_tokens_seen": 20394803200, "step": 19450 }, { "epoch": 0.42833784323658664, "grad_norm": 0.1470147669315338, "learning_rate": 0.001, "loss": 2.7081, "num_input_tokens_seen": 20447232000, "step": 19500 }, { "epoch": 0.42833784323658664, "eval_loss": 2.615947961807251, "eval_runtime": 65.588, "eval_samples_per_second": 76.233, "eval_steps_per_second": 19.058, "num_input_tokens_seen": 20447232000, "step": 19500 }, { "epoch": 0.42943614539873176, "grad_norm": 0.15671676397323608, "learning_rate": 0.001, "loss": 2.7176, "num_input_tokens_seen": 20499660800, "step": 19550 }, { "epoch": 0.43053444756087683, "grad_norm": 0.13104794919490814, "learning_rate": 0.001, "loss": 2.7108, "num_input_tokens_seen": 20552089600, "step": 19600 }, { "epoch": 0.4316327497230219, "grad_norm": 0.14532406628131866, "learning_rate": 0.001, "loss": 2.7087, "num_input_tokens_seen": 20604518400, "step": 19650 }, { "epoch": 0.43273105188516703, "grad_norm": 0.16199354827404022, "learning_rate": 0.001, "loss": 2.7178, "num_input_tokens_seen": 20656947200, "step": 19700 }, { "epoch": 0.4338293540473121, "grad_norm": 0.13537316024303436, "learning_rate": 0.001, "loss": 2.7124, "num_input_tokens_seen": 20709376000, "step": 19750 }, { "epoch": 0.4349276562094572, "grad_norm": 0.15098537504673004, "learning_rate": 0.001, "loss": 2.7119, "num_input_tokens_seen": 20761804800, "step": 19800 }, { "epoch": 0.4360259583716023, "grad_norm": 0.21563659608364105, "learning_rate": 0.001, "loss": 2.7118, "num_input_tokens_seen": 20814233600, "step": 19850 }, { "epoch": 0.43712426053374737, "grad_norm": 0.15981121361255646, "learning_rate": 0.001, "loss": 2.7043, "num_input_tokens_seen": 20866662400, "step": 19900 }, { "epoch": 0.4382225626958925, "grad_norm": 0.15192069113254547, "learning_rate": 0.001, "loss": 2.7137, "num_input_tokens_seen": 20919091200, "step": 19950 }, { "epoch": 0.43932086485803756, "grad_norm": 0.14211437106132507, "learning_rate": 0.001, "loss": 2.7128, "num_input_tokens_seen": 20971520000, "step": 20000 }, { "epoch": 0.43932086485803756, "eval_loss": 2.611689567565918, "eval_runtime": 66.3456, "eval_samples_per_second": 75.363, "eval_steps_per_second": 18.841, "num_input_tokens_seen": 20971520000, "step": 20000 }, { "epoch": 0.4404191670201827, "grad_norm": 0.14489957690238953, "learning_rate": 0.001, "loss": 2.7139, "num_input_tokens_seen": 21023948800, "step": 20050 }, { "epoch": 0.44151746918232776, "grad_norm": 0.13994646072387695, "learning_rate": 0.001, "loss": 2.7091, "num_input_tokens_seen": 21076377600, "step": 20100 }, { "epoch": 0.4426157713444729, "grad_norm": 0.17211903631687164, "learning_rate": 0.001, "loss": 2.7176, "num_input_tokens_seen": 21128806400, "step": 20150 }, { "epoch": 0.44371407350661796, "grad_norm": 0.16364862024784088, "learning_rate": 0.001, "loss": 2.7181, "num_input_tokens_seen": 21181235200, "step": 20200 }, { "epoch": 0.444812375668763, "grad_norm": 0.14166216552257538, "learning_rate": 0.001, "loss": 2.7127, "num_input_tokens_seen": 21233664000, "step": 20250 }, { "epoch": 0.44591067783090815, "grad_norm": 0.12995755672454834, "learning_rate": 0.001, "loss": 2.7085, "num_input_tokens_seen": 21286092800, "step": 20300 }, { "epoch": 0.4470089799930532, "grad_norm": 0.15717202425003052, "learning_rate": 0.001, "loss": 2.7071, "num_input_tokens_seen": 21338521600, "step": 20350 }, { "epoch": 0.44810728215519835, "grad_norm": 0.13354860246181488, "learning_rate": 0.001, "loss": 2.7094, "num_input_tokens_seen": 21390950400, "step": 20400 }, { "epoch": 0.4492055843173434, "grad_norm": 0.16004188358783722, "learning_rate": 0.001, "loss": 2.7109, "num_input_tokens_seen": 21443379200, "step": 20450 }, { "epoch": 0.45030388647948855, "grad_norm": 0.148077592253685, "learning_rate": 0.001, "loss": 2.7058, "num_input_tokens_seen": 21495808000, "step": 20500 }, { "epoch": 0.45030388647948855, "eval_loss": 2.6089115142822266, "eval_runtime": 65.5589, "eval_samples_per_second": 76.267, "eval_steps_per_second": 19.067, "num_input_tokens_seen": 21495808000, "step": 20500 }, { "epoch": 0.4514021886416336, "grad_norm": 0.16992634534835815, "learning_rate": 0.001, "loss": 2.7026, "num_input_tokens_seen": 21548236800, "step": 20550 }, { "epoch": 0.4525004908037787, "grad_norm": 0.14876551926136017, "learning_rate": 0.001, "loss": 2.7105, "num_input_tokens_seen": 21600665600, "step": 20600 }, { "epoch": 0.4535987929659238, "grad_norm": 0.16025613248348236, "learning_rate": 0.001, "loss": 2.707, "num_input_tokens_seen": 21653094400, "step": 20650 }, { "epoch": 0.4546970951280689, "grad_norm": 0.14609012007713318, "learning_rate": 0.001, "loss": 2.7086, "num_input_tokens_seen": 21705523200, "step": 20700 }, { "epoch": 0.455795397290214, "grad_norm": 0.14725832641124725, "learning_rate": 0.001, "loss": 2.7075, "num_input_tokens_seen": 21757952000, "step": 20750 }, { "epoch": 0.4568936994523591, "grad_norm": 0.1736454963684082, "learning_rate": 0.001, "loss": 2.7033, "num_input_tokens_seen": 21810380800, "step": 20800 }, { "epoch": 0.45799200161450415, "grad_norm": 0.14904257655143738, "learning_rate": 0.001, "loss": 2.7012, "num_input_tokens_seen": 21862809600, "step": 20850 }, { "epoch": 0.4590903037766493, "grad_norm": 0.14407765865325928, "learning_rate": 0.001, "loss": 2.7055, "num_input_tokens_seen": 21915238400, "step": 20900 }, { "epoch": 0.46018860593879435, "grad_norm": 0.13943473994731903, "learning_rate": 0.001, "loss": 2.6999, "num_input_tokens_seen": 21967667200, "step": 20950 }, { "epoch": 0.4612869081009395, "grad_norm": 0.1592896729707718, "learning_rate": 0.001, "loss": 2.7072, "num_input_tokens_seen": 22020096000, "step": 21000 }, { "epoch": 0.4612869081009395, "eval_loss": 2.605719566345215, "eval_runtime": 65.6879, "eval_samples_per_second": 76.117, "eval_steps_per_second": 19.029, "num_input_tokens_seen": 22020096000, "step": 21000 }, { "epoch": 0.46238521026308455, "grad_norm": 0.1428702473640442, "learning_rate": 0.001, "loss": 2.7042, "num_input_tokens_seen": 22072524800, "step": 21050 }, { "epoch": 0.46348351242522967, "grad_norm": 0.13529072701931, "learning_rate": 0.001, "loss": 2.7093, "num_input_tokens_seen": 22124953600, "step": 21100 }, { "epoch": 0.46458181458737474, "grad_norm": 0.17529748380184174, "learning_rate": 0.001, "loss": 2.713, "num_input_tokens_seen": 22177382400, "step": 21150 }, { "epoch": 0.4656801167495198, "grad_norm": 0.1479254513978958, "learning_rate": 0.001, "loss": 2.6984, "num_input_tokens_seen": 22229811200, "step": 21200 }, { "epoch": 0.46677841891166494, "grad_norm": 0.15110637247562408, "learning_rate": 0.001, "loss": 2.7128, "num_input_tokens_seen": 22282240000, "step": 21250 }, { "epoch": 0.46787672107381, "grad_norm": 0.13746944069862366, "learning_rate": 0.001, "loss": 2.7036, "num_input_tokens_seen": 22334668800, "step": 21300 }, { "epoch": 0.46897502323595514, "grad_norm": 0.17940136790275574, "learning_rate": 0.001, "loss": 2.7048, "num_input_tokens_seen": 22387097600, "step": 21350 }, { "epoch": 0.4700733253981002, "grad_norm": 0.14203256368637085, "learning_rate": 0.001, "loss": 2.6997, "num_input_tokens_seen": 22439526400, "step": 21400 }, { "epoch": 0.47117162756024533, "grad_norm": 0.14260704815387726, "learning_rate": 0.001, "loss": 2.7092, "num_input_tokens_seen": 22491955200, "step": 21450 }, { "epoch": 0.4722699297223904, "grad_norm": 0.16455897688865662, "learning_rate": 0.001, "loss": 2.6969, "num_input_tokens_seen": 22544384000, "step": 21500 }, { "epoch": 0.4722699297223904, "eval_loss": 2.60367751121521, "eval_runtime": 65.4304, "eval_samples_per_second": 76.417, "eval_steps_per_second": 19.104, "num_input_tokens_seen": 22544384000, "step": 21500 }, { "epoch": 0.4733682318845355, "grad_norm": 0.1529170274734497, "learning_rate": 0.001, "loss": 2.7003, "num_input_tokens_seen": 22596812800, "step": 21550 }, { "epoch": 0.4744665340466806, "grad_norm": 0.1921636164188385, "learning_rate": 0.001, "loss": 2.7014, "num_input_tokens_seen": 22649241600, "step": 21600 }, { "epoch": 0.47556483620882567, "grad_norm": 0.16029173135757446, "learning_rate": 0.001, "loss": 2.7028, "num_input_tokens_seen": 22701670400, "step": 21650 }, { "epoch": 0.4766631383709708, "grad_norm": 0.14740578830242157, "learning_rate": 0.001, "loss": 2.7019, "num_input_tokens_seen": 22754099200, "step": 21700 }, { "epoch": 0.47776144053311587, "grad_norm": 0.1734548658132553, "learning_rate": 0.001, "loss": 2.6985, "num_input_tokens_seen": 22806528000, "step": 21750 }, { "epoch": 0.47885974269526094, "grad_norm": 0.15502890944480896, "learning_rate": 0.001, "loss": 2.6973, "num_input_tokens_seen": 22858956800, "step": 21800 }, { "epoch": 0.47995804485740606, "grad_norm": 0.16783900558948517, "learning_rate": 0.001, "loss": 2.7003, "num_input_tokens_seen": 22911385600, "step": 21850 }, { "epoch": 0.48105634701955113, "grad_norm": 0.14911381900310516, "learning_rate": 0.001, "loss": 2.6992, "num_input_tokens_seen": 22963814400, "step": 21900 }, { "epoch": 0.48215464918169626, "grad_norm": 0.15027394890785217, "learning_rate": 0.001, "loss": 2.6957, "num_input_tokens_seen": 23016243200, "step": 21950 }, { "epoch": 0.48325295134384133, "grad_norm": 0.1261301189661026, "learning_rate": 0.001, "loss": 2.7064, "num_input_tokens_seen": 23068672000, "step": 22000 }, { "epoch": 0.48325295134384133, "eval_loss": 2.6012015342712402, "eval_runtime": 64.9701, "eval_samples_per_second": 76.958, "eval_steps_per_second": 19.24, "num_input_tokens_seen": 23068672000, "step": 22000 }, { "epoch": 0.48435125350598646, "grad_norm": 0.15728288888931274, "learning_rate": 0.001, "loss": 2.703, "num_input_tokens_seen": 23121100800, "step": 22050 }, { "epoch": 0.4854495556681315, "grad_norm": 0.13599443435668945, "learning_rate": 0.001, "loss": 2.6984, "num_input_tokens_seen": 23173529600, "step": 22100 }, { "epoch": 0.4865478578302766, "grad_norm": 0.25702551007270813, "learning_rate": 0.001, "loss": 2.9388, "num_input_tokens_seen": 23225958400, "step": 22150 }, { "epoch": 0.4876461599924217, "grad_norm": 0.12942279875278473, "learning_rate": 0.001, "loss": 2.7568, "num_input_tokens_seen": 23278383360, "step": 22200 }, { "epoch": 0.4887444621545668, "grad_norm": 0.12908817827701569, "learning_rate": 0.001, "loss": 2.7195, "num_input_tokens_seen": 23330812160, "step": 22250 }, { "epoch": 0.4898427643167119, "grad_norm": 0.1351587176322937, "learning_rate": 0.001, "loss": 2.7155, "num_input_tokens_seen": 23383240960, "step": 22300 }, { "epoch": 0.490941066478857, "grad_norm": 0.1245250552892685, "learning_rate": 0.001, "loss": 2.7074, "num_input_tokens_seen": 23435669760, "step": 22350 }, { "epoch": 0.4920393686410021, "grad_norm": 0.13818837702274323, "learning_rate": 0.001, "loss": 2.7064, "num_input_tokens_seen": 23488098560, "step": 22400 }, { "epoch": 0.4931376708031472, "grad_norm": 0.15505041182041168, "learning_rate": 0.001, "loss": 2.7044, "num_input_tokens_seen": 23540527360, "step": 22450 }, { "epoch": 0.49423597296529226, "grad_norm": 0.14414137601852417, "learning_rate": 0.001, "loss": 2.7046, "num_input_tokens_seen": 23592956160, "step": 22500 }, { "epoch": 0.49423597296529226, "eval_loss": 2.60188627243042, "eval_runtime": 67.3268, "eval_samples_per_second": 74.265, "eval_steps_per_second": 18.566, "num_input_tokens_seen": 23592956160, "step": 22500 }, { "epoch": 0.4953342751274374, "grad_norm": 0.14763414859771729, "learning_rate": 0.001, "loss": 2.695, "num_input_tokens_seen": 23645384960, "step": 22550 }, { "epoch": 0.49643257728958246, "grad_norm": 0.14800110459327698, "learning_rate": 0.001, "loss": 2.6939, "num_input_tokens_seen": 23697813760, "step": 22600 }, { "epoch": 0.4975308794517276, "grad_norm": 0.13590902090072632, "learning_rate": 0.001, "loss": 2.6967, "num_input_tokens_seen": 23750242560, "step": 22650 }, { "epoch": 0.49862918161387265, "grad_norm": 0.1315733939409256, "learning_rate": 0.001, "loss": 2.6909, "num_input_tokens_seen": 23802671360, "step": 22700 }, { "epoch": 0.4997274837760177, "grad_norm": 0.13714700937271118, "learning_rate": 0.001, "loss": 2.6957, "num_input_tokens_seen": 23855100160, "step": 22750 }, { "epoch": 0.5008257859381628, "grad_norm": 0.1412438154220581, "learning_rate": 0.001, "loss": 2.6977, "num_input_tokens_seen": 23907528960, "step": 22800 }, { "epoch": 0.501924088100308, "grad_norm": 0.15368172526359558, "learning_rate": 0.001, "loss": 2.6977, "num_input_tokens_seen": 23959957760, "step": 22850 }, { "epoch": 0.503022390262453, "grad_norm": 0.14018824696540833, "learning_rate": 0.001, "loss": 2.6992, "num_input_tokens_seen": 24012386560, "step": 22900 }, { "epoch": 0.5041206924245981, "grad_norm": 0.1284814178943634, "learning_rate": 0.001, "loss": 2.6962, "num_input_tokens_seen": 24064815360, "step": 22950 }, { "epoch": 0.5052189945867432, "grad_norm": 0.15145835280418396, "learning_rate": 0.001, "loss": 2.692, "num_input_tokens_seen": 24117244160, "step": 23000 }, { "epoch": 0.5052189945867432, "eval_loss": 2.5970778465270996, "eval_runtime": 66.1666, "eval_samples_per_second": 75.567, "eval_steps_per_second": 18.892, "num_input_tokens_seen": 24117244160, "step": 23000 }, { "epoch": 0.5063172967488883, "grad_norm": 0.15117652714252472, "learning_rate": 0.001, "loss": 2.696, "num_input_tokens_seen": 24169672960, "step": 23050 }, { "epoch": 0.5074155989110334, "grad_norm": 0.15605470538139343, "learning_rate": 0.001, "loss": 2.6918, "num_input_tokens_seen": 24222101760, "step": 23100 }, { "epoch": 0.5085139010731785, "grad_norm": 0.17503651976585388, "learning_rate": 0.001, "loss": 2.688, "num_input_tokens_seen": 24274530560, "step": 23150 }, { "epoch": 0.5096122032353236, "grad_norm": 0.1622135490179062, "learning_rate": 0.001, "loss": 2.6949, "num_input_tokens_seen": 24326959360, "step": 23200 }, { "epoch": 0.5107105053974687, "grad_norm": 0.1331271231174469, "learning_rate": 0.001, "loss": 2.6876, "num_input_tokens_seen": 24379388160, "step": 23250 }, { "epoch": 0.5118088075596138, "grad_norm": 0.14365510642528534, "learning_rate": 0.001, "loss": 2.7027, "num_input_tokens_seen": 24431816960, "step": 23300 }, { "epoch": 0.5129071097217589, "grad_norm": 0.13621902465820312, "learning_rate": 0.001, "loss": 2.6946, "num_input_tokens_seen": 24484245760, "step": 23350 }, { "epoch": 0.5140054118839039, "grad_norm": 0.12506547570228577, "learning_rate": 0.001, "loss": 2.6864, "num_input_tokens_seen": 24536674560, "step": 23400 }, { "epoch": 0.515103714046049, "grad_norm": 0.12824128568172455, "learning_rate": 0.001, "loss": 2.6871, "num_input_tokens_seen": 24589103360, "step": 23450 }, { "epoch": 0.5162020162081942, "grad_norm": 0.14310036599636078, "learning_rate": 0.001, "loss": 2.6936, "num_input_tokens_seen": 24641532160, "step": 23500 }, { "epoch": 0.5162020162081942, "eval_loss": 2.592362880706787, "eval_runtime": 66.663, "eval_samples_per_second": 75.004, "eval_steps_per_second": 18.751, "num_input_tokens_seen": 24641532160, "step": 23500 }, { "epoch": 0.5173003183703393, "grad_norm": 0.1362077295780182, "learning_rate": 0.001, "loss": 2.6924, "num_input_tokens_seen": 24693960960, "step": 23550 }, { "epoch": 0.5183986205324843, "grad_norm": 0.13662473857402802, "learning_rate": 0.001, "loss": 2.6972, "num_input_tokens_seen": 24746389760, "step": 23600 }, { "epoch": 0.5194969226946294, "grad_norm": 0.12603560090065002, "learning_rate": 0.001, "loss": 2.6908, "num_input_tokens_seen": 24798818560, "step": 23650 }, { "epoch": 0.5205952248567746, "grad_norm": 0.16597150266170502, "learning_rate": 0.001, "loss": 2.6882, "num_input_tokens_seen": 24851247360, "step": 23700 }, { "epoch": 0.5216935270189196, "grad_norm": 0.13665246963500977, "learning_rate": 0.001, "loss": 2.6958, "num_input_tokens_seen": 24903676160, "step": 23750 }, { "epoch": 0.5227918291810647, "grad_norm": 0.14349523186683655, "learning_rate": 0.001, "loss": 2.6874, "num_input_tokens_seen": 24956104960, "step": 23800 }, { "epoch": 0.5238901313432098, "grad_norm": 0.15857954323291779, "learning_rate": 0.001, "loss": 2.6882, "num_input_tokens_seen": 25008533760, "step": 23850 }, { "epoch": 0.524988433505355, "grad_norm": 0.15056300163269043, "learning_rate": 0.001, "loss": 2.694, "num_input_tokens_seen": 25060962560, "step": 23900 }, { "epoch": 0.5260867356675, "grad_norm": 0.12861080467700958, "learning_rate": 0.001, "loss": 2.6899, "num_input_tokens_seen": 25113391360, "step": 23950 }, { "epoch": 0.5271850378296451, "grad_norm": 0.14443258941173553, "learning_rate": 0.001, "loss": 2.6929, "num_input_tokens_seen": 25165820160, "step": 24000 }, { "epoch": 0.5271850378296451, "eval_loss": 2.5910630226135254, "eval_runtime": 66.9014, "eval_samples_per_second": 74.737, "eval_steps_per_second": 18.684, "num_input_tokens_seen": 25165820160, "step": 24000 }, { "epoch": 0.5282833399917902, "grad_norm": 0.14083649218082428, "learning_rate": 0.001, "loss": 2.6851, "num_input_tokens_seen": 25218248960, "step": 24050 }, { "epoch": 0.5293816421539352, "grad_norm": 0.13934968411922455, "learning_rate": 0.001, "loss": 2.6863, "num_input_tokens_seen": 25270677760, "step": 24100 }, { "epoch": 0.5304799443160804, "grad_norm": 0.15416787564754486, "learning_rate": 0.001, "loss": 2.6894, "num_input_tokens_seen": 25323106560, "step": 24150 }, { "epoch": 0.5315782464782255, "grad_norm": 0.17290246486663818, "learning_rate": 0.001, "loss": 2.6907, "num_input_tokens_seen": 25375535360, "step": 24200 }, { "epoch": 0.5326765486403706, "grad_norm": 0.14260552823543549, "learning_rate": 0.001, "loss": 2.6832, "num_input_tokens_seen": 25427964160, "step": 24250 }, { "epoch": 0.5337748508025156, "grad_norm": 0.14795690774917603, "learning_rate": 0.001, "loss": 2.6895, "num_input_tokens_seen": 25480392960, "step": 24300 }, { "epoch": 0.5348731529646608, "grad_norm": 0.15009699761867523, "learning_rate": 0.001, "loss": 2.6819, "num_input_tokens_seen": 25532821760, "step": 24350 }, { "epoch": 0.5359714551268059, "grad_norm": 0.15425953269004822, "learning_rate": 0.001, "loss": 2.6874, "num_input_tokens_seen": 25585250560, "step": 24400 }, { "epoch": 0.5370697572889509, "grad_norm": 0.14639410376548767, "learning_rate": 0.001, "loss": 2.6878, "num_input_tokens_seen": 25637679360, "step": 24450 }, { "epoch": 0.538168059451096, "grad_norm": 0.14785613119602203, "learning_rate": 0.001, "loss": 2.6841, "num_input_tokens_seen": 25690108160, "step": 24500 }, { "epoch": 0.538168059451096, "eval_loss": 2.5875706672668457, "eval_runtime": 66.9296, "eval_samples_per_second": 74.705, "eval_steps_per_second": 18.676, "num_input_tokens_seen": 25690108160, "step": 24500 }, { "epoch": 0.5392663616132412, "grad_norm": 0.14224180579185486, "learning_rate": 0.001, "loss": 2.6876, "num_input_tokens_seen": 25742536960, "step": 24550 }, { "epoch": 0.5403646637753863, "grad_norm": 0.14881493151187897, "learning_rate": 0.001, "loss": 2.6827, "num_input_tokens_seen": 25794965760, "step": 24600 }, { "epoch": 0.5414629659375313, "grad_norm": 0.17951786518096924, "learning_rate": 0.001, "loss": 2.688, "num_input_tokens_seen": 25847394560, "step": 24650 }, { "epoch": 0.5425612680996764, "grad_norm": 0.1400926560163498, "learning_rate": 0.001, "loss": 2.6945, "num_input_tokens_seen": 25899823360, "step": 24700 }, { "epoch": 0.5436595702618215, "grad_norm": 0.1421627402305603, "learning_rate": 0.001, "loss": 2.6852, "num_input_tokens_seen": 25952252160, "step": 24750 }, { "epoch": 0.5447578724239666, "grad_norm": 0.1617737114429474, "learning_rate": 0.001, "loss": 2.686, "num_input_tokens_seen": 26004680960, "step": 24800 }, { "epoch": 0.5458561745861117, "grad_norm": 0.1523471176624298, "learning_rate": 0.001, "loss": 2.6945, "num_input_tokens_seen": 26057109760, "step": 24850 }, { "epoch": 0.5469544767482568, "grad_norm": 0.13078247010707855, "learning_rate": 0.001, "loss": 2.6829, "num_input_tokens_seen": 26109538560, "step": 24900 }, { "epoch": 0.5480527789104018, "grad_norm": 0.14831651747226715, "learning_rate": 0.001, "loss": 2.6898, "num_input_tokens_seen": 26161967360, "step": 24950 }, { "epoch": 0.549151081072547, "grad_norm": 0.1782410740852356, "learning_rate": 0.001, "loss": 2.6871, "num_input_tokens_seen": 26214396160, "step": 25000 }, { "epoch": 0.549151081072547, "eval_loss": 2.5877788066864014, "eval_runtime": 67.2223, "eval_samples_per_second": 74.38, "eval_steps_per_second": 18.595, "num_input_tokens_seen": 26214396160, "step": 25000 }, { "epoch": 0.5502493832346921, "grad_norm": 0.16484692692756653, "learning_rate": 0.001, "loss": 2.6843, "num_input_tokens_seen": 26266824960, "step": 25050 }, { "epoch": 0.5513476853968372, "grad_norm": 0.1583317369222641, "learning_rate": 0.001, "loss": 2.6825, "num_input_tokens_seen": 26319253760, "step": 25100 }, { "epoch": 0.5524459875589822, "grad_norm": 0.1569424867630005, "learning_rate": 0.001, "loss": 2.6787, "num_input_tokens_seen": 26371682560, "step": 25150 }, { "epoch": 0.5535442897211273, "grad_norm": 0.13633306324481964, "learning_rate": 0.001, "loss": 2.6872, "num_input_tokens_seen": 26424111360, "step": 25200 }, { "epoch": 0.5546425918832725, "grad_norm": 0.1480533927679062, "learning_rate": 0.001, "loss": 2.6842, "num_input_tokens_seen": 26476540160, "step": 25250 }, { "epoch": 0.5557408940454175, "grad_norm": 0.1267666518688202, "learning_rate": 0.001, "loss": 2.6839, "num_input_tokens_seen": 26528968960, "step": 25300 }, { "epoch": 0.5568391962075626, "grad_norm": 0.13951599597930908, "learning_rate": 0.001, "loss": 2.6799, "num_input_tokens_seen": 26581397760, "step": 25350 }, { "epoch": 0.5579374983697077, "grad_norm": 0.15044580399990082, "learning_rate": 0.001, "loss": 2.6846, "num_input_tokens_seen": 26633826560, "step": 25400 }, { "epoch": 0.5590358005318529, "grad_norm": 0.12891829013824463, "learning_rate": 0.001, "loss": 2.682, "num_input_tokens_seen": 26686255360, "step": 25450 }, { "epoch": 0.5601341026939979, "grad_norm": 0.12812241911888123, "learning_rate": 0.001, "loss": 2.684, "num_input_tokens_seen": 26738684160, "step": 25500 }, { "epoch": 0.5601341026939979, "eval_loss": 2.5832085609436035, "eval_runtime": 66.9038, "eval_samples_per_second": 74.734, "eval_steps_per_second": 18.684, "num_input_tokens_seen": 26738684160, "step": 25500 }, { "epoch": 0.561232404856143, "grad_norm": 0.14243654906749725, "learning_rate": 0.001, "loss": 2.6883, "num_input_tokens_seen": 26791112960, "step": 25550 }, { "epoch": 0.5623307070182881, "grad_norm": 0.14436320960521698, "learning_rate": 0.001, "loss": 2.6835, "num_input_tokens_seen": 26843541760, "step": 25600 }, { "epoch": 0.5634290091804331, "grad_norm": 0.1516960710287094, "learning_rate": 0.001, "loss": 2.6752, "num_input_tokens_seen": 26895970560, "step": 25650 }, { "epoch": 0.5645273113425783, "grad_norm": 0.14002515375614166, "learning_rate": 0.001, "loss": 2.6817, "num_input_tokens_seen": 26948399360, "step": 25700 }, { "epoch": 0.5656256135047234, "grad_norm": 0.1379036009311676, "learning_rate": 0.001, "loss": 2.6904, "num_input_tokens_seen": 27000828160, "step": 25750 }, { "epoch": 0.5667239156668685, "grad_norm": 0.16127964854240417, "learning_rate": 0.001, "loss": 2.6813, "num_input_tokens_seen": 27053256960, "step": 25800 }, { "epoch": 0.5678222178290135, "grad_norm": 0.15714125335216522, "learning_rate": 0.001, "loss": 2.6851, "num_input_tokens_seen": 27105685760, "step": 25850 }, { "epoch": 0.5689205199911587, "grad_norm": 0.15288160741329193, "learning_rate": 0.001, "loss": 2.6832, "num_input_tokens_seen": 27158114560, "step": 25900 }, { "epoch": 0.5700188221533038, "grad_norm": 0.1398363709449768, "learning_rate": 0.001, "loss": 2.6814, "num_input_tokens_seen": 27210543360, "step": 25950 }, { "epoch": 0.5711171243154488, "grad_norm": 0.15253235399723053, "learning_rate": 0.001, "loss": 2.6755, "num_input_tokens_seen": 27262972160, "step": 26000 }, { "epoch": 0.5711171243154488, "eval_loss": 2.5809168815612793, "eval_runtime": 66.151, "eval_samples_per_second": 75.585, "eval_steps_per_second": 18.896, "num_input_tokens_seen": 27262972160, "step": 26000 }, { "epoch": 0.5722154264775939, "grad_norm": 0.1538383513689041, "learning_rate": 0.001, "loss": 2.6783, "num_input_tokens_seen": 27315400960, "step": 26050 }, { "epoch": 0.5733137286397391, "grad_norm": 0.15545998513698578, "learning_rate": 0.001, "loss": 2.6798, "num_input_tokens_seen": 27367829760, "step": 26100 }, { "epoch": 0.5744120308018842, "grad_norm": 0.15456970036029816, "learning_rate": 0.001, "loss": 2.6836, "num_input_tokens_seen": 27420258560, "step": 26150 }, { "epoch": 0.5755103329640292, "grad_norm": 0.1353277862071991, "learning_rate": 0.001, "loss": 2.6777, "num_input_tokens_seen": 27472687360, "step": 26200 }, { "epoch": 0.5766086351261743, "grad_norm": 0.15124258399009705, "learning_rate": 0.001, "loss": 2.681, "num_input_tokens_seen": 27525116160, "step": 26250 }, { "epoch": 0.5777069372883195, "grad_norm": 0.14200901985168457, "learning_rate": 0.001, "loss": 2.6827, "num_input_tokens_seen": 27577544960, "step": 26300 }, { "epoch": 0.5788052394504645, "grad_norm": 0.15356388688087463, "learning_rate": 0.001, "loss": 2.6802, "num_input_tokens_seen": 27629973760, "step": 26350 }, { "epoch": 0.5799035416126096, "grad_norm": 0.17395390570163727, "learning_rate": 0.001, "loss": 2.6921, "num_input_tokens_seen": 27682402560, "step": 26400 }, { "epoch": 0.5810018437747547, "grad_norm": 0.1507692188024521, "learning_rate": 0.001, "loss": 2.6811, "num_input_tokens_seen": 27734831360, "step": 26450 }, { "epoch": 0.5821001459368998, "grad_norm": 0.14512786269187927, "learning_rate": 0.001, "loss": 2.6798, "num_input_tokens_seen": 27787260160, "step": 26500 }, { "epoch": 0.5821001459368998, "eval_loss": 2.5802626609802246, "eval_runtime": 67.1032, "eval_samples_per_second": 74.512, "eval_steps_per_second": 18.628, "num_input_tokens_seen": 27787260160, "step": 26500 }, { "epoch": 0.5831984480990449, "grad_norm": 0.15365912020206451, "learning_rate": 0.001, "loss": 2.6813, "num_input_tokens_seen": 27839688960, "step": 26550 }, { "epoch": 0.58429675026119, "grad_norm": 0.14015646278858185, "learning_rate": 0.001, "loss": 2.6774, "num_input_tokens_seen": 27892117760, "step": 26600 }, { "epoch": 0.5853950524233351, "grad_norm": 0.1529797911643982, "learning_rate": 0.001, "loss": 2.6751, "num_input_tokens_seen": 27944546560, "step": 26650 }, { "epoch": 0.5864933545854801, "grad_norm": 0.16909636557102203, "learning_rate": 0.001, "loss": 2.6795, "num_input_tokens_seen": 27996975360, "step": 26700 }, { "epoch": 0.5875916567476253, "grad_norm": 0.14130276441574097, "learning_rate": 0.001, "loss": 2.6809, "num_input_tokens_seen": 28049404160, "step": 26750 }, { "epoch": 0.5886899589097704, "grad_norm": 0.15182790160179138, "learning_rate": 0.001, "loss": 2.685, "num_input_tokens_seen": 28101832960, "step": 26800 }, { "epoch": 0.5897882610719154, "grad_norm": 0.12757331132888794, "learning_rate": 0.001, "loss": 2.6766, "num_input_tokens_seen": 28154261760, "step": 26850 }, { "epoch": 0.5908865632340605, "grad_norm": 0.1527504026889801, "learning_rate": 0.001, "loss": 2.6767, "num_input_tokens_seen": 28206690560, "step": 26900 }, { "epoch": 0.5919848653962057, "grad_norm": 0.18337304890155792, "learning_rate": 0.001, "loss": 2.6752, "num_input_tokens_seen": 28259119360, "step": 26950 }, { "epoch": 0.5930831675583508, "grad_norm": 0.1472473442554474, "learning_rate": 0.001, "loss": 2.6717, "num_input_tokens_seen": 28311548160, "step": 27000 }, { "epoch": 0.5930831675583508, "eval_loss": 2.5781941413879395, "eval_runtime": 66.2194, "eval_samples_per_second": 75.507, "eval_steps_per_second": 18.877, "num_input_tokens_seen": 28311548160, "step": 27000 }, { "epoch": 0.5941814697204958, "grad_norm": 0.15350718796253204, "learning_rate": 0.001, "loss": 2.6787, "num_input_tokens_seen": 28363976960, "step": 27050 }, { "epoch": 0.5952797718826409, "grad_norm": 0.1393333077430725, "learning_rate": 0.001, "loss": 2.6759, "num_input_tokens_seen": 28416405760, "step": 27100 }, { "epoch": 0.596378074044786, "grad_norm": 0.1485709846019745, "learning_rate": 0.001, "loss": 2.6772, "num_input_tokens_seen": 28468834560, "step": 27150 }, { "epoch": 0.5974763762069311, "grad_norm": 0.13909003138542175, "learning_rate": 0.001, "loss": 2.6729, "num_input_tokens_seen": 28521263360, "step": 27200 }, { "epoch": 0.5985746783690762, "grad_norm": 0.15117496252059937, "learning_rate": 0.001, "loss": 2.6704, "num_input_tokens_seen": 28573692160, "step": 27250 }, { "epoch": 0.5996729805312213, "grad_norm": 0.14054876565933228, "learning_rate": 0.001, "loss": 2.6748, "num_input_tokens_seen": 28626120960, "step": 27300 }, { "epoch": 0.6007712826933664, "grad_norm": 0.15437620878219604, "learning_rate": 0.001, "loss": 2.6778, "num_input_tokens_seen": 28678549760, "step": 27350 }, { "epoch": 0.6018695848555115, "grad_norm": 0.15858007967472076, "learning_rate": 0.001, "loss": 2.6763, "num_input_tokens_seen": 28730978560, "step": 27400 }, { "epoch": 0.6029678870176566, "grad_norm": 0.14459487795829773, "learning_rate": 0.001, "loss": 2.6726, "num_input_tokens_seen": 28783407360, "step": 27450 }, { "epoch": 0.6040661891798017, "grad_norm": 0.17691345512866974, "learning_rate": 0.001, "loss": 2.678, "num_input_tokens_seen": 28835836160, "step": 27500 }, { "epoch": 0.6040661891798017, "eval_loss": 2.576051950454712, "eval_runtime": 66.9387, "eval_samples_per_second": 74.695, "eval_steps_per_second": 18.674, "num_input_tokens_seen": 28835836160, "step": 27500 }, { "epoch": 0.6051644913419467, "grad_norm": 0.16200922429561615, "learning_rate": 0.001, "loss": 2.6763, "num_input_tokens_seen": 28888264960, "step": 27550 }, { "epoch": 0.6062627935040918, "grad_norm": 0.14567038416862488, "learning_rate": 0.001, "loss": 2.6795, "num_input_tokens_seen": 28940693760, "step": 27600 }, { "epoch": 0.607361095666237, "grad_norm": 0.16075611114501953, "learning_rate": 0.001, "loss": 2.6746, "num_input_tokens_seen": 28993122560, "step": 27650 }, { "epoch": 0.6084593978283821, "grad_norm": 0.1386987417936325, "learning_rate": 0.001, "loss": 2.6771, "num_input_tokens_seen": 29045551360, "step": 27700 }, { "epoch": 0.6095576999905271, "grad_norm": 0.14672614634037018, "learning_rate": 0.001, "loss": 2.6792, "num_input_tokens_seen": 29097980160, "step": 27750 }, { "epoch": 0.6106560021526722, "grad_norm": 0.22614523768424988, "learning_rate": 0.001, "loss": 2.6728, "num_input_tokens_seen": 29150408960, "step": 27800 }, { "epoch": 0.6117543043148174, "grad_norm": 0.15554341673851013, "learning_rate": 0.001, "loss": 2.676, "num_input_tokens_seen": 29202837760, "step": 27850 }, { "epoch": 0.6128526064769624, "grad_norm": 0.17181837558746338, "learning_rate": 0.001, "loss": 2.6811, "num_input_tokens_seen": 29255266560, "step": 27900 }, { "epoch": 0.6139509086391075, "grad_norm": 0.15763437747955322, "learning_rate": 0.001, "loss": 2.6797, "num_input_tokens_seen": 29307695360, "step": 27950 }, { "epoch": 0.6150492108012526, "grad_norm": 0.14721135795116425, "learning_rate": 0.001, "loss": 2.6762, "num_input_tokens_seen": 29360124160, "step": 28000 }, { "epoch": 0.6150492108012526, "eval_loss": 2.5763511657714844, "eval_runtime": 66.3236, "eval_samples_per_second": 75.388, "eval_steps_per_second": 18.847, "num_input_tokens_seen": 29360124160, "step": 28000 }, { "epoch": 0.6161475129633978, "grad_norm": 0.13857993483543396, "learning_rate": 0.001, "loss": 2.677, "num_input_tokens_seen": 29412552960, "step": 28050 }, { "epoch": 0.6172458151255428, "grad_norm": 0.14276473224163055, "learning_rate": 0.001, "loss": 2.6669, "num_input_tokens_seen": 29464981760, "step": 28100 }, { "epoch": 0.6183441172876879, "grad_norm": 0.1536131203174591, "learning_rate": 0.001, "loss": 2.6757, "num_input_tokens_seen": 29517410560, "step": 28150 }, { "epoch": 0.619442419449833, "grad_norm": 0.15733414888381958, "learning_rate": 0.001, "loss": 2.6735, "num_input_tokens_seen": 29569839360, "step": 28200 }, { "epoch": 0.620540721611978, "grad_norm": 0.14553523063659668, "learning_rate": 0.001, "loss": 2.6683, "num_input_tokens_seen": 29622268160, "step": 28250 }, { "epoch": 0.6216390237741232, "grad_norm": 0.15685459971427917, "learning_rate": 0.001, "loss": 2.6692, "num_input_tokens_seen": 29674696960, "step": 28300 }, { "epoch": 0.6227373259362683, "grad_norm": 0.16553767025470734, "learning_rate": 0.001, "loss": 2.6778, "num_input_tokens_seen": 29727125760, "step": 28350 }, { "epoch": 0.6238356280984134, "grad_norm": 0.1619853973388672, "learning_rate": 0.001, "loss": 2.6807, "num_input_tokens_seen": 29779554560, "step": 28400 }, { "epoch": 0.6249339302605584, "grad_norm": 0.12794817984104156, "learning_rate": 0.001, "loss": 2.6776, "num_input_tokens_seen": 29831983360, "step": 28450 }, { "epoch": 0.6260322324227036, "grad_norm": 0.17001128196716309, "learning_rate": 0.001, "loss": 2.6797, "num_input_tokens_seen": 29884412160, "step": 28500 }, { "epoch": 0.6260322324227036, "eval_loss": 2.5728061199188232, "eval_runtime": 66.7752, "eval_samples_per_second": 74.878, "eval_steps_per_second": 18.72, "num_input_tokens_seen": 29884412160, "step": 28500 }, { "epoch": 0.6271305345848487, "grad_norm": 0.12936875224113464, "learning_rate": 0.001, "loss": 2.6677, "num_input_tokens_seen": 29936840960, "step": 28550 }, { "epoch": 0.6282288367469937, "grad_norm": 0.14839358627796173, "learning_rate": 0.001, "loss": 2.6681, "num_input_tokens_seen": 29989269760, "step": 28600 }, { "epoch": 0.6293271389091388, "grad_norm": 0.1526126265525818, "learning_rate": 0.001, "loss": 2.6711, "num_input_tokens_seen": 30041698560, "step": 28650 }, { "epoch": 0.630425441071284, "grad_norm": 11.806962013244629, "learning_rate": 0.001, "loss": 2.7543, "num_input_tokens_seen": 30094127360, "step": 28700 }, { "epoch": 0.631523743233429, "grad_norm": 0.13446328043937683, "learning_rate": 0.001, "loss": 2.9466, "num_input_tokens_seen": 30146556160, "step": 28750 }, { "epoch": 0.6326220453955741, "grad_norm": 0.1319582760334015, "learning_rate": 0.001, "loss": 2.7002, "num_input_tokens_seen": 30198984960, "step": 28800 }, { "epoch": 0.6337203475577192, "grad_norm": 0.13955356180667877, "learning_rate": 0.001, "loss": 2.6814, "num_input_tokens_seen": 30251413760, "step": 28850 }, { "epoch": 0.6348186497198643, "grad_norm": 0.1295064240694046, "learning_rate": 0.001, "loss": 2.676, "num_input_tokens_seen": 30303842560, "step": 28900 }, { "epoch": 0.6359169518820094, "grad_norm": 0.1440495401620865, "learning_rate": 0.001, "loss": 2.6778, "num_input_tokens_seen": 30356271360, "step": 28950 }, { "epoch": 0.6370152540441545, "grad_norm": 0.13806115090847015, "learning_rate": 0.001, "loss": 2.6712, "num_input_tokens_seen": 30408700160, "step": 29000 }, { "epoch": 0.6370152540441545, "eval_loss": 2.576237440109253, "eval_runtime": 66.9761, "eval_samples_per_second": 74.653, "eval_steps_per_second": 18.663, "num_input_tokens_seen": 30408700160, "step": 29000 }, { "epoch": 0.6381135562062996, "grad_norm": 0.13853897154331207, "learning_rate": 0.001, "loss": 2.6719, "num_input_tokens_seen": 30461128960, "step": 29050 }, { "epoch": 0.6392118583684446, "grad_norm": 0.14228977262973785, "learning_rate": 0.001, "loss": 2.6788, "num_input_tokens_seen": 30513557760, "step": 29100 }, { "epoch": 0.6403101605305898, "grad_norm": 0.13464143872261047, "learning_rate": 0.001, "loss": 2.6743, "num_input_tokens_seen": 30565986560, "step": 29150 }, { "epoch": 0.6414084626927349, "grad_norm": 0.15960821509361267, "learning_rate": 0.001, "loss": 2.6729, "num_input_tokens_seen": 30618415360, "step": 29200 }, { "epoch": 0.64250676485488, "grad_norm": 0.13830585777759552, "learning_rate": 0.001, "loss": 2.6723, "num_input_tokens_seen": 30670844160, "step": 29250 }, { "epoch": 0.643605067017025, "grad_norm": 0.14440728724002838, "learning_rate": 0.001, "loss": 2.664, "num_input_tokens_seen": 30723272960, "step": 29300 }, { "epoch": 0.6447033691791701, "grad_norm": 0.14259463548660278, "learning_rate": 0.001, "loss": 2.6675, "num_input_tokens_seen": 30775701760, "step": 29350 }, { "epoch": 0.6458016713413153, "grad_norm": 0.1462564468383789, "learning_rate": 0.001, "loss": 2.6671, "num_input_tokens_seen": 30828130560, "step": 29400 }, { "epoch": 0.6468999735034603, "grad_norm": 0.1443469077348709, "learning_rate": 0.001, "loss": 2.6667, "num_input_tokens_seen": 30880559360, "step": 29450 }, { "epoch": 0.6479982756656054, "grad_norm": 0.143255814909935, "learning_rate": 0.001, "loss": 2.6652, "num_input_tokens_seen": 30932988160, "step": 29500 }, { "epoch": 0.6479982756656054, "eval_loss": 2.569544792175293, "eval_runtime": 66.8674, "eval_samples_per_second": 74.775, "eval_steps_per_second": 18.694, "num_input_tokens_seen": 30932988160, "step": 29500 }, { "epoch": 0.6490965778277505, "grad_norm": 0.15149758756160736, "learning_rate": 0.001, "loss": 2.6681, "num_input_tokens_seen": 30985416960, "step": 29550 }, { "epoch": 0.6501948799898957, "grad_norm": 0.15703468024730682, "learning_rate": 0.001, "loss": 2.6681, "num_input_tokens_seen": 31037845760, "step": 29600 }, { "epoch": 0.6512931821520407, "grad_norm": 0.14332515001296997, "learning_rate": 0.001, "loss": 2.6622, "num_input_tokens_seen": 31090274560, "step": 29650 }, { "epoch": 0.6523914843141858, "grad_norm": 0.13763870298862457, "learning_rate": 0.001, "loss": 2.6724, "num_input_tokens_seen": 31142703360, "step": 29700 }, { "epoch": 0.6534897864763309, "grad_norm": 0.11858976632356644, "learning_rate": 0.001, "loss": 2.6743, "num_input_tokens_seen": 31195132160, "step": 29750 }, { "epoch": 0.654588088638476, "grad_norm": 0.15627937018871307, "learning_rate": 0.001, "loss": 2.6653, "num_input_tokens_seen": 31247560960, "step": 29800 }, { "epoch": 0.6556863908006211, "grad_norm": 0.15052759647369385, "learning_rate": 0.001, "loss": 2.6684, "num_input_tokens_seen": 31299989760, "step": 29850 }, { "epoch": 0.6567846929627662, "grad_norm": 0.1648450791835785, "learning_rate": 0.001, "loss": 2.6783, "num_input_tokens_seen": 31352418560, "step": 29900 }, { "epoch": 0.6578829951249113, "grad_norm": 0.13318586349487305, "learning_rate": 0.001, "loss": 2.6712, "num_input_tokens_seen": 31404847360, "step": 29950 }, { "epoch": 0.6589812972870563, "grad_norm": 0.1517287641763687, "learning_rate": 0.001, "loss": 2.6688, "num_input_tokens_seen": 31457276160, "step": 30000 }, { "epoch": 0.6589812972870563, "eval_loss": 2.5676708221435547, "eval_runtime": 66.0876, "eval_samples_per_second": 75.657, "eval_steps_per_second": 18.914, "num_input_tokens_seen": 31457276160, "step": 30000 }, { "epoch": 0.6600795994492015, "grad_norm": 0.14465224742889404, "learning_rate": 0.001, "loss": 2.6657, "num_input_tokens_seen": 31509704960, "step": 30050 }, { "epoch": 0.6611779016113466, "grad_norm": 0.16096332669258118, "learning_rate": 0.001, "loss": 2.6612, "num_input_tokens_seen": 31562133760, "step": 30100 }, { "epoch": 0.6622762037734916, "grad_norm": 0.1434296816587448, "learning_rate": 0.001, "loss": 2.6695, "num_input_tokens_seen": 31614562560, "step": 30150 }, { "epoch": 0.6633745059356367, "grad_norm": 0.13844367861747742, "learning_rate": 0.001, "loss": 2.6649, "num_input_tokens_seen": 31666991360, "step": 30200 }, { "epoch": 0.6644728080977819, "grad_norm": 0.1579446643590927, "learning_rate": 0.001, "loss": 2.6701, "num_input_tokens_seen": 31719420160, "step": 30250 }, { "epoch": 0.665571110259927, "grad_norm": 0.1585385501384735, "learning_rate": 0.001, "loss": 2.665, "num_input_tokens_seen": 31771848960, "step": 30300 }, { "epoch": 0.666669412422072, "grad_norm": 0.18768636882305145, "learning_rate": 0.001, "loss": 2.6708, "num_input_tokens_seen": 31824277760, "step": 30350 }, { "epoch": 0.6677677145842171, "grad_norm": 0.13027966022491455, "learning_rate": 0.001, "loss": 2.6657, "num_input_tokens_seen": 31876706560, "step": 30400 }, { "epoch": 0.6688660167463623, "grad_norm": 0.13473722338676453, "learning_rate": 0.001, "loss": 2.6658, "num_input_tokens_seen": 31929135360, "step": 30450 }, { "epoch": 0.6699643189085073, "grad_norm": 0.14617317914962769, "learning_rate": 0.001, "loss": 2.664, "num_input_tokens_seen": 31981564160, "step": 30500 }, { "epoch": 0.6699643189085073, "eval_loss": 2.5658769607543945, "eval_runtime": 67.5011, "eval_samples_per_second": 74.073, "eval_steps_per_second": 18.518, "num_input_tokens_seen": 31981564160, "step": 30500 }, { "epoch": 0.6710626210706524, "grad_norm": 0.14581717550754547, "learning_rate": 0.001, "loss": 2.6654, "num_input_tokens_seen": 32033992960, "step": 30550 }, { "epoch": 0.6721609232327975, "grad_norm": 0.12281567603349686, "learning_rate": 0.001, "loss": 2.6649, "num_input_tokens_seen": 32086421760, "step": 30600 }, { "epoch": 0.6732592253949425, "grad_norm": 0.14368072152137756, "learning_rate": 0.001, "loss": 2.6605, "num_input_tokens_seen": 32138850560, "step": 30650 }, { "epoch": 0.6743575275570877, "grad_norm": 0.14596907794475555, "learning_rate": 0.001, "loss": 2.6651, "num_input_tokens_seen": 32191279360, "step": 30700 }, { "epoch": 0.6754558297192328, "grad_norm": 0.15414392948150635, "learning_rate": 0.001, "loss": 2.6696, "num_input_tokens_seen": 32243708160, "step": 30750 }, { "epoch": 0.6765541318813779, "grad_norm": 0.14875884354114532, "learning_rate": 0.001, "loss": 2.6662, "num_input_tokens_seen": 32296136960, "step": 30800 }, { "epoch": 0.6776524340435229, "grad_norm": 0.13774773478507996, "learning_rate": 0.001, "loss": 2.6649, "num_input_tokens_seen": 32348565760, "step": 30850 }, { "epoch": 0.6787507362056681, "grad_norm": 0.1647578626871109, "learning_rate": 0.001, "loss": 2.6693, "num_input_tokens_seen": 32400994560, "step": 30900 }, { "epoch": 0.6798490383678132, "grad_norm": 0.1620490700006485, "learning_rate": 0.001, "loss": 2.6726, "num_input_tokens_seen": 32453423360, "step": 30950 }, { "epoch": 0.6809473405299582, "grad_norm": 0.14238062500953674, "learning_rate": 0.001, "loss": 2.6681, "num_input_tokens_seen": 32505852160, "step": 31000 }, { "epoch": 0.6809473405299582, "eval_loss": 2.5645763874053955, "eval_runtime": 65.7725, "eval_samples_per_second": 76.02, "eval_steps_per_second": 19.005, "num_input_tokens_seen": 32505852160, "step": 31000 }, { "epoch": 0.6820456426921033, "grad_norm": 0.143716499209404, "learning_rate": 0.001, "loss": 2.6591, "num_input_tokens_seen": 32558280960, "step": 31050 }, { "epoch": 0.6831439448542485, "grad_norm": 0.16048283874988556, "learning_rate": 0.001, "loss": 2.659, "num_input_tokens_seen": 32610709760, "step": 31100 }, { "epoch": 0.6842422470163936, "grad_norm": 0.15203309059143066, "learning_rate": 0.001, "loss": 2.6703, "num_input_tokens_seen": 32663138560, "step": 31150 }, { "epoch": 0.6853405491785386, "grad_norm": 0.14977113902568817, "learning_rate": 0.001, "loss": 2.6657, "num_input_tokens_seen": 32715567360, "step": 31200 }, { "epoch": 0.6864388513406837, "grad_norm": 0.15292279422283173, "learning_rate": 0.001, "loss": 2.6629, "num_input_tokens_seen": 32767996160, "step": 31250 }, { "epoch": 0.6875371535028288, "grad_norm": 0.13721971213817596, "learning_rate": 0.001, "loss": 2.6641, "num_input_tokens_seen": 32820424960, "step": 31300 }, { "epoch": 0.6886354556649739, "grad_norm": 0.15564891695976257, "learning_rate": 0.001, "loss": 2.6673, "num_input_tokens_seen": 32872853760, "step": 31350 }, { "epoch": 0.689733757827119, "grad_norm": 0.15267717838287354, "learning_rate": 0.001, "loss": 2.6624, "num_input_tokens_seen": 32925282560, "step": 31400 }, { "epoch": 0.6908320599892641, "grad_norm": 0.15039384365081787, "learning_rate": 0.001, "loss": 2.6615, "num_input_tokens_seen": 32977711360, "step": 31450 }, { "epoch": 0.6919303621514092, "grad_norm": 0.14114901423454285, "learning_rate": 0.001, "loss": 2.6663, "num_input_tokens_seen": 33030140160, "step": 31500 }, { "epoch": 0.6919303621514092, "eval_loss": 2.5618767738342285, "eval_runtime": 66.9611, "eval_samples_per_second": 74.67, "eval_steps_per_second": 18.668, "num_input_tokens_seen": 33030140160, "step": 31500 }, { "epoch": 0.6930286643135543, "grad_norm": 0.1415725201368332, "learning_rate": 0.001, "loss": 2.6606, "num_input_tokens_seen": 33082568960, "step": 31550 }, { "epoch": 0.6941269664756994, "grad_norm": 0.14324156939983368, "learning_rate": 0.001, "loss": 2.6616, "num_input_tokens_seen": 33134997760, "step": 31600 }, { "epoch": 0.6952252686378445, "grad_norm": 0.1544431746006012, "learning_rate": 0.001, "loss": 2.6567, "num_input_tokens_seen": 33187426560, "step": 31650 }, { "epoch": 0.6963235707999895, "grad_norm": 0.14641186594963074, "learning_rate": 0.001, "loss": 2.6605, "num_input_tokens_seen": 33239855360, "step": 31700 }, { "epoch": 0.6974218729621346, "grad_norm": 0.13757406175136566, "learning_rate": 0.001, "loss": 2.673, "num_input_tokens_seen": 33292284160, "step": 31750 }, { "epoch": 0.6985201751242798, "grad_norm": 0.14516425132751465, "learning_rate": 0.001, "loss": 2.6781, "num_input_tokens_seen": 33344712960, "step": 31800 }, { "epoch": 0.6996184772864249, "grad_norm": 0.15246887505054474, "learning_rate": 0.001, "loss": 2.6683, "num_input_tokens_seen": 33397141760, "step": 31850 }, { "epoch": 0.7007167794485699, "grad_norm": 0.1413787305355072, "learning_rate": 0.001, "loss": 2.6591, "num_input_tokens_seen": 33449570560, "step": 31900 }, { "epoch": 0.701815081610715, "grad_norm": 0.16077399253845215, "learning_rate": 0.001, "loss": 2.6628, "num_input_tokens_seen": 33501999360, "step": 31950 }, { "epoch": 0.7029133837728602, "grad_norm": 0.1555839478969574, "learning_rate": 0.001, "loss": 2.6631, "num_input_tokens_seen": 33554428160, "step": 32000 }, { "epoch": 0.7029133837728602, "eval_loss": 2.561042547225952, "eval_runtime": 66.7879, "eval_samples_per_second": 74.864, "eval_steps_per_second": 18.716, "num_input_tokens_seen": 33554428160, "step": 32000 }, { "epoch": 0.7040116859350052, "grad_norm": 0.15333816409111023, "learning_rate": 0.001, "loss": 2.6605, "num_input_tokens_seen": 33606856960, "step": 32050 }, { "epoch": 0.7051099880971503, "grad_norm": 0.14965052902698517, "learning_rate": 0.001, "loss": 2.6551, "num_input_tokens_seen": 33659285760, "step": 32100 }, { "epoch": 0.7062082902592954, "grad_norm": 0.1994074285030365, "learning_rate": 0.001, "loss": 2.6652, "num_input_tokens_seen": 33711714560, "step": 32150 }, { "epoch": 0.7073065924214406, "grad_norm": 0.3089894652366638, "learning_rate": 0.001, "loss": 2.6814, "num_input_tokens_seen": 33764143360, "step": 32200 }, { "epoch": 0.7084048945835856, "grad_norm": 0.14903652667999268, "learning_rate": 0.001, "loss": 2.6834, "num_input_tokens_seen": 33816572160, "step": 32250 }, { "epoch": 0.7095031967457307, "grad_norm": 0.17594854533672333, "learning_rate": 0.001, "loss": 2.6618, "num_input_tokens_seen": 33869000960, "step": 32300 }, { "epoch": 0.7106014989078758, "grad_norm": 0.15634667873382568, "learning_rate": 0.001, "loss": 2.6663, "num_input_tokens_seen": 33921429760, "step": 32350 }, { "epoch": 0.7116998010700208, "grad_norm": 0.13893702626228333, "learning_rate": 0.001, "loss": 2.67, "num_input_tokens_seen": 33973858560, "step": 32400 }, { "epoch": 0.712798103232166, "grad_norm": 0.16974663734436035, "learning_rate": 0.001, "loss": 2.6686, "num_input_tokens_seen": 34026287360, "step": 32450 }, { "epoch": 0.7138964053943111, "grad_norm": 0.15336968004703522, "learning_rate": 0.001, "loss": 2.6703, "num_input_tokens_seen": 34078716160, "step": 32500 }, { "epoch": 0.7138964053943111, "eval_loss": 2.5648574829101562, "eval_runtime": 66.0796, "eval_samples_per_second": 75.666, "eval_steps_per_second": 18.917, "num_input_tokens_seen": 34078716160, "step": 32500 }, { "epoch": 0.7149947075564561, "grad_norm": 1.428727626800537, "learning_rate": 0.001, "loss": 2.8433, "num_input_tokens_seen": 34131144960, "step": 32550 }, { "epoch": 0.7160930097186012, "grad_norm": 0.1666879504919052, "learning_rate": 0.001, "loss": 2.7236, "num_input_tokens_seen": 34183573760, "step": 32600 }, { "epoch": 0.7171913118807464, "grad_norm": 0.16038021445274353, "learning_rate": 0.001, "loss": 2.6876, "num_input_tokens_seen": 34236002560, "step": 32650 }, { "epoch": 0.7182896140428915, "grad_norm": 0.1514110267162323, "learning_rate": 0.001, "loss": 2.6717, "num_input_tokens_seen": 34288431360, "step": 32700 }, { "epoch": 0.7193879162050365, "grad_norm": 0.13304661214351654, "learning_rate": 0.001, "loss": 2.6664, "num_input_tokens_seen": 34340860160, "step": 32750 }, { "epoch": 0.7204862183671816, "grad_norm": 0.15957415103912354, "learning_rate": 0.001, "loss": 2.6683, "num_input_tokens_seen": 34393288960, "step": 32800 }, { "epoch": 0.7215845205293268, "grad_norm": 0.14532499015331268, "learning_rate": 0.001, "loss": 2.6632, "num_input_tokens_seen": 34445717760, "step": 32850 }, { "epoch": 0.7226828226914718, "grad_norm": 0.1402454972267151, "learning_rate": 0.001, "loss": 2.6631, "num_input_tokens_seen": 34498146560, "step": 32900 }, { "epoch": 0.7237811248536169, "grad_norm": 0.17248420417308807, "learning_rate": 0.001, "loss": 2.6743, "num_input_tokens_seen": 34550575360, "step": 32950 }, { "epoch": 0.724879427015762, "grad_norm": 0.1455400288105011, "learning_rate": 0.001, "loss": 2.6598, "num_input_tokens_seen": 34603004160, "step": 33000 }, { "epoch": 0.724879427015762, "eval_loss": 2.5639312267303467, "eval_runtime": 66.9575, "eval_samples_per_second": 74.674, "eval_steps_per_second": 18.669, "num_input_tokens_seen": 34603004160, "step": 33000 }, { "epoch": 0.7259777291779071, "grad_norm": 0.14448963105678558, "learning_rate": 0.001, "loss": 2.6579, "num_input_tokens_seen": 34655432960, "step": 33050 }, { "epoch": 0.7270760313400522, "grad_norm": 0.15785731375217438, "learning_rate": 0.001, "loss": 2.6641, "num_input_tokens_seen": 34707861760, "step": 33100 }, { "epoch": 0.7281743335021973, "grad_norm": 0.14524365961551666, "learning_rate": 0.001, "loss": 2.6639, "num_input_tokens_seen": 34760290560, "step": 33150 }, { "epoch": 0.7292726356643424, "grad_norm": 0.17661139369010925, "learning_rate": 0.001, "loss": 2.666, "num_input_tokens_seen": 34812719360, "step": 33200 }, { "epoch": 0.7303709378264874, "grad_norm": 0.14052839577198029, "learning_rate": 0.001, "loss": 2.6638, "num_input_tokens_seen": 34865148160, "step": 33250 }, { "epoch": 0.7314692399886326, "grad_norm": 0.14182330667972565, "learning_rate": 0.001, "loss": 2.6618, "num_input_tokens_seen": 34917576960, "step": 33300 }, { "epoch": 0.7325675421507777, "grad_norm": 0.168069988489151, "learning_rate": 0.001, "loss": 2.6655, "num_input_tokens_seen": 34970005760, "step": 33350 }, { "epoch": 0.7336658443129228, "grad_norm": 0.1627034991979599, "learning_rate": 0.001, "loss": 2.6646, "num_input_tokens_seen": 35022434560, "step": 33400 }, { "epoch": 0.7347641464750678, "grad_norm": 0.1257403939962387, "learning_rate": 0.001, "loss": 2.6682, "num_input_tokens_seen": 35074863360, "step": 33450 }, { "epoch": 0.735862448637213, "grad_norm": 0.15367744863033295, "learning_rate": 0.001, "loss": 2.6693, "num_input_tokens_seen": 35127292160, "step": 33500 }, { "epoch": 0.735862448637213, "eval_loss": 2.5610554218292236, "eval_runtime": 67.0185, "eval_samples_per_second": 74.606, "eval_steps_per_second": 18.652, "num_input_tokens_seen": 35127292160, "step": 33500 }, { "epoch": 0.7369607507993581, "grad_norm": 0.16001376509666443, "learning_rate": 0.001, "loss": 2.6594, "num_input_tokens_seen": 35179720960, "step": 33550 }, { "epoch": 0.7380590529615031, "grad_norm": 0.14694422483444214, "learning_rate": 0.001, "loss": 2.6635, "num_input_tokens_seen": 35232149760, "step": 33600 }, { "epoch": 0.7391573551236482, "grad_norm": 0.15586304664611816, "learning_rate": 0.001, "loss": 2.6565, "num_input_tokens_seen": 35284578560, "step": 33650 }, { "epoch": 0.7402556572857933, "grad_norm": 0.16455145180225372, "learning_rate": 0.001, "loss": 2.6621, "num_input_tokens_seen": 35337007360, "step": 33700 }, { "epoch": 0.7413539594479385, "grad_norm": 0.13630282878875732, "learning_rate": 0.001, "loss": 2.6658, "num_input_tokens_seen": 35389436160, "step": 33750 }, { "epoch": 0.7424522616100835, "grad_norm": 0.15180189907550812, "learning_rate": 0.001, "loss": 2.6593, "num_input_tokens_seen": 35441864960, "step": 33800 }, { "epoch": 0.7435505637722286, "grad_norm": 0.16608890891075134, "learning_rate": 0.001, "loss": 2.6777, "num_input_tokens_seen": 35494293760, "step": 33850 }, { "epoch": 0.7446488659343737, "grad_norm": 0.31720519065856934, "learning_rate": 0.001, "loss": 2.6685, "num_input_tokens_seen": 35546722560, "step": 33900 }, { "epoch": 0.7457471680965188, "grad_norm": 0.24131393432617188, "learning_rate": 0.001, "loss": 2.6682, "num_input_tokens_seen": 35599151360, "step": 33950 }, { "epoch": 0.7468454702586639, "grad_norm": 0.1594172567129135, "learning_rate": 0.001, "loss": 2.6575, "num_input_tokens_seen": 35651580160, "step": 34000 }, { "epoch": 0.7468454702586639, "eval_loss": 2.5587804317474365, "eval_runtime": 66.6197, "eval_samples_per_second": 75.053, "eval_steps_per_second": 18.763, "num_input_tokens_seen": 35651580160, "step": 34000 }, { "epoch": 0.747943772420809, "grad_norm": 0.1586858183145523, "learning_rate": 0.001, "loss": 2.6654, "num_input_tokens_seen": 35704008960, "step": 34050 }, { "epoch": 0.749042074582954, "grad_norm": 0.1376073956489563, "learning_rate": 0.001, "loss": 2.6627, "num_input_tokens_seen": 35756437760, "step": 34100 }, { "epoch": 0.7501403767450991, "grad_norm": 0.13904818892478943, "learning_rate": 0.001, "loss": 2.6605, "num_input_tokens_seen": 35808866560, "step": 34150 }, { "epoch": 0.7512386789072443, "grad_norm": 0.14543947577476501, "learning_rate": 0.001, "loss": 2.6589, "num_input_tokens_seen": 35861295360, "step": 34200 }, { "epoch": 0.7523369810693894, "grad_norm": 0.14855198562145233, "learning_rate": 0.001, "loss": 2.6612, "num_input_tokens_seen": 35913724160, "step": 34250 }, { "epoch": 0.7534352832315344, "grad_norm": 0.14492908120155334, "learning_rate": 0.001, "loss": 2.6561, "num_input_tokens_seen": 35966152960, "step": 34300 }, { "epoch": 0.7545335853936795, "grad_norm": 0.1388978660106659, "learning_rate": 0.001, "loss": 2.6551, "num_input_tokens_seen": 36018581760, "step": 34350 }, { "epoch": 0.7556318875558247, "grad_norm": 0.14582422375679016, "learning_rate": 0.001, "loss": 2.6521, "num_input_tokens_seen": 36071010560, "step": 34400 }, { "epoch": 0.7567301897179697, "grad_norm": 0.17488695681095123, "learning_rate": 0.001, "loss": 2.6516, "num_input_tokens_seen": 36123439360, "step": 34450 }, { "epoch": 0.7578284918801148, "grad_norm": 0.12302416563034058, "learning_rate": 0.001, "loss": 2.6617, "num_input_tokens_seen": 36175868160, "step": 34500 }, { "epoch": 0.7578284918801148, "eval_loss": 2.5549991130828857, "eval_runtime": 67.5095, "eval_samples_per_second": 74.064, "eval_steps_per_second": 18.516, "num_input_tokens_seen": 36175868160, "step": 34500 }, { "epoch": 0.7589267940422599, "grad_norm": 0.14238396286964417, "learning_rate": 0.001, "loss": 2.6609, "num_input_tokens_seen": 36228296960, "step": 34550 }, { "epoch": 0.7600250962044051, "grad_norm": 0.17919403314590454, "learning_rate": 0.001, "loss": 2.6621, "num_input_tokens_seen": 36280725760, "step": 34600 }, { "epoch": 0.7611233983665501, "grad_norm": 0.13188666105270386, "learning_rate": 0.001, "loss": 2.6529, "num_input_tokens_seen": 36333154560, "step": 34650 }, { "epoch": 0.7622217005286952, "grad_norm": 0.16191646456718445, "learning_rate": 0.001, "loss": 2.6584, "num_input_tokens_seen": 36385583360, "step": 34700 }, { "epoch": 0.7633200026908403, "grad_norm": 0.14606165885925293, "learning_rate": 0.001, "loss": 2.6567, "num_input_tokens_seen": 36438012160, "step": 34750 }, { "epoch": 0.7644183048529853, "grad_norm": 0.1648443192243576, "learning_rate": 0.001, "loss": 2.6587, "num_input_tokens_seen": 36490440960, "step": 34800 }, { "epoch": 0.7655166070151305, "grad_norm": 0.19523674249649048, "learning_rate": 0.001, "loss": 2.6662, "num_input_tokens_seen": 36542869760, "step": 34850 }, { "epoch": 0.7666149091772756, "grad_norm": 0.1713179498910904, "learning_rate": 0.001, "loss": 2.6683, "num_input_tokens_seen": 36595298560, "step": 34900 }, { "epoch": 0.7677132113394207, "grad_norm": 0.14923711121082306, "learning_rate": 0.001, "loss": 2.6629, "num_input_tokens_seen": 36647727360, "step": 34950 }, { "epoch": 0.7688115135015657, "grad_norm": 0.13948023319244385, "learning_rate": 0.001, "loss": 2.6619, "num_input_tokens_seen": 36700156160, "step": 35000 }, { "epoch": 0.7688115135015657, "eval_loss": 2.5569379329681396, "eval_runtime": 67.9393, "eval_samples_per_second": 73.595, "eval_steps_per_second": 18.399, "num_input_tokens_seen": 36700156160, "step": 35000 }, { "epoch": 0.7699098156637109, "grad_norm": 0.14624406397342682, "learning_rate": 0.001, "loss": 2.657, "num_input_tokens_seen": 36752584960, "step": 35050 }, { "epoch": 0.771008117825856, "grad_norm": 0.16855786740779877, "learning_rate": 0.001, "loss": 2.6585, "num_input_tokens_seen": 36805013760, "step": 35100 }, { "epoch": 0.772106419988001, "grad_norm": 0.1439932882785797, "learning_rate": 0.001, "loss": 2.6653, "num_input_tokens_seen": 36857442560, "step": 35150 }, { "epoch": 0.7732047221501461, "grad_norm": 0.16299331188201904, "learning_rate": 0.001, "loss": 2.6621, "num_input_tokens_seen": 36909871360, "step": 35200 }, { "epoch": 0.7743030243122913, "grad_norm": 0.16961826384067535, "learning_rate": 0.001, "loss": 2.6545, "num_input_tokens_seen": 36962300160, "step": 35250 }, { "epoch": 0.7754013264744364, "grad_norm": 0.13337954878807068, "learning_rate": 0.001, "loss": 2.652, "num_input_tokens_seen": 37014728960, "step": 35300 }, { "epoch": 0.7764996286365814, "grad_norm": 0.1728074699640274, "learning_rate": 0.001, "loss": 2.6631, "num_input_tokens_seen": 37067157760, "step": 35350 }, { "epoch": 0.7775979307987265, "grad_norm": 0.16615192592144012, "learning_rate": 0.001, "loss": 2.6551, "num_input_tokens_seen": 37119586560, "step": 35400 }, { "epoch": 0.7786962329608716, "grad_norm": 0.1515650749206543, "learning_rate": 0.001, "loss": 2.6529, "num_input_tokens_seen": 37172015360, "step": 35450 }, { "epoch": 0.7797945351230167, "grad_norm": 0.1534053236246109, "learning_rate": 0.001, "loss": 2.6567, "num_input_tokens_seen": 37224444160, "step": 35500 }, { "epoch": 0.7797945351230167, "eval_loss": 2.55454683303833, "eval_runtime": 67.0727, "eval_samples_per_second": 74.546, "eval_steps_per_second": 18.637, "num_input_tokens_seen": 37224444160, "step": 35500 }, { "epoch": 0.7808928372851618, "grad_norm": 0.16377541422843933, "learning_rate": 0.001, "loss": 2.6552, "num_input_tokens_seen": 37276872960, "step": 35550 }, { "epoch": 0.7819911394473069, "grad_norm": 0.14807477593421936, "learning_rate": 0.001, "loss": 2.6563, "num_input_tokens_seen": 37329301760, "step": 35600 }, { "epoch": 0.783089441609452, "grad_norm": 0.13599660992622375, "learning_rate": 0.001, "loss": 2.6575, "num_input_tokens_seen": 37381730560, "step": 35650 }, { "epoch": 0.7841877437715971, "grad_norm": 0.16653482615947723, "learning_rate": 0.001, "loss": 2.6515, "num_input_tokens_seen": 37434159360, "step": 35700 }, { "epoch": 0.7852860459337422, "grad_norm": 0.15467293560504913, "learning_rate": 0.001, "loss": 2.6548, "num_input_tokens_seen": 37486588160, "step": 35750 }, { "epoch": 0.7863843480958873, "grad_norm": 0.4751467704772949, "learning_rate": 0.001, "loss": 2.6592, "num_input_tokens_seen": 37539016960, "step": 35800 }, { "epoch": 0.7874826502580323, "grad_norm": 0.15940867364406586, "learning_rate": 0.001, "loss": 2.6624, "num_input_tokens_seen": 37591445760, "step": 35850 }, { "epoch": 0.7885809524201775, "grad_norm": 0.137634739279747, "learning_rate": 0.001, "loss": 2.6559, "num_input_tokens_seen": 37643874560, "step": 35900 }, { "epoch": 0.7896792545823226, "grad_norm": 0.16022460162639618, "learning_rate": 0.001, "loss": 2.6555, "num_input_tokens_seen": 37696303360, "step": 35950 }, { "epoch": 0.7907775567444676, "grad_norm": 0.147109717130661, "learning_rate": 0.001, "loss": 2.663, "num_input_tokens_seen": 37748732160, "step": 36000 }, { "epoch": 0.7907775567444676, "eval_loss": 2.556107521057129, "eval_runtime": 67.1814, "eval_samples_per_second": 74.425, "eval_steps_per_second": 18.606, "num_input_tokens_seen": 37748732160, "step": 36000 }, { "epoch": 0.7918758589066127, "grad_norm": 0.16054154932498932, "learning_rate": 0.001, "loss": 2.6516, "num_input_tokens_seen": 37801160960, "step": 36050 }, { "epoch": 0.7929741610687578, "grad_norm": 0.15180550515651703, "learning_rate": 0.001, "loss": 2.6508, "num_input_tokens_seen": 37853589760, "step": 36100 }, { "epoch": 0.794072463230903, "grad_norm": 0.19564937055110931, "learning_rate": 0.001, "loss": 2.6532, "num_input_tokens_seen": 37906018560, "step": 36150 }, { "epoch": 0.795170765393048, "grad_norm": 0.15047501027584076, "learning_rate": 0.001, "loss": 2.6567, "num_input_tokens_seen": 37958447360, "step": 36200 }, { "epoch": 0.7962690675551931, "grad_norm": 0.1420314759016037, "learning_rate": 0.001, "loss": 2.6511, "num_input_tokens_seen": 38010876160, "step": 36250 }, { "epoch": 0.7973673697173382, "grad_norm": 0.14328153431415558, "learning_rate": 0.001, "loss": 2.6601, "num_input_tokens_seen": 38063304960, "step": 36300 }, { "epoch": 0.7984656718794833, "grad_norm": 0.15527622401714325, "learning_rate": 0.001, "loss": 2.6598, "num_input_tokens_seen": 38115733760, "step": 36350 }, { "epoch": 0.7995639740416284, "grad_norm": 0.15956974029541016, "learning_rate": 0.001, "loss": 2.6522, "num_input_tokens_seen": 38168162560, "step": 36400 }, { "epoch": 0.8006622762037735, "grad_norm": 0.15193034708499908, "learning_rate": 0.001, "loss": 2.6561, "num_input_tokens_seen": 38220591360, "step": 36450 }, { "epoch": 0.8017605783659186, "grad_norm": 0.1692439615726471, "learning_rate": 0.001, "loss": 2.653, "num_input_tokens_seen": 38273020160, "step": 36500 }, { "epoch": 0.8017605783659186, "eval_loss": 2.553743362426758, "eval_runtime": 66.3488, "eval_samples_per_second": 75.359, "eval_steps_per_second": 18.84, "num_input_tokens_seen": 38273020160, "step": 36500 }, { "epoch": 0.8028588805280636, "grad_norm": 0.473707377910614, "learning_rate": 0.001, "loss": 2.6604, "num_input_tokens_seen": 38325448960, "step": 36550 }, { "epoch": 0.8039571826902088, "grad_norm": 0.16226574778556824, "learning_rate": 0.001, "loss": 2.6615, "num_input_tokens_seen": 38377877760, "step": 36600 }, { "epoch": 0.8050554848523539, "grad_norm": 0.17274035513401031, "learning_rate": 0.001, "loss": 2.6616, "num_input_tokens_seen": 38430306560, "step": 36650 }, { "epoch": 0.8061537870144989, "grad_norm": 0.14171990752220154, "learning_rate": 0.001, "loss": 2.6628, "num_input_tokens_seen": 38482735360, "step": 36700 }, { "epoch": 0.807252089176644, "grad_norm": 0.3828020989894867, "learning_rate": 0.001, "loss": 2.6717, "num_input_tokens_seen": 38535164160, "step": 36750 }, { "epoch": 0.8083503913387892, "grad_norm": 0.20836575329303741, "learning_rate": 0.001, "loss": 2.685, "num_input_tokens_seen": 38587592960, "step": 36800 }, { "epoch": 0.8094486935009343, "grad_norm": 0.14613227546215057, "learning_rate": 0.001, "loss": 2.6687, "num_input_tokens_seen": 38640021760, "step": 36850 }, { "epoch": 0.8105469956630793, "grad_norm": 0.16505028307437897, "learning_rate": 0.001, "loss": 2.6654, "num_input_tokens_seen": 38692450560, "step": 36900 }, { "epoch": 0.8116452978252244, "grad_norm": 0.15305323898792267, "learning_rate": 0.001, "loss": 2.6612, "num_input_tokens_seen": 38744879360, "step": 36950 }, { "epoch": 0.8127435999873696, "grad_norm": 0.2416296899318695, "learning_rate": 0.001, "loss": 2.6614, "num_input_tokens_seen": 38797308160, "step": 37000 }, { "epoch": 0.8127435999873696, "eval_loss": 2.5642571449279785, "eval_runtime": 66.5631, "eval_samples_per_second": 75.117, "eval_steps_per_second": 18.779, "num_input_tokens_seen": 38797308160, "step": 37000 }, { "epoch": 0.8138419021495146, "grad_norm": 0.1504666954278946, "learning_rate": 0.001, "loss": 2.6625, "num_input_tokens_seen": 38849736960, "step": 37050 }, { "epoch": 0.8149402043116597, "grad_norm": 0.15831789374351501, "learning_rate": 0.001, "loss": 2.6566, "num_input_tokens_seen": 38902165760, "step": 37100 }, { "epoch": 0.8160385064738048, "grad_norm": 0.1391575187444687, "learning_rate": 0.001, "loss": 2.6609, "num_input_tokens_seen": 38954594560, "step": 37150 }, { "epoch": 0.81713680863595, "grad_norm": 0.22168035805225372, "learning_rate": 0.001, "loss": 2.6768, "num_input_tokens_seen": 39007023360, "step": 37200 }, { "epoch": 0.818235110798095, "grad_norm": 0.1874976009130478, "learning_rate": 0.001, "loss": 2.679, "num_input_tokens_seen": 39059452160, "step": 37250 }, { "epoch": 0.8193334129602401, "grad_norm": 0.1796240657567978, "learning_rate": 0.001, "loss": 2.6644, "num_input_tokens_seen": 39111880960, "step": 37300 }, { "epoch": 0.8204317151223852, "grad_norm": 0.3271934986114502, "learning_rate": 0.001, "loss": 2.6695, "num_input_tokens_seen": 39164309760, "step": 37350 }, { "epoch": 0.8215300172845302, "grad_norm": 0.13447704911231995, "learning_rate": 0.001, "loss": 2.6656, "num_input_tokens_seen": 39216738560, "step": 37400 }, { "epoch": 0.8226283194466754, "grad_norm": 0.1367628127336502, "learning_rate": 0.001, "loss": 2.6505, "num_input_tokens_seen": 39269167360, "step": 37450 }, { "epoch": 0.8237266216088205, "grad_norm": 0.1498686671257019, "learning_rate": 0.001, "loss": 2.6594, "num_input_tokens_seen": 39321596160, "step": 37500 }, { "epoch": 0.8237266216088205, "eval_loss": 2.5516529083251953, "eval_runtime": 66.8213, "eval_samples_per_second": 74.826, "eval_steps_per_second": 18.707, "num_input_tokens_seen": 39321596160, "step": 37500 }, { "epoch": 0.8248249237709656, "grad_norm": 0.14790424704551697, "learning_rate": 0.001, "loss": 2.6519, "num_input_tokens_seen": 39374024960, "step": 37550 }, { "epoch": 0.8259232259331106, "grad_norm": 0.15297918021678925, "learning_rate": 0.001, "loss": 2.6533, "num_input_tokens_seen": 39426453760, "step": 37600 }, { "epoch": 0.8270215280952558, "grad_norm": 0.15760953724384308, "learning_rate": 0.001, "loss": 2.6584, "num_input_tokens_seen": 39478882560, "step": 37650 }, { "epoch": 0.8281198302574009, "grad_norm": 0.1545770913362503, "learning_rate": 0.001, "loss": 2.6453, "num_input_tokens_seen": 39531311360, "step": 37700 }, { "epoch": 0.8292181324195459, "grad_norm": 0.17809870839118958, "learning_rate": 0.001, "loss": 2.6547, "num_input_tokens_seen": 39583740160, "step": 37750 }, { "epoch": 0.830316434581691, "grad_norm": 0.2712576687335968, "learning_rate": 0.001, "loss": 2.6489, "num_input_tokens_seen": 39636168960, "step": 37800 }, { "epoch": 0.8314147367438361, "grad_norm": 0.1525331437587738, "learning_rate": 0.001, "loss": 2.6558, "num_input_tokens_seen": 39688597760, "step": 37850 }, { "epoch": 0.8325130389059812, "grad_norm": 0.1624525785446167, "learning_rate": 0.001, "loss": 2.6465, "num_input_tokens_seen": 39741026560, "step": 37900 }, { "epoch": 0.8336113410681263, "grad_norm": 0.14974552392959595, "learning_rate": 0.001, "loss": 2.6595, "num_input_tokens_seen": 39793455360, "step": 37950 }, { "epoch": 0.8347096432302714, "grad_norm": 0.15206202864646912, "learning_rate": 0.001, "loss": 2.6525, "num_input_tokens_seen": 39845884160, "step": 38000 }, { "epoch": 0.8347096432302714, "eval_loss": 2.549203395843506, "eval_runtime": 66.3732, "eval_samples_per_second": 75.332, "eval_steps_per_second": 18.833, "num_input_tokens_seen": 39845884160, "step": 38000 }, { "epoch": 0.8358079453924165, "grad_norm": 0.15346269309520721, "learning_rate": 0.001, "loss": 2.645, "num_input_tokens_seen": 39898312960, "step": 38050 }, { "epoch": 0.8369062475545616, "grad_norm": 0.1504630148410797, "learning_rate": 0.001, "loss": 2.666, "num_input_tokens_seen": 39950741760, "step": 38100 }, { "epoch": 0.8380045497167067, "grad_norm": 0.19098903238773346, "learning_rate": 0.001, "loss": 2.6649, "num_input_tokens_seen": 40003170560, "step": 38150 }, { "epoch": 0.8391028518788518, "grad_norm": 0.15553973615169525, "learning_rate": 0.001, "loss": 2.6565, "num_input_tokens_seen": 40055599360, "step": 38200 }, { "epoch": 0.8402011540409968, "grad_norm": 0.15650159120559692, "learning_rate": 0.001, "loss": 2.6568, "num_input_tokens_seen": 40108028160, "step": 38250 }, { "epoch": 0.841299456203142, "grad_norm": 0.17787836492061615, "learning_rate": 0.001, "loss": 2.6497, "num_input_tokens_seen": 40160456960, "step": 38300 }, { "epoch": 0.8423977583652871, "grad_norm": 0.1535162478685379, "learning_rate": 0.001, "loss": 2.6492, "num_input_tokens_seen": 40212885760, "step": 38350 }, { "epoch": 0.8434960605274322, "grad_norm": 0.16713359951972961, "learning_rate": 0.001, "loss": 2.6534, "num_input_tokens_seen": 40265314560, "step": 38400 }, { "epoch": 0.8445943626895772, "grad_norm": 0.17087998986244202, "learning_rate": 0.001, "loss": 2.6602, "num_input_tokens_seen": 40317743360, "step": 38450 }, { "epoch": 0.8456926648517223, "grad_norm": 0.15651412308216095, "learning_rate": 0.001, "loss": 2.6547, "num_input_tokens_seen": 40370172160, "step": 38500 }, { "epoch": 0.8456926648517223, "eval_loss": 2.5524706840515137, "eval_runtime": 66.5023, "eval_samples_per_second": 75.185, "eval_steps_per_second": 18.796, "num_input_tokens_seen": 40370172160, "step": 38500 }, { "epoch": 0.8467909670138675, "grad_norm": 0.15205898880958557, "learning_rate": 0.001, "loss": 2.6541, "num_input_tokens_seen": 40422600960, "step": 38550 }, { "epoch": 0.8478892691760125, "grad_norm": 0.15865832567214966, "learning_rate": 0.001, "loss": 2.6536, "num_input_tokens_seen": 40475029760, "step": 38600 }, { "epoch": 0.8489875713381576, "grad_norm": 0.133284330368042, "learning_rate": 0.001, "loss": 2.6531, "num_input_tokens_seen": 40527458560, "step": 38650 }, { "epoch": 0.8500858735003027, "grad_norm": 0.1421806663274765, "learning_rate": 0.001, "loss": 2.6558, "num_input_tokens_seen": 40579887360, "step": 38700 }, { "epoch": 0.8511841756624479, "grad_norm": 0.19429996609687805, "learning_rate": 0.001, "loss": 2.6628, "num_input_tokens_seen": 40632316160, "step": 38750 }, { "epoch": 0.8522824778245929, "grad_norm": 0.14661937952041626, "learning_rate": 0.001, "loss": 2.6594, "num_input_tokens_seen": 40684744960, "step": 38800 }, { "epoch": 0.853380779986738, "grad_norm": 0.1694687008857727, "learning_rate": 0.001, "loss": 2.6571, "num_input_tokens_seen": 40737173760, "step": 38850 }, { "epoch": 0.8544790821488831, "grad_norm": 0.152188241481781, "learning_rate": 0.001, "loss": 2.6534, "num_input_tokens_seen": 40789602560, "step": 38900 }, { "epoch": 0.8555773843110281, "grad_norm": 0.1554640680551529, "learning_rate": 0.001, "loss": 2.649, "num_input_tokens_seen": 40842031360, "step": 38950 }, { "epoch": 0.8566756864731733, "grad_norm": 0.1481955647468567, "learning_rate": 0.001, "loss": 2.6527, "num_input_tokens_seen": 40894460160, "step": 39000 }, { "epoch": 0.8566756864731733, "eval_loss": 2.547664165496826, "eval_runtime": 66.2874, "eval_samples_per_second": 75.429, "eval_steps_per_second": 18.857, "num_input_tokens_seen": 40894460160, "step": 39000 } ], "logging_steps": 50, "max_steps": 200000, "num_input_tokens_seen": 40894460160, "num_train_epochs": 5, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.3289694735724052e+19, "train_batch_size": 64, "trial_name": null, "trial_params": null }