diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12505 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.333901761331791, + "eval_steps": 500, + "global_step": 70000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00023850125809413643, + "grad_norm": 1.5476292371749878, + "learning_rate": 0.0001, + "loss": 9.8285, + "num_input_tokens_seen": 13107200, + "step": 50 + }, + { + "epoch": 0.00047700251618827287, + "grad_norm": 0.47720426321029663, + "learning_rate": 0.0002, + "loss": 7.964, + "num_input_tokens_seen": 26214400, + "step": 100 + }, + { + "epoch": 0.0007155037742824094, + "grad_norm": 0.8443030714988708, + "learning_rate": 0.0003, + "loss": 7.0452, + "num_input_tokens_seen": 39321600, + "step": 150 + }, + { + "epoch": 0.0009540050323765457, + "grad_norm": 0.5895723104476929, + "learning_rate": 0.0004, + "loss": 6.355, + "num_input_tokens_seen": 52428800, + "step": 200 + }, + { + "epoch": 0.0011925062904706823, + "grad_norm": 0.8343789577484131, + "learning_rate": 0.0005, + "loss": 5.8716, + "num_input_tokens_seen": 65536000, + "step": 250 + }, + { + "epoch": 0.0014310075485648188, + "grad_norm": 0.5747953057289124, + "learning_rate": 0.0006, + "loss": 5.5086, + "num_input_tokens_seen": 78643200, + "step": 300 + }, + { + "epoch": 0.0016695088066589552, + "grad_norm": 0.8383421301841736, + "learning_rate": 0.0007, + "loss": 5.2217, + "num_input_tokens_seen": 91750400, + "step": 350 + }, + { + "epoch": 0.0019080100647530915, + "grad_norm": 0.5696113109588623, + "learning_rate": 0.0008, + "loss": 4.9683, + "num_input_tokens_seen": 104857600, + "step": 400 + }, + { + "epoch": 0.002146511322847228, + "grad_norm": 0.5431691408157349, + "learning_rate": 0.0009000000000000001, + "loss": 4.7629, + "num_input_tokens_seen": 117964800, + "step": 450 + }, + { + "epoch": 0.0023850125809413646, + "grad_norm": 0.4571855664253235, + "learning_rate": 0.001, + "loss": 4.5284, + "num_input_tokens_seen": 131072000, + "step": 500 + }, + { + "epoch": 0.0023850125809413646, + "eval_loss": 4.309167385101318, + "eval_runtime": 53.3891, + "eval_samples_per_second": 93.652, + "eval_steps_per_second": 23.413, + "num_input_tokens_seen": 131072000, + "step": 500 + }, + { + "epoch": 0.002623513839035501, + "grad_norm": 0.43464773893356323, + "learning_rate": 0.001, + "loss": 4.3379, + "num_input_tokens_seen": 144179200, + "step": 550 + }, + { + "epoch": 0.0028620150971296375, + "grad_norm": 0.49611660838127136, + "learning_rate": 0.001, + "loss": 4.1712, + "num_input_tokens_seen": 157286400, + "step": 600 + }, + { + "epoch": 0.0031005163552237738, + "grad_norm": 0.4060957729816437, + "learning_rate": 0.001, + "loss": 4.0436, + "num_input_tokens_seen": 170393600, + "step": 650 + }, + { + "epoch": 0.0033390176133179105, + "grad_norm": 0.37300577759742737, + "learning_rate": 0.001, + "loss": 3.9582, + "num_input_tokens_seen": 183500800, + "step": 700 + }, + { + "epoch": 0.0035775188714120467, + "grad_norm": 0.4117021858692169, + "learning_rate": 0.001, + "loss": 3.8674, + "num_input_tokens_seen": 196608000, + "step": 750 + }, + { + "epoch": 0.003816020129506183, + "grad_norm": 0.3335980772972107, + "learning_rate": 0.001, + "loss": 3.8031, + "num_input_tokens_seen": 209715200, + "step": 800 + }, + { + "epoch": 0.004054521387600319, + "grad_norm": 0.35943159461021423, + "learning_rate": 0.001, + "loss": 3.7534, + "num_input_tokens_seen": 222822400, + "step": 850 + }, + { + "epoch": 0.004293022645694456, + "grad_norm": 0.40000948309898376, + "learning_rate": 0.001, + "loss": 3.6867, + "num_input_tokens_seen": 235929600, + "step": 900 + }, + { + "epoch": 0.0045315239037885926, + "grad_norm": 0.3165877163410187, + "learning_rate": 0.001, + "loss": 3.6565, + "num_input_tokens_seen": 249036800, + "step": 950 + }, + { + "epoch": 0.004770025161882729, + "grad_norm": 0.3687070906162262, + "learning_rate": 0.001, + "loss": 3.6005, + "num_input_tokens_seen": 262144000, + "step": 1000 + }, + { + "epoch": 0.004770025161882729, + "eval_loss": 3.4853296279907227, + "eval_runtime": 52.477, + "eval_samples_per_second": 95.28, + "eval_steps_per_second": 23.82, + "num_input_tokens_seen": 262144000, + "step": 1000 + }, + { + "epoch": 0.005008526419976865, + "grad_norm": 0.32389721274375916, + "learning_rate": 0.001, + "loss": 3.5663, + "num_input_tokens_seen": 275251200, + "step": 1050 + }, + { + "epoch": 0.005247027678071002, + "grad_norm": 0.3202049434185028, + "learning_rate": 0.001, + "loss": 3.5376, + "num_input_tokens_seen": 288358400, + "step": 1100 + }, + { + "epoch": 0.005485528936165138, + "grad_norm": 0.30287981033325195, + "learning_rate": 0.001, + "loss": 3.5135, + "num_input_tokens_seen": 301465600, + "step": 1150 + }, + { + "epoch": 0.005724030194259275, + "grad_norm": 0.3624540865421295, + "learning_rate": 0.001, + "loss": 3.4814, + "num_input_tokens_seen": 314572800, + "step": 1200 + }, + { + "epoch": 0.005962531452353411, + "grad_norm": 0.30017992854118347, + "learning_rate": 0.001, + "loss": 3.4476, + "num_input_tokens_seen": 327680000, + "step": 1250 + }, + { + "epoch": 0.0062010327104475476, + "grad_norm": 0.3169330060482025, + "learning_rate": 0.001, + "loss": 3.4179, + "num_input_tokens_seen": 340787200, + "step": 1300 + }, + { + "epoch": 0.006439533968541684, + "grad_norm": 0.2730589210987091, + "learning_rate": 0.001, + "loss": 3.4074, + "num_input_tokens_seen": 353894400, + "step": 1350 + }, + { + "epoch": 0.006678035226635821, + "grad_norm": 0.2927146553993225, + "learning_rate": 0.001, + "loss": 3.3757, + "num_input_tokens_seen": 367001600, + "step": 1400 + }, + { + "epoch": 0.006916536484729957, + "grad_norm": 0.34230080246925354, + "learning_rate": 0.001, + "loss": 3.3502, + "num_input_tokens_seen": 380108800, + "step": 1450 + }, + { + "epoch": 0.007155037742824093, + "grad_norm": 0.30472344160079956, + "learning_rate": 0.001, + "loss": 3.3639, + "num_input_tokens_seen": 393216000, + "step": 1500 + }, + { + "epoch": 0.007155037742824093, + "eval_loss": 3.2486374378204346, + "eval_runtime": 52.482, + "eval_samples_per_second": 95.271, + "eval_steps_per_second": 23.818, + "num_input_tokens_seen": 393216000, + "step": 1500 + }, + { + "epoch": 0.00739353900091823, + "grad_norm": 0.26124337315559387, + "learning_rate": 0.001, + "loss": 3.3521, + "num_input_tokens_seen": 406323200, + "step": 1550 + }, + { + "epoch": 0.007632040259012366, + "grad_norm": 0.29117754101753235, + "learning_rate": 0.001, + "loss": 3.315, + "num_input_tokens_seen": 419430400, + "step": 1600 + }, + { + "epoch": 0.007870541517106503, + "grad_norm": 0.24080802500247955, + "learning_rate": 0.001, + "loss": 3.3103, + "num_input_tokens_seen": 432537600, + "step": 1650 + }, + { + "epoch": 0.008109042775200638, + "grad_norm": 0.29982003569602966, + "learning_rate": 0.001, + "loss": 3.2926, + "num_input_tokens_seen": 445644800, + "step": 1700 + }, + { + "epoch": 0.008347544033294775, + "grad_norm": 0.26795274019241333, + "learning_rate": 0.001, + "loss": 3.2843, + "num_input_tokens_seen": 458752000, + "step": 1750 + }, + { + "epoch": 0.008586045291388912, + "grad_norm": 0.252774715423584, + "learning_rate": 0.001, + "loss": 3.274, + "num_input_tokens_seen": 471859200, + "step": 1800 + }, + { + "epoch": 0.008824546549483048, + "grad_norm": 0.25432145595550537, + "learning_rate": 0.001, + "loss": 3.2533, + "num_input_tokens_seen": 484966400, + "step": 1850 + }, + { + "epoch": 0.009063047807577185, + "grad_norm": 0.25918108224868774, + "learning_rate": 0.001, + "loss": 3.2501, + "num_input_tokens_seen": 498073600, + "step": 1900 + }, + { + "epoch": 0.009301549065671322, + "grad_norm": 0.2482348382472992, + "learning_rate": 0.001, + "loss": 3.2541, + "num_input_tokens_seen": 511180800, + "step": 1950 + }, + { + "epoch": 0.009540050323765458, + "grad_norm": 0.2615273594856262, + "learning_rate": 0.001, + "loss": 3.2218, + "num_input_tokens_seen": 524288000, + "step": 2000 + }, + { + "epoch": 0.009540050323765458, + "eval_loss": 3.1193039417266846, + "eval_runtime": 52.7955, + "eval_samples_per_second": 94.705, + "eval_steps_per_second": 23.676, + "num_input_tokens_seen": 524288000, + "step": 2000 + }, + { + "epoch": 0.009778551581859595, + "grad_norm": 0.2637729048728943, + "learning_rate": 0.001, + "loss": 3.2285, + "num_input_tokens_seen": 537395200, + "step": 2050 + }, + { + "epoch": 0.01001705283995373, + "grad_norm": 0.23936080932617188, + "learning_rate": 0.001, + "loss": 3.2119, + "num_input_tokens_seen": 550502400, + "step": 2100 + }, + { + "epoch": 0.010255554098047867, + "grad_norm": 0.2469020038843155, + "learning_rate": 0.001, + "loss": 3.2021, + "num_input_tokens_seen": 563609600, + "step": 2150 + }, + { + "epoch": 0.010494055356142003, + "grad_norm": 0.2304004430770874, + "learning_rate": 0.001, + "loss": 3.1874, + "num_input_tokens_seen": 576716800, + "step": 2200 + }, + { + "epoch": 0.01073255661423614, + "grad_norm": 0.232864648103714, + "learning_rate": 0.001, + "loss": 3.1897, + "num_input_tokens_seen": 589824000, + "step": 2250 + }, + { + "epoch": 0.010971057872330277, + "grad_norm": 0.23161470890045166, + "learning_rate": 0.001, + "loss": 3.1689, + "num_input_tokens_seen": 602931200, + "step": 2300 + }, + { + "epoch": 0.011209559130424413, + "grad_norm": 0.20868408679962158, + "learning_rate": 0.001, + "loss": 3.1615, + "num_input_tokens_seen": 616038400, + "step": 2350 + }, + { + "epoch": 0.01144806038851855, + "grad_norm": 0.23374608159065247, + "learning_rate": 0.001, + "loss": 3.1556, + "num_input_tokens_seen": 629145600, + "step": 2400 + }, + { + "epoch": 0.011686561646612685, + "grad_norm": 0.21716611087322235, + "learning_rate": 0.001, + "loss": 3.1463, + "num_input_tokens_seen": 642252800, + "step": 2450 + }, + { + "epoch": 0.011925062904706822, + "grad_norm": 0.23689670860767365, + "learning_rate": 0.001, + "loss": 3.1433, + "num_input_tokens_seen": 655360000, + "step": 2500 + }, + { + "epoch": 0.011925062904706822, + "eval_loss": 3.040046215057373, + "eval_runtime": 52.9109, + "eval_samples_per_second": 94.498, + "eval_steps_per_second": 23.625, + "num_input_tokens_seen": 655360000, + "step": 2500 + }, + { + "epoch": 0.012163564162800958, + "grad_norm": 0.2245575189590454, + "learning_rate": 0.001, + "loss": 3.1445, + "num_input_tokens_seen": 668467200, + "step": 2550 + }, + { + "epoch": 0.012402065420895095, + "grad_norm": 0.20992259681224823, + "learning_rate": 0.001, + "loss": 3.1447, + "num_input_tokens_seen": 681574400, + "step": 2600 + }, + { + "epoch": 0.012640566678989232, + "grad_norm": 0.21792201697826385, + "learning_rate": 0.001, + "loss": 3.1323, + "num_input_tokens_seen": 694681600, + "step": 2650 + }, + { + "epoch": 0.012879067937083368, + "grad_norm": 0.243458554148674, + "learning_rate": 0.001, + "loss": 3.1084, + "num_input_tokens_seen": 707788800, + "step": 2700 + }, + { + "epoch": 0.013117569195177505, + "grad_norm": 0.21190515160560608, + "learning_rate": 0.001, + "loss": 3.1202, + "num_input_tokens_seen": 720896000, + "step": 2750 + }, + { + "epoch": 0.013356070453271642, + "grad_norm": 0.2461613118648529, + "learning_rate": 0.001, + "loss": 3.1007, + "num_input_tokens_seen": 734003200, + "step": 2800 + }, + { + "epoch": 0.013594571711365777, + "grad_norm": 0.1976248323917389, + "learning_rate": 0.001, + "loss": 3.1079, + "num_input_tokens_seen": 747110400, + "step": 2850 + }, + { + "epoch": 0.013833072969459913, + "grad_norm": 0.22097842395305634, + "learning_rate": 0.001, + "loss": 3.0846, + "num_input_tokens_seen": 760217600, + "step": 2900 + }, + { + "epoch": 0.01407157422755405, + "grad_norm": 0.20581132173538208, + "learning_rate": 0.001, + "loss": 3.0995, + "num_input_tokens_seen": 773324800, + "step": 2950 + }, + { + "epoch": 0.014310075485648187, + "grad_norm": 0.19790051877498627, + "learning_rate": 0.001, + "loss": 3.0977, + "num_input_tokens_seen": 786432000, + "step": 3000 + }, + { + "epoch": 0.014310075485648187, + "eval_loss": 2.9804909229278564, + "eval_runtime": 53.1278, + "eval_samples_per_second": 94.113, + "eval_steps_per_second": 23.528, + "num_input_tokens_seen": 786432000, + "step": 3000 + }, + { + "epoch": 0.014548576743742323, + "grad_norm": 0.20328116416931152, + "learning_rate": 0.001, + "loss": 3.0872, + "num_input_tokens_seen": 799539200, + "step": 3050 + }, + { + "epoch": 0.01478707800183646, + "grad_norm": 0.21318025887012482, + "learning_rate": 0.001, + "loss": 3.0861, + "num_input_tokens_seen": 812646400, + "step": 3100 + }, + { + "epoch": 0.015025579259930597, + "grad_norm": 0.22170069813728333, + "learning_rate": 0.001, + "loss": 3.0618, + "num_input_tokens_seen": 825753600, + "step": 3150 + }, + { + "epoch": 0.015264080518024732, + "grad_norm": 0.21292312443256378, + "learning_rate": 0.001, + "loss": 3.0567, + "num_input_tokens_seen": 838860800, + "step": 3200 + }, + { + "epoch": 0.015502581776118868, + "grad_norm": 0.2331959754228592, + "learning_rate": 0.001, + "loss": 3.0714, + "num_input_tokens_seen": 851968000, + "step": 3250 + }, + { + "epoch": 0.015741083034213007, + "grad_norm": 0.19236011803150177, + "learning_rate": 0.001, + "loss": 3.059, + "num_input_tokens_seen": 865075200, + "step": 3300 + }, + { + "epoch": 0.015979584292307142, + "grad_norm": 0.19991376996040344, + "learning_rate": 0.001, + "loss": 3.0542, + "num_input_tokens_seen": 878182400, + "step": 3350 + }, + { + "epoch": 0.016218085550401277, + "grad_norm": 0.2042934149503708, + "learning_rate": 0.001, + "loss": 3.0517, + "num_input_tokens_seen": 891289600, + "step": 3400 + }, + { + "epoch": 0.016456586808495415, + "grad_norm": 0.19254428148269653, + "learning_rate": 0.001, + "loss": 3.0415, + "num_input_tokens_seen": 904396800, + "step": 3450 + }, + { + "epoch": 0.01669508806658955, + "grad_norm": 0.19211998581886292, + "learning_rate": 0.001, + "loss": 3.0253, + "num_input_tokens_seen": 917504000, + "step": 3500 + }, + { + "epoch": 0.01669508806658955, + "eval_loss": 2.937037944793701, + "eval_runtime": 52.6773, + "eval_samples_per_second": 94.918, + "eval_steps_per_second": 23.729, + "num_input_tokens_seen": 917504000, + "step": 3500 + }, + { + "epoch": 0.01693358932468369, + "grad_norm": 0.19596482813358307, + "learning_rate": 0.001, + "loss": 3.053, + "num_input_tokens_seen": 930611200, + "step": 3550 + }, + { + "epoch": 0.017172090582777823, + "grad_norm": 0.20214103162288666, + "learning_rate": 0.001, + "loss": 3.0385, + "num_input_tokens_seen": 943718400, + "step": 3600 + }, + { + "epoch": 0.017410591840871962, + "grad_norm": 0.18580283224582672, + "learning_rate": 0.001, + "loss": 3.0354, + "num_input_tokens_seen": 956825600, + "step": 3650 + }, + { + "epoch": 0.017649093098966097, + "grad_norm": 0.18928515911102295, + "learning_rate": 0.001, + "loss": 3.0292, + "num_input_tokens_seen": 969932800, + "step": 3700 + }, + { + "epoch": 0.017887594357060232, + "grad_norm": 0.19066137075424194, + "learning_rate": 0.001, + "loss": 3.0206, + "num_input_tokens_seen": 983040000, + "step": 3750 + }, + { + "epoch": 0.01812609561515437, + "grad_norm": 0.20291416347026825, + "learning_rate": 0.001, + "loss": 3.0254, + "num_input_tokens_seen": 996147200, + "step": 3800 + }, + { + "epoch": 0.018364596873248505, + "grad_norm": 0.19991491734981537, + "learning_rate": 0.001, + "loss": 3.0212, + "num_input_tokens_seen": 1009254400, + "step": 3850 + }, + { + "epoch": 0.018603098131342644, + "grad_norm": 0.19553051888942719, + "learning_rate": 0.001, + "loss": 3.0229, + "num_input_tokens_seen": 1022361600, + "step": 3900 + }, + { + "epoch": 0.01884159938943678, + "grad_norm": 0.19302095472812653, + "learning_rate": 0.001, + "loss": 3.0137, + "num_input_tokens_seen": 1035468800, + "step": 3950 + }, + { + "epoch": 0.019080100647530917, + "grad_norm": 0.18680201470851898, + "learning_rate": 0.001, + "loss": 3.0106, + "num_input_tokens_seen": 1048576000, + "step": 4000 + }, + { + "epoch": 0.019080100647530917, + "eval_loss": 2.8984477519989014, + "eval_runtime": 52.851, + "eval_samples_per_second": 94.606, + "eval_steps_per_second": 23.651, + "num_input_tokens_seen": 1048576000, + "step": 4000 + }, + { + "epoch": 0.019318601905625052, + "grad_norm": 0.18222174048423767, + "learning_rate": 0.001, + "loss": 3.0095, + "num_input_tokens_seen": 1061683200, + "step": 4050 + }, + { + "epoch": 0.01955710316371919, + "grad_norm": 0.1929137110710144, + "learning_rate": 0.001, + "loss": 3.0022, + "num_input_tokens_seen": 1074790400, + "step": 4100 + }, + { + "epoch": 0.019795604421813325, + "grad_norm": 0.19358602166175842, + "learning_rate": 0.001, + "loss": 2.9978, + "num_input_tokens_seen": 1087897600, + "step": 4150 + }, + { + "epoch": 0.02003410567990746, + "grad_norm": 0.19070614874362946, + "learning_rate": 0.001, + "loss": 3.0016, + "num_input_tokens_seen": 1101004800, + "step": 4200 + }, + { + "epoch": 0.0202726069380016, + "grad_norm": 0.17888160049915314, + "learning_rate": 0.001, + "loss": 2.9984, + "num_input_tokens_seen": 1114112000, + "step": 4250 + }, + { + "epoch": 0.020511108196095734, + "grad_norm": 0.1823708564043045, + "learning_rate": 0.001, + "loss": 3.004, + "num_input_tokens_seen": 1127219200, + "step": 4300 + }, + { + "epoch": 0.020749609454189872, + "grad_norm": 0.1753600388765335, + "learning_rate": 0.001, + "loss": 2.9814, + "num_input_tokens_seen": 1140326400, + "step": 4350 + }, + { + "epoch": 0.020988110712284007, + "grad_norm": 0.1710510551929474, + "learning_rate": 0.001, + "loss": 2.9597, + "num_input_tokens_seen": 1153433600, + "step": 4400 + }, + { + "epoch": 0.021226611970378145, + "grad_norm": 0.18727277219295502, + "learning_rate": 0.001, + "loss": 2.9695, + "num_input_tokens_seen": 1166540800, + "step": 4450 + }, + { + "epoch": 0.02146511322847228, + "grad_norm": 0.17773132026195526, + "learning_rate": 0.001, + "loss": 2.9664, + "num_input_tokens_seen": 1179648000, + "step": 4500 + }, + { + "epoch": 0.02146511322847228, + "eval_loss": 2.871137857437134, + "eval_runtime": 51.4876, + "eval_samples_per_second": 97.111, + "eval_steps_per_second": 24.278, + "num_input_tokens_seen": 1179648000, + "step": 4500 + }, + { + "epoch": 0.021703614486566415, + "grad_norm": 0.1875799000263214, + "learning_rate": 0.001, + "loss": 2.9682, + "num_input_tokens_seen": 1192755200, + "step": 4550 + }, + { + "epoch": 0.021942115744660554, + "grad_norm": 0.18222226202487946, + "learning_rate": 0.001, + "loss": 2.9484, + "num_input_tokens_seen": 1205862400, + "step": 4600 + }, + { + "epoch": 0.02218061700275469, + "grad_norm": 0.191411092877388, + "learning_rate": 0.001, + "loss": 2.9637, + "num_input_tokens_seen": 1218969600, + "step": 4650 + }, + { + "epoch": 0.022419118260848827, + "grad_norm": 0.17608201503753662, + "learning_rate": 0.001, + "loss": 2.9792, + "num_input_tokens_seen": 1232076800, + "step": 4700 + }, + { + "epoch": 0.022657619518942962, + "grad_norm": 0.1718858927488327, + "learning_rate": 0.001, + "loss": 2.9674, + "num_input_tokens_seen": 1245184000, + "step": 4750 + }, + { + "epoch": 0.0228961207770371, + "grad_norm": 0.18428942561149597, + "learning_rate": 0.001, + "loss": 2.976, + "num_input_tokens_seen": 1258291200, + "step": 4800 + }, + { + "epoch": 0.023134622035131235, + "grad_norm": 0.16696259379386902, + "learning_rate": 0.001, + "loss": 2.9486, + "num_input_tokens_seen": 1271398400, + "step": 4850 + }, + { + "epoch": 0.02337312329322537, + "grad_norm": 0.18239040672779083, + "learning_rate": 0.001, + "loss": 2.956, + "num_input_tokens_seen": 1284505600, + "step": 4900 + }, + { + "epoch": 0.02361162455131951, + "grad_norm": 0.17167994379997253, + "learning_rate": 0.001, + "loss": 2.9449, + "num_input_tokens_seen": 1297612800, + "step": 4950 + }, + { + "epoch": 0.023850125809413644, + "grad_norm": 0.18532761931419373, + "learning_rate": 0.001, + "loss": 2.947, + "num_input_tokens_seen": 1310720000, + "step": 5000 + }, + { + "epoch": 0.023850125809413644, + "eval_loss": 2.8470754623413086, + "eval_runtime": 51.04, + "eval_samples_per_second": 97.962, + "eval_steps_per_second": 24.491, + "num_input_tokens_seen": 1310720000, + "step": 5000 + }, + { + "epoch": 0.024088627067507782, + "grad_norm": 0.21697266399860382, + "learning_rate": 0.001, + "loss": 2.963, + "num_input_tokens_seen": 1323827200, + "step": 5050 + }, + { + "epoch": 0.024327128325601917, + "grad_norm": 0.17018833756446838, + "learning_rate": 0.001, + "loss": 2.9453, + "num_input_tokens_seen": 1336934400, + "step": 5100 + }, + { + "epoch": 0.024565629583696055, + "grad_norm": 0.17473167181015015, + "learning_rate": 0.001, + "loss": 2.9516, + "num_input_tokens_seen": 1350041600, + "step": 5150 + }, + { + "epoch": 0.02480413084179019, + "grad_norm": 0.18488293886184692, + "learning_rate": 0.001, + "loss": 2.9404, + "num_input_tokens_seen": 1363148800, + "step": 5200 + }, + { + "epoch": 0.025042632099884325, + "grad_norm": 0.17348967492580414, + "learning_rate": 0.001, + "loss": 2.9275, + "num_input_tokens_seen": 1376256000, + "step": 5250 + }, + { + "epoch": 0.025281133357978464, + "grad_norm": 0.16547563672065735, + "learning_rate": 0.001, + "loss": 2.9464, + "num_input_tokens_seen": 1389363200, + "step": 5300 + }, + { + "epoch": 0.0255196346160726, + "grad_norm": 0.17538361251354218, + "learning_rate": 0.001, + "loss": 2.94, + "num_input_tokens_seen": 1402470400, + "step": 5350 + }, + { + "epoch": 0.025758135874166737, + "grad_norm": 0.17068558931350708, + "learning_rate": 0.001, + "loss": 2.9382, + "num_input_tokens_seen": 1415577600, + "step": 5400 + }, + { + "epoch": 0.025996637132260872, + "grad_norm": 0.17389337718486786, + "learning_rate": 0.001, + "loss": 2.9254, + "num_input_tokens_seen": 1428684800, + "step": 5450 + }, + { + "epoch": 0.02623513839035501, + "grad_norm": 0.17620491981506348, + "learning_rate": 0.001, + "loss": 2.9221, + "num_input_tokens_seen": 1441792000, + "step": 5500 + }, + { + "epoch": 0.02623513839035501, + "eval_loss": 2.8246922492980957, + "eval_runtime": 50.2832, + "eval_samples_per_second": 99.437, + "eval_steps_per_second": 24.859, + "num_input_tokens_seen": 1441792000, + "step": 5500 + }, + { + "epoch": 0.026473639648449145, + "grad_norm": 0.15889622271060944, + "learning_rate": 0.001, + "loss": 2.923, + "num_input_tokens_seen": 1454899200, + "step": 5550 + }, + { + "epoch": 0.026712140906543284, + "grad_norm": 0.17490123212337494, + "learning_rate": 0.001, + "loss": 2.9146, + "num_input_tokens_seen": 1468006400, + "step": 5600 + }, + { + "epoch": 0.02695064216463742, + "grad_norm": 0.17789559066295624, + "learning_rate": 0.001, + "loss": 2.9253, + "num_input_tokens_seen": 1481113600, + "step": 5650 + }, + { + "epoch": 0.027189143422731554, + "grad_norm": 0.17113780975341797, + "learning_rate": 0.001, + "loss": 2.9267, + "num_input_tokens_seen": 1494220800, + "step": 5700 + }, + { + "epoch": 0.027427644680825692, + "grad_norm": 0.1671907901763916, + "learning_rate": 0.001, + "loss": 2.9178, + "num_input_tokens_seen": 1507328000, + "step": 5750 + }, + { + "epoch": 0.027666145938919827, + "grad_norm": 0.17511603236198425, + "learning_rate": 0.001, + "loss": 2.9341, + "num_input_tokens_seen": 1520435200, + "step": 5800 + }, + { + "epoch": 0.027904647197013965, + "grad_norm": 0.1821524053812027, + "learning_rate": 0.001, + "loss": 2.9076, + "num_input_tokens_seen": 1533542400, + "step": 5850 + }, + { + "epoch": 0.0281431484551081, + "grad_norm": 0.16259051859378815, + "learning_rate": 0.001, + "loss": 2.9212, + "num_input_tokens_seen": 1546649600, + "step": 5900 + }, + { + "epoch": 0.02838164971320224, + "grad_norm": 0.18584352731704712, + "learning_rate": 0.001, + "loss": 2.927, + "num_input_tokens_seen": 1559756800, + "step": 5950 + }, + { + "epoch": 0.028620150971296374, + "grad_norm": 0.181602343916893, + "learning_rate": 0.001, + "loss": 2.9096, + "num_input_tokens_seen": 1572864000, + "step": 6000 + }, + { + "epoch": 0.028620150971296374, + "eval_loss": 2.8036017417907715, + "eval_runtime": 50.543, + "eval_samples_per_second": 98.926, + "eval_steps_per_second": 24.731, + "num_input_tokens_seen": 1572864000, + "step": 6000 + }, + { + "epoch": 0.02885865222939051, + "grad_norm": 0.1653270423412323, + "learning_rate": 0.001, + "loss": 2.9214, + "num_input_tokens_seen": 1585971200, + "step": 6050 + }, + { + "epoch": 0.029097153487484647, + "grad_norm": 0.17030183970928192, + "learning_rate": 0.001, + "loss": 2.9081, + "num_input_tokens_seen": 1599078400, + "step": 6100 + }, + { + "epoch": 0.029335654745578782, + "grad_norm": 0.17734774947166443, + "learning_rate": 0.001, + "loss": 2.9128, + "num_input_tokens_seen": 1612185600, + "step": 6150 + }, + { + "epoch": 0.02957415600367292, + "grad_norm": 0.1664343774318695, + "learning_rate": 0.001, + "loss": 2.9084, + "num_input_tokens_seen": 1625292800, + "step": 6200 + }, + { + "epoch": 0.029812657261767055, + "grad_norm": 0.15939603745937347, + "learning_rate": 0.001, + "loss": 2.9049, + "num_input_tokens_seen": 1638400000, + "step": 6250 + }, + { + "epoch": 0.030051158519861194, + "grad_norm": 0.16107864677906036, + "learning_rate": 0.001, + "loss": 2.8889, + "num_input_tokens_seen": 1651507200, + "step": 6300 + }, + { + "epoch": 0.03028965977795533, + "grad_norm": 0.1734771579504013, + "learning_rate": 0.001, + "loss": 2.8951, + "num_input_tokens_seen": 1664614400, + "step": 6350 + }, + { + "epoch": 0.030528161036049464, + "grad_norm": 0.1804204136133194, + "learning_rate": 0.001, + "loss": 2.8877, + "num_input_tokens_seen": 1677721600, + "step": 6400 + }, + { + "epoch": 0.030766662294143602, + "grad_norm": 0.16369500756263733, + "learning_rate": 0.001, + "loss": 2.8764, + "num_input_tokens_seen": 1690828800, + "step": 6450 + }, + { + "epoch": 0.031005163552237737, + "grad_norm": 0.1704144924879074, + "learning_rate": 0.001, + "loss": 2.8965, + "num_input_tokens_seen": 1703936000, + "step": 6500 + }, + { + "epoch": 0.031005163552237737, + "eval_loss": 2.787343740463257, + "eval_runtime": 50.3956, + "eval_samples_per_second": 99.215, + "eval_steps_per_second": 24.804, + "num_input_tokens_seen": 1703936000, + "step": 6500 + }, + { + "epoch": 0.031243664810331875, + "grad_norm": 0.17917555570602417, + "learning_rate": 0.001, + "loss": 2.8882, + "num_input_tokens_seen": 1717043200, + "step": 6550 + }, + { + "epoch": 0.031482166068426014, + "grad_norm": 0.18822412192821503, + "learning_rate": 0.001, + "loss": 2.8931, + "num_input_tokens_seen": 1730150400, + "step": 6600 + }, + { + "epoch": 0.031720667326520145, + "grad_norm": 0.1702752560377121, + "learning_rate": 0.001, + "loss": 2.8906, + "num_input_tokens_seen": 1743257600, + "step": 6650 + }, + { + "epoch": 0.031959168584614284, + "grad_norm": 0.16963082551956177, + "learning_rate": 0.001, + "loss": 2.8809, + "num_input_tokens_seen": 1756364800, + "step": 6700 + }, + { + "epoch": 0.03219766984270842, + "grad_norm": 0.17273569107055664, + "learning_rate": 0.001, + "loss": 2.9005, + "num_input_tokens_seen": 1769472000, + "step": 6750 + }, + { + "epoch": 0.032436171100802554, + "grad_norm": 0.21361888945102692, + "learning_rate": 0.001, + "loss": 2.8683, + "num_input_tokens_seen": 1782579200, + "step": 6800 + }, + { + "epoch": 0.03267467235889669, + "grad_norm": 0.16454364359378815, + "learning_rate": 0.001, + "loss": 2.8921, + "num_input_tokens_seen": 1795686400, + "step": 6850 + }, + { + "epoch": 0.03291317361699083, + "grad_norm": 0.1677432805299759, + "learning_rate": 0.001, + "loss": 2.8777, + "num_input_tokens_seen": 1808793600, + "step": 6900 + }, + { + "epoch": 0.03315167487508497, + "grad_norm": 0.17707760632038116, + "learning_rate": 0.001, + "loss": 2.8791, + "num_input_tokens_seen": 1821900800, + "step": 6950 + }, + { + "epoch": 0.0333901761331791, + "grad_norm": 0.1784796118736267, + "learning_rate": 0.001, + "loss": 2.8642, + "num_input_tokens_seen": 1835008000, + "step": 7000 + }, + { + "epoch": 0.0333901761331791, + "eval_loss": 2.7708475589752197, + "eval_runtime": 50.9058, + "eval_samples_per_second": 98.221, + "eval_steps_per_second": 24.555, + "num_input_tokens_seen": 1835008000, + "step": 7000 + }, + { + "epoch": 0.03362867739127324, + "grad_norm": 0.15859876573085785, + "learning_rate": 0.001, + "loss": 2.8919, + "num_input_tokens_seen": 1848115200, + "step": 7050 + }, + { + "epoch": 0.03386717864936738, + "grad_norm": 0.17061467468738556, + "learning_rate": 0.001, + "loss": 2.868, + "num_input_tokens_seen": 1861222400, + "step": 7100 + }, + { + "epoch": 0.03410567990746151, + "grad_norm": 0.17118851840496063, + "learning_rate": 0.001, + "loss": 2.8677, + "num_input_tokens_seen": 1874329600, + "step": 7150 + }, + { + "epoch": 0.03434418116555565, + "grad_norm": 0.1561940759420395, + "learning_rate": 0.001, + "loss": 2.8701, + "num_input_tokens_seen": 1887436800, + "step": 7200 + }, + { + "epoch": 0.034582682423649785, + "grad_norm": 0.17568449676036835, + "learning_rate": 0.001, + "loss": 2.8652, + "num_input_tokens_seen": 1900544000, + "step": 7250 + }, + { + "epoch": 0.034821183681743924, + "grad_norm": 0.17471665143966675, + "learning_rate": 0.001, + "loss": 2.8614, + "num_input_tokens_seen": 1913651200, + "step": 7300 + }, + { + "epoch": 0.035059684939838055, + "grad_norm": 0.17949970066547394, + "learning_rate": 0.001, + "loss": 2.862, + "num_input_tokens_seen": 1926758400, + "step": 7350 + }, + { + "epoch": 0.035298186197932194, + "grad_norm": 0.17014376819133759, + "learning_rate": 0.001, + "loss": 2.8696, + "num_input_tokens_seen": 1939865600, + "step": 7400 + }, + { + "epoch": 0.03553668745602633, + "grad_norm": 0.166939839720726, + "learning_rate": 0.001, + "loss": 2.8679, + "num_input_tokens_seen": 1952972800, + "step": 7450 + }, + { + "epoch": 0.035775188714120464, + "grad_norm": 0.16403459012508392, + "learning_rate": 0.001, + "loss": 2.8692, + "num_input_tokens_seen": 1966080000, + "step": 7500 + }, + { + "epoch": 0.035775188714120464, + "eval_loss": 2.7581658363342285, + "eval_runtime": 50.614, + "eval_samples_per_second": 98.787, + "eval_steps_per_second": 24.697, + "num_input_tokens_seen": 1966080000, + "step": 7500 + }, + { + "epoch": 0.0360136899722146, + "grad_norm": 0.16664361953735352, + "learning_rate": 0.001, + "loss": 2.8549, + "num_input_tokens_seen": 1979187200, + "step": 7550 + }, + { + "epoch": 0.03625219123030874, + "grad_norm": 0.165015310049057, + "learning_rate": 0.001, + "loss": 2.867, + "num_input_tokens_seen": 1992294400, + "step": 7600 + }, + { + "epoch": 0.03649069248840288, + "grad_norm": 0.17752580344676971, + "learning_rate": 0.001, + "loss": 2.8721, + "num_input_tokens_seen": 2005401600, + "step": 7650 + }, + { + "epoch": 0.03672919374649701, + "grad_norm": 0.1641317456960678, + "learning_rate": 0.001, + "loss": 2.8601, + "num_input_tokens_seen": 2018508800, + "step": 7700 + }, + { + "epoch": 0.03696769500459115, + "grad_norm": 0.1706378310918808, + "learning_rate": 0.001, + "loss": 2.8385, + "num_input_tokens_seen": 2031616000, + "step": 7750 + }, + { + "epoch": 0.03720619626268529, + "grad_norm": 0.18265438079833984, + "learning_rate": 0.001, + "loss": 2.8421, + "num_input_tokens_seen": 2044723200, + "step": 7800 + }, + { + "epoch": 0.037444697520779426, + "grad_norm": 0.17270897328853607, + "learning_rate": 0.001, + "loss": 2.8576, + "num_input_tokens_seen": 2057830400, + "step": 7850 + }, + { + "epoch": 0.03768319877887356, + "grad_norm": 0.17359280586242676, + "learning_rate": 0.001, + "loss": 2.8522, + "num_input_tokens_seen": 2070937600, + "step": 7900 + }, + { + "epoch": 0.037921700036967695, + "grad_norm": 0.1679411083459854, + "learning_rate": 0.001, + "loss": 2.854, + "num_input_tokens_seen": 2084044800, + "step": 7950 + }, + { + "epoch": 0.038160201295061834, + "grad_norm": 0.16735835373401642, + "learning_rate": 0.001, + "loss": 2.8494, + "num_input_tokens_seen": 2097152000, + "step": 8000 + }, + { + "epoch": 0.038160201295061834, + "eval_loss": 2.7443442344665527, + "eval_runtime": 50.3387, + "eval_samples_per_second": 99.327, + "eval_steps_per_second": 24.832, + "num_input_tokens_seen": 2097152000, + "step": 8000 + }, + { + "epoch": 0.038398702553155965, + "grad_norm": 0.16059577465057373, + "learning_rate": 0.001, + "loss": 2.8495, + "num_input_tokens_seen": 2110259200, + "step": 8050 + }, + { + "epoch": 0.038637203811250104, + "grad_norm": 0.1842387169599533, + "learning_rate": 0.001, + "loss": 2.8526, + "num_input_tokens_seen": 2123366400, + "step": 8100 + }, + { + "epoch": 0.03887570506934424, + "grad_norm": 0.15922050178050995, + "learning_rate": 0.001, + "loss": 2.8312, + "num_input_tokens_seen": 2136473600, + "step": 8150 + }, + { + "epoch": 0.03911420632743838, + "grad_norm": 0.16642028093338013, + "learning_rate": 0.001, + "loss": 2.8452, + "num_input_tokens_seen": 2149580800, + "step": 8200 + }, + { + "epoch": 0.03935270758553251, + "grad_norm": 0.16174671053886414, + "learning_rate": 0.001, + "loss": 2.8471, + "num_input_tokens_seen": 2162688000, + "step": 8250 + }, + { + "epoch": 0.03959120884362665, + "grad_norm": 0.16786591708660126, + "learning_rate": 0.001, + "loss": 2.8435, + "num_input_tokens_seen": 2175795200, + "step": 8300 + }, + { + "epoch": 0.03982971010172079, + "grad_norm": 0.17107373476028442, + "learning_rate": 0.001, + "loss": 2.862, + "num_input_tokens_seen": 2188902400, + "step": 8350 + }, + { + "epoch": 0.04006821135981492, + "grad_norm": 0.17952118813991547, + "learning_rate": 0.001, + "loss": 2.8414, + "num_input_tokens_seen": 2202009600, + "step": 8400 + }, + { + "epoch": 0.04030671261790906, + "grad_norm": 0.16836482286453247, + "learning_rate": 0.001, + "loss": 2.8363, + "num_input_tokens_seen": 2215116800, + "step": 8450 + }, + { + "epoch": 0.0405452138760032, + "grad_norm": 0.16812962293624878, + "learning_rate": 0.001, + "loss": 2.844, + "num_input_tokens_seen": 2228224000, + "step": 8500 + }, + { + "epoch": 0.0405452138760032, + "eval_loss": 2.7306976318359375, + "eval_runtime": 50.272, + "eval_samples_per_second": 99.459, + "eval_steps_per_second": 24.865, + "num_input_tokens_seen": 2228224000, + "step": 8500 + }, + { + "epoch": 0.040783715134097336, + "grad_norm": 0.1696135401725769, + "learning_rate": 0.001, + "loss": 2.8406, + "num_input_tokens_seen": 2241331200, + "step": 8550 + }, + { + "epoch": 0.04102221639219147, + "grad_norm": 0.16062459349632263, + "learning_rate": 0.001, + "loss": 2.8453, + "num_input_tokens_seen": 2254438400, + "step": 8600 + }, + { + "epoch": 0.041260717650285605, + "grad_norm": 0.17326433956623077, + "learning_rate": 0.001, + "loss": 2.8449, + "num_input_tokens_seen": 2267545600, + "step": 8650 + }, + { + "epoch": 0.041499218908379744, + "grad_norm": 0.16410672664642334, + "learning_rate": 0.001, + "loss": 2.8412, + "num_input_tokens_seen": 2280652800, + "step": 8700 + }, + { + "epoch": 0.041737720166473875, + "grad_norm": 0.16255012154579163, + "learning_rate": 0.001, + "loss": 2.8524, + "num_input_tokens_seen": 2293760000, + "step": 8750 + }, + { + "epoch": 0.041976221424568014, + "grad_norm": 0.163652241230011, + "learning_rate": 0.001, + "loss": 2.8528, + "num_input_tokens_seen": 2306867200, + "step": 8800 + }, + { + "epoch": 0.04221472268266215, + "grad_norm": 0.15598778426647186, + "learning_rate": 0.001, + "loss": 2.8255, + "num_input_tokens_seen": 2319974400, + "step": 8850 + }, + { + "epoch": 0.04245322394075629, + "grad_norm": 0.1740003079175949, + "learning_rate": 0.001, + "loss": 2.8278, + "num_input_tokens_seen": 2333081600, + "step": 8900 + }, + { + "epoch": 0.04269172519885042, + "grad_norm": 0.17225052416324615, + "learning_rate": 0.001, + "loss": 2.8334, + "num_input_tokens_seen": 2346188800, + "step": 8950 + }, + { + "epoch": 0.04293022645694456, + "grad_norm": 0.18005919456481934, + "learning_rate": 0.001, + "loss": 2.8044, + "num_input_tokens_seen": 2359296000, + "step": 9000 + }, + { + "epoch": 0.04293022645694456, + "eval_loss": 2.7220215797424316, + "eval_runtime": 50.2706, + "eval_samples_per_second": 99.462, + "eval_steps_per_second": 24.865, + "num_input_tokens_seen": 2359296000, + "step": 9000 + }, + { + "epoch": 0.0431687277150387, + "grad_norm": 0.16554109752178192, + "learning_rate": 0.001, + "loss": 2.83, + "num_input_tokens_seen": 2372403200, + "step": 9050 + }, + { + "epoch": 0.04340722897313283, + "grad_norm": 0.17308101058006287, + "learning_rate": 0.001, + "loss": 2.8204, + "num_input_tokens_seen": 2385510400, + "step": 9100 + }, + { + "epoch": 0.04364573023122697, + "grad_norm": 0.16701756417751312, + "learning_rate": 0.001, + "loss": 2.836, + "num_input_tokens_seen": 2398617600, + "step": 9150 + }, + { + "epoch": 0.04388423148932111, + "grad_norm": 0.16220535337924957, + "learning_rate": 0.001, + "loss": 2.8194, + "num_input_tokens_seen": 2411724800, + "step": 9200 + }, + { + "epoch": 0.044122732747415246, + "grad_norm": 0.16643071174621582, + "learning_rate": 0.001, + "loss": 2.8157, + "num_input_tokens_seen": 2424832000, + "step": 9250 + }, + { + "epoch": 0.04436123400550938, + "grad_norm": 0.16293680667877197, + "learning_rate": 0.001, + "loss": 2.8147, + "num_input_tokens_seen": 2437939200, + "step": 9300 + }, + { + "epoch": 0.044599735263603515, + "grad_norm": 0.1914059966802597, + "learning_rate": 0.001, + "loss": 2.8164, + "num_input_tokens_seen": 2451046400, + "step": 9350 + }, + { + "epoch": 0.044838236521697654, + "grad_norm": 0.15867285430431366, + "learning_rate": 0.001, + "loss": 2.8063, + "num_input_tokens_seen": 2464153600, + "step": 9400 + }, + { + "epoch": 0.045076737779791785, + "grad_norm": 0.16319462656974792, + "learning_rate": 0.001, + "loss": 2.8096, + "num_input_tokens_seen": 2477260800, + "step": 9450 + }, + { + "epoch": 0.045315239037885924, + "grad_norm": 0.16578581929206848, + "learning_rate": 0.001, + "loss": 2.8106, + "num_input_tokens_seen": 2490368000, + "step": 9500 + }, + { + "epoch": 0.045315239037885924, + "eval_loss": 2.7105066776275635, + "eval_runtime": 50.0838, + "eval_samples_per_second": 99.833, + "eval_steps_per_second": 24.958, + "num_input_tokens_seen": 2490368000, + "step": 9500 + }, + { + "epoch": 0.04555374029598006, + "grad_norm": 0.17125573754310608, + "learning_rate": 0.001, + "loss": 2.8259, + "num_input_tokens_seen": 2503475200, + "step": 9550 + }, + { + "epoch": 0.0457922415540742, + "grad_norm": 0.1661599725484848, + "learning_rate": 0.001, + "loss": 2.8109, + "num_input_tokens_seen": 2516582400, + "step": 9600 + }, + { + "epoch": 0.04603074281216833, + "grad_norm": 0.16203565895557404, + "learning_rate": 0.001, + "loss": 2.8198, + "num_input_tokens_seen": 2529689600, + "step": 9650 + }, + { + "epoch": 0.04626924407026247, + "grad_norm": 0.1869373619556427, + "learning_rate": 0.001, + "loss": 2.8163, + "num_input_tokens_seen": 2542796800, + "step": 9700 + }, + { + "epoch": 0.04650774532835661, + "grad_norm": 0.17401213943958282, + "learning_rate": 0.001, + "loss": 2.8209, + "num_input_tokens_seen": 2555904000, + "step": 9750 + }, + { + "epoch": 0.04674624658645074, + "grad_norm": 0.15835829079151154, + "learning_rate": 0.001, + "loss": 2.8032, + "num_input_tokens_seen": 2569011200, + "step": 9800 + }, + { + "epoch": 0.04698474784454488, + "grad_norm": 0.16554060578346252, + "learning_rate": 0.001, + "loss": 2.8072, + "num_input_tokens_seen": 2582118400, + "step": 9850 + }, + { + "epoch": 0.04722324910263902, + "grad_norm": 0.16941213607788086, + "learning_rate": 0.001, + "loss": 2.8076, + "num_input_tokens_seen": 2595225600, + "step": 9900 + }, + { + "epoch": 0.047461750360733156, + "grad_norm": 0.16324704885482788, + "learning_rate": 0.001, + "loss": 2.8097, + "num_input_tokens_seen": 2608332800, + "step": 9950 + }, + { + "epoch": 0.04770025161882729, + "grad_norm": 0.16865754127502441, + "learning_rate": 0.001, + "loss": 2.8051, + "num_input_tokens_seen": 2621440000, + "step": 10000 + }, + { + "epoch": 0.04770025161882729, + "eval_loss": 2.7000486850738525, + "eval_runtime": 50.3365, + "eval_samples_per_second": 99.331, + "eval_steps_per_second": 24.833, + "num_input_tokens_seen": 2621440000, + "step": 10000 + }, + { + "epoch": 0.047938752876921426, + "grad_norm": 0.17076526582241058, + "learning_rate": 0.001, + "loss": 2.8148, + "num_input_tokens_seen": 2634547200, + "step": 10050 + }, + { + "epoch": 0.048177254135015564, + "grad_norm": 0.1610497534275055, + "learning_rate": 0.001, + "loss": 2.8012, + "num_input_tokens_seen": 2647654400, + "step": 10100 + }, + { + "epoch": 0.048415755393109695, + "grad_norm": 0.15984536707401276, + "learning_rate": 0.001, + "loss": 2.8086, + "num_input_tokens_seen": 2660761600, + "step": 10150 + }, + { + "epoch": 0.048654256651203834, + "grad_norm": 0.21775834262371063, + "learning_rate": 0.001, + "loss": 2.8021, + "num_input_tokens_seen": 2673868800, + "step": 10200 + }, + { + "epoch": 0.04889275790929797, + "grad_norm": 0.1841157227754593, + "learning_rate": 0.001, + "loss": 2.8152, + "num_input_tokens_seen": 2686976000, + "step": 10250 + }, + { + "epoch": 0.04913125916739211, + "grad_norm": 0.17025424540042877, + "learning_rate": 0.001, + "loss": 2.8131, + "num_input_tokens_seen": 2700083200, + "step": 10300 + }, + { + "epoch": 0.04936976042548624, + "grad_norm": 0.1992417722940445, + "learning_rate": 0.001, + "loss": 2.8039, + "num_input_tokens_seen": 2713190400, + "step": 10350 + }, + { + "epoch": 0.04960826168358038, + "grad_norm": 0.1680469959974289, + "learning_rate": 0.001, + "loss": 2.7921, + "num_input_tokens_seen": 2726297600, + "step": 10400 + }, + { + "epoch": 0.04984676294167452, + "grad_norm": 0.18296252191066742, + "learning_rate": 0.001, + "loss": 2.8036, + "num_input_tokens_seen": 2739404800, + "step": 10450 + }, + { + "epoch": 0.05008526419976865, + "grad_norm": 0.16041898727416992, + "learning_rate": 0.001, + "loss": 2.7979, + "num_input_tokens_seen": 2752512000, + "step": 10500 + }, + { + "epoch": 0.05008526419976865, + "eval_loss": 2.6893723011016846, + "eval_runtime": 50.642, + "eval_samples_per_second": 98.732, + "eval_steps_per_second": 24.683, + "num_input_tokens_seen": 2752512000, + "step": 10500 + }, + { + "epoch": 0.05032376545786279, + "grad_norm": 0.16704030334949493, + "learning_rate": 0.001, + "loss": 2.79, + "num_input_tokens_seen": 2765619200, + "step": 10550 + }, + { + "epoch": 0.05056226671595693, + "grad_norm": 0.16553758084774017, + "learning_rate": 0.001, + "loss": 2.7964, + "num_input_tokens_seen": 2778726400, + "step": 10600 + }, + { + "epoch": 0.050800767974051066, + "grad_norm": 0.16027161478996277, + "learning_rate": 0.001, + "loss": 2.7937, + "num_input_tokens_seen": 2791833600, + "step": 10650 + }, + { + "epoch": 0.0510392692321452, + "grad_norm": 0.16177843511104584, + "learning_rate": 0.001, + "loss": 2.7957, + "num_input_tokens_seen": 2804940800, + "step": 10700 + }, + { + "epoch": 0.051277770490239336, + "grad_norm": 0.16713912785053253, + "learning_rate": 0.001, + "loss": 2.7949, + "num_input_tokens_seen": 2818048000, + "step": 10750 + }, + { + "epoch": 0.051516271748333474, + "grad_norm": 0.1815747618675232, + "learning_rate": 0.001, + "loss": 2.7915, + "num_input_tokens_seen": 2831155200, + "step": 10800 + }, + { + "epoch": 0.05175477300642761, + "grad_norm": 0.16732683777809143, + "learning_rate": 0.001, + "loss": 2.7994, + "num_input_tokens_seen": 2844262400, + "step": 10850 + }, + { + "epoch": 0.051993274264521744, + "grad_norm": 0.18305908143520355, + "learning_rate": 0.001, + "loss": 2.7888, + "num_input_tokens_seen": 2857369600, + "step": 10900 + }, + { + "epoch": 0.05223177552261588, + "grad_norm": 0.16450954973697662, + "learning_rate": 0.001, + "loss": 2.7834, + "num_input_tokens_seen": 2870476800, + "step": 10950 + }, + { + "epoch": 0.05247027678071002, + "grad_norm": 0.16485372185707092, + "learning_rate": 0.001, + "loss": 2.7976, + "num_input_tokens_seen": 2883584000, + "step": 11000 + }, + { + "epoch": 0.05247027678071002, + "eval_loss": 2.6825835704803467, + "eval_runtime": 50.1016, + "eval_samples_per_second": 99.797, + "eval_steps_per_second": 24.949, + "num_input_tokens_seen": 2883584000, + "step": 11000 + }, + { + "epoch": 0.05270877803880415, + "grad_norm": 0.1733204573392868, + "learning_rate": 0.001, + "loss": 2.7959, + "num_input_tokens_seen": 2896691200, + "step": 11050 + }, + { + "epoch": 0.05294727929689829, + "grad_norm": 0.16432546079158783, + "learning_rate": 0.001, + "loss": 2.793, + "num_input_tokens_seen": 2909798400, + "step": 11100 + }, + { + "epoch": 0.05318578055499243, + "grad_norm": 0.18369582295417786, + "learning_rate": 0.001, + "loss": 2.8149, + "num_input_tokens_seen": 2922905600, + "step": 11150 + }, + { + "epoch": 0.05342428181308657, + "grad_norm": 0.17782896757125854, + "learning_rate": 0.001, + "loss": 2.7878, + "num_input_tokens_seen": 2936012800, + "step": 11200 + }, + { + "epoch": 0.0536627830711807, + "grad_norm": 0.18320836126804352, + "learning_rate": 0.001, + "loss": 2.8159, + "num_input_tokens_seen": 2949120000, + "step": 11250 + }, + { + "epoch": 0.05390128432927484, + "grad_norm": 0.1667925864458084, + "learning_rate": 0.001, + "loss": 2.795, + "num_input_tokens_seen": 2962227200, + "step": 11300 + }, + { + "epoch": 0.054139785587368976, + "grad_norm": 0.19831301271915436, + "learning_rate": 0.001, + "loss": 2.7907, + "num_input_tokens_seen": 2975334400, + "step": 11350 + }, + { + "epoch": 0.05437828684546311, + "grad_norm": 0.1610182225704193, + "learning_rate": 0.001, + "loss": 2.774, + "num_input_tokens_seen": 2988441600, + "step": 11400 + }, + { + "epoch": 0.054616788103557246, + "grad_norm": 0.15938150882720947, + "learning_rate": 0.001, + "loss": 2.7766, + "num_input_tokens_seen": 3001548800, + "step": 11450 + }, + { + "epoch": 0.054855289361651384, + "grad_norm": 0.15737415850162506, + "learning_rate": 0.001, + "loss": 2.783, + "num_input_tokens_seen": 3014656000, + "step": 11500 + }, + { + "epoch": 0.054855289361651384, + "eval_loss": 2.6739256381988525, + "eval_runtime": 51.2462, + "eval_samples_per_second": 97.568, + "eval_steps_per_second": 24.392, + "num_input_tokens_seen": 3014656000, + "step": 11500 + }, + { + "epoch": 0.05509379061974552, + "grad_norm": 0.16538532078266144, + "learning_rate": 0.001, + "loss": 2.7966, + "num_input_tokens_seen": 3027763200, + "step": 11550 + }, + { + "epoch": 0.055332291877839654, + "grad_norm": 0.18035660684108734, + "learning_rate": 0.001, + "loss": 2.7789, + "num_input_tokens_seen": 3040870400, + "step": 11600 + }, + { + "epoch": 0.05557079313593379, + "grad_norm": 0.17831085622310638, + "learning_rate": 0.001, + "loss": 2.7962, + "num_input_tokens_seen": 3053977600, + "step": 11650 + }, + { + "epoch": 0.05580929439402793, + "grad_norm": 0.17723870277404785, + "learning_rate": 0.001, + "loss": 2.7791, + "num_input_tokens_seen": 3067084800, + "step": 11700 + }, + { + "epoch": 0.05604779565212206, + "grad_norm": 0.17663581669330597, + "learning_rate": 0.001, + "loss": 2.7696, + "num_input_tokens_seen": 3080192000, + "step": 11750 + }, + { + "epoch": 0.0562862969102162, + "grad_norm": 0.16684900224208832, + "learning_rate": 0.001, + "loss": 2.7762, + "num_input_tokens_seen": 3093299200, + "step": 11800 + }, + { + "epoch": 0.05652479816831034, + "grad_norm": 0.17407995462417603, + "learning_rate": 0.001, + "loss": 2.7767, + "num_input_tokens_seen": 3106406400, + "step": 11850 + }, + { + "epoch": 0.05676329942640448, + "grad_norm": 0.1750691831111908, + "learning_rate": 0.001, + "loss": 2.7785, + "num_input_tokens_seen": 3119513600, + "step": 11900 + }, + { + "epoch": 0.05700180068449861, + "grad_norm": 0.16576959192752838, + "learning_rate": 0.001, + "loss": 2.773, + "num_input_tokens_seen": 3132620800, + "step": 11950 + }, + { + "epoch": 0.05724030194259275, + "grad_norm": 0.16957831382751465, + "learning_rate": 0.001, + "loss": 2.7781, + "num_input_tokens_seen": 3145728000, + "step": 12000 + }, + { + "epoch": 0.05724030194259275, + "eval_loss": 2.6683335304260254, + "eval_runtime": 50.6428, + "eval_samples_per_second": 98.731, + "eval_steps_per_second": 24.683, + "num_input_tokens_seen": 3145728000, + "step": 12000 + }, + { + "epoch": 0.057478803200686886, + "grad_norm": 0.1645338237285614, + "learning_rate": 0.001, + "loss": 2.7709, + "num_input_tokens_seen": 3158835200, + "step": 12050 + }, + { + "epoch": 0.05771730445878102, + "grad_norm": 0.15848694741725922, + "learning_rate": 0.001, + "loss": 2.7849, + "num_input_tokens_seen": 3171942400, + "step": 12100 + }, + { + "epoch": 0.057955805716875156, + "grad_norm": 0.20003071427345276, + "learning_rate": 0.001, + "loss": 2.7691, + "num_input_tokens_seen": 3185049600, + "step": 12150 + }, + { + "epoch": 0.058194306974969294, + "grad_norm": 0.19301050901412964, + "learning_rate": 0.001, + "loss": 2.7811, + "num_input_tokens_seen": 3198156800, + "step": 12200 + }, + { + "epoch": 0.05843280823306343, + "grad_norm": 0.171390563249588, + "learning_rate": 0.001, + "loss": 2.7712, + "num_input_tokens_seen": 3211264000, + "step": 12250 + }, + { + "epoch": 0.058671309491157564, + "grad_norm": 0.1654270589351654, + "learning_rate": 0.001, + "loss": 2.7788, + "num_input_tokens_seen": 3224371200, + "step": 12300 + }, + { + "epoch": 0.0589098107492517, + "grad_norm": 0.16559672355651855, + "learning_rate": 0.001, + "loss": 2.7839, + "num_input_tokens_seen": 3237478400, + "step": 12350 + }, + { + "epoch": 0.05914831200734584, + "grad_norm": 0.16773344576358795, + "learning_rate": 0.001, + "loss": 2.7896, + "num_input_tokens_seen": 3250585600, + "step": 12400 + }, + { + "epoch": 0.05938681326543997, + "grad_norm": 0.1639021933078766, + "learning_rate": 0.001, + "loss": 2.7704, + "num_input_tokens_seen": 3263692800, + "step": 12450 + }, + { + "epoch": 0.05962531452353411, + "grad_norm": 0.15584540367126465, + "learning_rate": 0.001, + "loss": 2.7687, + "num_input_tokens_seen": 3276800000, + "step": 12500 + }, + { + "epoch": 0.05962531452353411, + "eval_loss": 2.6606011390686035, + "eval_runtime": 51.1636, + "eval_samples_per_second": 97.726, + "eval_steps_per_second": 24.431, + "num_input_tokens_seen": 3276800000, + "step": 12500 + }, + { + "epoch": 0.05986381578162825, + "grad_norm": 0.18144413828849792, + "learning_rate": 0.001, + "loss": 2.7711, + "num_input_tokens_seen": 3289907200, + "step": 12550 + }, + { + "epoch": 0.06010231703972239, + "grad_norm": 0.18225054442882538, + "learning_rate": 0.001, + "loss": 2.7675, + "num_input_tokens_seen": 3303014400, + "step": 12600 + }, + { + "epoch": 0.06034081829781652, + "grad_norm": 0.16542398929595947, + "learning_rate": 0.001, + "loss": 2.7563, + "num_input_tokens_seen": 3316121600, + "step": 12650 + }, + { + "epoch": 0.06057931955591066, + "grad_norm": 0.1765596568584442, + "learning_rate": 0.001, + "loss": 2.7807, + "num_input_tokens_seen": 3329228800, + "step": 12700 + }, + { + "epoch": 0.060817820814004796, + "grad_norm": 0.17469234764575958, + "learning_rate": 0.001, + "loss": 2.7532, + "num_input_tokens_seen": 3342336000, + "step": 12750 + }, + { + "epoch": 0.06105632207209893, + "grad_norm": 0.1841767281293869, + "learning_rate": 0.001, + "loss": 2.7824, + "num_input_tokens_seen": 3355443200, + "step": 12800 + }, + { + "epoch": 0.061294823330193066, + "grad_norm": 0.1667831838130951, + "learning_rate": 0.001, + "loss": 2.7648, + "num_input_tokens_seen": 3368550400, + "step": 12850 + }, + { + "epoch": 0.061533324588287204, + "grad_norm": 0.16561101377010345, + "learning_rate": 0.001, + "loss": 2.7798, + "num_input_tokens_seen": 3381657600, + "step": 12900 + }, + { + "epoch": 0.06177182584638134, + "grad_norm": 0.17370566725730896, + "learning_rate": 0.001, + "loss": 2.7755, + "num_input_tokens_seen": 3394764800, + "step": 12950 + }, + { + "epoch": 0.062010327104475474, + "grad_norm": 0.16871176660060883, + "learning_rate": 0.001, + "loss": 2.7676, + "num_input_tokens_seen": 3407872000, + "step": 13000 + }, + { + "epoch": 0.062010327104475474, + "eval_loss": 2.653367757797241, + "eval_runtime": 50.8399, + "eval_samples_per_second": 98.348, + "eval_steps_per_second": 24.587, + "num_input_tokens_seen": 3407872000, + "step": 13000 + }, + { + "epoch": 0.06224882836256961, + "grad_norm": 0.17592230439186096, + "learning_rate": 0.001, + "loss": 2.7639, + "num_input_tokens_seen": 3420979200, + "step": 13050 + }, + { + "epoch": 0.06248732962066375, + "grad_norm": 0.1640375405550003, + "learning_rate": 0.001, + "loss": 2.7785, + "num_input_tokens_seen": 3434086400, + "step": 13100 + }, + { + "epoch": 0.06272583087875788, + "grad_norm": 0.16389331221580505, + "learning_rate": 0.001, + "loss": 2.7475, + "num_input_tokens_seen": 3447193600, + "step": 13150 + }, + { + "epoch": 0.06296433213685203, + "grad_norm": 0.1733655482530594, + "learning_rate": 0.001, + "loss": 2.7538, + "num_input_tokens_seen": 3460300800, + "step": 13200 + }, + { + "epoch": 0.06320283339494616, + "grad_norm": 0.19206473231315613, + "learning_rate": 0.001, + "loss": 2.7819, + "num_input_tokens_seen": 3473408000, + "step": 13250 + }, + { + "epoch": 0.06344133465304029, + "grad_norm": 0.1841450184583664, + "learning_rate": 0.001, + "loss": 2.7701, + "num_input_tokens_seen": 3486515200, + "step": 13300 + }, + { + "epoch": 0.06367983591113444, + "grad_norm": 0.1701631247997284, + "learning_rate": 0.001, + "loss": 2.7587, + "num_input_tokens_seen": 3499622400, + "step": 13350 + }, + { + "epoch": 0.06391833716922857, + "grad_norm": 0.17068499326705933, + "learning_rate": 0.001, + "loss": 2.7589, + "num_input_tokens_seen": 3512729600, + "step": 13400 + }, + { + "epoch": 0.0641568384273227, + "grad_norm": 0.17927715182304382, + "learning_rate": 0.001, + "loss": 2.764, + "num_input_tokens_seen": 3525836800, + "step": 13450 + }, + { + "epoch": 0.06439533968541684, + "grad_norm": 0.19105768203735352, + "learning_rate": 0.001, + "loss": 2.7593, + "num_input_tokens_seen": 3538944000, + "step": 13500 + }, + { + "epoch": 0.06439533968541684, + "eval_loss": 2.6473631858825684, + "eval_runtime": 51.0846, + "eval_samples_per_second": 97.877, + "eval_steps_per_second": 24.469, + "num_input_tokens_seen": 3538944000, + "step": 13500 + }, + { + "epoch": 0.06463384094351098, + "grad_norm": 0.17262668907642365, + "learning_rate": 0.001, + "loss": 2.7522, + "num_input_tokens_seen": 3552051200, + "step": 13550 + }, + { + "epoch": 0.06487234220160511, + "grad_norm": 0.16810455918312073, + "learning_rate": 0.001, + "loss": 2.7664, + "num_input_tokens_seen": 3565158400, + "step": 13600 + }, + { + "epoch": 0.06511084345969925, + "grad_norm": 0.17312487959861755, + "learning_rate": 0.001, + "loss": 2.7557, + "num_input_tokens_seen": 3578265600, + "step": 13650 + }, + { + "epoch": 0.06534934471779338, + "grad_norm": 0.16985322535037994, + "learning_rate": 0.001, + "loss": 2.7449, + "num_input_tokens_seen": 3591372800, + "step": 13700 + }, + { + "epoch": 0.06558784597588753, + "grad_norm": 0.1812393069267273, + "learning_rate": 0.001, + "loss": 2.749, + "num_input_tokens_seen": 3604480000, + "step": 13750 + }, + { + "epoch": 0.06582634723398166, + "grad_norm": 0.183237224817276, + "learning_rate": 0.001, + "loss": 2.7637, + "num_input_tokens_seen": 3617587200, + "step": 13800 + }, + { + "epoch": 0.06606484849207579, + "grad_norm": 0.17770566046237946, + "learning_rate": 0.001, + "loss": 2.7602, + "num_input_tokens_seen": 3630694400, + "step": 13850 + }, + { + "epoch": 0.06630334975016994, + "grad_norm": 0.1678437739610672, + "learning_rate": 0.001, + "loss": 2.76, + "num_input_tokens_seen": 3643801600, + "step": 13900 + }, + { + "epoch": 0.06654185100826407, + "grad_norm": 0.16213107109069824, + "learning_rate": 0.001, + "loss": 2.7467, + "num_input_tokens_seen": 3656908800, + "step": 13950 + }, + { + "epoch": 0.0667803522663582, + "grad_norm": 0.17652907967567444, + "learning_rate": 0.001, + "loss": 2.7516, + "num_input_tokens_seen": 3670016000, + "step": 14000 + }, + { + "epoch": 0.0667803522663582, + "eval_loss": 2.6438815593719482, + "eval_runtime": 50.3233, + "eval_samples_per_second": 99.358, + "eval_steps_per_second": 24.839, + "num_input_tokens_seen": 3670016000, + "step": 14000 + }, + { + "epoch": 0.06701885352445235, + "grad_norm": 0.1785530298948288, + "learning_rate": 0.001, + "loss": 2.7475, + "num_input_tokens_seen": 3683123200, + "step": 14050 + }, + { + "epoch": 0.06725735478254648, + "grad_norm": 0.15644113719463348, + "learning_rate": 0.001, + "loss": 2.7541, + "num_input_tokens_seen": 3696230400, + "step": 14100 + }, + { + "epoch": 0.06749585604064061, + "grad_norm": 0.183272585272789, + "learning_rate": 0.001, + "loss": 2.7513, + "num_input_tokens_seen": 3709337600, + "step": 14150 + }, + { + "epoch": 0.06773435729873475, + "grad_norm": 0.17523212730884552, + "learning_rate": 0.001, + "loss": 2.7649, + "num_input_tokens_seen": 3722444800, + "step": 14200 + }, + { + "epoch": 0.06797285855682889, + "grad_norm": 0.1778247356414795, + "learning_rate": 0.001, + "loss": 2.7457, + "num_input_tokens_seen": 3735552000, + "step": 14250 + }, + { + "epoch": 0.06821135981492302, + "grad_norm": 0.18277810513973236, + "learning_rate": 0.001, + "loss": 2.7477, + "num_input_tokens_seen": 3748659200, + "step": 14300 + }, + { + "epoch": 0.06844986107301716, + "grad_norm": 0.17541366815567017, + "learning_rate": 0.001, + "loss": 2.7419, + "num_input_tokens_seen": 3761766400, + "step": 14350 + }, + { + "epoch": 0.0686883623311113, + "grad_norm": 0.1701425164937973, + "learning_rate": 0.001, + "loss": 2.7437, + "num_input_tokens_seen": 3774873600, + "step": 14400 + }, + { + "epoch": 0.06892686358920544, + "grad_norm": 0.16685517132282257, + "learning_rate": 0.001, + "loss": 2.7357, + "num_input_tokens_seen": 3787980800, + "step": 14450 + }, + { + "epoch": 0.06916536484729957, + "grad_norm": 0.1738167405128479, + "learning_rate": 0.001, + "loss": 2.7475, + "num_input_tokens_seen": 3801088000, + "step": 14500 + }, + { + "epoch": 0.06916536484729957, + "eval_loss": 2.635887622833252, + "eval_runtime": 50.4516, + "eval_samples_per_second": 99.105, + "eval_steps_per_second": 24.776, + "num_input_tokens_seen": 3801088000, + "step": 14500 + }, + { + "epoch": 0.0694038661053937, + "grad_norm": 0.18279027938842773, + "learning_rate": 0.001, + "loss": 2.7521, + "num_input_tokens_seen": 3814195200, + "step": 14550 + }, + { + "epoch": 0.06964236736348785, + "grad_norm": 0.1878173053264618, + "learning_rate": 0.001, + "loss": 2.7401, + "num_input_tokens_seen": 3827302400, + "step": 14600 + }, + { + "epoch": 0.06988086862158198, + "grad_norm": 0.17670077085494995, + "learning_rate": 0.001, + "loss": 2.7513, + "num_input_tokens_seen": 3840409600, + "step": 14650 + }, + { + "epoch": 0.07011936987967611, + "grad_norm": 0.17042580246925354, + "learning_rate": 0.001, + "loss": 2.7383, + "num_input_tokens_seen": 3853516800, + "step": 14700 + }, + { + "epoch": 0.07035787113777026, + "grad_norm": 0.17193050682544708, + "learning_rate": 0.001, + "loss": 2.7408, + "num_input_tokens_seen": 3866624000, + "step": 14750 + }, + { + "epoch": 0.07059637239586439, + "grad_norm": 0.16576342284679413, + "learning_rate": 0.001, + "loss": 2.7312, + "num_input_tokens_seen": 3879731200, + "step": 14800 + }, + { + "epoch": 0.07083487365395852, + "grad_norm": 0.18535619974136353, + "learning_rate": 0.001, + "loss": 2.756, + "num_input_tokens_seen": 3892838400, + "step": 14850 + }, + { + "epoch": 0.07107337491205266, + "grad_norm": 0.1729886531829834, + "learning_rate": 0.001, + "loss": 2.751, + "num_input_tokens_seen": 3905945600, + "step": 14900 + }, + { + "epoch": 0.0713118761701468, + "grad_norm": 0.16047705709934235, + "learning_rate": 0.001, + "loss": 2.7361, + "num_input_tokens_seen": 3919052800, + "step": 14950 + }, + { + "epoch": 0.07155037742824093, + "grad_norm": 0.17655611038208008, + "learning_rate": 0.001, + "loss": 2.7471, + "num_input_tokens_seen": 3932160000, + "step": 15000 + }, + { + "epoch": 0.07155037742824093, + "eval_loss": 2.6311256885528564, + "eval_runtime": 51.0361, + "eval_samples_per_second": 97.97, + "eval_steps_per_second": 24.492, + "num_input_tokens_seen": 3932160000, + "step": 15000 + }, + { + "epoch": 0.07178887868633507, + "grad_norm": 0.19243250787258148, + "learning_rate": 0.001, + "loss": 2.7551, + "num_input_tokens_seen": 3945267200, + "step": 15050 + }, + { + "epoch": 0.0720273799444292, + "grad_norm": 0.17328651249408722, + "learning_rate": 0.001, + "loss": 2.7346, + "num_input_tokens_seen": 3958374400, + "step": 15100 + }, + { + "epoch": 0.07226588120252335, + "grad_norm": 0.16357752680778503, + "learning_rate": 0.001, + "loss": 2.7523, + "num_input_tokens_seen": 3971481600, + "step": 15150 + }, + { + "epoch": 0.07250438246061748, + "grad_norm": 0.1726733148097992, + "learning_rate": 0.001, + "loss": 2.725, + "num_input_tokens_seen": 3984588800, + "step": 15200 + }, + { + "epoch": 0.07274288371871161, + "grad_norm": 0.16912953555583954, + "learning_rate": 0.001, + "loss": 2.738, + "num_input_tokens_seen": 3997696000, + "step": 15250 + }, + { + "epoch": 0.07298138497680576, + "grad_norm": 0.19751113653182983, + "learning_rate": 0.001, + "loss": 2.7532, + "num_input_tokens_seen": 4010803200, + "step": 15300 + }, + { + "epoch": 0.07321988623489989, + "grad_norm": 0.16762405633926392, + "learning_rate": 0.001, + "loss": 2.7413, + "num_input_tokens_seen": 4023910400, + "step": 15350 + }, + { + "epoch": 0.07345838749299402, + "grad_norm": 0.18106459081172943, + "learning_rate": 0.001, + "loss": 2.7411, + "num_input_tokens_seen": 4037017600, + "step": 15400 + }, + { + "epoch": 0.07369688875108817, + "grad_norm": 0.184165820479393, + "learning_rate": 0.001, + "loss": 2.7449, + "num_input_tokens_seen": 4050124800, + "step": 15450 + }, + { + "epoch": 0.0739353900091823, + "grad_norm": 0.16832765936851501, + "learning_rate": 0.001, + "loss": 2.7442, + "num_input_tokens_seen": 4063232000, + "step": 15500 + }, + { + "epoch": 0.0739353900091823, + "eval_loss": 2.6253247261047363, + "eval_runtime": 50.6964, + "eval_samples_per_second": 98.626, + "eval_steps_per_second": 24.657, + "num_input_tokens_seen": 4063232000, + "step": 15500 + }, + { + "epoch": 0.07417389126727643, + "grad_norm": 0.1663861721754074, + "learning_rate": 0.001, + "loss": 2.7461, + "num_input_tokens_seen": 4076339200, + "step": 15550 + }, + { + "epoch": 0.07441239252537057, + "grad_norm": 0.17217928171157837, + "learning_rate": 0.001, + "loss": 2.7394, + "num_input_tokens_seen": 4089446400, + "step": 15600 + }, + { + "epoch": 0.0746508937834647, + "grad_norm": 0.17169134318828583, + "learning_rate": 0.001, + "loss": 2.7474, + "num_input_tokens_seen": 4102553600, + "step": 15650 + }, + { + "epoch": 0.07488939504155885, + "grad_norm": 0.17074033617973328, + "learning_rate": 0.001, + "loss": 2.7405, + "num_input_tokens_seen": 4115660800, + "step": 15700 + }, + { + "epoch": 0.07512789629965298, + "grad_norm": 0.20199435949325562, + "learning_rate": 0.001, + "loss": 2.7412, + "num_input_tokens_seen": 4128768000, + "step": 15750 + }, + { + "epoch": 0.07536639755774711, + "grad_norm": 0.17569150030612946, + "learning_rate": 0.001, + "loss": 2.7279, + "num_input_tokens_seen": 4141875200, + "step": 15800 + }, + { + "epoch": 0.07560489881584126, + "grad_norm": 0.1753721386194229, + "learning_rate": 0.001, + "loss": 2.7442, + "num_input_tokens_seen": 4154982400, + "step": 15850 + }, + { + "epoch": 0.07584340007393539, + "grad_norm": 0.17356647551059723, + "learning_rate": 0.001, + "loss": 2.7447, + "num_input_tokens_seen": 4168089600, + "step": 15900 + }, + { + "epoch": 0.07608190133202952, + "grad_norm": 0.16931213438510895, + "learning_rate": 0.001, + "loss": 2.7194, + "num_input_tokens_seen": 4181196800, + "step": 15950 + }, + { + "epoch": 0.07632040259012367, + "grad_norm": 0.2109583616256714, + "learning_rate": 0.001, + "loss": 2.7271, + "num_input_tokens_seen": 4194304000, + "step": 16000 + }, + { + "epoch": 0.07632040259012367, + "eval_loss": 2.6222195625305176, + "eval_runtime": 50.2121, + "eval_samples_per_second": 99.578, + "eval_steps_per_second": 24.894, + "num_input_tokens_seen": 4194304000, + "step": 16000 + }, + { + "epoch": 0.0765589038482178, + "grad_norm": 0.1729741096496582, + "learning_rate": 0.001, + "loss": 2.7273, + "num_input_tokens_seen": 4207411200, + "step": 16050 + }, + { + "epoch": 0.07679740510631193, + "grad_norm": 0.178414985537529, + "learning_rate": 0.001, + "loss": 2.7119, + "num_input_tokens_seen": 4220518400, + "step": 16100 + }, + { + "epoch": 0.07703590636440608, + "grad_norm": 0.16985353827476501, + "learning_rate": 0.001, + "loss": 2.7329, + "num_input_tokens_seen": 4233625600, + "step": 16150 + }, + { + "epoch": 0.07727440762250021, + "grad_norm": 0.1792905330657959, + "learning_rate": 0.001, + "loss": 2.7331, + "num_input_tokens_seen": 4246732800, + "step": 16200 + }, + { + "epoch": 0.07751290888059434, + "grad_norm": 0.17052733898162842, + "learning_rate": 0.001, + "loss": 2.7438, + "num_input_tokens_seen": 4259840000, + "step": 16250 + }, + { + "epoch": 0.07775141013868848, + "grad_norm": 0.18520629405975342, + "learning_rate": 0.001, + "loss": 2.7292, + "num_input_tokens_seen": 4272947200, + "step": 16300 + }, + { + "epoch": 0.07798991139678262, + "grad_norm": 0.18607158958911896, + "learning_rate": 0.001, + "loss": 2.7305, + "num_input_tokens_seen": 4286054400, + "step": 16350 + }, + { + "epoch": 0.07822841265487676, + "grad_norm": 0.1774805337190628, + "learning_rate": 0.001, + "loss": 2.7237, + "num_input_tokens_seen": 4299161600, + "step": 16400 + }, + { + "epoch": 0.07846691391297089, + "grad_norm": 0.17118123173713684, + "learning_rate": 0.001, + "loss": 2.736, + "num_input_tokens_seen": 4312268800, + "step": 16450 + }, + { + "epoch": 0.07870541517106502, + "grad_norm": 0.18550898134708405, + "learning_rate": 0.001, + "loss": 2.7237, + "num_input_tokens_seen": 4325376000, + "step": 16500 + }, + { + "epoch": 0.07870541517106502, + "eval_loss": 2.6178503036499023, + "eval_runtime": 50.4959, + "eval_samples_per_second": 99.018, + "eval_steps_per_second": 24.754, + "num_input_tokens_seen": 4325376000, + "step": 16500 + }, + { + "epoch": 0.07894391642915917, + "grad_norm": 0.1839551031589508, + "learning_rate": 0.001, + "loss": 2.7312, + "num_input_tokens_seen": 4338483200, + "step": 16550 + }, + { + "epoch": 0.0791824176872533, + "grad_norm": 0.17430303990840912, + "learning_rate": 0.001, + "loss": 2.7138, + "num_input_tokens_seen": 4351590400, + "step": 16600 + }, + { + "epoch": 0.07942091894534743, + "grad_norm": 0.17208248376846313, + "learning_rate": 0.001, + "loss": 2.7459, + "num_input_tokens_seen": 4364697600, + "step": 16650 + }, + { + "epoch": 0.07965942020344158, + "grad_norm": 0.16932401061058044, + "learning_rate": 0.001, + "loss": 2.7358, + "num_input_tokens_seen": 4377804800, + "step": 16700 + }, + { + "epoch": 0.07989792146153571, + "grad_norm": 0.17707890272140503, + "learning_rate": 0.001, + "loss": 2.7169, + "num_input_tokens_seen": 4390912000, + "step": 16750 + }, + { + "epoch": 0.08013642271962984, + "grad_norm": 0.1669357717037201, + "learning_rate": 0.001, + "loss": 2.7296, + "num_input_tokens_seen": 4404019200, + "step": 16800 + }, + { + "epoch": 0.08037492397772399, + "grad_norm": 0.19266557693481445, + "learning_rate": 0.001, + "loss": 2.7187, + "num_input_tokens_seen": 4417126400, + "step": 16850 + }, + { + "epoch": 0.08061342523581812, + "grad_norm": 0.17670407891273499, + "learning_rate": 0.001, + "loss": 2.7339, + "num_input_tokens_seen": 4430233600, + "step": 16900 + }, + { + "epoch": 0.08085192649391225, + "grad_norm": 0.17866192758083344, + "learning_rate": 0.001, + "loss": 2.7167, + "num_input_tokens_seen": 4443340800, + "step": 16950 + }, + { + "epoch": 0.0810904277520064, + "grad_norm": 0.18247559666633606, + "learning_rate": 0.001, + "loss": 2.7151, + "num_input_tokens_seen": 4456448000, + "step": 17000 + }, + { + "epoch": 0.0810904277520064, + "eval_loss": 2.6127233505249023, + "eval_runtime": 50.7555, + "eval_samples_per_second": 98.512, + "eval_steps_per_second": 24.628, + "num_input_tokens_seen": 4456448000, + "step": 17000 + }, + { + "epoch": 0.08132892901010053, + "grad_norm": 0.17702773213386536, + "learning_rate": 0.001, + "loss": 2.7303, + "num_input_tokens_seen": 4469555200, + "step": 17050 + }, + { + "epoch": 0.08156743026819467, + "grad_norm": 0.18900151550769806, + "learning_rate": 0.001, + "loss": 2.7286, + "num_input_tokens_seen": 4482662400, + "step": 17100 + }, + { + "epoch": 0.0818059315262888, + "grad_norm": 0.18566136062145233, + "learning_rate": 0.001, + "loss": 2.7304, + "num_input_tokens_seen": 4495769600, + "step": 17150 + }, + { + "epoch": 0.08204443278438293, + "grad_norm": 0.1759686917066574, + "learning_rate": 0.001, + "loss": 2.7179, + "num_input_tokens_seen": 4508876800, + "step": 17200 + }, + { + "epoch": 0.08228293404247708, + "grad_norm": 0.15799184143543243, + "learning_rate": 0.001, + "loss": 2.7367, + "num_input_tokens_seen": 4521984000, + "step": 17250 + }, + { + "epoch": 0.08252143530057121, + "grad_norm": 0.18740351498126984, + "learning_rate": 0.001, + "loss": 2.72, + "num_input_tokens_seen": 4535091200, + "step": 17300 + }, + { + "epoch": 0.08275993655866534, + "grad_norm": 0.17688381671905518, + "learning_rate": 0.001, + "loss": 2.7115, + "num_input_tokens_seen": 4548198400, + "step": 17350 + }, + { + "epoch": 0.08299843781675949, + "grad_norm": 0.1807299256324768, + "learning_rate": 0.001, + "loss": 2.7346, + "num_input_tokens_seen": 4561305600, + "step": 17400 + }, + { + "epoch": 0.08323693907485362, + "grad_norm": 0.17570430040359497, + "learning_rate": 0.001, + "loss": 2.7141, + "num_input_tokens_seen": 4574412800, + "step": 17450 + }, + { + "epoch": 0.08347544033294775, + "grad_norm": 0.16912159323692322, + "learning_rate": 0.001, + "loss": 2.7164, + "num_input_tokens_seen": 4587520000, + "step": 17500 + }, + { + "epoch": 0.08347544033294775, + "eval_loss": 2.6085972785949707, + "eval_runtime": 50.619, + "eval_samples_per_second": 98.777, + "eval_steps_per_second": 24.694, + "num_input_tokens_seen": 4587520000, + "step": 17500 + }, + { + "epoch": 0.0837139415910419, + "grad_norm": 0.17684704065322876, + "learning_rate": 0.001, + "loss": 2.7293, + "num_input_tokens_seen": 4600627200, + "step": 17550 + }, + { + "epoch": 0.08395244284913603, + "grad_norm": 0.18020550906658173, + "learning_rate": 0.001, + "loss": 2.7124, + "num_input_tokens_seen": 4613734400, + "step": 17600 + }, + { + "epoch": 0.08419094410723016, + "grad_norm": 0.17311082780361176, + "learning_rate": 0.001, + "loss": 2.7047, + "num_input_tokens_seen": 4626841600, + "step": 17650 + }, + { + "epoch": 0.0844294453653243, + "grad_norm": 0.17366532981395721, + "learning_rate": 0.001, + "loss": 2.7316, + "num_input_tokens_seen": 4639948800, + "step": 17700 + }, + { + "epoch": 0.08466794662341844, + "grad_norm": 0.16526220738887787, + "learning_rate": 0.001, + "loss": 2.7212, + "num_input_tokens_seen": 4653056000, + "step": 17750 + }, + { + "epoch": 0.08490644788151258, + "grad_norm": 0.1746092140674591, + "learning_rate": 0.001, + "loss": 2.7288, + "num_input_tokens_seen": 4666163200, + "step": 17800 + }, + { + "epoch": 0.08514494913960671, + "grad_norm": 0.19404995441436768, + "learning_rate": 0.001, + "loss": 2.7129, + "num_input_tokens_seen": 4679270400, + "step": 17850 + }, + { + "epoch": 0.08538345039770084, + "grad_norm": 0.18850015103816986, + "learning_rate": 0.001, + "loss": 2.7161, + "num_input_tokens_seen": 4692377600, + "step": 17900 + }, + { + "epoch": 0.08562195165579499, + "grad_norm": 0.19126516580581665, + "learning_rate": 0.001, + "loss": 2.7206, + "num_input_tokens_seen": 4705484800, + "step": 17950 + }, + { + "epoch": 0.08586045291388912, + "grad_norm": 0.1802307665348053, + "learning_rate": 0.001, + "loss": 2.7163, + "num_input_tokens_seen": 4718592000, + "step": 18000 + }, + { + "epoch": 0.08586045291388912, + "eval_loss": 2.603686809539795, + "eval_runtime": 50.6488, + "eval_samples_per_second": 98.719, + "eval_steps_per_second": 24.68, + "num_input_tokens_seen": 4718592000, + "step": 18000 + }, + { + "epoch": 0.08609895417198325, + "grad_norm": 0.18276441097259521, + "learning_rate": 0.001, + "loss": 2.7299, + "num_input_tokens_seen": 4731699200, + "step": 18050 + }, + { + "epoch": 0.0863374554300774, + "grad_norm": 0.17280028760433197, + "learning_rate": 0.001, + "loss": 2.7175, + "num_input_tokens_seen": 4744806400, + "step": 18100 + }, + { + "epoch": 0.08657595668817153, + "grad_norm": 0.17224080860614777, + "learning_rate": 0.001, + "loss": 2.7089, + "num_input_tokens_seen": 4757913600, + "step": 18150 + }, + { + "epoch": 0.08681445794626566, + "grad_norm": 0.17205391824245453, + "learning_rate": 0.001, + "loss": 2.7072, + "num_input_tokens_seen": 4771020800, + "step": 18200 + }, + { + "epoch": 0.0870529592043598, + "grad_norm": 0.1829432249069214, + "learning_rate": 0.001, + "loss": 2.6959, + "num_input_tokens_seen": 4784128000, + "step": 18250 + }, + { + "epoch": 0.08729146046245394, + "grad_norm": 0.1669514924287796, + "learning_rate": 0.001, + "loss": 2.7209, + "num_input_tokens_seen": 4797235200, + "step": 18300 + }, + { + "epoch": 0.08752996172054807, + "grad_norm": 0.18273359537124634, + "learning_rate": 0.001, + "loss": 2.6935, + "num_input_tokens_seen": 4810342400, + "step": 18350 + }, + { + "epoch": 0.08776846297864221, + "grad_norm": 0.21061965823173523, + "learning_rate": 0.001, + "loss": 2.7204, + "num_input_tokens_seen": 4823449600, + "step": 18400 + }, + { + "epoch": 0.08800696423673635, + "grad_norm": 0.17710614204406738, + "learning_rate": 0.001, + "loss": 2.7231, + "num_input_tokens_seen": 4836556800, + "step": 18450 + }, + { + "epoch": 0.08824546549483049, + "grad_norm": 0.17370648682117462, + "learning_rate": 0.001, + "loss": 2.7064, + "num_input_tokens_seen": 4849664000, + "step": 18500 + }, + { + "epoch": 0.08824546549483049, + "eval_loss": 2.600076913833618, + "eval_runtime": 50.3596, + "eval_samples_per_second": 99.286, + "eval_steps_per_second": 24.822, + "num_input_tokens_seen": 4849664000, + "step": 18500 + }, + { + "epoch": 0.08848396675292462, + "grad_norm": 0.19398675858974457, + "learning_rate": 0.001, + "loss": 2.7118, + "num_input_tokens_seen": 4862771200, + "step": 18550 + }, + { + "epoch": 0.08872246801101875, + "grad_norm": 0.18522043526172638, + "learning_rate": 0.001, + "loss": 2.6998, + "num_input_tokens_seen": 4875878400, + "step": 18600 + }, + { + "epoch": 0.0889609692691129, + "grad_norm": 0.2682952880859375, + "learning_rate": 0.001, + "loss": 2.7057, + "num_input_tokens_seen": 4888985600, + "step": 18650 + }, + { + "epoch": 0.08919947052720703, + "grad_norm": 0.18555712699890137, + "learning_rate": 0.001, + "loss": 2.7127, + "num_input_tokens_seen": 4902092800, + "step": 18700 + }, + { + "epoch": 0.08943797178530116, + "grad_norm": 0.1940859854221344, + "learning_rate": 0.001, + "loss": 2.7054, + "num_input_tokens_seen": 4915200000, + "step": 18750 + }, + { + "epoch": 0.08967647304339531, + "grad_norm": 0.1800539344549179, + "learning_rate": 0.001, + "loss": 2.702, + "num_input_tokens_seen": 4928307200, + "step": 18800 + }, + { + "epoch": 0.08991497430148944, + "grad_norm": 0.19734695553779602, + "learning_rate": 0.001, + "loss": 2.7157, + "num_input_tokens_seen": 4941414400, + "step": 18850 + }, + { + "epoch": 0.09015347555958357, + "grad_norm": 0.16387026011943817, + "learning_rate": 0.001, + "loss": 2.7183, + "num_input_tokens_seen": 4954521600, + "step": 18900 + }, + { + "epoch": 0.09039197681767772, + "grad_norm": 0.19447770714759827, + "learning_rate": 0.001, + "loss": 2.7154, + "num_input_tokens_seen": 4967628800, + "step": 18950 + }, + { + "epoch": 0.09063047807577185, + "grad_norm": 0.17366571724414825, + "learning_rate": 0.001, + "loss": 2.6996, + "num_input_tokens_seen": 4980736000, + "step": 19000 + }, + { + "epoch": 0.09063047807577185, + "eval_loss": 2.5983569622039795, + "eval_runtime": 50.8055, + "eval_samples_per_second": 98.415, + "eval_steps_per_second": 24.604, + "num_input_tokens_seen": 4980736000, + "step": 19000 + }, + { + "epoch": 0.09086897933386599, + "grad_norm": 0.1770928055047989, + "learning_rate": 0.001, + "loss": 2.7171, + "num_input_tokens_seen": 4993843200, + "step": 19050 + }, + { + "epoch": 0.09110748059196012, + "grad_norm": 0.18122689425945282, + "learning_rate": 0.001, + "loss": 2.7064, + "num_input_tokens_seen": 5006950400, + "step": 19100 + }, + { + "epoch": 0.09134598185005426, + "grad_norm": 0.19320747256278992, + "learning_rate": 0.001, + "loss": 2.7105, + "num_input_tokens_seen": 5020057600, + "step": 19150 + }, + { + "epoch": 0.0915844831081484, + "grad_norm": 0.19556616246700287, + "learning_rate": 0.001, + "loss": 2.7018, + "num_input_tokens_seen": 5033164800, + "step": 19200 + }, + { + "epoch": 0.09182298436624253, + "grad_norm": 0.18251653015613556, + "learning_rate": 0.001, + "loss": 2.7067, + "num_input_tokens_seen": 5046272000, + "step": 19250 + }, + { + "epoch": 0.09206148562433666, + "grad_norm": 0.17226757109165192, + "learning_rate": 0.001, + "loss": 2.6803, + "num_input_tokens_seen": 5059379200, + "step": 19300 + }, + { + "epoch": 0.09229998688243081, + "grad_norm": 0.18007858097553253, + "learning_rate": 0.001, + "loss": 2.6998, + "num_input_tokens_seen": 5072486400, + "step": 19350 + }, + { + "epoch": 0.09253848814052494, + "grad_norm": 0.1664605736732483, + "learning_rate": 0.001, + "loss": 2.6985, + "num_input_tokens_seen": 5085593600, + "step": 19400 + }, + { + "epoch": 0.09277698939861907, + "grad_norm": 0.17898677289485931, + "learning_rate": 0.001, + "loss": 2.7034, + "num_input_tokens_seen": 5098700800, + "step": 19450 + }, + { + "epoch": 0.09301549065671322, + "grad_norm": 0.16403160989284515, + "learning_rate": 0.001, + "loss": 2.7012, + "num_input_tokens_seen": 5111808000, + "step": 19500 + }, + { + "epoch": 0.09301549065671322, + "eval_loss": 2.594251871109009, + "eval_runtime": 50.1924, + "eval_samples_per_second": 99.617, + "eval_steps_per_second": 24.904, + "num_input_tokens_seen": 5111808000, + "step": 19500 + }, + { + "epoch": 0.09325399191480735, + "grad_norm": 0.17973001301288605, + "learning_rate": 0.001, + "loss": 2.7039, + "num_input_tokens_seen": 5124915200, + "step": 19550 + }, + { + "epoch": 0.09349249317290148, + "grad_norm": 0.1667868047952652, + "learning_rate": 0.001, + "loss": 2.7038, + "num_input_tokens_seen": 5138022400, + "step": 19600 + }, + { + "epoch": 0.09373099443099563, + "grad_norm": 0.18338319659233093, + "learning_rate": 0.001, + "loss": 2.7138, + "num_input_tokens_seen": 5151129600, + "step": 19650 + }, + { + "epoch": 0.09396949568908976, + "grad_norm": 0.17962965369224548, + "learning_rate": 0.001, + "loss": 2.6994, + "num_input_tokens_seen": 5164236800, + "step": 19700 + }, + { + "epoch": 0.0942079969471839, + "grad_norm": 0.17233812808990479, + "learning_rate": 0.001, + "loss": 2.7073, + "num_input_tokens_seen": 5177344000, + "step": 19750 + }, + { + "epoch": 0.09444649820527803, + "grad_norm": 0.16720129549503326, + "learning_rate": 0.001, + "loss": 2.7146, + "num_input_tokens_seen": 5190451200, + "step": 19800 + }, + { + "epoch": 0.09468499946337217, + "grad_norm": 0.1732376664876938, + "learning_rate": 0.001, + "loss": 2.7126, + "num_input_tokens_seen": 5203558400, + "step": 19850 + }, + { + "epoch": 0.09492350072146631, + "grad_norm": 0.17245380580425262, + "learning_rate": 0.001, + "loss": 2.7054, + "num_input_tokens_seen": 5216665600, + "step": 19900 + }, + { + "epoch": 0.09516200197956044, + "grad_norm": 0.17415107786655426, + "learning_rate": 0.001, + "loss": 2.7027, + "num_input_tokens_seen": 5229772800, + "step": 19950 + }, + { + "epoch": 0.09540050323765457, + "grad_norm": 0.1747124344110489, + "learning_rate": 0.001, + "loss": 2.6975, + "num_input_tokens_seen": 5242880000, + "step": 20000 + }, + { + "epoch": 0.09540050323765457, + "eval_loss": 2.5900332927703857, + "eval_runtime": 50.5898, + "eval_samples_per_second": 98.834, + "eval_steps_per_second": 24.709, + "num_input_tokens_seen": 5242880000, + "step": 20000 + }, + { + "epoch": 0.09563900449574872, + "grad_norm": 0.17750214040279388, + "learning_rate": 0.001, + "loss": 2.7, + "num_input_tokens_seen": 5255987200, + "step": 20050 + }, + { + "epoch": 0.09587750575384285, + "grad_norm": 0.16490615904331207, + "learning_rate": 0.001, + "loss": 2.7188, + "num_input_tokens_seen": 5269094400, + "step": 20100 + }, + { + "epoch": 0.09611600701193698, + "grad_norm": 0.20347309112548828, + "learning_rate": 0.001, + "loss": 2.7034, + "num_input_tokens_seen": 5282201600, + "step": 20150 + }, + { + "epoch": 0.09635450827003113, + "grad_norm": 0.19717667996883392, + "learning_rate": 0.001, + "loss": 2.6864, + "num_input_tokens_seen": 5295308800, + "step": 20200 + }, + { + "epoch": 0.09659300952812526, + "grad_norm": 0.17054997384548187, + "learning_rate": 0.001, + "loss": 2.7068, + "num_input_tokens_seen": 5308416000, + "step": 20250 + }, + { + "epoch": 0.09683151078621939, + "grad_norm": 0.1771887093782425, + "learning_rate": 0.001, + "loss": 2.7037, + "num_input_tokens_seen": 5321523200, + "step": 20300 + }, + { + "epoch": 0.09707001204431354, + "grad_norm": 0.17556501924991608, + "learning_rate": 0.001, + "loss": 2.705, + "num_input_tokens_seen": 5334630400, + "step": 20350 + }, + { + "epoch": 0.09730851330240767, + "grad_norm": 0.1696256399154663, + "learning_rate": 0.001, + "loss": 2.7109, + "num_input_tokens_seen": 5347737600, + "step": 20400 + }, + { + "epoch": 0.09754701456050181, + "grad_norm": 0.18629619479179382, + "learning_rate": 0.001, + "loss": 2.7043, + "num_input_tokens_seen": 5360844800, + "step": 20450 + }, + { + "epoch": 0.09778551581859594, + "grad_norm": 0.18701018393039703, + "learning_rate": 0.001, + "loss": 2.7002, + "num_input_tokens_seen": 5373952000, + "step": 20500 + }, + { + "epoch": 0.09778551581859594, + "eval_loss": 2.587498903274536, + "eval_runtime": 51.1986, + "eval_samples_per_second": 97.659, + "eval_steps_per_second": 24.415, + "num_input_tokens_seen": 5373952000, + "step": 20500 + }, + { + "epoch": 0.09802401707669008, + "grad_norm": 0.1792842447757721, + "learning_rate": 0.001, + "loss": 2.7083, + "num_input_tokens_seen": 5387059200, + "step": 20550 + }, + { + "epoch": 0.09826251833478422, + "grad_norm": 0.18761058151721954, + "learning_rate": 0.001, + "loss": 2.6898, + "num_input_tokens_seen": 5400166400, + "step": 20600 + }, + { + "epoch": 0.09850101959287835, + "grad_norm": 0.1827591508626938, + "learning_rate": 0.001, + "loss": 2.6976, + "num_input_tokens_seen": 5413273600, + "step": 20650 + }, + { + "epoch": 0.09873952085097248, + "grad_norm": 0.16178373992443085, + "learning_rate": 0.001, + "loss": 2.7029, + "num_input_tokens_seen": 5426380800, + "step": 20700 + }, + { + "epoch": 0.09897802210906663, + "grad_norm": 0.1880313903093338, + "learning_rate": 0.001, + "loss": 2.6867, + "num_input_tokens_seen": 5439488000, + "step": 20750 + }, + { + "epoch": 0.09921652336716076, + "grad_norm": 0.17611584067344666, + "learning_rate": 0.001, + "loss": 2.6741, + "num_input_tokens_seen": 5452595200, + "step": 20800 + }, + { + "epoch": 0.09945502462525489, + "grad_norm": 0.17712561786174774, + "learning_rate": 0.001, + "loss": 2.7099, + "num_input_tokens_seen": 5465702400, + "step": 20850 + }, + { + "epoch": 0.09969352588334904, + "grad_norm": 0.18022434413433075, + "learning_rate": 0.001, + "loss": 2.6988, + "num_input_tokens_seen": 5478809600, + "step": 20900 + }, + { + "epoch": 0.09993202714144317, + "grad_norm": 0.17434161901474, + "learning_rate": 0.001, + "loss": 2.6869, + "num_input_tokens_seen": 5491916800, + "step": 20950 + }, + { + "epoch": 0.1001705283995373, + "grad_norm": 0.17802472412586212, + "learning_rate": 0.001, + "loss": 2.6935, + "num_input_tokens_seen": 5505024000, + "step": 21000 + }, + { + "epoch": 0.1001705283995373, + "eval_loss": 2.5838851928710938, + "eval_runtime": 50.1977, + "eval_samples_per_second": 99.606, + "eval_steps_per_second": 24.902, + "num_input_tokens_seen": 5505024000, + "step": 21000 + }, + { + "epoch": 0.10040902965763145, + "grad_norm": 0.1723284274339676, + "learning_rate": 0.001, + "loss": 2.694, + "num_input_tokens_seen": 5518131200, + "step": 21050 + }, + { + "epoch": 0.10064753091572558, + "grad_norm": 0.1627894937992096, + "learning_rate": 0.001, + "loss": 2.6866, + "num_input_tokens_seen": 5531238400, + "step": 21100 + }, + { + "epoch": 0.10088603217381972, + "grad_norm": 0.20949719846248627, + "learning_rate": 0.001, + "loss": 2.6915, + "num_input_tokens_seen": 5544345600, + "step": 21150 + }, + { + "epoch": 0.10112453343191385, + "grad_norm": 0.1980736404657364, + "learning_rate": 0.001, + "loss": 2.7076, + "num_input_tokens_seen": 5557452800, + "step": 21200 + }, + { + "epoch": 0.10136303469000799, + "grad_norm": 0.20961201190948486, + "learning_rate": 0.001, + "loss": 2.6978, + "num_input_tokens_seen": 5570560000, + "step": 21250 + }, + { + "epoch": 0.10160153594810213, + "grad_norm": 0.18137700855731964, + "learning_rate": 0.001, + "loss": 2.7029, + "num_input_tokens_seen": 5583667200, + "step": 21300 + }, + { + "epoch": 0.10184003720619626, + "grad_norm": 0.17235560715198517, + "learning_rate": 0.001, + "loss": 2.6979, + "num_input_tokens_seen": 5596774400, + "step": 21350 + }, + { + "epoch": 0.1020785384642904, + "grad_norm": 0.17818449437618256, + "learning_rate": 0.001, + "loss": 2.6987, + "num_input_tokens_seen": 5609881600, + "step": 21400 + }, + { + "epoch": 0.10231703972238454, + "grad_norm": 0.1798463761806488, + "learning_rate": 0.001, + "loss": 2.693, + "num_input_tokens_seen": 5622988800, + "step": 21450 + }, + { + "epoch": 0.10255554098047867, + "grad_norm": 0.19028444588184357, + "learning_rate": 0.001, + "loss": 2.7079, + "num_input_tokens_seen": 5636096000, + "step": 21500 + }, + { + "epoch": 0.10255554098047867, + "eval_loss": 2.5816380977630615, + "eval_runtime": 50.7808, + "eval_samples_per_second": 98.462, + "eval_steps_per_second": 24.616, + "num_input_tokens_seen": 5636096000, + "step": 21500 + }, + { + "epoch": 0.1027940422385728, + "grad_norm": 0.1831275075674057, + "learning_rate": 0.001, + "loss": 2.7038, + "num_input_tokens_seen": 5649203200, + "step": 21550 + }, + { + "epoch": 0.10303254349666695, + "grad_norm": 0.17404012382030487, + "learning_rate": 0.001, + "loss": 2.7071, + "num_input_tokens_seen": 5662310400, + "step": 21600 + }, + { + "epoch": 0.10327104475476108, + "grad_norm": 0.1652098149061203, + "learning_rate": 0.001, + "loss": 2.7033, + "num_input_tokens_seen": 5675417600, + "step": 21650 + }, + { + "epoch": 0.10350954601285522, + "grad_norm": 0.1914501190185547, + "learning_rate": 0.001, + "loss": 2.6844, + "num_input_tokens_seen": 5688524800, + "step": 21700 + }, + { + "epoch": 0.10374804727094936, + "grad_norm": 0.19169588387012482, + "learning_rate": 0.001, + "loss": 2.6793, + "num_input_tokens_seen": 5701632000, + "step": 21750 + }, + { + "epoch": 0.10398654852904349, + "grad_norm": 0.17937491834163666, + "learning_rate": 0.001, + "loss": 2.6972, + "num_input_tokens_seen": 5714739200, + "step": 21800 + }, + { + "epoch": 0.10422504978713763, + "grad_norm": 0.17515376210212708, + "learning_rate": 0.001, + "loss": 2.6995, + "num_input_tokens_seen": 5727846400, + "step": 21850 + }, + { + "epoch": 0.10446355104523176, + "grad_norm": 0.18881027400493622, + "learning_rate": 0.001, + "loss": 2.7097, + "num_input_tokens_seen": 5740953600, + "step": 21900 + }, + { + "epoch": 0.1047020523033259, + "grad_norm": 0.19030135869979858, + "learning_rate": 0.001, + "loss": 2.6801, + "num_input_tokens_seen": 5754060800, + "step": 21950 + }, + { + "epoch": 0.10494055356142004, + "grad_norm": 0.17325563728809357, + "learning_rate": 0.001, + "loss": 2.6803, + "num_input_tokens_seen": 5767168000, + "step": 22000 + }, + { + "epoch": 0.10494055356142004, + "eval_loss": 2.577341318130493, + "eval_runtime": 50.8482, + "eval_samples_per_second": 98.332, + "eval_steps_per_second": 24.583, + "num_input_tokens_seen": 5767168000, + "step": 22000 + }, + { + "epoch": 0.10517905481951417, + "grad_norm": 0.19298380613327026, + "learning_rate": 0.001, + "loss": 2.6966, + "num_input_tokens_seen": 5780275200, + "step": 22050 + }, + { + "epoch": 0.1054175560776083, + "grad_norm": 0.1772100180387497, + "learning_rate": 0.001, + "loss": 2.6851, + "num_input_tokens_seen": 5793382400, + "step": 22100 + }, + { + "epoch": 0.10565605733570245, + "grad_norm": 0.18548481166362762, + "learning_rate": 0.001, + "loss": 2.7028, + "num_input_tokens_seen": 5806489600, + "step": 22150 + }, + { + "epoch": 0.10589455859379658, + "grad_norm": 0.20102089643478394, + "learning_rate": 0.001, + "loss": 2.6915, + "num_input_tokens_seen": 5819596800, + "step": 22200 + }, + { + "epoch": 0.10613305985189071, + "grad_norm": 0.1833849996328354, + "learning_rate": 0.001, + "loss": 2.6872, + "num_input_tokens_seen": 5832704000, + "step": 22250 + }, + { + "epoch": 0.10637156110998486, + "grad_norm": 0.17730027437210083, + "learning_rate": 0.001, + "loss": 2.6811, + "num_input_tokens_seen": 5845811200, + "step": 22300 + }, + { + "epoch": 0.10661006236807899, + "grad_norm": 0.1818256825208664, + "learning_rate": 0.001, + "loss": 2.7, + "num_input_tokens_seen": 5858918400, + "step": 22350 + }, + { + "epoch": 0.10684856362617313, + "grad_norm": 0.16850312054157257, + "learning_rate": 0.001, + "loss": 2.6927, + "num_input_tokens_seen": 5872025600, + "step": 22400 + }, + { + "epoch": 0.10708706488426727, + "grad_norm": 0.209822878241539, + "learning_rate": 0.001, + "loss": 2.6881, + "num_input_tokens_seen": 5885132800, + "step": 22450 + }, + { + "epoch": 0.1073255661423614, + "grad_norm": 0.2131560891866684, + "learning_rate": 0.001, + "loss": 2.6797, + "num_input_tokens_seen": 5898240000, + "step": 22500 + }, + { + "epoch": 0.1073255661423614, + "eval_loss": 2.575388193130493, + "eval_runtime": 50.9696, + "eval_samples_per_second": 98.098, + "eval_steps_per_second": 24.524, + "num_input_tokens_seen": 5898240000, + "step": 22500 + }, + { + "epoch": 0.10756406740045554, + "grad_norm": 0.18200135231018066, + "learning_rate": 0.001, + "loss": 2.6837, + "num_input_tokens_seen": 5911347200, + "step": 22550 + }, + { + "epoch": 0.10780256865854967, + "grad_norm": 0.1830984354019165, + "learning_rate": 0.001, + "loss": 2.7159, + "num_input_tokens_seen": 5924454400, + "step": 22600 + }, + { + "epoch": 0.1080410699166438, + "grad_norm": 0.1700614094734192, + "learning_rate": 0.001, + "loss": 2.6852, + "num_input_tokens_seen": 5937561600, + "step": 22650 + }, + { + "epoch": 0.10827957117473795, + "grad_norm": 0.18473868072032928, + "learning_rate": 0.001, + "loss": 2.6857, + "num_input_tokens_seen": 5950668800, + "step": 22700 + }, + { + "epoch": 0.10851807243283208, + "grad_norm": 0.19345365464687347, + "learning_rate": 0.001, + "loss": 2.69, + "num_input_tokens_seen": 5963776000, + "step": 22750 + }, + { + "epoch": 0.10875657369092621, + "grad_norm": 0.18807141482830048, + "learning_rate": 0.001, + "loss": 2.6897, + "num_input_tokens_seen": 5976883200, + "step": 22800 + }, + { + "epoch": 0.10899507494902036, + "grad_norm": 0.18426446616649628, + "learning_rate": 0.001, + "loss": 2.6855, + "num_input_tokens_seen": 5989990400, + "step": 22850 + }, + { + "epoch": 0.10923357620711449, + "grad_norm": 0.19184571504592896, + "learning_rate": 0.001, + "loss": 2.6914, + "num_input_tokens_seen": 6003097600, + "step": 22900 + }, + { + "epoch": 0.10947207746520862, + "grad_norm": 0.22897471487522125, + "learning_rate": 0.001, + "loss": 2.6812, + "num_input_tokens_seen": 6016204800, + "step": 22950 + }, + { + "epoch": 0.10971057872330277, + "grad_norm": 0.1939724087715149, + "learning_rate": 0.001, + "loss": 2.6836, + "num_input_tokens_seen": 6029312000, + "step": 23000 + }, + { + "epoch": 0.10971057872330277, + "eval_loss": 2.570572853088379, + "eval_runtime": 50.0606, + "eval_samples_per_second": 99.879, + "eval_steps_per_second": 24.97, + "num_input_tokens_seen": 6029312000, + "step": 23000 + }, + { + "epoch": 0.1099490799813969, + "grad_norm": 0.17564797401428223, + "learning_rate": 0.001, + "loss": 2.6912, + "num_input_tokens_seen": 6042419200, + "step": 23050 + }, + { + "epoch": 0.11018758123949104, + "grad_norm": 0.17937473952770233, + "learning_rate": 0.001, + "loss": 2.6708, + "num_input_tokens_seen": 6055526400, + "step": 23100 + }, + { + "epoch": 0.11042608249758518, + "grad_norm": 0.18281136453151703, + "learning_rate": 0.001, + "loss": 2.6855, + "num_input_tokens_seen": 6068633600, + "step": 23150 + }, + { + "epoch": 0.11066458375567931, + "grad_norm": 0.18834726512432098, + "learning_rate": 0.001, + "loss": 2.6887, + "num_input_tokens_seen": 6081740800, + "step": 23200 + }, + { + "epoch": 0.11090308501377345, + "grad_norm": 0.2104720175266266, + "learning_rate": 0.001, + "loss": 2.6914, + "num_input_tokens_seen": 6094848000, + "step": 23250 + }, + { + "epoch": 0.11114158627186758, + "grad_norm": 0.18674172461032867, + "learning_rate": 0.001, + "loss": 2.6855, + "num_input_tokens_seen": 6107955200, + "step": 23300 + }, + { + "epoch": 0.11138008752996172, + "grad_norm": 0.19519701600074768, + "learning_rate": 0.001, + "loss": 2.6851, + "num_input_tokens_seen": 6121062400, + "step": 23350 + }, + { + "epoch": 0.11161858878805586, + "grad_norm": 0.1752537339925766, + "learning_rate": 0.001, + "loss": 2.692, + "num_input_tokens_seen": 6134169600, + "step": 23400 + }, + { + "epoch": 0.11185709004614999, + "grad_norm": 0.1786031723022461, + "learning_rate": 0.001, + "loss": 2.6785, + "num_input_tokens_seen": 6147276800, + "step": 23450 + }, + { + "epoch": 0.11209559130424412, + "grad_norm": 0.19057604670524597, + "learning_rate": 0.001, + "loss": 2.6798, + "num_input_tokens_seen": 6160384000, + "step": 23500 + }, + { + "epoch": 0.11209559130424412, + "eval_loss": 2.5710463523864746, + "eval_runtime": 50.3332, + "eval_samples_per_second": 99.338, + "eval_steps_per_second": 24.835, + "num_input_tokens_seen": 6160384000, + "step": 23500 + }, + { + "epoch": 0.11233409256233827, + "grad_norm": 0.18272963166236877, + "learning_rate": 0.001, + "loss": 2.6847, + "num_input_tokens_seen": 6173491200, + "step": 23550 + }, + { + "epoch": 0.1125725938204324, + "grad_norm": 0.1666375696659088, + "learning_rate": 0.001, + "loss": 2.6747, + "num_input_tokens_seen": 6186598400, + "step": 23600 + }, + { + "epoch": 0.11281109507852653, + "grad_norm": 0.1688246876001358, + "learning_rate": 0.001, + "loss": 2.6963, + "num_input_tokens_seen": 6199705600, + "step": 23650 + }, + { + "epoch": 0.11304959633662068, + "grad_norm": 0.1970459669828415, + "learning_rate": 0.001, + "loss": 2.6904, + "num_input_tokens_seen": 6212812800, + "step": 23700 + }, + { + "epoch": 0.11328809759471481, + "grad_norm": 0.19660720229148865, + "learning_rate": 0.001, + "loss": 2.6833, + "num_input_tokens_seen": 6225920000, + "step": 23750 + }, + { + "epoch": 0.11352659885280895, + "grad_norm": 0.18711698055267334, + "learning_rate": 0.001, + "loss": 2.6872, + "num_input_tokens_seen": 6239027200, + "step": 23800 + }, + { + "epoch": 0.11376510011090309, + "grad_norm": 0.1878872513771057, + "learning_rate": 0.001, + "loss": 2.6884, + "num_input_tokens_seen": 6252134400, + "step": 23850 + }, + { + "epoch": 0.11400360136899722, + "grad_norm": 0.1969616860151291, + "learning_rate": 0.001, + "loss": 2.6982, + "num_input_tokens_seen": 6265241600, + "step": 23900 + }, + { + "epoch": 0.11424210262709136, + "grad_norm": 0.19693812727928162, + "learning_rate": 0.001, + "loss": 2.6782, + "num_input_tokens_seen": 6278348800, + "step": 23950 + }, + { + "epoch": 0.1144806038851855, + "grad_norm": 0.1731441468000412, + "learning_rate": 0.001, + "loss": 2.6917, + "num_input_tokens_seen": 6291456000, + "step": 24000 + }, + { + "epoch": 0.1144806038851855, + "eval_loss": 2.5664987564086914, + "eval_runtime": 50.3452, + "eval_samples_per_second": 99.314, + "eval_steps_per_second": 24.829, + "num_input_tokens_seen": 6291456000, + "step": 24000 + }, + { + "epoch": 0.11471910514327963, + "grad_norm": 0.1724429428577423, + "learning_rate": 0.001, + "loss": 2.6806, + "num_input_tokens_seen": 6304563200, + "step": 24050 + }, + { + "epoch": 0.11495760640137377, + "grad_norm": 0.20449388027191162, + "learning_rate": 0.001, + "loss": 2.6873, + "num_input_tokens_seen": 6317670400, + "step": 24100 + }, + { + "epoch": 0.1151961076594679, + "grad_norm": 0.19024738669395447, + "learning_rate": 0.001, + "loss": 2.6811, + "num_input_tokens_seen": 6330777600, + "step": 24150 + }, + { + "epoch": 0.11543460891756203, + "grad_norm": 0.20510025322437286, + "learning_rate": 0.001, + "loss": 2.6643, + "num_input_tokens_seen": 6343884800, + "step": 24200 + }, + { + "epoch": 0.11567311017565618, + "grad_norm": 0.1783556044101715, + "learning_rate": 0.001, + "loss": 2.6709, + "num_input_tokens_seen": 6356992000, + "step": 24250 + }, + { + "epoch": 0.11591161143375031, + "grad_norm": 0.1771089732646942, + "learning_rate": 0.001, + "loss": 2.6677, + "num_input_tokens_seen": 6370099200, + "step": 24300 + }, + { + "epoch": 0.11615011269184444, + "grad_norm": 0.17016734182834625, + "learning_rate": 0.001, + "loss": 2.6681, + "num_input_tokens_seen": 6383206400, + "step": 24350 + }, + { + "epoch": 0.11638861394993859, + "grad_norm": 0.1901489496231079, + "learning_rate": 0.001, + "loss": 2.6811, + "num_input_tokens_seen": 6396313600, + "step": 24400 + }, + { + "epoch": 0.11662711520803272, + "grad_norm": 0.18185457587242126, + "learning_rate": 0.001, + "loss": 2.6787, + "num_input_tokens_seen": 6409420800, + "step": 24450 + }, + { + "epoch": 0.11686561646612686, + "grad_norm": 0.1789853274822235, + "learning_rate": 0.001, + "loss": 2.6657, + "num_input_tokens_seen": 6422528000, + "step": 24500 + }, + { + "epoch": 0.11686561646612686, + "eval_loss": 2.564084768295288, + "eval_runtime": 50.4559, + "eval_samples_per_second": 99.096, + "eval_steps_per_second": 24.774, + "num_input_tokens_seen": 6422528000, + "step": 24500 + }, + { + "epoch": 0.117104117724221, + "grad_norm": 0.17294436693191528, + "learning_rate": 0.001, + "loss": 2.6812, + "num_input_tokens_seen": 6435635200, + "step": 24550 + }, + { + "epoch": 0.11734261898231513, + "grad_norm": 0.1840251386165619, + "learning_rate": 0.001, + "loss": 2.6599, + "num_input_tokens_seen": 6448742400, + "step": 24600 + }, + { + "epoch": 0.11758112024040927, + "grad_norm": 0.17588932812213898, + "learning_rate": 0.001, + "loss": 2.6742, + "num_input_tokens_seen": 6461849600, + "step": 24650 + }, + { + "epoch": 0.1178196214985034, + "grad_norm": 0.1805667132139206, + "learning_rate": 0.001, + "loss": 2.6647, + "num_input_tokens_seen": 6474956800, + "step": 24700 + }, + { + "epoch": 0.11805812275659754, + "grad_norm": 0.17930665612220764, + "learning_rate": 0.001, + "loss": 2.6763, + "num_input_tokens_seen": 6488064000, + "step": 24750 + }, + { + "epoch": 0.11829662401469168, + "grad_norm": 0.19195732474327087, + "learning_rate": 0.001, + "loss": 2.6716, + "num_input_tokens_seen": 6501171200, + "step": 24800 + }, + { + "epoch": 0.11853512527278581, + "grad_norm": 0.19274356961250305, + "learning_rate": 0.001, + "loss": 2.6702, + "num_input_tokens_seen": 6514278400, + "step": 24850 + }, + { + "epoch": 0.11877362653087994, + "grad_norm": 0.17423510551452637, + "learning_rate": 0.001, + "loss": 2.6733, + "num_input_tokens_seen": 6527385600, + "step": 24900 + }, + { + "epoch": 0.11901212778897409, + "grad_norm": 0.20267954468727112, + "learning_rate": 0.001, + "loss": 2.6649, + "num_input_tokens_seen": 6540492800, + "step": 24950 + }, + { + "epoch": 0.11925062904706822, + "grad_norm": 0.1756502240896225, + "learning_rate": 0.001, + "loss": 2.6582, + "num_input_tokens_seen": 6553600000, + "step": 25000 + }, + { + "epoch": 0.11925062904706822, + "eval_loss": 2.562955379486084, + "eval_runtime": 50.0071, + "eval_samples_per_second": 99.986, + "eval_steps_per_second": 24.996, + "num_input_tokens_seen": 6553600000, + "step": 25000 + }, + { + "epoch": 0.11948913030516237, + "grad_norm": 0.19173742830753326, + "learning_rate": 0.001, + "loss": 2.6871, + "num_input_tokens_seen": 6566707200, + "step": 25050 + }, + { + "epoch": 0.1197276315632565, + "grad_norm": 0.1746075600385666, + "learning_rate": 0.001, + "loss": 2.7003, + "num_input_tokens_seen": 6579814400, + "step": 25100 + }, + { + "epoch": 0.11996613282135063, + "grad_norm": 0.17817530035972595, + "learning_rate": 0.001, + "loss": 2.6944, + "num_input_tokens_seen": 6592921600, + "step": 25150 + }, + { + "epoch": 0.12020463407944477, + "grad_norm": 0.201807901263237, + "learning_rate": 0.001, + "loss": 2.6766, + "num_input_tokens_seen": 6606028800, + "step": 25200 + }, + { + "epoch": 0.1204431353375389, + "grad_norm": 0.18620917201042175, + "learning_rate": 0.001, + "loss": 2.6802, + "num_input_tokens_seen": 6619136000, + "step": 25250 + }, + { + "epoch": 0.12068163659563304, + "grad_norm": 0.17383818328380585, + "learning_rate": 0.001, + "loss": 2.6698, + "num_input_tokens_seen": 6632243200, + "step": 25300 + }, + { + "epoch": 0.12092013785372718, + "grad_norm": 0.1766287237405777, + "learning_rate": 0.001, + "loss": 2.6705, + "num_input_tokens_seen": 6645350400, + "step": 25350 + }, + { + "epoch": 0.12115863911182131, + "grad_norm": 0.19551052153110504, + "learning_rate": 0.001, + "loss": 2.678, + "num_input_tokens_seen": 6658457600, + "step": 25400 + }, + { + "epoch": 0.12139714036991545, + "grad_norm": 0.18625982105731964, + "learning_rate": 0.001, + "loss": 2.6688, + "num_input_tokens_seen": 6671564800, + "step": 25450 + }, + { + "epoch": 0.12163564162800959, + "grad_norm": 0.18274050951004028, + "learning_rate": 0.001, + "loss": 2.6818, + "num_input_tokens_seen": 6684672000, + "step": 25500 + }, + { + "epoch": 0.12163564162800959, + "eval_loss": 2.5602569580078125, + "eval_runtime": 50.4187, + "eval_samples_per_second": 99.17, + "eval_steps_per_second": 24.792, + "num_input_tokens_seen": 6684672000, + "step": 25500 + }, + { + "epoch": 0.12187414288610372, + "grad_norm": 0.18547837436199188, + "learning_rate": 0.001, + "loss": 2.6754, + "num_input_tokens_seen": 6697779200, + "step": 25550 + }, + { + "epoch": 0.12211264414419785, + "grad_norm": 0.18558937311172485, + "learning_rate": 0.001, + "loss": 2.6767, + "num_input_tokens_seen": 6710886400, + "step": 25600 + }, + { + "epoch": 0.122351145402292, + "grad_norm": 0.17276135087013245, + "learning_rate": 0.001, + "loss": 2.6775, + "num_input_tokens_seen": 6723993600, + "step": 25650 + }, + { + "epoch": 0.12258964666038613, + "grad_norm": 0.18483039736747742, + "learning_rate": 0.001, + "loss": 2.6818, + "num_input_tokens_seen": 6737100800, + "step": 25700 + }, + { + "epoch": 0.12282814791848028, + "grad_norm": 0.18036937713623047, + "learning_rate": 0.001, + "loss": 2.6669, + "num_input_tokens_seen": 6750208000, + "step": 25750 + }, + { + "epoch": 0.12306664917657441, + "grad_norm": 0.1728815734386444, + "learning_rate": 0.001, + "loss": 2.6789, + "num_input_tokens_seen": 6763315200, + "step": 25800 + }, + { + "epoch": 0.12330515043466854, + "grad_norm": 0.19193877279758453, + "learning_rate": 0.001, + "loss": 2.6487, + "num_input_tokens_seen": 6776422400, + "step": 25850 + }, + { + "epoch": 0.12354365169276268, + "grad_norm": 0.1584886610507965, + "learning_rate": 0.001, + "loss": 2.6638, + "num_input_tokens_seen": 6789529600, + "step": 25900 + }, + { + "epoch": 0.12378215295085682, + "grad_norm": 0.18792498111724854, + "learning_rate": 0.001, + "loss": 2.6754, + "num_input_tokens_seen": 6802636800, + "step": 25950 + }, + { + "epoch": 0.12402065420895095, + "grad_norm": 0.1689581423997879, + "learning_rate": 0.001, + "loss": 2.6682, + "num_input_tokens_seen": 6815744000, + "step": 26000 + }, + { + "epoch": 0.12402065420895095, + "eval_loss": 2.5587522983551025, + "eval_runtime": 50.7858, + "eval_samples_per_second": 98.453, + "eval_steps_per_second": 24.613, + "num_input_tokens_seen": 6815744000, + "step": 26000 + }, + { + "epoch": 0.1242591554670451, + "grad_norm": 0.18573056161403656, + "learning_rate": 0.001, + "loss": 2.6565, + "num_input_tokens_seen": 6828851200, + "step": 26050 + }, + { + "epoch": 0.12449765672513922, + "grad_norm": 0.19160890579223633, + "learning_rate": 0.001, + "loss": 2.6797, + "num_input_tokens_seen": 6841958400, + "step": 26100 + }, + { + "epoch": 0.12473615798323336, + "grad_norm": 0.18323373794555664, + "learning_rate": 0.001, + "loss": 2.6602, + "num_input_tokens_seen": 6855065600, + "step": 26150 + }, + { + "epoch": 0.1249746592413275, + "grad_norm": 0.17691807448863983, + "learning_rate": 0.001, + "loss": 2.6676, + "num_input_tokens_seen": 6868172800, + "step": 26200 + }, + { + "epoch": 0.12521316049942163, + "grad_norm": 0.20718660950660706, + "learning_rate": 0.001, + "loss": 2.6588, + "num_input_tokens_seen": 6881280000, + "step": 26250 + }, + { + "epoch": 0.12545166175751576, + "grad_norm": 0.17811058461666107, + "learning_rate": 0.001, + "loss": 2.6754, + "num_input_tokens_seen": 6894387200, + "step": 26300 + }, + { + "epoch": 0.1256901630156099, + "grad_norm": 0.17490555346012115, + "learning_rate": 0.001, + "loss": 2.6605, + "num_input_tokens_seen": 6907494400, + "step": 26350 + }, + { + "epoch": 0.12592866427370406, + "grad_norm": 0.17391368746757507, + "learning_rate": 0.001, + "loss": 2.684, + "num_input_tokens_seen": 6920601600, + "step": 26400 + }, + { + "epoch": 0.1261671655317982, + "grad_norm": 0.16951416432857513, + "learning_rate": 0.001, + "loss": 2.6685, + "num_input_tokens_seen": 6933708800, + "step": 26450 + }, + { + "epoch": 0.12640566678989232, + "grad_norm": 0.17574581503868103, + "learning_rate": 0.001, + "loss": 2.6665, + "num_input_tokens_seen": 6946816000, + "step": 26500 + }, + { + "epoch": 0.12640566678989232, + "eval_loss": 2.556109666824341, + "eval_runtime": 50.8743, + "eval_samples_per_second": 98.281, + "eval_steps_per_second": 24.57, + "num_input_tokens_seen": 6946816000, + "step": 26500 + }, + { + "epoch": 0.12664416804798645, + "grad_norm": 0.19910745322704315, + "learning_rate": 0.001, + "loss": 2.6755, + "num_input_tokens_seen": 6959923200, + "step": 26550 + }, + { + "epoch": 0.12688266930608058, + "grad_norm": 0.20141273736953735, + "learning_rate": 0.001, + "loss": 2.6798, + "num_input_tokens_seen": 6973030400, + "step": 26600 + }, + { + "epoch": 0.1271211705641747, + "grad_norm": 0.1732529103755951, + "learning_rate": 0.001, + "loss": 2.6606, + "num_input_tokens_seen": 6986137600, + "step": 26650 + }, + { + "epoch": 0.12735967182226887, + "grad_norm": 0.17546698451042175, + "learning_rate": 0.001, + "loss": 2.6717, + "num_input_tokens_seen": 6999244800, + "step": 26700 + }, + { + "epoch": 0.127598173080363, + "grad_norm": 0.2186097502708435, + "learning_rate": 0.001, + "loss": 2.6702, + "num_input_tokens_seen": 7012352000, + "step": 26750 + }, + { + "epoch": 0.12783667433845713, + "grad_norm": 0.1735202819108963, + "learning_rate": 0.001, + "loss": 2.6795, + "num_input_tokens_seen": 7025459200, + "step": 26800 + }, + { + "epoch": 0.12807517559655127, + "grad_norm": 0.40701860189437866, + "learning_rate": 0.001, + "loss": 2.6591, + "num_input_tokens_seen": 7038566400, + "step": 26850 + }, + { + "epoch": 0.1283136768546454, + "grad_norm": 0.19710049033164978, + "learning_rate": 0.001, + "loss": 2.6841, + "num_input_tokens_seen": 7051673600, + "step": 26900 + }, + { + "epoch": 0.12855217811273956, + "grad_norm": 0.18638554215431213, + "learning_rate": 0.001, + "loss": 2.6718, + "num_input_tokens_seen": 7064780800, + "step": 26950 + }, + { + "epoch": 0.1287906793708337, + "grad_norm": 0.17546561360359192, + "learning_rate": 0.001, + "loss": 2.6547, + "num_input_tokens_seen": 7077888000, + "step": 27000 + }, + { + "epoch": 0.1287906793708337, + "eval_loss": 2.551922559738159, + "eval_runtime": 50.1864, + "eval_samples_per_second": 99.629, + "eval_steps_per_second": 24.907, + "num_input_tokens_seen": 7077888000, + "step": 27000 + }, + { + "epoch": 0.12902918062892782, + "grad_norm": 0.1790401190519333, + "learning_rate": 0.001, + "loss": 2.672, + "num_input_tokens_seen": 7090995200, + "step": 27050 + }, + { + "epoch": 0.12926768188702195, + "grad_norm": 0.18173836171627045, + "learning_rate": 0.001, + "loss": 2.6563, + "num_input_tokens_seen": 7104102400, + "step": 27100 + }, + { + "epoch": 0.12950618314511608, + "grad_norm": 0.1827983856201172, + "learning_rate": 0.001, + "loss": 2.665, + "num_input_tokens_seen": 7117209600, + "step": 27150 + }, + { + "epoch": 0.12974468440321021, + "grad_norm": 0.20252254605293274, + "learning_rate": 0.001, + "loss": 2.675, + "num_input_tokens_seen": 7130316800, + "step": 27200 + }, + { + "epoch": 0.12998318566130437, + "grad_norm": 0.18492095172405243, + "learning_rate": 0.001, + "loss": 2.6801, + "num_input_tokens_seen": 7143424000, + "step": 27250 + }, + { + "epoch": 0.1302216869193985, + "grad_norm": 0.1962280571460724, + "learning_rate": 0.001, + "loss": 2.6551, + "num_input_tokens_seen": 7156531200, + "step": 27300 + }, + { + "epoch": 0.13046018817749264, + "grad_norm": 0.18813727796077728, + "learning_rate": 0.001, + "loss": 2.6728, + "num_input_tokens_seen": 7169638400, + "step": 27350 + }, + { + "epoch": 0.13069868943558677, + "grad_norm": 0.18111565709114075, + "learning_rate": 0.001, + "loss": 2.6743, + "num_input_tokens_seen": 7182745600, + "step": 27400 + }, + { + "epoch": 0.1309371906936809, + "grad_norm": 0.1727459728717804, + "learning_rate": 0.001, + "loss": 2.6596, + "num_input_tokens_seen": 7195852800, + "step": 27450 + }, + { + "epoch": 0.13117569195177506, + "grad_norm": 0.20097768306732178, + "learning_rate": 0.001, + "loss": 2.6651, + "num_input_tokens_seen": 7208960000, + "step": 27500 + }, + { + "epoch": 0.13117569195177506, + "eval_loss": 2.5501132011413574, + "eval_runtime": 50.3677, + "eval_samples_per_second": 99.27, + "eval_steps_per_second": 24.817, + "num_input_tokens_seen": 7208960000, + "step": 27500 + }, + { + "epoch": 0.1314141932098692, + "grad_norm": 0.17329637706279755, + "learning_rate": 0.001, + "loss": 2.663, + "num_input_tokens_seen": 7222067200, + "step": 27550 + }, + { + "epoch": 0.13165269446796332, + "grad_norm": 0.16942919790744781, + "learning_rate": 0.001, + "loss": 2.6609, + "num_input_tokens_seen": 7235174400, + "step": 27600 + }, + { + "epoch": 0.13189119572605745, + "grad_norm": 0.19828958809375763, + "learning_rate": 0.001, + "loss": 2.6625, + "num_input_tokens_seen": 7248281600, + "step": 27650 + }, + { + "epoch": 0.13212969698415158, + "grad_norm": 0.1928141862154007, + "learning_rate": 0.001, + "loss": 2.6597, + "num_input_tokens_seen": 7261388800, + "step": 27700 + }, + { + "epoch": 0.13236819824224572, + "grad_norm": 0.1870756894350052, + "learning_rate": 0.001, + "loss": 2.6718, + "num_input_tokens_seen": 7274496000, + "step": 27750 + }, + { + "epoch": 0.13260669950033988, + "grad_norm": 0.1786762923002243, + "learning_rate": 0.001, + "loss": 2.6631, + "num_input_tokens_seen": 7287603200, + "step": 27800 + }, + { + "epoch": 0.132845200758434, + "grad_norm": 0.1710624396800995, + "learning_rate": 0.001, + "loss": 2.6717, + "num_input_tokens_seen": 7300710400, + "step": 27850 + }, + { + "epoch": 0.13308370201652814, + "grad_norm": 0.1805214285850525, + "learning_rate": 0.001, + "loss": 2.6669, + "num_input_tokens_seen": 7313817600, + "step": 27900 + }, + { + "epoch": 0.13332220327462227, + "grad_norm": 0.18169906735420227, + "learning_rate": 0.001, + "loss": 2.6659, + "num_input_tokens_seen": 7326924800, + "step": 27950 + }, + { + "epoch": 0.1335607045327164, + "grad_norm": 0.16959500312805176, + "learning_rate": 0.001, + "loss": 2.6623, + "num_input_tokens_seen": 7340032000, + "step": 28000 + }, + { + "epoch": 0.1335607045327164, + "eval_loss": 2.548675060272217, + "eval_runtime": 50.3022, + "eval_samples_per_second": 99.399, + "eval_steps_per_second": 24.85, + "num_input_tokens_seen": 7340032000, + "step": 28000 + }, + { + "epoch": 0.13379920579081056, + "grad_norm": 0.19409704208374023, + "learning_rate": 0.001, + "loss": 2.6776, + "num_input_tokens_seen": 7353139200, + "step": 28050 + }, + { + "epoch": 0.1340377070489047, + "grad_norm": 0.1712968647480011, + "learning_rate": 0.001, + "loss": 2.6679, + "num_input_tokens_seen": 7366246400, + "step": 28100 + }, + { + "epoch": 0.13427620830699882, + "grad_norm": 0.20586130023002625, + "learning_rate": 0.001, + "loss": 2.6633, + "num_input_tokens_seen": 7379353600, + "step": 28150 + }, + { + "epoch": 0.13451470956509295, + "grad_norm": 0.1776891052722931, + "learning_rate": 0.001, + "loss": 2.6683, + "num_input_tokens_seen": 7392460800, + "step": 28200 + }, + { + "epoch": 0.1347532108231871, + "grad_norm": 0.19293451309204102, + "learning_rate": 0.001, + "loss": 2.6645, + "num_input_tokens_seen": 7405568000, + "step": 28250 + }, + { + "epoch": 0.13499171208128122, + "grad_norm": 0.17754724621772766, + "learning_rate": 0.001, + "loss": 2.6685, + "num_input_tokens_seen": 7418675200, + "step": 28300 + }, + { + "epoch": 0.13523021333937538, + "grad_norm": 0.17739038169384003, + "learning_rate": 0.001, + "loss": 2.6607, + "num_input_tokens_seen": 7431782400, + "step": 28350 + }, + { + "epoch": 0.1354687145974695, + "grad_norm": 0.175009086728096, + "learning_rate": 0.001, + "loss": 2.6679, + "num_input_tokens_seen": 7444889600, + "step": 28400 + }, + { + "epoch": 0.13570721585556364, + "grad_norm": 0.2229124754667282, + "learning_rate": 0.001, + "loss": 2.6687, + "num_input_tokens_seen": 7457996800, + "step": 28450 + }, + { + "epoch": 0.13594571711365777, + "grad_norm": 0.1791590005159378, + "learning_rate": 0.001, + "loss": 2.6741, + "num_input_tokens_seen": 7471104000, + "step": 28500 + }, + { + "epoch": 0.13594571711365777, + "eval_loss": 2.5456056594848633, + "eval_runtime": 50.6342, + "eval_samples_per_second": 98.747, + "eval_steps_per_second": 24.687, + "num_input_tokens_seen": 7471104000, + "step": 28500 + }, + { + "epoch": 0.1361842183717519, + "grad_norm": 0.18920041620731354, + "learning_rate": 0.001, + "loss": 2.6612, + "num_input_tokens_seen": 7484211200, + "step": 28550 + }, + { + "epoch": 0.13642271962984603, + "grad_norm": 0.19247522950172424, + "learning_rate": 0.001, + "loss": 2.6597, + "num_input_tokens_seen": 7497318400, + "step": 28600 + }, + { + "epoch": 0.1366612208879402, + "grad_norm": 0.22499197721481323, + "learning_rate": 0.001, + "loss": 2.6583, + "num_input_tokens_seen": 7510425600, + "step": 28650 + }, + { + "epoch": 0.13689972214603432, + "grad_norm": 0.18946559727191925, + "learning_rate": 0.001, + "loss": 2.6612, + "num_input_tokens_seen": 7523532800, + "step": 28700 + }, + { + "epoch": 0.13713822340412846, + "grad_norm": 0.19621454179286957, + "learning_rate": 0.001, + "loss": 2.6425, + "num_input_tokens_seen": 7536640000, + "step": 28750 + }, + { + "epoch": 0.1373767246622226, + "grad_norm": 0.21594376862049103, + "learning_rate": 0.001, + "loss": 2.6564, + "num_input_tokens_seen": 7549747200, + "step": 28800 + }, + { + "epoch": 0.13761522592031672, + "grad_norm": 0.18186470866203308, + "learning_rate": 0.001, + "loss": 2.6728, + "num_input_tokens_seen": 7562854400, + "step": 28850 + }, + { + "epoch": 0.13785372717841088, + "grad_norm": 0.19369743764400482, + "learning_rate": 0.001, + "loss": 2.6585, + "num_input_tokens_seen": 7575961600, + "step": 28900 + }, + { + "epoch": 0.138092228436505, + "grad_norm": 0.1897999793291092, + "learning_rate": 0.001, + "loss": 2.6564, + "num_input_tokens_seen": 7589068800, + "step": 28950 + }, + { + "epoch": 0.13833072969459914, + "grad_norm": 0.18076784908771515, + "learning_rate": 0.001, + "loss": 2.6453, + "num_input_tokens_seen": 7602176000, + "step": 29000 + }, + { + "epoch": 0.13833072969459914, + "eval_loss": 2.54413104057312, + "eval_runtime": 50.9152, + "eval_samples_per_second": 98.202, + "eval_steps_per_second": 24.551, + "num_input_tokens_seen": 7602176000, + "step": 29000 + }, + { + "epoch": 0.13856923095269327, + "grad_norm": 0.18520566821098328, + "learning_rate": 0.001, + "loss": 2.6644, + "num_input_tokens_seen": 7615283200, + "step": 29050 + }, + { + "epoch": 0.1388077322107874, + "grad_norm": 0.22739861905574799, + "learning_rate": 0.001, + "loss": 2.6597, + "num_input_tokens_seen": 7628390400, + "step": 29100 + }, + { + "epoch": 0.13904623346888154, + "grad_norm": 0.18451730906963348, + "learning_rate": 0.001, + "loss": 2.6432, + "num_input_tokens_seen": 7641497600, + "step": 29150 + }, + { + "epoch": 0.1392847347269757, + "grad_norm": 0.1865098923444748, + "learning_rate": 0.001, + "loss": 2.6651, + "num_input_tokens_seen": 7654604800, + "step": 29200 + }, + { + "epoch": 0.13952323598506983, + "grad_norm": 0.18676789104938507, + "learning_rate": 0.001, + "loss": 2.6597, + "num_input_tokens_seen": 7667712000, + "step": 29250 + }, + { + "epoch": 0.13976173724316396, + "grad_norm": 0.17463742196559906, + "learning_rate": 0.001, + "loss": 2.6571, + "num_input_tokens_seen": 7680819200, + "step": 29300 + }, + { + "epoch": 0.1400002385012581, + "grad_norm": 0.21621429920196533, + "learning_rate": 0.001, + "loss": 2.6342, + "num_input_tokens_seen": 7693926400, + "step": 29350 + }, + { + "epoch": 0.14023873975935222, + "grad_norm": 0.17493990063667297, + "learning_rate": 0.001, + "loss": 2.6536, + "num_input_tokens_seen": 7707033600, + "step": 29400 + }, + { + "epoch": 0.14047724101744638, + "grad_norm": 0.17649762332439423, + "learning_rate": 0.001, + "loss": 2.6526, + "num_input_tokens_seen": 7720140800, + "step": 29450 + }, + { + "epoch": 0.1407157422755405, + "grad_norm": 0.18224874138832092, + "learning_rate": 0.001, + "loss": 2.6635, + "num_input_tokens_seen": 7733248000, + "step": 29500 + }, + { + "epoch": 0.1407157422755405, + "eval_loss": 2.5433554649353027, + "eval_runtime": 51.2973, + "eval_samples_per_second": 97.471, + "eval_steps_per_second": 24.368, + "num_input_tokens_seen": 7733248000, + "step": 29500 + }, + { + "epoch": 0.14095424353363464, + "grad_norm": 0.21109874546527863, + "learning_rate": 0.001, + "loss": 2.6788, + "num_input_tokens_seen": 7746355200, + "step": 29550 + }, + { + "epoch": 0.14119274479172877, + "grad_norm": 0.17663723230361938, + "learning_rate": 0.001, + "loss": 2.6578, + "num_input_tokens_seen": 7759462400, + "step": 29600 + }, + { + "epoch": 0.1414312460498229, + "grad_norm": 0.18385198712348938, + "learning_rate": 0.001, + "loss": 2.676, + "num_input_tokens_seen": 7772569600, + "step": 29650 + }, + { + "epoch": 0.14166974730791704, + "grad_norm": 0.1829567402601242, + "learning_rate": 0.001, + "loss": 2.6586, + "num_input_tokens_seen": 7785676800, + "step": 29700 + }, + { + "epoch": 0.1419082485660112, + "grad_norm": 0.1907297968864441, + "learning_rate": 0.001, + "loss": 2.6508, + "num_input_tokens_seen": 7798784000, + "step": 29750 + }, + { + "epoch": 0.14214674982410533, + "grad_norm": 0.2106500118970871, + "learning_rate": 0.001, + "loss": 2.6578, + "num_input_tokens_seen": 7811891200, + "step": 29800 + }, + { + "epoch": 0.14238525108219946, + "grad_norm": 0.18974357843399048, + "learning_rate": 0.001, + "loss": 2.6506, + "num_input_tokens_seen": 7824998400, + "step": 29850 + }, + { + "epoch": 0.1426237523402936, + "grad_norm": 0.18876343965530396, + "learning_rate": 0.001, + "loss": 2.6663, + "num_input_tokens_seen": 7838105600, + "step": 29900 + }, + { + "epoch": 0.14286225359838772, + "grad_norm": 0.17305608093738556, + "learning_rate": 0.001, + "loss": 2.657, + "num_input_tokens_seen": 7851212800, + "step": 29950 + }, + { + "epoch": 0.14310075485648185, + "grad_norm": 0.18900860846042633, + "learning_rate": 0.001, + "loss": 2.6502, + "num_input_tokens_seen": 7864320000, + "step": 30000 + }, + { + "epoch": 0.14310075485648185, + "eval_loss": 2.540076971054077, + "eval_runtime": 50.1464, + "eval_samples_per_second": 99.708, + "eval_steps_per_second": 24.927, + "num_input_tokens_seen": 7864320000, + "step": 30000 + }, + { + "epoch": 0.143339256114576, + "grad_norm": 0.16919030249118805, + "learning_rate": 0.001, + "loss": 2.6729, + "num_input_tokens_seen": 7877427200, + "step": 30050 + }, + { + "epoch": 0.14357775737267015, + "grad_norm": 0.17828898131847382, + "learning_rate": 0.001, + "loss": 2.647, + "num_input_tokens_seen": 7890534400, + "step": 30100 + }, + { + "epoch": 0.14381625863076428, + "grad_norm": 0.1790715903043747, + "learning_rate": 0.001, + "loss": 2.6639, + "num_input_tokens_seen": 7903641600, + "step": 30150 + }, + { + "epoch": 0.1440547598888584, + "grad_norm": 0.18818187713623047, + "learning_rate": 0.001, + "loss": 2.6485, + "num_input_tokens_seen": 7916748800, + "step": 30200 + }, + { + "epoch": 0.14429326114695254, + "grad_norm": 0.2171814739704132, + "learning_rate": 0.001, + "loss": 2.6577, + "num_input_tokens_seen": 7929856000, + "step": 30250 + }, + { + "epoch": 0.1445317624050467, + "grad_norm": 0.1844399869441986, + "learning_rate": 0.001, + "loss": 2.6473, + "num_input_tokens_seen": 7942963200, + "step": 30300 + }, + { + "epoch": 0.14477026366314083, + "grad_norm": 0.19607801735401154, + "learning_rate": 0.001, + "loss": 2.6576, + "num_input_tokens_seen": 7956070400, + "step": 30350 + }, + { + "epoch": 0.14500876492123496, + "grad_norm": 0.1967996209859848, + "learning_rate": 0.001, + "loss": 2.64, + "num_input_tokens_seen": 7969177600, + "step": 30400 + }, + { + "epoch": 0.1452472661793291, + "grad_norm": 0.2087596207857132, + "learning_rate": 0.001, + "loss": 2.6485, + "num_input_tokens_seen": 7982284800, + "step": 30450 + }, + { + "epoch": 0.14548576743742322, + "grad_norm": 0.1938595473766327, + "learning_rate": 0.001, + "loss": 2.654, + "num_input_tokens_seen": 7995392000, + "step": 30500 + }, + { + "epoch": 0.14548576743742322, + "eval_loss": 2.537402391433716, + "eval_runtime": 50.7304, + "eval_samples_per_second": 98.56, + "eval_steps_per_second": 24.64, + "num_input_tokens_seen": 7995392000, + "step": 30500 + }, + { + "epoch": 0.14572426869551736, + "grad_norm": 0.18282300233840942, + "learning_rate": 0.001, + "loss": 2.6592, + "num_input_tokens_seen": 8008499200, + "step": 30550 + }, + { + "epoch": 0.14596276995361152, + "grad_norm": 0.1829262375831604, + "learning_rate": 0.001, + "loss": 2.6618, + "num_input_tokens_seen": 8021606400, + "step": 30600 + }, + { + "epoch": 0.14620127121170565, + "grad_norm": 0.19001947343349457, + "learning_rate": 0.001, + "loss": 2.649, + "num_input_tokens_seen": 8034713600, + "step": 30650 + }, + { + "epoch": 0.14643977246979978, + "grad_norm": 0.19943153858184814, + "learning_rate": 0.001, + "loss": 2.6578, + "num_input_tokens_seen": 8047820800, + "step": 30700 + }, + { + "epoch": 0.1466782737278939, + "grad_norm": 0.18482360243797302, + "learning_rate": 0.001, + "loss": 2.6616, + "num_input_tokens_seen": 8060928000, + "step": 30750 + }, + { + "epoch": 0.14691677498598804, + "grad_norm": 0.20858009159564972, + "learning_rate": 0.001, + "loss": 2.6684, + "num_input_tokens_seen": 8074035200, + "step": 30800 + }, + { + "epoch": 0.1471552762440822, + "grad_norm": 0.2759605646133423, + "learning_rate": 0.001, + "loss": 2.713, + "num_input_tokens_seen": 8087142400, + "step": 30850 + }, + { + "epoch": 0.14739377750217633, + "grad_norm": 0.22366145253181458, + "learning_rate": 0.001, + "loss": 2.7065, + "num_input_tokens_seen": 8100249600, + "step": 30900 + }, + { + "epoch": 0.14763227876027046, + "grad_norm": 0.22143268585205078, + "learning_rate": 0.001, + "loss": 2.672, + "num_input_tokens_seen": 8113356800, + "step": 30950 + }, + { + "epoch": 0.1478707800183646, + "grad_norm": 0.25140002369880676, + "learning_rate": 0.001, + "loss": 2.6658, + "num_input_tokens_seen": 8126464000, + "step": 31000 + }, + { + "epoch": 0.1478707800183646, + "eval_loss": 2.5451457500457764, + "eval_runtime": 50.0622, + "eval_samples_per_second": 99.876, + "eval_steps_per_second": 24.969, + "num_input_tokens_seen": 8126464000, + "step": 31000 + }, + { + "epoch": 0.14810928127645873, + "grad_norm": 0.20207786560058594, + "learning_rate": 0.001, + "loss": 2.6493, + "num_input_tokens_seen": 8139571200, + "step": 31050 + }, + { + "epoch": 0.14834778253455286, + "grad_norm": 0.20135898888111115, + "learning_rate": 0.001, + "loss": 2.6555, + "num_input_tokens_seen": 8152678400, + "step": 31100 + }, + { + "epoch": 0.14858628379264702, + "grad_norm": 0.19284267723560333, + "learning_rate": 0.001, + "loss": 2.6637, + "num_input_tokens_seen": 8165785600, + "step": 31150 + }, + { + "epoch": 0.14882478505074115, + "grad_norm": 0.17214693129062653, + "learning_rate": 0.001, + "loss": 2.6663, + "num_input_tokens_seen": 8178892800, + "step": 31200 + }, + { + "epoch": 0.14906328630883528, + "grad_norm": 0.19444549083709717, + "learning_rate": 0.001, + "loss": 2.6541, + "num_input_tokens_seen": 8192000000, + "step": 31250 + }, + { + "epoch": 0.1493017875669294, + "grad_norm": 0.19992901384830475, + "learning_rate": 0.001, + "loss": 2.6419, + "num_input_tokens_seen": 8205107200, + "step": 31300 + }, + { + "epoch": 0.14954028882502354, + "grad_norm": 0.16732315719127655, + "learning_rate": 0.001, + "loss": 2.6559, + "num_input_tokens_seen": 8218214400, + "step": 31350 + }, + { + "epoch": 0.1497787900831177, + "grad_norm": 0.4210798442363739, + "learning_rate": 0.001, + "loss": 2.6478, + "num_input_tokens_seen": 8231321600, + "step": 31400 + }, + { + "epoch": 0.15001729134121183, + "grad_norm": 0.2139436900615692, + "learning_rate": 0.001, + "loss": 2.6753, + "num_input_tokens_seen": 8244428800, + "step": 31450 + }, + { + "epoch": 0.15025579259930597, + "grad_norm": 0.19131046533584595, + "learning_rate": 0.001, + "loss": 2.6675, + "num_input_tokens_seen": 8257536000, + "step": 31500 + }, + { + "epoch": 0.15025579259930597, + "eval_loss": 2.5402350425720215, + "eval_runtime": 50.477, + "eval_samples_per_second": 99.055, + "eval_steps_per_second": 24.764, + "num_input_tokens_seen": 8257536000, + "step": 31500 + }, + { + "epoch": 0.1504942938574001, + "grad_norm": 0.20711492002010345, + "learning_rate": 0.001, + "loss": 2.6654, + "num_input_tokens_seen": 8270643200, + "step": 31550 + }, + { + "epoch": 0.15073279511549423, + "grad_norm": 0.1888076812028885, + "learning_rate": 0.001, + "loss": 2.6603, + "num_input_tokens_seen": 8283750400, + "step": 31600 + }, + { + "epoch": 0.15097129637358836, + "grad_norm": 0.18534335494041443, + "learning_rate": 0.001, + "loss": 2.6539, + "num_input_tokens_seen": 8296857600, + "step": 31650 + }, + { + "epoch": 0.15120979763168252, + "grad_norm": 0.2024192214012146, + "learning_rate": 0.001, + "loss": 2.6514, + "num_input_tokens_seen": 8309964800, + "step": 31700 + }, + { + "epoch": 0.15144829888977665, + "grad_norm": 0.18967773020267487, + "learning_rate": 0.001, + "loss": 2.6457, + "num_input_tokens_seen": 8323072000, + "step": 31750 + }, + { + "epoch": 0.15168680014787078, + "grad_norm": 0.18823806941509247, + "learning_rate": 0.001, + "loss": 2.6579, + "num_input_tokens_seen": 8336179200, + "step": 31800 + }, + { + "epoch": 0.1519253014059649, + "grad_norm": 0.20198485255241394, + "learning_rate": 0.001, + "loss": 2.6623, + "num_input_tokens_seen": 8349286400, + "step": 31850 + }, + { + "epoch": 0.15216380266405904, + "grad_norm": 0.19362477958202362, + "learning_rate": 0.001, + "loss": 2.6473, + "num_input_tokens_seen": 8362393600, + "step": 31900 + }, + { + "epoch": 0.15240230392215318, + "grad_norm": 0.18454812467098236, + "learning_rate": 0.001, + "loss": 2.6411, + "num_input_tokens_seen": 8375500800, + "step": 31950 + }, + { + "epoch": 0.15264080518024734, + "grad_norm": 0.1968630850315094, + "learning_rate": 0.001, + "loss": 2.6405, + "num_input_tokens_seen": 8388608000, + "step": 32000 + }, + { + "epoch": 0.15264080518024734, + "eval_loss": 2.5325138568878174, + "eval_runtime": 51.0134, + "eval_samples_per_second": 98.013, + "eval_steps_per_second": 24.503, + "num_input_tokens_seen": 8388608000, + "step": 32000 + }, + { + "epoch": 0.15287930643834147, + "grad_norm": 0.180119588971138, + "learning_rate": 0.001, + "loss": 2.6558, + "num_input_tokens_seen": 8401715200, + "step": 32050 + }, + { + "epoch": 0.1531178076964356, + "grad_norm": 0.1952589452266693, + "learning_rate": 0.001, + "loss": 2.6465, + "num_input_tokens_seen": 8414822400, + "step": 32100 + }, + { + "epoch": 0.15335630895452973, + "grad_norm": 0.1845589131116867, + "learning_rate": 0.001, + "loss": 2.6297, + "num_input_tokens_seen": 8427929600, + "step": 32150 + }, + { + "epoch": 0.15359481021262386, + "grad_norm": 0.20116594433784485, + "learning_rate": 0.001, + "loss": 2.6422, + "num_input_tokens_seen": 8441036800, + "step": 32200 + }, + { + "epoch": 0.15383331147071802, + "grad_norm": 0.1932612508535385, + "learning_rate": 0.001, + "loss": 2.6494, + "num_input_tokens_seen": 8454144000, + "step": 32250 + }, + { + "epoch": 0.15407181272881215, + "grad_norm": 0.17934490740299225, + "learning_rate": 0.001, + "loss": 2.6474, + "num_input_tokens_seen": 8467251200, + "step": 32300 + }, + { + "epoch": 0.15431031398690628, + "grad_norm": 0.19273313879966736, + "learning_rate": 0.001, + "loss": 2.6447, + "num_input_tokens_seen": 8480358400, + "step": 32350 + }, + { + "epoch": 0.15454881524500041, + "grad_norm": 0.1921055018901825, + "learning_rate": 0.001, + "loss": 2.665, + "num_input_tokens_seen": 8493465600, + "step": 32400 + }, + { + "epoch": 0.15478731650309455, + "grad_norm": 0.37117844820022583, + "learning_rate": 0.001, + "loss": 2.6351, + "num_input_tokens_seen": 8506572800, + "step": 32450 + }, + { + "epoch": 0.15502581776118868, + "grad_norm": 0.1884016990661621, + "learning_rate": 0.001, + "loss": 2.6436, + "num_input_tokens_seen": 8519680000, + "step": 32500 + }, + { + "epoch": 0.15502581776118868, + "eval_loss": 2.5313448905944824, + "eval_runtime": 50.7051, + "eval_samples_per_second": 98.609, + "eval_steps_per_second": 24.652, + "num_input_tokens_seen": 8519680000, + "step": 32500 + }, + { + "epoch": 0.15526431901928284, + "grad_norm": 0.22205407917499542, + "learning_rate": 0.001, + "loss": 2.6464, + "num_input_tokens_seen": 8532787200, + "step": 32550 + }, + { + "epoch": 0.15550282027737697, + "grad_norm": 0.18515361845493317, + "learning_rate": 0.001, + "loss": 2.642, + "num_input_tokens_seen": 8545894400, + "step": 32600 + }, + { + "epoch": 0.1557413215354711, + "grad_norm": 0.18903231620788574, + "learning_rate": 0.001, + "loss": 2.6446, + "num_input_tokens_seen": 8559001600, + "step": 32650 + }, + { + "epoch": 0.15597982279356523, + "grad_norm": 0.1857556253671646, + "learning_rate": 0.001, + "loss": 2.6561, + "num_input_tokens_seen": 8572108800, + "step": 32700 + }, + { + "epoch": 0.15621832405165936, + "grad_norm": 0.45706707239151, + "learning_rate": 0.001, + "loss": 2.6487, + "num_input_tokens_seen": 8585216000, + "step": 32750 + }, + { + "epoch": 0.15645682530975352, + "grad_norm": 0.20191136002540588, + "learning_rate": 0.001, + "loss": 2.6593, + "num_input_tokens_seen": 8598323200, + "step": 32800 + }, + { + "epoch": 0.15669532656784765, + "grad_norm": 0.21191105246543884, + "learning_rate": 0.001, + "loss": 2.659, + "num_input_tokens_seen": 8611430400, + "step": 32850 + }, + { + "epoch": 0.15693382782594179, + "grad_norm": 0.20596672594547272, + "learning_rate": 0.001, + "loss": 2.6354, + "num_input_tokens_seen": 8624537600, + "step": 32900 + }, + { + "epoch": 0.15717232908403592, + "grad_norm": 0.2952199876308441, + "learning_rate": 0.001, + "loss": 2.6501, + "num_input_tokens_seen": 8637644800, + "step": 32950 + }, + { + "epoch": 0.15741083034213005, + "grad_norm": 0.2217044234275818, + "learning_rate": 0.001, + "loss": 2.6495, + "num_input_tokens_seen": 8650752000, + "step": 33000 + }, + { + "epoch": 0.15741083034213005, + "eval_loss": 2.5319430828094482, + "eval_runtime": 50.8413, + "eval_samples_per_second": 98.345, + "eval_steps_per_second": 24.586, + "num_input_tokens_seen": 8650752000, + "step": 33000 + }, + { + "epoch": 0.15764933160022418, + "grad_norm": 0.2384626269340515, + "learning_rate": 0.001, + "loss": 2.6503, + "num_input_tokens_seen": 8663859200, + "step": 33050 + }, + { + "epoch": 0.15788783285831834, + "grad_norm": 0.18387843668460846, + "learning_rate": 0.001, + "loss": 2.6469, + "num_input_tokens_seen": 8676966400, + "step": 33100 + }, + { + "epoch": 0.15812633411641247, + "grad_norm": 0.23530641198158264, + "learning_rate": 0.001, + "loss": 2.6484, + "num_input_tokens_seen": 8690073600, + "step": 33150 + }, + { + "epoch": 0.1583648353745066, + "grad_norm": 0.2027565985918045, + "learning_rate": 0.001, + "loss": 2.6564, + "num_input_tokens_seen": 8703180800, + "step": 33200 + }, + { + "epoch": 0.15860333663260073, + "grad_norm": 0.21472220122814178, + "learning_rate": 0.001, + "loss": 2.6543, + "num_input_tokens_seen": 8716288000, + "step": 33250 + }, + { + "epoch": 0.15884183789069486, + "grad_norm": 0.19012615084648132, + "learning_rate": 0.001, + "loss": 2.6378, + "num_input_tokens_seen": 8729395200, + "step": 33300 + }, + { + "epoch": 0.159080339148789, + "grad_norm": 0.18018738925457, + "learning_rate": 0.001, + "loss": 2.6461, + "num_input_tokens_seen": 8742502400, + "step": 33350 + }, + { + "epoch": 0.15931884040688316, + "grad_norm": 0.20139184594154358, + "learning_rate": 0.001, + "loss": 2.6419, + "num_input_tokens_seen": 8755609600, + "step": 33400 + }, + { + "epoch": 0.1595573416649773, + "grad_norm": 0.20734767615795135, + "learning_rate": 0.001, + "loss": 2.6299, + "num_input_tokens_seen": 8768716800, + "step": 33450 + }, + { + "epoch": 0.15979584292307142, + "grad_norm": 0.18958640098571777, + "learning_rate": 0.001, + "loss": 2.6525, + "num_input_tokens_seen": 8781824000, + "step": 33500 + }, + { + "epoch": 0.15979584292307142, + "eval_loss": 2.5301430225372314, + "eval_runtime": 64.6928, + "eval_samples_per_second": 77.288, + "eval_steps_per_second": 19.322, + "num_input_tokens_seen": 8781824000, + "step": 33500 + }, + { + "epoch": 0.16003434418116555, + "grad_norm": 0.20421727001667023, + "learning_rate": 0.001, + "loss": 2.6445, + "num_input_tokens_seen": 8794931200, + "step": 33550 + }, + { + "epoch": 0.16027284543925968, + "grad_norm": 0.18347379565238953, + "learning_rate": 0.001, + "loss": 2.6525, + "num_input_tokens_seen": 8808038400, + "step": 33600 + }, + { + "epoch": 0.16051134669735384, + "grad_norm": 0.19450639188289642, + "learning_rate": 0.001, + "loss": 2.6356, + "num_input_tokens_seen": 8821145600, + "step": 33650 + }, + { + "epoch": 0.16074984795544797, + "grad_norm": 0.17953775823116302, + "learning_rate": 0.001, + "loss": 2.6424, + "num_input_tokens_seen": 8834252800, + "step": 33700 + }, + { + "epoch": 0.1609883492135421, + "grad_norm": 0.1990649551153183, + "learning_rate": 0.001, + "loss": 2.6608, + "num_input_tokens_seen": 8847360000, + "step": 33750 + }, + { + "epoch": 0.16122685047163623, + "grad_norm": 0.19343194365501404, + "learning_rate": 0.001, + "loss": 2.6604, + "num_input_tokens_seen": 8860467200, + "step": 33800 + }, + { + "epoch": 0.16146535172973037, + "grad_norm": 0.19385921955108643, + "learning_rate": 0.001, + "loss": 2.6354, + "num_input_tokens_seen": 8873574400, + "step": 33850 + }, + { + "epoch": 0.1617038529878245, + "grad_norm": 0.1828273981809616, + "learning_rate": 0.001, + "loss": 2.6578, + "num_input_tokens_seen": 8886681600, + "step": 33900 + }, + { + "epoch": 0.16194235424591866, + "grad_norm": 0.216063991189003, + "learning_rate": 0.001, + "loss": 2.6575, + "num_input_tokens_seen": 8899788800, + "step": 33950 + }, + { + "epoch": 0.1621808555040128, + "grad_norm": 0.20358648896217346, + "learning_rate": 0.001, + "loss": 2.6499, + "num_input_tokens_seen": 8912896000, + "step": 34000 + }, + { + "epoch": 0.1621808555040128, + "eval_loss": 2.5330910682678223, + "eval_runtime": 50.7961, + "eval_samples_per_second": 98.433, + "eval_steps_per_second": 24.608, + "num_input_tokens_seen": 8912896000, + "step": 34000 + }, + { + "epoch": 0.16241935676210692, + "grad_norm": 0.1935052126646042, + "learning_rate": 0.001, + "loss": 2.6583, + "num_input_tokens_seen": 8926003200, + "step": 34050 + }, + { + "epoch": 0.16265785802020105, + "grad_norm": 0.7825157642364502, + "learning_rate": 0.001, + "loss": 2.6481, + "num_input_tokens_seen": 8939110400, + "step": 34100 + }, + { + "epoch": 0.16289635927829518, + "grad_norm": 0.23290683329105377, + "learning_rate": 0.001, + "loss": 2.6925, + "num_input_tokens_seen": 8952217600, + "step": 34150 + }, + { + "epoch": 0.16313486053638934, + "grad_norm": 0.23564130067825317, + "learning_rate": 0.001, + "loss": 2.6495, + "num_input_tokens_seen": 8965324800, + "step": 34200 + }, + { + "epoch": 0.16337336179448347, + "grad_norm": 0.19592130184173584, + "learning_rate": 0.001, + "loss": 2.6536, + "num_input_tokens_seen": 8978432000, + "step": 34250 + }, + { + "epoch": 0.1636118630525776, + "grad_norm": 0.23535041511058807, + "learning_rate": 0.001, + "loss": 2.6608, + "num_input_tokens_seen": 8991539200, + "step": 34300 + }, + { + "epoch": 0.16385036431067174, + "grad_norm": 0.1991938352584839, + "learning_rate": 0.001, + "loss": 2.6458, + "num_input_tokens_seen": 9004641856, + "step": 34350 + }, + { + "epoch": 0.16408886556876587, + "grad_norm": 0.19363388419151306, + "learning_rate": 0.001, + "loss": 2.6531, + "num_input_tokens_seen": 9017749056, + "step": 34400 + }, + { + "epoch": 0.16432736682686, + "grad_norm": 0.18500390648841858, + "learning_rate": 0.001, + "loss": 2.6391, + "num_input_tokens_seen": 9030856256, + "step": 34450 + }, + { + "epoch": 0.16456586808495416, + "grad_norm": 0.2774065434932709, + "learning_rate": 0.001, + "loss": 2.6619, + "num_input_tokens_seen": 9043963456, + "step": 34500 + }, + { + "epoch": 0.16456586808495416, + "eval_loss": 2.5325100421905518, + "eval_runtime": 51.5954, + "eval_samples_per_second": 96.908, + "eval_steps_per_second": 24.227, + "num_input_tokens_seen": 9043963456, + "step": 34500 + }, + { + "epoch": 0.1648043693430483, + "grad_norm": 0.1957511603832245, + "learning_rate": 0.001, + "loss": 2.6456, + "num_input_tokens_seen": 9057070656, + "step": 34550 + }, + { + "epoch": 0.16504287060114242, + "grad_norm": 0.20958378911018372, + "learning_rate": 0.001, + "loss": 2.6452, + "num_input_tokens_seen": 9070177856, + "step": 34600 + }, + { + "epoch": 0.16528137185923655, + "grad_norm": 0.206208735704422, + "learning_rate": 0.001, + "loss": 2.6548, + "num_input_tokens_seen": 9083285056, + "step": 34650 + }, + { + "epoch": 0.16551987311733068, + "grad_norm": 0.22349481284618378, + "learning_rate": 0.001, + "loss": 2.6653, + "num_input_tokens_seen": 9096392256, + "step": 34700 + }, + { + "epoch": 0.16575837437542484, + "grad_norm": 0.22599968314170837, + "learning_rate": 0.001, + "loss": 2.6329, + "num_input_tokens_seen": 9109499456, + "step": 34750 + }, + { + "epoch": 0.16599687563351898, + "grad_norm": 0.19219790399074554, + "learning_rate": 0.001, + "loss": 2.6404, + "num_input_tokens_seen": 9122606656, + "step": 34800 + }, + { + "epoch": 0.1662353768916131, + "grad_norm": 0.2006351351737976, + "learning_rate": 0.001, + "loss": 2.6522, + "num_input_tokens_seen": 9135713856, + "step": 34850 + }, + { + "epoch": 0.16647387814970724, + "grad_norm": 0.18393316864967346, + "learning_rate": 0.001, + "loss": 2.6464, + "num_input_tokens_seen": 9148821056, + "step": 34900 + }, + { + "epoch": 0.16671237940780137, + "grad_norm": 0.19820146262645721, + "learning_rate": 0.001, + "loss": 2.6402, + "num_input_tokens_seen": 9161928256, + "step": 34950 + }, + { + "epoch": 0.1669508806658955, + "grad_norm": 0.1995670199394226, + "learning_rate": 0.001, + "loss": 2.652, + "num_input_tokens_seen": 9175035456, + "step": 35000 + }, + { + "epoch": 0.1669508806658955, + "eval_loss": 2.5248045921325684, + "eval_runtime": 50.8205, + "eval_samples_per_second": 98.386, + "eval_steps_per_second": 24.596, + "num_input_tokens_seen": 9175035456, + "step": 35000 + }, + { + "epoch": 0.16718938192398966, + "grad_norm": 0.2099646031856537, + "learning_rate": 0.001, + "loss": 2.6307, + "num_input_tokens_seen": 9188142656, + "step": 35050 + }, + { + "epoch": 0.1674278831820838, + "grad_norm": 0.18913927674293518, + "learning_rate": 0.001, + "loss": 2.6368, + "num_input_tokens_seen": 9201249856, + "step": 35100 + }, + { + "epoch": 0.16766638444017792, + "grad_norm": 0.19193056225776672, + "learning_rate": 0.001, + "loss": 2.6325, + "num_input_tokens_seen": 9214357056, + "step": 35150 + }, + { + "epoch": 0.16790488569827206, + "grad_norm": 0.19911837577819824, + "learning_rate": 0.001, + "loss": 2.6543, + "num_input_tokens_seen": 9227464256, + "step": 35200 + }, + { + "epoch": 0.1681433869563662, + "grad_norm": 0.1985558718442917, + "learning_rate": 0.001, + "loss": 2.6518, + "num_input_tokens_seen": 9240571456, + "step": 35250 + }, + { + "epoch": 0.16838188821446032, + "grad_norm": 0.2079145759344101, + "learning_rate": 0.001, + "loss": 2.646, + "num_input_tokens_seen": 9253678656, + "step": 35300 + }, + { + "epoch": 0.16862038947255448, + "grad_norm": 0.18524424731731415, + "learning_rate": 0.001, + "loss": 2.6378, + "num_input_tokens_seen": 9266785856, + "step": 35350 + }, + { + "epoch": 0.1688588907306486, + "grad_norm": 0.19140370190143585, + "learning_rate": 0.001, + "loss": 2.6488, + "num_input_tokens_seen": 9279893056, + "step": 35400 + }, + { + "epoch": 0.16909739198874274, + "grad_norm": 0.18006138503551483, + "learning_rate": 0.001, + "loss": 2.6632, + "num_input_tokens_seen": 9293000256, + "step": 35450 + }, + { + "epoch": 0.16933589324683687, + "grad_norm": 0.18754282593727112, + "learning_rate": 0.001, + "loss": 2.6436, + "num_input_tokens_seen": 9306107456, + "step": 35500 + }, + { + "epoch": 0.16933589324683687, + "eval_loss": 2.5230932235717773, + "eval_runtime": 50.9895, + "eval_samples_per_second": 98.059, + "eval_steps_per_second": 24.515, + "num_input_tokens_seen": 9306107456, + "step": 35500 + }, + { + "epoch": 0.169574394504931, + "grad_norm": 0.18708109855651855, + "learning_rate": 0.001, + "loss": 2.6509, + "num_input_tokens_seen": 9319214656, + "step": 35550 + }, + { + "epoch": 0.16981289576302516, + "grad_norm": 0.2019611895084381, + "learning_rate": 0.001, + "loss": 2.6333, + "num_input_tokens_seen": 9332321856, + "step": 35600 + }, + { + "epoch": 0.1700513970211193, + "grad_norm": 0.22504755854606628, + "learning_rate": 0.001, + "loss": 2.6359, + "num_input_tokens_seen": 9345429056, + "step": 35650 + }, + { + "epoch": 0.17028989827921343, + "grad_norm": 0.1972053200006485, + "learning_rate": 0.001, + "loss": 2.6362, + "num_input_tokens_seen": 9358536256, + "step": 35700 + }, + { + "epoch": 0.17052839953730756, + "grad_norm": 0.21156789362430573, + "learning_rate": 0.001, + "loss": 2.637, + "num_input_tokens_seen": 9371643456, + "step": 35750 + }, + { + "epoch": 0.1707669007954017, + "grad_norm": 0.2680750787258148, + "learning_rate": 0.001, + "loss": 2.6332, + "num_input_tokens_seen": 9384750656, + "step": 35800 + }, + { + "epoch": 0.17100540205349582, + "grad_norm": 0.24413707852363586, + "learning_rate": 0.001, + "loss": 2.6366, + "num_input_tokens_seen": 9397857856, + "step": 35850 + }, + { + "epoch": 0.17124390331158998, + "grad_norm": 0.19973772764205933, + "learning_rate": 0.001, + "loss": 2.6381, + "num_input_tokens_seen": 9410965056, + "step": 35900 + }, + { + "epoch": 0.1714824045696841, + "grad_norm": 0.20807349681854248, + "learning_rate": 0.001, + "loss": 2.6416, + "num_input_tokens_seen": 9424072256, + "step": 35950 + }, + { + "epoch": 0.17172090582777824, + "grad_norm": 0.20126542448997498, + "learning_rate": 0.001, + "loss": 2.6377, + "num_input_tokens_seen": 9437179456, + "step": 36000 + }, + { + "epoch": 0.17172090582777824, + "eval_loss": 2.52500581741333, + "eval_runtime": 51.4353, + "eval_samples_per_second": 97.21, + "eval_steps_per_second": 24.302, + "num_input_tokens_seen": 9437179456, + "step": 36000 + }, + { + "epoch": 0.17195940708587237, + "grad_norm": 0.19696597754955292, + "learning_rate": 0.001, + "loss": 2.6521, + "num_input_tokens_seen": 9450286656, + "step": 36050 + }, + { + "epoch": 0.1721979083439665, + "grad_norm": 0.18839424848556519, + "learning_rate": 0.001, + "loss": 2.6484, + "num_input_tokens_seen": 9463393856, + "step": 36100 + }, + { + "epoch": 0.17243640960206066, + "grad_norm": 0.33748558163642883, + "learning_rate": 0.001, + "loss": 2.6496, + "num_input_tokens_seen": 9476501056, + "step": 36150 + }, + { + "epoch": 0.1726749108601548, + "grad_norm": 0.19529207050800323, + "learning_rate": 0.001, + "loss": 2.6484, + "num_input_tokens_seen": 9489608256, + "step": 36200 + }, + { + "epoch": 0.17291341211824893, + "grad_norm": 0.21542242169380188, + "learning_rate": 0.001, + "loss": 2.6572, + "num_input_tokens_seen": 9502715456, + "step": 36250 + }, + { + "epoch": 0.17315191337634306, + "grad_norm": 0.37017494440078735, + "learning_rate": 0.001, + "loss": 2.6517, + "num_input_tokens_seen": 9515822656, + "step": 36300 + }, + { + "epoch": 0.1733904146344372, + "grad_norm": 0.27284151315689087, + "learning_rate": 0.001, + "loss": 2.66, + "num_input_tokens_seen": 9528929856, + "step": 36350 + }, + { + "epoch": 0.17362891589253132, + "grad_norm": 0.4666242003440857, + "learning_rate": 0.001, + "loss": 2.6514, + "num_input_tokens_seen": 9542037056, + "step": 36400 + }, + { + "epoch": 0.17386741715062548, + "grad_norm": 0.2031467854976654, + "learning_rate": 0.001, + "loss": 2.6577, + "num_input_tokens_seen": 9555144256, + "step": 36450 + }, + { + "epoch": 0.1741059184087196, + "grad_norm": 0.2086576223373413, + "learning_rate": 0.001, + "loss": 2.6372, + "num_input_tokens_seen": 9568251456, + "step": 36500 + }, + { + "epoch": 0.1741059184087196, + "eval_loss": 2.5252223014831543, + "eval_runtime": 51.1282, + "eval_samples_per_second": 97.793, + "eval_steps_per_second": 24.448, + "num_input_tokens_seen": 9568251456, + "step": 36500 + }, + { + "epoch": 0.17434441966681374, + "grad_norm": 0.19739161431789398, + "learning_rate": 0.001, + "loss": 2.6184, + "num_input_tokens_seen": 9581358656, + "step": 36550 + }, + { + "epoch": 0.17458292092490788, + "grad_norm": 0.22384846210479736, + "learning_rate": 0.001, + "loss": 2.6504, + "num_input_tokens_seen": 9594465856, + "step": 36600 + }, + { + "epoch": 0.174821422183002, + "grad_norm": 0.2055511772632599, + "learning_rate": 0.001, + "loss": 2.6333, + "num_input_tokens_seen": 9607573056, + "step": 36650 + }, + { + "epoch": 0.17505992344109614, + "grad_norm": 0.18193551898002625, + "learning_rate": 0.001, + "loss": 2.6518, + "num_input_tokens_seen": 9620680256, + "step": 36700 + }, + { + "epoch": 0.1752984246991903, + "grad_norm": 0.1968860775232315, + "learning_rate": 0.001, + "loss": 2.6211, + "num_input_tokens_seen": 9633787456, + "step": 36750 + }, + { + "epoch": 0.17553692595728443, + "grad_norm": 0.20429988205432892, + "learning_rate": 0.001, + "loss": 2.6269, + "num_input_tokens_seen": 9646894656, + "step": 36800 + }, + { + "epoch": 0.17577542721537856, + "grad_norm": 0.18364110589027405, + "learning_rate": 0.001, + "loss": 2.6337, + "num_input_tokens_seen": 9660001856, + "step": 36850 + }, + { + "epoch": 0.1760139284734727, + "grad_norm": 0.2051621973514557, + "learning_rate": 0.001, + "loss": 2.6297, + "num_input_tokens_seen": 9673109056, + "step": 36900 + }, + { + "epoch": 0.17625242973156682, + "grad_norm": 0.25841349363327026, + "learning_rate": 0.001, + "loss": 2.6678, + "num_input_tokens_seen": 9686216256, + "step": 36950 + }, + { + "epoch": 0.17649093098966098, + "grad_norm": 0.198688805103302, + "learning_rate": 0.001, + "loss": 2.6521, + "num_input_tokens_seen": 9699323456, + "step": 37000 + }, + { + "epoch": 0.17649093098966098, + "eval_loss": 2.5248863697052, + "eval_runtime": 51.363, + "eval_samples_per_second": 97.346, + "eval_steps_per_second": 24.337, + "num_input_tokens_seen": 9699323456, + "step": 37000 + }, + { + "epoch": 0.1767294322477551, + "grad_norm": 0.2030065804719925, + "learning_rate": 0.001, + "loss": 2.6481, + "num_input_tokens_seen": 9712430656, + "step": 37050 + }, + { + "epoch": 0.17696793350584925, + "grad_norm": 0.20191729068756104, + "learning_rate": 0.001, + "loss": 2.6332, + "num_input_tokens_seen": 9725537856, + "step": 37100 + }, + { + "epoch": 0.17720643476394338, + "grad_norm": 0.19462484121322632, + "learning_rate": 0.001, + "loss": 2.6444, + "num_input_tokens_seen": 9738645056, + "step": 37150 + }, + { + "epoch": 0.1774449360220375, + "grad_norm": 0.27893325686454773, + "learning_rate": 0.001, + "loss": 2.6309, + "num_input_tokens_seen": 9751752256, + "step": 37200 + }, + { + "epoch": 0.17768343728013164, + "grad_norm": 0.20646531879901886, + "learning_rate": 0.001, + "loss": 2.646, + "num_input_tokens_seen": 9764859456, + "step": 37250 + }, + { + "epoch": 0.1779219385382258, + "grad_norm": 0.20815566182136536, + "learning_rate": 0.001, + "loss": 2.6374, + "num_input_tokens_seen": 9777966656, + "step": 37300 + }, + { + "epoch": 0.17816043979631993, + "grad_norm": 0.2194615602493286, + "learning_rate": 0.001, + "loss": 2.6313, + "num_input_tokens_seen": 9791073856, + "step": 37350 + }, + { + "epoch": 0.17839894105441406, + "grad_norm": 0.23223313689231873, + "learning_rate": 0.001, + "loss": 2.6435, + "num_input_tokens_seen": 9804181056, + "step": 37400 + }, + { + "epoch": 0.1786374423125082, + "grad_norm": 0.1731143593788147, + "learning_rate": 0.001, + "loss": 2.6397, + "num_input_tokens_seen": 9817288256, + "step": 37450 + }, + { + "epoch": 0.17887594357060232, + "grad_norm": 0.1929951161146164, + "learning_rate": 0.001, + "loss": 2.6406, + "num_input_tokens_seen": 9830395456, + "step": 37500 + }, + { + "epoch": 0.17887594357060232, + "eval_loss": 2.518928050994873, + "eval_runtime": 51.8733, + "eval_samples_per_second": 96.389, + "eval_steps_per_second": 24.097, + "num_input_tokens_seen": 9830395456, + "step": 37500 + }, + { + "epoch": 0.17911444482869648, + "grad_norm": 0.19979524612426758, + "learning_rate": 0.001, + "loss": 2.6363, + "num_input_tokens_seen": 9843502656, + "step": 37550 + }, + { + "epoch": 0.17935294608679062, + "grad_norm": 0.17963503301143646, + "learning_rate": 0.001, + "loss": 2.6423, + "num_input_tokens_seen": 9856609856, + "step": 37600 + }, + { + "epoch": 0.17959144734488475, + "grad_norm": 0.18216437101364136, + "learning_rate": 0.001, + "loss": 2.6351, + "num_input_tokens_seen": 9869717056, + "step": 37650 + }, + { + "epoch": 0.17982994860297888, + "grad_norm": 0.16782627999782562, + "learning_rate": 0.001, + "loss": 2.623, + "num_input_tokens_seen": 9882824256, + "step": 37700 + }, + { + "epoch": 0.180068449861073, + "grad_norm": 0.21884289383888245, + "learning_rate": 0.001, + "loss": 2.6418, + "num_input_tokens_seen": 9895931456, + "step": 37750 + }, + { + "epoch": 0.18030695111916714, + "grad_norm": 0.18940453231334686, + "learning_rate": 0.001, + "loss": 2.6371, + "num_input_tokens_seen": 9909038656, + "step": 37800 + }, + { + "epoch": 0.1805454523772613, + "grad_norm": 0.2075282484292984, + "learning_rate": 0.001, + "loss": 2.6347, + "num_input_tokens_seen": 9922145856, + "step": 37850 + }, + { + "epoch": 0.18078395363535543, + "grad_norm": 0.18504877388477325, + "learning_rate": 0.001, + "loss": 2.6391, + "num_input_tokens_seen": 9935253056, + "step": 37900 + }, + { + "epoch": 0.18102245489344956, + "grad_norm": 0.17926527559757233, + "learning_rate": 0.001, + "loss": 2.6358, + "num_input_tokens_seen": 9948360256, + "step": 37950 + }, + { + "epoch": 0.1812609561515437, + "grad_norm": 0.20022514462471008, + "learning_rate": 0.001, + "loss": 2.6369, + "num_input_tokens_seen": 9961467456, + "step": 38000 + }, + { + "epoch": 0.1812609561515437, + "eval_loss": 2.5171313285827637, + "eval_runtime": 51.617, + "eval_samples_per_second": 96.867, + "eval_steps_per_second": 24.217, + "num_input_tokens_seen": 9961467456, + "step": 38000 + }, + { + "epoch": 0.18149945740963783, + "grad_norm": 0.19376301765441895, + "learning_rate": 0.001, + "loss": 2.6274, + "num_input_tokens_seen": 9974574656, + "step": 38050 + }, + { + "epoch": 0.18173795866773199, + "grad_norm": 0.2077150195837021, + "learning_rate": 0.001, + "loss": 2.6303, + "num_input_tokens_seen": 9987681856, + "step": 38100 + }, + { + "epoch": 0.18197645992582612, + "grad_norm": 0.19407787919044495, + "learning_rate": 0.001, + "loss": 2.6246, + "num_input_tokens_seen": 10000789056, + "step": 38150 + }, + { + "epoch": 0.18221496118392025, + "grad_norm": 0.20558005571365356, + "learning_rate": 0.001, + "loss": 2.6291, + "num_input_tokens_seen": 10013896256, + "step": 38200 + }, + { + "epoch": 0.18245346244201438, + "grad_norm": 0.22928735613822937, + "learning_rate": 0.001, + "loss": 2.6336, + "num_input_tokens_seen": 10027003456, + "step": 38250 + }, + { + "epoch": 0.1826919637001085, + "grad_norm": 0.23481298983097076, + "learning_rate": 0.001, + "loss": 2.6412, + "num_input_tokens_seen": 10040110656, + "step": 38300 + }, + { + "epoch": 0.18293046495820264, + "grad_norm": 0.19808940589427948, + "learning_rate": 0.001, + "loss": 2.6395, + "num_input_tokens_seen": 10053217856, + "step": 38350 + }, + { + "epoch": 0.1831689662162968, + "grad_norm": 0.20152992010116577, + "learning_rate": 0.001, + "loss": 2.6224, + "num_input_tokens_seen": 10066325056, + "step": 38400 + }, + { + "epoch": 0.18340746747439093, + "grad_norm": 0.18065959215164185, + "learning_rate": 0.001, + "loss": 2.626, + "num_input_tokens_seen": 10079432256, + "step": 38450 + }, + { + "epoch": 0.18364596873248507, + "grad_norm": 0.20382963120937347, + "learning_rate": 0.001, + "loss": 2.6382, + "num_input_tokens_seen": 10092539456, + "step": 38500 + }, + { + "epoch": 0.18364596873248507, + "eval_loss": 2.5152854919433594, + "eval_runtime": 51.2566, + "eval_samples_per_second": 97.548, + "eval_steps_per_second": 24.387, + "num_input_tokens_seen": 10092539456, + "step": 38500 + }, + { + "epoch": 0.1838844699905792, + "grad_norm": 0.17728358507156372, + "learning_rate": 0.001, + "loss": 2.6293, + "num_input_tokens_seen": 10105646656, + "step": 38550 + }, + { + "epoch": 0.18412297124867333, + "grad_norm": 0.20164869725704193, + "learning_rate": 0.001, + "loss": 2.6372, + "num_input_tokens_seen": 10118753856, + "step": 38600 + }, + { + "epoch": 0.18436147250676746, + "grad_norm": 0.20125731825828552, + "learning_rate": 0.001, + "loss": 2.6326, + "num_input_tokens_seen": 10131861056, + "step": 38650 + }, + { + "epoch": 0.18459997376486162, + "grad_norm": 0.21193954348564148, + "learning_rate": 0.001, + "loss": 2.6249, + "num_input_tokens_seen": 10144968256, + "step": 38700 + }, + { + "epoch": 0.18483847502295575, + "grad_norm": 0.1925983726978302, + "learning_rate": 0.001, + "loss": 2.6424, + "num_input_tokens_seen": 10158075456, + "step": 38750 + }, + { + "epoch": 0.18507697628104988, + "grad_norm": 0.19814860820770264, + "learning_rate": 0.001, + "loss": 2.6431, + "num_input_tokens_seen": 10171182656, + "step": 38800 + }, + { + "epoch": 0.185315477539144, + "grad_norm": 0.1909031718969345, + "learning_rate": 0.001, + "loss": 2.6068, + "num_input_tokens_seen": 10184289856, + "step": 38850 + }, + { + "epoch": 0.18555397879723814, + "grad_norm": 0.20779775083065033, + "learning_rate": 0.001, + "loss": 2.625, + "num_input_tokens_seen": 10197397056, + "step": 38900 + }, + { + "epoch": 0.1857924800553323, + "grad_norm": 0.1768522411584854, + "learning_rate": 0.001, + "loss": 2.6112, + "num_input_tokens_seen": 10210504256, + "step": 38950 + }, + { + "epoch": 0.18603098131342644, + "grad_norm": 0.20275786519050598, + "learning_rate": 0.001, + "loss": 2.6284, + "num_input_tokens_seen": 10223611456, + "step": 39000 + }, + { + "epoch": 0.18603098131342644, + "eval_loss": 2.5149083137512207, + "eval_runtime": 51.6703, + "eval_samples_per_second": 96.767, + "eval_steps_per_second": 24.192, + "num_input_tokens_seen": 10223611456, + "step": 39000 + }, + { + "epoch": 0.18626948257152057, + "grad_norm": 0.19634057581424713, + "learning_rate": 0.001, + "loss": 2.6342, + "num_input_tokens_seen": 10236718656, + "step": 39050 + }, + { + "epoch": 0.1865079838296147, + "grad_norm": 0.19488537311553955, + "learning_rate": 0.001, + "loss": 2.6305, + "num_input_tokens_seen": 10249825856, + "step": 39100 + }, + { + "epoch": 0.18674648508770883, + "grad_norm": 0.2082369476556778, + "learning_rate": 0.001, + "loss": 2.6067, + "num_input_tokens_seen": 10262933056, + "step": 39150 + }, + { + "epoch": 0.18698498634580296, + "grad_norm": 0.21019776165485382, + "learning_rate": 0.001, + "loss": 2.628, + "num_input_tokens_seen": 10276040256, + "step": 39200 + }, + { + "epoch": 0.18722348760389712, + "grad_norm": 0.19929739832878113, + "learning_rate": 0.001, + "loss": 2.6256, + "num_input_tokens_seen": 10289147456, + "step": 39250 + }, + { + "epoch": 0.18746198886199125, + "grad_norm": 0.204230397939682, + "learning_rate": 0.001, + "loss": 2.6113, + "num_input_tokens_seen": 10302254656, + "step": 39300 + }, + { + "epoch": 0.18770049012008538, + "grad_norm": 0.2217213660478592, + "learning_rate": 0.001, + "loss": 2.6253, + "num_input_tokens_seen": 10315361856, + "step": 39350 + }, + { + "epoch": 0.18793899137817952, + "grad_norm": 0.19329366087913513, + "learning_rate": 0.001, + "loss": 2.6317, + "num_input_tokens_seen": 10328469056, + "step": 39400 + }, + { + "epoch": 0.18817749263627365, + "grad_norm": 0.18244336545467377, + "learning_rate": 0.001, + "loss": 2.6476, + "num_input_tokens_seen": 10341576256, + "step": 39450 + }, + { + "epoch": 0.1884159938943678, + "grad_norm": 0.1864692121744156, + "learning_rate": 0.001, + "loss": 2.642, + "num_input_tokens_seen": 10354683456, + "step": 39500 + }, + { + "epoch": 0.1884159938943678, + "eval_loss": 2.514141321182251, + "eval_runtime": 51.1111, + "eval_samples_per_second": 97.826, + "eval_steps_per_second": 24.457, + "num_input_tokens_seen": 10354683456, + "step": 39500 + }, + { + "epoch": 0.18865449515246194, + "grad_norm": 0.25003623962402344, + "learning_rate": 0.001, + "loss": 2.6299, + "num_input_tokens_seen": 10367790656, + "step": 39550 + }, + { + "epoch": 0.18889299641055607, + "grad_norm": 0.19642098248004913, + "learning_rate": 0.001, + "loss": 2.6412, + "num_input_tokens_seen": 10380897856, + "step": 39600 + }, + { + "epoch": 0.1891314976686502, + "grad_norm": 0.21947956085205078, + "learning_rate": 0.001, + "loss": 2.6211, + "num_input_tokens_seen": 10394005056, + "step": 39650 + }, + { + "epoch": 0.18936999892674433, + "grad_norm": 0.19838476181030273, + "learning_rate": 0.001, + "loss": 2.6451, + "num_input_tokens_seen": 10407112256, + "step": 39700 + }, + { + "epoch": 0.18960850018483846, + "grad_norm": 0.21131113171577454, + "learning_rate": 0.001, + "loss": 2.6375, + "num_input_tokens_seen": 10420219456, + "step": 39750 + }, + { + "epoch": 0.18984700144293262, + "grad_norm": 0.17576864361763, + "learning_rate": 0.001, + "loss": 2.6325, + "num_input_tokens_seen": 10433326656, + "step": 39800 + }, + { + "epoch": 0.19008550270102675, + "grad_norm": 0.2113037258386612, + "learning_rate": 0.001, + "loss": 2.6254, + "num_input_tokens_seen": 10446433856, + "step": 39850 + }, + { + "epoch": 0.19032400395912089, + "grad_norm": 0.1972583681344986, + "learning_rate": 0.001, + "loss": 2.6277, + "num_input_tokens_seen": 10459541056, + "step": 39900 + }, + { + "epoch": 0.19056250521721502, + "grad_norm": 0.43353378772735596, + "learning_rate": 0.001, + "loss": 2.6295, + "num_input_tokens_seen": 10472648256, + "step": 39950 + }, + { + "epoch": 0.19080100647530915, + "grad_norm": 0.22195081412792206, + "learning_rate": 0.001, + "loss": 2.6422, + "num_input_tokens_seen": 10485755456, + "step": 40000 + }, + { + "epoch": 0.19080100647530915, + "eval_loss": 2.5144667625427246, + "eval_runtime": 51.0006, + "eval_samples_per_second": 98.038, + "eval_steps_per_second": 24.51, + "num_input_tokens_seen": 10485755456, + "step": 40000 + }, + { + "epoch": 0.19103950773340328, + "grad_norm": 0.18717694282531738, + "learning_rate": 0.001, + "loss": 2.6512, + "num_input_tokens_seen": 10498862656, + "step": 40050 + }, + { + "epoch": 0.19127800899149744, + "grad_norm": 0.2009858638048172, + "learning_rate": 0.001, + "loss": 2.6289, + "num_input_tokens_seen": 10511969856, + "step": 40100 + }, + { + "epoch": 0.19151651024959157, + "grad_norm": 0.2515949010848999, + "learning_rate": 0.001, + "loss": 2.6342, + "num_input_tokens_seen": 10525077056, + "step": 40150 + }, + { + "epoch": 0.1917550115076857, + "grad_norm": 0.19864948093891144, + "learning_rate": 0.001, + "loss": 2.6191, + "num_input_tokens_seen": 10538184256, + "step": 40200 + }, + { + "epoch": 0.19199351276577983, + "grad_norm": 0.17704185843467712, + "learning_rate": 0.001, + "loss": 2.6176, + "num_input_tokens_seen": 10551291456, + "step": 40250 + }, + { + "epoch": 0.19223201402387396, + "grad_norm": 0.2097242772579193, + "learning_rate": 0.001, + "loss": 2.6509, + "num_input_tokens_seen": 10564398656, + "step": 40300 + }, + { + "epoch": 0.19247051528196812, + "grad_norm": 0.18630579113960266, + "learning_rate": 0.001, + "loss": 2.6273, + "num_input_tokens_seen": 10577505856, + "step": 40350 + }, + { + "epoch": 0.19270901654006226, + "grad_norm": 0.24162743985652924, + "learning_rate": 0.001, + "loss": 2.6405, + "num_input_tokens_seen": 10590613056, + "step": 40400 + }, + { + "epoch": 0.1929475177981564, + "grad_norm": 0.19576874375343323, + "learning_rate": 0.001, + "loss": 2.6403, + "num_input_tokens_seen": 10603720256, + "step": 40450 + }, + { + "epoch": 0.19318601905625052, + "grad_norm": 0.18408045172691345, + "learning_rate": 0.001, + "loss": 2.6149, + "num_input_tokens_seen": 10616827456, + "step": 40500 + }, + { + "epoch": 0.19318601905625052, + "eval_loss": 2.511899709701538, + "eval_runtime": 51.5326, + "eval_samples_per_second": 97.026, + "eval_steps_per_second": 24.257, + "num_input_tokens_seen": 10616827456, + "step": 40500 + }, + { + "epoch": 0.19342452031434465, + "grad_norm": 0.20845313370227814, + "learning_rate": 0.001, + "loss": 2.6242, + "num_input_tokens_seen": 10629934656, + "step": 40550 + }, + { + "epoch": 0.19366302157243878, + "grad_norm": 0.20603816211223602, + "learning_rate": 0.001, + "loss": 2.6305, + "num_input_tokens_seen": 10643041856, + "step": 40600 + }, + { + "epoch": 0.19390152283053294, + "grad_norm": 0.2180013507604599, + "learning_rate": 0.001, + "loss": 2.6271, + "num_input_tokens_seen": 10656149056, + "step": 40650 + }, + { + "epoch": 0.19414002408862707, + "grad_norm": 0.22217005491256714, + "learning_rate": 0.001, + "loss": 2.6407, + "num_input_tokens_seen": 10669256256, + "step": 40700 + }, + { + "epoch": 0.1943785253467212, + "grad_norm": 0.21379347145557404, + "learning_rate": 0.001, + "loss": 2.6209, + "num_input_tokens_seen": 10682363456, + "step": 40750 + }, + { + "epoch": 0.19461702660481534, + "grad_norm": 0.2011626958847046, + "learning_rate": 0.001, + "loss": 2.6471, + "num_input_tokens_seen": 10695470656, + "step": 40800 + }, + { + "epoch": 0.19485552786290947, + "grad_norm": 0.1946493685245514, + "learning_rate": 0.001, + "loss": 2.6267, + "num_input_tokens_seen": 10708577856, + "step": 40850 + }, + { + "epoch": 0.19509402912100363, + "grad_norm": 0.19157454371452332, + "learning_rate": 0.001, + "loss": 2.6362, + "num_input_tokens_seen": 10721685056, + "step": 40900 + }, + { + "epoch": 0.19533253037909776, + "grad_norm": 0.1978122442960739, + "learning_rate": 0.001, + "loss": 2.6448, + "num_input_tokens_seen": 10734792256, + "step": 40950 + }, + { + "epoch": 0.1955710316371919, + "grad_norm": 0.19996555149555206, + "learning_rate": 0.001, + "loss": 2.626, + "num_input_tokens_seen": 10747899456, + "step": 41000 + }, + { + "epoch": 0.1955710316371919, + "eval_loss": 2.5084941387176514, + "eval_runtime": 51.6987, + "eval_samples_per_second": 96.714, + "eval_steps_per_second": 24.179, + "num_input_tokens_seen": 10747899456, + "step": 41000 + }, + { + "epoch": 0.19580953289528602, + "grad_norm": 0.20298945903778076, + "learning_rate": 0.001, + "loss": 2.6233, + "num_input_tokens_seen": 10761006656, + "step": 41050 + }, + { + "epoch": 0.19604803415338015, + "grad_norm": 0.2280716896057129, + "learning_rate": 0.001, + "loss": 2.6427, + "num_input_tokens_seen": 10774113856, + "step": 41100 + }, + { + "epoch": 0.19628653541147428, + "grad_norm": 0.19223643839359283, + "learning_rate": 0.001, + "loss": 2.6263, + "num_input_tokens_seen": 10787221056, + "step": 41150 + }, + { + "epoch": 0.19652503666956844, + "grad_norm": 0.19221842288970947, + "learning_rate": 0.001, + "loss": 2.6401, + "num_input_tokens_seen": 10800328256, + "step": 41200 + }, + { + "epoch": 0.19676353792766257, + "grad_norm": 0.19479979574680328, + "learning_rate": 0.001, + "loss": 2.6269, + "num_input_tokens_seen": 10813435456, + "step": 41250 + }, + { + "epoch": 0.1970020391857567, + "grad_norm": 0.24501195549964905, + "learning_rate": 0.001, + "loss": 2.618, + "num_input_tokens_seen": 10826542656, + "step": 41300 + }, + { + "epoch": 0.19724054044385084, + "grad_norm": 0.1994044929742813, + "learning_rate": 0.001, + "loss": 2.64, + "num_input_tokens_seen": 10839649856, + "step": 41350 + }, + { + "epoch": 0.19747904170194497, + "grad_norm": 0.20831650495529175, + "learning_rate": 0.001, + "loss": 2.6513, + "num_input_tokens_seen": 10852757056, + "step": 41400 + }, + { + "epoch": 0.19771754296003913, + "grad_norm": 0.21919438242912292, + "learning_rate": 0.001, + "loss": 2.6379, + "num_input_tokens_seen": 10865864256, + "step": 41450 + }, + { + "epoch": 0.19795604421813326, + "grad_norm": 0.23088768124580383, + "learning_rate": 0.001, + "loss": 2.6449, + "num_input_tokens_seen": 10878971456, + "step": 41500 + }, + { + "epoch": 0.19795604421813326, + "eval_loss": 2.5156567096710205, + "eval_runtime": 51.6776, + "eval_samples_per_second": 96.754, + "eval_steps_per_second": 24.188, + "num_input_tokens_seen": 10878971456, + "step": 41500 + }, + { + "epoch": 0.1981945454762274, + "grad_norm": 0.1982518881559372, + "learning_rate": 0.001, + "loss": 2.6304, + "num_input_tokens_seen": 10892078656, + "step": 41550 + }, + { + "epoch": 0.19843304673432152, + "grad_norm": 0.2099853903055191, + "learning_rate": 0.001, + "loss": 2.6305, + "num_input_tokens_seen": 10905185856, + "step": 41600 + }, + { + "epoch": 0.19867154799241565, + "grad_norm": 0.19403131306171417, + "learning_rate": 0.001, + "loss": 2.6419, + "num_input_tokens_seen": 10918293056, + "step": 41650 + }, + { + "epoch": 0.19891004925050979, + "grad_norm": 0.20865993201732635, + "learning_rate": 0.001, + "loss": 2.6116, + "num_input_tokens_seen": 10931400256, + "step": 41700 + }, + { + "epoch": 0.19914855050860394, + "grad_norm": 0.19042626023292542, + "learning_rate": 0.001, + "loss": 2.6271, + "num_input_tokens_seen": 10944507456, + "step": 41750 + }, + { + "epoch": 0.19938705176669808, + "grad_norm": 0.20514579117298126, + "learning_rate": 0.001, + "loss": 2.6348, + "num_input_tokens_seen": 10957614656, + "step": 41800 + }, + { + "epoch": 0.1996255530247922, + "grad_norm": 0.21224668622016907, + "learning_rate": 0.001, + "loss": 2.6314, + "num_input_tokens_seen": 10970721856, + "step": 41850 + }, + { + "epoch": 0.19986405428288634, + "grad_norm": 0.18857082724571228, + "learning_rate": 0.001, + "loss": 2.6217, + "num_input_tokens_seen": 10983829056, + "step": 41900 + }, + { + "epoch": 0.20010255554098047, + "grad_norm": 0.18431074917316437, + "learning_rate": 0.001, + "loss": 2.6267, + "num_input_tokens_seen": 10996936256, + "step": 41950 + }, + { + "epoch": 0.2003410567990746, + "grad_norm": 0.20570099353790283, + "learning_rate": 0.001, + "loss": 2.6016, + "num_input_tokens_seen": 11010043456, + "step": 42000 + }, + { + "epoch": 0.2003410567990746, + "eval_loss": 2.506241798400879, + "eval_runtime": 51.5548, + "eval_samples_per_second": 96.984, + "eval_steps_per_second": 24.246, + "num_input_tokens_seen": 11010043456, + "step": 42000 + }, + { + "epoch": 0.20057955805716876, + "grad_norm": 0.17952106893062592, + "learning_rate": 0.001, + "loss": 2.6165, + "num_input_tokens_seen": 11023150656, + "step": 42050 + }, + { + "epoch": 0.2008180593152629, + "grad_norm": 0.20292694866657257, + "learning_rate": 0.001, + "loss": 2.6357, + "num_input_tokens_seen": 11036257856, + "step": 42100 + }, + { + "epoch": 0.20105656057335702, + "grad_norm": 0.19588933885097504, + "learning_rate": 0.001, + "loss": 2.6102, + "num_input_tokens_seen": 11049365056, + "step": 42150 + }, + { + "epoch": 0.20129506183145116, + "grad_norm": 0.1982785314321518, + "learning_rate": 0.001, + "loss": 2.6019, + "num_input_tokens_seen": 11062472256, + "step": 42200 + }, + { + "epoch": 0.2015335630895453, + "grad_norm": 0.18049876391887665, + "learning_rate": 0.001, + "loss": 2.6081, + "num_input_tokens_seen": 11075579456, + "step": 42250 + }, + { + "epoch": 0.20177206434763945, + "grad_norm": 0.2069908082485199, + "learning_rate": 0.001, + "loss": 2.6173, + "num_input_tokens_seen": 11088686656, + "step": 42300 + }, + { + "epoch": 0.20201056560573358, + "grad_norm": 0.2415982335805893, + "learning_rate": 0.001, + "loss": 2.6173, + "num_input_tokens_seen": 11101793856, + "step": 42350 + }, + { + "epoch": 0.2022490668638277, + "grad_norm": 0.20267252624034882, + "learning_rate": 0.001, + "loss": 2.6299, + "num_input_tokens_seen": 11114901056, + "step": 42400 + }, + { + "epoch": 0.20248756812192184, + "grad_norm": 0.20683065056800842, + "learning_rate": 0.001, + "loss": 2.6282, + "num_input_tokens_seen": 11128008256, + "step": 42450 + }, + { + "epoch": 0.20272606938001597, + "grad_norm": 0.22137881815433502, + "learning_rate": 0.001, + "loss": 2.6271, + "num_input_tokens_seen": 11141115456, + "step": 42500 + }, + { + "epoch": 0.20272606938001597, + "eval_loss": 2.5125572681427, + "eval_runtime": 51.794, + "eval_samples_per_second": 96.536, + "eval_steps_per_second": 24.134, + "num_input_tokens_seen": 11141115456, + "step": 42500 + }, + { + "epoch": 0.2029645706381101, + "grad_norm": 0.20610037446022034, + "learning_rate": 0.001, + "loss": 2.6255, + "num_input_tokens_seen": 11154222656, + "step": 42550 + }, + { + "epoch": 0.20320307189620426, + "grad_norm": 0.21218810975551605, + "learning_rate": 0.001, + "loss": 2.6149, + "num_input_tokens_seen": 11167329856, + "step": 42600 + }, + { + "epoch": 0.2034415731542984, + "grad_norm": 0.19685466587543488, + "learning_rate": 0.001, + "loss": 2.6208, + "num_input_tokens_seen": 11180437056, + "step": 42650 + }, + { + "epoch": 0.20368007441239253, + "grad_norm": 0.20507460832595825, + "learning_rate": 0.001, + "loss": 2.6227, + "num_input_tokens_seen": 11193544256, + "step": 42700 + }, + { + "epoch": 0.20391857567048666, + "grad_norm": 0.20014505088329315, + "learning_rate": 0.001, + "loss": 2.6238, + "num_input_tokens_seen": 11206651456, + "step": 42750 + }, + { + "epoch": 0.2041570769285808, + "grad_norm": 0.1907282918691635, + "learning_rate": 0.001, + "loss": 2.6157, + "num_input_tokens_seen": 11219758656, + "step": 42800 + }, + { + "epoch": 0.20439557818667495, + "grad_norm": 0.18553833663463593, + "learning_rate": 0.001, + "loss": 2.6123, + "num_input_tokens_seen": 11232865856, + "step": 42850 + }, + { + "epoch": 0.20463407944476908, + "grad_norm": 0.20382866263389587, + "learning_rate": 0.001, + "loss": 2.6163, + "num_input_tokens_seen": 11245973056, + "step": 42900 + }, + { + "epoch": 0.2048725807028632, + "grad_norm": 0.18923860788345337, + "learning_rate": 0.001, + "loss": 2.5981, + "num_input_tokens_seen": 11259080256, + "step": 42950 + }, + { + "epoch": 0.20511108196095734, + "grad_norm": 0.19230851531028748, + "learning_rate": 0.001, + "loss": 2.618, + "num_input_tokens_seen": 11272187456, + "step": 43000 + }, + { + "epoch": 0.20511108196095734, + "eval_loss": 2.5047237873077393, + "eval_runtime": 51.2959, + "eval_samples_per_second": 97.474, + "eval_steps_per_second": 24.368, + "num_input_tokens_seen": 11272187456, + "step": 43000 + }, + { + "epoch": 0.20534958321905147, + "grad_norm": 0.22746357321739197, + "learning_rate": 0.001, + "loss": 2.6281, + "num_input_tokens_seen": 11285294656, + "step": 43050 + }, + { + "epoch": 0.2055880844771456, + "grad_norm": 0.21107150614261627, + "learning_rate": 0.001, + "loss": 2.6154, + "num_input_tokens_seen": 11298401856, + "step": 43100 + }, + { + "epoch": 0.20582658573523976, + "grad_norm": 0.18025045096874237, + "learning_rate": 0.001, + "loss": 2.6141, + "num_input_tokens_seen": 11311509056, + "step": 43150 + }, + { + "epoch": 0.2060650869933339, + "grad_norm": 0.2009642869234085, + "learning_rate": 0.001, + "loss": 2.6133, + "num_input_tokens_seen": 11324616256, + "step": 43200 + }, + { + "epoch": 0.20630358825142803, + "grad_norm": 0.1872788518667221, + "learning_rate": 0.001, + "loss": 2.6197, + "num_input_tokens_seen": 11337723456, + "step": 43250 + }, + { + "epoch": 0.20654208950952216, + "grad_norm": 0.216310054063797, + "learning_rate": 0.001, + "loss": 2.6353, + "num_input_tokens_seen": 11350830656, + "step": 43300 + }, + { + "epoch": 0.2067805907676163, + "grad_norm": 0.2705513536930084, + "learning_rate": 0.001, + "loss": 2.6333, + "num_input_tokens_seen": 11363937856, + "step": 43350 + }, + { + "epoch": 0.20701909202571045, + "grad_norm": 0.3040550649166107, + "learning_rate": 0.001, + "loss": 2.6094, + "num_input_tokens_seen": 11377045056, + "step": 43400 + }, + { + "epoch": 0.20725759328380458, + "grad_norm": 0.2075599879026413, + "learning_rate": 0.001, + "loss": 2.6225, + "num_input_tokens_seen": 11390152256, + "step": 43450 + }, + { + "epoch": 0.2074960945418987, + "grad_norm": 0.22293590009212494, + "learning_rate": 0.001, + "loss": 2.6271, + "num_input_tokens_seen": 11403259456, + "step": 43500 + }, + { + "epoch": 0.2074960945418987, + "eval_loss": 2.5097975730895996, + "eval_runtime": 51.7037, + "eval_samples_per_second": 96.705, + "eval_steps_per_second": 24.176, + "num_input_tokens_seen": 11403259456, + "step": 43500 + }, + { + "epoch": 0.20773459579999284, + "grad_norm": 0.21221335232257843, + "learning_rate": 0.001, + "loss": 2.618, + "num_input_tokens_seen": 11416366656, + "step": 43550 + }, + { + "epoch": 0.20797309705808698, + "grad_norm": 0.19894948601722717, + "learning_rate": 0.001, + "loss": 2.6305, + "num_input_tokens_seen": 11429473856, + "step": 43600 + }, + { + "epoch": 0.2082115983161811, + "grad_norm": 0.29371336102485657, + "learning_rate": 0.001, + "loss": 2.6211, + "num_input_tokens_seen": 11442581056, + "step": 43650 + }, + { + "epoch": 0.20845009957427527, + "grad_norm": 0.19441936910152435, + "learning_rate": 0.001, + "loss": 2.6355, + "num_input_tokens_seen": 11455688256, + "step": 43700 + }, + { + "epoch": 0.2086886008323694, + "grad_norm": 0.19868114590644836, + "learning_rate": 0.001, + "loss": 2.6206, + "num_input_tokens_seen": 11468795456, + "step": 43750 + }, + { + "epoch": 0.20892710209046353, + "grad_norm": 0.19971340894699097, + "learning_rate": 0.001, + "loss": 2.6124, + "num_input_tokens_seen": 11481902656, + "step": 43800 + }, + { + "epoch": 0.20916560334855766, + "grad_norm": 0.22261051833629608, + "learning_rate": 0.001, + "loss": 2.623, + "num_input_tokens_seen": 11495009856, + "step": 43850 + }, + { + "epoch": 0.2094041046066518, + "grad_norm": 0.20982281863689423, + "learning_rate": 0.001, + "loss": 2.6182, + "num_input_tokens_seen": 11508117056, + "step": 43900 + }, + { + "epoch": 0.20964260586474592, + "grad_norm": 0.2216535359621048, + "learning_rate": 0.001, + "loss": 2.6086, + "num_input_tokens_seen": 11521224256, + "step": 43950 + }, + { + "epoch": 0.20988110712284008, + "grad_norm": 0.19298988580703735, + "learning_rate": 0.001, + "loss": 2.6364, + "num_input_tokens_seen": 11534331456, + "step": 44000 + }, + { + "epoch": 0.20988110712284008, + "eval_loss": 2.5009121894836426, + "eval_runtime": 51.4356, + "eval_samples_per_second": 97.209, + "eval_steps_per_second": 24.302, + "num_input_tokens_seen": 11534331456, + "step": 44000 + }, + { + "epoch": 0.21011960838093421, + "grad_norm": 0.19737008213996887, + "learning_rate": 0.001, + "loss": 2.6272, + "num_input_tokens_seen": 11547438656, + "step": 44050 + }, + { + "epoch": 0.21035810963902835, + "grad_norm": 0.1984977424144745, + "learning_rate": 0.001, + "loss": 2.6417, + "num_input_tokens_seen": 11560545856, + "step": 44100 + }, + { + "epoch": 0.21059661089712248, + "grad_norm": 0.19575904309749603, + "learning_rate": 0.001, + "loss": 2.6277, + "num_input_tokens_seen": 11573653056, + "step": 44150 + }, + { + "epoch": 0.2108351121552166, + "grad_norm": 0.19875651597976685, + "learning_rate": 0.001, + "loss": 2.6362, + "num_input_tokens_seen": 11586760256, + "step": 44200 + }, + { + "epoch": 0.21107361341331077, + "grad_norm": 0.20936185121536255, + "learning_rate": 0.001, + "loss": 2.6217, + "num_input_tokens_seen": 11599867456, + "step": 44250 + }, + { + "epoch": 0.2113121146714049, + "grad_norm": 0.19474463164806366, + "learning_rate": 0.001, + "loss": 2.6235, + "num_input_tokens_seen": 11612974656, + "step": 44300 + }, + { + "epoch": 0.21155061592949903, + "grad_norm": 0.20833207666873932, + "learning_rate": 0.001, + "loss": 2.6, + "num_input_tokens_seen": 11626081856, + "step": 44350 + }, + { + "epoch": 0.21178911718759316, + "grad_norm": 0.19269512593746185, + "learning_rate": 0.001, + "loss": 2.6211, + "num_input_tokens_seen": 11639189056, + "step": 44400 + }, + { + "epoch": 0.2120276184456873, + "grad_norm": 0.21018226444721222, + "learning_rate": 0.001, + "loss": 2.6294, + "num_input_tokens_seen": 11652296256, + "step": 44450 + }, + { + "epoch": 0.21226611970378143, + "grad_norm": 0.19836543500423431, + "learning_rate": 0.001, + "loss": 2.6051, + "num_input_tokens_seen": 11665403456, + "step": 44500 + }, + { + "epoch": 0.21226611970378143, + "eval_loss": 2.499817132949829, + "eval_runtime": 50.9003, + "eval_samples_per_second": 98.231, + "eval_steps_per_second": 24.558, + "num_input_tokens_seen": 11665403456, + "step": 44500 + }, + { + "epoch": 0.21250462096187558, + "grad_norm": 0.18411967158317566, + "learning_rate": 0.001, + "loss": 2.6228, + "num_input_tokens_seen": 11678510656, + "step": 44550 + }, + { + "epoch": 0.21274312221996972, + "grad_norm": 0.19387467205524445, + "learning_rate": 0.001, + "loss": 2.5902, + "num_input_tokens_seen": 11691617856, + "step": 44600 + }, + { + "epoch": 0.21298162347806385, + "grad_norm": 0.22076952457427979, + "learning_rate": 0.001, + "loss": 2.613, + "num_input_tokens_seen": 11704725056, + "step": 44650 + }, + { + "epoch": 0.21322012473615798, + "grad_norm": 0.33861082792282104, + "learning_rate": 0.001, + "loss": 2.6142, + "num_input_tokens_seen": 11717832256, + "step": 44700 + }, + { + "epoch": 0.2134586259942521, + "grad_norm": 0.20097902417182922, + "learning_rate": 0.001, + "loss": 2.6549, + "num_input_tokens_seen": 11730939456, + "step": 44750 + }, + { + "epoch": 0.21369712725234627, + "grad_norm": 0.24534635245800018, + "learning_rate": 0.001, + "loss": 2.6293, + "num_input_tokens_seen": 11744046656, + "step": 44800 + }, + { + "epoch": 0.2139356285104404, + "grad_norm": 0.2439020723104477, + "learning_rate": 0.001, + "loss": 2.635, + "num_input_tokens_seen": 11757153856, + "step": 44850 + }, + { + "epoch": 0.21417412976853453, + "grad_norm": 0.24259154498577118, + "learning_rate": 0.001, + "loss": 2.6232, + "num_input_tokens_seen": 11770261056, + "step": 44900 + }, + { + "epoch": 0.21441263102662866, + "grad_norm": 0.23554636538028717, + "learning_rate": 0.001, + "loss": 2.6061, + "num_input_tokens_seen": 11783368256, + "step": 44950 + }, + { + "epoch": 0.2146511322847228, + "grad_norm": 0.20377275347709656, + "learning_rate": 0.001, + "loss": 2.6156, + "num_input_tokens_seen": 11796475456, + "step": 45000 + }, + { + "epoch": 0.2146511322847228, + "eval_loss": 2.503781318664551, + "eval_runtime": 51.1656, + "eval_samples_per_second": 97.722, + "eval_steps_per_second": 24.43, + "num_input_tokens_seen": 11796475456, + "step": 45000 + }, + { + "epoch": 0.21488963354281693, + "grad_norm": 0.226406991481781, + "learning_rate": 0.001, + "loss": 2.626, + "num_input_tokens_seen": 11809582656, + "step": 45050 + }, + { + "epoch": 0.21512813480091109, + "grad_norm": 0.20505741238594055, + "learning_rate": 0.001, + "loss": 2.6095, + "num_input_tokens_seen": 11822689856, + "step": 45100 + }, + { + "epoch": 0.21536663605900522, + "grad_norm": 0.2917146682739258, + "learning_rate": 0.001, + "loss": 2.6439, + "num_input_tokens_seen": 11835797056, + "step": 45150 + }, + { + "epoch": 0.21560513731709935, + "grad_norm": 0.24030283093452454, + "learning_rate": 0.001, + "loss": 2.6386, + "num_input_tokens_seen": 11848904256, + "step": 45200 + }, + { + "epoch": 0.21584363857519348, + "grad_norm": 0.1799454241991043, + "learning_rate": 0.001, + "loss": 2.6344, + "num_input_tokens_seen": 11862011456, + "step": 45250 + }, + { + "epoch": 0.2160821398332876, + "grad_norm": 0.2093718945980072, + "learning_rate": 0.001, + "loss": 2.6152, + "num_input_tokens_seen": 11875118656, + "step": 45300 + }, + { + "epoch": 0.21632064109138174, + "grad_norm": 0.19477079808712006, + "learning_rate": 0.001, + "loss": 2.622, + "num_input_tokens_seen": 11888225856, + "step": 45350 + }, + { + "epoch": 0.2165591423494759, + "grad_norm": 0.2764741778373718, + "learning_rate": 0.001, + "loss": 2.5951, + "num_input_tokens_seen": 11901333056, + "step": 45400 + }, + { + "epoch": 0.21679764360757003, + "grad_norm": 0.2127208709716797, + "learning_rate": 0.001, + "loss": 2.6231, + "num_input_tokens_seen": 11914440256, + "step": 45450 + }, + { + "epoch": 0.21703614486566417, + "grad_norm": 0.21089383959770203, + "learning_rate": 0.001, + "loss": 2.6099, + "num_input_tokens_seen": 11927547456, + "step": 45500 + }, + { + "epoch": 0.21703614486566417, + "eval_loss": 2.502464771270752, + "eval_runtime": 50.946, + "eval_samples_per_second": 98.143, + "eval_steps_per_second": 24.536, + "num_input_tokens_seen": 11927547456, + "step": 45500 + }, + { + "epoch": 0.2172746461237583, + "grad_norm": 0.19550016522407532, + "learning_rate": 0.001, + "loss": 2.6365, + "num_input_tokens_seen": 11940654656, + "step": 45550 + }, + { + "epoch": 0.21751314738185243, + "grad_norm": 0.18284358084201813, + "learning_rate": 0.001, + "loss": 2.6358, + "num_input_tokens_seen": 11953761856, + "step": 45600 + }, + { + "epoch": 0.2177516486399466, + "grad_norm": 0.21821847558021545, + "learning_rate": 0.001, + "loss": 2.607, + "num_input_tokens_seen": 11966869056, + "step": 45650 + }, + { + "epoch": 0.21799014989804072, + "grad_norm": 0.2195073515176773, + "learning_rate": 0.001, + "loss": 2.6195, + "num_input_tokens_seen": 11979976256, + "step": 45700 + }, + { + "epoch": 0.21822865115613485, + "grad_norm": 0.19679750502109528, + "learning_rate": 0.001, + "loss": 2.6259, + "num_input_tokens_seen": 11993083456, + "step": 45750 + }, + { + "epoch": 0.21846715241422898, + "grad_norm": 0.1985604166984558, + "learning_rate": 0.001, + "loss": 2.6224, + "num_input_tokens_seen": 12006190656, + "step": 45800 + }, + { + "epoch": 0.2187056536723231, + "grad_norm": 0.18398787081241608, + "learning_rate": 0.001, + "loss": 2.6215, + "num_input_tokens_seen": 12019297856, + "step": 45850 + }, + { + "epoch": 0.21894415493041725, + "grad_norm": 0.2306145578622818, + "learning_rate": 0.001, + "loss": 2.6346, + "num_input_tokens_seen": 12032405056, + "step": 45900 + }, + { + "epoch": 0.2191826561885114, + "grad_norm": 0.21335257589817047, + "learning_rate": 0.001, + "loss": 2.6232, + "num_input_tokens_seen": 12045512256, + "step": 45950 + }, + { + "epoch": 0.21942115744660554, + "grad_norm": 0.22988814115524292, + "learning_rate": 0.001, + "loss": 2.6132, + "num_input_tokens_seen": 12058619456, + "step": 46000 + }, + { + "epoch": 0.21942115744660554, + "eval_loss": 2.499041795730591, + "eval_runtime": 50.6868, + "eval_samples_per_second": 98.645, + "eval_steps_per_second": 24.661, + "num_input_tokens_seen": 12058619456, + "step": 46000 + }, + { + "epoch": 0.21965965870469967, + "grad_norm": 0.19492709636688232, + "learning_rate": 0.001, + "loss": 2.6196, + "num_input_tokens_seen": 12071726656, + "step": 46050 + }, + { + "epoch": 0.2198981599627938, + "grad_norm": 0.19643568992614746, + "learning_rate": 0.001, + "loss": 2.6108, + "num_input_tokens_seen": 12084833856, + "step": 46100 + }, + { + "epoch": 0.22013666122088793, + "grad_norm": 0.18720099329948425, + "learning_rate": 0.001, + "loss": 2.6181, + "num_input_tokens_seen": 12097941056, + "step": 46150 + }, + { + "epoch": 0.2203751624789821, + "grad_norm": 0.1929876208305359, + "learning_rate": 0.001, + "loss": 2.6152, + "num_input_tokens_seen": 12111048256, + "step": 46200 + }, + { + "epoch": 0.22061366373707622, + "grad_norm": 0.19732603430747986, + "learning_rate": 0.001, + "loss": 2.6267, + "num_input_tokens_seen": 12124155456, + "step": 46250 + }, + { + "epoch": 0.22085216499517035, + "grad_norm": 0.1964132934808731, + "learning_rate": 0.001, + "loss": 2.605, + "num_input_tokens_seen": 12137262656, + "step": 46300 + }, + { + "epoch": 0.22109066625326448, + "grad_norm": 0.1927288919687271, + "learning_rate": 0.001, + "loss": 2.6178, + "num_input_tokens_seen": 12150369856, + "step": 46350 + }, + { + "epoch": 0.22132916751135862, + "grad_norm": 0.17873398959636688, + "learning_rate": 0.001, + "loss": 2.6033, + "num_input_tokens_seen": 12163477056, + "step": 46400 + }, + { + "epoch": 0.22156766876945275, + "grad_norm": 0.24716190993785858, + "learning_rate": 0.001, + "loss": 2.6141, + "num_input_tokens_seen": 12176584256, + "step": 46450 + }, + { + "epoch": 0.2218061700275469, + "grad_norm": 0.2021339386701584, + "learning_rate": 0.001, + "loss": 2.6259, + "num_input_tokens_seen": 12189691456, + "step": 46500 + }, + { + "epoch": 0.2218061700275469, + "eval_loss": 2.4975087642669678, + "eval_runtime": 50.8921, + "eval_samples_per_second": 98.247, + "eval_steps_per_second": 24.562, + "num_input_tokens_seen": 12189691456, + "step": 46500 + }, + { + "epoch": 0.22204467128564104, + "grad_norm": 0.20796166360378265, + "learning_rate": 0.001, + "loss": 2.6211, + "num_input_tokens_seen": 12202798656, + "step": 46550 + }, + { + "epoch": 0.22228317254373517, + "grad_norm": 0.20472556352615356, + "learning_rate": 0.001, + "loss": 2.6123, + "num_input_tokens_seen": 12215905856, + "step": 46600 + }, + { + "epoch": 0.2225216738018293, + "grad_norm": 0.20017485320568085, + "learning_rate": 0.001, + "loss": 2.6037, + "num_input_tokens_seen": 12229013056, + "step": 46650 + }, + { + "epoch": 0.22276017505992343, + "grad_norm": 0.2037762850522995, + "learning_rate": 0.001, + "loss": 2.6155, + "num_input_tokens_seen": 12242120256, + "step": 46700 + }, + { + "epoch": 0.2229986763180176, + "grad_norm": 0.19346804916858673, + "learning_rate": 0.001, + "loss": 2.601, + "num_input_tokens_seen": 12255227456, + "step": 46750 + }, + { + "epoch": 0.22323717757611172, + "grad_norm": 0.18640096485614777, + "learning_rate": 0.001, + "loss": 2.6168, + "num_input_tokens_seen": 12268334656, + "step": 46800 + }, + { + "epoch": 0.22347567883420585, + "grad_norm": 0.20295055210590363, + "learning_rate": 0.001, + "loss": 2.6221, + "num_input_tokens_seen": 12281441856, + "step": 46850 + }, + { + "epoch": 0.22371418009229999, + "grad_norm": 0.20705671608448029, + "learning_rate": 0.001, + "loss": 2.6202, + "num_input_tokens_seen": 12294549056, + "step": 46900 + }, + { + "epoch": 0.22395268135039412, + "grad_norm": 0.18724282085895538, + "learning_rate": 0.001, + "loss": 2.6061, + "num_input_tokens_seen": 12307656256, + "step": 46950 + }, + { + "epoch": 0.22419118260848825, + "grad_norm": 0.18210910260677338, + "learning_rate": 0.001, + "loss": 2.6045, + "num_input_tokens_seen": 12320763456, + "step": 47000 + }, + { + "epoch": 0.22419118260848825, + "eval_loss": 2.497344493865967, + "eval_runtime": 51.17, + "eval_samples_per_second": 97.713, + "eval_steps_per_second": 24.428, + "num_input_tokens_seen": 12320763456, + "step": 47000 + }, + { + "epoch": 0.2244296838665824, + "grad_norm": 0.18894509971141815, + "learning_rate": 0.001, + "loss": 2.6069, + "num_input_tokens_seen": 12333870656, + "step": 47050 + }, + { + "epoch": 0.22466818512467654, + "grad_norm": 0.23441652953624725, + "learning_rate": 0.001, + "loss": 2.6092, + "num_input_tokens_seen": 12346977856, + "step": 47100 + }, + { + "epoch": 0.22490668638277067, + "grad_norm": 0.20195326209068298, + "learning_rate": 0.001, + "loss": 2.6135, + "num_input_tokens_seen": 12360085056, + "step": 47150 + }, + { + "epoch": 0.2251451876408648, + "grad_norm": 0.22025838494300842, + "learning_rate": 0.001, + "loss": 2.6034, + "num_input_tokens_seen": 12373192256, + "step": 47200 + }, + { + "epoch": 0.22538368889895893, + "grad_norm": 0.19111979007720947, + "learning_rate": 0.001, + "loss": 2.6151, + "num_input_tokens_seen": 12386299456, + "step": 47250 + }, + { + "epoch": 0.22562219015705307, + "grad_norm": 0.2010103464126587, + "learning_rate": 0.001, + "loss": 2.6031, + "num_input_tokens_seen": 12399406656, + "step": 47300 + }, + { + "epoch": 0.22586069141514722, + "grad_norm": 0.21569807827472687, + "learning_rate": 0.001, + "loss": 2.6012, + "num_input_tokens_seen": 12412513856, + "step": 47350 + }, + { + "epoch": 0.22609919267324136, + "grad_norm": 0.18600653111934662, + "learning_rate": 0.001, + "loss": 2.6087, + "num_input_tokens_seen": 12425621056, + "step": 47400 + }, + { + "epoch": 0.2263376939313355, + "grad_norm": 0.19476164877414703, + "learning_rate": 0.001, + "loss": 2.6179, + "num_input_tokens_seen": 12438728256, + "step": 47450 + }, + { + "epoch": 0.22657619518942962, + "grad_norm": 0.19705821573734283, + "learning_rate": 0.001, + "loss": 2.5983, + "num_input_tokens_seen": 12451835456, + "step": 47500 + }, + { + "epoch": 0.22657619518942962, + "eval_loss": 2.495936393737793, + "eval_runtime": 51.8116, + "eval_samples_per_second": 96.504, + "eval_steps_per_second": 24.126, + "num_input_tokens_seen": 12451835456, + "step": 47500 + }, + { + "epoch": 0.22681469644752375, + "grad_norm": 0.23161695897579193, + "learning_rate": 0.001, + "loss": 2.5974, + "num_input_tokens_seen": 12464942656, + "step": 47550 + }, + { + "epoch": 0.2270531977056179, + "grad_norm": 0.2022540420293808, + "learning_rate": 0.001, + "loss": 2.6251, + "num_input_tokens_seen": 12478049856, + "step": 47600 + }, + { + "epoch": 0.22729169896371204, + "grad_norm": 1.0341856479644775, + "learning_rate": 0.001, + "loss": 2.5831, + "num_input_tokens_seen": 12491157056, + "step": 47650 + }, + { + "epoch": 0.22753020022180617, + "grad_norm": 0.3812394440174103, + "learning_rate": 0.001, + "loss": 2.6407, + "num_input_tokens_seen": 12504264256, + "step": 47700 + }, + { + "epoch": 0.2277687014799003, + "grad_norm": 0.27030590176582336, + "learning_rate": 0.001, + "loss": 2.6327, + "num_input_tokens_seen": 12517371456, + "step": 47750 + }, + { + "epoch": 0.22800720273799444, + "grad_norm": 1.3918724060058594, + "learning_rate": 0.001, + "loss": 2.6344, + "num_input_tokens_seen": 12530478656, + "step": 47800 + }, + { + "epoch": 0.22824570399608857, + "grad_norm": 0.22610582411289215, + "learning_rate": 0.001, + "loss": 2.6444, + "num_input_tokens_seen": 12543585856, + "step": 47850 + }, + { + "epoch": 0.22848420525418273, + "grad_norm": 0.21421480178833008, + "learning_rate": 0.001, + "loss": 2.6169, + "num_input_tokens_seen": 12556693056, + "step": 47900 + }, + { + "epoch": 0.22872270651227686, + "grad_norm": 0.20389467477798462, + "learning_rate": 0.001, + "loss": 2.6158, + "num_input_tokens_seen": 12569800256, + "step": 47950 + }, + { + "epoch": 0.228961207770371, + "grad_norm": 0.2265746295452118, + "learning_rate": 0.001, + "loss": 2.6101, + "num_input_tokens_seen": 12582907456, + "step": 48000 + }, + { + "epoch": 0.228961207770371, + "eval_loss": 2.4971351623535156, + "eval_runtime": 54.0453, + "eval_samples_per_second": 92.515, + "eval_steps_per_second": 23.129, + "num_input_tokens_seen": 12582907456, + "step": 48000 + }, + { + "epoch": 0.22919970902846512, + "grad_norm": 0.20247948169708252, + "learning_rate": 0.001, + "loss": 2.6122, + "num_input_tokens_seen": 12596014656, + "step": 48050 + }, + { + "epoch": 0.22943821028655925, + "grad_norm": 0.20237554609775543, + "learning_rate": 0.001, + "loss": 2.6235, + "num_input_tokens_seen": 12609121856, + "step": 48100 + }, + { + "epoch": 0.2296767115446534, + "grad_norm": 0.19862660765647888, + "learning_rate": 0.001, + "loss": 2.6264, + "num_input_tokens_seen": 12622229056, + "step": 48150 + }, + { + "epoch": 0.22991521280274754, + "grad_norm": 0.20839153230190277, + "learning_rate": 0.001, + "loss": 2.5915, + "num_input_tokens_seen": 12635336256, + "step": 48200 + }, + { + "epoch": 0.23015371406084167, + "grad_norm": 0.19385166466236115, + "learning_rate": 0.001, + "loss": 2.5979, + "num_input_tokens_seen": 12648443456, + "step": 48250 + }, + { + "epoch": 0.2303922153189358, + "grad_norm": 0.197597935795784, + "learning_rate": 0.001, + "loss": 2.6093, + "num_input_tokens_seen": 12661550656, + "step": 48300 + }, + { + "epoch": 0.23063071657702994, + "grad_norm": 0.20289985835552216, + "learning_rate": 0.001, + "loss": 2.6039, + "num_input_tokens_seen": 12674657856, + "step": 48350 + }, + { + "epoch": 0.23086921783512407, + "grad_norm": 0.1986515372991562, + "learning_rate": 0.001, + "loss": 2.6048, + "num_input_tokens_seen": 12687765056, + "step": 48400 + }, + { + "epoch": 0.23110771909321823, + "grad_norm": 0.19720982015132904, + "learning_rate": 0.001, + "loss": 2.6171, + "num_input_tokens_seen": 12700872256, + "step": 48450 + }, + { + "epoch": 0.23134622035131236, + "grad_norm": 0.24635523557662964, + "learning_rate": 0.001, + "loss": 2.6242, + "num_input_tokens_seen": 12713979456, + "step": 48500 + }, + { + "epoch": 0.23134622035131236, + "eval_loss": 2.495468854904175, + "eval_runtime": 53.4259, + "eval_samples_per_second": 93.588, + "eval_steps_per_second": 23.397, + "num_input_tokens_seen": 12713979456, + "step": 48500 + }, + { + "epoch": 0.2315847216094065, + "grad_norm": 0.5883195996284485, + "learning_rate": 0.001, + "loss": 2.6399, + "num_input_tokens_seen": 12727086656, + "step": 48550 + }, + { + "epoch": 0.23182322286750062, + "grad_norm": 0.20890024304389954, + "learning_rate": 0.001, + "loss": 2.6325, + "num_input_tokens_seen": 12740193856, + "step": 48600 + }, + { + "epoch": 0.23206172412559475, + "grad_norm": 0.21251678466796875, + "learning_rate": 0.001, + "loss": 2.6233, + "num_input_tokens_seen": 12753301056, + "step": 48650 + }, + { + "epoch": 0.23230022538368889, + "grad_norm": 0.20996986329555511, + "learning_rate": 0.001, + "loss": 2.6174, + "num_input_tokens_seen": 12766408256, + "step": 48700 + }, + { + "epoch": 0.23253872664178304, + "grad_norm": 0.23039382696151733, + "learning_rate": 0.001, + "loss": 2.6305, + "num_input_tokens_seen": 12779515456, + "step": 48750 + }, + { + "epoch": 0.23277722789987718, + "grad_norm": 0.23922136425971985, + "learning_rate": 0.001, + "loss": 2.6108, + "num_input_tokens_seen": 12792622656, + "step": 48800 + }, + { + "epoch": 0.2330157291579713, + "grad_norm": 0.22746366262435913, + "learning_rate": 0.001, + "loss": 2.6219, + "num_input_tokens_seen": 12805729856, + "step": 48850 + }, + { + "epoch": 0.23325423041606544, + "grad_norm": 0.22131897509098053, + "learning_rate": 0.001, + "loss": 2.6205, + "num_input_tokens_seen": 12818837056, + "step": 48900 + }, + { + "epoch": 0.23349273167415957, + "grad_norm": 0.25431814789772034, + "learning_rate": 0.001, + "loss": 2.6252, + "num_input_tokens_seen": 12831944256, + "step": 48950 + }, + { + "epoch": 0.23373123293225373, + "grad_norm": 0.2622738778591156, + "learning_rate": 0.001, + "loss": 2.6288, + "num_input_tokens_seen": 12845051456, + "step": 49000 + }, + { + "epoch": 0.23373123293225373, + "eval_loss": 2.498055934906006, + "eval_runtime": 53.8861, + "eval_samples_per_second": 92.788, + "eval_steps_per_second": 23.197, + "num_input_tokens_seen": 12845051456, + "step": 49000 + }, + { + "epoch": 0.23396973419034786, + "grad_norm": 0.209337517619133, + "learning_rate": 0.001, + "loss": 2.6348, + "num_input_tokens_seen": 12858158656, + "step": 49050 + }, + { + "epoch": 0.234208235448442, + "grad_norm": 0.1974038928747177, + "learning_rate": 0.001, + "loss": 2.6158, + "num_input_tokens_seen": 12871265856, + "step": 49100 + }, + { + "epoch": 0.23444673670653612, + "grad_norm": 0.28099164366722107, + "learning_rate": 0.001, + "loss": 2.6101, + "num_input_tokens_seen": 12884373056, + "step": 49150 + }, + { + "epoch": 0.23468523796463026, + "grad_norm": 0.2172873318195343, + "learning_rate": 0.001, + "loss": 2.596, + "num_input_tokens_seen": 12897480256, + "step": 49200 + }, + { + "epoch": 0.2349237392227244, + "grad_norm": 0.2120896875858307, + "learning_rate": 0.001, + "loss": 2.5994, + "num_input_tokens_seen": 12910587456, + "step": 49250 + }, + { + "epoch": 0.23516224048081855, + "grad_norm": 0.20109935104846954, + "learning_rate": 0.001, + "loss": 2.6101, + "num_input_tokens_seen": 12923694656, + "step": 49300 + }, + { + "epoch": 0.23540074173891268, + "grad_norm": 0.20735585689544678, + "learning_rate": 0.001, + "loss": 2.6142, + "num_input_tokens_seen": 12936801856, + "step": 49350 + }, + { + "epoch": 0.2356392429970068, + "grad_norm": 0.21295137703418732, + "learning_rate": 0.001, + "loss": 2.6226, + "num_input_tokens_seen": 12949909056, + "step": 49400 + }, + { + "epoch": 0.23587774425510094, + "grad_norm": 0.20560845732688904, + "learning_rate": 0.001, + "loss": 2.6027, + "num_input_tokens_seen": 12963016256, + "step": 49450 + }, + { + "epoch": 0.23611624551319507, + "grad_norm": 0.33747321367263794, + "learning_rate": 0.001, + "loss": 2.6231, + "num_input_tokens_seen": 12976123456, + "step": 49500 + }, + { + "epoch": 0.23611624551319507, + "eval_loss": 2.5008058547973633, + "eval_runtime": 54.2104, + "eval_samples_per_second": 92.233, + "eval_steps_per_second": 23.058, + "num_input_tokens_seen": 12976123456, + "step": 49500 + }, + { + "epoch": 0.23635474677128923, + "grad_norm": 0.24593485891819, + "learning_rate": 0.001, + "loss": 2.6336, + "num_input_tokens_seen": 12989230656, + "step": 49550 + }, + { + "epoch": 0.23659324802938336, + "grad_norm": 0.25253933668136597, + "learning_rate": 0.001, + "loss": 2.643, + "num_input_tokens_seen": 13002337856, + "step": 49600 + }, + { + "epoch": 0.2368317492874775, + "grad_norm": 0.24231670796871185, + "learning_rate": 0.001, + "loss": 2.6074, + "num_input_tokens_seen": 13015445056, + "step": 49650 + }, + { + "epoch": 0.23707025054557163, + "grad_norm": 0.2178962677717209, + "learning_rate": 0.001, + "loss": 2.6184, + "num_input_tokens_seen": 13028552256, + "step": 49700 + }, + { + "epoch": 0.23730875180366576, + "grad_norm": 0.2651260793209076, + "learning_rate": 0.001, + "loss": 2.6335, + "num_input_tokens_seen": 13041659456, + "step": 49750 + }, + { + "epoch": 0.2375472530617599, + "grad_norm": 0.1909639537334442, + "learning_rate": 0.001, + "loss": 2.61, + "num_input_tokens_seen": 13054766656, + "step": 49800 + }, + { + "epoch": 0.23778575431985405, + "grad_norm": 0.21107855439186096, + "learning_rate": 0.001, + "loss": 2.6333, + "num_input_tokens_seen": 13067873856, + "step": 49850 + }, + { + "epoch": 0.23802425557794818, + "grad_norm": 0.19366736710071564, + "learning_rate": 0.001, + "loss": 2.6068, + "num_input_tokens_seen": 13080981056, + "step": 49900 + }, + { + "epoch": 0.2382627568360423, + "grad_norm": 0.2851523458957672, + "learning_rate": 0.001, + "loss": 2.6183, + "num_input_tokens_seen": 13094088256, + "step": 49950 + }, + { + "epoch": 0.23850125809413644, + "grad_norm": 0.23617912828922272, + "learning_rate": 0.001, + "loss": 2.617, + "num_input_tokens_seen": 13107195456, + "step": 50000 + }, + { + "epoch": 0.23850125809413644, + "eval_loss": 2.497406005859375, + "eval_runtime": 53.6538, + "eval_samples_per_second": 93.19, + "eval_steps_per_second": 23.298, + "num_input_tokens_seen": 13107195456, + "step": 50000 + }, + { + "epoch": 0.23873975935223057, + "grad_norm": 0.5069316029548645, + "learning_rate": 0.001, + "loss": 2.6591, + "num_input_tokens_seen": 13120302656, + "step": 50050 + }, + { + "epoch": 0.23897826061032473, + "grad_norm": 0.21306034922599792, + "learning_rate": 0.001, + "loss": 2.6455, + "num_input_tokens_seen": 13133409856, + "step": 50100 + }, + { + "epoch": 0.23921676186841886, + "grad_norm": 0.2045888900756836, + "learning_rate": 0.001, + "loss": 2.6227, + "num_input_tokens_seen": 13146517056, + "step": 50150 + }, + { + "epoch": 0.239455263126513, + "grad_norm": 0.2335623949766159, + "learning_rate": 0.001, + "loss": 2.6097, + "num_input_tokens_seen": 13159624256, + "step": 50200 + }, + { + "epoch": 0.23969376438460713, + "grad_norm": 0.19884036481380463, + "learning_rate": 0.001, + "loss": 2.6189, + "num_input_tokens_seen": 13172731456, + "step": 50250 + }, + { + "epoch": 0.23993226564270126, + "grad_norm": 0.21080589294433594, + "learning_rate": 0.001, + "loss": 2.6057, + "num_input_tokens_seen": 13185838656, + "step": 50300 + }, + { + "epoch": 0.2401707669007954, + "grad_norm": 0.21613669395446777, + "learning_rate": 0.001, + "loss": 2.6045, + "num_input_tokens_seen": 13198945856, + "step": 50350 + }, + { + "epoch": 0.24040926815888955, + "grad_norm": 0.2029023915529251, + "learning_rate": 0.001, + "loss": 2.6127, + "num_input_tokens_seen": 13212053056, + "step": 50400 + }, + { + "epoch": 0.24064776941698368, + "grad_norm": 0.2275777906179428, + "learning_rate": 0.001, + "loss": 2.6149, + "num_input_tokens_seen": 13225160256, + "step": 50450 + }, + { + "epoch": 0.2408862706750778, + "grad_norm": 0.3332397937774658, + "learning_rate": 0.001, + "loss": 2.6013, + "num_input_tokens_seen": 13238267456, + "step": 50500 + }, + { + "epoch": 0.2408862706750778, + "eval_loss": 2.5022270679473877, + "eval_runtime": 53.5942, + "eval_samples_per_second": 93.294, + "eval_steps_per_second": 23.323, + "num_input_tokens_seen": 13238267456, + "step": 50500 + }, + { + "epoch": 0.24112477193317194, + "grad_norm": 0.2197851538658142, + "learning_rate": 0.001, + "loss": 2.6326, + "num_input_tokens_seen": 13251374656, + "step": 50550 + }, + { + "epoch": 0.24136327319126608, + "grad_norm": 0.2201780080795288, + "learning_rate": 0.001, + "loss": 2.6265, + "num_input_tokens_seen": 13264481856, + "step": 50600 + }, + { + "epoch": 0.2416017744493602, + "grad_norm": 0.2196362316608429, + "learning_rate": 0.001, + "loss": 2.6272, + "num_input_tokens_seen": 13277589056, + "step": 50650 + }, + { + "epoch": 0.24184027570745437, + "grad_norm": 0.2234160453081131, + "learning_rate": 0.001, + "loss": 2.6178, + "num_input_tokens_seen": 13290696256, + "step": 50700 + }, + { + "epoch": 0.2420787769655485, + "grad_norm": 0.24019016325473785, + "learning_rate": 0.001, + "loss": 2.6142, + "num_input_tokens_seen": 13303803456, + "step": 50750 + }, + { + "epoch": 0.24231727822364263, + "grad_norm": 0.21481236815452576, + "learning_rate": 0.001, + "loss": 2.6149, + "num_input_tokens_seen": 13316910656, + "step": 50800 + }, + { + "epoch": 0.24255577948173676, + "grad_norm": 0.20477178692817688, + "learning_rate": 0.001, + "loss": 2.5977, + "num_input_tokens_seen": 13330017856, + "step": 50850 + }, + { + "epoch": 0.2427942807398309, + "grad_norm": 0.20742499828338623, + "learning_rate": 0.001, + "loss": 2.6153, + "num_input_tokens_seen": 13343125056, + "step": 50900 + }, + { + "epoch": 0.24303278199792505, + "grad_norm": 0.21933062374591827, + "learning_rate": 0.001, + "loss": 2.5966, + "num_input_tokens_seen": 13356232256, + "step": 50950 + }, + { + "epoch": 0.24327128325601918, + "grad_norm": 0.3282420337200165, + "learning_rate": 0.001, + "loss": 2.6063, + "num_input_tokens_seen": 13369339456, + "step": 51000 + }, + { + "epoch": 0.24327128325601918, + "eval_loss": 2.4981296062469482, + "eval_runtime": 53.5536, + "eval_samples_per_second": 93.364, + "eval_steps_per_second": 23.341, + "num_input_tokens_seen": 13369339456, + "step": 51000 + }, + { + "epoch": 0.24350978451411331, + "grad_norm": 0.20502831041812897, + "learning_rate": 0.001, + "loss": 2.6059, + "num_input_tokens_seen": 13382446656, + "step": 51050 + }, + { + "epoch": 0.24374828577220745, + "grad_norm": 0.20750559866428375, + "learning_rate": 0.001, + "loss": 2.6056, + "num_input_tokens_seen": 13395553856, + "step": 51100 + }, + { + "epoch": 0.24398678703030158, + "grad_norm": 0.19882823526859283, + "learning_rate": 0.001, + "loss": 2.5983, + "num_input_tokens_seen": 13408661056, + "step": 51150 + }, + { + "epoch": 0.2442252882883957, + "grad_norm": 0.20900660753250122, + "learning_rate": 0.001, + "loss": 2.6087, + "num_input_tokens_seen": 13421768256, + "step": 51200 + }, + { + "epoch": 0.24446378954648987, + "grad_norm": 0.21428415179252625, + "learning_rate": 0.001, + "loss": 2.5901, + "num_input_tokens_seen": 13434875456, + "step": 51250 + }, + { + "epoch": 0.244702290804584, + "grad_norm": 0.19987250864505768, + "learning_rate": 0.001, + "loss": 2.5982, + "num_input_tokens_seen": 13447982656, + "step": 51300 + }, + { + "epoch": 0.24494079206267813, + "grad_norm": 0.2045862078666687, + "learning_rate": 0.001, + "loss": 2.6058, + "num_input_tokens_seen": 13461089856, + "step": 51350 + }, + { + "epoch": 0.24517929332077226, + "grad_norm": 0.22261273860931396, + "learning_rate": 0.001, + "loss": 2.5972, + "num_input_tokens_seen": 13474197056, + "step": 51400 + }, + { + "epoch": 0.2454177945788664, + "grad_norm": 0.20395706593990326, + "learning_rate": 0.001, + "loss": 2.6064, + "num_input_tokens_seen": 13487304256, + "step": 51450 + }, + { + "epoch": 0.24565629583696055, + "grad_norm": 0.21490858495235443, + "learning_rate": 0.001, + "loss": 2.5922, + "num_input_tokens_seen": 13500411456, + "step": 51500 + }, + { + "epoch": 0.24565629583696055, + "eval_loss": 2.488300085067749, + "eval_runtime": 53.7972, + "eval_samples_per_second": 92.942, + "eval_steps_per_second": 23.235, + "num_input_tokens_seen": 13500411456, + "step": 51500 + }, + { + "epoch": 0.24589479709505468, + "grad_norm": 0.2039102464914322, + "learning_rate": 0.001, + "loss": 2.5894, + "num_input_tokens_seen": 13513518656, + "step": 51550 + }, + { + "epoch": 0.24613329835314882, + "grad_norm": 0.21426360309123993, + "learning_rate": 0.001, + "loss": 2.6089, + "num_input_tokens_seen": 13526625856, + "step": 51600 + }, + { + "epoch": 0.24637179961124295, + "grad_norm": 0.194682314991951, + "learning_rate": 0.001, + "loss": 2.5932, + "num_input_tokens_seen": 13539733056, + "step": 51650 + }, + { + "epoch": 0.24661030086933708, + "grad_norm": 0.1901472508907318, + "learning_rate": 0.001, + "loss": 2.6031, + "num_input_tokens_seen": 13552840256, + "step": 51700 + }, + { + "epoch": 0.2468488021274312, + "grad_norm": 0.20517823100090027, + "learning_rate": 0.001, + "loss": 2.5978, + "num_input_tokens_seen": 13565947456, + "step": 51750 + }, + { + "epoch": 0.24708730338552537, + "grad_norm": 0.23713302612304688, + "learning_rate": 0.001, + "loss": 2.6061, + "num_input_tokens_seen": 13579054656, + "step": 51800 + }, + { + "epoch": 0.2473258046436195, + "grad_norm": 0.2431441992521286, + "learning_rate": 0.001, + "loss": 2.6062, + "num_input_tokens_seen": 13592161856, + "step": 51850 + }, + { + "epoch": 0.24756430590171363, + "grad_norm": 0.20358557999134064, + "learning_rate": 0.001, + "loss": 2.6161, + "num_input_tokens_seen": 13605269056, + "step": 51900 + }, + { + "epoch": 0.24780280715980776, + "grad_norm": 0.21245016157627106, + "learning_rate": 0.001, + "loss": 2.6166, + "num_input_tokens_seen": 13618376256, + "step": 51950 + }, + { + "epoch": 0.2480413084179019, + "grad_norm": 0.24295999109745026, + "learning_rate": 0.001, + "loss": 2.6139, + "num_input_tokens_seen": 13631483456, + "step": 52000 + }, + { + "epoch": 0.2480413084179019, + "eval_loss": 2.4932186603546143, + "eval_runtime": 53.6797, + "eval_samples_per_second": 93.145, + "eval_steps_per_second": 23.286, + "num_input_tokens_seen": 13631483456, + "step": 52000 + }, + { + "epoch": 0.24827980967599603, + "grad_norm": 0.22135989367961884, + "learning_rate": 0.001, + "loss": 2.5947, + "num_input_tokens_seen": 13644590656, + "step": 52050 + }, + { + "epoch": 0.2485183109340902, + "grad_norm": 0.3656958341598511, + "learning_rate": 0.001, + "loss": 2.6263, + "num_input_tokens_seen": 13657697856, + "step": 52100 + }, + { + "epoch": 0.24875681219218432, + "grad_norm": 0.2960817813873291, + "learning_rate": 0.001, + "loss": 2.6086, + "num_input_tokens_seen": 13670805056, + "step": 52150 + }, + { + "epoch": 0.24899531345027845, + "grad_norm": 0.2150612622499466, + "learning_rate": 0.001, + "loss": 2.6314, + "num_input_tokens_seen": 13683912256, + "step": 52200 + }, + { + "epoch": 0.24923381470837258, + "grad_norm": 0.23089592158794403, + "learning_rate": 0.001, + "loss": 2.6072, + "num_input_tokens_seen": 13697019456, + "step": 52250 + }, + { + "epoch": 0.2494723159664667, + "grad_norm": 0.19151148200035095, + "learning_rate": 0.001, + "loss": 2.6177, + "num_input_tokens_seen": 13710126656, + "step": 52300 + }, + { + "epoch": 0.24971081722456087, + "grad_norm": 0.47803962230682373, + "learning_rate": 0.001, + "loss": 2.6018, + "num_input_tokens_seen": 13723233856, + "step": 52350 + }, + { + "epoch": 0.249949318482655, + "grad_norm": 0.2346401810646057, + "learning_rate": 0.001, + "loss": 2.6068, + "num_input_tokens_seen": 13736341056, + "step": 52400 + }, + { + "epoch": 0.2501878197407491, + "grad_norm": 0.21514126658439636, + "learning_rate": 0.001, + "loss": 2.6186, + "num_input_tokens_seen": 13749448256, + "step": 52450 + }, + { + "epoch": 0.25042632099884327, + "grad_norm": 0.20311090350151062, + "learning_rate": 0.001, + "loss": 2.595, + "num_input_tokens_seen": 13762555456, + "step": 52500 + }, + { + "epoch": 0.25042632099884327, + "eval_loss": 2.490104913711548, + "eval_runtime": 53.8709, + "eval_samples_per_second": 92.814, + "eval_steps_per_second": 23.204, + "num_input_tokens_seen": 13762555456, + "step": 52500 + }, + { + "epoch": 0.2506648222569374, + "grad_norm": 0.2120152711868286, + "learning_rate": 0.001, + "loss": 2.6027, + "num_input_tokens_seen": 13775662656, + "step": 52550 + }, + { + "epoch": 0.25090332351503153, + "grad_norm": 0.3172776401042938, + "learning_rate": 0.001, + "loss": 2.6089, + "num_input_tokens_seen": 13788769856, + "step": 52600 + }, + { + "epoch": 0.2511418247731257, + "grad_norm": 0.24425551295280457, + "learning_rate": 0.001, + "loss": 2.611, + "num_input_tokens_seen": 13801877056, + "step": 52650 + }, + { + "epoch": 0.2513803260312198, + "grad_norm": 0.24523352086544037, + "learning_rate": 0.001, + "loss": 2.6066, + "num_input_tokens_seen": 13814984256, + "step": 52700 + }, + { + "epoch": 0.25161882728931395, + "grad_norm": 0.21642154455184937, + "learning_rate": 0.001, + "loss": 2.6069, + "num_input_tokens_seen": 13828091456, + "step": 52750 + }, + { + "epoch": 0.2518573285474081, + "grad_norm": 0.21867206692695618, + "learning_rate": 0.001, + "loss": 2.6163, + "num_input_tokens_seen": 13841198656, + "step": 52800 + }, + { + "epoch": 0.2520958298055022, + "grad_norm": 0.2124466449022293, + "learning_rate": 0.001, + "loss": 2.6045, + "num_input_tokens_seen": 13854305856, + "step": 52850 + }, + { + "epoch": 0.2523343310635964, + "grad_norm": 0.20598042011260986, + "learning_rate": 0.001, + "loss": 2.5881, + "num_input_tokens_seen": 13867413056, + "step": 52900 + }, + { + "epoch": 0.2525728323216905, + "grad_norm": 0.1949404776096344, + "learning_rate": 0.001, + "loss": 2.6051, + "num_input_tokens_seen": 13880520256, + "step": 52950 + }, + { + "epoch": 0.25281133357978464, + "grad_norm": 0.18877142667770386, + "learning_rate": 0.001, + "loss": 2.608, + "num_input_tokens_seen": 13893627456, + "step": 53000 + }, + { + "epoch": 0.25281133357978464, + "eval_loss": 2.485513210296631, + "eval_runtime": 53.7202, + "eval_samples_per_second": 93.075, + "eval_steps_per_second": 23.269, + "num_input_tokens_seen": 13893627456, + "step": 53000 + }, + { + "epoch": 0.2530498348378788, + "grad_norm": 0.20486177504062653, + "learning_rate": 0.001, + "loss": 2.5977, + "num_input_tokens_seen": 13906734656, + "step": 53050 + }, + { + "epoch": 0.2532883360959729, + "grad_norm": 0.18098385632038116, + "learning_rate": 0.001, + "loss": 2.5931, + "num_input_tokens_seen": 13919841856, + "step": 53100 + }, + { + "epoch": 0.25352683735406706, + "grad_norm": 0.1933833658695221, + "learning_rate": 0.001, + "loss": 2.6058, + "num_input_tokens_seen": 13932949056, + "step": 53150 + }, + { + "epoch": 0.25376533861216116, + "grad_norm": 0.29640141129493713, + "learning_rate": 0.001, + "loss": 2.5864, + "num_input_tokens_seen": 13946056256, + "step": 53200 + }, + { + "epoch": 0.2540038398702553, + "grad_norm": 0.2559553384780884, + "learning_rate": 0.001, + "loss": 2.6137, + "num_input_tokens_seen": 13959163456, + "step": 53250 + }, + { + "epoch": 0.2542423411283494, + "grad_norm": 0.21698619425296783, + "learning_rate": 0.001, + "loss": 2.6184, + "num_input_tokens_seen": 13972270656, + "step": 53300 + }, + { + "epoch": 0.2544808423864436, + "grad_norm": 0.19658173620700836, + "learning_rate": 0.001, + "loss": 2.5938, + "num_input_tokens_seen": 13985377856, + "step": 53350 + }, + { + "epoch": 0.25471934364453774, + "grad_norm": 0.2056342512369156, + "learning_rate": 0.001, + "loss": 2.5952, + "num_input_tokens_seen": 13998485056, + "step": 53400 + }, + { + "epoch": 0.25495784490263185, + "grad_norm": 0.1932424008846283, + "learning_rate": 0.001, + "loss": 2.6101, + "num_input_tokens_seen": 14011592256, + "step": 53450 + }, + { + "epoch": 0.255196346160726, + "grad_norm": 0.19347251951694489, + "learning_rate": 0.001, + "loss": 2.5976, + "num_input_tokens_seen": 14024699456, + "step": 53500 + }, + { + "epoch": 0.255196346160726, + "eval_loss": 2.4863245487213135, + "eval_runtime": 53.2426, + "eval_samples_per_second": 93.91, + "eval_steps_per_second": 23.477, + "num_input_tokens_seen": 14024699456, + "step": 53500 + }, + { + "epoch": 0.2554348474188201, + "grad_norm": 0.1986820101737976, + "learning_rate": 0.001, + "loss": 2.6066, + "num_input_tokens_seen": 14037806656, + "step": 53550 + }, + { + "epoch": 0.25567334867691427, + "grad_norm": 0.21295565366744995, + "learning_rate": 0.001, + "loss": 2.6107, + "num_input_tokens_seen": 14050913856, + "step": 53600 + }, + { + "epoch": 0.25591184993500843, + "grad_norm": 0.21585114300251007, + "learning_rate": 0.001, + "loss": 2.6077, + "num_input_tokens_seen": 14064021056, + "step": 53650 + }, + { + "epoch": 0.25615035119310253, + "grad_norm": 0.19424305856227875, + "learning_rate": 0.001, + "loss": 2.5931, + "num_input_tokens_seen": 14077128256, + "step": 53700 + }, + { + "epoch": 0.2563888524511967, + "grad_norm": 0.20265349745750427, + "learning_rate": 0.001, + "loss": 2.5901, + "num_input_tokens_seen": 14090235456, + "step": 53750 + }, + { + "epoch": 0.2566273537092908, + "grad_norm": 1.037636160850525, + "learning_rate": 0.001, + "loss": 2.5775, + "num_input_tokens_seen": 14103342656, + "step": 53800 + }, + { + "epoch": 0.25686585496738495, + "grad_norm": 0.32030293345451355, + "learning_rate": 0.001, + "loss": 2.6242, + "num_input_tokens_seen": 14116449856, + "step": 53850 + }, + { + "epoch": 0.2571043562254791, + "grad_norm": 0.2339978665113449, + "learning_rate": 0.001, + "loss": 2.6122, + "num_input_tokens_seen": 14129557056, + "step": 53900 + }, + { + "epoch": 0.2573428574835732, + "grad_norm": 0.22179783880710602, + "learning_rate": 0.001, + "loss": 2.6025, + "num_input_tokens_seen": 14142664256, + "step": 53950 + }, + { + "epoch": 0.2575813587416674, + "grad_norm": 0.22616736590862274, + "learning_rate": 0.001, + "loss": 2.5916, + "num_input_tokens_seen": 14155771456, + "step": 54000 + }, + { + "epoch": 0.2575813587416674, + "eval_loss": 2.4871394634246826, + "eval_runtime": 53.8695, + "eval_samples_per_second": 92.817, + "eval_steps_per_second": 23.204, + "num_input_tokens_seen": 14155771456, + "step": 54000 + }, + { + "epoch": 0.2578198599997615, + "grad_norm": 0.2028844654560089, + "learning_rate": 0.001, + "loss": 2.6039, + "num_input_tokens_seen": 14168878656, + "step": 54050 + }, + { + "epoch": 0.25805836125785564, + "grad_norm": 0.19936658442020416, + "learning_rate": 0.001, + "loss": 2.5985, + "num_input_tokens_seen": 14181985856, + "step": 54100 + }, + { + "epoch": 0.2582968625159498, + "grad_norm": 0.2087993025779724, + "learning_rate": 0.001, + "loss": 2.62, + "num_input_tokens_seen": 14195093056, + "step": 54150 + }, + { + "epoch": 0.2585353637740439, + "grad_norm": 0.18972960114479065, + "learning_rate": 0.001, + "loss": 2.5936, + "num_input_tokens_seen": 14208200256, + "step": 54200 + }, + { + "epoch": 0.25877386503213806, + "grad_norm": 0.2162945419549942, + "learning_rate": 0.001, + "loss": 2.6125, + "num_input_tokens_seen": 14221307456, + "step": 54250 + }, + { + "epoch": 0.25901236629023217, + "grad_norm": 0.2538411319255829, + "learning_rate": 0.001, + "loss": 2.6197, + "num_input_tokens_seen": 14234414656, + "step": 54300 + }, + { + "epoch": 0.2592508675483263, + "grad_norm": 0.28060850501060486, + "learning_rate": 0.001, + "loss": 2.6194, + "num_input_tokens_seen": 14247521856, + "step": 54350 + }, + { + "epoch": 0.25948936880642043, + "grad_norm": 0.21557608246803284, + "learning_rate": 0.001, + "loss": 2.623, + "num_input_tokens_seen": 14260629056, + "step": 54400 + }, + { + "epoch": 0.2597278700645146, + "grad_norm": 0.21628426015377045, + "learning_rate": 0.001, + "loss": 2.6077, + "num_input_tokens_seen": 14273736256, + "step": 54450 + }, + { + "epoch": 0.25996637132260875, + "grad_norm": 0.19123327732086182, + "learning_rate": 0.001, + "loss": 2.5991, + "num_input_tokens_seen": 14286843456, + "step": 54500 + }, + { + "epoch": 0.25996637132260875, + "eval_loss": 2.4861645698547363, + "eval_runtime": 53.6448, + "eval_samples_per_second": 93.206, + "eval_steps_per_second": 23.301, + "num_input_tokens_seen": 14286843456, + "step": 54500 + }, + { + "epoch": 0.26020487258070285, + "grad_norm": 0.20462968945503235, + "learning_rate": 0.001, + "loss": 2.5887, + "num_input_tokens_seen": 14299950656, + "step": 54550 + }, + { + "epoch": 0.260443373838797, + "grad_norm": 0.20952938497066498, + "learning_rate": 0.001, + "loss": 2.608, + "num_input_tokens_seen": 14313057856, + "step": 54600 + }, + { + "epoch": 0.2606818750968911, + "grad_norm": 0.2095402032136917, + "learning_rate": 0.001, + "loss": 2.6079, + "num_input_tokens_seen": 14326165056, + "step": 54650 + }, + { + "epoch": 0.2609203763549853, + "grad_norm": 0.2343517541885376, + "learning_rate": 0.001, + "loss": 2.6124, + "num_input_tokens_seen": 14339272256, + "step": 54700 + }, + { + "epoch": 0.26115887761307943, + "grad_norm": 0.23840700089931488, + "learning_rate": 0.001, + "loss": 2.6015, + "num_input_tokens_seen": 14352379456, + "step": 54750 + }, + { + "epoch": 0.26139737887117354, + "grad_norm": 0.22024671733379364, + "learning_rate": 0.001, + "loss": 2.5812, + "num_input_tokens_seen": 14365486656, + "step": 54800 + }, + { + "epoch": 0.2616358801292677, + "grad_norm": 0.19884246587753296, + "learning_rate": 0.001, + "loss": 2.6118, + "num_input_tokens_seen": 14378593856, + "step": 54850 + }, + { + "epoch": 0.2618743813873618, + "grad_norm": 0.46560585498809814, + "learning_rate": 0.001, + "loss": 2.6024, + "num_input_tokens_seen": 14391701056, + "step": 54900 + }, + { + "epoch": 0.26211288264545596, + "grad_norm": 0.2956256568431854, + "learning_rate": 0.001, + "loss": 2.6073, + "num_input_tokens_seen": 14404808256, + "step": 54950 + }, + { + "epoch": 0.2623513839035501, + "grad_norm": 0.286327064037323, + "learning_rate": 0.001, + "loss": 2.5946, + "num_input_tokens_seen": 14417915456, + "step": 55000 + }, + { + "epoch": 0.2623513839035501, + "eval_loss": 2.4892399311065674, + "eval_runtime": 53.3184, + "eval_samples_per_second": 93.776, + "eval_steps_per_second": 23.444, + "num_input_tokens_seen": 14417915456, + "step": 55000 + }, + { + "epoch": 0.2625898851616442, + "grad_norm": 0.22046101093292236, + "learning_rate": 0.001, + "loss": 2.6077, + "num_input_tokens_seen": 14431022656, + "step": 55050 + }, + { + "epoch": 0.2628283864197384, + "grad_norm": 0.4682837724685669, + "learning_rate": 0.001, + "loss": 2.6065, + "num_input_tokens_seen": 14444129856, + "step": 55100 + }, + { + "epoch": 0.2630668876778325, + "grad_norm": 0.21442484855651855, + "learning_rate": 0.001, + "loss": 2.6079, + "num_input_tokens_seen": 14457237056, + "step": 55150 + }, + { + "epoch": 0.26330538893592664, + "grad_norm": 0.2513403296470642, + "learning_rate": 0.001, + "loss": 2.6037, + "num_input_tokens_seen": 14470344256, + "step": 55200 + }, + { + "epoch": 0.26354389019402075, + "grad_norm": 0.21526487171649933, + "learning_rate": 0.001, + "loss": 2.6049, + "num_input_tokens_seen": 14483451456, + "step": 55250 + }, + { + "epoch": 0.2637823914521149, + "grad_norm": 0.22567112743854523, + "learning_rate": 0.001, + "loss": 2.5953, + "num_input_tokens_seen": 14496558656, + "step": 55300 + }, + { + "epoch": 0.26402089271020907, + "grad_norm": 0.20226064324378967, + "learning_rate": 0.001, + "loss": 2.609, + "num_input_tokens_seen": 14509665856, + "step": 55350 + }, + { + "epoch": 0.26425939396830317, + "grad_norm": 0.31736019253730774, + "learning_rate": 0.001, + "loss": 2.6174, + "num_input_tokens_seen": 14522773056, + "step": 55400 + }, + { + "epoch": 0.26449789522639733, + "grad_norm": 0.2573414146900177, + "learning_rate": 0.001, + "loss": 2.612, + "num_input_tokens_seen": 14535880256, + "step": 55450 + }, + { + "epoch": 0.26473639648449143, + "grad_norm": 0.278160959482193, + "learning_rate": 0.001, + "loss": 2.6713, + "num_input_tokens_seen": 14548987456, + "step": 55500 + }, + { + "epoch": 0.26473639648449143, + "eval_loss": 2.5104730129241943, + "eval_runtime": 54.2403, + "eval_samples_per_second": 92.182, + "eval_steps_per_second": 23.046, + "num_input_tokens_seen": 14548987456, + "step": 55500 + }, + { + "epoch": 0.2649748977425856, + "grad_norm": 0.25843819975852966, + "learning_rate": 0.001, + "loss": 2.6223, + "num_input_tokens_seen": 14562094656, + "step": 55550 + }, + { + "epoch": 0.26521339900067975, + "grad_norm": 0.42813193798065186, + "learning_rate": 0.001, + "loss": 2.6114, + "num_input_tokens_seen": 14575201856, + "step": 55600 + }, + { + "epoch": 0.26545190025877385, + "grad_norm": 0.23324181139469147, + "learning_rate": 0.001, + "loss": 2.6149, + "num_input_tokens_seen": 14588309056, + "step": 55650 + }, + { + "epoch": 0.265690401516868, + "grad_norm": 0.2795487940311432, + "learning_rate": 0.001, + "loss": 2.6067, + "num_input_tokens_seen": 14601416256, + "step": 55700 + }, + { + "epoch": 0.2659289027749621, + "grad_norm": 0.6856834888458252, + "learning_rate": 0.001, + "loss": 2.6135, + "num_input_tokens_seen": 14614523456, + "step": 55750 + }, + { + "epoch": 0.2661674040330563, + "grad_norm": 0.348906934261322, + "learning_rate": 0.001, + "loss": 2.6384, + "num_input_tokens_seen": 14627630656, + "step": 55800 + }, + { + "epoch": 0.26640590529115044, + "grad_norm": 0.2510247528553009, + "learning_rate": 0.001, + "loss": 2.6224, + "num_input_tokens_seen": 14640737856, + "step": 55850 + }, + { + "epoch": 0.26664440654924454, + "grad_norm": 0.34429189562797546, + "learning_rate": 0.001, + "loss": 2.6139, + "num_input_tokens_seen": 14653845056, + "step": 55900 + }, + { + "epoch": 0.2668829078073387, + "grad_norm": 0.25697243213653564, + "learning_rate": 0.001, + "loss": 2.6143, + "num_input_tokens_seen": 14666952256, + "step": 55950 + }, + { + "epoch": 0.2671214090654328, + "grad_norm": 0.2812611758708954, + "learning_rate": 0.001, + "loss": 2.6172, + "num_input_tokens_seen": 14680059456, + "step": 56000 + }, + { + "epoch": 0.2671214090654328, + "eval_loss": 2.492490291595459, + "eval_runtime": 53.3814, + "eval_samples_per_second": 93.666, + "eval_steps_per_second": 23.416, + "num_input_tokens_seen": 14680059456, + "step": 56000 + }, + { + "epoch": 0.26735991032352696, + "grad_norm": 0.22615984082221985, + "learning_rate": 0.0009999685283773503, + "loss": 2.5961, + "num_input_tokens_seen": 14693166656, + "step": 56050 + }, + { + "epoch": 0.2675984115816211, + "grad_norm": 0.2738794982433319, + "learning_rate": 0.0009998741174712534, + "loss": 2.612, + "num_input_tokens_seen": 14706273856, + "step": 56100 + }, + { + "epoch": 0.2678369128397152, + "grad_norm": 0.23470066487789154, + "learning_rate": 0.0009997167791667668, + "loss": 2.6071, + "num_input_tokens_seen": 14719381056, + "step": 56150 + }, + { + "epoch": 0.2680754140978094, + "grad_norm": 0.23558543622493744, + "learning_rate": 0.0009994965332706573, + "loss": 2.5956, + "num_input_tokens_seen": 14732488256, + "step": 56200 + }, + { + "epoch": 0.2683139153559035, + "grad_norm": 0.2274416983127594, + "learning_rate": 0.0009992134075089082, + "loss": 2.5873, + "num_input_tokens_seen": 14745595456, + "step": 56250 + }, + { + "epoch": 0.26855241661399765, + "grad_norm": 0.21609161794185638, + "learning_rate": 0.000998867437523228, + "loss": 2.6043, + "num_input_tokens_seen": 14758702656, + "step": 56300 + }, + { + "epoch": 0.26879091787209175, + "grad_norm": 0.2368565797805786, + "learning_rate": 0.000998458666866564, + "loss": 2.5952, + "num_input_tokens_seen": 14771809856, + "step": 56350 + }, + { + "epoch": 0.2690294191301859, + "grad_norm": 0.22180891036987305, + "learning_rate": 0.0009979871469976197, + "loss": 2.5934, + "num_input_tokens_seen": 14784917056, + "step": 56400 + }, + { + "epoch": 0.26926792038828007, + "grad_norm": 0.3060019910335541, + "learning_rate": 0.0009974529372743762, + "loss": 2.6224, + "num_input_tokens_seen": 14798024256, + "step": 56450 + }, + { + "epoch": 0.2695064216463742, + "grad_norm": 0.2387322634458542, + "learning_rate": 0.0009968561049466214, + "loss": 2.5905, + "num_input_tokens_seen": 14811131456, + "step": 56500 + }, + { + "epoch": 0.2695064216463742, + "eval_loss": 2.4835996627807617, + "eval_runtime": 53.8478, + "eval_samples_per_second": 92.854, + "eval_steps_per_second": 23.214, + "num_input_tokens_seen": 14811131456, + "step": 56500 + }, + { + "epoch": 0.26974492290446833, + "grad_norm": 0.22091372311115265, + "learning_rate": 0.0009961967251474822, + "loss": 2.6139, + "num_input_tokens_seen": 14824238656, + "step": 56550 + }, + { + "epoch": 0.26998342416256244, + "grad_norm": 0.2304680198431015, + "learning_rate": 0.0009954748808839674, + "loss": 2.6167, + "num_input_tokens_seen": 14837345856, + "step": 56600 + }, + { + "epoch": 0.2702219254206566, + "grad_norm": 0.19777421653270721, + "learning_rate": 0.0009946906630265184, + "loss": 2.6082, + "num_input_tokens_seen": 14850453056, + "step": 56650 + }, + { + "epoch": 0.27046042667875075, + "grad_norm": 0.2113979458808899, + "learning_rate": 0.0009938441702975688, + "loss": 2.5981, + "num_input_tokens_seen": 14863560256, + "step": 56700 + }, + { + "epoch": 0.27069892793684486, + "grad_norm": 0.19911637902259827, + "learning_rate": 0.0009929355092591179, + "loss": 2.5904, + "num_input_tokens_seen": 14876667456, + "step": 56750 + }, + { + "epoch": 0.270937429194939, + "grad_norm": 0.20081694424152374, + "learning_rate": 0.0009919647942993148, + "loss": 2.6012, + "num_input_tokens_seen": 14889774656, + "step": 56800 + }, + { + "epoch": 0.2711759304530331, + "grad_norm": 0.22752800583839417, + "learning_rate": 0.0009909321476180592, + "loss": 2.6017, + "num_input_tokens_seen": 14902881856, + "step": 56850 + }, + { + "epoch": 0.2714144317111273, + "grad_norm": 0.23174402117729187, + "learning_rate": 0.0009898376992116178, + "loss": 2.6012, + "num_input_tokens_seen": 14915989056, + "step": 56900 + }, + { + "epoch": 0.27165293296922144, + "grad_norm": 0.22149533033370972, + "learning_rate": 0.0009886815868562597, + "loss": 2.5881, + "num_input_tokens_seen": 14929096256, + "step": 56950 + }, + { + "epoch": 0.27189143422731554, + "grad_norm": 0.22576771676540375, + "learning_rate": 0.0009874639560909118, + "loss": 2.6021, + "num_input_tokens_seen": 14942203456, + "step": 57000 + }, + { + "epoch": 0.27189143422731554, + "eval_loss": 2.482896566390991, + "eval_runtime": 53.3773, + "eval_samples_per_second": 93.673, + "eval_steps_per_second": 23.418, + "num_input_tokens_seen": 14942203456, + "step": 57000 + }, + { + "epoch": 0.2721299354854097, + "grad_norm": 0.22044019401073456, + "learning_rate": 0.0009861849601988384, + "loss": 2.6119, + "num_input_tokens_seen": 14955310656, + "step": 57050 + }, + { + "epoch": 0.2723684367435038, + "grad_norm": 0.2155238389968872, + "learning_rate": 0.0009848447601883434, + "loss": 2.5869, + "num_input_tokens_seen": 14968417856, + "step": 57100 + }, + { + "epoch": 0.27260693800159796, + "grad_norm": 0.21131549775600433, + "learning_rate": 0.0009834435247725033, + "loss": 2.5988, + "num_input_tokens_seen": 14981525056, + "step": 57150 + }, + { + "epoch": 0.27284543925969207, + "grad_norm": 0.21247337758541107, + "learning_rate": 0.0009819814303479266, + "loss": 2.6198, + "num_input_tokens_seen": 14994632256, + "step": 57200 + }, + { + "epoch": 0.27308394051778623, + "grad_norm": 0.21916711330413818, + "learning_rate": 0.00098045866097255, + "loss": 2.6019, + "num_input_tokens_seen": 15007739456, + "step": 57250 + }, + { + "epoch": 0.2733224417758804, + "grad_norm": 0.1925441473722458, + "learning_rate": 0.0009788754083424652, + "loss": 2.6143, + "num_input_tokens_seen": 15020846656, + "step": 57300 + }, + { + "epoch": 0.2735609430339745, + "grad_norm": 0.38578665256500244, + "learning_rate": 0.0009772318717677904, + "loss": 2.6037, + "num_input_tokens_seen": 15033953856, + "step": 57350 + }, + { + "epoch": 0.27379944429206865, + "grad_norm": 0.19650611281394958, + "learning_rate": 0.0009755282581475768, + "loss": 2.5745, + "num_input_tokens_seen": 15047061056, + "step": 57400 + }, + { + "epoch": 0.27403794555016275, + "grad_norm": 0.2376088798046112, + "learning_rate": 0.0009737647819437645, + "loss": 2.5968, + "num_input_tokens_seen": 15060168256, + "step": 57450 + }, + { + "epoch": 0.2742764468082569, + "grad_norm": 0.21746863424777985, + "learning_rate": 0.0009719416651541838, + "loss": 2.5965, + "num_input_tokens_seen": 15073275456, + "step": 57500 + }, + { + "epoch": 0.2742764468082569, + "eval_loss": 2.483751058578491, + "eval_runtime": 53.9622, + "eval_samples_per_second": 92.657, + "eval_steps_per_second": 23.164, + "num_input_tokens_seen": 15073275456, + "step": 57500 + }, + { + "epoch": 0.27451494806635107, + "grad_norm": 0.2898815870285034, + "learning_rate": 0.0009700591372846095, + "loss": 2.6105, + "num_input_tokens_seen": 15086382656, + "step": 57550 + }, + { + "epoch": 0.2747534493244452, + "grad_norm": 0.24887384474277496, + "learning_rate": 0.0009681174353198686, + "loss": 2.6103, + "num_input_tokens_seen": 15099489856, + "step": 57600 + }, + { + "epoch": 0.27499195058253934, + "grad_norm": 0.26613715291023254, + "learning_rate": 0.0009661168036940071, + "loss": 2.6296, + "num_input_tokens_seen": 15112597056, + "step": 57650 + }, + { + "epoch": 0.27523045184063344, + "grad_norm": 0.23983849585056305, + "learning_rate": 0.0009640574942595195, + "loss": 2.6008, + "num_input_tokens_seen": 15125704256, + "step": 57700 + }, + { + "epoch": 0.2754689530987276, + "grad_norm": 0.23169022798538208, + "learning_rate": 0.0009619397662556434, + "loss": 2.596, + "num_input_tokens_seen": 15138811456, + "step": 57750 + }, + { + "epoch": 0.27570745435682176, + "grad_norm": 0.21353812515735626, + "learning_rate": 0.0009597638862757254, + "loss": 2.6039, + "num_input_tokens_seen": 15151918656, + "step": 57800 + }, + { + "epoch": 0.27594595561491586, + "grad_norm": 0.2561227083206177, + "learning_rate": 0.00095753012823366, + "loss": 2.6046, + "num_input_tokens_seen": 15165025856, + "step": 57850 + }, + { + "epoch": 0.27618445687301, + "grad_norm": 0.20380394160747528, + "learning_rate": 0.000955238773329408, + "loss": 2.5968, + "num_input_tokens_seen": 15178133056, + "step": 57900 + }, + { + "epoch": 0.2764229581311041, + "grad_norm": 0.26447024941444397, + "learning_rate": 0.000952890110013597, + "loss": 2.5848, + "num_input_tokens_seen": 15191240256, + "step": 57950 + }, + { + "epoch": 0.2766614593891983, + "grad_norm": 0.23530781269073486, + "learning_rate": 0.0009504844339512095, + "loss": 2.582, + "num_input_tokens_seen": 15204347456, + "step": 58000 + }, + { + "epoch": 0.2766614593891983, + "eval_loss": 2.482050895690918, + "eval_runtime": 53.5775, + "eval_samples_per_second": 93.323, + "eval_steps_per_second": 23.331, + "num_input_tokens_seen": 15204347456, + "step": 58000 + }, + { + "epoch": 0.2768999606472924, + "grad_norm": 0.2281644344329834, + "learning_rate": 0.0009480220479843627, + "loss": 2.6212, + "num_input_tokens_seen": 15217454656, + "step": 58050 + }, + { + "epoch": 0.27713846190538655, + "grad_norm": 0.2181713730096817, + "learning_rate": 0.0009455032620941839, + "loss": 2.5927, + "num_input_tokens_seen": 15230561856, + "step": 58100 + }, + { + "epoch": 0.2773769631634807, + "grad_norm": 0.21573083102703094, + "learning_rate": 0.00094292839336179, + "loss": 2.6112, + "num_input_tokens_seen": 15243669056, + "step": 58150 + }, + { + "epoch": 0.2776154644215748, + "grad_norm": 0.2686486840248108, + "learning_rate": 0.000940297765928369, + "loss": 2.6133, + "num_input_tokens_seen": 15256776256, + "step": 58200 + }, + { + "epoch": 0.27785396567966897, + "grad_norm": 0.2320137470960617, + "learning_rate": 0.0009376117109543769, + "loss": 2.6094, + "num_input_tokens_seen": 15269883456, + "step": 58250 + }, + { + "epoch": 0.27809246693776307, + "grad_norm": 0.22277672588825226, + "learning_rate": 0.0009348705665778478, + "loss": 2.5885, + "num_input_tokens_seen": 15282990656, + "step": 58300 + }, + { + "epoch": 0.27833096819585723, + "grad_norm": 0.22681231796741486, + "learning_rate": 0.0009320746778718274, + "loss": 2.6005, + "num_input_tokens_seen": 15296097856, + "step": 58350 + }, + { + "epoch": 0.2785694694539514, + "grad_norm": 0.25187453627586365, + "learning_rate": 0.000929224396800933, + "loss": 2.5944, + "num_input_tokens_seen": 15309205056, + "step": 58400 + }, + { + "epoch": 0.2788079707120455, + "grad_norm": 0.24962358176708221, + "learning_rate": 0.0009263200821770461, + "loss": 2.5888, + "num_input_tokens_seen": 15322312256, + "step": 58450 + }, + { + "epoch": 0.27904647197013965, + "grad_norm": 0.18929679691791534, + "learning_rate": 0.0009233620996141421, + "loss": 2.5927, + "num_input_tokens_seen": 15335419456, + "step": 58500 + }, + { + "epoch": 0.27904647197013965, + "eval_loss": 2.4754066467285156, + "eval_runtime": 53.7558, + "eval_samples_per_second": 93.013, + "eval_steps_per_second": 23.253, + "num_input_tokens_seen": 15335419456, + "step": 58500 + }, + { + "epoch": 0.27928497322823376, + "grad_norm": 0.22240912914276123, + "learning_rate": 0.0009203508214822651, + "loss": 2.5944, + "num_input_tokens_seen": 15348526656, + "step": 58550 + }, + { + "epoch": 0.2795234744863279, + "grad_norm": 0.2096235305070877, + "learning_rate": 0.0009172866268606513, + "loss": 2.5964, + "num_input_tokens_seen": 15361633856, + "step": 58600 + }, + { + "epoch": 0.2797619757444221, + "grad_norm": 0.2913396954536438, + "learning_rate": 0.0009141699014900082, + "loss": 2.5975, + "num_input_tokens_seen": 15374741056, + "step": 58650 + }, + { + "epoch": 0.2800004770025162, + "grad_norm": 0.21000444889068604, + "learning_rate": 0.0009110010377239551, + "loss": 2.5987, + "num_input_tokens_seen": 15387848256, + "step": 58700 + }, + { + "epoch": 0.28023897826061034, + "grad_norm": 0.18561489880084991, + "learning_rate": 0.0009077804344796301, + "loss": 2.5955, + "num_input_tokens_seen": 15400955456, + "step": 58750 + }, + { + "epoch": 0.28047747951870444, + "grad_norm": 0.330816388130188, + "learning_rate": 0.0009045084971874737, + "loss": 2.5837, + "num_input_tokens_seen": 15414062656, + "step": 58800 + }, + { + "epoch": 0.2807159807767986, + "grad_norm": 0.21823953092098236, + "learning_rate": 0.000901185637740189, + "loss": 2.5921, + "num_input_tokens_seen": 15427169856, + "step": 58850 + }, + { + "epoch": 0.28095448203489276, + "grad_norm": 0.28721505403518677, + "learning_rate": 0.0008978122744408905, + "loss": 2.5893, + "num_input_tokens_seen": 15440277056, + "step": 58900 + }, + { + "epoch": 0.28119298329298686, + "grad_norm": 0.2468225359916687, + "learning_rate": 0.0008943888319504456, + "loss": 2.5999, + "num_input_tokens_seen": 15453384256, + "step": 58950 + }, + { + "epoch": 0.281431484551081, + "grad_norm": 0.20486761629581451, + "learning_rate": 0.000890915741234015, + "loss": 2.6026, + "num_input_tokens_seen": 15466491456, + "step": 59000 + }, + { + "epoch": 0.281431484551081, + "eval_loss": 2.4756667613983154, + "eval_runtime": 53.3408, + "eval_samples_per_second": 93.737, + "eval_steps_per_second": 23.434, + "num_input_tokens_seen": 15466491456, + "step": 59000 + }, + { + "epoch": 0.2816699858091751, + "grad_norm": 0.3338637351989746, + "learning_rate": 0.0008873934395068005, + "loss": 2.587, + "num_input_tokens_seen": 15479598656, + "step": 59050 + }, + { + "epoch": 0.2819084870672693, + "grad_norm": 0.20848780870437622, + "learning_rate": 0.0008838223701790055, + "loss": 2.5989, + "num_input_tokens_seen": 15492705856, + "step": 59100 + }, + { + "epoch": 0.2821469883253634, + "grad_norm": 0.21479378640651703, + "learning_rate": 0.0008802029828000156, + "loss": 2.6052, + "num_input_tokens_seen": 15505813056, + "step": 59150 + }, + { + "epoch": 0.28238548958345755, + "grad_norm": 0.1944151073694229, + "learning_rate": 0.0008765357330018055, + "loss": 2.6044, + "num_input_tokens_seen": 15518920256, + "step": 59200 + }, + { + "epoch": 0.2826239908415517, + "grad_norm": 0.2078033685684204, + "learning_rate": 0.0008728210824415827, + "loss": 2.5929, + "num_input_tokens_seen": 15532027456, + "step": 59250 + }, + { + "epoch": 0.2828624920996458, + "grad_norm": 0.19340284168720245, + "learning_rate": 0.0008690594987436704, + "loss": 2.5875, + "num_input_tokens_seen": 15545134656, + "step": 59300 + }, + { + "epoch": 0.28310099335773997, + "grad_norm": 0.22354012727737427, + "learning_rate": 0.0008652514554406388, + "loss": 2.5976, + "num_input_tokens_seen": 15558241856, + "step": 59350 + }, + { + "epoch": 0.2833394946158341, + "grad_norm": 0.26784005761146545, + "learning_rate": 0.0008613974319136957, + "loss": 2.5868, + "num_input_tokens_seen": 15571349056, + "step": 59400 + }, + { + "epoch": 0.28357799587392823, + "grad_norm": 0.20749828219413757, + "learning_rate": 0.0008574979133323377, + "loss": 2.5784, + "num_input_tokens_seen": 15584456256, + "step": 59450 + }, + { + "epoch": 0.2838164971320224, + "grad_norm": 0.21545729041099548, + "learning_rate": 0.0008535533905932737, + "loss": 2.5939, + "num_input_tokens_seen": 15597563456, + "step": 59500 + }, + { + "epoch": 0.2838164971320224, + "eval_loss": 2.469989538192749, + "eval_runtime": 54.0784, + "eval_samples_per_second": 92.458, + "eval_steps_per_second": 23.115, + "num_input_tokens_seen": 15597563456, + "step": 59500 + }, + { + "epoch": 0.2840549983901165, + "grad_norm": 0.20836423337459564, + "learning_rate": 0.0008495643602586287, + "loss": 2.5858, + "num_input_tokens_seen": 15610670656, + "step": 59550 + }, + { + "epoch": 0.28429349964821066, + "grad_norm": 0.20427604019641876, + "learning_rate": 0.0008455313244934324, + "loss": 2.5781, + "num_input_tokens_seen": 15623777856, + "step": 59600 + }, + { + "epoch": 0.28453200090630476, + "grad_norm": 0.2341683804988861, + "learning_rate": 0.0008414547910024035, + "loss": 2.5713, + "num_input_tokens_seen": 15636885056, + "step": 59650 + }, + { + "epoch": 0.2847705021643989, + "grad_norm": 0.20808522403240204, + "learning_rate": 0.0008373352729660373, + "loss": 2.5751, + "num_input_tokens_seen": 15649992256, + "step": 59700 + }, + { + "epoch": 0.2850090034224931, + "grad_norm": 0.21032562851905823, + "learning_rate": 0.000833173288976002, + "loss": 2.5784, + "num_input_tokens_seen": 15663099456, + "step": 59750 + }, + { + "epoch": 0.2852475046805872, + "grad_norm": 0.23485584557056427, + "learning_rate": 0.0008289693629698564, + "loss": 2.5974, + "num_input_tokens_seen": 15676206656, + "step": 59800 + }, + { + "epoch": 0.28548600593868134, + "grad_norm": 0.2229880541563034, + "learning_rate": 0.0008247240241650918, + "loss": 2.5834, + "num_input_tokens_seen": 15689313856, + "step": 59850 + }, + { + "epoch": 0.28572450719677545, + "grad_norm": 0.21837118268013, + "learning_rate": 0.000820437806992512, + "loss": 2.5734, + "num_input_tokens_seen": 15702421056, + "step": 59900 + }, + { + "epoch": 0.2859630084548696, + "grad_norm": 0.2157929688692093, + "learning_rate": 0.0008161112510289549, + "loss": 2.587, + "num_input_tokens_seen": 15715528256, + "step": 59950 + }, + { + "epoch": 0.2862015097129637, + "grad_norm": 0.24053893983364105, + "learning_rate": 0.0008117449009293668, + "loss": 2.5853, + "num_input_tokens_seen": 15728635456, + "step": 60000 + }, + { + "epoch": 0.2862015097129637, + "eval_loss": 2.470459461212158, + "eval_runtime": 53.5859, + "eval_samples_per_second": 93.308, + "eval_steps_per_second": 23.327, + "num_input_tokens_seen": 15728635456, + "step": 60000 + }, + { + "epoch": 0.28644001097105787, + "grad_norm": 0.25951045751571655, + "learning_rate": 0.0008073393063582386, + "loss": 2.5946, + "num_input_tokens_seen": 15741742656, + "step": 60050 + }, + { + "epoch": 0.286678512229152, + "grad_norm": 0.22712726891040802, + "learning_rate": 0.00080289502192041, + "loss": 2.5882, + "num_input_tokens_seen": 15754849856, + "step": 60100 + }, + { + "epoch": 0.28691701348724613, + "grad_norm": 0.2236946076154709, + "learning_rate": 0.0007984126070912518, + "loss": 2.5854, + "num_input_tokens_seen": 15767957056, + "step": 60150 + }, + { + "epoch": 0.2871555147453403, + "grad_norm": 0.3175867795944214, + "learning_rate": 0.0007938926261462366, + "loss": 2.5855, + "num_input_tokens_seen": 15781064256, + "step": 60200 + }, + { + "epoch": 0.2873940160034344, + "grad_norm": 0.22954128682613373, + "learning_rate": 0.000789335648089903, + "loss": 2.595, + "num_input_tokens_seen": 15794171456, + "step": 60250 + }, + { + "epoch": 0.28763251726152855, + "grad_norm": 0.23379147052764893, + "learning_rate": 0.000784742246584226, + "loss": 2.5872, + "num_input_tokens_seen": 15807278656, + "step": 60300 + }, + { + "epoch": 0.2878710185196227, + "grad_norm": 0.22107115387916565, + "learning_rate": 0.0007801129998764014, + "loss": 2.5704, + "num_input_tokens_seen": 15820385856, + "step": 60350 + }, + { + "epoch": 0.2881095197777168, + "grad_norm": 0.21197494864463806, + "learning_rate": 0.0007754484907260512, + "loss": 2.5751, + "num_input_tokens_seen": 15833493056, + "step": 60400 + }, + { + "epoch": 0.288348021035811, + "grad_norm": 0.21372662484645844, + "learning_rate": 0.0007707493063318629, + "loss": 2.5901, + "num_input_tokens_seen": 15846600256, + "step": 60450 + }, + { + "epoch": 0.2885865222939051, + "grad_norm": 0.23300603032112122, + "learning_rate": 0.0007660160382576683, + "loss": 2.5888, + "num_input_tokens_seen": 15859707456, + "step": 60500 + }, + { + "epoch": 0.2885865222939051, + "eval_loss": 2.463745355606079, + "eval_runtime": 53.032, + "eval_samples_per_second": 94.283, + "eval_steps_per_second": 23.571, + "num_input_tokens_seen": 15859707456, + "step": 60500 + }, + { + "epoch": 0.28882502355199924, + "grad_norm": 0.2108684778213501, + "learning_rate": 0.0007612492823579744, + "loss": 2.5965, + "num_input_tokens_seen": 15872814656, + "step": 60550 + }, + { + "epoch": 0.2890635248100934, + "grad_norm": 0.20625820755958557, + "learning_rate": 0.0007564496387029531, + "loss": 2.5615, + "num_input_tokens_seen": 15885921856, + "step": 60600 + }, + { + "epoch": 0.2893020260681875, + "grad_norm": 0.22595694661140442, + "learning_rate": 0.0007516177115029001, + "loss": 2.5871, + "num_input_tokens_seen": 15899029056, + "step": 60650 + }, + { + "epoch": 0.28954052732628166, + "grad_norm": 0.2095574140548706, + "learning_rate": 0.0007467541090321735, + "loss": 2.5867, + "num_input_tokens_seen": 15912136256, + "step": 60700 + }, + { + "epoch": 0.28977902858437576, + "grad_norm": 0.1979990303516388, + "learning_rate": 0.00074185944355262, + "loss": 2.586, + "num_input_tokens_seen": 15925243456, + "step": 60750 + }, + { + "epoch": 0.2900175298424699, + "grad_norm": 0.3573000431060791, + "learning_rate": 0.0007369343312364993, + "loss": 2.5807, + "num_input_tokens_seen": 15938350656, + "step": 60800 + }, + { + "epoch": 0.2902560311005641, + "grad_norm": 0.2209523618221283, + "learning_rate": 0.0007319793920889171, + "loss": 2.5867, + "num_input_tokens_seen": 15951457856, + "step": 60850 + }, + { + "epoch": 0.2904945323586582, + "grad_norm": 0.1979866325855255, + "learning_rate": 0.0007269952498697733, + "loss": 2.5679, + "num_input_tokens_seen": 15964565056, + "step": 60900 + }, + { + "epoch": 0.29073303361675235, + "grad_norm": 0.2013344019651413, + "learning_rate": 0.0007219825320152411, + "loss": 2.5842, + "num_input_tokens_seen": 15977672256, + "step": 60950 + }, + { + "epoch": 0.29097153487484645, + "grad_norm": 0.20511233806610107, + "learning_rate": 0.0007169418695587791, + "loss": 2.5864, + "num_input_tokens_seen": 15990779456, + "step": 61000 + }, + { + "epoch": 0.29097153487484645, + "eval_loss": 2.4598097801208496, + "eval_runtime": 53.5493, + "eval_samples_per_second": 93.372, + "eval_steps_per_second": 23.343, + "num_input_tokens_seen": 15990779456, + "step": 61000 + }, + { + "epoch": 0.2912100361329406, + "grad_norm": 0.19767510890960693, + "learning_rate": 0.0007118738970516943, + "loss": 2.5963, + "num_input_tokens_seen": 16003886656, + "step": 61050 + }, + { + "epoch": 0.2914485373910347, + "grad_norm": 0.21463529765605927, + "learning_rate": 0.0007067792524832604, + "loss": 2.5825, + "num_input_tokens_seen": 16016993856, + "step": 61100 + }, + { + "epoch": 0.29168703864912887, + "grad_norm": 0.2011532485485077, + "learning_rate": 0.0007016585772004026, + "loss": 2.5783, + "num_input_tokens_seen": 16030101056, + "step": 61150 + }, + { + "epoch": 0.29192553990722303, + "grad_norm": 0.19351401925086975, + "learning_rate": 0.0006965125158269618, + "loss": 2.5619, + "num_input_tokens_seen": 16043208256, + "step": 61200 + }, + { + "epoch": 0.29216404116531713, + "grad_norm": 0.1988568902015686, + "learning_rate": 0.000691341716182545, + "loss": 2.6007, + "num_input_tokens_seen": 16056315456, + "step": 61250 + }, + { + "epoch": 0.2924025424234113, + "grad_norm": 0.20459413528442383, + "learning_rate": 0.0006861468292009726, + "loss": 2.5762, + "num_input_tokens_seen": 16069422656, + "step": 61300 + }, + { + "epoch": 0.2926410436815054, + "grad_norm": 0.1914205551147461, + "learning_rate": 0.0006809285088483361, + "loss": 2.5734, + "num_input_tokens_seen": 16082529856, + "step": 61350 + }, + { + "epoch": 0.29287954493959956, + "grad_norm": 0.194325253367424, + "learning_rate": 0.0006756874120406714, + "loss": 2.5874, + "num_input_tokens_seen": 16095637056, + "step": 61400 + }, + { + "epoch": 0.2931180461976937, + "grad_norm": 0.20854853093624115, + "learning_rate": 0.0006704241985612625, + "loss": 2.5865, + "num_input_tokens_seen": 16108744256, + "step": 61450 + }, + { + "epoch": 0.2933565474557878, + "grad_norm": 0.190395787358284, + "learning_rate": 0.0006651395309775837, + "loss": 2.5716, + "num_input_tokens_seen": 16121851456, + "step": 61500 + }, + { + "epoch": 0.2933565474557878, + "eval_loss": 2.4551966190338135, + "eval_runtime": 53.3343, + "eval_samples_per_second": 93.748, + "eval_steps_per_second": 23.437, + "num_input_tokens_seen": 16121851456, + "step": 61500 + }, + { + "epoch": 0.293595048713882, + "grad_norm": 0.20652073621749878, + "learning_rate": 0.0006598340745578908, + "loss": 2.5765, + "num_input_tokens_seen": 16134958656, + "step": 61550 + }, + { + "epoch": 0.2938335499719761, + "grad_norm": 0.20701836049556732, + "learning_rate": 0.0006545084971874737, + "loss": 2.5653, + "num_input_tokens_seen": 16148065856, + "step": 61600 + }, + { + "epoch": 0.29407205123007024, + "grad_norm": 0.1792392134666443, + "learning_rate": 0.000649163469284578, + "loss": 2.577, + "num_input_tokens_seen": 16161173056, + "step": 61650 + }, + { + "epoch": 0.2943105524881644, + "grad_norm": 0.21742790937423706, + "learning_rate": 0.0006437996637160086, + "loss": 2.574, + "num_input_tokens_seen": 16174280256, + "step": 61700 + }, + { + "epoch": 0.2945490537462585, + "grad_norm": 0.20747682452201843, + "learning_rate": 0.0006384177557124247, + "loss": 2.564, + "num_input_tokens_seen": 16187387456, + "step": 61750 + }, + { + "epoch": 0.29478755500435266, + "grad_norm": 0.19990311563014984, + "learning_rate": 0.0006330184227833376, + "loss": 2.5866, + "num_input_tokens_seen": 16200494656, + "step": 61800 + }, + { + "epoch": 0.29502605626244677, + "grad_norm": 0.20410317182540894, + "learning_rate": 0.0006276023446318213, + "loss": 2.5559, + "num_input_tokens_seen": 16213601856, + "step": 61850 + }, + { + "epoch": 0.2952645575205409, + "grad_norm": 0.19365034997463226, + "learning_rate": 0.000622170203068947, + "loss": 2.5705, + "num_input_tokens_seen": 16226709056, + "step": 61900 + }, + { + "epoch": 0.29550305877863503, + "grad_norm": 0.2115161269903183, + "learning_rate": 0.0006167226819279528, + "loss": 2.5621, + "num_input_tokens_seen": 16239816256, + "step": 61950 + }, + { + "epoch": 0.2957415600367292, + "grad_norm": 0.22992485761642456, + "learning_rate": 0.0006112604669781572, + "loss": 2.5587, + "num_input_tokens_seen": 16252923456, + "step": 62000 + }, + { + "epoch": 0.2957415600367292, + "eval_loss": 2.452096462249756, + "eval_runtime": 53.6354, + "eval_samples_per_second": 93.222, + "eval_steps_per_second": 23.306, + "num_input_tokens_seen": 16252923456, + "step": 62000 + }, + { + "epoch": 0.29598006129482335, + "grad_norm": 0.1945638656616211, + "learning_rate": 0.0006057842458386314, + "loss": 2.5582, + "num_input_tokens_seen": 16266030656, + "step": 62050 + }, + { + "epoch": 0.29621856255291745, + "grad_norm": 0.201882466673851, + "learning_rate": 0.0006002947078916364, + "loss": 2.5764, + "num_input_tokens_seen": 16279137856, + "step": 62100 + }, + { + "epoch": 0.2964570638110116, + "grad_norm": 0.2137998789548874, + "learning_rate": 0.0005947925441958392, + "loss": 2.5689, + "num_input_tokens_seen": 16292245056, + "step": 62150 + }, + { + "epoch": 0.2966955650691057, + "grad_norm": 0.18265672028064728, + "learning_rate": 0.0005892784473993184, + "loss": 2.5741, + "num_input_tokens_seen": 16305352256, + "step": 62200 + }, + { + "epoch": 0.2969340663271999, + "grad_norm": 0.16944251954555511, + "learning_rate": 0.0005837531116523682, + "loss": 2.5537, + "num_input_tokens_seen": 16318459456, + "step": 62250 + }, + { + "epoch": 0.29717256758529403, + "grad_norm": 0.20273485779762268, + "learning_rate": 0.0005782172325201155, + "loss": 2.5512, + "num_input_tokens_seen": 16331566656, + "step": 62300 + }, + { + "epoch": 0.29741106884338814, + "grad_norm": 0.19320476055145264, + "learning_rate": 0.0005726715068949564, + "loss": 2.5823, + "num_input_tokens_seen": 16344673856, + "step": 62350 + }, + { + "epoch": 0.2976495701014823, + "grad_norm": 0.21321871876716614, + "learning_rate": 0.0005671166329088278, + "loss": 2.5608, + "num_input_tokens_seen": 16357781056, + "step": 62400 + }, + { + "epoch": 0.2978880713595764, + "grad_norm": 0.2007117122411728, + "learning_rate": 0.0005615533098453215, + "loss": 2.5685, + "num_input_tokens_seen": 16370888256, + "step": 62450 + }, + { + "epoch": 0.29812657261767056, + "grad_norm": 0.1896267682313919, + "learning_rate": 0.0005559822380516539, + "loss": 2.56, + "num_input_tokens_seen": 16383995456, + "step": 62500 + }, + { + "epoch": 0.29812657261767056, + "eval_loss": 2.448042154312134, + "eval_runtime": 54.1994, + "eval_samples_per_second": 92.252, + "eval_steps_per_second": 23.063, + "num_input_tokens_seen": 16383995456, + "step": 62500 + }, + { + "epoch": 0.2983650738757647, + "grad_norm": 0.18581034243106842, + "learning_rate": 0.0005504041188505022, + "loss": 2.5691, + "num_input_tokens_seen": 16397102656, + "step": 62550 + }, + { + "epoch": 0.2986035751338588, + "grad_norm": 0.19272533059120178, + "learning_rate": 0.0005448196544517168, + "loss": 2.5635, + "num_input_tokens_seen": 16410209856, + "step": 62600 + }, + { + "epoch": 0.298842076391953, + "grad_norm": 0.19940300285816193, + "learning_rate": 0.0005392295478639225, + "loss": 2.5755, + "num_input_tokens_seen": 16423317056, + "step": 62650 + }, + { + "epoch": 0.2990805776500471, + "grad_norm": 0.18894875049591064, + "learning_rate": 0.0005336345028060199, + "loss": 2.5718, + "num_input_tokens_seen": 16436424256, + "step": 62700 + }, + { + "epoch": 0.29931907890814125, + "grad_norm": 0.19226962327957153, + "learning_rate": 0.0005280352236185959, + "loss": 2.563, + "num_input_tokens_seen": 16449531456, + "step": 62750 + }, + { + "epoch": 0.2995575801662354, + "grad_norm": 0.20716702938079834, + "learning_rate": 0.0005224324151752575, + "loss": 2.5532, + "num_input_tokens_seen": 16462638656, + "step": 62800 + }, + { + "epoch": 0.2997960814243295, + "grad_norm": 0.20232325792312622, + "learning_rate": 0.000516826782793897, + "loss": 2.5691, + "num_input_tokens_seen": 16475745856, + "step": 62850 + }, + { + "epoch": 0.30003458268242367, + "grad_norm": 0.19828926026821136, + "learning_rate": 0.0005112190321479025, + "loss": 2.5602, + "num_input_tokens_seen": 16488853056, + "step": 62900 + }, + { + "epoch": 0.30027308394051777, + "grad_norm": 0.22366905212402344, + "learning_rate": 0.000505609869177323, + "loss": 2.5556, + "num_input_tokens_seen": 16501960256, + "step": 62950 + }, + { + "epoch": 0.30051158519861193, + "grad_norm": 0.1883884221315384, + "learning_rate": 0.0005, + "loss": 2.5567, + "num_input_tokens_seen": 16515067456, + "step": 63000 + }, + { + "epoch": 0.30051158519861193, + "eval_loss": 2.4441678524017334, + "eval_runtime": 54.2448, + "eval_samples_per_second": 92.175, + "eval_steps_per_second": 23.044, + "num_input_tokens_seen": 16515067456, + "step": 63000 + }, + { + "epoch": 0.30075008645670603, + "grad_norm": 0.20152603089809418, + "learning_rate": 0.0004943901308226771, + "loss": 2.5562, + "num_input_tokens_seen": 16528174656, + "step": 63050 + }, + { + "epoch": 0.3009885877148002, + "grad_norm": 0.18534454703330994, + "learning_rate": 0.0004887809678520976, + "loss": 2.5559, + "num_input_tokens_seen": 16541281856, + "step": 63100 + }, + { + "epoch": 0.30122708897289435, + "grad_norm": 0.18770301342010498, + "learning_rate": 0.0004831732172061032, + "loss": 2.5538, + "num_input_tokens_seen": 16554389056, + "step": 63150 + }, + { + "epoch": 0.30146559023098846, + "grad_norm": 0.19565705955028534, + "learning_rate": 0.0004775675848247427, + "loss": 2.5593, + "num_input_tokens_seen": 16567496256, + "step": 63200 + }, + { + "epoch": 0.3017040914890826, + "grad_norm": 0.1954822540283203, + "learning_rate": 0.00047196477638140405, + "loss": 2.5694, + "num_input_tokens_seen": 16580603456, + "step": 63250 + }, + { + "epoch": 0.3019425927471767, + "grad_norm": 0.18120840191841125, + "learning_rate": 0.0004663654971939802, + "loss": 2.5622, + "num_input_tokens_seen": 16593710656, + "step": 63300 + }, + { + "epoch": 0.3021810940052709, + "grad_norm": 0.18100927770137787, + "learning_rate": 0.0004607704521360776, + "loss": 2.5437, + "num_input_tokens_seen": 16606817856, + "step": 63350 + }, + { + "epoch": 0.30241959526336504, + "grad_norm": 0.20565176010131836, + "learning_rate": 0.0004551803455482833, + "loss": 2.5463, + "num_input_tokens_seen": 16619925056, + "step": 63400 + }, + { + "epoch": 0.30265809652145914, + "grad_norm": 0.18989761173725128, + "learning_rate": 0.0004495958811494978, + "loss": 2.5609, + "num_input_tokens_seen": 16633032256, + "step": 63450 + }, + { + "epoch": 0.3028965977795533, + "grad_norm": 0.1870686262845993, + "learning_rate": 0.0004440177619483461, + "loss": 2.5554, + "num_input_tokens_seen": 16646139456, + "step": 63500 + }, + { + "epoch": 0.3028965977795533, + "eval_loss": 2.4395649433135986, + "eval_runtime": 53.4665, + "eval_samples_per_second": 93.516, + "eval_steps_per_second": 23.379, + "num_input_tokens_seen": 16646139456, + "step": 63500 + }, + { + "epoch": 0.3031350990376474, + "grad_norm": 0.1891048699617386, + "learning_rate": 0.00043844669015467863, + "loss": 2.5627, + "num_input_tokens_seen": 16659246656, + "step": 63550 + }, + { + "epoch": 0.30337360029574156, + "grad_norm": 0.18591411411762238, + "learning_rate": 0.0004328833670911724, + "loss": 2.5545, + "num_input_tokens_seen": 16672353856, + "step": 63600 + }, + { + "epoch": 0.3036121015538357, + "grad_norm": 0.18640951812267303, + "learning_rate": 0.0004273284931050438, + "loss": 2.5672, + "num_input_tokens_seen": 16685461056, + "step": 63650 + }, + { + "epoch": 0.3038506028119298, + "grad_norm": 0.1919756680727005, + "learning_rate": 0.0004217827674798845, + "loss": 2.5492, + "num_input_tokens_seen": 16698568256, + "step": 63700 + }, + { + "epoch": 0.304089104070024, + "grad_norm": 0.18388938903808594, + "learning_rate": 0.00041624688834763184, + "loss": 2.5487, + "num_input_tokens_seen": 16711675456, + "step": 63750 + }, + { + "epoch": 0.3043276053281181, + "grad_norm": 0.1851562261581421, + "learning_rate": 0.0004107215526006817, + "loss": 2.5539, + "num_input_tokens_seen": 16724782656, + "step": 63800 + }, + { + "epoch": 0.30456610658621225, + "grad_norm": 0.17315496504306793, + "learning_rate": 0.0004052074558041608, + "loss": 2.5544, + "num_input_tokens_seen": 16737889856, + "step": 63850 + }, + { + "epoch": 0.30480460784430635, + "grad_norm": 0.17985352873802185, + "learning_rate": 0.00039970529210836363, + "loss": 2.5511, + "num_input_tokens_seen": 16750997056, + "step": 63900 + }, + { + "epoch": 0.3050431091024005, + "grad_norm": 0.20455212891101837, + "learning_rate": 0.0003942157541613686, + "loss": 2.5593, + "num_input_tokens_seen": 16764104256, + "step": 63950 + }, + { + "epoch": 0.30528161036049467, + "grad_norm": 0.1965632140636444, + "learning_rate": 0.00038873953302184284, + "loss": 2.5599, + "num_input_tokens_seen": 16777211456, + "step": 64000 + }, + { + "epoch": 0.30528161036049467, + "eval_loss": 2.437380790710449, + "eval_runtime": 53.2524, + "eval_samples_per_second": 93.893, + "eval_steps_per_second": 23.473, + "num_input_tokens_seen": 16777211456, + "step": 64000 + }, + { + "epoch": 0.3055201116185888, + "grad_norm": 0.1703004688024521, + "learning_rate": 0.00038327731807204744, + "loss": 2.5506, + "num_input_tokens_seen": 16790318656, + "step": 64050 + }, + { + "epoch": 0.30575861287668293, + "grad_norm": 0.19769616425037384, + "learning_rate": 0.00037782979693105293, + "loss": 2.542, + "num_input_tokens_seen": 16803425856, + "step": 64100 + }, + { + "epoch": 0.30599711413477704, + "grad_norm": 0.20674961805343628, + "learning_rate": 0.00037239765536817873, + "loss": 2.539, + "num_input_tokens_seen": 16816533056, + "step": 64150 + }, + { + "epoch": 0.3062356153928712, + "grad_norm": 0.19121839106082916, + "learning_rate": 0.0003669815772166625, + "loss": 2.5573, + "num_input_tokens_seen": 16829640256, + "step": 64200 + }, + { + "epoch": 0.30647411665096536, + "grad_norm": 0.1734025925397873, + "learning_rate": 0.00036158224428757535, + "loss": 2.5416, + "num_input_tokens_seen": 16842747456, + "step": 64250 + }, + { + "epoch": 0.30671261790905946, + "grad_norm": 0.1857634037733078, + "learning_rate": 0.0003562003362839914, + "loss": 2.5652, + "num_input_tokens_seen": 16855854656, + "step": 64300 + }, + { + "epoch": 0.3069511191671536, + "grad_norm": 0.17733143270015717, + "learning_rate": 0.000350836530715422, + "loss": 2.5299, + "num_input_tokens_seen": 16868961856, + "step": 64350 + }, + { + "epoch": 0.3071896204252477, + "grad_norm": 0.18323005735874176, + "learning_rate": 0.00034549150281252633, + "loss": 2.5691, + "num_input_tokens_seen": 16882069056, + "step": 64400 + }, + { + "epoch": 0.3074281216833419, + "grad_norm": 0.18570365011692047, + "learning_rate": 0.00034016592544210936, + "loss": 2.5436, + "num_input_tokens_seen": 16895176256, + "step": 64450 + }, + { + "epoch": 0.30766662294143604, + "grad_norm": 0.18571798503398895, + "learning_rate": 0.00033486046902241664, + "loss": 2.5382, + "num_input_tokens_seen": 16908283456, + "step": 64500 + }, + { + "epoch": 0.30766662294143604, + "eval_loss": 2.4323015213012695, + "eval_runtime": 53.7237, + "eval_samples_per_second": 93.069, + "eval_steps_per_second": 23.267, + "num_input_tokens_seen": 16908283456, + "step": 64500 + }, + { + "epoch": 0.30790512419953014, + "grad_norm": 0.1829528957605362, + "learning_rate": 0.0003295758014387375, + "loss": 2.5453, + "num_input_tokens_seen": 16921390656, + "step": 64550 + }, + { + "epoch": 0.3081436254576243, + "grad_norm": 0.1703086644411087, + "learning_rate": 0.0003243125879593286, + "loss": 2.5441, + "num_input_tokens_seen": 16934497856, + "step": 64600 + }, + { + "epoch": 0.3083821267157184, + "grad_norm": 0.17826180160045624, + "learning_rate": 0.000319071491151664, + "loss": 2.545, + "num_input_tokens_seen": 16947605056, + "step": 64650 + }, + { + "epoch": 0.30862062797381257, + "grad_norm": 0.17889030277729034, + "learning_rate": 0.00031385317079902743, + "loss": 2.5405, + "num_input_tokens_seen": 16960712256, + "step": 64700 + }, + { + "epoch": 0.30885912923190667, + "grad_norm": 0.1711336225271225, + "learning_rate": 0.0003086582838174551, + "loss": 2.5222, + "num_input_tokens_seen": 16973819456, + "step": 64750 + }, + { + "epoch": 0.30909763049000083, + "grad_norm": 0.17962214350700378, + "learning_rate": 0.0003034874841730382, + "loss": 2.5376, + "num_input_tokens_seen": 16986926656, + "step": 64800 + }, + { + "epoch": 0.309336131748095, + "grad_norm": 0.1699627935886383, + "learning_rate": 0.0002983414227995975, + "loss": 2.5616, + "num_input_tokens_seen": 17000033856, + "step": 64850 + }, + { + "epoch": 0.3095746330061891, + "grad_norm": 0.18442535400390625, + "learning_rate": 0.00029322074751673977, + "loss": 2.5377, + "num_input_tokens_seen": 17013141056, + "step": 64900 + }, + { + "epoch": 0.30981313426428325, + "grad_norm": 0.17972196638584137, + "learning_rate": 0.0002881261029483057, + "loss": 2.5474, + "num_input_tokens_seen": 17026248256, + "step": 64950 + }, + { + "epoch": 0.31005163552237736, + "grad_norm": 0.1810217946767807, + "learning_rate": 0.00028305813044122096, + "loss": 2.5286, + "num_input_tokens_seen": 17039355456, + "step": 65000 + }, + { + "epoch": 0.31005163552237736, + "eval_loss": 2.4292306900024414, + "eval_runtime": 53.3956, + "eval_samples_per_second": 93.641, + "eval_steps_per_second": 23.41, + "num_input_tokens_seen": 17039355456, + "step": 65000 + }, + { + "epoch": 0.3102901367804715, + "grad_norm": 0.17116400599479675, + "learning_rate": 0.000278017467984759, + "loss": 2.5504, + "num_input_tokens_seen": 17052462656, + "step": 65050 + }, + { + "epoch": 0.3105286380385657, + "grad_norm": 0.17055106163024902, + "learning_rate": 0.00027300475013022663, + "loss": 2.543, + "num_input_tokens_seen": 17065569856, + "step": 65100 + }, + { + "epoch": 0.3107671392966598, + "grad_norm": 0.17849299311637878, + "learning_rate": 0.000268020607911083, + "loss": 2.5476, + "num_input_tokens_seen": 17078677056, + "step": 65150 + }, + { + "epoch": 0.31100564055475394, + "grad_norm": 0.17608341574668884, + "learning_rate": 0.0002630656687635007, + "loss": 2.5452, + "num_input_tokens_seen": 17091784256, + "step": 65200 + }, + { + "epoch": 0.31124414181284804, + "grad_norm": 0.19086676836013794, + "learning_rate": 0.0002581405564473801, + "loss": 2.5562, + "num_input_tokens_seen": 17104891456, + "step": 65250 + }, + { + "epoch": 0.3114826430709422, + "grad_norm": 0.1721603125333786, + "learning_rate": 0.00025324589096782657, + "loss": 2.5402, + "num_input_tokens_seen": 17117998656, + "step": 65300 + }, + { + "epoch": 0.31172114432903636, + "grad_norm": 0.16727598011493683, + "learning_rate": 0.00024838228849709997, + "loss": 2.5253, + "num_input_tokens_seen": 17131105856, + "step": 65350 + }, + { + "epoch": 0.31195964558713046, + "grad_norm": 0.1664544939994812, + "learning_rate": 0.000243550361297047, + "loss": 2.5519, + "num_input_tokens_seen": 17144213056, + "step": 65400 + }, + { + "epoch": 0.3121981468452246, + "grad_norm": 0.17195752263069153, + "learning_rate": 0.00023875071764202561, + "loss": 2.5297, + "num_input_tokens_seen": 17157320256, + "step": 65450 + }, + { + "epoch": 0.3124366481033187, + "grad_norm": 0.19001176953315735, + "learning_rate": 0.00023398396174233177, + "loss": 2.5439, + "num_input_tokens_seen": 17170427456, + "step": 65500 + }, + { + "epoch": 0.3124366481033187, + "eval_loss": 2.426327705383301, + "eval_runtime": 53.7603, + "eval_samples_per_second": 93.005, + "eval_steps_per_second": 23.251, + "num_input_tokens_seen": 17170427456, + "step": 65500 + }, + { + "epoch": 0.3126751493614129, + "grad_norm": 0.17215538024902344, + "learning_rate": 0.00022925069366813716, + "loss": 2.5442, + "num_input_tokens_seen": 17183534656, + "step": 65550 + }, + { + "epoch": 0.31291365061950704, + "grad_norm": 0.16736114025115967, + "learning_rate": 0.0002245515092739488, + "loss": 2.5472, + "num_input_tokens_seen": 17196641856, + "step": 65600 + }, + { + "epoch": 0.31315215187760115, + "grad_norm": 0.1739792823791504, + "learning_rate": 0.00021988700012359863, + "loss": 2.5401, + "num_input_tokens_seen": 17209749056, + "step": 65650 + }, + { + "epoch": 0.3133906531356953, + "grad_norm": 0.17363224923610687, + "learning_rate": 0.00021525775341577403, + "loss": 2.5539, + "num_input_tokens_seen": 17222856256, + "step": 65700 + }, + { + "epoch": 0.3136291543937894, + "grad_norm": 0.16787610948085785, + "learning_rate": 0.00021066435191009715, + "loss": 2.5338, + "num_input_tokens_seen": 17235963456, + "step": 65750 + }, + { + "epoch": 0.31386765565188357, + "grad_norm": 0.17158125340938568, + "learning_rate": 0.00020610737385376348, + "loss": 2.5531, + "num_input_tokens_seen": 17249070656, + "step": 65800 + }, + { + "epoch": 0.3141061569099777, + "grad_norm": 0.1693524569272995, + "learning_rate": 0.00020158739290874821, + "loss": 2.5286, + "num_input_tokens_seen": 17262177856, + "step": 65850 + }, + { + "epoch": 0.31434465816807183, + "grad_norm": 0.1730414181947708, + "learning_rate": 0.0001971049780795901, + "loss": 2.5228, + "num_input_tokens_seen": 17275285056, + "step": 65900 + }, + { + "epoch": 0.314583159426166, + "grad_norm": 0.16220349073410034, + "learning_rate": 0.00019266069364176142, + "loss": 2.5445, + "num_input_tokens_seen": 17288392256, + "step": 65950 + }, + { + "epoch": 0.3148216606842601, + "grad_norm": 0.1605050265789032, + "learning_rate": 0.00018825509907063325, + "loss": 2.5491, + "num_input_tokens_seen": 17301499456, + "step": 66000 + }, + { + "epoch": 0.3148216606842601, + "eval_loss": 2.4224469661712646, + "eval_runtime": 53.2989, + "eval_samples_per_second": 93.811, + "eval_steps_per_second": 23.453, + "num_input_tokens_seen": 17301499456, + "step": 66000 + }, + { + "epoch": 0.31506016194235426, + "grad_norm": 0.16586218774318695, + "learning_rate": 0.00018388874897104518, + "loss": 2.5468, + "num_input_tokens_seen": 17314606656, + "step": 66050 + }, + { + "epoch": 0.31529866320044836, + "grad_norm": 0.1646813303232193, + "learning_rate": 0.00017956219300748795, + "loss": 2.5352, + "num_input_tokens_seen": 17327713856, + "step": 66100 + }, + { + "epoch": 0.3155371644585425, + "grad_norm": 0.18712937831878662, + "learning_rate": 0.00017527597583490823, + "loss": 2.5412, + "num_input_tokens_seen": 17340821056, + "step": 66150 + }, + { + "epoch": 0.3157756657166367, + "grad_norm": 0.1631355583667755, + "learning_rate": 0.00017103063703014372, + "loss": 2.5272, + "num_input_tokens_seen": 17353928256, + "step": 66200 + }, + { + "epoch": 0.3160141669747308, + "grad_norm": 0.15910203754901886, + "learning_rate": 0.00016682671102399805, + "loss": 2.5333, + "num_input_tokens_seen": 17367035456, + "step": 66250 + }, + { + "epoch": 0.31625266823282494, + "grad_norm": 0.5742849707603455, + "learning_rate": 0.00016266472703396284, + "loss": 2.5463, + "num_input_tokens_seen": 17380142656, + "step": 66300 + }, + { + "epoch": 0.31649116949091904, + "grad_norm": 0.17517830431461334, + "learning_rate": 0.00015854520899759655, + "loss": 2.5511, + "num_input_tokens_seen": 17393249856, + "step": 66350 + }, + { + "epoch": 0.3167296707490132, + "grad_norm": 0.6962131857872009, + "learning_rate": 0.00015446867550656767, + "loss": 2.5452, + "num_input_tokens_seen": 17406357056, + "step": 66400 + }, + { + "epoch": 0.31696817200710736, + "grad_norm": 0.16677837073802948, + "learning_rate": 0.00015043563974137132, + "loss": 2.5392, + "num_input_tokens_seen": 17419464256, + "step": 66450 + }, + { + "epoch": 0.31720667326520147, + "grad_norm": 0.16235870122909546, + "learning_rate": 0.00014644660940672628, + "loss": 2.5125, + "num_input_tokens_seen": 17432571456, + "step": 66500 + }, + { + "epoch": 0.31720667326520147, + "eval_loss": 2.419802188873291, + "eval_runtime": 52.8641, + "eval_samples_per_second": 94.582, + "eval_steps_per_second": 23.646, + "num_input_tokens_seen": 17432571456, + "step": 66500 + }, + { + "epoch": 0.3174451745232956, + "grad_norm": 0.17308832705020905, + "learning_rate": 0.00014250208666766236, + "loss": 2.5349, + "num_input_tokens_seen": 17445678656, + "step": 66550 + }, + { + "epoch": 0.31768367578138973, + "grad_norm": 0.16299477219581604, + "learning_rate": 0.00013860256808630427, + "loss": 2.5277, + "num_input_tokens_seen": 17458785856, + "step": 66600 + }, + { + "epoch": 0.3179221770394839, + "grad_norm": 0.18277022242546082, + "learning_rate": 0.00013474854455936125, + "loss": 2.5203, + "num_input_tokens_seen": 17471893056, + "step": 66650 + }, + { + "epoch": 0.318160678297578, + "grad_norm": 0.16096614301204681, + "learning_rate": 0.00013094050125632973, + "loss": 2.535, + "num_input_tokens_seen": 17485000256, + "step": 66700 + }, + { + "epoch": 0.31839917955567215, + "grad_norm": 0.1723272204399109, + "learning_rate": 0.0001271789175584172, + "loss": 2.549, + "num_input_tokens_seen": 17498107456, + "step": 66750 + }, + { + "epoch": 0.3186376808137663, + "grad_norm": 0.15782694518566132, + "learning_rate": 0.00012346426699819457, + "loss": 2.5317, + "num_input_tokens_seen": 17511214656, + "step": 66800 + }, + { + "epoch": 0.3188761820718604, + "grad_norm": 0.1627569943666458, + "learning_rate": 0.00011979701719998454, + "loss": 2.5382, + "num_input_tokens_seen": 17524321856, + "step": 66850 + }, + { + "epoch": 0.3191146833299546, + "grad_norm": 0.16340333223342896, + "learning_rate": 0.00011617762982099444, + "loss": 2.5477, + "num_input_tokens_seen": 17537429056, + "step": 66900 + }, + { + "epoch": 0.3193531845880487, + "grad_norm": 0.15788671374320984, + "learning_rate": 0.00011260656049319957, + "loss": 2.537, + "num_input_tokens_seen": 17550536256, + "step": 66950 + }, + { + "epoch": 0.31959168584614284, + "grad_norm": 0.16191193461418152, + "learning_rate": 0.0001090842587659851, + "loss": 2.5394, + "num_input_tokens_seen": 17563643456, + "step": 67000 + }, + { + "epoch": 0.31959168584614284, + "eval_loss": 2.417813301086426, + "eval_runtime": 53.532, + "eval_samples_per_second": 93.402, + "eval_steps_per_second": 23.351, + "num_input_tokens_seen": 17563643456, + "step": 67000 + }, + { + "epoch": 0.319830187104237, + "grad_norm": 0.1690913438796997, + "learning_rate": 0.00010561116804955451, + "loss": 2.5364, + "num_input_tokens_seen": 17576750656, + "step": 67050 + }, + { + "epoch": 0.3200686883623311, + "grad_norm": 0.16436229646205902, + "learning_rate": 0.00010218772555910954, + "loss": 2.5298, + "num_input_tokens_seen": 17589857856, + "step": 67100 + }, + { + "epoch": 0.32030718962042526, + "grad_norm": 0.15499907732009888, + "learning_rate": 9.881436225981105e-05, + "loss": 2.5484, + "num_input_tokens_seen": 17602965056, + "step": 67150 + }, + { + "epoch": 0.32054569087851936, + "grad_norm": 0.16237874329090118, + "learning_rate": 9.549150281252633e-05, + "loss": 2.5271, + "num_input_tokens_seen": 17616072256, + "step": 67200 + }, + { + "epoch": 0.3207841921366135, + "grad_norm": 0.16813968122005463, + "learning_rate": 9.221956552036992e-05, + "loss": 2.5295, + "num_input_tokens_seen": 17629179456, + "step": 67250 + }, + { + "epoch": 0.3210226933947077, + "grad_norm": 0.15672080218791962, + "learning_rate": 8.899896227604509e-05, + "loss": 2.528, + "num_input_tokens_seen": 17642286656, + "step": 67300 + }, + { + "epoch": 0.3212611946528018, + "grad_norm": 0.16523708403110504, + "learning_rate": 8.58300985099918e-05, + "loss": 2.5288, + "num_input_tokens_seen": 17655393856, + "step": 67350 + }, + { + "epoch": 0.32149969591089594, + "grad_norm": 0.16759687662124634, + "learning_rate": 8.271337313934868e-05, + "loss": 2.5431, + "num_input_tokens_seen": 17668501056, + "step": 67400 + }, + { + "epoch": 0.32173819716899005, + "grad_norm": 0.15507538616657257, + "learning_rate": 7.964917851773496e-05, + "loss": 2.5342, + "num_input_tokens_seen": 17681608256, + "step": 67450 + }, + { + "epoch": 0.3219766984270842, + "grad_norm": 0.1556961089372635, + "learning_rate": 7.663790038585794e-05, + "loss": 2.5189, + "num_input_tokens_seen": 17694715456, + "step": 67500 + }, + { + "epoch": 0.3219766984270842, + "eval_loss": 2.415555000305176, + "eval_runtime": 53.2935, + "eval_samples_per_second": 93.82, + "eval_steps_per_second": 23.455, + "num_input_tokens_seen": 17694715456, + "step": 67500 + }, + { + "epoch": 0.32221519968517837, + "grad_norm": 0.16804397106170654, + "learning_rate": 7.367991782295391e-05, + "loss": 2.5218, + "num_input_tokens_seen": 17707822656, + "step": 67550 + }, + { + "epoch": 0.32245370094327247, + "grad_norm": 0.15728074312210083, + "learning_rate": 7.077560319906695e-05, + "loss": 2.5261, + "num_input_tokens_seen": 17720929856, + "step": 67600 + }, + { + "epoch": 0.32269220220136663, + "grad_norm": 0.1641319841146469, + "learning_rate": 6.792532212817271e-05, + "loss": 2.5398, + "num_input_tokens_seen": 17734037056, + "step": 67650 + }, + { + "epoch": 0.32293070345946073, + "grad_norm": 0.1575596034526825, + "learning_rate": 6.512943342215233e-05, + "loss": 2.5211, + "num_input_tokens_seen": 17747144256, + "step": 67700 + }, + { + "epoch": 0.3231692047175549, + "grad_norm": 0.16352206468582153, + "learning_rate": 6.238828904562316e-05, + "loss": 2.5143, + "num_input_tokens_seen": 17760251456, + "step": 67750 + }, + { + "epoch": 0.323407705975649, + "grad_norm": 0.16303551197052002, + "learning_rate": 5.9702234071631e-05, + "loss": 2.5262, + "num_input_tokens_seen": 17773358656, + "step": 67800 + }, + { + "epoch": 0.32364620723374316, + "grad_norm": 0.15572308003902435, + "learning_rate": 5.7071606638210094e-05, + "loss": 2.5278, + "num_input_tokens_seen": 17786465856, + "step": 67850 + }, + { + "epoch": 0.3238847084918373, + "grad_norm": 0.15960544347763062, + "learning_rate": 5.449673790581611e-05, + "loss": 2.522, + "num_input_tokens_seen": 17799573056, + "step": 67900 + }, + { + "epoch": 0.3241232097499314, + "grad_norm": 0.15617695450782776, + "learning_rate": 5.197795201563743e-05, + "loss": 2.5151, + "num_input_tokens_seen": 17812680256, + "step": 67950 + }, + { + "epoch": 0.3243617110080256, + "grad_norm": 0.1527390033006668, + "learning_rate": 4.9515566048790485e-05, + "loss": 2.5213, + "num_input_tokens_seen": 17825787456, + "step": 68000 + }, + { + "epoch": 0.3243617110080256, + "eval_loss": 2.4139962196350098, + "eval_runtime": 53.933, + "eval_samples_per_second": 92.708, + "eval_steps_per_second": 23.177, + "num_input_tokens_seen": 17825787456, + "step": 68000 + }, + { + "epoch": 0.3246002122661197, + "grad_norm": 0.15067367255687714, + "learning_rate": 4.7109889986402973e-05, + "loss": 2.5181, + "num_input_tokens_seen": 17838894656, + "step": 68050 + }, + { + "epoch": 0.32483871352421384, + "grad_norm": 0.1534261703491211, + "learning_rate": 4.476122667059207e-05, + "loss": 2.533, + "num_input_tokens_seen": 17852001856, + "step": 68100 + }, + { + "epoch": 0.325077214782308, + "grad_norm": 0.1585472822189331, + "learning_rate": 4.2469871766340095e-05, + "loss": 2.509, + "num_input_tokens_seen": 17865109056, + "step": 68150 + }, + { + "epoch": 0.3253157160404021, + "grad_norm": 0.15480853617191315, + "learning_rate": 4.0236113724274713e-05, + "loss": 2.524, + "num_input_tokens_seen": 17878216256, + "step": 68200 + }, + { + "epoch": 0.32555421729849626, + "grad_norm": 0.24341611564159393, + "learning_rate": 3.806023374435663e-05, + "loss": 2.5293, + "num_input_tokens_seen": 17891323456, + "step": 68250 + }, + { + "epoch": 0.32579271855659037, + "grad_norm": 0.15290473401546478, + "learning_rate": 3.594250574048058e-05, + "loss": 2.5149, + "num_input_tokens_seen": 17904430656, + "step": 68300 + }, + { + "epoch": 0.3260312198146845, + "grad_norm": 0.1606835126876831, + "learning_rate": 3.3883196305992905e-05, + "loss": 2.5327, + "num_input_tokens_seen": 17917537856, + "step": 68350 + }, + { + "epoch": 0.3262697210727787, + "grad_norm": 0.1537574976682663, + "learning_rate": 3.18825646801314e-05, + "loss": 2.5416, + "num_input_tokens_seen": 17930645056, + "step": 68400 + }, + { + "epoch": 0.3265082223308728, + "grad_norm": 0.16943201422691345, + "learning_rate": 2.994086271539048e-05, + "loss": 2.5233, + "num_input_tokens_seen": 17943752256, + "step": 68450 + }, + { + "epoch": 0.32674672358896695, + "grad_norm": 0.15832561254501343, + "learning_rate": 2.8058334845816213e-05, + "loss": 2.5439, + "num_input_tokens_seen": 17956859456, + "step": 68500 + }, + { + "epoch": 0.32674672358896695, + "eval_loss": 2.4128847122192383, + "eval_runtime": 53.1054, + "eval_samples_per_second": 94.152, + "eval_steps_per_second": 23.538, + "num_input_tokens_seen": 17956859456, + "step": 68500 + }, + { + "epoch": 0.32698522484706105, + "grad_norm": 0.15245509147644043, + "learning_rate": 2.6235218056235634e-05, + "loss": 2.5209, + "num_input_tokens_seen": 17969966656, + "step": 68550 + }, + { + "epoch": 0.3272237261051552, + "grad_norm": 0.15148235857486725, + "learning_rate": 2.4471741852423235e-05, + "loss": 2.5284, + "num_input_tokens_seen": 17983073856, + "step": 68600 + }, + { + "epoch": 0.3274622273632493, + "grad_norm": 0.15678688883781433, + "learning_rate": 2.276812823220964e-05, + "loss": 2.537, + "num_input_tokens_seen": 17996181056, + "step": 68650 + }, + { + "epoch": 0.3277007286213435, + "grad_norm": 0.15105360746383667, + "learning_rate": 2.1124591657534777e-05, + "loss": 2.5321, + "num_input_tokens_seen": 18009288256, + "step": 68700 + }, + { + "epoch": 0.32793922987943763, + "grad_norm": 0.15369552373886108, + "learning_rate": 1.9541339027450256e-05, + "loss": 2.5291, + "num_input_tokens_seen": 18022395456, + "step": 68750 + }, + { + "epoch": 0.32817773113753174, + "grad_norm": 0.1551530808210373, + "learning_rate": 1.801856965207338e-05, + "loss": 2.5201, + "num_input_tokens_seen": 18035502656, + "step": 68800 + }, + { + "epoch": 0.3284162323956259, + "grad_norm": 0.14859162271022797, + "learning_rate": 1.6556475227496815e-05, + "loss": 2.5436, + "num_input_tokens_seen": 18048609856, + "step": 68850 + }, + { + "epoch": 0.32865473365372, + "grad_norm": 0.14972691237926483, + "learning_rate": 1.5155239811656562e-05, + "loss": 2.5221, + "num_input_tokens_seen": 18061717056, + "step": 68900 + }, + { + "epoch": 0.32889323491181416, + "grad_norm": 0.156805619597435, + "learning_rate": 1.3815039801161721e-05, + "loss": 2.5248, + "num_input_tokens_seen": 18074824256, + "step": 68950 + }, + { + "epoch": 0.3291317361699083, + "grad_norm": 0.148334801197052, + "learning_rate": 1.2536043909088191e-05, + "loss": 2.5361, + "num_input_tokens_seen": 18087931456, + "step": 69000 + }, + { + "epoch": 0.3291317361699083, + "eval_loss": 2.4120428562164307, + "eval_runtime": 52.9258, + "eval_samples_per_second": 94.472, + "eval_steps_per_second": 23.618, + "num_input_tokens_seen": 18087931456, + "step": 69000 + }, + { + "epoch": 0.3293702374280024, + "grad_norm": 0.14565595984458923, + "learning_rate": 1.1318413143740436e-05, + "loss": 2.5358, + "num_input_tokens_seen": 18101038656, + "step": 69050 + }, + { + "epoch": 0.3296087386860966, + "grad_norm": 0.15810008347034454, + "learning_rate": 1.0162300788382261e-05, + "loss": 2.5288, + "num_input_tokens_seen": 18114145856, + "step": 69100 + }, + { + "epoch": 0.3298472399441907, + "grad_norm": 0.14960281550884247, + "learning_rate": 9.0678523819408e-06, + "loss": 2.5267, + "num_input_tokens_seen": 18127253056, + "step": 69150 + }, + { + "epoch": 0.33008574120228484, + "grad_norm": 0.14473624527454376, + "learning_rate": 8.035205700685167e-06, + "loss": 2.5133, + "num_input_tokens_seen": 18140360256, + "step": 69200 + }, + { + "epoch": 0.330324242460379, + "grad_norm": 0.1450708657503128, + "learning_rate": 7.064490740882057e-06, + "loss": 2.5302, + "num_input_tokens_seen": 18153467456, + "step": 69250 + }, + { + "epoch": 0.3305627437184731, + "grad_norm": 0.14883211255073547, + "learning_rate": 6.15582970243117e-06, + "loss": 2.5307, + "num_input_tokens_seen": 18166574656, + "step": 69300 + }, + { + "epoch": 0.33080124497656727, + "grad_norm": 0.15696081519126892, + "learning_rate": 5.309336973481682e-06, + "loss": 2.5341, + "num_input_tokens_seen": 18179681856, + "step": 69350 + }, + { + "epoch": 0.33103974623466137, + "grad_norm": 0.1564367264509201, + "learning_rate": 4.52511911603265e-06, + "loss": 2.5299, + "num_input_tokens_seen": 18192789056, + "step": 69400 + }, + { + "epoch": 0.33127824749275553, + "grad_norm": 0.15558916330337524, + "learning_rate": 3.803274852517968e-06, + "loss": 2.5197, + "num_input_tokens_seen": 18205896256, + "step": 69450 + }, + { + "epoch": 0.3315167487508497, + "grad_norm": 0.1532556265592575, + "learning_rate": 3.143895053378698e-06, + "loss": 2.5176, + "num_input_tokens_seen": 18219003456, + "step": 69500 + }, + { + "epoch": 0.3315167487508497, + "eval_loss": 2.412046194076538, + "eval_runtime": 53.2476, + "eval_samples_per_second": 93.901, + "eval_steps_per_second": 23.475, + "num_input_tokens_seen": 18219003456, + "step": 69500 + }, + { + "epoch": 0.3317552500089438, + "grad_norm": 0.1502823829650879, + "learning_rate": 2.547062725623828e-06, + "loss": 2.5207, + "num_input_tokens_seen": 18232110656, + "step": 69550 + }, + { + "epoch": 0.33199375126703795, + "grad_norm": 0.1560440957546234, + "learning_rate": 2.012853002380466e-06, + "loss": 2.5078, + "num_input_tokens_seen": 18245217856, + "step": 69600 + }, + { + "epoch": 0.33223225252513205, + "grad_norm": 0.15284490585327148, + "learning_rate": 1.541333133436018e-06, + "loss": 2.5404, + "num_input_tokens_seen": 18258325056, + "step": 69650 + }, + { + "epoch": 0.3324707537832262, + "grad_norm": 0.14594900608062744, + "learning_rate": 1.132562476771959e-06, + "loss": 2.5267, + "num_input_tokens_seen": 18271432256, + "step": 69700 + }, + { + "epoch": 0.3327092550413203, + "grad_norm": 0.15198394656181335, + "learning_rate": 7.865924910916978e-07, + "loss": 2.5232, + "num_input_tokens_seen": 18284539456, + "step": 69750 + }, + { + "epoch": 0.3329477562994145, + "grad_norm": 0.15011271834373474, + "learning_rate": 5.034667293427053e-07, + "loss": 2.5308, + "num_input_tokens_seen": 18297646656, + "step": 69800 + }, + { + "epoch": 0.33318625755750864, + "grad_norm": 0.147654727101326, + "learning_rate": 2.8322083323334415e-07, + "loss": 2.5281, + "num_input_tokens_seen": 18310753856, + "step": 69850 + }, + { + "epoch": 0.33342475881560274, + "grad_norm": 0.15056386590003967, + "learning_rate": 1.2588252874673466e-07, + "loss": 2.5112, + "num_input_tokens_seen": 18323861056, + "step": 69900 + }, + { + "epoch": 0.3336632600736969, + "grad_norm": 0.14858213067054749, + "learning_rate": 3.147162264971471e-08, + "loss": 2.5226, + "num_input_tokens_seen": 18336968256, + "step": 69950 + }, + { + "epoch": 0.333901761331791, + "grad_norm": 0.1534891128540039, + "learning_rate": 0.0, + "loss": 2.5303, + "num_input_tokens_seen": 18350075456, + "step": 70000 + }, + { + "epoch": 0.333901761331791, + "eval_loss": 2.411842107772827, + "eval_runtime": 53.9812, + "eval_samples_per_second": 92.625, + "eval_steps_per_second": 23.156, + "num_input_tokens_seen": 18350075456, + "step": 70000 + }, + { + "epoch": 0.333901761331791, + "num_input_tokens_seen": 18350075456, + "step": 70000, + "total_flos": 4.908824281216451e+18, + "train_loss": 2.719220863015311, + "train_runtime": 98058.4226, + "train_samples_per_second": 182.748, + "train_steps_per_second": 0.714, + "train_tokens_per_second": 187134.155 + } + ], + "logging_steps": 50, + "max_steps": 70000, + "num_input_tokens_seen": 18350075456, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.908824281216451e+18, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +}