{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.333901761331791, "eval_steps": 500, "global_step": 70000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00023850125809413643, "grad_norm": 1.5476292371749878, "learning_rate": 0.0001, "loss": 9.8285, "num_input_tokens_seen": 13107200, "step": 50 }, { "epoch": 0.00047700251618827287, "grad_norm": 0.47720426321029663, "learning_rate": 0.0002, "loss": 7.964, "num_input_tokens_seen": 26214400, "step": 100 }, { "epoch": 0.0007155037742824094, "grad_norm": 0.8443030714988708, "learning_rate": 0.0003, "loss": 7.0452, "num_input_tokens_seen": 39321600, "step": 150 }, { "epoch": 0.0009540050323765457, "grad_norm": 0.5895723104476929, "learning_rate": 0.0004, "loss": 6.355, "num_input_tokens_seen": 52428800, "step": 200 }, { "epoch": 0.0011925062904706823, "grad_norm": 0.8343789577484131, "learning_rate": 0.0005, "loss": 5.8716, "num_input_tokens_seen": 65536000, "step": 250 }, { "epoch": 0.0014310075485648188, "grad_norm": 0.5747953057289124, "learning_rate": 0.0006, "loss": 5.5086, "num_input_tokens_seen": 78643200, "step": 300 }, { "epoch": 0.0016695088066589552, "grad_norm": 0.8383421301841736, "learning_rate": 0.0007, "loss": 5.2217, "num_input_tokens_seen": 91750400, "step": 350 }, { "epoch": 0.0019080100647530915, "grad_norm": 0.5696113109588623, "learning_rate": 0.0008, "loss": 4.9683, "num_input_tokens_seen": 104857600, "step": 400 }, { "epoch": 0.002146511322847228, "grad_norm": 0.5431691408157349, "learning_rate": 0.0009000000000000001, "loss": 4.7629, "num_input_tokens_seen": 117964800, "step": 450 }, { "epoch": 0.0023850125809413646, "grad_norm": 0.4571855664253235, "learning_rate": 0.001, "loss": 4.5284, "num_input_tokens_seen": 131072000, "step": 500 }, { "epoch": 0.0023850125809413646, "eval_loss": 4.309167385101318, "eval_runtime": 53.3891, "eval_samples_per_second": 93.652, "eval_steps_per_second": 23.413, "num_input_tokens_seen": 131072000, "step": 500 }, { "epoch": 0.002623513839035501, "grad_norm": 0.43464773893356323, "learning_rate": 0.001, "loss": 4.3379, "num_input_tokens_seen": 144179200, "step": 550 }, { "epoch": 0.0028620150971296375, "grad_norm": 0.49611660838127136, "learning_rate": 0.001, "loss": 4.1712, "num_input_tokens_seen": 157286400, "step": 600 }, { "epoch": 0.0031005163552237738, "grad_norm": 0.4060957729816437, "learning_rate": 0.001, "loss": 4.0436, "num_input_tokens_seen": 170393600, "step": 650 }, { "epoch": 0.0033390176133179105, "grad_norm": 0.37300577759742737, "learning_rate": 0.001, "loss": 3.9582, "num_input_tokens_seen": 183500800, "step": 700 }, { "epoch": 0.0035775188714120467, "grad_norm": 0.4117021858692169, "learning_rate": 0.001, "loss": 3.8674, "num_input_tokens_seen": 196608000, "step": 750 }, { "epoch": 0.003816020129506183, "grad_norm": 0.3335980772972107, "learning_rate": 0.001, "loss": 3.8031, "num_input_tokens_seen": 209715200, "step": 800 }, { "epoch": 0.004054521387600319, "grad_norm": 0.35943159461021423, "learning_rate": 0.001, "loss": 3.7534, "num_input_tokens_seen": 222822400, "step": 850 }, { "epoch": 0.004293022645694456, "grad_norm": 0.40000948309898376, "learning_rate": 0.001, "loss": 3.6867, "num_input_tokens_seen": 235929600, "step": 900 }, { "epoch": 0.0045315239037885926, "grad_norm": 0.3165877163410187, "learning_rate": 0.001, "loss": 3.6565, "num_input_tokens_seen": 249036800, "step": 950 }, { "epoch": 0.004770025161882729, "grad_norm": 0.3687070906162262, "learning_rate": 0.001, "loss": 3.6005, "num_input_tokens_seen": 262144000, "step": 1000 }, { "epoch": 0.004770025161882729, "eval_loss": 3.4853296279907227, "eval_runtime": 52.477, "eval_samples_per_second": 95.28, "eval_steps_per_second": 23.82, "num_input_tokens_seen": 262144000, "step": 1000 }, { "epoch": 0.005008526419976865, "grad_norm": 0.32389721274375916, "learning_rate": 0.001, "loss": 3.5663, "num_input_tokens_seen": 275251200, "step": 1050 }, { "epoch": 0.005247027678071002, "grad_norm": 0.3202049434185028, "learning_rate": 0.001, "loss": 3.5376, "num_input_tokens_seen": 288358400, "step": 1100 }, { "epoch": 0.005485528936165138, "grad_norm": 0.30287981033325195, "learning_rate": 0.001, "loss": 3.5135, "num_input_tokens_seen": 301465600, "step": 1150 }, { "epoch": 0.005724030194259275, "grad_norm": 0.3624540865421295, "learning_rate": 0.001, "loss": 3.4814, "num_input_tokens_seen": 314572800, "step": 1200 }, { "epoch": 0.005962531452353411, "grad_norm": 0.30017992854118347, "learning_rate": 0.001, "loss": 3.4476, "num_input_tokens_seen": 327680000, "step": 1250 }, { "epoch": 0.0062010327104475476, "grad_norm": 0.3169330060482025, "learning_rate": 0.001, "loss": 3.4179, "num_input_tokens_seen": 340787200, "step": 1300 }, { "epoch": 0.006439533968541684, "grad_norm": 0.2730589210987091, "learning_rate": 0.001, "loss": 3.4074, "num_input_tokens_seen": 353894400, "step": 1350 }, { "epoch": 0.006678035226635821, "grad_norm": 0.2927146553993225, "learning_rate": 0.001, "loss": 3.3757, "num_input_tokens_seen": 367001600, "step": 1400 }, { "epoch": 0.006916536484729957, "grad_norm": 0.34230080246925354, "learning_rate": 0.001, "loss": 3.3502, "num_input_tokens_seen": 380108800, "step": 1450 }, { "epoch": 0.007155037742824093, "grad_norm": 0.30472344160079956, "learning_rate": 0.001, "loss": 3.3639, "num_input_tokens_seen": 393216000, "step": 1500 }, { "epoch": 0.007155037742824093, "eval_loss": 3.2486374378204346, "eval_runtime": 52.482, "eval_samples_per_second": 95.271, "eval_steps_per_second": 23.818, "num_input_tokens_seen": 393216000, "step": 1500 }, { "epoch": 0.00739353900091823, "grad_norm": 0.26124337315559387, "learning_rate": 0.001, "loss": 3.3521, "num_input_tokens_seen": 406323200, "step": 1550 }, { "epoch": 0.007632040259012366, "grad_norm": 0.29117754101753235, "learning_rate": 0.001, "loss": 3.315, "num_input_tokens_seen": 419430400, "step": 1600 }, { "epoch": 0.007870541517106503, "grad_norm": 0.24080802500247955, "learning_rate": 0.001, "loss": 3.3103, "num_input_tokens_seen": 432537600, "step": 1650 }, { "epoch": 0.008109042775200638, "grad_norm": 0.29982003569602966, "learning_rate": 0.001, "loss": 3.2926, "num_input_tokens_seen": 445644800, "step": 1700 }, { "epoch": 0.008347544033294775, "grad_norm": 0.26795274019241333, "learning_rate": 0.001, "loss": 3.2843, "num_input_tokens_seen": 458752000, "step": 1750 }, { "epoch": 0.008586045291388912, "grad_norm": 0.252774715423584, "learning_rate": 0.001, "loss": 3.274, "num_input_tokens_seen": 471859200, "step": 1800 }, { "epoch": 0.008824546549483048, "grad_norm": 0.25432145595550537, "learning_rate": 0.001, "loss": 3.2533, "num_input_tokens_seen": 484966400, "step": 1850 }, { "epoch": 0.009063047807577185, "grad_norm": 0.25918108224868774, "learning_rate": 0.001, "loss": 3.2501, "num_input_tokens_seen": 498073600, "step": 1900 }, { "epoch": 0.009301549065671322, "grad_norm": 0.2482348382472992, "learning_rate": 0.001, "loss": 3.2541, "num_input_tokens_seen": 511180800, "step": 1950 }, { "epoch": 0.009540050323765458, "grad_norm": 0.2615273594856262, "learning_rate": 0.001, "loss": 3.2218, "num_input_tokens_seen": 524288000, "step": 2000 }, { "epoch": 0.009540050323765458, "eval_loss": 3.1193039417266846, "eval_runtime": 52.7955, "eval_samples_per_second": 94.705, "eval_steps_per_second": 23.676, "num_input_tokens_seen": 524288000, "step": 2000 }, { "epoch": 0.009778551581859595, "grad_norm": 0.2637729048728943, "learning_rate": 0.001, "loss": 3.2285, "num_input_tokens_seen": 537395200, "step": 2050 }, { "epoch": 0.01001705283995373, "grad_norm": 0.23936080932617188, "learning_rate": 0.001, "loss": 3.2119, "num_input_tokens_seen": 550502400, "step": 2100 }, { "epoch": 0.010255554098047867, "grad_norm": 0.2469020038843155, "learning_rate": 0.001, "loss": 3.2021, "num_input_tokens_seen": 563609600, "step": 2150 }, { "epoch": 0.010494055356142003, "grad_norm": 0.2304004430770874, "learning_rate": 0.001, "loss": 3.1874, "num_input_tokens_seen": 576716800, "step": 2200 }, { "epoch": 0.01073255661423614, "grad_norm": 0.232864648103714, "learning_rate": 0.001, "loss": 3.1897, "num_input_tokens_seen": 589824000, "step": 2250 }, { "epoch": 0.010971057872330277, "grad_norm": 0.23161470890045166, "learning_rate": 0.001, "loss": 3.1689, "num_input_tokens_seen": 602931200, "step": 2300 }, { "epoch": 0.011209559130424413, "grad_norm": 0.20868408679962158, "learning_rate": 0.001, "loss": 3.1615, "num_input_tokens_seen": 616038400, "step": 2350 }, { "epoch": 0.01144806038851855, "grad_norm": 0.23374608159065247, "learning_rate": 0.001, "loss": 3.1556, "num_input_tokens_seen": 629145600, "step": 2400 }, { "epoch": 0.011686561646612685, "grad_norm": 0.21716611087322235, "learning_rate": 0.001, "loss": 3.1463, "num_input_tokens_seen": 642252800, "step": 2450 }, { "epoch": 0.011925062904706822, "grad_norm": 0.23689670860767365, "learning_rate": 0.001, "loss": 3.1433, "num_input_tokens_seen": 655360000, "step": 2500 }, { "epoch": 0.011925062904706822, "eval_loss": 3.040046215057373, "eval_runtime": 52.9109, "eval_samples_per_second": 94.498, "eval_steps_per_second": 23.625, "num_input_tokens_seen": 655360000, "step": 2500 }, { "epoch": 0.012163564162800958, "grad_norm": 0.2245575189590454, "learning_rate": 0.001, "loss": 3.1445, "num_input_tokens_seen": 668467200, "step": 2550 }, { "epoch": 0.012402065420895095, "grad_norm": 0.20992259681224823, "learning_rate": 0.001, "loss": 3.1447, "num_input_tokens_seen": 681574400, "step": 2600 }, { "epoch": 0.012640566678989232, "grad_norm": 0.21792201697826385, "learning_rate": 0.001, "loss": 3.1323, "num_input_tokens_seen": 694681600, "step": 2650 }, { "epoch": 0.012879067937083368, "grad_norm": 0.243458554148674, "learning_rate": 0.001, "loss": 3.1084, "num_input_tokens_seen": 707788800, "step": 2700 }, { "epoch": 0.013117569195177505, "grad_norm": 0.21190515160560608, "learning_rate": 0.001, "loss": 3.1202, "num_input_tokens_seen": 720896000, "step": 2750 }, { "epoch": 0.013356070453271642, "grad_norm": 0.2461613118648529, "learning_rate": 0.001, "loss": 3.1007, "num_input_tokens_seen": 734003200, "step": 2800 }, { "epoch": 0.013594571711365777, "grad_norm": 0.1976248323917389, "learning_rate": 0.001, "loss": 3.1079, "num_input_tokens_seen": 747110400, "step": 2850 }, { "epoch": 0.013833072969459913, "grad_norm": 0.22097842395305634, "learning_rate": 0.001, "loss": 3.0846, "num_input_tokens_seen": 760217600, "step": 2900 }, { "epoch": 0.01407157422755405, "grad_norm": 0.20581132173538208, "learning_rate": 0.001, "loss": 3.0995, "num_input_tokens_seen": 773324800, "step": 2950 }, { "epoch": 0.014310075485648187, "grad_norm": 0.19790051877498627, "learning_rate": 0.001, "loss": 3.0977, "num_input_tokens_seen": 786432000, "step": 3000 }, { "epoch": 0.014310075485648187, "eval_loss": 2.9804909229278564, "eval_runtime": 53.1278, "eval_samples_per_second": 94.113, "eval_steps_per_second": 23.528, "num_input_tokens_seen": 786432000, "step": 3000 }, { "epoch": 0.014548576743742323, "grad_norm": 0.20328116416931152, "learning_rate": 0.001, "loss": 3.0872, "num_input_tokens_seen": 799539200, "step": 3050 }, { "epoch": 0.01478707800183646, "grad_norm": 0.21318025887012482, "learning_rate": 0.001, "loss": 3.0861, "num_input_tokens_seen": 812646400, "step": 3100 }, { "epoch": 0.015025579259930597, "grad_norm": 0.22170069813728333, "learning_rate": 0.001, "loss": 3.0618, "num_input_tokens_seen": 825753600, "step": 3150 }, { "epoch": 0.015264080518024732, "grad_norm": 0.21292312443256378, "learning_rate": 0.001, "loss": 3.0567, "num_input_tokens_seen": 838860800, "step": 3200 }, { "epoch": 0.015502581776118868, "grad_norm": 0.2331959754228592, "learning_rate": 0.001, "loss": 3.0714, "num_input_tokens_seen": 851968000, "step": 3250 }, { "epoch": 0.015741083034213007, "grad_norm": 0.19236011803150177, "learning_rate": 0.001, "loss": 3.059, "num_input_tokens_seen": 865075200, "step": 3300 }, { "epoch": 0.015979584292307142, "grad_norm": 0.19991376996040344, "learning_rate": 0.001, "loss": 3.0542, "num_input_tokens_seen": 878182400, "step": 3350 }, { "epoch": 0.016218085550401277, "grad_norm": 0.2042934149503708, "learning_rate": 0.001, "loss": 3.0517, "num_input_tokens_seen": 891289600, "step": 3400 }, { "epoch": 0.016456586808495415, "grad_norm": 0.19254428148269653, "learning_rate": 0.001, "loss": 3.0415, "num_input_tokens_seen": 904396800, "step": 3450 }, { "epoch": 0.01669508806658955, "grad_norm": 0.19211998581886292, "learning_rate": 0.001, "loss": 3.0253, "num_input_tokens_seen": 917504000, "step": 3500 }, { "epoch": 0.01669508806658955, "eval_loss": 2.937037944793701, "eval_runtime": 52.6773, "eval_samples_per_second": 94.918, "eval_steps_per_second": 23.729, "num_input_tokens_seen": 917504000, "step": 3500 }, { "epoch": 0.01693358932468369, "grad_norm": 0.19596482813358307, "learning_rate": 0.001, "loss": 3.053, "num_input_tokens_seen": 930611200, "step": 3550 }, { "epoch": 0.017172090582777823, "grad_norm": 0.20214103162288666, "learning_rate": 0.001, "loss": 3.0385, "num_input_tokens_seen": 943718400, "step": 3600 }, { "epoch": 0.017410591840871962, "grad_norm": 0.18580283224582672, "learning_rate": 0.001, "loss": 3.0354, "num_input_tokens_seen": 956825600, "step": 3650 }, { "epoch": 0.017649093098966097, "grad_norm": 0.18928515911102295, "learning_rate": 0.001, "loss": 3.0292, "num_input_tokens_seen": 969932800, "step": 3700 }, { "epoch": 0.017887594357060232, "grad_norm": 0.19066137075424194, "learning_rate": 0.001, "loss": 3.0206, "num_input_tokens_seen": 983040000, "step": 3750 }, { "epoch": 0.01812609561515437, "grad_norm": 0.20291416347026825, "learning_rate": 0.001, "loss": 3.0254, "num_input_tokens_seen": 996147200, "step": 3800 }, { "epoch": 0.018364596873248505, "grad_norm": 0.19991491734981537, "learning_rate": 0.001, "loss": 3.0212, "num_input_tokens_seen": 1009254400, "step": 3850 }, { "epoch": 0.018603098131342644, "grad_norm": 0.19553051888942719, "learning_rate": 0.001, "loss": 3.0229, "num_input_tokens_seen": 1022361600, "step": 3900 }, { "epoch": 0.01884159938943678, "grad_norm": 0.19302095472812653, "learning_rate": 0.001, "loss": 3.0137, "num_input_tokens_seen": 1035468800, "step": 3950 }, { "epoch": 0.019080100647530917, "grad_norm": 0.18680201470851898, "learning_rate": 0.001, "loss": 3.0106, "num_input_tokens_seen": 1048576000, "step": 4000 }, { "epoch": 0.019080100647530917, "eval_loss": 2.8984477519989014, "eval_runtime": 52.851, "eval_samples_per_second": 94.606, "eval_steps_per_second": 23.651, "num_input_tokens_seen": 1048576000, "step": 4000 }, { "epoch": 0.019318601905625052, "grad_norm": 0.18222174048423767, "learning_rate": 0.001, "loss": 3.0095, "num_input_tokens_seen": 1061683200, "step": 4050 }, { "epoch": 0.01955710316371919, "grad_norm": 0.1929137110710144, "learning_rate": 0.001, "loss": 3.0022, "num_input_tokens_seen": 1074790400, "step": 4100 }, { "epoch": 0.019795604421813325, "grad_norm": 0.19358602166175842, "learning_rate": 0.001, "loss": 2.9978, "num_input_tokens_seen": 1087897600, "step": 4150 }, { "epoch": 0.02003410567990746, "grad_norm": 0.19070614874362946, "learning_rate": 0.001, "loss": 3.0016, "num_input_tokens_seen": 1101004800, "step": 4200 }, { "epoch": 0.0202726069380016, "grad_norm": 0.17888160049915314, "learning_rate": 0.001, "loss": 2.9984, "num_input_tokens_seen": 1114112000, "step": 4250 }, { "epoch": 0.020511108196095734, "grad_norm": 0.1823708564043045, "learning_rate": 0.001, "loss": 3.004, "num_input_tokens_seen": 1127219200, "step": 4300 }, { "epoch": 0.020749609454189872, "grad_norm": 0.1753600388765335, "learning_rate": 0.001, "loss": 2.9814, "num_input_tokens_seen": 1140326400, "step": 4350 }, { "epoch": 0.020988110712284007, "grad_norm": 0.1710510551929474, "learning_rate": 0.001, "loss": 2.9597, "num_input_tokens_seen": 1153433600, "step": 4400 }, { "epoch": 0.021226611970378145, "grad_norm": 0.18727277219295502, "learning_rate": 0.001, "loss": 2.9695, "num_input_tokens_seen": 1166540800, "step": 4450 }, { "epoch": 0.02146511322847228, "grad_norm": 0.17773132026195526, "learning_rate": 0.001, "loss": 2.9664, "num_input_tokens_seen": 1179648000, "step": 4500 }, { "epoch": 0.02146511322847228, "eval_loss": 2.871137857437134, "eval_runtime": 51.4876, "eval_samples_per_second": 97.111, "eval_steps_per_second": 24.278, "num_input_tokens_seen": 1179648000, "step": 4500 }, { "epoch": 0.021703614486566415, "grad_norm": 0.1875799000263214, "learning_rate": 0.001, "loss": 2.9682, "num_input_tokens_seen": 1192755200, "step": 4550 }, { "epoch": 0.021942115744660554, "grad_norm": 0.18222226202487946, "learning_rate": 0.001, "loss": 2.9484, "num_input_tokens_seen": 1205862400, "step": 4600 }, { "epoch": 0.02218061700275469, "grad_norm": 0.191411092877388, "learning_rate": 0.001, "loss": 2.9637, "num_input_tokens_seen": 1218969600, "step": 4650 }, { "epoch": 0.022419118260848827, "grad_norm": 0.17608201503753662, "learning_rate": 0.001, "loss": 2.9792, "num_input_tokens_seen": 1232076800, "step": 4700 }, { "epoch": 0.022657619518942962, "grad_norm": 0.1718858927488327, "learning_rate": 0.001, "loss": 2.9674, "num_input_tokens_seen": 1245184000, "step": 4750 }, { "epoch": 0.0228961207770371, "grad_norm": 0.18428942561149597, "learning_rate": 0.001, "loss": 2.976, "num_input_tokens_seen": 1258291200, "step": 4800 }, { "epoch": 0.023134622035131235, "grad_norm": 0.16696259379386902, "learning_rate": 0.001, "loss": 2.9486, "num_input_tokens_seen": 1271398400, "step": 4850 }, { "epoch": 0.02337312329322537, "grad_norm": 0.18239040672779083, "learning_rate": 0.001, "loss": 2.956, "num_input_tokens_seen": 1284505600, "step": 4900 }, { "epoch": 0.02361162455131951, "grad_norm": 0.17167994379997253, "learning_rate": 0.001, "loss": 2.9449, "num_input_tokens_seen": 1297612800, "step": 4950 }, { "epoch": 0.023850125809413644, "grad_norm": 0.18532761931419373, "learning_rate": 0.001, "loss": 2.947, "num_input_tokens_seen": 1310720000, "step": 5000 }, { "epoch": 0.023850125809413644, "eval_loss": 2.8470754623413086, "eval_runtime": 51.04, "eval_samples_per_second": 97.962, "eval_steps_per_second": 24.491, "num_input_tokens_seen": 1310720000, "step": 5000 }, { "epoch": 0.024088627067507782, "grad_norm": 0.21697266399860382, "learning_rate": 0.001, "loss": 2.963, "num_input_tokens_seen": 1323827200, "step": 5050 }, { "epoch": 0.024327128325601917, "grad_norm": 0.17018833756446838, "learning_rate": 0.001, "loss": 2.9453, "num_input_tokens_seen": 1336934400, "step": 5100 }, { "epoch": 0.024565629583696055, "grad_norm": 0.17473167181015015, "learning_rate": 0.001, "loss": 2.9516, "num_input_tokens_seen": 1350041600, "step": 5150 }, { "epoch": 0.02480413084179019, "grad_norm": 0.18488293886184692, "learning_rate": 0.001, "loss": 2.9404, "num_input_tokens_seen": 1363148800, "step": 5200 }, { "epoch": 0.025042632099884325, "grad_norm": 0.17348967492580414, "learning_rate": 0.001, "loss": 2.9275, "num_input_tokens_seen": 1376256000, "step": 5250 }, { "epoch": 0.025281133357978464, "grad_norm": 0.16547563672065735, "learning_rate": 0.001, "loss": 2.9464, "num_input_tokens_seen": 1389363200, "step": 5300 }, { "epoch": 0.0255196346160726, "grad_norm": 0.17538361251354218, "learning_rate": 0.001, "loss": 2.94, "num_input_tokens_seen": 1402470400, "step": 5350 }, { "epoch": 0.025758135874166737, "grad_norm": 0.17068558931350708, "learning_rate": 0.001, "loss": 2.9382, "num_input_tokens_seen": 1415577600, "step": 5400 }, { "epoch": 0.025996637132260872, "grad_norm": 0.17389337718486786, "learning_rate": 0.001, "loss": 2.9254, "num_input_tokens_seen": 1428684800, "step": 5450 }, { "epoch": 0.02623513839035501, "grad_norm": 0.17620491981506348, "learning_rate": 0.001, "loss": 2.9221, "num_input_tokens_seen": 1441792000, "step": 5500 }, { "epoch": 0.02623513839035501, "eval_loss": 2.8246922492980957, "eval_runtime": 50.2832, "eval_samples_per_second": 99.437, "eval_steps_per_second": 24.859, "num_input_tokens_seen": 1441792000, "step": 5500 }, { "epoch": 0.026473639648449145, "grad_norm": 0.15889622271060944, "learning_rate": 0.001, "loss": 2.923, "num_input_tokens_seen": 1454899200, "step": 5550 }, { "epoch": 0.026712140906543284, "grad_norm": 0.17490123212337494, "learning_rate": 0.001, "loss": 2.9146, "num_input_tokens_seen": 1468006400, "step": 5600 }, { "epoch": 0.02695064216463742, "grad_norm": 0.17789559066295624, "learning_rate": 0.001, "loss": 2.9253, "num_input_tokens_seen": 1481113600, "step": 5650 }, { "epoch": 0.027189143422731554, "grad_norm": 0.17113780975341797, "learning_rate": 0.001, "loss": 2.9267, "num_input_tokens_seen": 1494220800, "step": 5700 }, { "epoch": 0.027427644680825692, "grad_norm": 0.1671907901763916, "learning_rate": 0.001, "loss": 2.9178, "num_input_tokens_seen": 1507328000, "step": 5750 }, { "epoch": 0.027666145938919827, "grad_norm": 0.17511603236198425, "learning_rate": 0.001, "loss": 2.9341, "num_input_tokens_seen": 1520435200, "step": 5800 }, { "epoch": 0.027904647197013965, "grad_norm": 0.1821524053812027, "learning_rate": 0.001, "loss": 2.9076, "num_input_tokens_seen": 1533542400, "step": 5850 }, { "epoch": 0.0281431484551081, "grad_norm": 0.16259051859378815, "learning_rate": 0.001, "loss": 2.9212, "num_input_tokens_seen": 1546649600, "step": 5900 }, { "epoch": 0.02838164971320224, "grad_norm": 0.18584352731704712, "learning_rate": 0.001, "loss": 2.927, "num_input_tokens_seen": 1559756800, "step": 5950 }, { "epoch": 0.028620150971296374, "grad_norm": 0.181602343916893, "learning_rate": 0.001, "loss": 2.9096, "num_input_tokens_seen": 1572864000, "step": 6000 }, { "epoch": 0.028620150971296374, "eval_loss": 2.8036017417907715, "eval_runtime": 50.543, "eval_samples_per_second": 98.926, "eval_steps_per_second": 24.731, "num_input_tokens_seen": 1572864000, "step": 6000 }, { "epoch": 0.02885865222939051, "grad_norm": 0.1653270423412323, "learning_rate": 0.001, "loss": 2.9214, "num_input_tokens_seen": 1585971200, "step": 6050 }, { "epoch": 0.029097153487484647, "grad_norm": 0.17030183970928192, "learning_rate": 0.001, "loss": 2.9081, "num_input_tokens_seen": 1599078400, "step": 6100 }, { "epoch": 0.029335654745578782, "grad_norm": 0.17734774947166443, "learning_rate": 0.001, "loss": 2.9128, "num_input_tokens_seen": 1612185600, "step": 6150 }, { "epoch": 0.02957415600367292, "grad_norm": 0.1664343774318695, "learning_rate": 0.001, "loss": 2.9084, "num_input_tokens_seen": 1625292800, "step": 6200 }, { "epoch": 0.029812657261767055, "grad_norm": 0.15939603745937347, "learning_rate": 0.001, "loss": 2.9049, "num_input_tokens_seen": 1638400000, "step": 6250 }, { "epoch": 0.030051158519861194, "grad_norm": 0.16107864677906036, "learning_rate": 0.001, "loss": 2.8889, "num_input_tokens_seen": 1651507200, "step": 6300 }, { "epoch": 0.03028965977795533, "grad_norm": 0.1734771579504013, "learning_rate": 0.001, "loss": 2.8951, "num_input_tokens_seen": 1664614400, "step": 6350 }, { "epoch": 0.030528161036049464, "grad_norm": 0.1804204136133194, "learning_rate": 0.001, "loss": 2.8877, "num_input_tokens_seen": 1677721600, "step": 6400 }, { "epoch": 0.030766662294143602, "grad_norm": 0.16369500756263733, "learning_rate": 0.001, "loss": 2.8764, "num_input_tokens_seen": 1690828800, "step": 6450 }, { "epoch": 0.031005163552237737, "grad_norm": 0.1704144924879074, "learning_rate": 0.001, "loss": 2.8965, "num_input_tokens_seen": 1703936000, "step": 6500 }, { "epoch": 0.031005163552237737, "eval_loss": 2.787343740463257, "eval_runtime": 50.3956, "eval_samples_per_second": 99.215, "eval_steps_per_second": 24.804, "num_input_tokens_seen": 1703936000, "step": 6500 }, { "epoch": 0.031243664810331875, "grad_norm": 0.17917555570602417, "learning_rate": 0.001, "loss": 2.8882, "num_input_tokens_seen": 1717043200, "step": 6550 }, { "epoch": 0.031482166068426014, "grad_norm": 0.18822412192821503, "learning_rate": 0.001, "loss": 2.8931, "num_input_tokens_seen": 1730150400, "step": 6600 }, { "epoch": 0.031720667326520145, "grad_norm": 0.1702752560377121, "learning_rate": 0.001, "loss": 2.8906, "num_input_tokens_seen": 1743257600, "step": 6650 }, { "epoch": 0.031959168584614284, "grad_norm": 0.16963082551956177, "learning_rate": 0.001, "loss": 2.8809, "num_input_tokens_seen": 1756364800, "step": 6700 }, { "epoch": 0.03219766984270842, "grad_norm": 0.17273569107055664, "learning_rate": 0.001, "loss": 2.9005, "num_input_tokens_seen": 1769472000, "step": 6750 }, { "epoch": 0.032436171100802554, "grad_norm": 0.21361888945102692, "learning_rate": 0.001, "loss": 2.8683, "num_input_tokens_seen": 1782579200, "step": 6800 }, { "epoch": 0.03267467235889669, "grad_norm": 0.16454364359378815, "learning_rate": 0.001, "loss": 2.8921, "num_input_tokens_seen": 1795686400, "step": 6850 }, { "epoch": 0.03291317361699083, "grad_norm": 0.1677432805299759, "learning_rate": 0.001, "loss": 2.8777, "num_input_tokens_seen": 1808793600, "step": 6900 }, { "epoch": 0.03315167487508497, "grad_norm": 0.17707760632038116, "learning_rate": 0.001, "loss": 2.8791, "num_input_tokens_seen": 1821900800, "step": 6950 }, { "epoch": 0.0333901761331791, "grad_norm": 0.1784796118736267, "learning_rate": 0.001, "loss": 2.8642, "num_input_tokens_seen": 1835008000, "step": 7000 }, { "epoch": 0.0333901761331791, "eval_loss": 2.7708475589752197, "eval_runtime": 50.9058, "eval_samples_per_second": 98.221, "eval_steps_per_second": 24.555, "num_input_tokens_seen": 1835008000, "step": 7000 }, { "epoch": 0.03362867739127324, "grad_norm": 0.15859876573085785, "learning_rate": 0.001, "loss": 2.8919, "num_input_tokens_seen": 1848115200, "step": 7050 }, { "epoch": 0.03386717864936738, "grad_norm": 0.17061467468738556, "learning_rate": 0.001, "loss": 2.868, "num_input_tokens_seen": 1861222400, "step": 7100 }, { "epoch": 0.03410567990746151, "grad_norm": 0.17118851840496063, "learning_rate": 0.001, "loss": 2.8677, "num_input_tokens_seen": 1874329600, "step": 7150 }, { "epoch": 0.03434418116555565, "grad_norm": 0.1561940759420395, "learning_rate": 0.001, "loss": 2.8701, "num_input_tokens_seen": 1887436800, "step": 7200 }, { "epoch": 0.034582682423649785, "grad_norm": 0.17568449676036835, "learning_rate": 0.001, "loss": 2.8652, "num_input_tokens_seen": 1900544000, "step": 7250 }, { "epoch": 0.034821183681743924, "grad_norm": 0.17471665143966675, "learning_rate": 0.001, "loss": 2.8614, "num_input_tokens_seen": 1913651200, "step": 7300 }, { "epoch": 0.035059684939838055, "grad_norm": 0.17949970066547394, "learning_rate": 0.001, "loss": 2.862, "num_input_tokens_seen": 1926758400, "step": 7350 }, { "epoch": 0.035298186197932194, "grad_norm": 0.17014376819133759, "learning_rate": 0.001, "loss": 2.8696, "num_input_tokens_seen": 1939865600, "step": 7400 }, { "epoch": 0.03553668745602633, "grad_norm": 0.166939839720726, "learning_rate": 0.001, "loss": 2.8679, "num_input_tokens_seen": 1952972800, "step": 7450 }, { "epoch": 0.035775188714120464, "grad_norm": 0.16403459012508392, "learning_rate": 0.001, "loss": 2.8692, "num_input_tokens_seen": 1966080000, "step": 7500 }, { "epoch": 0.035775188714120464, "eval_loss": 2.7581658363342285, "eval_runtime": 50.614, "eval_samples_per_second": 98.787, "eval_steps_per_second": 24.697, "num_input_tokens_seen": 1966080000, "step": 7500 }, { "epoch": 0.0360136899722146, "grad_norm": 0.16664361953735352, "learning_rate": 0.001, "loss": 2.8549, "num_input_tokens_seen": 1979187200, "step": 7550 }, { "epoch": 0.03625219123030874, "grad_norm": 0.165015310049057, "learning_rate": 0.001, "loss": 2.867, "num_input_tokens_seen": 1992294400, "step": 7600 }, { "epoch": 0.03649069248840288, "grad_norm": 0.17752580344676971, "learning_rate": 0.001, "loss": 2.8721, "num_input_tokens_seen": 2005401600, "step": 7650 }, { "epoch": 0.03672919374649701, "grad_norm": 0.1641317456960678, "learning_rate": 0.001, "loss": 2.8601, "num_input_tokens_seen": 2018508800, "step": 7700 }, { "epoch": 0.03696769500459115, "grad_norm": 0.1706378310918808, "learning_rate": 0.001, "loss": 2.8385, "num_input_tokens_seen": 2031616000, "step": 7750 }, { "epoch": 0.03720619626268529, "grad_norm": 0.18265438079833984, "learning_rate": 0.001, "loss": 2.8421, "num_input_tokens_seen": 2044723200, "step": 7800 }, { "epoch": 0.037444697520779426, "grad_norm": 0.17270897328853607, "learning_rate": 0.001, "loss": 2.8576, "num_input_tokens_seen": 2057830400, "step": 7850 }, { "epoch": 0.03768319877887356, "grad_norm": 0.17359280586242676, "learning_rate": 0.001, "loss": 2.8522, "num_input_tokens_seen": 2070937600, "step": 7900 }, { "epoch": 0.037921700036967695, "grad_norm": 0.1679411083459854, "learning_rate": 0.001, "loss": 2.854, "num_input_tokens_seen": 2084044800, "step": 7950 }, { "epoch": 0.038160201295061834, "grad_norm": 0.16735835373401642, "learning_rate": 0.001, "loss": 2.8494, "num_input_tokens_seen": 2097152000, "step": 8000 }, { "epoch": 0.038160201295061834, "eval_loss": 2.7443442344665527, "eval_runtime": 50.3387, "eval_samples_per_second": 99.327, "eval_steps_per_second": 24.832, "num_input_tokens_seen": 2097152000, "step": 8000 }, { "epoch": 0.038398702553155965, "grad_norm": 0.16059577465057373, "learning_rate": 0.001, "loss": 2.8495, "num_input_tokens_seen": 2110259200, "step": 8050 }, { "epoch": 0.038637203811250104, "grad_norm": 0.1842387169599533, "learning_rate": 0.001, "loss": 2.8526, "num_input_tokens_seen": 2123366400, "step": 8100 }, { "epoch": 0.03887570506934424, "grad_norm": 0.15922050178050995, "learning_rate": 0.001, "loss": 2.8312, "num_input_tokens_seen": 2136473600, "step": 8150 }, { "epoch": 0.03911420632743838, "grad_norm": 0.16642028093338013, "learning_rate": 0.001, "loss": 2.8452, "num_input_tokens_seen": 2149580800, "step": 8200 }, { "epoch": 0.03935270758553251, "grad_norm": 0.16174671053886414, "learning_rate": 0.001, "loss": 2.8471, "num_input_tokens_seen": 2162688000, "step": 8250 }, { "epoch": 0.03959120884362665, "grad_norm": 0.16786591708660126, "learning_rate": 0.001, "loss": 2.8435, "num_input_tokens_seen": 2175795200, "step": 8300 }, { "epoch": 0.03982971010172079, "grad_norm": 0.17107373476028442, "learning_rate": 0.001, "loss": 2.862, "num_input_tokens_seen": 2188902400, "step": 8350 }, { "epoch": 0.04006821135981492, "grad_norm": 0.17952118813991547, "learning_rate": 0.001, "loss": 2.8414, "num_input_tokens_seen": 2202009600, "step": 8400 }, { "epoch": 0.04030671261790906, "grad_norm": 0.16836482286453247, "learning_rate": 0.001, "loss": 2.8363, "num_input_tokens_seen": 2215116800, "step": 8450 }, { "epoch": 0.0405452138760032, "grad_norm": 0.16812962293624878, "learning_rate": 0.001, "loss": 2.844, "num_input_tokens_seen": 2228224000, "step": 8500 }, { "epoch": 0.0405452138760032, "eval_loss": 2.7306976318359375, "eval_runtime": 50.272, "eval_samples_per_second": 99.459, "eval_steps_per_second": 24.865, "num_input_tokens_seen": 2228224000, "step": 8500 }, { "epoch": 0.040783715134097336, "grad_norm": 0.1696135401725769, "learning_rate": 0.001, "loss": 2.8406, "num_input_tokens_seen": 2241331200, "step": 8550 }, { "epoch": 0.04102221639219147, "grad_norm": 0.16062459349632263, "learning_rate": 0.001, "loss": 2.8453, "num_input_tokens_seen": 2254438400, "step": 8600 }, { "epoch": 0.041260717650285605, "grad_norm": 0.17326433956623077, "learning_rate": 0.001, "loss": 2.8449, "num_input_tokens_seen": 2267545600, "step": 8650 }, { "epoch": 0.041499218908379744, "grad_norm": 0.16410672664642334, "learning_rate": 0.001, "loss": 2.8412, "num_input_tokens_seen": 2280652800, "step": 8700 }, { "epoch": 0.041737720166473875, "grad_norm": 0.16255012154579163, "learning_rate": 0.001, "loss": 2.8524, "num_input_tokens_seen": 2293760000, "step": 8750 }, { "epoch": 0.041976221424568014, "grad_norm": 0.163652241230011, "learning_rate": 0.001, "loss": 2.8528, "num_input_tokens_seen": 2306867200, "step": 8800 }, { "epoch": 0.04221472268266215, "grad_norm": 0.15598778426647186, "learning_rate": 0.001, "loss": 2.8255, "num_input_tokens_seen": 2319974400, "step": 8850 }, { "epoch": 0.04245322394075629, "grad_norm": 0.1740003079175949, "learning_rate": 0.001, "loss": 2.8278, "num_input_tokens_seen": 2333081600, "step": 8900 }, { "epoch": 0.04269172519885042, "grad_norm": 0.17225052416324615, "learning_rate": 0.001, "loss": 2.8334, "num_input_tokens_seen": 2346188800, "step": 8950 }, { "epoch": 0.04293022645694456, "grad_norm": 0.18005919456481934, "learning_rate": 0.001, "loss": 2.8044, "num_input_tokens_seen": 2359296000, "step": 9000 }, { "epoch": 0.04293022645694456, "eval_loss": 2.7220215797424316, "eval_runtime": 50.2706, "eval_samples_per_second": 99.462, "eval_steps_per_second": 24.865, "num_input_tokens_seen": 2359296000, "step": 9000 }, { "epoch": 0.0431687277150387, "grad_norm": 0.16554109752178192, "learning_rate": 0.001, "loss": 2.83, "num_input_tokens_seen": 2372403200, "step": 9050 }, { "epoch": 0.04340722897313283, "grad_norm": 0.17308101058006287, "learning_rate": 0.001, "loss": 2.8204, "num_input_tokens_seen": 2385510400, "step": 9100 }, { "epoch": 0.04364573023122697, "grad_norm": 0.16701756417751312, "learning_rate": 0.001, "loss": 2.836, "num_input_tokens_seen": 2398617600, "step": 9150 }, { "epoch": 0.04388423148932111, "grad_norm": 0.16220535337924957, "learning_rate": 0.001, "loss": 2.8194, "num_input_tokens_seen": 2411724800, "step": 9200 }, { "epoch": 0.044122732747415246, "grad_norm": 0.16643071174621582, "learning_rate": 0.001, "loss": 2.8157, "num_input_tokens_seen": 2424832000, "step": 9250 }, { "epoch": 0.04436123400550938, "grad_norm": 0.16293680667877197, "learning_rate": 0.001, "loss": 2.8147, "num_input_tokens_seen": 2437939200, "step": 9300 }, { "epoch": 0.044599735263603515, "grad_norm": 0.1914059966802597, "learning_rate": 0.001, "loss": 2.8164, "num_input_tokens_seen": 2451046400, "step": 9350 }, { "epoch": 0.044838236521697654, "grad_norm": 0.15867285430431366, "learning_rate": 0.001, "loss": 2.8063, "num_input_tokens_seen": 2464153600, "step": 9400 }, { "epoch": 0.045076737779791785, "grad_norm": 0.16319462656974792, "learning_rate": 0.001, "loss": 2.8096, "num_input_tokens_seen": 2477260800, "step": 9450 }, { "epoch": 0.045315239037885924, "grad_norm": 0.16578581929206848, "learning_rate": 0.001, "loss": 2.8106, "num_input_tokens_seen": 2490368000, "step": 9500 }, { "epoch": 0.045315239037885924, "eval_loss": 2.7105066776275635, "eval_runtime": 50.0838, "eval_samples_per_second": 99.833, "eval_steps_per_second": 24.958, "num_input_tokens_seen": 2490368000, "step": 9500 }, { "epoch": 0.04555374029598006, "grad_norm": 0.17125573754310608, "learning_rate": 0.001, "loss": 2.8259, "num_input_tokens_seen": 2503475200, "step": 9550 }, { "epoch": 0.0457922415540742, "grad_norm": 0.1661599725484848, "learning_rate": 0.001, "loss": 2.8109, "num_input_tokens_seen": 2516582400, "step": 9600 }, { "epoch": 0.04603074281216833, "grad_norm": 0.16203565895557404, "learning_rate": 0.001, "loss": 2.8198, "num_input_tokens_seen": 2529689600, "step": 9650 }, { "epoch": 0.04626924407026247, "grad_norm": 0.1869373619556427, "learning_rate": 0.001, "loss": 2.8163, "num_input_tokens_seen": 2542796800, "step": 9700 }, { "epoch": 0.04650774532835661, "grad_norm": 0.17401213943958282, "learning_rate": 0.001, "loss": 2.8209, "num_input_tokens_seen": 2555904000, "step": 9750 }, { "epoch": 0.04674624658645074, "grad_norm": 0.15835829079151154, "learning_rate": 0.001, "loss": 2.8032, "num_input_tokens_seen": 2569011200, "step": 9800 }, { "epoch": 0.04698474784454488, "grad_norm": 0.16554060578346252, "learning_rate": 0.001, "loss": 2.8072, "num_input_tokens_seen": 2582118400, "step": 9850 }, { "epoch": 0.04722324910263902, "grad_norm": 0.16941213607788086, "learning_rate": 0.001, "loss": 2.8076, "num_input_tokens_seen": 2595225600, "step": 9900 }, { "epoch": 0.047461750360733156, "grad_norm": 0.16324704885482788, "learning_rate": 0.001, "loss": 2.8097, "num_input_tokens_seen": 2608332800, "step": 9950 }, { "epoch": 0.04770025161882729, "grad_norm": 0.16865754127502441, "learning_rate": 0.001, "loss": 2.8051, "num_input_tokens_seen": 2621440000, "step": 10000 }, { "epoch": 0.04770025161882729, "eval_loss": 2.7000486850738525, "eval_runtime": 50.3365, "eval_samples_per_second": 99.331, "eval_steps_per_second": 24.833, "num_input_tokens_seen": 2621440000, "step": 10000 }, { "epoch": 0.047938752876921426, "grad_norm": 0.17076526582241058, "learning_rate": 0.001, "loss": 2.8148, "num_input_tokens_seen": 2634547200, "step": 10050 }, { "epoch": 0.048177254135015564, "grad_norm": 0.1610497534275055, "learning_rate": 0.001, "loss": 2.8012, "num_input_tokens_seen": 2647654400, "step": 10100 }, { "epoch": 0.048415755393109695, "grad_norm": 0.15984536707401276, "learning_rate": 0.001, "loss": 2.8086, "num_input_tokens_seen": 2660761600, "step": 10150 }, { "epoch": 0.048654256651203834, "grad_norm": 0.21775834262371063, "learning_rate": 0.001, "loss": 2.8021, "num_input_tokens_seen": 2673868800, "step": 10200 }, { "epoch": 0.04889275790929797, "grad_norm": 0.1841157227754593, "learning_rate": 0.001, "loss": 2.8152, "num_input_tokens_seen": 2686976000, "step": 10250 }, { "epoch": 0.04913125916739211, "grad_norm": 0.17025424540042877, "learning_rate": 0.001, "loss": 2.8131, "num_input_tokens_seen": 2700083200, "step": 10300 }, { "epoch": 0.04936976042548624, "grad_norm": 0.1992417722940445, "learning_rate": 0.001, "loss": 2.8039, "num_input_tokens_seen": 2713190400, "step": 10350 }, { "epoch": 0.04960826168358038, "grad_norm": 0.1680469959974289, "learning_rate": 0.001, "loss": 2.7921, "num_input_tokens_seen": 2726297600, "step": 10400 }, { "epoch": 0.04984676294167452, "grad_norm": 0.18296252191066742, "learning_rate": 0.001, "loss": 2.8036, "num_input_tokens_seen": 2739404800, "step": 10450 }, { "epoch": 0.05008526419976865, "grad_norm": 0.16041898727416992, "learning_rate": 0.001, "loss": 2.7979, "num_input_tokens_seen": 2752512000, "step": 10500 }, { "epoch": 0.05008526419976865, "eval_loss": 2.6893723011016846, "eval_runtime": 50.642, "eval_samples_per_second": 98.732, "eval_steps_per_second": 24.683, "num_input_tokens_seen": 2752512000, "step": 10500 }, { "epoch": 0.05032376545786279, "grad_norm": 0.16704030334949493, "learning_rate": 0.001, "loss": 2.79, "num_input_tokens_seen": 2765619200, "step": 10550 }, { "epoch": 0.05056226671595693, "grad_norm": 0.16553758084774017, "learning_rate": 0.001, "loss": 2.7964, "num_input_tokens_seen": 2778726400, "step": 10600 }, { "epoch": 0.050800767974051066, "grad_norm": 0.16027161478996277, "learning_rate": 0.001, "loss": 2.7937, "num_input_tokens_seen": 2791833600, "step": 10650 }, { "epoch": 0.0510392692321452, "grad_norm": 0.16177843511104584, "learning_rate": 0.001, "loss": 2.7957, "num_input_tokens_seen": 2804940800, "step": 10700 }, { "epoch": 0.051277770490239336, "grad_norm": 0.16713912785053253, "learning_rate": 0.001, "loss": 2.7949, "num_input_tokens_seen": 2818048000, "step": 10750 }, { "epoch": 0.051516271748333474, "grad_norm": 0.1815747618675232, "learning_rate": 0.001, "loss": 2.7915, "num_input_tokens_seen": 2831155200, "step": 10800 }, { "epoch": 0.05175477300642761, "grad_norm": 0.16732683777809143, "learning_rate": 0.001, "loss": 2.7994, "num_input_tokens_seen": 2844262400, "step": 10850 }, { "epoch": 0.051993274264521744, "grad_norm": 0.18305908143520355, "learning_rate": 0.001, "loss": 2.7888, "num_input_tokens_seen": 2857369600, "step": 10900 }, { "epoch": 0.05223177552261588, "grad_norm": 0.16450954973697662, "learning_rate": 0.001, "loss": 2.7834, "num_input_tokens_seen": 2870476800, "step": 10950 }, { "epoch": 0.05247027678071002, "grad_norm": 0.16485372185707092, "learning_rate": 0.001, "loss": 2.7976, "num_input_tokens_seen": 2883584000, "step": 11000 }, { "epoch": 0.05247027678071002, "eval_loss": 2.6825835704803467, "eval_runtime": 50.1016, "eval_samples_per_second": 99.797, "eval_steps_per_second": 24.949, "num_input_tokens_seen": 2883584000, "step": 11000 }, { "epoch": 0.05270877803880415, "grad_norm": 0.1733204573392868, "learning_rate": 0.001, "loss": 2.7959, "num_input_tokens_seen": 2896691200, "step": 11050 }, { "epoch": 0.05294727929689829, "grad_norm": 0.16432546079158783, "learning_rate": 0.001, "loss": 2.793, "num_input_tokens_seen": 2909798400, "step": 11100 }, { "epoch": 0.05318578055499243, "grad_norm": 0.18369582295417786, "learning_rate": 0.001, "loss": 2.8149, "num_input_tokens_seen": 2922905600, "step": 11150 }, { "epoch": 0.05342428181308657, "grad_norm": 0.17782896757125854, "learning_rate": 0.001, "loss": 2.7878, "num_input_tokens_seen": 2936012800, "step": 11200 }, { "epoch": 0.0536627830711807, "grad_norm": 0.18320836126804352, "learning_rate": 0.001, "loss": 2.8159, "num_input_tokens_seen": 2949120000, "step": 11250 }, { "epoch": 0.05390128432927484, "grad_norm": 0.1667925864458084, "learning_rate": 0.001, "loss": 2.795, "num_input_tokens_seen": 2962227200, "step": 11300 }, { "epoch": 0.054139785587368976, "grad_norm": 0.19831301271915436, "learning_rate": 0.001, "loss": 2.7907, "num_input_tokens_seen": 2975334400, "step": 11350 }, { "epoch": 0.05437828684546311, "grad_norm": 0.1610182225704193, "learning_rate": 0.001, "loss": 2.774, "num_input_tokens_seen": 2988441600, "step": 11400 }, { "epoch": 0.054616788103557246, "grad_norm": 0.15938150882720947, "learning_rate": 0.001, "loss": 2.7766, "num_input_tokens_seen": 3001548800, "step": 11450 }, { "epoch": 0.054855289361651384, "grad_norm": 0.15737415850162506, "learning_rate": 0.001, "loss": 2.783, "num_input_tokens_seen": 3014656000, "step": 11500 }, { "epoch": 0.054855289361651384, "eval_loss": 2.6739256381988525, "eval_runtime": 51.2462, "eval_samples_per_second": 97.568, "eval_steps_per_second": 24.392, "num_input_tokens_seen": 3014656000, "step": 11500 }, { "epoch": 0.05509379061974552, "grad_norm": 0.16538532078266144, "learning_rate": 0.001, "loss": 2.7966, "num_input_tokens_seen": 3027763200, "step": 11550 }, { "epoch": 0.055332291877839654, "grad_norm": 0.18035660684108734, "learning_rate": 0.001, "loss": 2.7789, "num_input_tokens_seen": 3040870400, "step": 11600 }, { "epoch": 0.05557079313593379, "grad_norm": 0.17831085622310638, "learning_rate": 0.001, "loss": 2.7962, "num_input_tokens_seen": 3053977600, "step": 11650 }, { "epoch": 0.05580929439402793, "grad_norm": 0.17723870277404785, "learning_rate": 0.001, "loss": 2.7791, "num_input_tokens_seen": 3067084800, "step": 11700 }, { "epoch": 0.05604779565212206, "grad_norm": 0.17663581669330597, "learning_rate": 0.001, "loss": 2.7696, "num_input_tokens_seen": 3080192000, "step": 11750 }, { "epoch": 0.0562862969102162, "grad_norm": 0.16684900224208832, "learning_rate": 0.001, "loss": 2.7762, "num_input_tokens_seen": 3093299200, "step": 11800 }, { "epoch": 0.05652479816831034, "grad_norm": 0.17407995462417603, "learning_rate": 0.001, "loss": 2.7767, "num_input_tokens_seen": 3106406400, "step": 11850 }, { "epoch": 0.05676329942640448, "grad_norm": 0.1750691831111908, "learning_rate": 0.001, "loss": 2.7785, "num_input_tokens_seen": 3119513600, "step": 11900 }, { "epoch": 0.05700180068449861, "grad_norm": 0.16576959192752838, "learning_rate": 0.001, "loss": 2.773, "num_input_tokens_seen": 3132620800, "step": 11950 }, { "epoch": 0.05724030194259275, "grad_norm": 0.16957831382751465, "learning_rate": 0.001, "loss": 2.7781, "num_input_tokens_seen": 3145728000, "step": 12000 }, { "epoch": 0.05724030194259275, "eval_loss": 2.6683335304260254, "eval_runtime": 50.6428, "eval_samples_per_second": 98.731, "eval_steps_per_second": 24.683, "num_input_tokens_seen": 3145728000, "step": 12000 }, { "epoch": 0.057478803200686886, "grad_norm": 0.1645338237285614, "learning_rate": 0.001, "loss": 2.7709, "num_input_tokens_seen": 3158835200, "step": 12050 }, { "epoch": 0.05771730445878102, "grad_norm": 0.15848694741725922, "learning_rate": 0.001, "loss": 2.7849, "num_input_tokens_seen": 3171942400, "step": 12100 }, { "epoch": 0.057955805716875156, "grad_norm": 0.20003071427345276, "learning_rate": 0.001, "loss": 2.7691, "num_input_tokens_seen": 3185049600, "step": 12150 }, { "epoch": 0.058194306974969294, "grad_norm": 0.19301050901412964, "learning_rate": 0.001, "loss": 2.7811, "num_input_tokens_seen": 3198156800, "step": 12200 }, { "epoch": 0.05843280823306343, "grad_norm": 0.171390563249588, "learning_rate": 0.001, "loss": 2.7712, "num_input_tokens_seen": 3211264000, "step": 12250 }, { "epoch": 0.058671309491157564, "grad_norm": 0.1654270589351654, "learning_rate": 0.001, "loss": 2.7788, "num_input_tokens_seen": 3224371200, "step": 12300 }, { "epoch": 0.0589098107492517, "grad_norm": 0.16559672355651855, "learning_rate": 0.001, "loss": 2.7839, "num_input_tokens_seen": 3237478400, "step": 12350 }, { "epoch": 0.05914831200734584, "grad_norm": 0.16773344576358795, "learning_rate": 0.001, "loss": 2.7896, "num_input_tokens_seen": 3250585600, "step": 12400 }, { "epoch": 0.05938681326543997, "grad_norm": 0.1639021933078766, "learning_rate": 0.001, "loss": 2.7704, "num_input_tokens_seen": 3263692800, "step": 12450 }, { "epoch": 0.05962531452353411, "grad_norm": 0.15584540367126465, "learning_rate": 0.001, "loss": 2.7687, "num_input_tokens_seen": 3276800000, "step": 12500 }, { "epoch": 0.05962531452353411, "eval_loss": 2.6606011390686035, "eval_runtime": 51.1636, "eval_samples_per_second": 97.726, "eval_steps_per_second": 24.431, "num_input_tokens_seen": 3276800000, "step": 12500 }, { "epoch": 0.05986381578162825, "grad_norm": 0.18144413828849792, "learning_rate": 0.001, "loss": 2.7711, "num_input_tokens_seen": 3289907200, "step": 12550 }, { "epoch": 0.06010231703972239, "grad_norm": 0.18225054442882538, "learning_rate": 0.001, "loss": 2.7675, "num_input_tokens_seen": 3303014400, "step": 12600 }, { "epoch": 0.06034081829781652, "grad_norm": 0.16542398929595947, "learning_rate": 0.001, "loss": 2.7563, "num_input_tokens_seen": 3316121600, "step": 12650 }, { "epoch": 0.06057931955591066, "grad_norm": 0.1765596568584442, "learning_rate": 0.001, "loss": 2.7807, "num_input_tokens_seen": 3329228800, "step": 12700 }, { "epoch": 0.060817820814004796, "grad_norm": 0.17469234764575958, "learning_rate": 0.001, "loss": 2.7532, "num_input_tokens_seen": 3342336000, "step": 12750 }, { "epoch": 0.06105632207209893, "grad_norm": 0.1841767281293869, "learning_rate": 0.001, "loss": 2.7824, "num_input_tokens_seen": 3355443200, "step": 12800 }, { "epoch": 0.061294823330193066, "grad_norm": 0.1667831838130951, "learning_rate": 0.001, "loss": 2.7648, "num_input_tokens_seen": 3368550400, "step": 12850 }, { "epoch": 0.061533324588287204, "grad_norm": 0.16561101377010345, "learning_rate": 0.001, "loss": 2.7798, "num_input_tokens_seen": 3381657600, "step": 12900 }, { "epoch": 0.06177182584638134, "grad_norm": 0.17370566725730896, "learning_rate": 0.001, "loss": 2.7755, "num_input_tokens_seen": 3394764800, "step": 12950 }, { "epoch": 0.062010327104475474, "grad_norm": 0.16871176660060883, "learning_rate": 0.001, "loss": 2.7676, "num_input_tokens_seen": 3407872000, "step": 13000 }, { "epoch": 0.062010327104475474, "eval_loss": 2.653367757797241, "eval_runtime": 50.8399, "eval_samples_per_second": 98.348, "eval_steps_per_second": 24.587, "num_input_tokens_seen": 3407872000, "step": 13000 }, { "epoch": 0.06224882836256961, "grad_norm": 0.17592230439186096, "learning_rate": 0.001, "loss": 2.7639, "num_input_tokens_seen": 3420979200, "step": 13050 }, { "epoch": 0.06248732962066375, "grad_norm": 0.1640375405550003, "learning_rate": 0.001, "loss": 2.7785, "num_input_tokens_seen": 3434086400, "step": 13100 }, { "epoch": 0.06272583087875788, "grad_norm": 0.16389331221580505, "learning_rate": 0.001, "loss": 2.7475, "num_input_tokens_seen": 3447193600, "step": 13150 }, { "epoch": 0.06296433213685203, "grad_norm": 0.1733655482530594, "learning_rate": 0.001, "loss": 2.7538, "num_input_tokens_seen": 3460300800, "step": 13200 }, { "epoch": 0.06320283339494616, "grad_norm": 0.19206473231315613, "learning_rate": 0.001, "loss": 2.7819, "num_input_tokens_seen": 3473408000, "step": 13250 }, { "epoch": 0.06344133465304029, "grad_norm": 0.1841450184583664, "learning_rate": 0.001, "loss": 2.7701, "num_input_tokens_seen": 3486515200, "step": 13300 }, { "epoch": 0.06367983591113444, "grad_norm": 0.1701631247997284, "learning_rate": 0.001, "loss": 2.7587, "num_input_tokens_seen": 3499622400, "step": 13350 }, { "epoch": 0.06391833716922857, "grad_norm": 0.17068499326705933, "learning_rate": 0.001, "loss": 2.7589, "num_input_tokens_seen": 3512729600, "step": 13400 }, { "epoch": 0.0641568384273227, "grad_norm": 0.17927715182304382, "learning_rate": 0.001, "loss": 2.764, "num_input_tokens_seen": 3525836800, "step": 13450 }, { "epoch": 0.06439533968541684, "grad_norm": 0.19105768203735352, "learning_rate": 0.001, "loss": 2.7593, "num_input_tokens_seen": 3538944000, "step": 13500 }, { "epoch": 0.06439533968541684, "eval_loss": 2.6473631858825684, "eval_runtime": 51.0846, "eval_samples_per_second": 97.877, "eval_steps_per_second": 24.469, "num_input_tokens_seen": 3538944000, "step": 13500 }, { "epoch": 0.06463384094351098, "grad_norm": 0.17262668907642365, "learning_rate": 0.001, "loss": 2.7522, "num_input_tokens_seen": 3552051200, "step": 13550 }, { "epoch": 0.06487234220160511, "grad_norm": 0.16810455918312073, "learning_rate": 0.001, "loss": 2.7664, "num_input_tokens_seen": 3565158400, "step": 13600 }, { "epoch": 0.06511084345969925, "grad_norm": 0.17312487959861755, "learning_rate": 0.001, "loss": 2.7557, "num_input_tokens_seen": 3578265600, "step": 13650 }, { "epoch": 0.06534934471779338, "grad_norm": 0.16985322535037994, "learning_rate": 0.001, "loss": 2.7449, "num_input_tokens_seen": 3591372800, "step": 13700 }, { "epoch": 0.06558784597588753, "grad_norm": 0.1812393069267273, "learning_rate": 0.001, "loss": 2.749, "num_input_tokens_seen": 3604480000, "step": 13750 }, { "epoch": 0.06582634723398166, "grad_norm": 0.183237224817276, "learning_rate": 0.001, "loss": 2.7637, "num_input_tokens_seen": 3617587200, "step": 13800 }, { "epoch": 0.06606484849207579, "grad_norm": 0.17770566046237946, "learning_rate": 0.001, "loss": 2.7602, "num_input_tokens_seen": 3630694400, "step": 13850 }, { "epoch": 0.06630334975016994, "grad_norm": 0.1678437739610672, "learning_rate": 0.001, "loss": 2.76, "num_input_tokens_seen": 3643801600, "step": 13900 }, { "epoch": 0.06654185100826407, "grad_norm": 0.16213107109069824, "learning_rate": 0.001, "loss": 2.7467, "num_input_tokens_seen": 3656908800, "step": 13950 }, { "epoch": 0.0667803522663582, "grad_norm": 0.17652907967567444, "learning_rate": 0.001, "loss": 2.7516, "num_input_tokens_seen": 3670016000, "step": 14000 }, { "epoch": 0.0667803522663582, "eval_loss": 2.6438815593719482, "eval_runtime": 50.3233, "eval_samples_per_second": 99.358, "eval_steps_per_second": 24.839, "num_input_tokens_seen": 3670016000, "step": 14000 }, { "epoch": 0.06701885352445235, "grad_norm": 0.1785530298948288, "learning_rate": 0.001, "loss": 2.7475, "num_input_tokens_seen": 3683123200, "step": 14050 }, { "epoch": 0.06725735478254648, "grad_norm": 0.15644113719463348, "learning_rate": 0.001, "loss": 2.7541, "num_input_tokens_seen": 3696230400, "step": 14100 }, { "epoch": 0.06749585604064061, "grad_norm": 0.183272585272789, "learning_rate": 0.001, "loss": 2.7513, "num_input_tokens_seen": 3709337600, "step": 14150 }, { "epoch": 0.06773435729873475, "grad_norm": 0.17523212730884552, "learning_rate": 0.001, "loss": 2.7649, "num_input_tokens_seen": 3722444800, "step": 14200 }, { "epoch": 0.06797285855682889, "grad_norm": 0.1778247356414795, "learning_rate": 0.001, "loss": 2.7457, "num_input_tokens_seen": 3735552000, "step": 14250 }, { "epoch": 0.06821135981492302, "grad_norm": 0.18277810513973236, "learning_rate": 0.001, "loss": 2.7477, "num_input_tokens_seen": 3748659200, "step": 14300 }, { "epoch": 0.06844986107301716, "grad_norm": 0.17541366815567017, "learning_rate": 0.001, "loss": 2.7419, "num_input_tokens_seen": 3761766400, "step": 14350 }, { "epoch": 0.0686883623311113, "grad_norm": 0.1701425164937973, "learning_rate": 0.001, "loss": 2.7437, "num_input_tokens_seen": 3774873600, "step": 14400 }, { "epoch": 0.06892686358920544, "grad_norm": 0.16685517132282257, "learning_rate": 0.001, "loss": 2.7357, "num_input_tokens_seen": 3787980800, "step": 14450 }, { "epoch": 0.06916536484729957, "grad_norm": 0.1738167405128479, "learning_rate": 0.001, "loss": 2.7475, "num_input_tokens_seen": 3801088000, "step": 14500 }, { "epoch": 0.06916536484729957, "eval_loss": 2.635887622833252, "eval_runtime": 50.4516, "eval_samples_per_second": 99.105, "eval_steps_per_second": 24.776, "num_input_tokens_seen": 3801088000, "step": 14500 }, { "epoch": 0.0694038661053937, "grad_norm": 0.18279027938842773, "learning_rate": 0.001, "loss": 2.7521, "num_input_tokens_seen": 3814195200, "step": 14550 }, { "epoch": 0.06964236736348785, "grad_norm": 0.1878173053264618, "learning_rate": 0.001, "loss": 2.7401, "num_input_tokens_seen": 3827302400, "step": 14600 }, { "epoch": 0.06988086862158198, "grad_norm": 0.17670077085494995, "learning_rate": 0.001, "loss": 2.7513, "num_input_tokens_seen": 3840409600, "step": 14650 }, { "epoch": 0.07011936987967611, "grad_norm": 0.17042580246925354, "learning_rate": 0.001, "loss": 2.7383, "num_input_tokens_seen": 3853516800, "step": 14700 }, { "epoch": 0.07035787113777026, "grad_norm": 0.17193050682544708, "learning_rate": 0.001, "loss": 2.7408, "num_input_tokens_seen": 3866624000, "step": 14750 }, { "epoch": 0.07059637239586439, "grad_norm": 0.16576342284679413, "learning_rate": 0.001, "loss": 2.7312, "num_input_tokens_seen": 3879731200, "step": 14800 }, { "epoch": 0.07083487365395852, "grad_norm": 0.18535619974136353, "learning_rate": 0.001, "loss": 2.756, "num_input_tokens_seen": 3892838400, "step": 14850 }, { "epoch": 0.07107337491205266, "grad_norm": 0.1729886531829834, "learning_rate": 0.001, "loss": 2.751, "num_input_tokens_seen": 3905945600, "step": 14900 }, { "epoch": 0.0713118761701468, "grad_norm": 0.16047705709934235, "learning_rate": 0.001, "loss": 2.7361, "num_input_tokens_seen": 3919052800, "step": 14950 }, { "epoch": 0.07155037742824093, "grad_norm": 0.17655611038208008, "learning_rate": 0.001, "loss": 2.7471, "num_input_tokens_seen": 3932160000, "step": 15000 }, { "epoch": 0.07155037742824093, "eval_loss": 2.6311256885528564, "eval_runtime": 51.0361, "eval_samples_per_second": 97.97, "eval_steps_per_second": 24.492, "num_input_tokens_seen": 3932160000, "step": 15000 }, { "epoch": 0.07178887868633507, "grad_norm": 0.19243250787258148, "learning_rate": 0.001, "loss": 2.7551, "num_input_tokens_seen": 3945267200, "step": 15050 }, { "epoch": 0.0720273799444292, "grad_norm": 0.17328651249408722, "learning_rate": 0.001, "loss": 2.7346, "num_input_tokens_seen": 3958374400, "step": 15100 }, { "epoch": 0.07226588120252335, "grad_norm": 0.16357752680778503, "learning_rate": 0.001, "loss": 2.7523, "num_input_tokens_seen": 3971481600, "step": 15150 }, { "epoch": 0.07250438246061748, "grad_norm": 0.1726733148097992, "learning_rate": 0.001, "loss": 2.725, "num_input_tokens_seen": 3984588800, "step": 15200 }, { "epoch": 0.07274288371871161, "grad_norm": 0.16912953555583954, "learning_rate": 0.001, "loss": 2.738, "num_input_tokens_seen": 3997696000, "step": 15250 }, { "epoch": 0.07298138497680576, "grad_norm": 0.19751113653182983, "learning_rate": 0.001, "loss": 2.7532, "num_input_tokens_seen": 4010803200, "step": 15300 }, { "epoch": 0.07321988623489989, "grad_norm": 0.16762405633926392, "learning_rate": 0.001, "loss": 2.7413, "num_input_tokens_seen": 4023910400, "step": 15350 }, { "epoch": 0.07345838749299402, "grad_norm": 0.18106459081172943, "learning_rate": 0.001, "loss": 2.7411, "num_input_tokens_seen": 4037017600, "step": 15400 }, { "epoch": 0.07369688875108817, "grad_norm": 0.184165820479393, "learning_rate": 0.001, "loss": 2.7449, "num_input_tokens_seen": 4050124800, "step": 15450 }, { "epoch": 0.0739353900091823, "grad_norm": 0.16832765936851501, "learning_rate": 0.001, "loss": 2.7442, "num_input_tokens_seen": 4063232000, "step": 15500 }, { "epoch": 0.0739353900091823, "eval_loss": 2.6253247261047363, "eval_runtime": 50.6964, "eval_samples_per_second": 98.626, "eval_steps_per_second": 24.657, "num_input_tokens_seen": 4063232000, "step": 15500 }, { "epoch": 0.07417389126727643, "grad_norm": 0.1663861721754074, "learning_rate": 0.001, "loss": 2.7461, "num_input_tokens_seen": 4076339200, "step": 15550 }, { "epoch": 0.07441239252537057, "grad_norm": 0.17217928171157837, "learning_rate": 0.001, "loss": 2.7394, "num_input_tokens_seen": 4089446400, "step": 15600 }, { "epoch": 0.0746508937834647, "grad_norm": 0.17169134318828583, "learning_rate": 0.001, "loss": 2.7474, "num_input_tokens_seen": 4102553600, "step": 15650 }, { "epoch": 0.07488939504155885, "grad_norm": 0.17074033617973328, "learning_rate": 0.001, "loss": 2.7405, "num_input_tokens_seen": 4115660800, "step": 15700 }, { "epoch": 0.07512789629965298, "grad_norm": 0.20199435949325562, "learning_rate": 0.001, "loss": 2.7412, "num_input_tokens_seen": 4128768000, "step": 15750 }, { "epoch": 0.07536639755774711, "grad_norm": 0.17569150030612946, "learning_rate": 0.001, "loss": 2.7279, "num_input_tokens_seen": 4141875200, "step": 15800 }, { "epoch": 0.07560489881584126, "grad_norm": 0.1753721386194229, "learning_rate": 0.001, "loss": 2.7442, "num_input_tokens_seen": 4154982400, "step": 15850 }, { "epoch": 0.07584340007393539, "grad_norm": 0.17356647551059723, "learning_rate": 0.001, "loss": 2.7447, "num_input_tokens_seen": 4168089600, "step": 15900 }, { "epoch": 0.07608190133202952, "grad_norm": 0.16931213438510895, "learning_rate": 0.001, "loss": 2.7194, "num_input_tokens_seen": 4181196800, "step": 15950 }, { "epoch": 0.07632040259012367, "grad_norm": 0.2109583616256714, "learning_rate": 0.001, "loss": 2.7271, "num_input_tokens_seen": 4194304000, "step": 16000 }, { "epoch": 0.07632040259012367, "eval_loss": 2.6222195625305176, "eval_runtime": 50.2121, "eval_samples_per_second": 99.578, "eval_steps_per_second": 24.894, "num_input_tokens_seen": 4194304000, "step": 16000 }, { "epoch": 0.0765589038482178, "grad_norm": 0.1729741096496582, "learning_rate": 0.001, "loss": 2.7273, "num_input_tokens_seen": 4207411200, "step": 16050 }, { "epoch": 0.07679740510631193, "grad_norm": 0.178414985537529, "learning_rate": 0.001, "loss": 2.7119, "num_input_tokens_seen": 4220518400, "step": 16100 }, { "epoch": 0.07703590636440608, "grad_norm": 0.16985353827476501, "learning_rate": 0.001, "loss": 2.7329, "num_input_tokens_seen": 4233625600, "step": 16150 }, { "epoch": 0.07727440762250021, "grad_norm": 0.1792905330657959, "learning_rate": 0.001, "loss": 2.7331, "num_input_tokens_seen": 4246732800, "step": 16200 }, { "epoch": 0.07751290888059434, "grad_norm": 0.17052733898162842, "learning_rate": 0.001, "loss": 2.7438, "num_input_tokens_seen": 4259840000, "step": 16250 }, { "epoch": 0.07775141013868848, "grad_norm": 0.18520629405975342, "learning_rate": 0.001, "loss": 2.7292, "num_input_tokens_seen": 4272947200, "step": 16300 }, { "epoch": 0.07798991139678262, "grad_norm": 0.18607158958911896, "learning_rate": 0.001, "loss": 2.7305, "num_input_tokens_seen": 4286054400, "step": 16350 }, { "epoch": 0.07822841265487676, "grad_norm": 0.1774805337190628, "learning_rate": 0.001, "loss": 2.7237, "num_input_tokens_seen": 4299161600, "step": 16400 }, { "epoch": 0.07846691391297089, "grad_norm": 0.17118123173713684, "learning_rate": 0.001, "loss": 2.736, "num_input_tokens_seen": 4312268800, "step": 16450 }, { "epoch": 0.07870541517106502, "grad_norm": 0.18550898134708405, "learning_rate": 0.001, "loss": 2.7237, "num_input_tokens_seen": 4325376000, "step": 16500 }, { "epoch": 0.07870541517106502, "eval_loss": 2.6178503036499023, "eval_runtime": 50.4959, "eval_samples_per_second": 99.018, "eval_steps_per_second": 24.754, "num_input_tokens_seen": 4325376000, "step": 16500 }, { "epoch": 0.07894391642915917, "grad_norm": 0.1839551031589508, "learning_rate": 0.001, "loss": 2.7312, "num_input_tokens_seen": 4338483200, "step": 16550 }, { "epoch": 0.0791824176872533, "grad_norm": 0.17430303990840912, "learning_rate": 0.001, "loss": 2.7138, "num_input_tokens_seen": 4351590400, "step": 16600 }, { "epoch": 0.07942091894534743, "grad_norm": 0.17208248376846313, "learning_rate": 0.001, "loss": 2.7459, "num_input_tokens_seen": 4364697600, "step": 16650 }, { "epoch": 0.07965942020344158, "grad_norm": 0.16932401061058044, "learning_rate": 0.001, "loss": 2.7358, "num_input_tokens_seen": 4377804800, "step": 16700 }, { "epoch": 0.07989792146153571, "grad_norm": 0.17707890272140503, "learning_rate": 0.001, "loss": 2.7169, "num_input_tokens_seen": 4390912000, "step": 16750 }, { "epoch": 0.08013642271962984, "grad_norm": 0.1669357717037201, "learning_rate": 0.001, "loss": 2.7296, "num_input_tokens_seen": 4404019200, "step": 16800 }, { "epoch": 0.08037492397772399, "grad_norm": 0.19266557693481445, "learning_rate": 0.001, "loss": 2.7187, "num_input_tokens_seen": 4417126400, "step": 16850 }, { "epoch": 0.08061342523581812, "grad_norm": 0.17670407891273499, "learning_rate": 0.001, "loss": 2.7339, "num_input_tokens_seen": 4430233600, "step": 16900 }, { "epoch": 0.08085192649391225, "grad_norm": 0.17866192758083344, "learning_rate": 0.001, "loss": 2.7167, "num_input_tokens_seen": 4443340800, "step": 16950 }, { "epoch": 0.0810904277520064, "grad_norm": 0.18247559666633606, "learning_rate": 0.001, "loss": 2.7151, "num_input_tokens_seen": 4456448000, "step": 17000 }, { "epoch": 0.0810904277520064, "eval_loss": 2.6127233505249023, "eval_runtime": 50.7555, "eval_samples_per_second": 98.512, "eval_steps_per_second": 24.628, "num_input_tokens_seen": 4456448000, "step": 17000 }, { "epoch": 0.08132892901010053, "grad_norm": 0.17702773213386536, "learning_rate": 0.001, "loss": 2.7303, "num_input_tokens_seen": 4469555200, "step": 17050 }, { "epoch": 0.08156743026819467, "grad_norm": 0.18900151550769806, "learning_rate": 0.001, "loss": 2.7286, "num_input_tokens_seen": 4482662400, "step": 17100 }, { "epoch": 0.0818059315262888, "grad_norm": 0.18566136062145233, "learning_rate": 0.001, "loss": 2.7304, "num_input_tokens_seen": 4495769600, "step": 17150 }, { "epoch": 0.08204443278438293, "grad_norm": 0.1759686917066574, "learning_rate": 0.001, "loss": 2.7179, "num_input_tokens_seen": 4508876800, "step": 17200 }, { "epoch": 0.08228293404247708, "grad_norm": 0.15799184143543243, "learning_rate": 0.001, "loss": 2.7367, "num_input_tokens_seen": 4521984000, "step": 17250 }, { "epoch": 0.08252143530057121, "grad_norm": 0.18740351498126984, "learning_rate": 0.001, "loss": 2.72, "num_input_tokens_seen": 4535091200, "step": 17300 }, { "epoch": 0.08275993655866534, "grad_norm": 0.17688381671905518, "learning_rate": 0.001, "loss": 2.7115, "num_input_tokens_seen": 4548198400, "step": 17350 }, { "epoch": 0.08299843781675949, "grad_norm": 0.1807299256324768, "learning_rate": 0.001, "loss": 2.7346, "num_input_tokens_seen": 4561305600, "step": 17400 }, { "epoch": 0.08323693907485362, "grad_norm": 0.17570430040359497, "learning_rate": 0.001, "loss": 2.7141, "num_input_tokens_seen": 4574412800, "step": 17450 }, { "epoch": 0.08347544033294775, "grad_norm": 0.16912159323692322, "learning_rate": 0.001, "loss": 2.7164, "num_input_tokens_seen": 4587520000, "step": 17500 }, { "epoch": 0.08347544033294775, "eval_loss": 2.6085972785949707, "eval_runtime": 50.619, "eval_samples_per_second": 98.777, "eval_steps_per_second": 24.694, "num_input_tokens_seen": 4587520000, "step": 17500 }, { "epoch": 0.0837139415910419, "grad_norm": 0.17684704065322876, "learning_rate": 0.001, "loss": 2.7293, "num_input_tokens_seen": 4600627200, "step": 17550 }, { "epoch": 0.08395244284913603, "grad_norm": 0.18020550906658173, "learning_rate": 0.001, "loss": 2.7124, "num_input_tokens_seen": 4613734400, "step": 17600 }, { "epoch": 0.08419094410723016, "grad_norm": 0.17311082780361176, "learning_rate": 0.001, "loss": 2.7047, "num_input_tokens_seen": 4626841600, "step": 17650 }, { "epoch": 0.0844294453653243, "grad_norm": 0.17366532981395721, "learning_rate": 0.001, "loss": 2.7316, "num_input_tokens_seen": 4639948800, "step": 17700 }, { "epoch": 0.08466794662341844, "grad_norm": 0.16526220738887787, "learning_rate": 0.001, "loss": 2.7212, "num_input_tokens_seen": 4653056000, "step": 17750 }, { "epoch": 0.08490644788151258, "grad_norm": 0.1746092140674591, "learning_rate": 0.001, "loss": 2.7288, "num_input_tokens_seen": 4666163200, "step": 17800 }, { "epoch": 0.08514494913960671, "grad_norm": 0.19404995441436768, "learning_rate": 0.001, "loss": 2.7129, "num_input_tokens_seen": 4679270400, "step": 17850 }, { "epoch": 0.08538345039770084, "grad_norm": 0.18850015103816986, "learning_rate": 0.001, "loss": 2.7161, "num_input_tokens_seen": 4692377600, "step": 17900 }, { "epoch": 0.08562195165579499, "grad_norm": 0.19126516580581665, "learning_rate": 0.001, "loss": 2.7206, "num_input_tokens_seen": 4705484800, "step": 17950 }, { "epoch": 0.08586045291388912, "grad_norm": 0.1802307665348053, "learning_rate": 0.001, "loss": 2.7163, "num_input_tokens_seen": 4718592000, "step": 18000 }, { "epoch": 0.08586045291388912, "eval_loss": 2.603686809539795, "eval_runtime": 50.6488, "eval_samples_per_second": 98.719, "eval_steps_per_second": 24.68, "num_input_tokens_seen": 4718592000, "step": 18000 }, { "epoch": 0.08609895417198325, "grad_norm": 0.18276441097259521, "learning_rate": 0.001, "loss": 2.7299, "num_input_tokens_seen": 4731699200, "step": 18050 }, { "epoch": 0.0863374554300774, "grad_norm": 0.17280028760433197, "learning_rate": 0.001, "loss": 2.7175, "num_input_tokens_seen": 4744806400, "step": 18100 }, { "epoch": 0.08657595668817153, "grad_norm": 0.17224080860614777, "learning_rate": 0.001, "loss": 2.7089, "num_input_tokens_seen": 4757913600, "step": 18150 }, { "epoch": 0.08681445794626566, "grad_norm": 0.17205391824245453, "learning_rate": 0.001, "loss": 2.7072, "num_input_tokens_seen": 4771020800, "step": 18200 }, { "epoch": 0.0870529592043598, "grad_norm": 0.1829432249069214, "learning_rate": 0.001, "loss": 2.6959, "num_input_tokens_seen": 4784128000, "step": 18250 }, { "epoch": 0.08729146046245394, "grad_norm": 0.1669514924287796, "learning_rate": 0.001, "loss": 2.7209, "num_input_tokens_seen": 4797235200, "step": 18300 }, { "epoch": 0.08752996172054807, "grad_norm": 0.18273359537124634, "learning_rate": 0.001, "loss": 2.6935, "num_input_tokens_seen": 4810342400, "step": 18350 }, { "epoch": 0.08776846297864221, "grad_norm": 0.21061965823173523, "learning_rate": 0.001, "loss": 2.7204, "num_input_tokens_seen": 4823449600, "step": 18400 }, { "epoch": 0.08800696423673635, "grad_norm": 0.17710614204406738, "learning_rate": 0.001, "loss": 2.7231, "num_input_tokens_seen": 4836556800, "step": 18450 }, { "epoch": 0.08824546549483049, "grad_norm": 0.17370648682117462, "learning_rate": 0.001, "loss": 2.7064, "num_input_tokens_seen": 4849664000, "step": 18500 }, { "epoch": 0.08824546549483049, "eval_loss": 2.600076913833618, "eval_runtime": 50.3596, "eval_samples_per_second": 99.286, "eval_steps_per_second": 24.822, "num_input_tokens_seen": 4849664000, "step": 18500 }, { "epoch": 0.08848396675292462, "grad_norm": 0.19398675858974457, "learning_rate": 0.001, "loss": 2.7118, "num_input_tokens_seen": 4862771200, "step": 18550 }, { "epoch": 0.08872246801101875, "grad_norm": 0.18522043526172638, "learning_rate": 0.001, "loss": 2.6998, "num_input_tokens_seen": 4875878400, "step": 18600 }, { "epoch": 0.0889609692691129, "grad_norm": 0.2682952880859375, "learning_rate": 0.001, "loss": 2.7057, "num_input_tokens_seen": 4888985600, "step": 18650 }, { "epoch": 0.08919947052720703, "grad_norm": 0.18555712699890137, "learning_rate": 0.001, "loss": 2.7127, "num_input_tokens_seen": 4902092800, "step": 18700 }, { "epoch": 0.08943797178530116, "grad_norm": 0.1940859854221344, "learning_rate": 0.001, "loss": 2.7054, "num_input_tokens_seen": 4915200000, "step": 18750 }, { "epoch": 0.08967647304339531, "grad_norm": 0.1800539344549179, "learning_rate": 0.001, "loss": 2.702, "num_input_tokens_seen": 4928307200, "step": 18800 }, { "epoch": 0.08991497430148944, "grad_norm": 0.19734695553779602, "learning_rate": 0.001, "loss": 2.7157, "num_input_tokens_seen": 4941414400, "step": 18850 }, { "epoch": 0.09015347555958357, "grad_norm": 0.16387026011943817, "learning_rate": 0.001, "loss": 2.7183, "num_input_tokens_seen": 4954521600, "step": 18900 }, { "epoch": 0.09039197681767772, "grad_norm": 0.19447770714759827, "learning_rate": 0.001, "loss": 2.7154, "num_input_tokens_seen": 4967628800, "step": 18950 }, { "epoch": 0.09063047807577185, "grad_norm": 0.17366571724414825, "learning_rate": 0.001, "loss": 2.6996, "num_input_tokens_seen": 4980736000, "step": 19000 }, { "epoch": 0.09063047807577185, "eval_loss": 2.5983569622039795, "eval_runtime": 50.8055, "eval_samples_per_second": 98.415, "eval_steps_per_second": 24.604, "num_input_tokens_seen": 4980736000, "step": 19000 }, { "epoch": 0.09086897933386599, "grad_norm": 0.1770928055047989, "learning_rate": 0.001, "loss": 2.7171, "num_input_tokens_seen": 4993843200, "step": 19050 }, { "epoch": 0.09110748059196012, "grad_norm": 0.18122689425945282, "learning_rate": 0.001, "loss": 2.7064, "num_input_tokens_seen": 5006950400, "step": 19100 }, { "epoch": 0.09134598185005426, "grad_norm": 0.19320747256278992, "learning_rate": 0.001, "loss": 2.7105, "num_input_tokens_seen": 5020057600, "step": 19150 }, { "epoch": 0.0915844831081484, "grad_norm": 0.19556616246700287, "learning_rate": 0.001, "loss": 2.7018, "num_input_tokens_seen": 5033164800, "step": 19200 }, { "epoch": 0.09182298436624253, "grad_norm": 0.18251653015613556, "learning_rate": 0.001, "loss": 2.7067, "num_input_tokens_seen": 5046272000, "step": 19250 }, { "epoch": 0.09206148562433666, "grad_norm": 0.17226757109165192, "learning_rate": 0.001, "loss": 2.6803, "num_input_tokens_seen": 5059379200, "step": 19300 }, { "epoch": 0.09229998688243081, "grad_norm": 0.18007858097553253, "learning_rate": 0.001, "loss": 2.6998, "num_input_tokens_seen": 5072486400, "step": 19350 }, { "epoch": 0.09253848814052494, "grad_norm": 0.1664605736732483, "learning_rate": 0.001, "loss": 2.6985, "num_input_tokens_seen": 5085593600, "step": 19400 }, { "epoch": 0.09277698939861907, "grad_norm": 0.17898677289485931, "learning_rate": 0.001, "loss": 2.7034, "num_input_tokens_seen": 5098700800, "step": 19450 }, { "epoch": 0.09301549065671322, "grad_norm": 0.16403160989284515, "learning_rate": 0.001, "loss": 2.7012, "num_input_tokens_seen": 5111808000, "step": 19500 }, { "epoch": 0.09301549065671322, "eval_loss": 2.594251871109009, "eval_runtime": 50.1924, "eval_samples_per_second": 99.617, "eval_steps_per_second": 24.904, "num_input_tokens_seen": 5111808000, "step": 19500 }, { "epoch": 0.09325399191480735, "grad_norm": 0.17973001301288605, "learning_rate": 0.001, "loss": 2.7039, "num_input_tokens_seen": 5124915200, "step": 19550 }, { "epoch": 0.09349249317290148, "grad_norm": 0.1667868047952652, "learning_rate": 0.001, "loss": 2.7038, "num_input_tokens_seen": 5138022400, "step": 19600 }, { "epoch": 0.09373099443099563, "grad_norm": 0.18338319659233093, "learning_rate": 0.001, "loss": 2.7138, "num_input_tokens_seen": 5151129600, "step": 19650 }, { "epoch": 0.09396949568908976, "grad_norm": 0.17962965369224548, "learning_rate": 0.001, "loss": 2.6994, "num_input_tokens_seen": 5164236800, "step": 19700 }, { "epoch": 0.0942079969471839, "grad_norm": 0.17233812808990479, "learning_rate": 0.001, "loss": 2.7073, "num_input_tokens_seen": 5177344000, "step": 19750 }, { "epoch": 0.09444649820527803, "grad_norm": 0.16720129549503326, "learning_rate": 0.001, "loss": 2.7146, "num_input_tokens_seen": 5190451200, "step": 19800 }, { "epoch": 0.09468499946337217, "grad_norm": 0.1732376664876938, "learning_rate": 0.001, "loss": 2.7126, "num_input_tokens_seen": 5203558400, "step": 19850 }, { "epoch": 0.09492350072146631, "grad_norm": 0.17245380580425262, "learning_rate": 0.001, "loss": 2.7054, "num_input_tokens_seen": 5216665600, "step": 19900 }, { "epoch": 0.09516200197956044, "grad_norm": 0.17415107786655426, "learning_rate": 0.001, "loss": 2.7027, "num_input_tokens_seen": 5229772800, "step": 19950 }, { "epoch": 0.09540050323765457, "grad_norm": 0.1747124344110489, "learning_rate": 0.001, "loss": 2.6975, "num_input_tokens_seen": 5242880000, "step": 20000 }, { "epoch": 0.09540050323765457, "eval_loss": 2.5900332927703857, "eval_runtime": 50.5898, "eval_samples_per_second": 98.834, "eval_steps_per_second": 24.709, "num_input_tokens_seen": 5242880000, "step": 20000 }, { "epoch": 0.09563900449574872, "grad_norm": 0.17750214040279388, "learning_rate": 0.001, "loss": 2.7, "num_input_tokens_seen": 5255987200, "step": 20050 }, { "epoch": 0.09587750575384285, "grad_norm": 0.16490615904331207, "learning_rate": 0.001, "loss": 2.7188, "num_input_tokens_seen": 5269094400, "step": 20100 }, { "epoch": 0.09611600701193698, "grad_norm": 0.20347309112548828, "learning_rate": 0.001, "loss": 2.7034, "num_input_tokens_seen": 5282201600, "step": 20150 }, { "epoch": 0.09635450827003113, "grad_norm": 0.19717667996883392, "learning_rate": 0.001, "loss": 2.6864, "num_input_tokens_seen": 5295308800, "step": 20200 }, { "epoch": 0.09659300952812526, "grad_norm": 0.17054997384548187, "learning_rate": 0.001, "loss": 2.7068, "num_input_tokens_seen": 5308416000, "step": 20250 }, { "epoch": 0.09683151078621939, "grad_norm": 0.1771887093782425, "learning_rate": 0.001, "loss": 2.7037, "num_input_tokens_seen": 5321523200, "step": 20300 }, { "epoch": 0.09707001204431354, "grad_norm": 0.17556501924991608, "learning_rate": 0.001, "loss": 2.705, "num_input_tokens_seen": 5334630400, "step": 20350 }, { "epoch": 0.09730851330240767, "grad_norm": 0.1696256399154663, "learning_rate": 0.001, "loss": 2.7109, "num_input_tokens_seen": 5347737600, "step": 20400 }, { "epoch": 0.09754701456050181, "grad_norm": 0.18629619479179382, "learning_rate": 0.001, "loss": 2.7043, "num_input_tokens_seen": 5360844800, "step": 20450 }, { "epoch": 0.09778551581859594, "grad_norm": 0.18701018393039703, "learning_rate": 0.001, "loss": 2.7002, "num_input_tokens_seen": 5373952000, "step": 20500 }, { "epoch": 0.09778551581859594, "eval_loss": 2.587498903274536, "eval_runtime": 51.1986, "eval_samples_per_second": 97.659, "eval_steps_per_second": 24.415, "num_input_tokens_seen": 5373952000, "step": 20500 }, { "epoch": 0.09802401707669008, "grad_norm": 0.1792842447757721, "learning_rate": 0.001, "loss": 2.7083, "num_input_tokens_seen": 5387059200, "step": 20550 }, { "epoch": 0.09826251833478422, "grad_norm": 0.18761058151721954, "learning_rate": 0.001, "loss": 2.6898, "num_input_tokens_seen": 5400166400, "step": 20600 }, { "epoch": 0.09850101959287835, "grad_norm": 0.1827591508626938, "learning_rate": 0.001, "loss": 2.6976, "num_input_tokens_seen": 5413273600, "step": 20650 }, { "epoch": 0.09873952085097248, "grad_norm": 0.16178373992443085, "learning_rate": 0.001, "loss": 2.7029, "num_input_tokens_seen": 5426380800, "step": 20700 }, { "epoch": 0.09897802210906663, "grad_norm": 0.1880313903093338, "learning_rate": 0.001, "loss": 2.6867, "num_input_tokens_seen": 5439488000, "step": 20750 }, { "epoch": 0.09921652336716076, "grad_norm": 0.17611584067344666, "learning_rate": 0.001, "loss": 2.6741, "num_input_tokens_seen": 5452595200, "step": 20800 }, { "epoch": 0.09945502462525489, "grad_norm": 0.17712561786174774, "learning_rate": 0.001, "loss": 2.7099, "num_input_tokens_seen": 5465702400, "step": 20850 }, { "epoch": 0.09969352588334904, "grad_norm": 0.18022434413433075, "learning_rate": 0.001, "loss": 2.6988, "num_input_tokens_seen": 5478809600, "step": 20900 }, { "epoch": 0.09993202714144317, "grad_norm": 0.17434161901474, "learning_rate": 0.001, "loss": 2.6869, "num_input_tokens_seen": 5491916800, "step": 20950 }, { "epoch": 0.1001705283995373, "grad_norm": 0.17802472412586212, "learning_rate": 0.001, "loss": 2.6935, "num_input_tokens_seen": 5505024000, "step": 21000 }, { "epoch": 0.1001705283995373, "eval_loss": 2.5838851928710938, "eval_runtime": 50.1977, "eval_samples_per_second": 99.606, "eval_steps_per_second": 24.902, "num_input_tokens_seen": 5505024000, "step": 21000 }, { "epoch": 0.10040902965763145, "grad_norm": 0.1723284274339676, "learning_rate": 0.001, "loss": 2.694, "num_input_tokens_seen": 5518131200, "step": 21050 }, { "epoch": 0.10064753091572558, "grad_norm": 0.1627894937992096, "learning_rate": 0.001, "loss": 2.6866, "num_input_tokens_seen": 5531238400, "step": 21100 }, { "epoch": 0.10088603217381972, "grad_norm": 0.20949719846248627, "learning_rate": 0.001, "loss": 2.6915, "num_input_tokens_seen": 5544345600, "step": 21150 }, { "epoch": 0.10112453343191385, "grad_norm": 0.1980736404657364, "learning_rate": 0.001, "loss": 2.7076, "num_input_tokens_seen": 5557452800, "step": 21200 }, { "epoch": 0.10136303469000799, "grad_norm": 0.20961201190948486, "learning_rate": 0.001, "loss": 2.6978, "num_input_tokens_seen": 5570560000, "step": 21250 }, { "epoch": 0.10160153594810213, "grad_norm": 0.18137700855731964, "learning_rate": 0.001, "loss": 2.7029, "num_input_tokens_seen": 5583667200, "step": 21300 }, { "epoch": 0.10184003720619626, "grad_norm": 0.17235560715198517, "learning_rate": 0.001, "loss": 2.6979, "num_input_tokens_seen": 5596774400, "step": 21350 }, { "epoch": 0.1020785384642904, "grad_norm": 0.17818449437618256, "learning_rate": 0.001, "loss": 2.6987, "num_input_tokens_seen": 5609881600, "step": 21400 }, { "epoch": 0.10231703972238454, "grad_norm": 0.1798463761806488, "learning_rate": 0.001, "loss": 2.693, "num_input_tokens_seen": 5622988800, "step": 21450 }, { "epoch": 0.10255554098047867, "grad_norm": 0.19028444588184357, "learning_rate": 0.001, "loss": 2.7079, "num_input_tokens_seen": 5636096000, "step": 21500 }, { "epoch": 0.10255554098047867, "eval_loss": 2.5816380977630615, "eval_runtime": 50.7808, "eval_samples_per_second": 98.462, "eval_steps_per_second": 24.616, "num_input_tokens_seen": 5636096000, "step": 21500 }, { "epoch": 0.1027940422385728, "grad_norm": 0.1831275075674057, "learning_rate": 0.001, "loss": 2.7038, "num_input_tokens_seen": 5649203200, "step": 21550 }, { "epoch": 0.10303254349666695, "grad_norm": 0.17404012382030487, "learning_rate": 0.001, "loss": 2.7071, "num_input_tokens_seen": 5662310400, "step": 21600 }, { "epoch": 0.10327104475476108, "grad_norm": 0.1652098149061203, "learning_rate": 0.001, "loss": 2.7033, "num_input_tokens_seen": 5675417600, "step": 21650 }, { "epoch": 0.10350954601285522, "grad_norm": 0.1914501190185547, "learning_rate": 0.001, "loss": 2.6844, "num_input_tokens_seen": 5688524800, "step": 21700 }, { "epoch": 0.10374804727094936, "grad_norm": 0.19169588387012482, "learning_rate": 0.001, "loss": 2.6793, "num_input_tokens_seen": 5701632000, "step": 21750 }, { "epoch": 0.10398654852904349, "grad_norm": 0.17937491834163666, "learning_rate": 0.001, "loss": 2.6972, "num_input_tokens_seen": 5714739200, "step": 21800 }, { "epoch": 0.10422504978713763, "grad_norm": 0.17515376210212708, "learning_rate": 0.001, "loss": 2.6995, "num_input_tokens_seen": 5727846400, "step": 21850 }, { "epoch": 0.10446355104523176, "grad_norm": 0.18881027400493622, "learning_rate": 0.001, "loss": 2.7097, "num_input_tokens_seen": 5740953600, "step": 21900 }, { "epoch": 0.1047020523033259, "grad_norm": 0.19030135869979858, "learning_rate": 0.001, "loss": 2.6801, "num_input_tokens_seen": 5754060800, "step": 21950 }, { "epoch": 0.10494055356142004, "grad_norm": 0.17325563728809357, "learning_rate": 0.001, "loss": 2.6803, "num_input_tokens_seen": 5767168000, "step": 22000 }, { "epoch": 0.10494055356142004, "eval_loss": 2.577341318130493, "eval_runtime": 50.8482, "eval_samples_per_second": 98.332, "eval_steps_per_second": 24.583, "num_input_tokens_seen": 5767168000, "step": 22000 }, { "epoch": 0.10517905481951417, "grad_norm": 0.19298380613327026, "learning_rate": 0.001, "loss": 2.6966, "num_input_tokens_seen": 5780275200, "step": 22050 }, { "epoch": 0.1054175560776083, "grad_norm": 0.1772100180387497, "learning_rate": 0.001, "loss": 2.6851, "num_input_tokens_seen": 5793382400, "step": 22100 }, { "epoch": 0.10565605733570245, "grad_norm": 0.18548481166362762, "learning_rate": 0.001, "loss": 2.7028, "num_input_tokens_seen": 5806489600, "step": 22150 }, { "epoch": 0.10589455859379658, "grad_norm": 0.20102089643478394, "learning_rate": 0.001, "loss": 2.6915, "num_input_tokens_seen": 5819596800, "step": 22200 }, { "epoch": 0.10613305985189071, "grad_norm": 0.1833849996328354, "learning_rate": 0.001, "loss": 2.6872, "num_input_tokens_seen": 5832704000, "step": 22250 }, { "epoch": 0.10637156110998486, "grad_norm": 0.17730027437210083, "learning_rate": 0.001, "loss": 2.6811, "num_input_tokens_seen": 5845811200, "step": 22300 }, { "epoch": 0.10661006236807899, "grad_norm": 0.1818256825208664, "learning_rate": 0.001, "loss": 2.7, "num_input_tokens_seen": 5858918400, "step": 22350 }, { "epoch": 0.10684856362617313, "grad_norm": 0.16850312054157257, "learning_rate": 0.001, "loss": 2.6927, "num_input_tokens_seen": 5872025600, "step": 22400 }, { "epoch": 0.10708706488426727, "grad_norm": 0.209822878241539, "learning_rate": 0.001, "loss": 2.6881, "num_input_tokens_seen": 5885132800, "step": 22450 }, { "epoch": 0.1073255661423614, "grad_norm": 0.2131560891866684, "learning_rate": 0.001, "loss": 2.6797, "num_input_tokens_seen": 5898240000, "step": 22500 }, { "epoch": 0.1073255661423614, "eval_loss": 2.575388193130493, "eval_runtime": 50.9696, "eval_samples_per_second": 98.098, "eval_steps_per_second": 24.524, "num_input_tokens_seen": 5898240000, "step": 22500 }, { "epoch": 0.10756406740045554, "grad_norm": 0.18200135231018066, "learning_rate": 0.001, "loss": 2.6837, "num_input_tokens_seen": 5911347200, "step": 22550 }, { "epoch": 0.10780256865854967, "grad_norm": 0.1830984354019165, "learning_rate": 0.001, "loss": 2.7159, "num_input_tokens_seen": 5924454400, "step": 22600 }, { "epoch": 0.1080410699166438, "grad_norm": 0.1700614094734192, "learning_rate": 0.001, "loss": 2.6852, "num_input_tokens_seen": 5937561600, "step": 22650 }, { "epoch": 0.10827957117473795, "grad_norm": 0.18473868072032928, "learning_rate": 0.001, "loss": 2.6857, "num_input_tokens_seen": 5950668800, "step": 22700 }, { "epoch": 0.10851807243283208, "grad_norm": 0.19345365464687347, "learning_rate": 0.001, "loss": 2.69, "num_input_tokens_seen": 5963776000, "step": 22750 }, { "epoch": 0.10875657369092621, "grad_norm": 0.18807141482830048, "learning_rate": 0.001, "loss": 2.6897, "num_input_tokens_seen": 5976883200, "step": 22800 }, { "epoch": 0.10899507494902036, "grad_norm": 0.18426446616649628, "learning_rate": 0.001, "loss": 2.6855, "num_input_tokens_seen": 5989990400, "step": 22850 }, { "epoch": 0.10923357620711449, "grad_norm": 0.19184571504592896, "learning_rate": 0.001, "loss": 2.6914, "num_input_tokens_seen": 6003097600, "step": 22900 }, { "epoch": 0.10947207746520862, "grad_norm": 0.22897471487522125, "learning_rate": 0.001, "loss": 2.6812, "num_input_tokens_seen": 6016204800, "step": 22950 }, { "epoch": 0.10971057872330277, "grad_norm": 0.1939724087715149, "learning_rate": 0.001, "loss": 2.6836, "num_input_tokens_seen": 6029312000, "step": 23000 }, { "epoch": 0.10971057872330277, "eval_loss": 2.570572853088379, "eval_runtime": 50.0606, "eval_samples_per_second": 99.879, "eval_steps_per_second": 24.97, "num_input_tokens_seen": 6029312000, "step": 23000 }, { "epoch": 0.1099490799813969, "grad_norm": 0.17564797401428223, "learning_rate": 0.001, "loss": 2.6912, "num_input_tokens_seen": 6042419200, "step": 23050 }, { "epoch": 0.11018758123949104, "grad_norm": 0.17937473952770233, "learning_rate": 0.001, "loss": 2.6708, "num_input_tokens_seen": 6055526400, "step": 23100 }, { "epoch": 0.11042608249758518, "grad_norm": 0.18281136453151703, "learning_rate": 0.001, "loss": 2.6855, "num_input_tokens_seen": 6068633600, "step": 23150 }, { "epoch": 0.11066458375567931, "grad_norm": 0.18834726512432098, "learning_rate": 0.001, "loss": 2.6887, "num_input_tokens_seen": 6081740800, "step": 23200 }, { "epoch": 0.11090308501377345, "grad_norm": 0.2104720175266266, "learning_rate": 0.001, "loss": 2.6914, "num_input_tokens_seen": 6094848000, "step": 23250 }, { "epoch": 0.11114158627186758, "grad_norm": 0.18674172461032867, "learning_rate": 0.001, "loss": 2.6855, "num_input_tokens_seen": 6107955200, "step": 23300 }, { "epoch": 0.11138008752996172, "grad_norm": 0.19519701600074768, "learning_rate": 0.001, "loss": 2.6851, "num_input_tokens_seen": 6121062400, "step": 23350 }, { "epoch": 0.11161858878805586, "grad_norm": 0.1752537339925766, "learning_rate": 0.001, "loss": 2.692, "num_input_tokens_seen": 6134169600, "step": 23400 }, { "epoch": 0.11185709004614999, "grad_norm": 0.1786031723022461, "learning_rate": 0.001, "loss": 2.6785, "num_input_tokens_seen": 6147276800, "step": 23450 }, { "epoch": 0.11209559130424412, "grad_norm": 0.19057604670524597, "learning_rate": 0.001, "loss": 2.6798, "num_input_tokens_seen": 6160384000, "step": 23500 }, { "epoch": 0.11209559130424412, "eval_loss": 2.5710463523864746, "eval_runtime": 50.3332, "eval_samples_per_second": 99.338, "eval_steps_per_second": 24.835, "num_input_tokens_seen": 6160384000, "step": 23500 }, { "epoch": 0.11233409256233827, "grad_norm": 0.18272963166236877, "learning_rate": 0.001, "loss": 2.6847, "num_input_tokens_seen": 6173491200, "step": 23550 }, { "epoch": 0.1125725938204324, "grad_norm": 0.1666375696659088, "learning_rate": 0.001, "loss": 2.6747, "num_input_tokens_seen": 6186598400, "step": 23600 }, { "epoch": 0.11281109507852653, "grad_norm": 0.1688246876001358, "learning_rate": 0.001, "loss": 2.6963, "num_input_tokens_seen": 6199705600, "step": 23650 }, { "epoch": 0.11304959633662068, "grad_norm": 0.1970459669828415, "learning_rate": 0.001, "loss": 2.6904, "num_input_tokens_seen": 6212812800, "step": 23700 }, { "epoch": 0.11328809759471481, "grad_norm": 0.19660720229148865, "learning_rate": 0.001, "loss": 2.6833, "num_input_tokens_seen": 6225920000, "step": 23750 }, { "epoch": 0.11352659885280895, "grad_norm": 0.18711698055267334, "learning_rate": 0.001, "loss": 2.6872, "num_input_tokens_seen": 6239027200, "step": 23800 }, { "epoch": 0.11376510011090309, "grad_norm": 0.1878872513771057, "learning_rate": 0.001, "loss": 2.6884, "num_input_tokens_seen": 6252134400, "step": 23850 }, { "epoch": 0.11400360136899722, "grad_norm": 0.1969616860151291, "learning_rate": 0.001, "loss": 2.6982, "num_input_tokens_seen": 6265241600, "step": 23900 }, { "epoch": 0.11424210262709136, "grad_norm": 0.19693812727928162, "learning_rate": 0.001, "loss": 2.6782, "num_input_tokens_seen": 6278348800, "step": 23950 }, { "epoch": 0.1144806038851855, "grad_norm": 0.1731441468000412, "learning_rate": 0.001, "loss": 2.6917, "num_input_tokens_seen": 6291456000, "step": 24000 }, { "epoch": 0.1144806038851855, "eval_loss": 2.5664987564086914, "eval_runtime": 50.3452, "eval_samples_per_second": 99.314, "eval_steps_per_second": 24.829, "num_input_tokens_seen": 6291456000, "step": 24000 }, { "epoch": 0.11471910514327963, "grad_norm": 0.1724429428577423, "learning_rate": 0.001, "loss": 2.6806, "num_input_tokens_seen": 6304563200, "step": 24050 }, { "epoch": 0.11495760640137377, "grad_norm": 0.20449388027191162, "learning_rate": 0.001, "loss": 2.6873, "num_input_tokens_seen": 6317670400, "step": 24100 }, { "epoch": 0.1151961076594679, "grad_norm": 0.19024738669395447, "learning_rate": 0.001, "loss": 2.6811, "num_input_tokens_seen": 6330777600, "step": 24150 }, { "epoch": 0.11543460891756203, "grad_norm": 0.20510025322437286, "learning_rate": 0.001, "loss": 2.6643, "num_input_tokens_seen": 6343884800, "step": 24200 }, { "epoch": 0.11567311017565618, "grad_norm": 0.1783556044101715, "learning_rate": 0.001, "loss": 2.6709, "num_input_tokens_seen": 6356992000, "step": 24250 }, { "epoch": 0.11591161143375031, "grad_norm": 0.1771089732646942, "learning_rate": 0.001, "loss": 2.6677, "num_input_tokens_seen": 6370099200, "step": 24300 }, { "epoch": 0.11615011269184444, "grad_norm": 0.17016734182834625, "learning_rate": 0.001, "loss": 2.6681, "num_input_tokens_seen": 6383206400, "step": 24350 }, { "epoch": 0.11638861394993859, "grad_norm": 0.1901489496231079, "learning_rate": 0.001, "loss": 2.6811, "num_input_tokens_seen": 6396313600, "step": 24400 }, { "epoch": 0.11662711520803272, "grad_norm": 0.18185457587242126, "learning_rate": 0.001, "loss": 2.6787, "num_input_tokens_seen": 6409420800, "step": 24450 }, { "epoch": 0.11686561646612686, "grad_norm": 0.1789853274822235, "learning_rate": 0.001, "loss": 2.6657, "num_input_tokens_seen": 6422528000, "step": 24500 }, { "epoch": 0.11686561646612686, "eval_loss": 2.564084768295288, "eval_runtime": 50.4559, "eval_samples_per_second": 99.096, "eval_steps_per_second": 24.774, "num_input_tokens_seen": 6422528000, "step": 24500 }, { "epoch": 0.117104117724221, "grad_norm": 0.17294436693191528, "learning_rate": 0.001, "loss": 2.6812, "num_input_tokens_seen": 6435635200, "step": 24550 }, { "epoch": 0.11734261898231513, "grad_norm": 0.1840251386165619, "learning_rate": 0.001, "loss": 2.6599, "num_input_tokens_seen": 6448742400, "step": 24600 }, { "epoch": 0.11758112024040927, "grad_norm": 0.17588932812213898, "learning_rate": 0.001, "loss": 2.6742, "num_input_tokens_seen": 6461849600, "step": 24650 }, { "epoch": 0.1178196214985034, "grad_norm": 0.1805667132139206, "learning_rate": 0.001, "loss": 2.6647, "num_input_tokens_seen": 6474956800, "step": 24700 }, { "epoch": 0.11805812275659754, "grad_norm": 0.17930665612220764, "learning_rate": 0.001, "loss": 2.6763, "num_input_tokens_seen": 6488064000, "step": 24750 }, { "epoch": 0.11829662401469168, "grad_norm": 0.19195732474327087, "learning_rate": 0.001, "loss": 2.6716, "num_input_tokens_seen": 6501171200, "step": 24800 }, { "epoch": 0.11853512527278581, "grad_norm": 0.19274356961250305, "learning_rate": 0.001, "loss": 2.6702, "num_input_tokens_seen": 6514278400, "step": 24850 }, { "epoch": 0.11877362653087994, "grad_norm": 0.17423510551452637, "learning_rate": 0.001, "loss": 2.6733, "num_input_tokens_seen": 6527385600, "step": 24900 }, { "epoch": 0.11901212778897409, "grad_norm": 0.20267954468727112, "learning_rate": 0.001, "loss": 2.6649, "num_input_tokens_seen": 6540492800, "step": 24950 }, { "epoch": 0.11925062904706822, "grad_norm": 0.1756502240896225, "learning_rate": 0.001, "loss": 2.6582, "num_input_tokens_seen": 6553600000, "step": 25000 }, { "epoch": 0.11925062904706822, "eval_loss": 2.562955379486084, "eval_runtime": 50.0071, "eval_samples_per_second": 99.986, "eval_steps_per_second": 24.996, "num_input_tokens_seen": 6553600000, "step": 25000 }, { "epoch": 0.11948913030516237, "grad_norm": 0.19173742830753326, "learning_rate": 0.001, "loss": 2.6871, "num_input_tokens_seen": 6566707200, "step": 25050 }, { "epoch": 0.1197276315632565, "grad_norm": 0.1746075600385666, "learning_rate": 0.001, "loss": 2.7003, "num_input_tokens_seen": 6579814400, "step": 25100 }, { "epoch": 0.11996613282135063, "grad_norm": 0.17817530035972595, "learning_rate": 0.001, "loss": 2.6944, "num_input_tokens_seen": 6592921600, "step": 25150 }, { "epoch": 0.12020463407944477, "grad_norm": 0.201807901263237, "learning_rate": 0.001, "loss": 2.6766, "num_input_tokens_seen": 6606028800, "step": 25200 }, { "epoch": 0.1204431353375389, "grad_norm": 0.18620917201042175, "learning_rate": 0.001, "loss": 2.6802, "num_input_tokens_seen": 6619136000, "step": 25250 }, { "epoch": 0.12068163659563304, "grad_norm": 0.17383818328380585, "learning_rate": 0.001, "loss": 2.6698, "num_input_tokens_seen": 6632243200, "step": 25300 }, { "epoch": 0.12092013785372718, "grad_norm": 0.1766287237405777, "learning_rate": 0.001, "loss": 2.6705, "num_input_tokens_seen": 6645350400, "step": 25350 }, { "epoch": 0.12115863911182131, "grad_norm": 0.19551052153110504, "learning_rate": 0.001, "loss": 2.678, "num_input_tokens_seen": 6658457600, "step": 25400 }, { "epoch": 0.12139714036991545, "grad_norm": 0.18625982105731964, "learning_rate": 0.001, "loss": 2.6688, "num_input_tokens_seen": 6671564800, "step": 25450 }, { "epoch": 0.12163564162800959, "grad_norm": 0.18274050951004028, "learning_rate": 0.001, "loss": 2.6818, "num_input_tokens_seen": 6684672000, "step": 25500 }, { "epoch": 0.12163564162800959, "eval_loss": 2.5602569580078125, "eval_runtime": 50.4187, "eval_samples_per_second": 99.17, "eval_steps_per_second": 24.792, "num_input_tokens_seen": 6684672000, "step": 25500 }, { "epoch": 0.12187414288610372, "grad_norm": 0.18547837436199188, "learning_rate": 0.001, "loss": 2.6754, "num_input_tokens_seen": 6697779200, "step": 25550 }, { "epoch": 0.12211264414419785, "grad_norm": 0.18558937311172485, "learning_rate": 0.001, "loss": 2.6767, "num_input_tokens_seen": 6710886400, "step": 25600 }, { "epoch": 0.122351145402292, "grad_norm": 0.17276135087013245, "learning_rate": 0.001, "loss": 2.6775, "num_input_tokens_seen": 6723993600, "step": 25650 }, { "epoch": 0.12258964666038613, "grad_norm": 0.18483039736747742, "learning_rate": 0.001, "loss": 2.6818, "num_input_tokens_seen": 6737100800, "step": 25700 }, { "epoch": 0.12282814791848028, "grad_norm": 0.18036937713623047, "learning_rate": 0.001, "loss": 2.6669, "num_input_tokens_seen": 6750208000, "step": 25750 }, { "epoch": 0.12306664917657441, "grad_norm": 0.1728815734386444, "learning_rate": 0.001, "loss": 2.6789, "num_input_tokens_seen": 6763315200, "step": 25800 }, { "epoch": 0.12330515043466854, "grad_norm": 0.19193877279758453, "learning_rate": 0.001, "loss": 2.6487, "num_input_tokens_seen": 6776422400, "step": 25850 }, { "epoch": 0.12354365169276268, "grad_norm": 0.1584886610507965, "learning_rate": 0.001, "loss": 2.6638, "num_input_tokens_seen": 6789529600, "step": 25900 }, { "epoch": 0.12378215295085682, "grad_norm": 0.18792498111724854, "learning_rate": 0.001, "loss": 2.6754, "num_input_tokens_seen": 6802636800, "step": 25950 }, { "epoch": 0.12402065420895095, "grad_norm": 0.1689581423997879, "learning_rate": 0.001, "loss": 2.6682, "num_input_tokens_seen": 6815744000, "step": 26000 }, { "epoch": 0.12402065420895095, "eval_loss": 2.5587522983551025, "eval_runtime": 50.7858, "eval_samples_per_second": 98.453, "eval_steps_per_second": 24.613, "num_input_tokens_seen": 6815744000, "step": 26000 }, { "epoch": 0.1242591554670451, "grad_norm": 0.18573056161403656, "learning_rate": 0.001, "loss": 2.6565, "num_input_tokens_seen": 6828851200, "step": 26050 }, { "epoch": 0.12449765672513922, "grad_norm": 0.19160890579223633, "learning_rate": 0.001, "loss": 2.6797, "num_input_tokens_seen": 6841958400, "step": 26100 }, { "epoch": 0.12473615798323336, "grad_norm": 0.18323373794555664, "learning_rate": 0.001, "loss": 2.6602, "num_input_tokens_seen": 6855065600, "step": 26150 }, { "epoch": 0.1249746592413275, "grad_norm": 0.17691807448863983, "learning_rate": 0.001, "loss": 2.6676, "num_input_tokens_seen": 6868172800, "step": 26200 }, { "epoch": 0.12521316049942163, "grad_norm": 0.20718660950660706, "learning_rate": 0.001, "loss": 2.6588, "num_input_tokens_seen": 6881280000, "step": 26250 }, { "epoch": 0.12545166175751576, "grad_norm": 0.17811058461666107, "learning_rate": 0.001, "loss": 2.6754, "num_input_tokens_seen": 6894387200, "step": 26300 }, { "epoch": 0.1256901630156099, "grad_norm": 0.17490555346012115, "learning_rate": 0.001, "loss": 2.6605, "num_input_tokens_seen": 6907494400, "step": 26350 }, { "epoch": 0.12592866427370406, "grad_norm": 0.17391368746757507, "learning_rate": 0.001, "loss": 2.684, "num_input_tokens_seen": 6920601600, "step": 26400 }, { "epoch": 0.1261671655317982, "grad_norm": 0.16951416432857513, "learning_rate": 0.001, "loss": 2.6685, "num_input_tokens_seen": 6933708800, "step": 26450 }, { "epoch": 0.12640566678989232, "grad_norm": 0.17574581503868103, "learning_rate": 0.001, "loss": 2.6665, "num_input_tokens_seen": 6946816000, "step": 26500 }, { "epoch": 0.12640566678989232, "eval_loss": 2.556109666824341, "eval_runtime": 50.8743, "eval_samples_per_second": 98.281, "eval_steps_per_second": 24.57, "num_input_tokens_seen": 6946816000, "step": 26500 }, { "epoch": 0.12664416804798645, "grad_norm": 0.19910745322704315, "learning_rate": 0.001, "loss": 2.6755, "num_input_tokens_seen": 6959923200, "step": 26550 }, { "epoch": 0.12688266930608058, "grad_norm": 0.20141273736953735, "learning_rate": 0.001, "loss": 2.6798, "num_input_tokens_seen": 6973030400, "step": 26600 }, { "epoch": 0.1271211705641747, "grad_norm": 0.1732529103755951, "learning_rate": 0.001, "loss": 2.6606, "num_input_tokens_seen": 6986137600, "step": 26650 }, { "epoch": 0.12735967182226887, "grad_norm": 0.17546698451042175, "learning_rate": 0.001, "loss": 2.6717, "num_input_tokens_seen": 6999244800, "step": 26700 }, { "epoch": 0.127598173080363, "grad_norm": 0.2186097502708435, "learning_rate": 0.001, "loss": 2.6702, "num_input_tokens_seen": 7012352000, "step": 26750 }, { "epoch": 0.12783667433845713, "grad_norm": 0.1735202819108963, "learning_rate": 0.001, "loss": 2.6795, "num_input_tokens_seen": 7025459200, "step": 26800 }, { "epoch": 0.12807517559655127, "grad_norm": 0.40701860189437866, "learning_rate": 0.001, "loss": 2.6591, "num_input_tokens_seen": 7038566400, "step": 26850 }, { "epoch": 0.1283136768546454, "grad_norm": 0.19710049033164978, "learning_rate": 0.001, "loss": 2.6841, "num_input_tokens_seen": 7051673600, "step": 26900 }, { "epoch": 0.12855217811273956, "grad_norm": 0.18638554215431213, "learning_rate": 0.001, "loss": 2.6718, "num_input_tokens_seen": 7064780800, "step": 26950 }, { "epoch": 0.1287906793708337, "grad_norm": 0.17546561360359192, "learning_rate": 0.001, "loss": 2.6547, "num_input_tokens_seen": 7077888000, "step": 27000 }, { "epoch": 0.1287906793708337, "eval_loss": 2.551922559738159, "eval_runtime": 50.1864, "eval_samples_per_second": 99.629, "eval_steps_per_second": 24.907, "num_input_tokens_seen": 7077888000, "step": 27000 }, { "epoch": 0.12902918062892782, "grad_norm": 0.1790401190519333, "learning_rate": 0.001, "loss": 2.672, "num_input_tokens_seen": 7090995200, "step": 27050 }, { "epoch": 0.12926768188702195, "grad_norm": 0.18173836171627045, "learning_rate": 0.001, "loss": 2.6563, "num_input_tokens_seen": 7104102400, "step": 27100 }, { "epoch": 0.12950618314511608, "grad_norm": 0.1827983856201172, "learning_rate": 0.001, "loss": 2.665, "num_input_tokens_seen": 7117209600, "step": 27150 }, { "epoch": 0.12974468440321021, "grad_norm": 0.20252254605293274, "learning_rate": 0.001, "loss": 2.675, "num_input_tokens_seen": 7130316800, "step": 27200 }, { "epoch": 0.12998318566130437, "grad_norm": 0.18492095172405243, "learning_rate": 0.001, "loss": 2.6801, "num_input_tokens_seen": 7143424000, "step": 27250 }, { "epoch": 0.1302216869193985, "grad_norm": 0.1962280571460724, "learning_rate": 0.001, "loss": 2.6551, "num_input_tokens_seen": 7156531200, "step": 27300 }, { "epoch": 0.13046018817749264, "grad_norm": 0.18813727796077728, "learning_rate": 0.001, "loss": 2.6728, "num_input_tokens_seen": 7169638400, "step": 27350 }, { "epoch": 0.13069868943558677, "grad_norm": 0.18111565709114075, "learning_rate": 0.001, "loss": 2.6743, "num_input_tokens_seen": 7182745600, "step": 27400 }, { "epoch": 0.1309371906936809, "grad_norm": 0.1727459728717804, "learning_rate": 0.001, "loss": 2.6596, "num_input_tokens_seen": 7195852800, "step": 27450 }, { "epoch": 0.13117569195177506, "grad_norm": 0.20097768306732178, "learning_rate": 0.001, "loss": 2.6651, "num_input_tokens_seen": 7208960000, "step": 27500 }, { "epoch": 0.13117569195177506, "eval_loss": 2.5501132011413574, "eval_runtime": 50.3677, "eval_samples_per_second": 99.27, "eval_steps_per_second": 24.817, "num_input_tokens_seen": 7208960000, "step": 27500 }, { "epoch": 0.1314141932098692, "grad_norm": 0.17329637706279755, "learning_rate": 0.001, "loss": 2.663, "num_input_tokens_seen": 7222067200, "step": 27550 }, { "epoch": 0.13165269446796332, "grad_norm": 0.16942919790744781, "learning_rate": 0.001, "loss": 2.6609, "num_input_tokens_seen": 7235174400, "step": 27600 }, { "epoch": 0.13189119572605745, "grad_norm": 0.19828958809375763, "learning_rate": 0.001, "loss": 2.6625, "num_input_tokens_seen": 7248281600, "step": 27650 }, { "epoch": 0.13212969698415158, "grad_norm": 0.1928141862154007, "learning_rate": 0.001, "loss": 2.6597, "num_input_tokens_seen": 7261388800, "step": 27700 }, { "epoch": 0.13236819824224572, "grad_norm": 0.1870756894350052, "learning_rate": 0.001, "loss": 2.6718, "num_input_tokens_seen": 7274496000, "step": 27750 }, { "epoch": 0.13260669950033988, "grad_norm": 0.1786762923002243, "learning_rate": 0.001, "loss": 2.6631, "num_input_tokens_seen": 7287603200, "step": 27800 }, { "epoch": 0.132845200758434, "grad_norm": 0.1710624396800995, "learning_rate": 0.001, "loss": 2.6717, "num_input_tokens_seen": 7300710400, "step": 27850 }, { "epoch": 0.13308370201652814, "grad_norm": 0.1805214285850525, "learning_rate": 0.001, "loss": 2.6669, "num_input_tokens_seen": 7313817600, "step": 27900 }, { "epoch": 0.13332220327462227, "grad_norm": 0.18169906735420227, "learning_rate": 0.001, "loss": 2.6659, "num_input_tokens_seen": 7326924800, "step": 27950 }, { "epoch": 0.1335607045327164, "grad_norm": 0.16959500312805176, "learning_rate": 0.001, "loss": 2.6623, "num_input_tokens_seen": 7340032000, "step": 28000 }, { "epoch": 0.1335607045327164, "eval_loss": 2.548675060272217, "eval_runtime": 50.3022, "eval_samples_per_second": 99.399, "eval_steps_per_second": 24.85, "num_input_tokens_seen": 7340032000, "step": 28000 }, { "epoch": 0.13379920579081056, "grad_norm": 0.19409704208374023, "learning_rate": 0.001, "loss": 2.6776, "num_input_tokens_seen": 7353139200, "step": 28050 }, { "epoch": 0.1340377070489047, "grad_norm": 0.1712968647480011, "learning_rate": 0.001, "loss": 2.6679, "num_input_tokens_seen": 7366246400, "step": 28100 }, { "epoch": 0.13427620830699882, "grad_norm": 0.20586130023002625, "learning_rate": 0.001, "loss": 2.6633, "num_input_tokens_seen": 7379353600, "step": 28150 }, { "epoch": 0.13451470956509295, "grad_norm": 0.1776891052722931, "learning_rate": 0.001, "loss": 2.6683, "num_input_tokens_seen": 7392460800, "step": 28200 }, { "epoch": 0.1347532108231871, "grad_norm": 0.19293451309204102, "learning_rate": 0.001, "loss": 2.6645, "num_input_tokens_seen": 7405568000, "step": 28250 }, { "epoch": 0.13499171208128122, "grad_norm": 0.17754724621772766, "learning_rate": 0.001, "loss": 2.6685, "num_input_tokens_seen": 7418675200, "step": 28300 }, { "epoch": 0.13523021333937538, "grad_norm": 0.17739038169384003, "learning_rate": 0.001, "loss": 2.6607, "num_input_tokens_seen": 7431782400, "step": 28350 }, { "epoch": 0.1354687145974695, "grad_norm": 0.175009086728096, "learning_rate": 0.001, "loss": 2.6679, "num_input_tokens_seen": 7444889600, "step": 28400 }, { "epoch": 0.13570721585556364, "grad_norm": 0.2229124754667282, "learning_rate": 0.001, "loss": 2.6687, "num_input_tokens_seen": 7457996800, "step": 28450 }, { "epoch": 0.13594571711365777, "grad_norm": 0.1791590005159378, "learning_rate": 0.001, "loss": 2.6741, "num_input_tokens_seen": 7471104000, "step": 28500 }, { "epoch": 0.13594571711365777, "eval_loss": 2.5456056594848633, "eval_runtime": 50.6342, "eval_samples_per_second": 98.747, "eval_steps_per_second": 24.687, "num_input_tokens_seen": 7471104000, "step": 28500 }, { "epoch": 0.1361842183717519, "grad_norm": 0.18920041620731354, "learning_rate": 0.001, "loss": 2.6612, "num_input_tokens_seen": 7484211200, "step": 28550 }, { "epoch": 0.13642271962984603, "grad_norm": 0.19247522950172424, "learning_rate": 0.001, "loss": 2.6597, "num_input_tokens_seen": 7497318400, "step": 28600 }, { "epoch": 0.1366612208879402, "grad_norm": 0.22499197721481323, "learning_rate": 0.001, "loss": 2.6583, "num_input_tokens_seen": 7510425600, "step": 28650 }, { "epoch": 0.13689972214603432, "grad_norm": 0.18946559727191925, "learning_rate": 0.001, "loss": 2.6612, "num_input_tokens_seen": 7523532800, "step": 28700 }, { "epoch": 0.13713822340412846, "grad_norm": 0.19621454179286957, "learning_rate": 0.001, "loss": 2.6425, "num_input_tokens_seen": 7536640000, "step": 28750 }, { "epoch": 0.1373767246622226, "grad_norm": 0.21594376862049103, "learning_rate": 0.001, "loss": 2.6564, "num_input_tokens_seen": 7549747200, "step": 28800 }, { "epoch": 0.13761522592031672, "grad_norm": 0.18186470866203308, "learning_rate": 0.001, "loss": 2.6728, "num_input_tokens_seen": 7562854400, "step": 28850 }, { "epoch": 0.13785372717841088, "grad_norm": 0.19369743764400482, "learning_rate": 0.001, "loss": 2.6585, "num_input_tokens_seen": 7575961600, "step": 28900 }, { "epoch": 0.138092228436505, "grad_norm": 0.1897999793291092, "learning_rate": 0.001, "loss": 2.6564, "num_input_tokens_seen": 7589068800, "step": 28950 }, { "epoch": 0.13833072969459914, "grad_norm": 0.18076784908771515, "learning_rate": 0.001, "loss": 2.6453, "num_input_tokens_seen": 7602176000, "step": 29000 }, { "epoch": 0.13833072969459914, "eval_loss": 2.54413104057312, "eval_runtime": 50.9152, "eval_samples_per_second": 98.202, "eval_steps_per_second": 24.551, "num_input_tokens_seen": 7602176000, "step": 29000 }, { "epoch": 0.13856923095269327, "grad_norm": 0.18520566821098328, "learning_rate": 0.001, "loss": 2.6644, "num_input_tokens_seen": 7615283200, "step": 29050 }, { "epoch": 0.1388077322107874, "grad_norm": 0.22739861905574799, "learning_rate": 0.001, "loss": 2.6597, "num_input_tokens_seen": 7628390400, "step": 29100 }, { "epoch": 0.13904623346888154, "grad_norm": 0.18451730906963348, "learning_rate": 0.001, "loss": 2.6432, "num_input_tokens_seen": 7641497600, "step": 29150 }, { "epoch": 0.1392847347269757, "grad_norm": 0.1865098923444748, "learning_rate": 0.001, "loss": 2.6651, "num_input_tokens_seen": 7654604800, "step": 29200 }, { "epoch": 0.13952323598506983, "grad_norm": 0.18676789104938507, "learning_rate": 0.001, "loss": 2.6597, "num_input_tokens_seen": 7667712000, "step": 29250 }, { "epoch": 0.13976173724316396, "grad_norm": 0.17463742196559906, "learning_rate": 0.001, "loss": 2.6571, "num_input_tokens_seen": 7680819200, "step": 29300 }, { "epoch": 0.1400002385012581, "grad_norm": 0.21621429920196533, "learning_rate": 0.001, "loss": 2.6342, "num_input_tokens_seen": 7693926400, "step": 29350 }, { "epoch": 0.14023873975935222, "grad_norm": 0.17493990063667297, "learning_rate": 0.001, "loss": 2.6536, "num_input_tokens_seen": 7707033600, "step": 29400 }, { "epoch": 0.14047724101744638, "grad_norm": 0.17649762332439423, "learning_rate": 0.001, "loss": 2.6526, "num_input_tokens_seen": 7720140800, "step": 29450 }, { "epoch": 0.1407157422755405, "grad_norm": 0.18224874138832092, "learning_rate": 0.001, "loss": 2.6635, "num_input_tokens_seen": 7733248000, "step": 29500 }, { "epoch": 0.1407157422755405, "eval_loss": 2.5433554649353027, "eval_runtime": 51.2973, "eval_samples_per_second": 97.471, "eval_steps_per_second": 24.368, "num_input_tokens_seen": 7733248000, "step": 29500 }, { "epoch": 0.14095424353363464, "grad_norm": 0.21109874546527863, "learning_rate": 0.001, "loss": 2.6788, "num_input_tokens_seen": 7746355200, "step": 29550 }, { "epoch": 0.14119274479172877, "grad_norm": 0.17663723230361938, "learning_rate": 0.001, "loss": 2.6578, "num_input_tokens_seen": 7759462400, "step": 29600 }, { "epoch": 0.1414312460498229, "grad_norm": 0.18385198712348938, "learning_rate": 0.001, "loss": 2.676, "num_input_tokens_seen": 7772569600, "step": 29650 }, { "epoch": 0.14166974730791704, "grad_norm": 0.1829567402601242, "learning_rate": 0.001, "loss": 2.6586, "num_input_tokens_seen": 7785676800, "step": 29700 }, { "epoch": 0.1419082485660112, "grad_norm": 0.1907297968864441, "learning_rate": 0.001, "loss": 2.6508, "num_input_tokens_seen": 7798784000, "step": 29750 }, { "epoch": 0.14214674982410533, "grad_norm": 0.2106500118970871, "learning_rate": 0.001, "loss": 2.6578, "num_input_tokens_seen": 7811891200, "step": 29800 }, { "epoch": 0.14238525108219946, "grad_norm": 0.18974357843399048, "learning_rate": 0.001, "loss": 2.6506, "num_input_tokens_seen": 7824998400, "step": 29850 }, { "epoch": 0.1426237523402936, "grad_norm": 0.18876343965530396, "learning_rate": 0.001, "loss": 2.6663, "num_input_tokens_seen": 7838105600, "step": 29900 }, { "epoch": 0.14286225359838772, "grad_norm": 0.17305608093738556, "learning_rate": 0.001, "loss": 2.657, "num_input_tokens_seen": 7851212800, "step": 29950 }, { "epoch": 0.14310075485648185, "grad_norm": 0.18900860846042633, "learning_rate": 0.001, "loss": 2.6502, "num_input_tokens_seen": 7864320000, "step": 30000 }, { "epoch": 0.14310075485648185, "eval_loss": 2.540076971054077, "eval_runtime": 50.1464, "eval_samples_per_second": 99.708, "eval_steps_per_second": 24.927, "num_input_tokens_seen": 7864320000, "step": 30000 }, { "epoch": 0.143339256114576, "grad_norm": 0.16919030249118805, "learning_rate": 0.001, "loss": 2.6729, "num_input_tokens_seen": 7877427200, "step": 30050 }, { "epoch": 0.14357775737267015, "grad_norm": 0.17828898131847382, "learning_rate": 0.001, "loss": 2.647, "num_input_tokens_seen": 7890534400, "step": 30100 }, { "epoch": 0.14381625863076428, "grad_norm": 0.1790715903043747, "learning_rate": 0.001, "loss": 2.6639, "num_input_tokens_seen": 7903641600, "step": 30150 }, { "epoch": 0.1440547598888584, "grad_norm": 0.18818187713623047, "learning_rate": 0.001, "loss": 2.6485, "num_input_tokens_seen": 7916748800, "step": 30200 }, { "epoch": 0.14429326114695254, "grad_norm": 0.2171814739704132, "learning_rate": 0.001, "loss": 2.6577, "num_input_tokens_seen": 7929856000, "step": 30250 }, { "epoch": 0.1445317624050467, "grad_norm": 0.1844399869441986, "learning_rate": 0.001, "loss": 2.6473, "num_input_tokens_seen": 7942963200, "step": 30300 }, { "epoch": 0.14477026366314083, "grad_norm": 0.19607801735401154, "learning_rate": 0.001, "loss": 2.6576, "num_input_tokens_seen": 7956070400, "step": 30350 }, { "epoch": 0.14500876492123496, "grad_norm": 0.1967996209859848, "learning_rate": 0.001, "loss": 2.64, "num_input_tokens_seen": 7969177600, "step": 30400 }, { "epoch": 0.1452472661793291, "grad_norm": 0.2087596207857132, "learning_rate": 0.001, "loss": 2.6485, "num_input_tokens_seen": 7982284800, "step": 30450 }, { "epoch": 0.14548576743742322, "grad_norm": 0.1938595473766327, "learning_rate": 0.001, "loss": 2.654, "num_input_tokens_seen": 7995392000, "step": 30500 }, { "epoch": 0.14548576743742322, "eval_loss": 2.537402391433716, "eval_runtime": 50.7304, "eval_samples_per_second": 98.56, "eval_steps_per_second": 24.64, "num_input_tokens_seen": 7995392000, "step": 30500 }, { "epoch": 0.14572426869551736, "grad_norm": 0.18282300233840942, "learning_rate": 0.001, "loss": 2.6592, "num_input_tokens_seen": 8008499200, "step": 30550 }, { "epoch": 0.14596276995361152, "grad_norm": 0.1829262375831604, "learning_rate": 0.001, "loss": 2.6618, "num_input_tokens_seen": 8021606400, "step": 30600 }, { "epoch": 0.14620127121170565, "grad_norm": 0.19001947343349457, "learning_rate": 0.001, "loss": 2.649, "num_input_tokens_seen": 8034713600, "step": 30650 }, { "epoch": 0.14643977246979978, "grad_norm": 0.19943153858184814, "learning_rate": 0.001, "loss": 2.6578, "num_input_tokens_seen": 8047820800, "step": 30700 }, { "epoch": 0.1466782737278939, "grad_norm": 0.18482360243797302, "learning_rate": 0.001, "loss": 2.6616, "num_input_tokens_seen": 8060928000, "step": 30750 }, { "epoch": 0.14691677498598804, "grad_norm": 0.20858009159564972, "learning_rate": 0.001, "loss": 2.6684, "num_input_tokens_seen": 8074035200, "step": 30800 }, { "epoch": 0.1471552762440822, "grad_norm": 0.2759605646133423, "learning_rate": 0.001, "loss": 2.713, "num_input_tokens_seen": 8087142400, "step": 30850 }, { "epoch": 0.14739377750217633, "grad_norm": 0.22366145253181458, "learning_rate": 0.001, "loss": 2.7065, "num_input_tokens_seen": 8100249600, "step": 30900 }, { "epoch": 0.14763227876027046, "grad_norm": 0.22143268585205078, "learning_rate": 0.001, "loss": 2.672, "num_input_tokens_seen": 8113356800, "step": 30950 }, { "epoch": 0.1478707800183646, "grad_norm": 0.25140002369880676, "learning_rate": 0.001, "loss": 2.6658, "num_input_tokens_seen": 8126464000, "step": 31000 }, { "epoch": 0.1478707800183646, "eval_loss": 2.5451457500457764, "eval_runtime": 50.0622, "eval_samples_per_second": 99.876, "eval_steps_per_second": 24.969, "num_input_tokens_seen": 8126464000, "step": 31000 }, { "epoch": 0.14810928127645873, "grad_norm": 0.20207786560058594, "learning_rate": 0.001, "loss": 2.6493, "num_input_tokens_seen": 8139571200, "step": 31050 }, { "epoch": 0.14834778253455286, "grad_norm": 0.20135898888111115, "learning_rate": 0.001, "loss": 2.6555, "num_input_tokens_seen": 8152678400, "step": 31100 }, { "epoch": 0.14858628379264702, "grad_norm": 0.19284267723560333, "learning_rate": 0.001, "loss": 2.6637, "num_input_tokens_seen": 8165785600, "step": 31150 }, { "epoch": 0.14882478505074115, "grad_norm": 0.17214693129062653, "learning_rate": 0.001, "loss": 2.6663, "num_input_tokens_seen": 8178892800, "step": 31200 }, { "epoch": 0.14906328630883528, "grad_norm": 0.19444549083709717, "learning_rate": 0.001, "loss": 2.6541, "num_input_tokens_seen": 8192000000, "step": 31250 }, { "epoch": 0.1493017875669294, "grad_norm": 0.19992901384830475, "learning_rate": 0.001, "loss": 2.6419, "num_input_tokens_seen": 8205107200, "step": 31300 }, { "epoch": 0.14954028882502354, "grad_norm": 0.16732315719127655, "learning_rate": 0.001, "loss": 2.6559, "num_input_tokens_seen": 8218214400, "step": 31350 }, { "epoch": 0.1497787900831177, "grad_norm": 0.4210798442363739, "learning_rate": 0.001, "loss": 2.6478, "num_input_tokens_seen": 8231321600, "step": 31400 }, { "epoch": 0.15001729134121183, "grad_norm": 0.2139436900615692, "learning_rate": 0.001, "loss": 2.6753, "num_input_tokens_seen": 8244428800, "step": 31450 }, { "epoch": 0.15025579259930597, "grad_norm": 0.19131046533584595, "learning_rate": 0.001, "loss": 2.6675, "num_input_tokens_seen": 8257536000, "step": 31500 }, { "epoch": 0.15025579259930597, "eval_loss": 2.5402350425720215, "eval_runtime": 50.477, "eval_samples_per_second": 99.055, "eval_steps_per_second": 24.764, "num_input_tokens_seen": 8257536000, "step": 31500 }, { "epoch": 0.1504942938574001, "grad_norm": 0.20711492002010345, "learning_rate": 0.001, "loss": 2.6654, "num_input_tokens_seen": 8270643200, "step": 31550 }, { "epoch": 0.15073279511549423, "grad_norm": 0.1888076812028885, "learning_rate": 0.001, "loss": 2.6603, "num_input_tokens_seen": 8283750400, "step": 31600 }, { "epoch": 0.15097129637358836, "grad_norm": 0.18534335494041443, "learning_rate": 0.001, "loss": 2.6539, "num_input_tokens_seen": 8296857600, "step": 31650 }, { "epoch": 0.15120979763168252, "grad_norm": 0.2024192214012146, "learning_rate": 0.001, "loss": 2.6514, "num_input_tokens_seen": 8309964800, "step": 31700 }, { "epoch": 0.15144829888977665, "grad_norm": 0.18967773020267487, "learning_rate": 0.001, "loss": 2.6457, "num_input_tokens_seen": 8323072000, "step": 31750 }, { "epoch": 0.15168680014787078, "grad_norm": 0.18823806941509247, "learning_rate": 0.001, "loss": 2.6579, "num_input_tokens_seen": 8336179200, "step": 31800 }, { "epoch": 0.1519253014059649, "grad_norm": 0.20198485255241394, "learning_rate": 0.001, "loss": 2.6623, "num_input_tokens_seen": 8349286400, "step": 31850 }, { "epoch": 0.15216380266405904, "grad_norm": 0.19362477958202362, "learning_rate": 0.001, "loss": 2.6473, "num_input_tokens_seen": 8362393600, "step": 31900 }, { "epoch": 0.15240230392215318, "grad_norm": 0.18454812467098236, "learning_rate": 0.001, "loss": 2.6411, "num_input_tokens_seen": 8375500800, "step": 31950 }, { "epoch": 0.15264080518024734, "grad_norm": 0.1968630850315094, "learning_rate": 0.001, "loss": 2.6405, "num_input_tokens_seen": 8388608000, "step": 32000 }, { "epoch": 0.15264080518024734, "eval_loss": 2.5325138568878174, "eval_runtime": 51.0134, "eval_samples_per_second": 98.013, "eval_steps_per_second": 24.503, "num_input_tokens_seen": 8388608000, "step": 32000 }, { "epoch": 0.15287930643834147, "grad_norm": 0.180119588971138, "learning_rate": 0.001, "loss": 2.6558, "num_input_tokens_seen": 8401715200, "step": 32050 }, { "epoch": 0.1531178076964356, "grad_norm": 0.1952589452266693, "learning_rate": 0.001, "loss": 2.6465, "num_input_tokens_seen": 8414822400, "step": 32100 }, { "epoch": 0.15335630895452973, "grad_norm": 0.1845589131116867, "learning_rate": 0.001, "loss": 2.6297, "num_input_tokens_seen": 8427929600, "step": 32150 }, { "epoch": 0.15359481021262386, "grad_norm": 0.20116594433784485, "learning_rate": 0.001, "loss": 2.6422, "num_input_tokens_seen": 8441036800, "step": 32200 }, { "epoch": 0.15383331147071802, "grad_norm": 0.1932612508535385, "learning_rate": 0.001, "loss": 2.6494, "num_input_tokens_seen": 8454144000, "step": 32250 }, { "epoch": 0.15407181272881215, "grad_norm": 0.17934490740299225, "learning_rate": 0.001, "loss": 2.6474, "num_input_tokens_seen": 8467251200, "step": 32300 }, { "epoch": 0.15431031398690628, "grad_norm": 0.19273313879966736, "learning_rate": 0.001, "loss": 2.6447, "num_input_tokens_seen": 8480358400, "step": 32350 }, { "epoch": 0.15454881524500041, "grad_norm": 0.1921055018901825, "learning_rate": 0.001, "loss": 2.665, "num_input_tokens_seen": 8493465600, "step": 32400 }, { "epoch": 0.15478731650309455, "grad_norm": 0.37117844820022583, "learning_rate": 0.001, "loss": 2.6351, "num_input_tokens_seen": 8506572800, "step": 32450 }, { "epoch": 0.15502581776118868, "grad_norm": 0.1884016990661621, "learning_rate": 0.001, "loss": 2.6436, "num_input_tokens_seen": 8519680000, "step": 32500 }, { "epoch": 0.15502581776118868, "eval_loss": 2.5313448905944824, "eval_runtime": 50.7051, "eval_samples_per_second": 98.609, "eval_steps_per_second": 24.652, "num_input_tokens_seen": 8519680000, "step": 32500 }, { "epoch": 0.15526431901928284, "grad_norm": 0.22205407917499542, "learning_rate": 0.001, "loss": 2.6464, "num_input_tokens_seen": 8532787200, "step": 32550 }, { "epoch": 0.15550282027737697, "grad_norm": 0.18515361845493317, "learning_rate": 0.001, "loss": 2.642, "num_input_tokens_seen": 8545894400, "step": 32600 }, { "epoch": 0.1557413215354711, "grad_norm": 0.18903231620788574, "learning_rate": 0.001, "loss": 2.6446, "num_input_tokens_seen": 8559001600, "step": 32650 }, { "epoch": 0.15597982279356523, "grad_norm": 0.1857556253671646, "learning_rate": 0.001, "loss": 2.6561, "num_input_tokens_seen": 8572108800, "step": 32700 }, { "epoch": 0.15621832405165936, "grad_norm": 0.45706707239151, "learning_rate": 0.001, "loss": 2.6487, "num_input_tokens_seen": 8585216000, "step": 32750 }, { "epoch": 0.15645682530975352, "grad_norm": 0.20191136002540588, "learning_rate": 0.001, "loss": 2.6593, "num_input_tokens_seen": 8598323200, "step": 32800 }, { "epoch": 0.15669532656784765, "grad_norm": 0.21191105246543884, "learning_rate": 0.001, "loss": 2.659, "num_input_tokens_seen": 8611430400, "step": 32850 }, { "epoch": 0.15693382782594179, "grad_norm": 0.20596672594547272, "learning_rate": 0.001, "loss": 2.6354, "num_input_tokens_seen": 8624537600, "step": 32900 }, { "epoch": 0.15717232908403592, "grad_norm": 0.2952199876308441, "learning_rate": 0.001, "loss": 2.6501, "num_input_tokens_seen": 8637644800, "step": 32950 }, { "epoch": 0.15741083034213005, "grad_norm": 0.2217044234275818, "learning_rate": 0.001, "loss": 2.6495, "num_input_tokens_seen": 8650752000, "step": 33000 }, { "epoch": 0.15741083034213005, "eval_loss": 2.5319430828094482, "eval_runtime": 50.8413, "eval_samples_per_second": 98.345, "eval_steps_per_second": 24.586, "num_input_tokens_seen": 8650752000, "step": 33000 }, { "epoch": 0.15764933160022418, "grad_norm": 0.2384626269340515, "learning_rate": 0.001, "loss": 2.6503, "num_input_tokens_seen": 8663859200, "step": 33050 }, { "epoch": 0.15788783285831834, "grad_norm": 0.18387843668460846, "learning_rate": 0.001, "loss": 2.6469, "num_input_tokens_seen": 8676966400, "step": 33100 }, { "epoch": 0.15812633411641247, "grad_norm": 0.23530641198158264, "learning_rate": 0.001, "loss": 2.6484, "num_input_tokens_seen": 8690073600, "step": 33150 }, { "epoch": 0.1583648353745066, "grad_norm": 0.2027565985918045, "learning_rate": 0.001, "loss": 2.6564, "num_input_tokens_seen": 8703180800, "step": 33200 }, { "epoch": 0.15860333663260073, "grad_norm": 0.21472220122814178, "learning_rate": 0.001, "loss": 2.6543, "num_input_tokens_seen": 8716288000, "step": 33250 }, { "epoch": 0.15884183789069486, "grad_norm": 0.19012615084648132, "learning_rate": 0.001, "loss": 2.6378, "num_input_tokens_seen": 8729395200, "step": 33300 }, { "epoch": 0.159080339148789, "grad_norm": 0.18018738925457, "learning_rate": 0.001, "loss": 2.6461, "num_input_tokens_seen": 8742502400, "step": 33350 }, { "epoch": 0.15931884040688316, "grad_norm": 0.20139184594154358, "learning_rate": 0.001, "loss": 2.6419, "num_input_tokens_seen": 8755609600, "step": 33400 }, { "epoch": 0.1595573416649773, "grad_norm": 0.20734767615795135, "learning_rate": 0.001, "loss": 2.6299, "num_input_tokens_seen": 8768716800, "step": 33450 }, { "epoch": 0.15979584292307142, "grad_norm": 0.18958640098571777, "learning_rate": 0.001, "loss": 2.6525, "num_input_tokens_seen": 8781824000, "step": 33500 }, { "epoch": 0.15979584292307142, "eval_loss": 2.5301430225372314, "eval_runtime": 64.6928, "eval_samples_per_second": 77.288, "eval_steps_per_second": 19.322, "num_input_tokens_seen": 8781824000, "step": 33500 }, { "epoch": 0.16003434418116555, "grad_norm": 0.20421727001667023, "learning_rate": 0.001, "loss": 2.6445, "num_input_tokens_seen": 8794931200, "step": 33550 }, { "epoch": 0.16027284543925968, "grad_norm": 0.18347379565238953, "learning_rate": 0.001, "loss": 2.6525, "num_input_tokens_seen": 8808038400, "step": 33600 }, { "epoch": 0.16051134669735384, "grad_norm": 0.19450639188289642, "learning_rate": 0.001, "loss": 2.6356, "num_input_tokens_seen": 8821145600, "step": 33650 }, { "epoch": 0.16074984795544797, "grad_norm": 0.17953775823116302, "learning_rate": 0.001, "loss": 2.6424, "num_input_tokens_seen": 8834252800, "step": 33700 }, { "epoch": 0.1609883492135421, "grad_norm": 0.1990649551153183, "learning_rate": 0.001, "loss": 2.6608, "num_input_tokens_seen": 8847360000, "step": 33750 }, { "epoch": 0.16122685047163623, "grad_norm": 0.19343194365501404, "learning_rate": 0.001, "loss": 2.6604, "num_input_tokens_seen": 8860467200, "step": 33800 }, { "epoch": 0.16146535172973037, "grad_norm": 0.19385921955108643, "learning_rate": 0.001, "loss": 2.6354, "num_input_tokens_seen": 8873574400, "step": 33850 }, { "epoch": 0.1617038529878245, "grad_norm": 0.1828273981809616, "learning_rate": 0.001, "loss": 2.6578, "num_input_tokens_seen": 8886681600, "step": 33900 }, { "epoch": 0.16194235424591866, "grad_norm": 0.216063991189003, "learning_rate": 0.001, "loss": 2.6575, "num_input_tokens_seen": 8899788800, "step": 33950 }, { "epoch": 0.1621808555040128, "grad_norm": 0.20358648896217346, "learning_rate": 0.001, "loss": 2.6499, "num_input_tokens_seen": 8912896000, "step": 34000 }, { "epoch": 0.1621808555040128, "eval_loss": 2.5330910682678223, "eval_runtime": 50.7961, "eval_samples_per_second": 98.433, "eval_steps_per_second": 24.608, "num_input_tokens_seen": 8912896000, "step": 34000 }, { "epoch": 0.16241935676210692, "grad_norm": 0.1935052126646042, "learning_rate": 0.001, "loss": 2.6583, "num_input_tokens_seen": 8926003200, "step": 34050 }, { "epoch": 0.16265785802020105, "grad_norm": 0.7825157642364502, "learning_rate": 0.001, "loss": 2.6481, "num_input_tokens_seen": 8939110400, "step": 34100 }, { "epoch": 0.16289635927829518, "grad_norm": 0.23290683329105377, "learning_rate": 0.001, "loss": 2.6925, "num_input_tokens_seen": 8952217600, "step": 34150 }, { "epoch": 0.16313486053638934, "grad_norm": 0.23564130067825317, "learning_rate": 0.001, "loss": 2.6495, "num_input_tokens_seen": 8965324800, "step": 34200 }, { "epoch": 0.16337336179448347, "grad_norm": 0.19592130184173584, "learning_rate": 0.001, "loss": 2.6536, "num_input_tokens_seen": 8978432000, "step": 34250 }, { "epoch": 0.1636118630525776, "grad_norm": 0.23535041511058807, "learning_rate": 0.001, "loss": 2.6608, "num_input_tokens_seen": 8991539200, "step": 34300 }, { "epoch": 0.16385036431067174, "grad_norm": 0.1991938352584839, "learning_rate": 0.001, "loss": 2.6458, "num_input_tokens_seen": 9004641856, "step": 34350 }, { "epoch": 0.16408886556876587, "grad_norm": 0.19363388419151306, "learning_rate": 0.001, "loss": 2.6531, "num_input_tokens_seen": 9017749056, "step": 34400 }, { "epoch": 0.16432736682686, "grad_norm": 0.18500390648841858, "learning_rate": 0.001, "loss": 2.6391, "num_input_tokens_seen": 9030856256, "step": 34450 }, { "epoch": 0.16456586808495416, "grad_norm": 0.2774065434932709, "learning_rate": 0.001, "loss": 2.6619, "num_input_tokens_seen": 9043963456, "step": 34500 }, { "epoch": 0.16456586808495416, "eval_loss": 2.5325100421905518, "eval_runtime": 51.5954, "eval_samples_per_second": 96.908, "eval_steps_per_second": 24.227, "num_input_tokens_seen": 9043963456, "step": 34500 }, { "epoch": 0.1648043693430483, "grad_norm": 0.1957511603832245, "learning_rate": 0.001, "loss": 2.6456, "num_input_tokens_seen": 9057070656, "step": 34550 }, { "epoch": 0.16504287060114242, "grad_norm": 0.20958378911018372, "learning_rate": 0.001, "loss": 2.6452, "num_input_tokens_seen": 9070177856, "step": 34600 }, { "epoch": 0.16528137185923655, "grad_norm": 0.206208735704422, "learning_rate": 0.001, "loss": 2.6548, "num_input_tokens_seen": 9083285056, "step": 34650 }, { "epoch": 0.16551987311733068, "grad_norm": 0.22349481284618378, "learning_rate": 0.001, "loss": 2.6653, "num_input_tokens_seen": 9096392256, "step": 34700 }, { "epoch": 0.16575837437542484, "grad_norm": 0.22599968314170837, "learning_rate": 0.001, "loss": 2.6329, "num_input_tokens_seen": 9109499456, "step": 34750 }, { "epoch": 0.16599687563351898, "grad_norm": 0.19219790399074554, "learning_rate": 0.001, "loss": 2.6404, "num_input_tokens_seen": 9122606656, "step": 34800 }, { "epoch": 0.1662353768916131, "grad_norm": 0.2006351351737976, "learning_rate": 0.001, "loss": 2.6522, "num_input_tokens_seen": 9135713856, "step": 34850 }, { "epoch": 0.16647387814970724, "grad_norm": 0.18393316864967346, "learning_rate": 0.001, "loss": 2.6464, "num_input_tokens_seen": 9148821056, "step": 34900 }, { "epoch": 0.16671237940780137, "grad_norm": 0.19820146262645721, "learning_rate": 0.001, "loss": 2.6402, "num_input_tokens_seen": 9161928256, "step": 34950 }, { "epoch": 0.1669508806658955, "grad_norm": 0.1995670199394226, "learning_rate": 0.001, "loss": 2.652, "num_input_tokens_seen": 9175035456, "step": 35000 }, { "epoch": 0.1669508806658955, "eval_loss": 2.5248045921325684, "eval_runtime": 50.8205, "eval_samples_per_second": 98.386, "eval_steps_per_second": 24.596, "num_input_tokens_seen": 9175035456, "step": 35000 }, { "epoch": 0.16718938192398966, "grad_norm": 0.2099646031856537, "learning_rate": 0.001, "loss": 2.6307, "num_input_tokens_seen": 9188142656, "step": 35050 }, { "epoch": 0.1674278831820838, "grad_norm": 0.18913927674293518, "learning_rate": 0.001, "loss": 2.6368, "num_input_tokens_seen": 9201249856, "step": 35100 }, { "epoch": 0.16766638444017792, "grad_norm": 0.19193056225776672, "learning_rate": 0.001, "loss": 2.6325, "num_input_tokens_seen": 9214357056, "step": 35150 }, { "epoch": 0.16790488569827206, "grad_norm": 0.19911837577819824, "learning_rate": 0.001, "loss": 2.6543, "num_input_tokens_seen": 9227464256, "step": 35200 }, { "epoch": 0.1681433869563662, "grad_norm": 0.1985558718442917, "learning_rate": 0.001, "loss": 2.6518, "num_input_tokens_seen": 9240571456, "step": 35250 }, { "epoch": 0.16838188821446032, "grad_norm": 0.2079145759344101, "learning_rate": 0.001, "loss": 2.646, "num_input_tokens_seen": 9253678656, "step": 35300 }, { "epoch": 0.16862038947255448, "grad_norm": 0.18524424731731415, "learning_rate": 0.001, "loss": 2.6378, "num_input_tokens_seen": 9266785856, "step": 35350 }, { "epoch": 0.1688588907306486, "grad_norm": 0.19140370190143585, "learning_rate": 0.001, "loss": 2.6488, "num_input_tokens_seen": 9279893056, "step": 35400 }, { "epoch": 0.16909739198874274, "grad_norm": 0.18006138503551483, "learning_rate": 0.001, "loss": 2.6632, "num_input_tokens_seen": 9293000256, "step": 35450 }, { "epoch": 0.16933589324683687, "grad_norm": 0.18754282593727112, "learning_rate": 0.001, "loss": 2.6436, "num_input_tokens_seen": 9306107456, "step": 35500 }, { "epoch": 0.16933589324683687, "eval_loss": 2.5230932235717773, "eval_runtime": 50.9895, "eval_samples_per_second": 98.059, "eval_steps_per_second": 24.515, "num_input_tokens_seen": 9306107456, "step": 35500 }, { "epoch": 0.169574394504931, "grad_norm": 0.18708109855651855, "learning_rate": 0.001, "loss": 2.6509, "num_input_tokens_seen": 9319214656, "step": 35550 }, { "epoch": 0.16981289576302516, "grad_norm": 0.2019611895084381, "learning_rate": 0.001, "loss": 2.6333, "num_input_tokens_seen": 9332321856, "step": 35600 }, { "epoch": 0.1700513970211193, "grad_norm": 0.22504755854606628, "learning_rate": 0.001, "loss": 2.6359, "num_input_tokens_seen": 9345429056, "step": 35650 }, { "epoch": 0.17028989827921343, "grad_norm": 0.1972053200006485, "learning_rate": 0.001, "loss": 2.6362, "num_input_tokens_seen": 9358536256, "step": 35700 }, { "epoch": 0.17052839953730756, "grad_norm": 0.21156789362430573, "learning_rate": 0.001, "loss": 2.637, "num_input_tokens_seen": 9371643456, "step": 35750 }, { "epoch": 0.1707669007954017, "grad_norm": 0.2680750787258148, "learning_rate": 0.001, "loss": 2.6332, "num_input_tokens_seen": 9384750656, "step": 35800 }, { "epoch": 0.17100540205349582, "grad_norm": 0.24413707852363586, "learning_rate": 0.001, "loss": 2.6366, "num_input_tokens_seen": 9397857856, "step": 35850 }, { "epoch": 0.17124390331158998, "grad_norm": 0.19973772764205933, "learning_rate": 0.001, "loss": 2.6381, "num_input_tokens_seen": 9410965056, "step": 35900 }, { "epoch": 0.1714824045696841, "grad_norm": 0.20807349681854248, "learning_rate": 0.001, "loss": 2.6416, "num_input_tokens_seen": 9424072256, "step": 35950 }, { "epoch": 0.17172090582777824, "grad_norm": 0.20126542448997498, "learning_rate": 0.001, "loss": 2.6377, "num_input_tokens_seen": 9437179456, "step": 36000 }, { "epoch": 0.17172090582777824, "eval_loss": 2.52500581741333, "eval_runtime": 51.4353, "eval_samples_per_second": 97.21, "eval_steps_per_second": 24.302, "num_input_tokens_seen": 9437179456, "step": 36000 }, { "epoch": 0.17195940708587237, "grad_norm": 0.19696597754955292, "learning_rate": 0.001, "loss": 2.6521, "num_input_tokens_seen": 9450286656, "step": 36050 }, { "epoch": 0.1721979083439665, "grad_norm": 0.18839424848556519, "learning_rate": 0.001, "loss": 2.6484, "num_input_tokens_seen": 9463393856, "step": 36100 }, { "epoch": 0.17243640960206066, "grad_norm": 0.33748558163642883, "learning_rate": 0.001, "loss": 2.6496, "num_input_tokens_seen": 9476501056, "step": 36150 }, { "epoch": 0.1726749108601548, "grad_norm": 0.19529207050800323, "learning_rate": 0.001, "loss": 2.6484, "num_input_tokens_seen": 9489608256, "step": 36200 }, { "epoch": 0.17291341211824893, "grad_norm": 0.21542242169380188, "learning_rate": 0.001, "loss": 2.6572, "num_input_tokens_seen": 9502715456, "step": 36250 }, { "epoch": 0.17315191337634306, "grad_norm": 0.37017494440078735, "learning_rate": 0.001, "loss": 2.6517, "num_input_tokens_seen": 9515822656, "step": 36300 }, { "epoch": 0.1733904146344372, "grad_norm": 0.27284151315689087, "learning_rate": 0.001, "loss": 2.66, "num_input_tokens_seen": 9528929856, "step": 36350 }, { "epoch": 0.17362891589253132, "grad_norm": 0.4666242003440857, "learning_rate": 0.001, "loss": 2.6514, "num_input_tokens_seen": 9542037056, "step": 36400 }, { "epoch": 0.17386741715062548, "grad_norm": 0.2031467854976654, "learning_rate": 0.001, "loss": 2.6577, "num_input_tokens_seen": 9555144256, "step": 36450 }, { "epoch": 0.1741059184087196, "grad_norm": 0.2086576223373413, "learning_rate": 0.001, "loss": 2.6372, "num_input_tokens_seen": 9568251456, "step": 36500 }, { "epoch": 0.1741059184087196, "eval_loss": 2.5252223014831543, "eval_runtime": 51.1282, "eval_samples_per_second": 97.793, "eval_steps_per_second": 24.448, "num_input_tokens_seen": 9568251456, "step": 36500 }, { "epoch": 0.17434441966681374, "grad_norm": 0.19739161431789398, "learning_rate": 0.001, "loss": 2.6184, "num_input_tokens_seen": 9581358656, "step": 36550 }, { "epoch": 0.17458292092490788, "grad_norm": 0.22384846210479736, "learning_rate": 0.001, "loss": 2.6504, "num_input_tokens_seen": 9594465856, "step": 36600 }, { "epoch": 0.174821422183002, "grad_norm": 0.2055511772632599, "learning_rate": 0.001, "loss": 2.6333, "num_input_tokens_seen": 9607573056, "step": 36650 }, { "epoch": 0.17505992344109614, "grad_norm": 0.18193551898002625, "learning_rate": 0.001, "loss": 2.6518, "num_input_tokens_seen": 9620680256, "step": 36700 }, { "epoch": 0.1752984246991903, "grad_norm": 0.1968860775232315, "learning_rate": 0.001, "loss": 2.6211, "num_input_tokens_seen": 9633787456, "step": 36750 }, { "epoch": 0.17553692595728443, "grad_norm": 0.20429988205432892, "learning_rate": 0.001, "loss": 2.6269, "num_input_tokens_seen": 9646894656, "step": 36800 }, { "epoch": 0.17577542721537856, "grad_norm": 0.18364110589027405, "learning_rate": 0.001, "loss": 2.6337, "num_input_tokens_seen": 9660001856, "step": 36850 }, { "epoch": 0.1760139284734727, "grad_norm": 0.2051621973514557, "learning_rate": 0.001, "loss": 2.6297, "num_input_tokens_seen": 9673109056, "step": 36900 }, { "epoch": 0.17625242973156682, "grad_norm": 0.25841349363327026, "learning_rate": 0.001, "loss": 2.6678, "num_input_tokens_seen": 9686216256, "step": 36950 }, { "epoch": 0.17649093098966098, "grad_norm": 0.198688805103302, "learning_rate": 0.001, "loss": 2.6521, "num_input_tokens_seen": 9699323456, "step": 37000 }, { "epoch": 0.17649093098966098, "eval_loss": 2.5248863697052, "eval_runtime": 51.363, "eval_samples_per_second": 97.346, "eval_steps_per_second": 24.337, "num_input_tokens_seen": 9699323456, "step": 37000 }, { "epoch": 0.1767294322477551, "grad_norm": 0.2030065804719925, "learning_rate": 0.001, "loss": 2.6481, "num_input_tokens_seen": 9712430656, "step": 37050 }, { "epoch": 0.17696793350584925, "grad_norm": 0.20191729068756104, "learning_rate": 0.001, "loss": 2.6332, "num_input_tokens_seen": 9725537856, "step": 37100 }, { "epoch": 0.17720643476394338, "grad_norm": 0.19462484121322632, "learning_rate": 0.001, "loss": 2.6444, "num_input_tokens_seen": 9738645056, "step": 37150 }, { "epoch": 0.1774449360220375, "grad_norm": 0.27893325686454773, "learning_rate": 0.001, "loss": 2.6309, "num_input_tokens_seen": 9751752256, "step": 37200 }, { "epoch": 0.17768343728013164, "grad_norm": 0.20646531879901886, "learning_rate": 0.001, "loss": 2.646, "num_input_tokens_seen": 9764859456, "step": 37250 }, { "epoch": 0.1779219385382258, "grad_norm": 0.20815566182136536, "learning_rate": 0.001, "loss": 2.6374, "num_input_tokens_seen": 9777966656, "step": 37300 }, { "epoch": 0.17816043979631993, "grad_norm": 0.2194615602493286, "learning_rate": 0.001, "loss": 2.6313, "num_input_tokens_seen": 9791073856, "step": 37350 }, { "epoch": 0.17839894105441406, "grad_norm": 0.23223313689231873, "learning_rate": 0.001, "loss": 2.6435, "num_input_tokens_seen": 9804181056, "step": 37400 }, { "epoch": 0.1786374423125082, "grad_norm": 0.1731143593788147, "learning_rate": 0.001, "loss": 2.6397, "num_input_tokens_seen": 9817288256, "step": 37450 }, { "epoch": 0.17887594357060232, "grad_norm": 0.1929951161146164, "learning_rate": 0.001, "loss": 2.6406, "num_input_tokens_seen": 9830395456, "step": 37500 }, { "epoch": 0.17887594357060232, "eval_loss": 2.518928050994873, "eval_runtime": 51.8733, "eval_samples_per_second": 96.389, "eval_steps_per_second": 24.097, "num_input_tokens_seen": 9830395456, "step": 37500 }, { "epoch": 0.17911444482869648, "grad_norm": 0.19979524612426758, "learning_rate": 0.001, "loss": 2.6363, "num_input_tokens_seen": 9843502656, "step": 37550 }, { "epoch": 0.17935294608679062, "grad_norm": 0.17963503301143646, "learning_rate": 0.001, "loss": 2.6423, "num_input_tokens_seen": 9856609856, "step": 37600 }, { "epoch": 0.17959144734488475, "grad_norm": 0.18216437101364136, "learning_rate": 0.001, "loss": 2.6351, "num_input_tokens_seen": 9869717056, "step": 37650 }, { "epoch": 0.17982994860297888, "grad_norm": 0.16782627999782562, "learning_rate": 0.001, "loss": 2.623, "num_input_tokens_seen": 9882824256, "step": 37700 }, { "epoch": 0.180068449861073, "grad_norm": 0.21884289383888245, "learning_rate": 0.001, "loss": 2.6418, "num_input_tokens_seen": 9895931456, "step": 37750 }, { "epoch": 0.18030695111916714, "grad_norm": 0.18940453231334686, "learning_rate": 0.001, "loss": 2.6371, "num_input_tokens_seen": 9909038656, "step": 37800 }, { "epoch": 0.1805454523772613, "grad_norm": 0.2075282484292984, "learning_rate": 0.001, "loss": 2.6347, "num_input_tokens_seen": 9922145856, "step": 37850 }, { "epoch": 0.18078395363535543, "grad_norm": 0.18504877388477325, "learning_rate": 0.001, "loss": 2.6391, "num_input_tokens_seen": 9935253056, "step": 37900 }, { "epoch": 0.18102245489344956, "grad_norm": 0.17926527559757233, "learning_rate": 0.001, "loss": 2.6358, "num_input_tokens_seen": 9948360256, "step": 37950 }, { "epoch": 0.1812609561515437, "grad_norm": 0.20022514462471008, "learning_rate": 0.001, "loss": 2.6369, "num_input_tokens_seen": 9961467456, "step": 38000 }, { "epoch": 0.1812609561515437, "eval_loss": 2.5171313285827637, "eval_runtime": 51.617, "eval_samples_per_second": 96.867, "eval_steps_per_second": 24.217, "num_input_tokens_seen": 9961467456, "step": 38000 }, { "epoch": 0.18149945740963783, "grad_norm": 0.19376301765441895, "learning_rate": 0.001, "loss": 2.6274, "num_input_tokens_seen": 9974574656, "step": 38050 }, { "epoch": 0.18173795866773199, "grad_norm": 0.2077150195837021, "learning_rate": 0.001, "loss": 2.6303, "num_input_tokens_seen": 9987681856, "step": 38100 }, { "epoch": 0.18197645992582612, "grad_norm": 0.19407787919044495, "learning_rate": 0.001, "loss": 2.6246, "num_input_tokens_seen": 10000789056, "step": 38150 }, { "epoch": 0.18221496118392025, "grad_norm": 0.20558005571365356, "learning_rate": 0.001, "loss": 2.6291, "num_input_tokens_seen": 10013896256, "step": 38200 }, { "epoch": 0.18245346244201438, "grad_norm": 0.22928735613822937, "learning_rate": 0.001, "loss": 2.6336, "num_input_tokens_seen": 10027003456, "step": 38250 }, { "epoch": 0.1826919637001085, "grad_norm": 0.23481298983097076, "learning_rate": 0.001, "loss": 2.6412, "num_input_tokens_seen": 10040110656, "step": 38300 }, { "epoch": 0.18293046495820264, "grad_norm": 0.19808940589427948, "learning_rate": 0.001, "loss": 2.6395, "num_input_tokens_seen": 10053217856, "step": 38350 }, { "epoch": 0.1831689662162968, "grad_norm": 0.20152992010116577, "learning_rate": 0.001, "loss": 2.6224, "num_input_tokens_seen": 10066325056, "step": 38400 }, { "epoch": 0.18340746747439093, "grad_norm": 0.18065959215164185, "learning_rate": 0.001, "loss": 2.626, "num_input_tokens_seen": 10079432256, "step": 38450 }, { "epoch": 0.18364596873248507, "grad_norm": 0.20382963120937347, "learning_rate": 0.001, "loss": 2.6382, "num_input_tokens_seen": 10092539456, "step": 38500 }, { "epoch": 0.18364596873248507, "eval_loss": 2.5152854919433594, "eval_runtime": 51.2566, "eval_samples_per_second": 97.548, "eval_steps_per_second": 24.387, "num_input_tokens_seen": 10092539456, "step": 38500 }, { "epoch": 0.1838844699905792, "grad_norm": 0.17728358507156372, "learning_rate": 0.001, "loss": 2.6293, "num_input_tokens_seen": 10105646656, "step": 38550 }, { "epoch": 0.18412297124867333, "grad_norm": 0.20164869725704193, "learning_rate": 0.001, "loss": 2.6372, "num_input_tokens_seen": 10118753856, "step": 38600 }, { "epoch": 0.18436147250676746, "grad_norm": 0.20125731825828552, "learning_rate": 0.001, "loss": 2.6326, "num_input_tokens_seen": 10131861056, "step": 38650 }, { "epoch": 0.18459997376486162, "grad_norm": 0.21193954348564148, "learning_rate": 0.001, "loss": 2.6249, "num_input_tokens_seen": 10144968256, "step": 38700 }, { "epoch": 0.18483847502295575, "grad_norm": 0.1925983726978302, "learning_rate": 0.001, "loss": 2.6424, "num_input_tokens_seen": 10158075456, "step": 38750 }, { "epoch": 0.18507697628104988, "grad_norm": 0.19814860820770264, "learning_rate": 0.001, "loss": 2.6431, "num_input_tokens_seen": 10171182656, "step": 38800 }, { "epoch": 0.185315477539144, "grad_norm": 0.1909031718969345, "learning_rate": 0.001, "loss": 2.6068, "num_input_tokens_seen": 10184289856, "step": 38850 }, { "epoch": 0.18555397879723814, "grad_norm": 0.20779775083065033, "learning_rate": 0.001, "loss": 2.625, "num_input_tokens_seen": 10197397056, "step": 38900 }, { "epoch": 0.1857924800553323, "grad_norm": 0.1768522411584854, "learning_rate": 0.001, "loss": 2.6112, "num_input_tokens_seen": 10210504256, "step": 38950 }, { "epoch": 0.18603098131342644, "grad_norm": 0.20275786519050598, "learning_rate": 0.001, "loss": 2.6284, "num_input_tokens_seen": 10223611456, "step": 39000 }, { "epoch": 0.18603098131342644, "eval_loss": 2.5149083137512207, "eval_runtime": 51.6703, "eval_samples_per_second": 96.767, "eval_steps_per_second": 24.192, "num_input_tokens_seen": 10223611456, "step": 39000 }, { "epoch": 0.18626948257152057, "grad_norm": 0.19634057581424713, "learning_rate": 0.001, "loss": 2.6342, "num_input_tokens_seen": 10236718656, "step": 39050 }, { "epoch": 0.1865079838296147, "grad_norm": 0.19488537311553955, "learning_rate": 0.001, "loss": 2.6305, "num_input_tokens_seen": 10249825856, "step": 39100 }, { "epoch": 0.18674648508770883, "grad_norm": 0.2082369476556778, "learning_rate": 0.001, "loss": 2.6067, "num_input_tokens_seen": 10262933056, "step": 39150 }, { "epoch": 0.18698498634580296, "grad_norm": 0.21019776165485382, "learning_rate": 0.001, "loss": 2.628, "num_input_tokens_seen": 10276040256, "step": 39200 }, { "epoch": 0.18722348760389712, "grad_norm": 0.19929739832878113, "learning_rate": 0.001, "loss": 2.6256, "num_input_tokens_seen": 10289147456, "step": 39250 }, { "epoch": 0.18746198886199125, "grad_norm": 0.204230397939682, "learning_rate": 0.001, "loss": 2.6113, "num_input_tokens_seen": 10302254656, "step": 39300 }, { "epoch": 0.18770049012008538, "grad_norm": 0.2217213660478592, "learning_rate": 0.001, "loss": 2.6253, "num_input_tokens_seen": 10315361856, "step": 39350 }, { "epoch": 0.18793899137817952, "grad_norm": 0.19329366087913513, "learning_rate": 0.001, "loss": 2.6317, "num_input_tokens_seen": 10328469056, "step": 39400 }, { "epoch": 0.18817749263627365, "grad_norm": 0.18244336545467377, "learning_rate": 0.001, "loss": 2.6476, "num_input_tokens_seen": 10341576256, "step": 39450 }, { "epoch": 0.1884159938943678, "grad_norm": 0.1864692121744156, "learning_rate": 0.001, "loss": 2.642, "num_input_tokens_seen": 10354683456, "step": 39500 }, { "epoch": 0.1884159938943678, "eval_loss": 2.514141321182251, "eval_runtime": 51.1111, "eval_samples_per_second": 97.826, "eval_steps_per_second": 24.457, "num_input_tokens_seen": 10354683456, "step": 39500 }, { "epoch": 0.18865449515246194, "grad_norm": 0.25003623962402344, "learning_rate": 0.001, "loss": 2.6299, "num_input_tokens_seen": 10367790656, "step": 39550 }, { "epoch": 0.18889299641055607, "grad_norm": 0.19642098248004913, "learning_rate": 0.001, "loss": 2.6412, "num_input_tokens_seen": 10380897856, "step": 39600 }, { "epoch": 0.1891314976686502, "grad_norm": 0.21947956085205078, "learning_rate": 0.001, "loss": 2.6211, "num_input_tokens_seen": 10394005056, "step": 39650 }, { "epoch": 0.18936999892674433, "grad_norm": 0.19838476181030273, "learning_rate": 0.001, "loss": 2.6451, "num_input_tokens_seen": 10407112256, "step": 39700 }, { "epoch": 0.18960850018483846, "grad_norm": 0.21131113171577454, "learning_rate": 0.001, "loss": 2.6375, "num_input_tokens_seen": 10420219456, "step": 39750 }, { "epoch": 0.18984700144293262, "grad_norm": 0.17576864361763, "learning_rate": 0.001, "loss": 2.6325, "num_input_tokens_seen": 10433326656, "step": 39800 }, { "epoch": 0.19008550270102675, "grad_norm": 0.2113037258386612, "learning_rate": 0.001, "loss": 2.6254, "num_input_tokens_seen": 10446433856, "step": 39850 }, { "epoch": 0.19032400395912089, "grad_norm": 0.1972583681344986, "learning_rate": 0.001, "loss": 2.6277, "num_input_tokens_seen": 10459541056, "step": 39900 }, { "epoch": 0.19056250521721502, "grad_norm": 0.43353378772735596, "learning_rate": 0.001, "loss": 2.6295, "num_input_tokens_seen": 10472648256, "step": 39950 }, { "epoch": 0.19080100647530915, "grad_norm": 0.22195081412792206, "learning_rate": 0.001, "loss": 2.6422, "num_input_tokens_seen": 10485755456, "step": 40000 }, { "epoch": 0.19080100647530915, "eval_loss": 2.5144667625427246, "eval_runtime": 51.0006, "eval_samples_per_second": 98.038, "eval_steps_per_second": 24.51, "num_input_tokens_seen": 10485755456, "step": 40000 }, { "epoch": 0.19103950773340328, "grad_norm": 0.18717694282531738, "learning_rate": 0.001, "loss": 2.6512, "num_input_tokens_seen": 10498862656, "step": 40050 }, { "epoch": 0.19127800899149744, "grad_norm": 0.2009858638048172, "learning_rate": 0.001, "loss": 2.6289, "num_input_tokens_seen": 10511969856, "step": 40100 }, { "epoch": 0.19151651024959157, "grad_norm": 0.2515949010848999, "learning_rate": 0.001, "loss": 2.6342, "num_input_tokens_seen": 10525077056, "step": 40150 }, { "epoch": 0.1917550115076857, "grad_norm": 0.19864948093891144, "learning_rate": 0.001, "loss": 2.6191, "num_input_tokens_seen": 10538184256, "step": 40200 }, { "epoch": 0.19199351276577983, "grad_norm": 0.17704185843467712, "learning_rate": 0.001, "loss": 2.6176, "num_input_tokens_seen": 10551291456, "step": 40250 }, { "epoch": 0.19223201402387396, "grad_norm": 0.2097242772579193, "learning_rate": 0.001, "loss": 2.6509, "num_input_tokens_seen": 10564398656, "step": 40300 }, { "epoch": 0.19247051528196812, "grad_norm": 0.18630579113960266, "learning_rate": 0.001, "loss": 2.6273, "num_input_tokens_seen": 10577505856, "step": 40350 }, { "epoch": 0.19270901654006226, "grad_norm": 0.24162743985652924, "learning_rate": 0.001, "loss": 2.6405, "num_input_tokens_seen": 10590613056, "step": 40400 }, { "epoch": 0.1929475177981564, "grad_norm": 0.19576874375343323, "learning_rate": 0.001, "loss": 2.6403, "num_input_tokens_seen": 10603720256, "step": 40450 }, { "epoch": 0.19318601905625052, "grad_norm": 0.18408045172691345, "learning_rate": 0.001, "loss": 2.6149, "num_input_tokens_seen": 10616827456, "step": 40500 }, { "epoch": 0.19318601905625052, "eval_loss": 2.511899709701538, "eval_runtime": 51.5326, "eval_samples_per_second": 97.026, "eval_steps_per_second": 24.257, "num_input_tokens_seen": 10616827456, "step": 40500 }, { "epoch": 0.19342452031434465, "grad_norm": 0.20845313370227814, "learning_rate": 0.001, "loss": 2.6242, "num_input_tokens_seen": 10629934656, "step": 40550 }, { "epoch": 0.19366302157243878, "grad_norm": 0.20603816211223602, "learning_rate": 0.001, "loss": 2.6305, "num_input_tokens_seen": 10643041856, "step": 40600 }, { "epoch": 0.19390152283053294, "grad_norm": 0.2180013507604599, "learning_rate": 0.001, "loss": 2.6271, "num_input_tokens_seen": 10656149056, "step": 40650 }, { "epoch": 0.19414002408862707, "grad_norm": 0.22217005491256714, "learning_rate": 0.001, "loss": 2.6407, "num_input_tokens_seen": 10669256256, "step": 40700 }, { "epoch": 0.1943785253467212, "grad_norm": 0.21379347145557404, "learning_rate": 0.001, "loss": 2.6209, "num_input_tokens_seen": 10682363456, "step": 40750 }, { "epoch": 0.19461702660481534, "grad_norm": 0.2011626958847046, "learning_rate": 0.001, "loss": 2.6471, "num_input_tokens_seen": 10695470656, "step": 40800 }, { "epoch": 0.19485552786290947, "grad_norm": 0.1946493685245514, "learning_rate": 0.001, "loss": 2.6267, "num_input_tokens_seen": 10708577856, "step": 40850 }, { "epoch": 0.19509402912100363, "grad_norm": 0.19157454371452332, "learning_rate": 0.001, "loss": 2.6362, "num_input_tokens_seen": 10721685056, "step": 40900 }, { "epoch": 0.19533253037909776, "grad_norm": 0.1978122442960739, "learning_rate": 0.001, "loss": 2.6448, "num_input_tokens_seen": 10734792256, "step": 40950 }, { "epoch": 0.1955710316371919, "grad_norm": 0.19996555149555206, "learning_rate": 0.001, "loss": 2.626, "num_input_tokens_seen": 10747899456, "step": 41000 }, { "epoch": 0.1955710316371919, "eval_loss": 2.5084941387176514, "eval_runtime": 51.6987, "eval_samples_per_second": 96.714, "eval_steps_per_second": 24.179, "num_input_tokens_seen": 10747899456, "step": 41000 }, { "epoch": 0.19580953289528602, "grad_norm": 0.20298945903778076, "learning_rate": 0.001, "loss": 2.6233, "num_input_tokens_seen": 10761006656, "step": 41050 }, { "epoch": 0.19604803415338015, "grad_norm": 0.2280716896057129, "learning_rate": 0.001, "loss": 2.6427, "num_input_tokens_seen": 10774113856, "step": 41100 }, { "epoch": 0.19628653541147428, "grad_norm": 0.19223643839359283, "learning_rate": 0.001, "loss": 2.6263, "num_input_tokens_seen": 10787221056, "step": 41150 }, { "epoch": 0.19652503666956844, "grad_norm": 0.19221842288970947, "learning_rate": 0.001, "loss": 2.6401, "num_input_tokens_seen": 10800328256, "step": 41200 }, { "epoch": 0.19676353792766257, "grad_norm": 0.19479979574680328, "learning_rate": 0.001, "loss": 2.6269, "num_input_tokens_seen": 10813435456, "step": 41250 }, { "epoch": 0.1970020391857567, "grad_norm": 0.24501195549964905, "learning_rate": 0.001, "loss": 2.618, "num_input_tokens_seen": 10826542656, "step": 41300 }, { "epoch": 0.19724054044385084, "grad_norm": 0.1994044929742813, "learning_rate": 0.001, "loss": 2.64, "num_input_tokens_seen": 10839649856, "step": 41350 }, { "epoch": 0.19747904170194497, "grad_norm": 0.20831650495529175, "learning_rate": 0.001, "loss": 2.6513, "num_input_tokens_seen": 10852757056, "step": 41400 }, { "epoch": 0.19771754296003913, "grad_norm": 0.21919438242912292, "learning_rate": 0.001, "loss": 2.6379, "num_input_tokens_seen": 10865864256, "step": 41450 }, { "epoch": 0.19795604421813326, "grad_norm": 0.23088768124580383, "learning_rate": 0.001, "loss": 2.6449, "num_input_tokens_seen": 10878971456, "step": 41500 }, { "epoch": 0.19795604421813326, "eval_loss": 2.5156567096710205, "eval_runtime": 51.6776, "eval_samples_per_second": 96.754, "eval_steps_per_second": 24.188, "num_input_tokens_seen": 10878971456, "step": 41500 }, { "epoch": 0.1981945454762274, "grad_norm": 0.1982518881559372, "learning_rate": 0.001, "loss": 2.6304, "num_input_tokens_seen": 10892078656, "step": 41550 }, { "epoch": 0.19843304673432152, "grad_norm": 0.2099853903055191, "learning_rate": 0.001, "loss": 2.6305, "num_input_tokens_seen": 10905185856, "step": 41600 }, { "epoch": 0.19867154799241565, "grad_norm": 0.19403131306171417, "learning_rate": 0.001, "loss": 2.6419, "num_input_tokens_seen": 10918293056, "step": 41650 }, { "epoch": 0.19891004925050979, "grad_norm": 0.20865993201732635, "learning_rate": 0.001, "loss": 2.6116, "num_input_tokens_seen": 10931400256, "step": 41700 }, { "epoch": 0.19914855050860394, "grad_norm": 0.19042626023292542, "learning_rate": 0.001, "loss": 2.6271, "num_input_tokens_seen": 10944507456, "step": 41750 }, { "epoch": 0.19938705176669808, "grad_norm": 0.20514579117298126, "learning_rate": 0.001, "loss": 2.6348, "num_input_tokens_seen": 10957614656, "step": 41800 }, { "epoch": 0.1996255530247922, "grad_norm": 0.21224668622016907, "learning_rate": 0.001, "loss": 2.6314, "num_input_tokens_seen": 10970721856, "step": 41850 }, { "epoch": 0.19986405428288634, "grad_norm": 0.18857082724571228, "learning_rate": 0.001, "loss": 2.6217, "num_input_tokens_seen": 10983829056, "step": 41900 }, { "epoch": 0.20010255554098047, "grad_norm": 0.18431074917316437, "learning_rate": 0.001, "loss": 2.6267, "num_input_tokens_seen": 10996936256, "step": 41950 }, { "epoch": 0.2003410567990746, "grad_norm": 0.20570099353790283, "learning_rate": 0.001, "loss": 2.6016, "num_input_tokens_seen": 11010043456, "step": 42000 }, { "epoch": 0.2003410567990746, "eval_loss": 2.506241798400879, "eval_runtime": 51.5548, "eval_samples_per_second": 96.984, "eval_steps_per_second": 24.246, "num_input_tokens_seen": 11010043456, "step": 42000 }, { "epoch": 0.20057955805716876, "grad_norm": 0.17952106893062592, "learning_rate": 0.001, "loss": 2.6165, "num_input_tokens_seen": 11023150656, "step": 42050 }, { "epoch": 0.2008180593152629, "grad_norm": 0.20292694866657257, "learning_rate": 0.001, "loss": 2.6357, "num_input_tokens_seen": 11036257856, "step": 42100 }, { "epoch": 0.20105656057335702, "grad_norm": 0.19588933885097504, "learning_rate": 0.001, "loss": 2.6102, "num_input_tokens_seen": 11049365056, "step": 42150 }, { "epoch": 0.20129506183145116, "grad_norm": 0.1982785314321518, "learning_rate": 0.001, "loss": 2.6019, "num_input_tokens_seen": 11062472256, "step": 42200 }, { "epoch": 0.2015335630895453, "grad_norm": 0.18049876391887665, "learning_rate": 0.001, "loss": 2.6081, "num_input_tokens_seen": 11075579456, "step": 42250 }, { "epoch": 0.20177206434763945, "grad_norm": 0.2069908082485199, "learning_rate": 0.001, "loss": 2.6173, "num_input_tokens_seen": 11088686656, "step": 42300 }, { "epoch": 0.20201056560573358, "grad_norm": 0.2415982335805893, "learning_rate": 0.001, "loss": 2.6173, "num_input_tokens_seen": 11101793856, "step": 42350 }, { "epoch": 0.2022490668638277, "grad_norm": 0.20267252624034882, "learning_rate": 0.001, "loss": 2.6299, "num_input_tokens_seen": 11114901056, "step": 42400 }, { "epoch": 0.20248756812192184, "grad_norm": 0.20683065056800842, "learning_rate": 0.001, "loss": 2.6282, "num_input_tokens_seen": 11128008256, "step": 42450 }, { "epoch": 0.20272606938001597, "grad_norm": 0.22137881815433502, "learning_rate": 0.001, "loss": 2.6271, "num_input_tokens_seen": 11141115456, "step": 42500 }, { "epoch": 0.20272606938001597, "eval_loss": 2.5125572681427, "eval_runtime": 51.794, "eval_samples_per_second": 96.536, "eval_steps_per_second": 24.134, "num_input_tokens_seen": 11141115456, "step": 42500 }, { "epoch": 0.2029645706381101, "grad_norm": 0.20610037446022034, "learning_rate": 0.001, "loss": 2.6255, "num_input_tokens_seen": 11154222656, "step": 42550 }, { "epoch": 0.20320307189620426, "grad_norm": 0.21218810975551605, "learning_rate": 0.001, "loss": 2.6149, "num_input_tokens_seen": 11167329856, "step": 42600 }, { "epoch": 0.2034415731542984, "grad_norm": 0.19685466587543488, "learning_rate": 0.001, "loss": 2.6208, "num_input_tokens_seen": 11180437056, "step": 42650 }, { "epoch": 0.20368007441239253, "grad_norm": 0.20507460832595825, "learning_rate": 0.001, "loss": 2.6227, "num_input_tokens_seen": 11193544256, "step": 42700 }, { "epoch": 0.20391857567048666, "grad_norm": 0.20014505088329315, "learning_rate": 0.001, "loss": 2.6238, "num_input_tokens_seen": 11206651456, "step": 42750 }, { "epoch": 0.2041570769285808, "grad_norm": 0.1907282918691635, "learning_rate": 0.001, "loss": 2.6157, "num_input_tokens_seen": 11219758656, "step": 42800 }, { "epoch": 0.20439557818667495, "grad_norm": 0.18553833663463593, "learning_rate": 0.001, "loss": 2.6123, "num_input_tokens_seen": 11232865856, "step": 42850 }, { "epoch": 0.20463407944476908, "grad_norm": 0.20382866263389587, "learning_rate": 0.001, "loss": 2.6163, "num_input_tokens_seen": 11245973056, "step": 42900 }, { "epoch": 0.2048725807028632, "grad_norm": 0.18923860788345337, "learning_rate": 0.001, "loss": 2.5981, "num_input_tokens_seen": 11259080256, "step": 42950 }, { "epoch": 0.20511108196095734, "grad_norm": 0.19230851531028748, "learning_rate": 0.001, "loss": 2.618, "num_input_tokens_seen": 11272187456, "step": 43000 }, { "epoch": 0.20511108196095734, "eval_loss": 2.5047237873077393, "eval_runtime": 51.2959, "eval_samples_per_second": 97.474, "eval_steps_per_second": 24.368, "num_input_tokens_seen": 11272187456, "step": 43000 }, { "epoch": 0.20534958321905147, "grad_norm": 0.22746357321739197, "learning_rate": 0.001, "loss": 2.6281, "num_input_tokens_seen": 11285294656, "step": 43050 }, { "epoch": 0.2055880844771456, "grad_norm": 0.21107150614261627, "learning_rate": 0.001, "loss": 2.6154, "num_input_tokens_seen": 11298401856, "step": 43100 }, { "epoch": 0.20582658573523976, "grad_norm": 0.18025045096874237, "learning_rate": 0.001, "loss": 2.6141, "num_input_tokens_seen": 11311509056, "step": 43150 }, { "epoch": 0.2060650869933339, "grad_norm": 0.2009642869234085, "learning_rate": 0.001, "loss": 2.6133, "num_input_tokens_seen": 11324616256, "step": 43200 }, { "epoch": 0.20630358825142803, "grad_norm": 0.1872788518667221, "learning_rate": 0.001, "loss": 2.6197, "num_input_tokens_seen": 11337723456, "step": 43250 }, { "epoch": 0.20654208950952216, "grad_norm": 0.216310054063797, "learning_rate": 0.001, "loss": 2.6353, "num_input_tokens_seen": 11350830656, "step": 43300 }, { "epoch": 0.2067805907676163, "grad_norm": 0.2705513536930084, "learning_rate": 0.001, "loss": 2.6333, "num_input_tokens_seen": 11363937856, "step": 43350 }, { "epoch": 0.20701909202571045, "grad_norm": 0.3040550649166107, "learning_rate": 0.001, "loss": 2.6094, "num_input_tokens_seen": 11377045056, "step": 43400 }, { "epoch": 0.20725759328380458, "grad_norm": 0.2075599879026413, "learning_rate": 0.001, "loss": 2.6225, "num_input_tokens_seen": 11390152256, "step": 43450 }, { "epoch": 0.2074960945418987, "grad_norm": 0.22293590009212494, "learning_rate": 0.001, "loss": 2.6271, "num_input_tokens_seen": 11403259456, "step": 43500 }, { "epoch": 0.2074960945418987, "eval_loss": 2.5097975730895996, "eval_runtime": 51.7037, "eval_samples_per_second": 96.705, "eval_steps_per_second": 24.176, "num_input_tokens_seen": 11403259456, "step": 43500 }, { "epoch": 0.20773459579999284, "grad_norm": 0.21221335232257843, "learning_rate": 0.001, "loss": 2.618, "num_input_tokens_seen": 11416366656, "step": 43550 }, { "epoch": 0.20797309705808698, "grad_norm": 0.19894948601722717, "learning_rate": 0.001, "loss": 2.6305, "num_input_tokens_seen": 11429473856, "step": 43600 }, { "epoch": 0.2082115983161811, "grad_norm": 0.29371336102485657, "learning_rate": 0.001, "loss": 2.6211, "num_input_tokens_seen": 11442581056, "step": 43650 }, { "epoch": 0.20845009957427527, "grad_norm": 0.19441936910152435, "learning_rate": 0.001, "loss": 2.6355, "num_input_tokens_seen": 11455688256, "step": 43700 }, { "epoch": 0.2086886008323694, "grad_norm": 0.19868114590644836, "learning_rate": 0.001, "loss": 2.6206, "num_input_tokens_seen": 11468795456, "step": 43750 }, { "epoch": 0.20892710209046353, "grad_norm": 0.19971340894699097, "learning_rate": 0.001, "loss": 2.6124, "num_input_tokens_seen": 11481902656, "step": 43800 }, { "epoch": 0.20916560334855766, "grad_norm": 0.22261051833629608, "learning_rate": 0.001, "loss": 2.623, "num_input_tokens_seen": 11495009856, "step": 43850 }, { "epoch": 0.2094041046066518, "grad_norm": 0.20982281863689423, "learning_rate": 0.001, "loss": 2.6182, "num_input_tokens_seen": 11508117056, "step": 43900 }, { "epoch": 0.20964260586474592, "grad_norm": 0.2216535359621048, "learning_rate": 0.001, "loss": 2.6086, "num_input_tokens_seen": 11521224256, "step": 43950 }, { "epoch": 0.20988110712284008, "grad_norm": 0.19298988580703735, "learning_rate": 0.001, "loss": 2.6364, "num_input_tokens_seen": 11534331456, "step": 44000 }, { "epoch": 0.20988110712284008, "eval_loss": 2.5009121894836426, "eval_runtime": 51.4356, "eval_samples_per_second": 97.209, "eval_steps_per_second": 24.302, "num_input_tokens_seen": 11534331456, "step": 44000 }, { "epoch": 0.21011960838093421, "grad_norm": 0.19737008213996887, "learning_rate": 0.001, "loss": 2.6272, "num_input_tokens_seen": 11547438656, "step": 44050 }, { "epoch": 0.21035810963902835, "grad_norm": 0.1984977424144745, "learning_rate": 0.001, "loss": 2.6417, "num_input_tokens_seen": 11560545856, "step": 44100 }, { "epoch": 0.21059661089712248, "grad_norm": 0.19575904309749603, "learning_rate": 0.001, "loss": 2.6277, "num_input_tokens_seen": 11573653056, "step": 44150 }, { "epoch": 0.2108351121552166, "grad_norm": 0.19875651597976685, "learning_rate": 0.001, "loss": 2.6362, "num_input_tokens_seen": 11586760256, "step": 44200 }, { "epoch": 0.21107361341331077, "grad_norm": 0.20936185121536255, "learning_rate": 0.001, "loss": 2.6217, "num_input_tokens_seen": 11599867456, "step": 44250 }, { "epoch": 0.2113121146714049, "grad_norm": 0.19474463164806366, "learning_rate": 0.001, "loss": 2.6235, "num_input_tokens_seen": 11612974656, "step": 44300 }, { "epoch": 0.21155061592949903, "grad_norm": 0.20833207666873932, "learning_rate": 0.001, "loss": 2.6, "num_input_tokens_seen": 11626081856, "step": 44350 }, { "epoch": 0.21178911718759316, "grad_norm": 0.19269512593746185, "learning_rate": 0.001, "loss": 2.6211, "num_input_tokens_seen": 11639189056, "step": 44400 }, { "epoch": 0.2120276184456873, "grad_norm": 0.21018226444721222, "learning_rate": 0.001, "loss": 2.6294, "num_input_tokens_seen": 11652296256, "step": 44450 }, { "epoch": 0.21226611970378143, "grad_norm": 0.19836543500423431, "learning_rate": 0.001, "loss": 2.6051, "num_input_tokens_seen": 11665403456, "step": 44500 }, { "epoch": 0.21226611970378143, "eval_loss": 2.499817132949829, "eval_runtime": 50.9003, "eval_samples_per_second": 98.231, "eval_steps_per_second": 24.558, "num_input_tokens_seen": 11665403456, "step": 44500 }, { "epoch": 0.21250462096187558, "grad_norm": 0.18411967158317566, "learning_rate": 0.001, "loss": 2.6228, "num_input_tokens_seen": 11678510656, "step": 44550 }, { "epoch": 0.21274312221996972, "grad_norm": 0.19387467205524445, "learning_rate": 0.001, "loss": 2.5902, "num_input_tokens_seen": 11691617856, "step": 44600 }, { "epoch": 0.21298162347806385, "grad_norm": 0.22076952457427979, "learning_rate": 0.001, "loss": 2.613, "num_input_tokens_seen": 11704725056, "step": 44650 }, { "epoch": 0.21322012473615798, "grad_norm": 0.33861082792282104, "learning_rate": 0.001, "loss": 2.6142, "num_input_tokens_seen": 11717832256, "step": 44700 }, { "epoch": 0.2134586259942521, "grad_norm": 0.20097902417182922, "learning_rate": 0.001, "loss": 2.6549, "num_input_tokens_seen": 11730939456, "step": 44750 }, { "epoch": 0.21369712725234627, "grad_norm": 0.24534635245800018, "learning_rate": 0.001, "loss": 2.6293, "num_input_tokens_seen": 11744046656, "step": 44800 }, { "epoch": 0.2139356285104404, "grad_norm": 0.2439020723104477, "learning_rate": 0.001, "loss": 2.635, "num_input_tokens_seen": 11757153856, "step": 44850 }, { "epoch": 0.21417412976853453, "grad_norm": 0.24259154498577118, "learning_rate": 0.001, "loss": 2.6232, "num_input_tokens_seen": 11770261056, "step": 44900 }, { "epoch": 0.21441263102662866, "grad_norm": 0.23554636538028717, "learning_rate": 0.001, "loss": 2.6061, "num_input_tokens_seen": 11783368256, "step": 44950 }, { "epoch": 0.2146511322847228, "grad_norm": 0.20377275347709656, "learning_rate": 0.001, "loss": 2.6156, "num_input_tokens_seen": 11796475456, "step": 45000 }, { "epoch": 0.2146511322847228, "eval_loss": 2.503781318664551, "eval_runtime": 51.1656, "eval_samples_per_second": 97.722, "eval_steps_per_second": 24.43, "num_input_tokens_seen": 11796475456, "step": 45000 }, { "epoch": 0.21488963354281693, "grad_norm": 0.226406991481781, "learning_rate": 0.001, "loss": 2.626, "num_input_tokens_seen": 11809582656, "step": 45050 }, { "epoch": 0.21512813480091109, "grad_norm": 0.20505741238594055, "learning_rate": 0.001, "loss": 2.6095, "num_input_tokens_seen": 11822689856, "step": 45100 }, { "epoch": 0.21536663605900522, "grad_norm": 0.2917146682739258, "learning_rate": 0.001, "loss": 2.6439, "num_input_tokens_seen": 11835797056, "step": 45150 }, { "epoch": 0.21560513731709935, "grad_norm": 0.24030283093452454, "learning_rate": 0.001, "loss": 2.6386, "num_input_tokens_seen": 11848904256, "step": 45200 }, { "epoch": 0.21584363857519348, "grad_norm": 0.1799454241991043, "learning_rate": 0.001, "loss": 2.6344, "num_input_tokens_seen": 11862011456, "step": 45250 }, { "epoch": 0.2160821398332876, "grad_norm": 0.2093718945980072, "learning_rate": 0.001, "loss": 2.6152, "num_input_tokens_seen": 11875118656, "step": 45300 }, { "epoch": 0.21632064109138174, "grad_norm": 0.19477079808712006, "learning_rate": 0.001, "loss": 2.622, "num_input_tokens_seen": 11888225856, "step": 45350 }, { "epoch": 0.2165591423494759, "grad_norm": 0.2764741778373718, "learning_rate": 0.001, "loss": 2.5951, "num_input_tokens_seen": 11901333056, "step": 45400 }, { "epoch": 0.21679764360757003, "grad_norm": 0.2127208709716797, "learning_rate": 0.001, "loss": 2.6231, "num_input_tokens_seen": 11914440256, "step": 45450 }, { "epoch": 0.21703614486566417, "grad_norm": 0.21089383959770203, "learning_rate": 0.001, "loss": 2.6099, "num_input_tokens_seen": 11927547456, "step": 45500 }, { "epoch": 0.21703614486566417, "eval_loss": 2.502464771270752, "eval_runtime": 50.946, "eval_samples_per_second": 98.143, "eval_steps_per_second": 24.536, "num_input_tokens_seen": 11927547456, "step": 45500 }, { "epoch": 0.2172746461237583, "grad_norm": 0.19550016522407532, "learning_rate": 0.001, "loss": 2.6365, "num_input_tokens_seen": 11940654656, "step": 45550 }, { "epoch": 0.21751314738185243, "grad_norm": 0.18284358084201813, "learning_rate": 0.001, "loss": 2.6358, "num_input_tokens_seen": 11953761856, "step": 45600 }, { "epoch": 0.2177516486399466, "grad_norm": 0.21821847558021545, "learning_rate": 0.001, "loss": 2.607, "num_input_tokens_seen": 11966869056, "step": 45650 }, { "epoch": 0.21799014989804072, "grad_norm": 0.2195073515176773, "learning_rate": 0.001, "loss": 2.6195, "num_input_tokens_seen": 11979976256, "step": 45700 }, { "epoch": 0.21822865115613485, "grad_norm": 0.19679750502109528, "learning_rate": 0.001, "loss": 2.6259, "num_input_tokens_seen": 11993083456, "step": 45750 }, { "epoch": 0.21846715241422898, "grad_norm": 0.1985604166984558, "learning_rate": 0.001, "loss": 2.6224, "num_input_tokens_seen": 12006190656, "step": 45800 }, { "epoch": 0.2187056536723231, "grad_norm": 0.18398787081241608, "learning_rate": 0.001, "loss": 2.6215, "num_input_tokens_seen": 12019297856, "step": 45850 }, { "epoch": 0.21894415493041725, "grad_norm": 0.2306145578622818, "learning_rate": 0.001, "loss": 2.6346, "num_input_tokens_seen": 12032405056, "step": 45900 }, { "epoch": 0.2191826561885114, "grad_norm": 0.21335257589817047, "learning_rate": 0.001, "loss": 2.6232, "num_input_tokens_seen": 12045512256, "step": 45950 }, { "epoch": 0.21942115744660554, "grad_norm": 0.22988814115524292, "learning_rate": 0.001, "loss": 2.6132, "num_input_tokens_seen": 12058619456, "step": 46000 }, { "epoch": 0.21942115744660554, "eval_loss": 2.499041795730591, "eval_runtime": 50.6868, "eval_samples_per_second": 98.645, "eval_steps_per_second": 24.661, "num_input_tokens_seen": 12058619456, "step": 46000 }, { "epoch": 0.21965965870469967, "grad_norm": 0.19492709636688232, "learning_rate": 0.001, "loss": 2.6196, "num_input_tokens_seen": 12071726656, "step": 46050 }, { "epoch": 0.2198981599627938, "grad_norm": 0.19643568992614746, "learning_rate": 0.001, "loss": 2.6108, "num_input_tokens_seen": 12084833856, "step": 46100 }, { "epoch": 0.22013666122088793, "grad_norm": 0.18720099329948425, "learning_rate": 0.001, "loss": 2.6181, "num_input_tokens_seen": 12097941056, "step": 46150 }, { "epoch": 0.2203751624789821, "grad_norm": 0.1929876208305359, "learning_rate": 0.001, "loss": 2.6152, "num_input_tokens_seen": 12111048256, "step": 46200 }, { "epoch": 0.22061366373707622, "grad_norm": 0.19732603430747986, "learning_rate": 0.001, "loss": 2.6267, "num_input_tokens_seen": 12124155456, "step": 46250 }, { "epoch": 0.22085216499517035, "grad_norm": 0.1964132934808731, "learning_rate": 0.001, "loss": 2.605, "num_input_tokens_seen": 12137262656, "step": 46300 }, { "epoch": 0.22109066625326448, "grad_norm": 0.1927288919687271, "learning_rate": 0.001, "loss": 2.6178, "num_input_tokens_seen": 12150369856, "step": 46350 }, { "epoch": 0.22132916751135862, "grad_norm": 0.17873398959636688, "learning_rate": 0.001, "loss": 2.6033, "num_input_tokens_seen": 12163477056, "step": 46400 }, { "epoch": 0.22156766876945275, "grad_norm": 0.24716190993785858, "learning_rate": 0.001, "loss": 2.6141, "num_input_tokens_seen": 12176584256, "step": 46450 }, { "epoch": 0.2218061700275469, "grad_norm": 0.2021339386701584, "learning_rate": 0.001, "loss": 2.6259, "num_input_tokens_seen": 12189691456, "step": 46500 }, { "epoch": 0.2218061700275469, "eval_loss": 2.4975087642669678, "eval_runtime": 50.8921, "eval_samples_per_second": 98.247, "eval_steps_per_second": 24.562, "num_input_tokens_seen": 12189691456, "step": 46500 }, { "epoch": 0.22204467128564104, "grad_norm": 0.20796166360378265, "learning_rate": 0.001, "loss": 2.6211, "num_input_tokens_seen": 12202798656, "step": 46550 }, { "epoch": 0.22228317254373517, "grad_norm": 0.20472556352615356, "learning_rate": 0.001, "loss": 2.6123, "num_input_tokens_seen": 12215905856, "step": 46600 }, { "epoch": 0.2225216738018293, "grad_norm": 0.20017485320568085, "learning_rate": 0.001, "loss": 2.6037, "num_input_tokens_seen": 12229013056, "step": 46650 }, { "epoch": 0.22276017505992343, "grad_norm": 0.2037762850522995, "learning_rate": 0.001, "loss": 2.6155, "num_input_tokens_seen": 12242120256, "step": 46700 }, { "epoch": 0.2229986763180176, "grad_norm": 0.19346804916858673, "learning_rate": 0.001, "loss": 2.601, "num_input_tokens_seen": 12255227456, "step": 46750 }, { "epoch": 0.22323717757611172, "grad_norm": 0.18640096485614777, "learning_rate": 0.001, "loss": 2.6168, "num_input_tokens_seen": 12268334656, "step": 46800 }, { "epoch": 0.22347567883420585, "grad_norm": 0.20295055210590363, "learning_rate": 0.001, "loss": 2.6221, "num_input_tokens_seen": 12281441856, "step": 46850 }, { "epoch": 0.22371418009229999, "grad_norm": 0.20705671608448029, "learning_rate": 0.001, "loss": 2.6202, "num_input_tokens_seen": 12294549056, "step": 46900 }, { "epoch": 0.22395268135039412, "grad_norm": 0.18724282085895538, "learning_rate": 0.001, "loss": 2.6061, "num_input_tokens_seen": 12307656256, "step": 46950 }, { "epoch": 0.22419118260848825, "grad_norm": 0.18210910260677338, "learning_rate": 0.001, "loss": 2.6045, "num_input_tokens_seen": 12320763456, "step": 47000 }, { "epoch": 0.22419118260848825, "eval_loss": 2.497344493865967, "eval_runtime": 51.17, "eval_samples_per_second": 97.713, "eval_steps_per_second": 24.428, "num_input_tokens_seen": 12320763456, "step": 47000 }, { "epoch": 0.2244296838665824, "grad_norm": 0.18894509971141815, "learning_rate": 0.001, "loss": 2.6069, "num_input_tokens_seen": 12333870656, "step": 47050 }, { "epoch": 0.22466818512467654, "grad_norm": 0.23441652953624725, "learning_rate": 0.001, "loss": 2.6092, "num_input_tokens_seen": 12346977856, "step": 47100 }, { "epoch": 0.22490668638277067, "grad_norm": 0.20195326209068298, "learning_rate": 0.001, "loss": 2.6135, "num_input_tokens_seen": 12360085056, "step": 47150 }, { "epoch": 0.2251451876408648, "grad_norm": 0.22025838494300842, "learning_rate": 0.001, "loss": 2.6034, "num_input_tokens_seen": 12373192256, "step": 47200 }, { "epoch": 0.22538368889895893, "grad_norm": 0.19111979007720947, "learning_rate": 0.001, "loss": 2.6151, "num_input_tokens_seen": 12386299456, "step": 47250 }, { "epoch": 0.22562219015705307, "grad_norm": 0.2010103464126587, "learning_rate": 0.001, "loss": 2.6031, "num_input_tokens_seen": 12399406656, "step": 47300 }, { "epoch": 0.22586069141514722, "grad_norm": 0.21569807827472687, "learning_rate": 0.001, "loss": 2.6012, "num_input_tokens_seen": 12412513856, "step": 47350 }, { "epoch": 0.22609919267324136, "grad_norm": 0.18600653111934662, "learning_rate": 0.001, "loss": 2.6087, "num_input_tokens_seen": 12425621056, "step": 47400 }, { "epoch": 0.2263376939313355, "grad_norm": 0.19476164877414703, "learning_rate": 0.001, "loss": 2.6179, "num_input_tokens_seen": 12438728256, "step": 47450 }, { "epoch": 0.22657619518942962, "grad_norm": 0.19705821573734283, "learning_rate": 0.001, "loss": 2.5983, "num_input_tokens_seen": 12451835456, "step": 47500 }, { "epoch": 0.22657619518942962, "eval_loss": 2.495936393737793, "eval_runtime": 51.8116, "eval_samples_per_second": 96.504, "eval_steps_per_second": 24.126, "num_input_tokens_seen": 12451835456, "step": 47500 }, { "epoch": 0.22681469644752375, "grad_norm": 0.23161695897579193, "learning_rate": 0.001, "loss": 2.5974, "num_input_tokens_seen": 12464942656, "step": 47550 }, { "epoch": 0.2270531977056179, "grad_norm": 0.2022540420293808, "learning_rate": 0.001, "loss": 2.6251, "num_input_tokens_seen": 12478049856, "step": 47600 }, { "epoch": 0.22729169896371204, "grad_norm": 1.0341856479644775, "learning_rate": 0.001, "loss": 2.5831, "num_input_tokens_seen": 12491157056, "step": 47650 }, { "epoch": 0.22753020022180617, "grad_norm": 0.3812394440174103, "learning_rate": 0.001, "loss": 2.6407, "num_input_tokens_seen": 12504264256, "step": 47700 }, { "epoch": 0.2277687014799003, "grad_norm": 0.27030590176582336, "learning_rate": 0.001, "loss": 2.6327, "num_input_tokens_seen": 12517371456, "step": 47750 }, { "epoch": 0.22800720273799444, "grad_norm": 1.3918724060058594, "learning_rate": 0.001, "loss": 2.6344, "num_input_tokens_seen": 12530478656, "step": 47800 }, { "epoch": 0.22824570399608857, "grad_norm": 0.22610582411289215, "learning_rate": 0.001, "loss": 2.6444, "num_input_tokens_seen": 12543585856, "step": 47850 }, { "epoch": 0.22848420525418273, "grad_norm": 0.21421480178833008, "learning_rate": 0.001, "loss": 2.6169, "num_input_tokens_seen": 12556693056, "step": 47900 }, { "epoch": 0.22872270651227686, "grad_norm": 0.20389467477798462, "learning_rate": 0.001, "loss": 2.6158, "num_input_tokens_seen": 12569800256, "step": 47950 }, { "epoch": 0.228961207770371, "grad_norm": 0.2265746295452118, "learning_rate": 0.001, "loss": 2.6101, "num_input_tokens_seen": 12582907456, "step": 48000 }, { "epoch": 0.228961207770371, "eval_loss": 2.4971351623535156, "eval_runtime": 54.0453, "eval_samples_per_second": 92.515, "eval_steps_per_second": 23.129, "num_input_tokens_seen": 12582907456, "step": 48000 }, { "epoch": 0.22919970902846512, "grad_norm": 0.20247948169708252, "learning_rate": 0.001, "loss": 2.6122, "num_input_tokens_seen": 12596014656, "step": 48050 }, { "epoch": 0.22943821028655925, "grad_norm": 0.20237554609775543, "learning_rate": 0.001, "loss": 2.6235, "num_input_tokens_seen": 12609121856, "step": 48100 }, { "epoch": 0.2296767115446534, "grad_norm": 0.19862660765647888, "learning_rate": 0.001, "loss": 2.6264, "num_input_tokens_seen": 12622229056, "step": 48150 }, { "epoch": 0.22991521280274754, "grad_norm": 0.20839153230190277, "learning_rate": 0.001, "loss": 2.5915, "num_input_tokens_seen": 12635336256, "step": 48200 }, { "epoch": 0.23015371406084167, "grad_norm": 0.19385166466236115, "learning_rate": 0.001, "loss": 2.5979, "num_input_tokens_seen": 12648443456, "step": 48250 }, { "epoch": 0.2303922153189358, "grad_norm": 0.197597935795784, "learning_rate": 0.001, "loss": 2.6093, "num_input_tokens_seen": 12661550656, "step": 48300 }, { "epoch": 0.23063071657702994, "grad_norm": 0.20289985835552216, "learning_rate": 0.001, "loss": 2.6039, "num_input_tokens_seen": 12674657856, "step": 48350 }, { "epoch": 0.23086921783512407, "grad_norm": 0.1986515372991562, "learning_rate": 0.001, "loss": 2.6048, "num_input_tokens_seen": 12687765056, "step": 48400 }, { "epoch": 0.23110771909321823, "grad_norm": 0.19720982015132904, "learning_rate": 0.001, "loss": 2.6171, "num_input_tokens_seen": 12700872256, "step": 48450 }, { "epoch": 0.23134622035131236, "grad_norm": 0.24635523557662964, "learning_rate": 0.001, "loss": 2.6242, "num_input_tokens_seen": 12713979456, "step": 48500 }, { "epoch": 0.23134622035131236, "eval_loss": 2.495468854904175, "eval_runtime": 53.4259, "eval_samples_per_second": 93.588, "eval_steps_per_second": 23.397, "num_input_tokens_seen": 12713979456, "step": 48500 }, { "epoch": 0.2315847216094065, "grad_norm": 0.5883195996284485, "learning_rate": 0.001, "loss": 2.6399, "num_input_tokens_seen": 12727086656, "step": 48550 }, { "epoch": 0.23182322286750062, "grad_norm": 0.20890024304389954, "learning_rate": 0.001, "loss": 2.6325, "num_input_tokens_seen": 12740193856, "step": 48600 }, { "epoch": 0.23206172412559475, "grad_norm": 0.21251678466796875, "learning_rate": 0.001, "loss": 2.6233, "num_input_tokens_seen": 12753301056, "step": 48650 }, { "epoch": 0.23230022538368889, "grad_norm": 0.20996986329555511, "learning_rate": 0.001, "loss": 2.6174, "num_input_tokens_seen": 12766408256, "step": 48700 }, { "epoch": 0.23253872664178304, "grad_norm": 0.23039382696151733, "learning_rate": 0.001, "loss": 2.6305, "num_input_tokens_seen": 12779515456, "step": 48750 }, { "epoch": 0.23277722789987718, "grad_norm": 0.23922136425971985, "learning_rate": 0.001, "loss": 2.6108, "num_input_tokens_seen": 12792622656, "step": 48800 }, { "epoch": 0.2330157291579713, "grad_norm": 0.22746366262435913, "learning_rate": 0.001, "loss": 2.6219, "num_input_tokens_seen": 12805729856, "step": 48850 }, { "epoch": 0.23325423041606544, "grad_norm": 0.22131897509098053, "learning_rate": 0.001, "loss": 2.6205, "num_input_tokens_seen": 12818837056, "step": 48900 }, { "epoch": 0.23349273167415957, "grad_norm": 0.25431814789772034, "learning_rate": 0.001, "loss": 2.6252, "num_input_tokens_seen": 12831944256, "step": 48950 }, { "epoch": 0.23373123293225373, "grad_norm": 0.2622738778591156, "learning_rate": 0.001, "loss": 2.6288, "num_input_tokens_seen": 12845051456, "step": 49000 }, { "epoch": 0.23373123293225373, "eval_loss": 2.498055934906006, "eval_runtime": 53.8861, "eval_samples_per_second": 92.788, "eval_steps_per_second": 23.197, "num_input_tokens_seen": 12845051456, "step": 49000 }, { "epoch": 0.23396973419034786, "grad_norm": 0.209337517619133, "learning_rate": 0.001, "loss": 2.6348, "num_input_tokens_seen": 12858158656, "step": 49050 }, { "epoch": 0.234208235448442, "grad_norm": 0.1974038928747177, "learning_rate": 0.001, "loss": 2.6158, "num_input_tokens_seen": 12871265856, "step": 49100 }, { "epoch": 0.23444673670653612, "grad_norm": 0.28099164366722107, "learning_rate": 0.001, "loss": 2.6101, "num_input_tokens_seen": 12884373056, "step": 49150 }, { "epoch": 0.23468523796463026, "grad_norm": 0.2172873318195343, "learning_rate": 0.001, "loss": 2.596, "num_input_tokens_seen": 12897480256, "step": 49200 }, { "epoch": 0.2349237392227244, "grad_norm": 0.2120896875858307, "learning_rate": 0.001, "loss": 2.5994, "num_input_tokens_seen": 12910587456, "step": 49250 }, { "epoch": 0.23516224048081855, "grad_norm": 0.20109935104846954, "learning_rate": 0.001, "loss": 2.6101, "num_input_tokens_seen": 12923694656, "step": 49300 }, { "epoch": 0.23540074173891268, "grad_norm": 0.20735585689544678, "learning_rate": 0.001, "loss": 2.6142, "num_input_tokens_seen": 12936801856, "step": 49350 }, { "epoch": 0.2356392429970068, "grad_norm": 0.21295137703418732, "learning_rate": 0.001, "loss": 2.6226, "num_input_tokens_seen": 12949909056, "step": 49400 }, { "epoch": 0.23587774425510094, "grad_norm": 0.20560845732688904, "learning_rate": 0.001, "loss": 2.6027, "num_input_tokens_seen": 12963016256, "step": 49450 }, { "epoch": 0.23611624551319507, "grad_norm": 0.33747321367263794, "learning_rate": 0.001, "loss": 2.6231, "num_input_tokens_seen": 12976123456, "step": 49500 }, { "epoch": 0.23611624551319507, "eval_loss": 2.5008058547973633, "eval_runtime": 54.2104, "eval_samples_per_second": 92.233, "eval_steps_per_second": 23.058, "num_input_tokens_seen": 12976123456, "step": 49500 }, { "epoch": 0.23635474677128923, "grad_norm": 0.24593485891819, "learning_rate": 0.001, "loss": 2.6336, "num_input_tokens_seen": 12989230656, "step": 49550 }, { "epoch": 0.23659324802938336, "grad_norm": 0.25253933668136597, "learning_rate": 0.001, "loss": 2.643, "num_input_tokens_seen": 13002337856, "step": 49600 }, { "epoch": 0.2368317492874775, "grad_norm": 0.24231670796871185, "learning_rate": 0.001, "loss": 2.6074, "num_input_tokens_seen": 13015445056, "step": 49650 }, { "epoch": 0.23707025054557163, "grad_norm": 0.2178962677717209, "learning_rate": 0.001, "loss": 2.6184, "num_input_tokens_seen": 13028552256, "step": 49700 }, { "epoch": 0.23730875180366576, "grad_norm": 0.2651260793209076, "learning_rate": 0.001, "loss": 2.6335, "num_input_tokens_seen": 13041659456, "step": 49750 }, { "epoch": 0.2375472530617599, "grad_norm": 0.1909639537334442, "learning_rate": 0.001, "loss": 2.61, "num_input_tokens_seen": 13054766656, "step": 49800 }, { "epoch": 0.23778575431985405, "grad_norm": 0.21107855439186096, "learning_rate": 0.001, "loss": 2.6333, "num_input_tokens_seen": 13067873856, "step": 49850 }, { "epoch": 0.23802425557794818, "grad_norm": 0.19366736710071564, "learning_rate": 0.001, "loss": 2.6068, "num_input_tokens_seen": 13080981056, "step": 49900 }, { "epoch": 0.2382627568360423, "grad_norm": 0.2851523458957672, "learning_rate": 0.001, "loss": 2.6183, "num_input_tokens_seen": 13094088256, "step": 49950 }, { "epoch": 0.23850125809413644, "grad_norm": 0.23617912828922272, "learning_rate": 0.001, "loss": 2.617, "num_input_tokens_seen": 13107195456, "step": 50000 }, { "epoch": 0.23850125809413644, "eval_loss": 2.497406005859375, "eval_runtime": 53.6538, "eval_samples_per_second": 93.19, "eval_steps_per_second": 23.298, "num_input_tokens_seen": 13107195456, "step": 50000 }, { "epoch": 0.23873975935223057, "grad_norm": 0.5069316029548645, "learning_rate": 0.001, "loss": 2.6591, "num_input_tokens_seen": 13120302656, "step": 50050 }, { "epoch": 0.23897826061032473, "grad_norm": 0.21306034922599792, "learning_rate": 0.001, "loss": 2.6455, "num_input_tokens_seen": 13133409856, "step": 50100 }, { "epoch": 0.23921676186841886, "grad_norm": 0.2045888900756836, "learning_rate": 0.001, "loss": 2.6227, "num_input_tokens_seen": 13146517056, "step": 50150 }, { "epoch": 0.239455263126513, "grad_norm": 0.2335623949766159, "learning_rate": 0.001, "loss": 2.6097, "num_input_tokens_seen": 13159624256, "step": 50200 }, { "epoch": 0.23969376438460713, "grad_norm": 0.19884036481380463, "learning_rate": 0.001, "loss": 2.6189, "num_input_tokens_seen": 13172731456, "step": 50250 }, { "epoch": 0.23993226564270126, "grad_norm": 0.21080589294433594, "learning_rate": 0.001, "loss": 2.6057, "num_input_tokens_seen": 13185838656, "step": 50300 }, { "epoch": 0.2401707669007954, "grad_norm": 0.21613669395446777, "learning_rate": 0.001, "loss": 2.6045, "num_input_tokens_seen": 13198945856, "step": 50350 }, { "epoch": 0.24040926815888955, "grad_norm": 0.2029023915529251, "learning_rate": 0.001, "loss": 2.6127, "num_input_tokens_seen": 13212053056, "step": 50400 }, { "epoch": 0.24064776941698368, "grad_norm": 0.2275777906179428, "learning_rate": 0.001, "loss": 2.6149, "num_input_tokens_seen": 13225160256, "step": 50450 }, { "epoch": 0.2408862706750778, "grad_norm": 0.3332397937774658, "learning_rate": 0.001, "loss": 2.6013, "num_input_tokens_seen": 13238267456, "step": 50500 }, { "epoch": 0.2408862706750778, "eval_loss": 2.5022270679473877, "eval_runtime": 53.5942, "eval_samples_per_second": 93.294, "eval_steps_per_second": 23.323, "num_input_tokens_seen": 13238267456, "step": 50500 }, { "epoch": 0.24112477193317194, "grad_norm": 0.2197851538658142, "learning_rate": 0.001, "loss": 2.6326, "num_input_tokens_seen": 13251374656, "step": 50550 }, { "epoch": 0.24136327319126608, "grad_norm": 0.2201780080795288, "learning_rate": 0.001, "loss": 2.6265, "num_input_tokens_seen": 13264481856, "step": 50600 }, { "epoch": 0.2416017744493602, "grad_norm": 0.2196362316608429, "learning_rate": 0.001, "loss": 2.6272, "num_input_tokens_seen": 13277589056, "step": 50650 }, { "epoch": 0.24184027570745437, "grad_norm": 0.2234160453081131, "learning_rate": 0.001, "loss": 2.6178, "num_input_tokens_seen": 13290696256, "step": 50700 }, { "epoch": 0.2420787769655485, "grad_norm": 0.24019016325473785, "learning_rate": 0.001, "loss": 2.6142, "num_input_tokens_seen": 13303803456, "step": 50750 }, { "epoch": 0.24231727822364263, "grad_norm": 0.21481236815452576, "learning_rate": 0.001, "loss": 2.6149, "num_input_tokens_seen": 13316910656, "step": 50800 }, { "epoch": 0.24255577948173676, "grad_norm": 0.20477178692817688, "learning_rate": 0.001, "loss": 2.5977, "num_input_tokens_seen": 13330017856, "step": 50850 }, { "epoch": 0.2427942807398309, "grad_norm": 0.20742499828338623, "learning_rate": 0.001, "loss": 2.6153, "num_input_tokens_seen": 13343125056, "step": 50900 }, { "epoch": 0.24303278199792505, "grad_norm": 0.21933062374591827, "learning_rate": 0.001, "loss": 2.5966, "num_input_tokens_seen": 13356232256, "step": 50950 }, { "epoch": 0.24327128325601918, "grad_norm": 0.3282420337200165, "learning_rate": 0.001, "loss": 2.6063, "num_input_tokens_seen": 13369339456, "step": 51000 }, { "epoch": 0.24327128325601918, "eval_loss": 2.4981296062469482, "eval_runtime": 53.5536, "eval_samples_per_second": 93.364, "eval_steps_per_second": 23.341, "num_input_tokens_seen": 13369339456, "step": 51000 }, { "epoch": 0.24350978451411331, "grad_norm": 0.20502831041812897, "learning_rate": 0.001, "loss": 2.6059, "num_input_tokens_seen": 13382446656, "step": 51050 }, { "epoch": 0.24374828577220745, "grad_norm": 0.20750559866428375, "learning_rate": 0.001, "loss": 2.6056, "num_input_tokens_seen": 13395553856, "step": 51100 }, { "epoch": 0.24398678703030158, "grad_norm": 0.19882823526859283, "learning_rate": 0.001, "loss": 2.5983, "num_input_tokens_seen": 13408661056, "step": 51150 }, { "epoch": 0.2442252882883957, "grad_norm": 0.20900660753250122, "learning_rate": 0.001, "loss": 2.6087, "num_input_tokens_seen": 13421768256, "step": 51200 }, { "epoch": 0.24446378954648987, "grad_norm": 0.21428415179252625, "learning_rate": 0.001, "loss": 2.5901, "num_input_tokens_seen": 13434875456, "step": 51250 }, { "epoch": 0.244702290804584, "grad_norm": 0.19987250864505768, "learning_rate": 0.001, "loss": 2.5982, "num_input_tokens_seen": 13447982656, "step": 51300 }, { "epoch": 0.24494079206267813, "grad_norm": 0.2045862078666687, "learning_rate": 0.001, "loss": 2.6058, "num_input_tokens_seen": 13461089856, "step": 51350 }, { "epoch": 0.24517929332077226, "grad_norm": 0.22261273860931396, "learning_rate": 0.001, "loss": 2.5972, "num_input_tokens_seen": 13474197056, "step": 51400 }, { "epoch": 0.2454177945788664, "grad_norm": 0.20395706593990326, "learning_rate": 0.001, "loss": 2.6064, "num_input_tokens_seen": 13487304256, "step": 51450 }, { "epoch": 0.24565629583696055, "grad_norm": 0.21490858495235443, "learning_rate": 0.001, "loss": 2.5922, "num_input_tokens_seen": 13500411456, "step": 51500 }, { "epoch": 0.24565629583696055, "eval_loss": 2.488300085067749, "eval_runtime": 53.7972, "eval_samples_per_second": 92.942, "eval_steps_per_second": 23.235, "num_input_tokens_seen": 13500411456, "step": 51500 }, { "epoch": 0.24589479709505468, "grad_norm": 0.2039102464914322, "learning_rate": 0.001, "loss": 2.5894, "num_input_tokens_seen": 13513518656, "step": 51550 }, { "epoch": 0.24613329835314882, "grad_norm": 0.21426360309123993, "learning_rate": 0.001, "loss": 2.6089, "num_input_tokens_seen": 13526625856, "step": 51600 }, { "epoch": 0.24637179961124295, "grad_norm": 0.194682314991951, "learning_rate": 0.001, "loss": 2.5932, "num_input_tokens_seen": 13539733056, "step": 51650 }, { "epoch": 0.24661030086933708, "grad_norm": 0.1901472508907318, "learning_rate": 0.001, "loss": 2.6031, "num_input_tokens_seen": 13552840256, "step": 51700 }, { "epoch": 0.2468488021274312, "grad_norm": 0.20517823100090027, "learning_rate": 0.001, "loss": 2.5978, "num_input_tokens_seen": 13565947456, "step": 51750 }, { "epoch": 0.24708730338552537, "grad_norm": 0.23713302612304688, "learning_rate": 0.001, "loss": 2.6061, "num_input_tokens_seen": 13579054656, "step": 51800 }, { "epoch": 0.2473258046436195, "grad_norm": 0.2431441992521286, "learning_rate": 0.001, "loss": 2.6062, "num_input_tokens_seen": 13592161856, "step": 51850 }, { "epoch": 0.24756430590171363, "grad_norm": 0.20358557999134064, "learning_rate": 0.001, "loss": 2.6161, "num_input_tokens_seen": 13605269056, "step": 51900 }, { "epoch": 0.24780280715980776, "grad_norm": 0.21245016157627106, "learning_rate": 0.001, "loss": 2.6166, "num_input_tokens_seen": 13618376256, "step": 51950 }, { "epoch": 0.2480413084179019, "grad_norm": 0.24295999109745026, "learning_rate": 0.001, "loss": 2.6139, "num_input_tokens_seen": 13631483456, "step": 52000 }, { "epoch": 0.2480413084179019, "eval_loss": 2.4932186603546143, "eval_runtime": 53.6797, "eval_samples_per_second": 93.145, "eval_steps_per_second": 23.286, "num_input_tokens_seen": 13631483456, "step": 52000 }, { "epoch": 0.24827980967599603, "grad_norm": 0.22135989367961884, "learning_rate": 0.001, "loss": 2.5947, "num_input_tokens_seen": 13644590656, "step": 52050 }, { "epoch": 0.2485183109340902, "grad_norm": 0.3656958341598511, "learning_rate": 0.001, "loss": 2.6263, "num_input_tokens_seen": 13657697856, "step": 52100 }, { "epoch": 0.24875681219218432, "grad_norm": 0.2960817813873291, "learning_rate": 0.001, "loss": 2.6086, "num_input_tokens_seen": 13670805056, "step": 52150 }, { "epoch": 0.24899531345027845, "grad_norm": 0.2150612622499466, "learning_rate": 0.001, "loss": 2.6314, "num_input_tokens_seen": 13683912256, "step": 52200 }, { "epoch": 0.24923381470837258, "grad_norm": 0.23089592158794403, "learning_rate": 0.001, "loss": 2.6072, "num_input_tokens_seen": 13697019456, "step": 52250 }, { "epoch": 0.2494723159664667, "grad_norm": 0.19151148200035095, "learning_rate": 0.001, "loss": 2.6177, "num_input_tokens_seen": 13710126656, "step": 52300 }, { "epoch": 0.24971081722456087, "grad_norm": 0.47803962230682373, "learning_rate": 0.001, "loss": 2.6018, "num_input_tokens_seen": 13723233856, "step": 52350 }, { "epoch": 0.249949318482655, "grad_norm": 0.2346401810646057, "learning_rate": 0.001, "loss": 2.6068, "num_input_tokens_seen": 13736341056, "step": 52400 }, { "epoch": 0.2501878197407491, "grad_norm": 0.21514126658439636, "learning_rate": 0.001, "loss": 2.6186, "num_input_tokens_seen": 13749448256, "step": 52450 }, { "epoch": 0.25042632099884327, "grad_norm": 0.20311090350151062, "learning_rate": 0.001, "loss": 2.595, "num_input_tokens_seen": 13762555456, "step": 52500 }, { "epoch": 0.25042632099884327, "eval_loss": 2.490104913711548, "eval_runtime": 53.8709, "eval_samples_per_second": 92.814, "eval_steps_per_second": 23.204, "num_input_tokens_seen": 13762555456, "step": 52500 }, { "epoch": 0.2506648222569374, "grad_norm": 0.2120152711868286, "learning_rate": 0.001, "loss": 2.6027, "num_input_tokens_seen": 13775662656, "step": 52550 }, { "epoch": 0.25090332351503153, "grad_norm": 0.3172776401042938, "learning_rate": 0.001, "loss": 2.6089, "num_input_tokens_seen": 13788769856, "step": 52600 }, { "epoch": 0.2511418247731257, "grad_norm": 0.24425551295280457, "learning_rate": 0.001, "loss": 2.611, "num_input_tokens_seen": 13801877056, "step": 52650 }, { "epoch": 0.2513803260312198, "grad_norm": 0.24523352086544037, "learning_rate": 0.001, "loss": 2.6066, "num_input_tokens_seen": 13814984256, "step": 52700 }, { "epoch": 0.25161882728931395, "grad_norm": 0.21642154455184937, "learning_rate": 0.001, "loss": 2.6069, "num_input_tokens_seen": 13828091456, "step": 52750 }, { "epoch": 0.2518573285474081, "grad_norm": 0.21867206692695618, "learning_rate": 0.001, "loss": 2.6163, "num_input_tokens_seen": 13841198656, "step": 52800 }, { "epoch": 0.2520958298055022, "grad_norm": 0.2124466449022293, "learning_rate": 0.001, "loss": 2.6045, "num_input_tokens_seen": 13854305856, "step": 52850 }, { "epoch": 0.2523343310635964, "grad_norm": 0.20598042011260986, "learning_rate": 0.001, "loss": 2.5881, "num_input_tokens_seen": 13867413056, "step": 52900 }, { "epoch": 0.2525728323216905, "grad_norm": 0.1949404776096344, "learning_rate": 0.001, "loss": 2.6051, "num_input_tokens_seen": 13880520256, "step": 52950 }, { "epoch": 0.25281133357978464, "grad_norm": 0.18877142667770386, "learning_rate": 0.001, "loss": 2.608, "num_input_tokens_seen": 13893627456, "step": 53000 }, { "epoch": 0.25281133357978464, "eval_loss": 2.485513210296631, "eval_runtime": 53.7202, "eval_samples_per_second": 93.075, "eval_steps_per_second": 23.269, "num_input_tokens_seen": 13893627456, "step": 53000 }, { "epoch": 0.2530498348378788, "grad_norm": 0.20486177504062653, "learning_rate": 0.001, "loss": 2.5977, "num_input_tokens_seen": 13906734656, "step": 53050 }, { "epoch": 0.2532883360959729, "grad_norm": 0.18098385632038116, "learning_rate": 0.001, "loss": 2.5931, "num_input_tokens_seen": 13919841856, "step": 53100 }, { "epoch": 0.25352683735406706, "grad_norm": 0.1933833658695221, "learning_rate": 0.001, "loss": 2.6058, "num_input_tokens_seen": 13932949056, "step": 53150 }, { "epoch": 0.25376533861216116, "grad_norm": 0.29640141129493713, "learning_rate": 0.001, "loss": 2.5864, "num_input_tokens_seen": 13946056256, "step": 53200 }, { "epoch": 0.2540038398702553, "grad_norm": 0.2559553384780884, "learning_rate": 0.001, "loss": 2.6137, "num_input_tokens_seen": 13959163456, "step": 53250 }, { "epoch": 0.2542423411283494, "grad_norm": 0.21698619425296783, "learning_rate": 0.001, "loss": 2.6184, "num_input_tokens_seen": 13972270656, "step": 53300 }, { "epoch": 0.2544808423864436, "grad_norm": 0.19658173620700836, "learning_rate": 0.001, "loss": 2.5938, "num_input_tokens_seen": 13985377856, "step": 53350 }, { "epoch": 0.25471934364453774, "grad_norm": 0.2056342512369156, "learning_rate": 0.001, "loss": 2.5952, "num_input_tokens_seen": 13998485056, "step": 53400 }, { "epoch": 0.25495784490263185, "grad_norm": 0.1932424008846283, "learning_rate": 0.001, "loss": 2.6101, "num_input_tokens_seen": 14011592256, "step": 53450 }, { "epoch": 0.255196346160726, "grad_norm": 0.19347251951694489, "learning_rate": 0.001, "loss": 2.5976, "num_input_tokens_seen": 14024699456, "step": 53500 }, { "epoch": 0.255196346160726, "eval_loss": 2.4863245487213135, "eval_runtime": 53.2426, "eval_samples_per_second": 93.91, "eval_steps_per_second": 23.477, "num_input_tokens_seen": 14024699456, "step": 53500 }, { "epoch": 0.2554348474188201, "grad_norm": 0.1986820101737976, "learning_rate": 0.001, "loss": 2.6066, "num_input_tokens_seen": 14037806656, "step": 53550 }, { "epoch": 0.25567334867691427, "grad_norm": 0.21295565366744995, "learning_rate": 0.001, "loss": 2.6107, "num_input_tokens_seen": 14050913856, "step": 53600 }, { "epoch": 0.25591184993500843, "grad_norm": 0.21585114300251007, "learning_rate": 0.001, "loss": 2.6077, "num_input_tokens_seen": 14064021056, "step": 53650 }, { "epoch": 0.25615035119310253, "grad_norm": 0.19424305856227875, "learning_rate": 0.001, "loss": 2.5931, "num_input_tokens_seen": 14077128256, "step": 53700 }, { "epoch": 0.2563888524511967, "grad_norm": 0.20265349745750427, "learning_rate": 0.001, "loss": 2.5901, "num_input_tokens_seen": 14090235456, "step": 53750 }, { "epoch": 0.2566273537092908, "grad_norm": 1.037636160850525, "learning_rate": 0.001, "loss": 2.5775, "num_input_tokens_seen": 14103342656, "step": 53800 }, { "epoch": 0.25686585496738495, "grad_norm": 0.32030293345451355, "learning_rate": 0.001, "loss": 2.6242, "num_input_tokens_seen": 14116449856, "step": 53850 }, { "epoch": 0.2571043562254791, "grad_norm": 0.2339978665113449, "learning_rate": 0.001, "loss": 2.6122, "num_input_tokens_seen": 14129557056, "step": 53900 }, { "epoch": 0.2573428574835732, "grad_norm": 0.22179783880710602, "learning_rate": 0.001, "loss": 2.6025, "num_input_tokens_seen": 14142664256, "step": 53950 }, { "epoch": 0.2575813587416674, "grad_norm": 0.22616736590862274, "learning_rate": 0.001, "loss": 2.5916, "num_input_tokens_seen": 14155771456, "step": 54000 }, { "epoch": 0.2575813587416674, "eval_loss": 2.4871394634246826, "eval_runtime": 53.8695, "eval_samples_per_second": 92.817, "eval_steps_per_second": 23.204, "num_input_tokens_seen": 14155771456, "step": 54000 }, { "epoch": 0.2578198599997615, "grad_norm": 0.2028844654560089, "learning_rate": 0.001, "loss": 2.6039, "num_input_tokens_seen": 14168878656, "step": 54050 }, { "epoch": 0.25805836125785564, "grad_norm": 0.19936658442020416, "learning_rate": 0.001, "loss": 2.5985, "num_input_tokens_seen": 14181985856, "step": 54100 }, { "epoch": 0.2582968625159498, "grad_norm": 0.2087993025779724, "learning_rate": 0.001, "loss": 2.62, "num_input_tokens_seen": 14195093056, "step": 54150 }, { "epoch": 0.2585353637740439, "grad_norm": 0.18972960114479065, "learning_rate": 0.001, "loss": 2.5936, "num_input_tokens_seen": 14208200256, "step": 54200 }, { "epoch": 0.25877386503213806, "grad_norm": 0.2162945419549942, "learning_rate": 0.001, "loss": 2.6125, "num_input_tokens_seen": 14221307456, "step": 54250 }, { "epoch": 0.25901236629023217, "grad_norm": 0.2538411319255829, "learning_rate": 0.001, "loss": 2.6197, "num_input_tokens_seen": 14234414656, "step": 54300 }, { "epoch": 0.2592508675483263, "grad_norm": 0.28060850501060486, "learning_rate": 0.001, "loss": 2.6194, "num_input_tokens_seen": 14247521856, "step": 54350 }, { "epoch": 0.25948936880642043, "grad_norm": 0.21557608246803284, "learning_rate": 0.001, "loss": 2.623, "num_input_tokens_seen": 14260629056, "step": 54400 }, { "epoch": 0.2597278700645146, "grad_norm": 0.21628426015377045, "learning_rate": 0.001, "loss": 2.6077, "num_input_tokens_seen": 14273736256, "step": 54450 }, { "epoch": 0.25996637132260875, "grad_norm": 0.19123327732086182, "learning_rate": 0.001, "loss": 2.5991, "num_input_tokens_seen": 14286843456, "step": 54500 }, { "epoch": 0.25996637132260875, "eval_loss": 2.4861645698547363, "eval_runtime": 53.6448, "eval_samples_per_second": 93.206, "eval_steps_per_second": 23.301, "num_input_tokens_seen": 14286843456, "step": 54500 }, { "epoch": 0.26020487258070285, "grad_norm": 0.20462968945503235, "learning_rate": 0.001, "loss": 2.5887, "num_input_tokens_seen": 14299950656, "step": 54550 }, { "epoch": 0.260443373838797, "grad_norm": 0.20952938497066498, "learning_rate": 0.001, "loss": 2.608, "num_input_tokens_seen": 14313057856, "step": 54600 }, { "epoch": 0.2606818750968911, "grad_norm": 0.2095402032136917, "learning_rate": 0.001, "loss": 2.6079, "num_input_tokens_seen": 14326165056, "step": 54650 }, { "epoch": 0.2609203763549853, "grad_norm": 0.2343517541885376, "learning_rate": 0.001, "loss": 2.6124, "num_input_tokens_seen": 14339272256, "step": 54700 }, { "epoch": 0.26115887761307943, "grad_norm": 0.23840700089931488, "learning_rate": 0.001, "loss": 2.6015, "num_input_tokens_seen": 14352379456, "step": 54750 }, { "epoch": 0.26139737887117354, "grad_norm": 0.22024671733379364, "learning_rate": 0.001, "loss": 2.5812, "num_input_tokens_seen": 14365486656, "step": 54800 }, { "epoch": 0.2616358801292677, "grad_norm": 0.19884246587753296, "learning_rate": 0.001, "loss": 2.6118, "num_input_tokens_seen": 14378593856, "step": 54850 }, { "epoch": 0.2618743813873618, "grad_norm": 0.46560585498809814, "learning_rate": 0.001, "loss": 2.6024, "num_input_tokens_seen": 14391701056, "step": 54900 }, { "epoch": 0.26211288264545596, "grad_norm": 0.2956256568431854, "learning_rate": 0.001, "loss": 2.6073, "num_input_tokens_seen": 14404808256, "step": 54950 }, { "epoch": 0.2623513839035501, "grad_norm": 0.286327064037323, "learning_rate": 0.001, "loss": 2.5946, "num_input_tokens_seen": 14417915456, "step": 55000 }, { "epoch": 0.2623513839035501, "eval_loss": 2.4892399311065674, "eval_runtime": 53.3184, "eval_samples_per_second": 93.776, "eval_steps_per_second": 23.444, "num_input_tokens_seen": 14417915456, "step": 55000 }, { "epoch": 0.2625898851616442, "grad_norm": 0.22046101093292236, "learning_rate": 0.001, "loss": 2.6077, "num_input_tokens_seen": 14431022656, "step": 55050 }, { "epoch": 0.2628283864197384, "grad_norm": 0.4682837724685669, "learning_rate": 0.001, "loss": 2.6065, "num_input_tokens_seen": 14444129856, "step": 55100 }, { "epoch": 0.2630668876778325, "grad_norm": 0.21442484855651855, "learning_rate": 0.001, "loss": 2.6079, "num_input_tokens_seen": 14457237056, "step": 55150 }, { "epoch": 0.26330538893592664, "grad_norm": 0.2513403296470642, "learning_rate": 0.001, "loss": 2.6037, "num_input_tokens_seen": 14470344256, "step": 55200 }, { "epoch": 0.26354389019402075, "grad_norm": 0.21526487171649933, "learning_rate": 0.001, "loss": 2.6049, "num_input_tokens_seen": 14483451456, "step": 55250 }, { "epoch": 0.2637823914521149, "grad_norm": 0.22567112743854523, "learning_rate": 0.001, "loss": 2.5953, "num_input_tokens_seen": 14496558656, "step": 55300 }, { "epoch": 0.26402089271020907, "grad_norm": 0.20226064324378967, "learning_rate": 0.001, "loss": 2.609, "num_input_tokens_seen": 14509665856, "step": 55350 }, { "epoch": 0.26425939396830317, "grad_norm": 0.31736019253730774, "learning_rate": 0.001, "loss": 2.6174, "num_input_tokens_seen": 14522773056, "step": 55400 }, { "epoch": 0.26449789522639733, "grad_norm": 0.2573414146900177, "learning_rate": 0.001, "loss": 2.612, "num_input_tokens_seen": 14535880256, "step": 55450 }, { "epoch": 0.26473639648449143, "grad_norm": 0.278160959482193, "learning_rate": 0.001, "loss": 2.6713, "num_input_tokens_seen": 14548987456, "step": 55500 }, { "epoch": 0.26473639648449143, "eval_loss": 2.5104730129241943, "eval_runtime": 54.2403, "eval_samples_per_second": 92.182, "eval_steps_per_second": 23.046, "num_input_tokens_seen": 14548987456, "step": 55500 }, { "epoch": 0.2649748977425856, "grad_norm": 0.25843819975852966, "learning_rate": 0.001, "loss": 2.6223, "num_input_tokens_seen": 14562094656, "step": 55550 }, { "epoch": 0.26521339900067975, "grad_norm": 0.42813193798065186, "learning_rate": 0.001, "loss": 2.6114, "num_input_tokens_seen": 14575201856, "step": 55600 }, { "epoch": 0.26545190025877385, "grad_norm": 0.23324181139469147, "learning_rate": 0.001, "loss": 2.6149, "num_input_tokens_seen": 14588309056, "step": 55650 }, { "epoch": 0.265690401516868, "grad_norm": 0.2795487940311432, "learning_rate": 0.001, "loss": 2.6067, "num_input_tokens_seen": 14601416256, "step": 55700 }, { "epoch": 0.2659289027749621, "grad_norm": 0.6856834888458252, "learning_rate": 0.001, "loss": 2.6135, "num_input_tokens_seen": 14614523456, "step": 55750 }, { "epoch": 0.2661674040330563, "grad_norm": 0.348906934261322, "learning_rate": 0.001, "loss": 2.6384, "num_input_tokens_seen": 14627630656, "step": 55800 }, { "epoch": 0.26640590529115044, "grad_norm": 0.2510247528553009, "learning_rate": 0.001, "loss": 2.6224, "num_input_tokens_seen": 14640737856, "step": 55850 }, { "epoch": 0.26664440654924454, "grad_norm": 0.34429189562797546, "learning_rate": 0.001, "loss": 2.6139, "num_input_tokens_seen": 14653845056, "step": 55900 }, { "epoch": 0.2668829078073387, "grad_norm": 0.25697243213653564, "learning_rate": 0.001, "loss": 2.6143, "num_input_tokens_seen": 14666952256, "step": 55950 }, { "epoch": 0.2671214090654328, "grad_norm": 0.2812611758708954, "learning_rate": 0.001, "loss": 2.6172, "num_input_tokens_seen": 14680059456, "step": 56000 }, { "epoch": 0.2671214090654328, "eval_loss": 2.492490291595459, "eval_runtime": 53.3814, "eval_samples_per_second": 93.666, "eval_steps_per_second": 23.416, "num_input_tokens_seen": 14680059456, "step": 56000 }, { "epoch": 0.26735991032352696, "grad_norm": 0.22615984082221985, "learning_rate": 0.0009999685283773503, "loss": 2.5961, "num_input_tokens_seen": 14693166656, "step": 56050 }, { "epoch": 0.2675984115816211, "grad_norm": 0.2738794982433319, "learning_rate": 0.0009998741174712534, "loss": 2.612, "num_input_tokens_seen": 14706273856, "step": 56100 }, { "epoch": 0.2678369128397152, "grad_norm": 0.23470066487789154, "learning_rate": 0.0009997167791667668, "loss": 2.6071, "num_input_tokens_seen": 14719381056, "step": 56150 }, { "epoch": 0.2680754140978094, "grad_norm": 0.23558543622493744, "learning_rate": 0.0009994965332706573, "loss": 2.5956, "num_input_tokens_seen": 14732488256, "step": 56200 }, { "epoch": 0.2683139153559035, "grad_norm": 0.2274416983127594, "learning_rate": 0.0009992134075089082, "loss": 2.5873, "num_input_tokens_seen": 14745595456, "step": 56250 }, { "epoch": 0.26855241661399765, "grad_norm": 0.21609161794185638, "learning_rate": 0.000998867437523228, "loss": 2.6043, "num_input_tokens_seen": 14758702656, "step": 56300 }, { "epoch": 0.26879091787209175, "grad_norm": 0.2368565797805786, "learning_rate": 0.000998458666866564, "loss": 2.5952, "num_input_tokens_seen": 14771809856, "step": 56350 }, { "epoch": 0.2690294191301859, "grad_norm": 0.22180891036987305, "learning_rate": 0.0009979871469976197, "loss": 2.5934, "num_input_tokens_seen": 14784917056, "step": 56400 }, { "epoch": 0.26926792038828007, "grad_norm": 0.3060019910335541, "learning_rate": 0.0009974529372743762, "loss": 2.6224, "num_input_tokens_seen": 14798024256, "step": 56450 }, { "epoch": 0.2695064216463742, "grad_norm": 0.2387322634458542, "learning_rate": 0.0009968561049466214, "loss": 2.5905, "num_input_tokens_seen": 14811131456, "step": 56500 }, { "epoch": 0.2695064216463742, "eval_loss": 2.4835996627807617, "eval_runtime": 53.8478, "eval_samples_per_second": 92.854, "eval_steps_per_second": 23.214, "num_input_tokens_seen": 14811131456, "step": 56500 }, { "epoch": 0.26974492290446833, "grad_norm": 0.22091372311115265, "learning_rate": 0.0009961967251474822, "loss": 2.6139, "num_input_tokens_seen": 14824238656, "step": 56550 }, { "epoch": 0.26998342416256244, "grad_norm": 0.2304680198431015, "learning_rate": 0.0009954748808839674, "loss": 2.6167, "num_input_tokens_seen": 14837345856, "step": 56600 }, { "epoch": 0.2702219254206566, "grad_norm": 0.19777421653270721, "learning_rate": 0.0009946906630265184, "loss": 2.6082, "num_input_tokens_seen": 14850453056, "step": 56650 }, { "epoch": 0.27046042667875075, "grad_norm": 0.2113979458808899, "learning_rate": 0.0009938441702975688, "loss": 2.5981, "num_input_tokens_seen": 14863560256, "step": 56700 }, { "epoch": 0.27069892793684486, "grad_norm": 0.19911637902259827, "learning_rate": 0.0009929355092591179, "loss": 2.5904, "num_input_tokens_seen": 14876667456, "step": 56750 }, { "epoch": 0.270937429194939, "grad_norm": 0.20081694424152374, "learning_rate": 0.0009919647942993148, "loss": 2.6012, "num_input_tokens_seen": 14889774656, "step": 56800 }, { "epoch": 0.2711759304530331, "grad_norm": 0.22752800583839417, "learning_rate": 0.0009909321476180592, "loss": 2.6017, "num_input_tokens_seen": 14902881856, "step": 56850 }, { "epoch": 0.2714144317111273, "grad_norm": 0.23174402117729187, "learning_rate": 0.0009898376992116178, "loss": 2.6012, "num_input_tokens_seen": 14915989056, "step": 56900 }, { "epoch": 0.27165293296922144, "grad_norm": 0.22149533033370972, "learning_rate": 0.0009886815868562597, "loss": 2.5881, "num_input_tokens_seen": 14929096256, "step": 56950 }, { "epoch": 0.27189143422731554, "grad_norm": 0.22576771676540375, "learning_rate": 0.0009874639560909118, "loss": 2.6021, "num_input_tokens_seen": 14942203456, "step": 57000 }, { "epoch": 0.27189143422731554, "eval_loss": 2.482896566390991, "eval_runtime": 53.3773, "eval_samples_per_second": 93.673, "eval_steps_per_second": 23.418, "num_input_tokens_seen": 14942203456, "step": 57000 }, { "epoch": 0.2721299354854097, "grad_norm": 0.22044019401073456, "learning_rate": 0.0009861849601988384, "loss": 2.6119, "num_input_tokens_seen": 14955310656, "step": 57050 }, { "epoch": 0.2723684367435038, "grad_norm": 0.2155238389968872, "learning_rate": 0.0009848447601883434, "loss": 2.5869, "num_input_tokens_seen": 14968417856, "step": 57100 }, { "epoch": 0.27260693800159796, "grad_norm": 0.21131549775600433, "learning_rate": 0.0009834435247725033, "loss": 2.5988, "num_input_tokens_seen": 14981525056, "step": 57150 }, { "epoch": 0.27284543925969207, "grad_norm": 0.21247337758541107, "learning_rate": 0.0009819814303479266, "loss": 2.6198, "num_input_tokens_seen": 14994632256, "step": 57200 }, { "epoch": 0.27308394051778623, "grad_norm": 0.21916711330413818, "learning_rate": 0.00098045866097255, "loss": 2.6019, "num_input_tokens_seen": 15007739456, "step": 57250 }, { "epoch": 0.2733224417758804, "grad_norm": 0.1925441473722458, "learning_rate": 0.0009788754083424652, "loss": 2.6143, "num_input_tokens_seen": 15020846656, "step": 57300 }, { "epoch": 0.2735609430339745, "grad_norm": 0.38578665256500244, "learning_rate": 0.0009772318717677904, "loss": 2.6037, "num_input_tokens_seen": 15033953856, "step": 57350 }, { "epoch": 0.27379944429206865, "grad_norm": 0.19650611281394958, "learning_rate": 0.0009755282581475768, "loss": 2.5745, "num_input_tokens_seen": 15047061056, "step": 57400 }, { "epoch": 0.27403794555016275, "grad_norm": 0.2376088798046112, "learning_rate": 0.0009737647819437645, "loss": 2.5968, "num_input_tokens_seen": 15060168256, "step": 57450 }, { "epoch": 0.2742764468082569, "grad_norm": 0.21746863424777985, "learning_rate": 0.0009719416651541838, "loss": 2.5965, "num_input_tokens_seen": 15073275456, "step": 57500 }, { "epoch": 0.2742764468082569, "eval_loss": 2.483751058578491, "eval_runtime": 53.9622, "eval_samples_per_second": 92.657, "eval_steps_per_second": 23.164, "num_input_tokens_seen": 15073275456, "step": 57500 }, { "epoch": 0.27451494806635107, "grad_norm": 0.2898815870285034, "learning_rate": 0.0009700591372846095, "loss": 2.6105, "num_input_tokens_seen": 15086382656, "step": 57550 }, { "epoch": 0.2747534493244452, "grad_norm": 0.24887384474277496, "learning_rate": 0.0009681174353198686, "loss": 2.6103, "num_input_tokens_seen": 15099489856, "step": 57600 }, { "epoch": 0.27499195058253934, "grad_norm": 0.26613715291023254, "learning_rate": 0.0009661168036940071, "loss": 2.6296, "num_input_tokens_seen": 15112597056, "step": 57650 }, { "epoch": 0.27523045184063344, "grad_norm": 0.23983849585056305, "learning_rate": 0.0009640574942595195, "loss": 2.6008, "num_input_tokens_seen": 15125704256, "step": 57700 }, { "epoch": 0.2754689530987276, "grad_norm": 0.23169022798538208, "learning_rate": 0.0009619397662556434, "loss": 2.596, "num_input_tokens_seen": 15138811456, "step": 57750 }, { "epoch": 0.27570745435682176, "grad_norm": 0.21353812515735626, "learning_rate": 0.0009597638862757254, "loss": 2.6039, "num_input_tokens_seen": 15151918656, "step": 57800 }, { "epoch": 0.27594595561491586, "grad_norm": 0.2561227083206177, "learning_rate": 0.00095753012823366, "loss": 2.6046, "num_input_tokens_seen": 15165025856, "step": 57850 }, { "epoch": 0.27618445687301, "grad_norm": 0.20380394160747528, "learning_rate": 0.000955238773329408, "loss": 2.5968, "num_input_tokens_seen": 15178133056, "step": 57900 }, { "epoch": 0.2764229581311041, "grad_norm": 0.26447024941444397, "learning_rate": 0.000952890110013597, "loss": 2.5848, "num_input_tokens_seen": 15191240256, "step": 57950 }, { "epoch": 0.2766614593891983, "grad_norm": 0.23530781269073486, "learning_rate": 0.0009504844339512095, "loss": 2.582, "num_input_tokens_seen": 15204347456, "step": 58000 }, { "epoch": 0.2766614593891983, "eval_loss": 2.482050895690918, "eval_runtime": 53.5775, "eval_samples_per_second": 93.323, "eval_steps_per_second": 23.331, "num_input_tokens_seen": 15204347456, "step": 58000 }, { "epoch": 0.2768999606472924, "grad_norm": 0.2281644344329834, "learning_rate": 0.0009480220479843627, "loss": 2.6212, "num_input_tokens_seen": 15217454656, "step": 58050 }, { "epoch": 0.27713846190538655, "grad_norm": 0.2181713730096817, "learning_rate": 0.0009455032620941839, "loss": 2.5927, "num_input_tokens_seen": 15230561856, "step": 58100 }, { "epoch": 0.2773769631634807, "grad_norm": 0.21573083102703094, "learning_rate": 0.00094292839336179, "loss": 2.6112, "num_input_tokens_seen": 15243669056, "step": 58150 }, { "epoch": 0.2776154644215748, "grad_norm": 0.2686486840248108, "learning_rate": 0.000940297765928369, "loss": 2.6133, "num_input_tokens_seen": 15256776256, "step": 58200 }, { "epoch": 0.27785396567966897, "grad_norm": 0.2320137470960617, "learning_rate": 0.0009376117109543769, "loss": 2.6094, "num_input_tokens_seen": 15269883456, "step": 58250 }, { "epoch": 0.27809246693776307, "grad_norm": 0.22277672588825226, "learning_rate": 0.0009348705665778478, "loss": 2.5885, "num_input_tokens_seen": 15282990656, "step": 58300 }, { "epoch": 0.27833096819585723, "grad_norm": 0.22681231796741486, "learning_rate": 0.0009320746778718274, "loss": 2.6005, "num_input_tokens_seen": 15296097856, "step": 58350 }, { "epoch": 0.2785694694539514, "grad_norm": 0.25187453627586365, "learning_rate": 0.000929224396800933, "loss": 2.5944, "num_input_tokens_seen": 15309205056, "step": 58400 }, { "epoch": 0.2788079707120455, "grad_norm": 0.24962358176708221, "learning_rate": 0.0009263200821770461, "loss": 2.5888, "num_input_tokens_seen": 15322312256, "step": 58450 }, { "epoch": 0.27904647197013965, "grad_norm": 0.18929679691791534, "learning_rate": 0.0009233620996141421, "loss": 2.5927, "num_input_tokens_seen": 15335419456, "step": 58500 }, { "epoch": 0.27904647197013965, "eval_loss": 2.4754066467285156, "eval_runtime": 53.7558, "eval_samples_per_second": 93.013, "eval_steps_per_second": 23.253, "num_input_tokens_seen": 15335419456, "step": 58500 }, { "epoch": 0.27928497322823376, "grad_norm": 0.22240912914276123, "learning_rate": 0.0009203508214822651, "loss": 2.5944, "num_input_tokens_seen": 15348526656, "step": 58550 }, { "epoch": 0.2795234744863279, "grad_norm": 0.2096235305070877, "learning_rate": 0.0009172866268606513, "loss": 2.5964, "num_input_tokens_seen": 15361633856, "step": 58600 }, { "epoch": 0.2797619757444221, "grad_norm": 0.2913396954536438, "learning_rate": 0.0009141699014900082, "loss": 2.5975, "num_input_tokens_seen": 15374741056, "step": 58650 }, { "epoch": 0.2800004770025162, "grad_norm": 0.21000444889068604, "learning_rate": 0.0009110010377239551, "loss": 2.5987, "num_input_tokens_seen": 15387848256, "step": 58700 }, { "epoch": 0.28023897826061034, "grad_norm": 0.18561489880084991, "learning_rate": 0.0009077804344796301, "loss": 2.5955, "num_input_tokens_seen": 15400955456, "step": 58750 }, { "epoch": 0.28047747951870444, "grad_norm": 0.330816388130188, "learning_rate": 0.0009045084971874737, "loss": 2.5837, "num_input_tokens_seen": 15414062656, "step": 58800 }, { "epoch": 0.2807159807767986, "grad_norm": 0.21823953092098236, "learning_rate": 0.000901185637740189, "loss": 2.5921, "num_input_tokens_seen": 15427169856, "step": 58850 }, { "epoch": 0.28095448203489276, "grad_norm": 0.28721505403518677, "learning_rate": 0.0008978122744408905, "loss": 2.5893, "num_input_tokens_seen": 15440277056, "step": 58900 }, { "epoch": 0.28119298329298686, "grad_norm": 0.2468225359916687, "learning_rate": 0.0008943888319504456, "loss": 2.5999, "num_input_tokens_seen": 15453384256, "step": 58950 }, { "epoch": 0.281431484551081, "grad_norm": 0.20486761629581451, "learning_rate": 0.000890915741234015, "loss": 2.6026, "num_input_tokens_seen": 15466491456, "step": 59000 }, { "epoch": 0.281431484551081, "eval_loss": 2.4756667613983154, "eval_runtime": 53.3408, "eval_samples_per_second": 93.737, "eval_steps_per_second": 23.434, "num_input_tokens_seen": 15466491456, "step": 59000 }, { "epoch": 0.2816699858091751, "grad_norm": 0.3338637351989746, "learning_rate": 0.0008873934395068005, "loss": 2.587, "num_input_tokens_seen": 15479598656, "step": 59050 }, { "epoch": 0.2819084870672693, "grad_norm": 0.20848780870437622, "learning_rate": 0.0008838223701790055, "loss": 2.5989, "num_input_tokens_seen": 15492705856, "step": 59100 }, { "epoch": 0.2821469883253634, "grad_norm": 0.21479378640651703, "learning_rate": 0.0008802029828000156, "loss": 2.6052, "num_input_tokens_seen": 15505813056, "step": 59150 }, { "epoch": 0.28238548958345755, "grad_norm": 0.1944151073694229, "learning_rate": 0.0008765357330018055, "loss": 2.6044, "num_input_tokens_seen": 15518920256, "step": 59200 }, { "epoch": 0.2826239908415517, "grad_norm": 0.2078033685684204, "learning_rate": 0.0008728210824415827, "loss": 2.5929, "num_input_tokens_seen": 15532027456, "step": 59250 }, { "epoch": 0.2828624920996458, "grad_norm": 0.19340284168720245, "learning_rate": 0.0008690594987436704, "loss": 2.5875, "num_input_tokens_seen": 15545134656, "step": 59300 }, { "epoch": 0.28310099335773997, "grad_norm": 0.22354012727737427, "learning_rate": 0.0008652514554406388, "loss": 2.5976, "num_input_tokens_seen": 15558241856, "step": 59350 }, { "epoch": 0.2833394946158341, "grad_norm": 0.26784005761146545, "learning_rate": 0.0008613974319136957, "loss": 2.5868, "num_input_tokens_seen": 15571349056, "step": 59400 }, { "epoch": 0.28357799587392823, "grad_norm": 0.20749828219413757, "learning_rate": 0.0008574979133323377, "loss": 2.5784, "num_input_tokens_seen": 15584456256, "step": 59450 }, { "epoch": 0.2838164971320224, "grad_norm": 0.21545729041099548, "learning_rate": 0.0008535533905932737, "loss": 2.5939, "num_input_tokens_seen": 15597563456, "step": 59500 }, { "epoch": 0.2838164971320224, "eval_loss": 2.469989538192749, "eval_runtime": 54.0784, "eval_samples_per_second": 92.458, "eval_steps_per_second": 23.115, "num_input_tokens_seen": 15597563456, "step": 59500 }, { "epoch": 0.2840549983901165, "grad_norm": 0.20836423337459564, "learning_rate": 0.0008495643602586287, "loss": 2.5858, "num_input_tokens_seen": 15610670656, "step": 59550 }, { "epoch": 0.28429349964821066, "grad_norm": 0.20427604019641876, "learning_rate": 0.0008455313244934324, "loss": 2.5781, "num_input_tokens_seen": 15623777856, "step": 59600 }, { "epoch": 0.28453200090630476, "grad_norm": 0.2341683804988861, "learning_rate": 0.0008414547910024035, "loss": 2.5713, "num_input_tokens_seen": 15636885056, "step": 59650 }, { "epoch": 0.2847705021643989, "grad_norm": 0.20808522403240204, "learning_rate": 0.0008373352729660373, "loss": 2.5751, "num_input_tokens_seen": 15649992256, "step": 59700 }, { "epoch": 0.2850090034224931, "grad_norm": 0.21032562851905823, "learning_rate": 0.000833173288976002, "loss": 2.5784, "num_input_tokens_seen": 15663099456, "step": 59750 }, { "epoch": 0.2852475046805872, "grad_norm": 0.23485584557056427, "learning_rate": 0.0008289693629698564, "loss": 2.5974, "num_input_tokens_seen": 15676206656, "step": 59800 }, { "epoch": 0.28548600593868134, "grad_norm": 0.2229880541563034, "learning_rate": 0.0008247240241650918, "loss": 2.5834, "num_input_tokens_seen": 15689313856, "step": 59850 }, { "epoch": 0.28572450719677545, "grad_norm": 0.21837118268013, "learning_rate": 0.000820437806992512, "loss": 2.5734, "num_input_tokens_seen": 15702421056, "step": 59900 }, { "epoch": 0.2859630084548696, "grad_norm": 0.2157929688692093, "learning_rate": 0.0008161112510289549, "loss": 2.587, "num_input_tokens_seen": 15715528256, "step": 59950 }, { "epoch": 0.2862015097129637, "grad_norm": 0.24053893983364105, "learning_rate": 0.0008117449009293668, "loss": 2.5853, "num_input_tokens_seen": 15728635456, "step": 60000 }, { "epoch": 0.2862015097129637, "eval_loss": 2.470459461212158, "eval_runtime": 53.5859, "eval_samples_per_second": 93.308, "eval_steps_per_second": 23.327, "num_input_tokens_seen": 15728635456, "step": 60000 }, { "epoch": 0.28644001097105787, "grad_norm": 0.25951045751571655, "learning_rate": 0.0008073393063582386, "loss": 2.5946, "num_input_tokens_seen": 15741742656, "step": 60050 }, { "epoch": 0.286678512229152, "grad_norm": 0.22712726891040802, "learning_rate": 0.00080289502192041, "loss": 2.5882, "num_input_tokens_seen": 15754849856, "step": 60100 }, { "epoch": 0.28691701348724613, "grad_norm": 0.2236946076154709, "learning_rate": 0.0007984126070912518, "loss": 2.5854, "num_input_tokens_seen": 15767957056, "step": 60150 }, { "epoch": 0.2871555147453403, "grad_norm": 0.3175867795944214, "learning_rate": 0.0007938926261462366, "loss": 2.5855, "num_input_tokens_seen": 15781064256, "step": 60200 }, { "epoch": 0.2873940160034344, "grad_norm": 0.22954128682613373, "learning_rate": 0.000789335648089903, "loss": 2.595, "num_input_tokens_seen": 15794171456, "step": 60250 }, { "epoch": 0.28763251726152855, "grad_norm": 0.23379147052764893, "learning_rate": 0.000784742246584226, "loss": 2.5872, "num_input_tokens_seen": 15807278656, "step": 60300 }, { "epoch": 0.2878710185196227, "grad_norm": 0.22107115387916565, "learning_rate": 0.0007801129998764014, "loss": 2.5704, "num_input_tokens_seen": 15820385856, "step": 60350 }, { "epoch": 0.2881095197777168, "grad_norm": 0.21197494864463806, "learning_rate": 0.0007754484907260512, "loss": 2.5751, "num_input_tokens_seen": 15833493056, "step": 60400 }, { "epoch": 0.288348021035811, "grad_norm": 0.21372662484645844, "learning_rate": 0.0007707493063318629, "loss": 2.5901, "num_input_tokens_seen": 15846600256, "step": 60450 }, { "epoch": 0.2885865222939051, "grad_norm": 0.23300603032112122, "learning_rate": 0.0007660160382576683, "loss": 2.5888, "num_input_tokens_seen": 15859707456, "step": 60500 }, { "epoch": 0.2885865222939051, "eval_loss": 2.463745355606079, "eval_runtime": 53.032, "eval_samples_per_second": 94.283, "eval_steps_per_second": 23.571, "num_input_tokens_seen": 15859707456, "step": 60500 }, { "epoch": 0.28882502355199924, "grad_norm": 0.2108684778213501, "learning_rate": 0.0007612492823579744, "loss": 2.5965, "num_input_tokens_seen": 15872814656, "step": 60550 }, { "epoch": 0.2890635248100934, "grad_norm": 0.20625820755958557, "learning_rate": 0.0007564496387029531, "loss": 2.5615, "num_input_tokens_seen": 15885921856, "step": 60600 }, { "epoch": 0.2893020260681875, "grad_norm": 0.22595694661140442, "learning_rate": 0.0007516177115029001, "loss": 2.5871, "num_input_tokens_seen": 15899029056, "step": 60650 }, { "epoch": 0.28954052732628166, "grad_norm": 0.2095574140548706, "learning_rate": 0.0007467541090321735, "loss": 2.5867, "num_input_tokens_seen": 15912136256, "step": 60700 }, { "epoch": 0.28977902858437576, "grad_norm": 0.1979990303516388, "learning_rate": 0.00074185944355262, "loss": 2.586, "num_input_tokens_seen": 15925243456, "step": 60750 }, { "epoch": 0.2900175298424699, "grad_norm": 0.3573000431060791, "learning_rate": 0.0007369343312364993, "loss": 2.5807, "num_input_tokens_seen": 15938350656, "step": 60800 }, { "epoch": 0.2902560311005641, "grad_norm": 0.2209523618221283, "learning_rate": 0.0007319793920889171, "loss": 2.5867, "num_input_tokens_seen": 15951457856, "step": 60850 }, { "epoch": 0.2904945323586582, "grad_norm": 0.1979866325855255, "learning_rate": 0.0007269952498697733, "loss": 2.5679, "num_input_tokens_seen": 15964565056, "step": 60900 }, { "epoch": 0.29073303361675235, "grad_norm": 0.2013344019651413, "learning_rate": 0.0007219825320152411, "loss": 2.5842, "num_input_tokens_seen": 15977672256, "step": 60950 }, { "epoch": 0.29097153487484645, "grad_norm": 0.20511233806610107, "learning_rate": 0.0007169418695587791, "loss": 2.5864, "num_input_tokens_seen": 15990779456, "step": 61000 }, { "epoch": 0.29097153487484645, "eval_loss": 2.4598097801208496, "eval_runtime": 53.5493, "eval_samples_per_second": 93.372, "eval_steps_per_second": 23.343, "num_input_tokens_seen": 15990779456, "step": 61000 }, { "epoch": 0.2912100361329406, "grad_norm": 0.19767510890960693, "learning_rate": 0.0007118738970516943, "loss": 2.5963, "num_input_tokens_seen": 16003886656, "step": 61050 }, { "epoch": 0.2914485373910347, "grad_norm": 0.21463529765605927, "learning_rate": 0.0007067792524832604, "loss": 2.5825, "num_input_tokens_seen": 16016993856, "step": 61100 }, { "epoch": 0.29168703864912887, "grad_norm": 0.2011532485485077, "learning_rate": 0.0007016585772004026, "loss": 2.5783, "num_input_tokens_seen": 16030101056, "step": 61150 }, { "epoch": 0.29192553990722303, "grad_norm": 0.19351401925086975, "learning_rate": 0.0006965125158269618, "loss": 2.5619, "num_input_tokens_seen": 16043208256, "step": 61200 }, { "epoch": 0.29216404116531713, "grad_norm": 0.1988568902015686, "learning_rate": 0.000691341716182545, "loss": 2.6007, "num_input_tokens_seen": 16056315456, "step": 61250 }, { "epoch": 0.2924025424234113, "grad_norm": 0.20459413528442383, "learning_rate": 0.0006861468292009726, "loss": 2.5762, "num_input_tokens_seen": 16069422656, "step": 61300 }, { "epoch": 0.2926410436815054, "grad_norm": 0.1914205551147461, "learning_rate": 0.0006809285088483361, "loss": 2.5734, "num_input_tokens_seen": 16082529856, "step": 61350 }, { "epoch": 0.29287954493959956, "grad_norm": 0.194325253367424, "learning_rate": 0.0006756874120406714, "loss": 2.5874, "num_input_tokens_seen": 16095637056, "step": 61400 }, { "epoch": 0.2931180461976937, "grad_norm": 0.20854853093624115, "learning_rate": 0.0006704241985612625, "loss": 2.5865, "num_input_tokens_seen": 16108744256, "step": 61450 }, { "epoch": 0.2933565474557878, "grad_norm": 0.190395787358284, "learning_rate": 0.0006651395309775837, "loss": 2.5716, "num_input_tokens_seen": 16121851456, "step": 61500 }, { "epoch": 0.2933565474557878, "eval_loss": 2.4551966190338135, "eval_runtime": 53.3343, "eval_samples_per_second": 93.748, "eval_steps_per_second": 23.437, "num_input_tokens_seen": 16121851456, "step": 61500 }, { "epoch": 0.293595048713882, "grad_norm": 0.20652073621749878, "learning_rate": 0.0006598340745578908, "loss": 2.5765, "num_input_tokens_seen": 16134958656, "step": 61550 }, { "epoch": 0.2938335499719761, "grad_norm": 0.20701836049556732, "learning_rate": 0.0006545084971874737, "loss": 2.5653, "num_input_tokens_seen": 16148065856, "step": 61600 }, { "epoch": 0.29407205123007024, "grad_norm": 0.1792392134666443, "learning_rate": 0.000649163469284578, "loss": 2.577, "num_input_tokens_seen": 16161173056, "step": 61650 }, { "epoch": 0.2943105524881644, "grad_norm": 0.21742790937423706, "learning_rate": 0.0006437996637160086, "loss": 2.574, "num_input_tokens_seen": 16174280256, "step": 61700 }, { "epoch": 0.2945490537462585, "grad_norm": 0.20747682452201843, "learning_rate": 0.0006384177557124247, "loss": 2.564, "num_input_tokens_seen": 16187387456, "step": 61750 }, { "epoch": 0.29478755500435266, "grad_norm": 0.19990311563014984, "learning_rate": 0.0006330184227833376, "loss": 2.5866, "num_input_tokens_seen": 16200494656, "step": 61800 }, { "epoch": 0.29502605626244677, "grad_norm": 0.20410317182540894, "learning_rate": 0.0006276023446318213, "loss": 2.5559, "num_input_tokens_seen": 16213601856, "step": 61850 }, { "epoch": 0.2952645575205409, "grad_norm": 0.19365034997463226, "learning_rate": 0.000622170203068947, "loss": 2.5705, "num_input_tokens_seen": 16226709056, "step": 61900 }, { "epoch": 0.29550305877863503, "grad_norm": 0.2115161269903183, "learning_rate": 0.0006167226819279528, "loss": 2.5621, "num_input_tokens_seen": 16239816256, "step": 61950 }, { "epoch": 0.2957415600367292, "grad_norm": 0.22992485761642456, "learning_rate": 0.0006112604669781572, "loss": 2.5587, "num_input_tokens_seen": 16252923456, "step": 62000 }, { "epoch": 0.2957415600367292, "eval_loss": 2.452096462249756, "eval_runtime": 53.6354, "eval_samples_per_second": 93.222, "eval_steps_per_second": 23.306, "num_input_tokens_seen": 16252923456, "step": 62000 }, { "epoch": 0.29598006129482335, "grad_norm": 0.1945638656616211, "learning_rate": 0.0006057842458386314, "loss": 2.5582, "num_input_tokens_seen": 16266030656, "step": 62050 }, { "epoch": 0.29621856255291745, "grad_norm": 0.201882466673851, "learning_rate": 0.0006002947078916364, "loss": 2.5764, "num_input_tokens_seen": 16279137856, "step": 62100 }, { "epoch": 0.2964570638110116, "grad_norm": 0.2137998789548874, "learning_rate": 0.0005947925441958392, "loss": 2.5689, "num_input_tokens_seen": 16292245056, "step": 62150 }, { "epoch": 0.2966955650691057, "grad_norm": 0.18265672028064728, "learning_rate": 0.0005892784473993184, "loss": 2.5741, "num_input_tokens_seen": 16305352256, "step": 62200 }, { "epoch": 0.2969340663271999, "grad_norm": 0.16944251954555511, "learning_rate": 0.0005837531116523682, "loss": 2.5537, "num_input_tokens_seen": 16318459456, "step": 62250 }, { "epoch": 0.29717256758529403, "grad_norm": 0.20273485779762268, "learning_rate": 0.0005782172325201155, "loss": 2.5512, "num_input_tokens_seen": 16331566656, "step": 62300 }, { "epoch": 0.29741106884338814, "grad_norm": 0.19320476055145264, "learning_rate": 0.0005726715068949564, "loss": 2.5823, "num_input_tokens_seen": 16344673856, "step": 62350 }, { "epoch": 0.2976495701014823, "grad_norm": 0.21321871876716614, "learning_rate": 0.0005671166329088278, "loss": 2.5608, "num_input_tokens_seen": 16357781056, "step": 62400 }, { "epoch": 0.2978880713595764, "grad_norm": 0.2007117122411728, "learning_rate": 0.0005615533098453215, "loss": 2.5685, "num_input_tokens_seen": 16370888256, "step": 62450 }, { "epoch": 0.29812657261767056, "grad_norm": 0.1896267682313919, "learning_rate": 0.0005559822380516539, "loss": 2.56, "num_input_tokens_seen": 16383995456, "step": 62500 }, { "epoch": 0.29812657261767056, "eval_loss": 2.448042154312134, "eval_runtime": 54.1994, "eval_samples_per_second": 92.252, "eval_steps_per_second": 23.063, "num_input_tokens_seen": 16383995456, "step": 62500 }, { "epoch": 0.2983650738757647, "grad_norm": 0.18581034243106842, "learning_rate": 0.0005504041188505022, "loss": 2.5691, "num_input_tokens_seen": 16397102656, "step": 62550 }, { "epoch": 0.2986035751338588, "grad_norm": 0.19272533059120178, "learning_rate": 0.0005448196544517168, "loss": 2.5635, "num_input_tokens_seen": 16410209856, "step": 62600 }, { "epoch": 0.298842076391953, "grad_norm": 0.19940300285816193, "learning_rate": 0.0005392295478639225, "loss": 2.5755, "num_input_tokens_seen": 16423317056, "step": 62650 }, { "epoch": 0.2990805776500471, "grad_norm": 0.18894875049591064, "learning_rate": 0.0005336345028060199, "loss": 2.5718, "num_input_tokens_seen": 16436424256, "step": 62700 }, { "epoch": 0.29931907890814125, "grad_norm": 0.19226962327957153, "learning_rate": 0.0005280352236185959, "loss": 2.563, "num_input_tokens_seen": 16449531456, "step": 62750 }, { "epoch": 0.2995575801662354, "grad_norm": 0.20716702938079834, "learning_rate": 0.0005224324151752575, "loss": 2.5532, "num_input_tokens_seen": 16462638656, "step": 62800 }, { "epoch": 0.2997960814243295, "grad_norm": 0.20232325792312622, "learning_rate": 0.000516826782793897, "loss": 2.5691, "num_input_tokens_seen": 16475745856, "step": 62850 }, { "epoch": 0.30003458268242367, "grad_norm": 0.19828926026821136, "learning_rate": 0.0005112190321479025, "loss": 2.5602, "num_input_tokens_seen": 16488853056, "step": 62900 }, { "epoch": 0.30027308394051777, "grad_norm": 0.22366905212402344, "learning_rate": 0.000505609869177323, "loss": 2.5556, "num_input_tokens_seen": 16501960256, "step": 62950 }, { "epoch": 0.30051158519861193, "grad_norm": 0.1883884221315384, "learning_rate": 0.0005, "loss": 2.5567, "num_input_tokens_seen": 16515067456, "step": 63000 }, { "epoch": 0.30051158519861193, "eval_loss": 2.4441678524017334, "eval_runtime": 54.2448, "eval_samples_per_second": 92.175, "eval_steps_per_second": 23.044, "num_input_tokens_seen": 16515067456, "step": 63000 }, { "epoch": 0.30075008645670603, "grad_norm": 0.20152603089809418, "learning_rate": 0.0004943901308226771, "loss": 2.5562, "num_input_tokens_seen": 16528174656, "step": 63050 }, { "epoch": 0.3009885877148002, "grad_norm": 0.18534454703330994, "learning_rate": 0.0004887809678520976, "loss": 2.5559, "num_input_tokens_seen": 16541281856, "step": 63100 }, { "epoch": 0.30122708897289435, "grad_norm": 0.18770301342010498, "learning_rate": 0.0004831732172061032, "loss": 2.5538, "num_input_tokens_seen": 16554389056, "step": 63150 }, { "epoch": 0.30146559023098846, "grad_norm": 0.19565705955028534, "learning_rate": 0.0004775675848247427, "loss": 2.5593, "num_input_tokens_seen": 16567496256, "step": 63200 }, { "epoch": 0.3017040914890826, "grad_norm": 0.1954822540283203, "learning_rate": 0.00047196477638140405, "loss": 2.5694, "num_input_tokens_seen": 16580603456, "step": 63250 }, { "epoch": 0.3019425927471767, "grad_norm": 0.18120840191841125, "learning_rate": 0.0004663654971939802, "loss": 2.5622, "num_input_tokens_seen": 16593710656, "step": 63300 }, { "epoch": 0.3021810940052709, "grad_norm": 0.18100927770137787, "learning_rate": 0.0004607704521360776, "loss": 2.5437, "num_input_tokens_seen": 16606817856, "step": 63350 }, { "epoch": 0.30241959526336504, "grad_norm": 0.20565176010131836, "learning_rate": 0.0004551803455482833, "loss": 2.5463, "num_input_tokens_seen": 16619925056, "step": 63400 }, { "epoch": 0.30265809652145914, "grad_norm": 0.18989761173725128, "learning_rate": 0.0004495958811494978, "loss": 2.5609, "num_input_tokens_seen": 16633032256, "step": 63450 }, { "epoch": 0.3028965977795533, "grad_norm": 0.1870686262845993, "learning_rate": 0.0004440177619483461, "loss": 2.5554, "num_input_tokens_seen": 16646139456, "step": 63500 }, { "epoch": 0.3028965977795533, "eval_loss": 2.4395649433135986, "eval_runtime": 53.4665, "eval_samples_per_second": 93.516, "eval_steps_per_second": 23.379, "num_input_tokens_seen": 16646139456, "step": 63500 }, { "epoch": 0.3031350990376474, "grad_norm": 0.1891048699617386, "learning_rate": 0.00043844669015467863, "loss": 2.5627, "num_input_tokens_seen": 16659246656, "step": 63550 }, { "epoch": 0.30337360029574156, "grad_norm": 0.18591411411762238, "learning_rate": 0.0004328833670911724, "loss": 2.5545, "num_input_tokens_seen": 16672353856, "step": 63600 }, { "epoch": 0.3036121015538357, "grad_norm": 0.18640951812267303, "learning_rate": 0.0004273284931050438, "loss": 2.5672, "num_input_tokens_seen": 16685461056, "step": 63650 }, { "epoch": 0.3038506028119298, "grad_norm": 0.1919756680727005, "learning_rate": 0.0004217827674798845, "loss": 2.5492, "num_input_tokens_seen": 16698568256, "step": 63700 }, { "epoch": 0.304089104070024, "grad_norm": 0.18388938903808594, "learning_rate": 0.00041624688834763184, "loss": 2.5487, "num_input_tokens_seen": 16711675456, "step": 63750 }, { "epoch": 0.3043276053281181, "grad_norm": 0.1851562261581421, "learning_rate": 0.0004107215526006817, "loss": 2.5539, "num_input_tokens_seen": 16724782656, "step": 63800 }, { "epoch": 0.30456610658621225, "grad_norm": 0.17315496504306793, "learning_rate": 0.0004052074558041608, "loss": 2.5544, "num_input_tokens_seen": 16737889856, "step": 63850 }, { "epoch": 0.30480460784430635, "grad_norm": 0.17985352873802185, "learning_rate": 0.00039970529210836363, "loss": 2.5511, "num_input_tokens_seen": 16750997056, "step": 63900 }, { "epoch": 0.3050431091024005, "grad_norm": 0.20455212891101837, "learning_rate": 0.0003942157541613686, "loss": 2.5593, "num_input_tokens_seen": 16764104256, "step": 63950 }, { "epoch": 0.30528161036049467, "grad_norm": 0.1965632140636444, "learning_rate": 0.00038873953302184284, "loss": 2.5599, "num_input_tokens_seen": 16777211456, "step": 64000 }, { "epoch": 0.30528161036049467, "eval_loss": 2.437380790710449, "eval_runtime": 53.2524, "eval_samples_per_second": 93.893, "eval_steps_per_second": 23.473, "num_input_tokens_seen": 16777211456, "step": 64000 }, { "epoch": 0.3055201116185888, "grad_norm": 0.1703004688024521, "learning_rate": 0.00038327731807204744, "loss": 2.5506, "num_input_tokens_seen": 16790318656, "step": 64050 }, { "epoch": 0.30575861287668293, "grad_norm": 0.19769616425037384, "learning_rate": 0.00037782979693105293, "loss": 2.542, "num_input_tokens_seen": 16803425856, "step": 64100 }, { "epoch": 0.30599711413477704, "grad_norm": 0.20674961805343628, "learning_rate": 0.00037239765536817873, "loss": 2.539, "num_input_tokens_seen": 16816533056, "step": 64150 }, { "epoch": 0.3062356153928712, "grad_norm": 0.19121839106082916, "learning_rate": 0.0003669815772166625, "loss": 2.5573, "num_input_tokens_seen": 16829640256, "step": 64200 }, { "epoch": 0.30647411665096536, "grad_norm": 0.1734025925397873, "learning_rate": 0.00036158224428757535, "loss": 2.5416, "num_input_tokens_seen": 16842747456, "step": 64250 }, { "epoch": 0.30671261790905946, "grad_norm": 0.1857634037733078, "learning_rate": 0.0003562003362839914, "loss": 2.5652, "num_input_tokens_seen": 16855854656, "step": 64300 }, { "epoch": 0.3069511191671536, "grad_norm": 0.17733143270015717, "learning_rate": 0.000350836530715422, "loss": 2.5299, "num_input_tokens_seen": 16868961856, "step": 64350 }, { "epoch": 0.3071896204252477, "grad_norm": 0.18323005735874176, "learning_rate": 0.00034549150281252633, "loss": 2.5691, "num_input_tokens_seen": 16882069056, "step": 64400 }, { "epoch": 0.3074281216833419, "grad_norm": 0.18570365011692047, "learning_rate": 0.00034016592544210936, "loss": 2.5436, "num_input_tokens_seen": 16895176256, "step": 64450 }, { "epoch": 0.30766662294143604, "grad_norm": 0.18571798503398895, "learning_rate": 0.00033486046902241664, "loss": 2.5382, "num_input_tokens_seen": 16908283456, "step": 64500 }, { "epoch": 0.30766662294143604, "eval_loss": 2.4323015213012695, "eval_runtime": 53.7237, "eval_samples_per_second": 93.069, "eval_steps_per_second": 23.267, "num_input_tokens_seen": 16908283456, "step": 64500 }, { "epoch": 0.30790512419953014, "grad_norm": 0.1829528957605362, "learning_rate": 0.0003295758014387375, "loss": 2.5453, "num_input_tokens_seen": 16921390656, "step": 64550 }, { "epoch": 0.3081436254576243, "grad_norm": 0.1703086644411087, "learning_rate": 0.0003243125879593286, "loss": 2.5441, "num_input_tokens_seen": 16934497856, "step": 64600 }, { "epoch": 0.3083821267157184, "grad_norm": 0.17826180160045624, "learning_rate": 0.000319071491151664, "loss": 2.545, "num_input_tokens_seen": 16947605056, "step": 64650 }, { "epoch": 0.30862062797381257, "grad_norm": 0.17889030277729034, "learning_rate": 0.00031385317079902743, "loss": 2.5405, "num_input_tokens_seen": 16960712256, "step": 64700 }, { "epoch": 0.30885912923190667, "grad_norm": 0.1711336225271225, "learning_rate": 0.0003086582838174551, "loss": 2.5222, "num_input_tokens_seen": 16973819456, "step": 64750 }, { "epoch": 0.30909763049000083, "grad_norm": 0.17962214350700378, "learning_rate": 0.0003034874841730382, "loss": 2.5376, "num_input_tokens_seen": 16986926656, "step": 64800 }, { "epoch": 0.309336131748095, "grad_norm": 0.1699627935886383, "learning_rate": 0.0002983414227995975, "loss": 2.5616, "num_input_tokens_seen": 17000033856, "step": 64850 }, { "epoch": 0.3095746330061891, "grad_norm": 0.18442535400390625, "learning_rate": 0.00029322074751673977, "loss": 2.5377, "num_input_tokens_seen": 17013141056, "step": 64900 }, { "epoch": 0.30981313426428325, "grad_norm": 0.17972196638584137, "learning_rate": 0.0002881261029483057, "loss": 2.5474, "num_input_tokens_seen": 17026248256, "step": 64950 }, { "epoch": 0.31005163552237736, "grad_norm": 0.1810217946767807, "learning_rate": 0.00028305813044122096, "loss": 2.5286, "num_input_tokens_seen": 17039355456, "step": 65000 }, { "epoch": 0.31005163552237736, "eval_loss": 2.4292306900024414, "eval_runtime": 53.3956, "eval_samples_per_second": 93.641, "eval_steps_per_second": 23.41, "num_input_tokens_seen": 17039355456, "step": 65000 }, { "epoch": 0.3102901367804715, "grad_norm": 0.17116400599479675, "learning_rate": 0.000278017467984759, "loss": 2.5504, "num_input_tokens_seen": 17052462656, "step": 65050 }, { "epoch": 0.3105286380385657, "grad_norm": 0.17055106163024902, "learning_rate": 0.00027300475013022663, "loss": 2.543, "num_input_tokens_seen": 17065569856, "step": 65100 }, { "epoch": 0.3107671392966598, "grad_norm": 0.17849299311637878, "learning_rate": 0.000268020607911083, "loss": 2.5476, "num_input_tokens_seen": 17078677056, "step": 65150 }, { "epoch": 0.31100564055475394, "grad_norm": 0.17608341574668884, "learning_rate": 0.0002630656687635007, "loss": 2.5452, "num_input_tokens_seen": 17091784256, "step": 65200 }, { "epoch": 0.31124414181284804, "grad_norm": 0.19086676836013794, "learning_rate": 0.0002581405564473801, "loss": 2.5562, "num_input_tokens_seen": 17104891456, "step": 65250 }, { "epoch": 0.3114826430709422, "grad_norm": 0.1721603125333786, "learning_rate": 0.00025324589096782657, "loss": 2.5402, "num_input_tokens_seen": 17117998656, "step": 65300 }, { "epoch": 0.31172114432903636, "grad_norm": 0.16727598011493683, "learning_rate": 0.00024838228849709997, "loss": 2.5253, "num_input_tokens_seen": 17131105856, "step": 65350 }, { "epoch": 0.31195964558713046, "grad_norm": 0.1664544939994812, "learning_rate": 0.000243550361297047, "loss": 2.5519, "num_input_tokens_seen": 17144213056, "step": 65400 }, { "epoch": 0.3121981468452246, "grad_norm": 0.17195752263069153, "learning_rate": 0.00023875071764202561, "loss": 2.5297, "num_input_tokens_seen": 17157320256, "step": 65450 }, { "epoch": 0.3124366481033187, "grad_norm": 0.19001176953315735, "learning_rate": 0.00023398396174233177, "loss": 2.5439, "num_input_tokens_seen": 17170427456, "step": 65500 }, { "epoch": 0.3124366481033187, "eval_loss": 2.426327705383301, "eval_runtime": 53.7603, "eval_samples_per_second": 93.005, "eval_steps_per_second": 23.251, "num_input_tokens_seen": 17170427456, "step": 65500 }, { "epoch": 0.3126751493614129, "grad_norm": 0.17215538024902344, "learning_rate": 0.00022925069366813716, "loss": 2.5442, "num_input_tokens_seen": 17183534656, "step": 65550 }, { "epoch": 0.31291365061950704, "grad_norm": 0.16736114025115967, "learning_rate": 0.0002245515092739488, "loss": 2.5472, "num_input_tokens_seen": 17196641856, "step": 65600 }, { "epoch": 0.31315215187760115, "grad_norm": 0.1739792823791504, "learning_rate": 0.00021988700012359863, "loss": 2.5401, "num_input_tokens_seen": 17209749056, "step": 65650 }, { "epoch": 0.3133906531356953, "grad_norm": 0.17363224923610687, "learning_rate": 0.00021525775341577403, "loss": 2.5539, "num_input_tokens_seen": 17222856256, "step": 65700 }, { "epoch": 0.3136291543937894, "grad_norm": 0.16787610948085785, "learning_rate": 0.00021066435191009715, "loss": 2.5338, "num_input_tokens_seen": 17235963456, "step": 65750 }, { "epoch": 0.31386765565188357, "grad_norm": 0.17158125340938568, "learning_rate": 0.00020610737385376348, "loss": 2.5531, "num_input_tokens_seen": 17249070656, "step": 65800 }, { "epoch": 0.3141061569099777, "grad_norm": 0.1693524569272995, "learning_rate": 0.00020158739290874821, "loss": 2.5286, "num_input_tokens_seen": 17262177856, "step": 65850 }, { "epoch": 0.31434465816807183, "grad_norm": 0.1730414181947708, "learning_rate": 0.0001971049780795901, "loss": 2.5228, "num_input_tokens_seen": 17275285056, "step": 65900 }, { "epoch": 0.314583159426166, "grad_norm": 0.16220349073410034, "learning_rate": 0.00019266069364176142, "loss": 2.5445, "num_input_tokens_seen": 17288392256, "step": 65950 }, { "epoch": 0.3148216606842601, "grad_norm": 0.1605050265789032, "learning_rate": 0.00018825509907063325, "loss": 2.5491, "num_input_tokens_seen": 17301499456, "step": 66000 }, { "epoch": 0.3148216606842601, "eval_loss": 2.4224469661712646, "eval_runtime": 53.2989, "eval_samples_per_second": 93.811, "eval_steps_per_second": 23.453, "num_input_tokens_seen": 17301499456, "step": 66000 }, { "epoch": 0.31506016194235426, "grad_norm": 0.16586218774318695, "learning_rate": 0.00018388874897104518, "loss": 2.5468, "num_input_tokens_seen": 17314606656, "step": 66050 }, { "epoch": 0.31529866320044836, "grad_norm": 0.1646813303232193, "learning_rate": 0.00017956219300748795, "loss": 2.5352, "num_input_tokens_seen": 17327713856, "step": 66100 }, { "epoch": 0.3155371644585425, "grad_norm": 0.18712937831878662, "learning_rate": 0.00017527597583490823, "loss": 2.5412, "num_input_tokens_seen": 17340821056, "step": 66150 }, { "epoch": 0.3157756657166367, "grad_norm": 0.1631355583667755, "learning_rate": 0.00017103063703014372, "loss": 2.5272, "num_input_tokens_seen": 17353928256, "step": 66200 }, { "epoch": 0.3160141669747308, "grad_norm": 0.15910203754901886, "learning_rate": 0.00016682671102399805, "loss": 2.5333, "num_input_tokens_seen": 17367035456, "step": 66250 }, { "epoch": 0.31625266823282494, "grad_norm": 0.5742849707603455, "learning_rate": 0.00016266472703396284, "loss": 2.5463, "num_input_tokens_seen": 17380142656, "step": 66300 }, { "epoch": 0.31649116949091904, "grad_norm": 0.17517830431461334, "learning_rate": 0.00015854520899759655, "loss": 2.5511, "num_input_tokens_seen": 17393249856, "step": 66350 }, { "epoch": 0.3167296707490132, "grad_norm": 0.6962131857872009, "learning_rate": 0.00015446867550656767, "loss": 2.5452, "num_input_tokens_seen": 17406357056, "step": 66400 }, { "epoch": 0.31696817200710736, "grad_norm": 0.16677837073802948, "learning_rate": 0.00015043563974137132, "loss": 2.5392, "num_input_tokens_seen": 17419464256, "step": 66450 }, { "epoch": 0.31720667326520147, "grad_norm": 0.16235870122909546, "learning_rate": 0.00014644660940672628, "loss": 2.5125, "num_input_tokens_seen": 17432571456, "step": 66500 }, { "epoch": 0.31720667326520147, "eval_loss": 2.419802188873291, "eval_runtime": 52.8641, "eval_samples_per_second": 94.582, "eval_steps_per_second": 23.646, "num_input_tokens_seen": 17432571456, "step": 66500 }, { "epoch": 0.3174451745232956, "grad_norm": 0.17308832705020905, "learning_rate": 0.00014250208666766236, "loss": 2.5349, "num_input_tokens_seen": 17445678656, "step": 66550 }, { "epoch": 0.31768367578138973, "grad_norm": 0.16299477219581604, "learning_rate": 0.00013860256808630427, "loss": 2.5277, "num_input_tokens_seen": 17458785856, "step": 66600 }, { "epoch": 0.3179221770394839, "grad_norm": 0.18277022242546082, "learning_rate": 0.00013474854455936125, "loss": 2.5203, "num_input_tokens_seen": 17471893056, "step": 66650 }, { "epoch": 0.318160678297578, "grad_norm": 0.16096614301204681, "learning_rate": 0.00013094050125632973, "loss": 2.535, "num_input_tokens_seen": 17485000256, "step": 66700 }, { "epoch": 0.31839917955567215, "grad_norm": 0.1723272204399109, "learning_rate": 0.0001271789175584172, "loss": 2.549, "num_input_tokens_seen": 17498107456, "step": 66750 }, { "epoch": 0.3186376808137663, "grad_norm": 0.15782694518566132, "learning_rate": 0.00012346426699819457, "loss": 2.5317, "num_input_tokens_seen": 17511214656, "step": 66800 }, { "epoch": 0.3188761820718604, "grad_norm": 0.1627569943666458, "learning_rate": 0.00011979701719998454, "loss": 2.5382, "num_input_tokens_seen": 17524321856, "step": 66850 }, { "epoch": 0.3191146833299546, "grad_norm": 0.16340333223342896, "learning_rate": 0.00011617762982099444, "loss": 2.5477, "num_input_tokens_seen": 17537429056, "step": 66900 }, { "epoch": 0.3193531845880487, "grad_norm": 0.15788671374320984, "learning_rate": 0.00011260656049319957, "loss": 2.537, "num_input_tokens_seen": 17550536256, "step": 66950 }, { "epoch": 0.31959168584614284, "grad_norm": 0.16191193461418152, "learning_rate": 0.0001090842587659851, "loss": 2.5394, "num_input_tokens_seen": 17563643456, "step": 67000 }, { "epoch": 0.31959168584614284, "eval_loss": 2.417813301086426, "eval_runtime": 53.532, "eval_samples_per_second": 93.402, "eval_steps_per_second": 23.351, "num_input_tokens_seen": 17563643456, "step": 67000 }, { "epoch": 0.319830187104237, "grad_norm": 0.1690913438796997, "learning_rate": 0.00010561116804955451, "loss": 2.5364, "num_input_tokens_seen": 17576750656, "step": 67050 }, { "epoch": 0.3200686883623311, "grad_norm": 0.16436229646205902, "learning_rate": 0.00010218772555910954, "loss": 2.5298, "num_input_tokens_seen": 17589857856, "step": 67100 }, { "epoch": 0.32030718962042526, "grad_norm": 0.15499907732009888, "learning_rate": 9.881436225981105e-05, "loss": 2.5484, "num_input_tokens_seen": 17602965056, "step": 67150 }, { "epoch": 0.32054569087851936, "grad_norm": 0.16237874329090118, "learning_rate": 9.549150281252633e-05, "loss": 2.5271, "num_input_tokens_seen": 17616072256, "step": 67200 }, { "epoch": 0.3207841921366135, "grad_norm": 0.16813968122005463, "learning_rate": 9.221956552036992e-05, "loss": 2.5295, "num_input_tokens_seen": 17629179456, "step": 67250 }, { "epoch": 0.3210226933947077, "grad_norm": 0.15672080218791962, "learning_rate": 8.899896227604509e-05, "loss": 2.528, "num_input_tokens_seen": 17642286656, "step": 67300 }, { "epoch": 0.3212611946528018, "grad_norm": 0.16523708403110504, "learning_rate": 8.58300985099918e-05, "loss": 2.5288, "num_input_tokens_seen": 17655393856, "step": 67350 }, { "epoch": 0.32149969591089594, "grad_norm": 0.16759687662124634, "learning_rate": 8.271337313934868e-05, "loss": 2.5431, "num_input_tokens_seen": 17668501056, "step": 67400 }, { "epoch": 0.32173819716899005, "grad_norm": 0.15507538616657257, "learning_rate": 7.964917851773496e-05, "loss": 2.5342, "num_input_tokens_seen": 17681608256, "step": 67450 }, { "epoch": 0.3219766984270842, "grad_norm": 0.1556961089372635, "learning_rate": 7.663790038585794e-05, "loss": 2.5189, "num_input_tokens_seen": 17694715456, "step": 67500 }, { "epoch": 0.3219766984270842, "eval_loss": 2.415555000305176, "eval_runtime": 53.2935, "eval_samples_per_second": 93.82, "eval_steps_per_second": 23.455, "num_input_tokens_seen": 17694715456, "step": 67500 }, { "epoch": 0.32221519968517837, "grad_norm": 0.16804397106170654, "learning_rate": 7.367991782295391e-05, "loss": 2.5218, "num_input_tokens_seen": 17707822656, "step": 67550 }, { "epoch": 0.32245370094327247, "grad_norm": 0.15728074312210083, "learning_rate": 7.077560319906695e-05, "loss": 2.5261, "num_input_tokens_seen": 17720929856, "step": 67600 }, { "epoch": 0.32269220220136663, "grad_norm": 0.1641319841146469, "learning_rate": 6.792532212817271e-05, "loss": 2.5398, "num_input_tokens_seen": 17734037056, "step": 67650 }, { "epoch": 0.32293070345946073, "grad_norm": 0.1575596034526825, "learning_rate": 6.512943342215233e-05, "loss": 2.5211, "num_input_tokens_seen": 17747144256, "step": 67700 }, { "epoch": 0.3231692047175549, "grad_norm": 0.16352206468582153, "learning_rate": 6.238828904562316e-05, "loss": 2.5143, "num_input_tokens_seen": 17760251456, "step": 67750 }, { "epoch": 0.323407705975649, "grad_norm": 0.16303551197052002, "learning_rate": 5.9702234071631e-05, "loss": 2.5262, "num_input_tokens_seen": 17773358656, "step": 67800 }, { "epoch": 0.32364620723374316, "grad_norm": 0.15572308003902435, "learning_rate": 5.7071606638210094e-05, "loss": 2.5278, "num_input_tokens_seen": 17786465856, "step": 67850 }, { "epoch": 0.3238847084918373, "grad_norm": 0.15960544347763062, "learning_rate": 5.449673790581611e-05, "loss": 2.522, "num_input_tokens_seen": 17799573056, "step": 67900 }, { "epoch": 0.3241232097499314, "grad_norm": 0.15617695450782776, "learning_rate": 5.197795201563743e-05, "loss": 2.5151, "num_input_tokens_seen": 17812680256, "step": 67950 }, { "epoch": 0.3243617110080256, "grad_norm": 0.1527390033006668, "learning_rate": 4.9515566048790485e-05, "loss": 2.5213, "num_input_tokens_seen": 17825787456, "step": 68000 }, { "epoch": 0.3243617110080256, "eval_loss": 2.4139962196350098, "eval_runtime": 53.933, "eval_samples_per_second": 92.708, "eval_steps_per_second": 23.177, "num_input_tokens_seen": 17825787456, "step": 68000 }, { "epoch": 0.3246002122661197, "grad_norm": 0.15067367255687714, "learning_rate": 4.7109889986402973e-05, "loss": 2.5181, "num_input_tokens_seen": 17838894656, "step": 68050 }, { "epoch": 0.32483871352421384, "grad_norm": 0.1534261703491211, "learning_rate": 4.476122667059207e-05, "loss": 2.533, "num_input_tokens_seen": 17852001856, "step": 68100 }, { "epoch": 0.325077214782308, "grad_norm": 0.1585472822189331, "learning_rate": 4.2469871766340095e-05, "loss": 2.509, "num_input_tokens_seen": 17865109056, "step": 68150 }, { "epoch": 0.3253157160404021, "grad_norm": 0.15480853617191315, "learning_rate": 4.0236113724274713e-05, "loss": 2.524, "num_input_tokens_seen": 17878216256, "step": 68200 }, { "epoch": 0.32555421729849626, "grad_norm": 0.24341611564159393, "learning_rate": 3.806023374435663e-05, "loss": 2.5293, "num_input_tokens_seen": 17891323456, "step": 68250 }, { "epoch": 0.32579271855659037, "grad_norm": 0.15290473401546478, "learning_rate": 3.594250574048058e-05, "loss": 2.5149, "num_input_tokens_seen": 17904430656, "step": 68300 }, { "epoch": 0.3260312198146845, "grad_norm": 0.1606835126876831, "learning_rate": 3.3883196305992905e-05, "loss": 2.5327, "num_input_tokens_seen": 17917537856, "step": 68350 }, { "epoch": 0.3262697210727787, "grad_norm": 0.1537574976682663, "learning_rate": 3.18825646801314e-05, "loss": 2.5416, "num_input_tokens_seen": 17930645056, "step": 68400 }, { "epoch": 0.3265082223308728, "grad_norm": 0.16943201422691345, "learning_rate": 2.994086271539048e-05, "loss": 2.5233, "num_input_tokens_seen": 17943752256, "step": 68450 }, { "epoch": 0.32674672358896695, "grad_norm": 0.15832561254501343, "learning_rate": 2.8058334845816213e-05, "loss": 2.5439, "num_input_tokens_seen": 17956859456, "step": 68500 }, { "epoch": 0.32674672358896695, "eval_loss": 2.4128847122192383, "eval_runtime": 53.1054, "eval_samples_per_second": 94.152, "eval_steps_per_second": 23.538, "num_input_tokens_seen": 17956859456, "step": 68500 }, { "epoch": 0.32698522484706105, "grad_norm": 0.15245509147644043, "learning_rate": 2.6235218056235634e-05, "loss": 2.5209, "num_input_tokens_seen": 17969966656, "step": 68550 }, { "epoch": 0.3272237261051552, "grad_norm": 0.15148235857486725, "learning_rate": 2.4471741852423235e-05, "loss": 2.5284, "num_input_tokens_seen": 17983073856, "step": 68600 }, { "epoch": 0.3274622273632493, "grad_norm": 0.15678688883781433, "learning_rate": 2.276812823220964e-05, "loss": 2.537, "num_input_tokens_seen": 17996181056, "step": 68650 }, { "epoch": 0.3277007286213435, "grad_norm": 0.15105360746383667, "learning_rate": 2.1124591657534777e-05, "loss": 2.5321, "num_input_tokens_seen": 18009288256, "step": 68700 }, { "epoch": 0.32793922987943763, "grad_norm": 0.15369552373886108, "learning_rate": 1.9541339027450256e-05, "loss": 2.5291, "num_input_tokens_seen": 18022395456, "step": 68750 }, { "epoch": 0.32817773113753174, "grad_norm": 0.1551530808210373, "learning_rate": 1.801856965207338e-05, "loss": 2.5201, "num_input_tokens_seen": 18035502656, "step": 68800 }, { "epoch": 0.3284162323956259, "grad_norm": 0.14859162271022797, "learning_rate": 1.6556475227496815e-05, "loss": 2.5436, "num_input_tokens_seen": 18048609856, "step": 68850 }, { "epoch": 0.32865473365372, "grad_norm": 0.14972691237926483, "learning_rate": 1.5155239811656562e-05, "loss": 2.5221, "num_input_tokens_seen": 18061717056, "step": 68900 }, { "epoch": 0.32889323491181416, "grad_norm": 0.156805619597435, "learning_rate": 1.3815039801161721e-05, "loss": 2.5248, "num_input_tokens_seen": 18074824256, "step": 68950 }, { "epoch": 0.3291317361699083, "grad_norm": 0.148334801197052, "learning_rate": 1.2536043909088191e-05, "loss": 2.5361, "num_input_tokens_seen": 18087931456, "step": 69000 }, { "epoch": 0.3291317361699083, "eval_loss": 2.4120428562164307, "eval_runtime": 52.9258, "eval_samples_per_second": 94.472, "eval_steps_per_second": 23.618, "num_input_tokens_seen": 18087931456, "step": 69000 }, { "epoch": 0.3293702374280024, "grad_norm": 0.14565595984458923, "learning_rate": 1.1318413143740436e-05, "loss": 2.5358, "num_input_tokens_seen": 18101038656, "step": 69050 }, { "epoch": 0.3296087386860966, "grad_norm": 0.15810008347034454, "learning_rate": 1.0162300788382261e-05, "loss": 2.5288, "num_input_tokens_seen": 18114145856, "step": 69100 }, { "epoch": 0.3298472399441907, "grad_norm": 0.14960281550884247, "learning_rate": 9.0678523819408e-06, "loss": 2.5267, "num_input_tokens_seen": 18127253056, "step": 69150 }, { "epoch": 0.33008574120228484, "grad_norm": 0.14473624527454376, "learning_rate": 8.035205700685167e-06, "loss": 2.5133, "num_input_tokens_seen": 18140360256, "step": 69200 }, { "epoch": 0.330324242460379, "grad_norm": 0.1450708657503128, "learning_rate": 7.064490740882057e-06, "loss": 2.5302, "num_input_tokens_seen": 18153467456, "step": 69250 }, { "epoch": 0.3305627437184731, "grad_norm": 0.14883211255073547, "learning_rate": 6.15582970243117e-06, "loss": 2.5307, "num_input_tokens_seen": 18166574656, "step": 69300 }, { "epoch": 0.33080124497656727, "grad_norm": 0.15696081519126892, "learning_rate": 5.309336973481682e-06, "loss": 2.5341, "num_input_tokens_seen": 18179681856, "step": 69350 }, { "epoch": 0.33103974623466137, "grad_norm": 0.1564367264509201, "learning_rate": 4.52511911603265e-06, "loss": 2.5299, "num_input_tokens_seen": 18192789056, "step": 69400 }, { "epoch": 0.33127824749275553, "grad_norm": 0.15558916330337524, "learning_rate": 3.803274852517968e-06, "loss": 2.5197, "num_input_tokens_seen": 18205896256, "step": 69450 }, { "epoch": 0.3315167487508497, "grad_norm": 0.1532556265592575, "learning_rate": 3.143895053378698e-06, "loss": 2.5176, "num_input_tokens_seen": 18219003456, "step": 69500 }, { "epoch": 0.3315167487508497, "eval_loss": 2.412046194076538, "eval_runtime": 53.2476, "eval_samples_per_second": 93.901, "eval_steps_per_second": 23.475, "num_input_tokens_seen": 18219003456, "step": 69500 }, { "epoch": 0.3317552500089438, "grad_norm": 0.1502823829650879, "learning_rate": 2.547062725623828e-06, "loss": 2.5207, "num_input_tokens_seen": 18232110656, "step": 69550 }, { "epoch": 0.33199375126703795, "grad_norm": 0.1560440957546234, "learning_rate": 2.012853002380466e-06, "loss": 2.5078, "num_input_tokens_seen": 18245217856, "step": 69600 }, { "epoch": 0.33223225252513205, "grad_norm": 0.15284490585327148, "learning_rate": 1.541333133436018e-06, "loss": 2.5404, "num_input_tokens_seen": 18258325056, "step": 69650 }, { "epoch": 0.3324707537832262, "grad_norm": 0.14594900608062744, "learning_rate": 1.132562476771959e-06, "loss": 2.5267, "num_input_tokens_seen": 18271432256, "step": 69700 }, { "epoch": 0.3327092550413203, "grad_norm": 0.15198394656181335, "learning_rate": 7.865924910916978e-07, "loss": 2.5232, "num_input_tokens_seen": 18284539456, "step": 69750 }, { "epoch": 0.3329477562994145, "grad_norm": 0.15011271834373474, "learning_rate": 5.034667293427053e-07, "loss": 2.5308, "num_input_tokens_seen": 18297646656, "step": 69800 }, { "epoch": 0.33318625755750864, "grad_norm": 0.147654727101326, "learning_rate": 2.8322083323334415e-07, "loss": 2.5281, "num_input_tokens_seen": 18310753856, "step": 69850 }, { "epoch": 0.33342475881560274, "grad_norm": 0.15056386590003967, "learning_rate": 1.2588252874673466e-07, "loss": 2.5112, "num_input_tokens_seen": 18323861056, "step": 69900 }, { "epoch": 0.3336632600736969, "grad_norm": 0.14858213067054749, "learning_rate": 3.147162264971471e-08, "loss": 2.5226, "num_input_tokens_seen": 18336968256, "step": 69950 }, { "epoch": 0.333901761331791, "grad_norm": 0.1534891128540039, "learning_rate": 0.0, "loss": 2.5303, "num_input_tokens_seen": 18350075456, "step": 70000 }, { "epoch": 0.333901761331791, "eval_loss": 2.411842107772827, "eval_runtime": 53.9812, "eval_samples_per_second": 92.625, "eval_steps_per_second": 23.156, "num_input_tokens_seen": 18350075456, "step": 70000 }, { "epoch": 0.333901761331791, "num_input_tokens_seen": 18350075456, "step": 70000, "total_flos": 4.908824281216451e+18, "train_loss": 2.719220863015311, "train_runtime": 98058.4226, "train_samples_per_second": 182.748, "train_steps_per_second": 0.714, "train_tokens_per_second": 187134.155 } ], "logging_steps": 50, "max_steps": 70000, "num_input_tokens_seen": 18350075456, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.908824281216451e+18, "train_batch_size": 64, "trial_name": null, "trial_params": null }