smallm_70 / last-checkpoint /trainer_state.json
Azrail's picture
Training in progress, step 65000, checkpoint
295ed29 verified
raw
history blame
302 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.31005163552237736,
"eval_steps": 500,
"global_step": 65000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00023850125809413643,
"grad_norm": 1.5476292371749878,
"learning_rate": 0.0001,
"loss": 9.8285,
"num_input_tokens_seen": 13107200,
"step": 50
},
{
"epoch": 0.00047700251618827287,
"grad_norm": 0.47720426321029663,
"learning_rate": 0.0002,
"loss": 7.964,
"num_input_tokens_seen": 26214400,
"step": 100
},
{
"epoch": 0.0007155037742824094,
"grad_norm": 0.8443030714988708,
"learning_rate": 0.0003,
"loss": 7.0452,
"num_input_tokens_seen": 39321600,
"step": 150
},
{
"epoch": 0.0009540050323765457,
"grad_norm": 0.5895723104476929,
"learning_rate": 0.0004,
"loss": 6.355,
"num_input_tokens_seen": 52428800,
"step": 200
},
{
"epoch": 0.0011925062904706823,
"grad_norm": 0.8343789577484131,
"learning_rate": 0.0005,
"loss": 5.8716,
"num_input_tokens_seen": 65536000,
"step": 250
},
{
"epoch": 0.0014310075485648188,
"grad_norm": 0.5747953057289124,
"learning_rate": 0.0006,
"loss": 5.5086,
"num_input_tokens_seen": 78643200,
"step": 300
},
{
"epoch": 0.0016695088066589552,
"grad_norm": 0.8383421301841736,
"learning_rate": 0.0007,
"loss": 5.2217,
"num_input_tokens_seen": 91750400,
"step": 350
},
{
"epoch": 0.0019080100647530915,
"grad_norm": 0.5696113109588623,
"learning_rate": 0.0008,
"loss": 4.9683,
"num_input_tokens_seen": 104857600,
"step": 400
},
{
"epoch": 0.002146511322847228,
"grad_norm": 0.5431691408157349,
"learning_rate": 0.0009000000000000001,
"loss": 4.7629,
"num_input_tokens_seen": 117964800,
"step": 450
},
{
"epoch": 0.0023850125809413646,
"grad_norm": 0.4571855664253235,
"learning_rate": 0.001,
"loss": 4.5284,
"num_input_tokens_seen": 131072000,
"step": 500
},
{
"epoch": 0.0023850125809413646,
"eval_loss": 4.309167385101318,
"eval_runtime": 53.3891,
"eval_samples_per_second": 93.652,
"eval_steps_per_second": 23.413,
"num_input_tokens_seen": 131072000,
"step": 500
},
{
"epoch": 0.002623513839035501,
"grad_norm": 0.43464773893356323,
"learning_rate": 0.001,
"loss": 4.3379,
"num_input_tokens_seen": 144179200,
"step": 550
},
{
"epoch": 0.0028620150971296375,
"grad_norm": 0.49611660838127136,
"learning_rate": 0.001,
"loss": 4.1712,
"num_input_tokens_seen": 157286400,
"step": 600
},
{
"epoch": 0.0031005163552237738,
"grad_norm": 0.4060957729816437,
"learning_rate": 0.001,
"loss": 4.0436,
"num_input_tokens_seen": 170393600,
"step": 650
},
{
"epoch": 0.0033390176133179105,
"grad_norm": 0.37300577759742737,
"learning_rate": 0.001,
"loss": 3.9582,
"num_input_tokens_seen": 183500800,
"step": 700
},
{
"epoch": 0.0035775188714120467,
"grad_norm": 0.4117021858692169,
"learning_rate": 0.001,
"loss": 3.8674,
"num_input_tokens_seen": 196608000,
"step": 750
},
{
"epoch": 0.003816020129506183,
"grad_norm": 0.3335980772972107,
"learning_rate": 0.001,
"loss": 3.8031,
"num_input_tokens_seen": 209715200,
"step": 800
},
{
"epoch": 0.004054521387600319,
"grad_norm": 0.35943159461021423,
"learning_rate": 0.001,
"loss": 3.7534,
"num_input_tokens_seen": 222822400,
"step": 850
},
{
"epoch": 0.004293022645694456,
"grad_norm": 0.40000948309898376,
"learning_rate": 0.001,
"loss": 3.6867,
"num_input_tokens_seen": 235929600,
"step": 900
},
{
"epoch": 0.0045315239037885926,
"grad_norm": 0.3165877163410187,
"learning_rate": 0.001,
"loss": 3.6565,
"num_input_tokens_seen": 249036800,
"step": 950
},
{
"epoch": 0.004770025161882729,
"grad_norm": 0.3687070906162262,
"learning_rate": 0.001,
"loss": 3.6005,
"num_input_tokens_seen": 262144000,
"step": 1000
},
{
"epoch": 0.004770025161882729,
"eval_loss": 3.4853296279907227,
"eval_runtime": 52.477,
"eval_samples_per_second": 95.28,
"eval_steps_per_second": 23.82,
"num_input_tokens_seen": 262144000,
"step": 1000
},
{
"epoch": 0.005008526419976865,
"grad_norm": 0.32389721274375916,
"learning_rate": 0.001,
"loss": 3.5663,
"num_input_tokens_seen": 275251200,
"step": 1050
},
{
"epoch": 0.005247027678071002,
"grad_norm": 0.3202049434185028,
"learning_rate": 0.001,
"loss": 3.5376,
"num_input_tokens_seen": 288358400,
"step": 1100
},
{
"epoch": 0.005485528936165138,
"grad_norm": 0.30287981033325195,
"learning_rate": 0.001,
"loss": 3.5135,
"num_input_tokens_seen": 301465600,
"step": 1150
},
{
"epoch": 0.005724030194259275,
"grad_norm": 0.3624540865421295,
"learning_rate": 0.001,
"loss": 3.4814,
"num_input_tokens_seen": 314572800,
"step": 1200
},
{
"epoch": 0.005962531452353411,
"grad_norm": 0.30017992854118347,
"learning_rate": 0.001,
"loss": 3.4476,
"num_input_tokens_seen": 327680000,
"step": 1250
},
{
"epoch": 0.0062010327104475476,
"grad_norm": 0.3169330060482025,
"learning_rate": 0.001,
"loss": 3.4179,
"num_input_tokens_seen": 340787200,
"step": 1300
},
{
"epoch": 0.006439533968541684,
"grad_norm": 0.2730589210987091,
"learning_rate": 0.001,
"loss": 3.4074,
"num_input_tokens_seen": 353894400,
"step": 1350
},
{
"epoch": 0.006678035226635821,
"grad_norm": 0.2927146553993225,
"learning_rate": 0.001,
"loss": 3.3757,
"num_input_tokens_seen": 367001600,
"step": 1400
},
{
"epoch": 0.006916536484729957,
"grad_norm": 0.34230080246925354,
"learning_rate": 0.001,
"loss": 3.3502,
"num_input_tokens_seen": 380108800,
"step": 1450
},
{
"epoch": 0.007155037742824093,
"grad_norm": 0.30472344160079956,
"learning_rate": 0.001,
"loss": 3.3639,
"num_input_tokens_seen": 393216000,
"step": 1500
},
{
"epoch": 0.007155037742824093,
"eval_loss": 3.2486374378204346,
"eval_runtime": 52.482,
"eval_samples_per_second": 95.271,
"eval_steps_per_second": 23.818,
"num_input_tokens_seen": 393216000,
"step": 1500
},
{
"epoch": 0.00739353900091823,
"grad_norm": 0.26124337315559387,
"learning_rate": 0.001,
"loss": 3.3521,
"num_input_tokens_seen": 406323200,
"step": 1550
},
{
"epoch": 0.007632040259012366,
"grad_norm": 0.29117754101753235,
"learning_rate": 0.001,
"loss": 3.315,
"num_input_tokens_seen": 419430400,
"step": 1600
},
{
"epoch": 0.007870541517106503,
"grad_norm": 0.24080802500247955,
"learning_rate": 0.001,
"loss": 3.3103,
"num_input_tokens_seen": 432537600,
"step": 1650
},
{
"epoch": 0.008109042775200638,
"grad_norm": 0.29982003569602966,
"learning_rate": 0.001,
"loss": 3.2926,
"num_input_tokens_seen": 445644800,
"step": 1700
},
{
"epoch": 0.008347544033294775,
"grad_norm": 0.26795274019241333,
"learning_rate": 0.001,
"loss": 3.2843,
"num_input_tokens_seen": 458752000,
"step": 1750
},
{
"epoch": 0.008586045291388912,
"grad_norm": 0.252774715423584,
"learning_rate": 0.001,
"loss": 3.274,
"num_input_tokens_seen": 471859200,
"step": 1800
},
{
"epoch": 0.008824546549483048,
"grad_norm": 0.25432145595550537,
"learning_rate": 0.001,
"loss": 3.2533,
"num_input_tokens_seen": 484966400,
"step": 1850
},
{
"epoch": 0.009063047807577185,
"grad_norm": 0.25918108224868774,
"learning_rate": 0.001,
"loss": 3.2501,
"num_input_tokens_seen": 498073600,
"step": 1900
},
{
"epoch": 0.009301549065671322,
"grad_norm": 0.2482348382472992,
"learning_rate": 0.001,
"loss": 3.2541,
"num_input_tokens_seen": 511180800,
"step": 1950
},
{
"epoch": 0.009540050323765458,
"grad_norm": 0.2615273594856262,
"learning_rate": 0.001,
"loss": 3.2218,
"num_input_tokens_seen": 524288000,
"step": 2000
},
{
"epoch": 0.009540050323765458,
"eval_loss": 3.1193039417266846,
"eval_runtime": 52.7955,
"eval_samples_per_second": 94.705,
"eval_steps_per_second": 23.676,
"num_input_tokens_seen": 524288000,
"step": 2000
},
{
"epoch": 0.009778551581859595,
"grad_norm": 0.2637729048728943,
"learning_rate": 0.001,
"loss": 3.2285,
"num_input_tokens_seen": 537395200,
"step": 2050
},
{
"epoch": 0.01001705283995373,
"grad_norm": 0.23936080932617188,
"learning_rate": 0.001,
"loss": 3.2119,
"num_input_tokens_seen": 550502400,
"step": 2100
},
{
"epoch": 0.010255554098047867,
"grad_norm": 0.2469020038843155,
"learning_rate": 0.001,
"loss": 3.2021,
"num_input_tokens_seen": 563609600,
"step": 2150
},
{
"epoch": 0.010494055356142003,
"grad_norm": 0.2304004430770874,
"learning_rate": 0.001,
"loss": 3.1874,
"num_input_tokens_seen": 576716800,
"step": 2200
},
{
"epoch": 0.01073255661423614,
"grad_norm": 0.232864648103714,
"learning_rate": 0.001,
"loss": 3.1897,
"num_input_tokens_seen": 589824000,
"step": 2250
},
{
"epoch": 0.010971057872330277,
"grad_norm": 0.23161470890045166,
"learning_rate": 0.001,
"loss": 3.1689,
"num_input_tokens_seen": 602931200,
"step": 2300
},
{
"epoch": 0.011209559130424413,
"grad_norm": 0.20868408679962158,
"learning_rate": 0.001,
"loss": 3.1615,
"num_input_tokens_seen": 616038400,
"step": 2350
},
{
"epoch": 0.01144806038851855,
"grad_norm": 0.23374608159065247,
"learning_rate": 0.001,
"loss": 3.1556,
"num_input_tokens_seen": 629145600,
"step": 2400
},
{
"epoch": 0.011686561646612685,
"grad_norm": 0.21716611087322235,
"learning_rate": 0.001,
"loss": 3.1463,
"num_input_tokens_seen": 642252800,
"step": 2450
},
{
"epoch": 0.011925062904706822,
"grad_norm": 0.23689670860767365,
"learning_rate": 0.001,
"loss": 3.1433,
"num_input_tokens_seen": 655360000,
"step": 2500
},
{
"epoch": 0.011925062904706822,
"eval_loss": 3.040046215057373,
"eval_runtime": 52.9109,
"eval_samples_per_second": 94.498,
"eval_steps_per_second": 23.625,
"num_input_tokens_seen": 655360000,
"step": 2500
},
{
"epoch": 0.012163564162800958,
"grad_norm": 0.2245575189590454,
"learning_rate": 0.001,
"loss": 3.1445,
"num_input_tokens_seen": 668467200,
"step": 2550
},
{
"epoch": 0.012402065420895095,
"grad_norm": 0.20992259681224823,
"learning_rate": 0.001,
"loss": 3.1447,
"num_input_tokens_seen": 681574400,
"step": 2600
},
{
"epoch": 0.012640566678989232,
"grad_norm": 0.21792201697826385,
"learning_rate": 0.001,
"loss": 3.1323,
"num_input_tokens_seen": 694681600,
"step": 2650
},
{
"epoch": 0.012879067937083368,
"grad_norm": 0.243458554148674,
"learning_rate": 0.001,
"loss": 3.1084,
"num_input_tokens_seen": 707788800,
"step": 2700
},
{
"epoch": 0.013117569195177505,
"grad_norm": 0.21190515160560608,
"learning_rate": 0.001,
"loss": 3.1202,
"num_input_tokens_seen": 720896000,
"step": 2750
},
{
"epoch": 0.013356070453271642,
"grad_norm": 0.2461613118648529,
"learning_rate": 0.001,
"loss": 3.1007,
"num_input_tokens_seen": 734003200,
"step": 2800
},
{
"epoch": 0.013594571711365777,
"grad_norm": 0.1976248323917389,
"learning_rate": 0.001,
"loss": 3.1079,
"num_input_tokens_seen": 747110400,
"step": 2850
},
{
"epoch": 0.013833072969459913,
"grad_norm": 0.22097842395305634,
"learning_rate": 0.001,
"loss": 3.0846,
"num_input_tokens_seen": 760217600,
"step": 2900
},
{
"epoch": 0.01407157422755405,
"grad_norm": 0.20581132173538208,
"learning_rate": 0.001,
"loss": 3.0995,
"num_input_tokens_seen": 773324800,
"step": 2950
},
{
"epoch": 0.014310075485648187,
"grad_norm": 0.19790051877498627,
"learning_rate": 0.001,
"loss": 3.0977,
"num_input_tokens_seen": 786432000,
"step": 3000
},
{
"epoch": 0.014310075485648187,
"eval_loss": 2.9804909229278564,
"eval_runtime": 53.1278,
"eval_samples_per_second": 94.113,
"eval_steps_per_second": 23.528,
"num_input_tokens_seen": 786432000,
"step": 3000
},
{
"epoch": 0.014548576743742323,
"grad_norm": 0.20328116416931152,
"learning_rate": 0.001,
"loss": 3.0872,
"num_input_tokens_seen": 799539200,
"step": 3050
},
{
"epoch": 0.01478707800183646,
"grad_norm": 0.21318025887012482,
"learning_rate": 0.001,
"loss": 3.0861,
"num_input_tokens_seen": 812646400,
"step": 3100
},
{
"epoch": 0.015025579259930597,
"grad_norm": 0.22170069813728333,
"learning_rate": 0.001,
"loss": 3.0618,
"num_input_tokens_seen": 825753600,
"step": 3150
},
{
"epoch": 0.015264080518024732,
"grad_norm": 0.21292312443256378,
"learning_rate": 0.001,
"loss": 3.0567,
"num_input_tokens_seen": 838860800,
"step": 3200
},
{
"epoch": 0.015502581776118868,
"grad_norm": 0.2331959754228592,
"learning_rate": 0.001,
"loss": 3.0714,
"num_input_tokens_seen": 851968000,
"step": 3250
},
{
"epoch": 0.015741083034213007,
"grad_norm": 0.19236011803150177,
"learning_rate": 0.001,
"loss": 3.059,
"num_input_tokens_seen": 865075200,
"step": 3300
},
{
"epoch": 0.015979584292307142,
"grad_norm": 0.19991376996040344,
"learning_rate": 0.001,
"loss": 3.0542,
"num_input_tokens_seen": 878182400,
"step": 3350
},
{
"epoch": 0.016218085550401277,
"grad_norm": 0.2042934149503708,
"learning_rate": 0.001,
"loss": 3.0517,
"num_input_tokens_seen": 891289600,
"step": 3400
},
{
"epoch": 0.016456586808495415,
"grad_norm": 0.19254428148269653,
"learning_rate": 0.001,
"loss": 3.0415,
"num_input_tokens_seen": 904396800,
"step": 3450
},
{
"epoch": 0.01669508806658955,
"grad_norm": 0.19211998581886292,
"learning_rate": 0.001,
"loss": 3.0253,
"num_input_tokens_seen": 917504000,
"step": 3500
},
{
"epoch": 0.01669508806658955,
"eval_loss": 2.937037944793701,
"eval_runtime": 52.6773,
"eval_samples_per_second": 94.918,
"eval_steps_per_second": 23.729,
"num_input_tokens_seen": 917504000,
"step": 3500
},
{
"epoch": 0.01693358932468369,
"grad_norm": 0.19596482813358307,
"learning_rate": 0.001,
"loss": 3.053,
"num_input_tokens_seen": 930611200,
"step": 3550
},
{
"epoch": 0.017172090582777823,
"grad_norm": 0.20214103162288666,
"learning_rate": 0.001,
"loss": 3.0385,
"num_input_tokens_seen": 943718400,
"step": 3600
},
{
"epoch": 0.017410591840871962,
"grad_norm": 0.18580283224582672,
"learning_rate": 0.001,
"loss": 3.0354,
"num_input_tokens_seen": 956825600,
"step": 3650
},
{
"epoch": 0.017649093098966097,
"grad_norm": 0.18928515911102295,
"learning_rate": 0.001,
"loss": 3.0292,
"num_input_tokens_seen": 969932800,
"step": 3700
},
{
"epoch": 0.017887594357060232,
"grad_norm": 0.19066137075424194,
"learning_rate": 0.001,
"loss": 3.0206,
"num_input_tokens_seen": 983040000,
"step": 3750
},
{
"epoch": 0.01812609561515437,
"grad_norm": 0.20291416347026825,
"learning_rate": 0.001,
"loss": 3.0254,
"num_input_tokens_seen": 996147200,
"step": 3800
},
{
"epoch": 0.018364596873248505,
"grad_norm": 0.19991491734981537,
"learning_rate": 0.001,
"loss": 3.0212,
"num_input_tokens_seen": 1009254400,
"step": 3850
},
{
"epoch": 0.018603098131342644,
"grad_norm": 0.19553051888942719,
"learning_rate": 0.001,
"loss": 3.0229,
"num_input_tokens_seen": 1022361600,
"step": 3900
},
{
"epoch": 0.01884159938943678,
"grad_norm": 0.19302095472812653,
"learning_rate": 0.001,
"loss": 3.0137,
"num_input_tokens_seen": 1035468800,
"step": 3950
},
{
"epoch": 0.019080100647530917,
"grad_norm": 0.18680201470851898,
"learning_rate": 0.001,
"loss": 3.0106,
"num_input_tokens_seen": 1048576000,
"step": 4000
},
{
"epoch": 0.019080100647530917,
"eval_loss": 2.8984477519989014,
"eval_runtime": 52.851,
"eval_samples_per_second": 94.606,
"eval_steps_per_second": 23.651,
"num_input_tokens_seen": 1048576000,
"step": 4000
},
{
"epoch": 0.019318601905625052,
"grad_norm": 0.18222174048423767,
"learning_rate": 0.001,
"loss": 3.0095,
"num_input_tokens_seen": 1061683200,
"step": 4050
},
{
"epoch": 0.01955710316371919,
"grad_norm": 0.1929137110710144,
"learning_rate": 0.001,
"loss": 3.0022,
"num_input_tokens_seen": 1074790400,
"step": 4100
},
{
"epoch": 0.019795604421813325,
"grad_norm": 0.19358602166175842,
"learning_rate": 0.001,
"loss": 2.9978,
"num_input_tokens_seen": 1087897600,
"step": 4150
},
{
"epoch": 0.02003410567990746,
"grad_norm": 0.19070614874362946,
"learning_rate": 0.001,
"loss": 3.0016,
"num_input_tokens_seen": 1101004800,
"step": 4200
},
{
"epoch": 0.0202726069380016,
"grad_norm": 0.17888160049915314,
"learning_rate": 0.001,
"loss": 2.9984,
"num_input_tokens_seen": 1114112000,
"step": 4250
},
{
"epoch": 0.020511108196095734,
"grad_norm": 0.1823708564043045,
"learning_rate": 0.001,
"loss": 3.004,
"num_input_tokens_seen": 1127219200,
"step": 4300
},
{
"epoch": 0.020749609454189872,
"grad_norm": 0.1753600388765335,
"learning_rate": 0.001,
"loss": 2.9814,
"num_input_tokens_seen": 1140326400,
"step": 4350
},
{
"epoch": 0.020988110712284007,
"grad_norm": 0.1710510551929474,
"learning_rate": 0.001,
"loss": 2.9597,
"num_input_tokens_seen": 1153433600,
"step": 4400
},
{
"epoch": 0.021226611970378145,
"grad_norm": 0.18727277219295502,
"learning_rate": 0.001,
"loss": 2.9695,
"num_input_tokens_seen": 1166540800,
"step": 4450
},
{
"epoch": 0.02146511322847228,
"grad_norm": 0.17773132026195526,
"learning_rate": 0.001,
"loss": 2.9664,
"num_input_tokens_seen": 1179648000,
"step": 4500
},
{
"epoch": 0.02146511322847228,
"eval_loss": 2.871137857437134,
"eval_runtime": 51.4876,
"eval_samples_per_second": 97.111,
"eval_steps_per_second": 24.278,
"num_input_tokens_seen": 1179648000,
"step": 4500
},
{
"epoch": 0.021703614486566415,
"grad_norm": 0.1875799000263214,
"learning_rate": 0.001,
"loss": 2.9682,
"num_input_tokens_seen": 1192755200,
"step": 4550
},
{
"epoch": 0.021942115744660554,
"grad_norm": 0.18222226202487946,
"learning_rate": 0.001,
"loss": 2.9484,
"num_input_tokens_seen": 1205862400,
"step": 4600
},
{
"epoch": 0.02218061700275469,
"grad_norm": 0.191411092877388,
"learning_rate": 0.001,
"loss": 2.9637,
"num_input_tokens_seen": 1218969600,
"step": 4650
},
{
"epoch": 0.022419118260848827,
"grad_norm": 0.17608201503753662,
"learning_rate": 0.001,
"loss": 2.9792,
"num_input_tokens_seen": 1232076800,
"step": 4700
},
{
"epoch": 0.022657619518942962,
"grad_norm": 0.1718858927488327,
"learning_rate": 0.001,
"loss": 2.9674,
"num_input_tokens_seen": 1245184000,
"step": 4750
},
{
"epoch": 0.0228961207770371,
"grad_norm": 0.18428942561149597,
"learning_rate": 0.001,
"loss": 2.976,
"num_input_tokens_seen": 1258291200,
"step": 4800
},
{
"epoch": 0.023134622035131235,
"grad_norm": 0.16696259379386902,
"learning_rate": 0.001,
"loss": 2.9486,
"num_input_tokens_seen": 1271398400,
"step": 4850
},
{
"epoch": 0.02337312329322537,
"grad_norm": 0.18239040672779083,
"learning_rate": 0.001,
"loss": 2.956,
"num_input_tokens_seen": 1284505600,
"step": 4900
},
{
"epoch": 0.02361162455131951,
"grad_norm": 0.17167994379997253,
"learning_rate": 0.001,
"loss": 2.9449,
"num_input_tokens_seen": 1297612800,
"step": 4950
},
{
"epoch": 0.023850125809413644,
"grad_norm": 0.18532761931419373,
"learning_rate": 0.001,
"loss": 2.947,
"num_input_tokens_seen": 1310720000,
"step": 5000
},
{
"epoch": 0.023850125809413644,
"eval_loss": 2.8470754623413086,
"eval_runtime": 51.04,
"eval_samples_per_second": 97.962,
"eval_steps_per_second": 24.491,
"num_input_tokens_seen": 1310720000,
"step": 5000
},
{
"epoch": 0.024088627067507782,
"grad_norm": 0.21697266399860382,
"learning_rate": 0.001,
"loss": 2.963,
"num_input_tokens_seen": 1323827200,
"step": 5050
},
{
"epoch": 0.024327128325601917,
"grad_norm": 0.17018833756446838,
"learning_rate": 0.001,
"loss": 2.9453,
"num_input_tokens_seen": 1336934400,
"step": 5100
},
{
"epoch": 0.024565629583696055,
"grad_norm": 0.17473167181015015,
"learning_rate": 0.001,
"loss": 2.9516,
"num_input_tokens_seen": 1350041600,
"step": 5150
},
{
"epoch": 0.02480413084179019,
"grad_norm": 0.18488293886184692,
"learning_rate": 0.001,
"loss": 2.9404,
"num_input_tokens_seen": 1363148800,
"step": 5200
},
{
"epoch": 0.025042632099884325,
"grad_norm": 0.17348967492580414,
"learning_rate": 0.001,
"loss": 2.9275,
"num_input_tokens_seen": 1376256000,
"step": 5250
},
{
"epoch": 0.025281133357978464,
"grad_norm": 0.16547563672065735,
"learning_rate": 0.001,
"loss": 2.9464,
"num_input_tokens_seen": 1389363200,
"step": 5300
},
{
"epoch": 0.0255196346160726,
"grad_norm": 0.17538361251354218,
"learning_rate": 0.001,
"loss": 2.94,
"num_input_tokens_seen": 1402470400,
"step": 5350
},
{
"epoch": 0.025758135874166737,
"grad_norm": 0.17068558931350708,
"learning_rate": 0.001,
"loss": 2.9382,
"num_input_tokens_seen": 1415577600,
"step": 5400
},
{
"epoch": 0.025996637132260872,
"grad_norm": 0.17389337718486786,
"learning_rate": 0.001,
"loss": 2.9254,
"num_input_tokens_seen": 1428684800,
"step": 5450
},
{
"epoch": 0.02623513839035501,
"grad_norm": 0.17620491981506348,
"learning_rate": 0.001,
"loss": 2.9221,
"num_input_tokens_seen": 1441792000,
"step": 5500
},
{
"epoch": 0.02623513839035501,
"eval_loss": 2.8246922492980957,
"eval_runtime": 50.2832,
"eval_samples_per_second": 99.437,
"eval_steps_per_second": 24.859,
"num_input_tokens_seen": 1441792000,
"step": 5500
},
{
"epoch": 0.026473639648449145,
"grad_norm": 0.15889622271060944,
"learning_rate": 0.001,
"loss": 2.923,
"num_input_tokens_seen": 1454899200,
"step": 5550
},
{
"epoch": 0.026712140906543284,
"grad_norm": 0.17490123212337494,
"learning_rate": 0.001,
"loss": 2.9146,
"num_input_tokens_seen": 1468006400,
"step": 5600
},
{
"epoch": 0.02695064216463742,
"grad_norm": 0.17789559066295624,
"learning_rate": 0.001,
"loss": 2.9253,
"num_input_tokens_seen": 1481113600,
"step": 5650
},
{
"epoch": 0.027189143422731554,
"grad_norm": 0.17113780975341797,
"learning_rate": 0.001,
"loss": 2.9267,
"num_input_tokens_seen": 1494220800,
"step": 5700
},
{
"epoch": 0.027427644680825692,
"grad_norm": 0.1671907901763916,
"learning_rate": 0.001,
"loss": 2.9178,
"num_input_tokens_seen": 1507328000,
"step": 5750
},
{
"epoch": 0.027666145938919827,
"grad_norm": 0.17511603236198425,
"learning_rate": 0.001,
"loss": 2.9341,
"num_input_tokens_seen": 1520435200,
"step": 5800
},
{
"epoch": 0.027904647197013965,
"grad_norm": 0.1821524053812027,
"learning_rate": 0.001,
"loss": 2.9076,
"num_input_tokens_seen": 1533542400,
"step": 5850
},
{
"epoch": 0.0281431484551081,
"grad_norm": 0.16259051859378815,
"learning_rate": 0.001,
"loss": 2.9212,
"num_input_tokens_seen": 1546649600,
"step": 5900
},
{
"epoch": 0.02838164971320224,
"grad_norm": 0.18584352731704712,
"learning_rate": 0.001,
"loss": 2.927,
"num_input_tokens_seen": 1559756800,
"step": 5950
},
{
"epoch": 0.028620150971296374,
"grad_norm": 0.181602343916893,
"learning_rate": 0.001,
"loss": 2.9096,
"num_input_tokens_seen": 1572864000,
"step": 6000
},
{
"epoch": 0.028620150971296374,
"eval_loss": 2.8036017417907715,
"eval_runtime": 50.543,
"eval_samples_per_second": 98.926,
"eval_steps_per_second": 24.731,
"num_input_tokens_seen": 1572864000,
"step": 6000
},
{
"epoch": 0.02885865222939051,
"grad_norm": 0.1653270423412323,
"learning_rate": 0.001,
"loss": 2.9214,
"num_input_tokens_seen": 1585971200,
"step": 6050
},
{
"epoch": 0.029097153487484647,
"grad_norm": 0.17030183970928192,
"learning_rate": 0.001,
"loss": 2.9081,
"num_input_tokens_seen": 1599078400,
"step": 6100
},
{
"epoch": 0.029335654745578782,
"grad_norm": 0.17734774947166443,
"learning_rate": 0.001,
"loss": 2.9128,
"num_input_tokens_seen": 1612185600,
"step": 6150
},
{
"epoch": 0.02957415600367292,
"grad_norm": 0.1664343774318695,
"learning_rate": 0.001,
"loss": 2.9084,
"num_input_tokens_seen": 1625292800,
"step": 6200
},
{
"epoch": 0.029812657261767055,
"grad_norm": 0.15939603745937347,
"learning_rate": 0.001,
"loss": 2.9049,
"num_input_tokens_seen": 1638400000,
"step": 6250
},
{
"epoch": 0.030051158519861194,
"grad_norm": 0.16107864677906036,
"learning_rate": 0.001,
"loss": 2.8889,
"num_input_tokens_seen": 1651507200,
"step": 6300
},
{
"epoch": 0.03028965977795533,
"grad_norm": 0.1734771579504013,
"learning_rate": 0.001,
"loss": 2.8951,
"num_input_tokens_seen": 1664614400,
"step": 6350
},
{
"epoch": 0.030528161036049464,
"grad_norm": 0.1804204136133194,
"learning_rate": 0.001,
"loss": 2.8877,
"num_input_tokens_seen": 1677721600,
"step": 6400
},
{
"epoch": 0.030766662294143602,
"grad_norm": 0.16369500756263733,
"learning_rate": 0.001,
"loss": 2.8764,
"num_input_tokens_seen": 1690828800,
"step": 6450
},
{
"epoch": 0.031005163552237737,
"grad_norm": 0.1704144924879074,
"learning_rate": 0.001,
"loss": 2.8965,
"num_input_tokens_seen": 1703936000,
"step": 6500
},
{
"epoch": 0.031005163552237737,
"eval_loss": 2.787343740463257,
"eval_runtime": 50.3956,
"eval_samples_per_second": 99.215,
"eval_steps_per_second": 24.804,
"num_input_tokens_seen": 1703936000,
"step": 6500
},
{
"epoch": 0.031243664810331875,
"grad_norm": 0.17917555570602417,
"learning_rate": 0.001,
"loss": 2.8882,
"num_input_tokens_seen": 1717043200,
"step": 6550
},
{
"epoch": 0.031482166068426014,
"grad_norm": 0.18822412192821503,
"learning_rate": 0.001,
"loss": 2.8931,
"num_input_tokens_seen": 1730150400,
"step": 6600
},
{
"epoch": 0.031720667326520145,
"grad_norm": 0.1702752560377121,
"learning_rate": 0.001,
"loss": 2.8906,
"num_input_tokens_seen": 1743257600,
"step": 6650
},
{
"epoch": 0.031959168584614284,
"grad_norm": 0.16963082551956177,
"learning_rate": 0.001,
"loss": 2.8809,
"num_input_tokens_seen": 1756364800,
"step": 6700
},
{
"epoch": 0.03219766984270842,
"grad_norm": 0.17273569107055664,
"learning_rate": 0.001,
"loss": 2.9005,
"num_input_tokens_seen": 1769472000,
"step": 6750
},
{
"epoch": 0.032436171100802554,
"grad_norm": 0.21361888945102692,
"learning_rate": 0.001,
"loss": 2.8683,
"num_input_tokens_seen": 1782579200,
"step": 6800
},
{
"epoch": 0.03267467235889669,
"grad_norm": 0.16454364359378815,
"learning_rate": 0.001,
"loss": 2.8921,
"num_input_tokens_seen": 1795686400,
"step": 6850
},
{
"epoch": 0.03291317361699083,
"grad_norm": 0.1677432805299759,
"learning_rate": 0.001,
"loss": 2.8777,
"num_input_tokens_seen": 1808793600,
"step": 6900
},
{
"epoch": 0.03315167487508497,
"grad_norm": 0.17707760632038116,
"learning_rate": 0.001,
"loss": 2.8791,
"num_input_tokens_seen": 1821900800,
"step": 6950
},
{
"epoch": 0.0333901761331791,
"grad_norm": 0.1784796118736267,
"learning_rate": 0.001,
"loss": 2.8642,
"num_input_tokens_seen": 1835008000,
"step": 7000
},
{
"epoch": 0.0333901761331791,
"eval_loss": 2.7708475589752197,
"eval_runtime": 50.9058,
"eval_samples_per_second": 98.221,
"eval_steps_per_second": 24.555,
"num_input_tokens_seen": 1835008000,
"step": 7000
},
{
"epoch": 0.03362867739127324,
"grad_norm": 0.15859876573085785,
"learning_rate": 0.001,
"loss": 2.8919,
"num_input_tokens_seen": 1848115200,
"step": 7050
},
{
"epoch": 0.03386717864936738,
"grad_norm": 0.17061467468738556,
"learning_rate": 0.001,
"loss": 2.868,
"num_input_tokens_seen": 1861222400,
"step": 7100
},
{
"epoch": 0.03410567990746151,
"grad_norm": 0.17118851840496063,
"learning_rate": 0.001,
"loss": 2.8677,
"num_input_tokens_seen": 1874329600,
"step": 7150
},
{
"epoch": 0.03434418116555565,
"grad_norm": 0.1561940759420395,
"learning_rate": 0.001,
"loss": 2.8701,
"num_input_tokens_seen": 1887436800,
"step": 7200
},
{
"epoch": 0.034582682423649785,
"grad_norm": 0.17568449676036835,
"learning_rate": 0.001,
"loss": 2.8652,
"num_input_tokens_seen": 1900544000,
"step": 7250
},
{
"epoch": 0.034821183681743924,
"grad_norm": 0.17471665143966675,
"learning_rate": 0.001,
"loss": 2.8614,
"num_input_tokens_seen": 1913651200,
"step": 7300
},
{
"epoch": 0.035059684939838055,
"grad_norm": 0.17949970066547394,
"learning_rate": 0.001,
"loss": 2.862,
"num_input_tokens_seen": 1926758400,
"step": 7350
},
{
"epoch": 0.035298186197932194,
"grad_norm": 0.17014376819133759,
"learning_rate": 0.001,
"loss": 2.8696,
"num_input_tokens_seen": 1939865600,
"step": 7400
},
{
"epoch": 0.03553668745602633,
"grad_norm": 0.166939839720726,
"learning_rate": 0.001,
"loss": 2.8679,
"num_input_tokens_seen": 1952972800,
"step": 7450
},
{
"epoch": 0.035775188714120464,
"grad_norm": 0.16403459012508392,
"learning_rate": 0.001,
"loss": 2.8692,
"num_input_tokens_seen": 1966080000,
"step": 7500
},
{
"epoch": 0.035775188714120464,
"eval_loss": 2.7581658363342285,
"eval_runtime": 50.614,
"eval_samples_per_second": 98.787,
"eval_steps_per_second": 24.697,
"num_input_tokens_seen": 1966080000,
"step": 7500
},
{
"epoch": 0.0360136899722146,
"grad_norm": 0.16664361953735352,
"learning_rate": 0.001,
"loss": 2.8549,
"num_input_tokens_seen": 1979187200,
"step": 7550
},
{
"epoch": 0.03625219123030874,
"grad_norm": 0.165015310049057,
"learning_rate": 0.001,
"loss": 2.867,
"num_input_tokens_seen": 1992294400,
"step": 7600
},
{
"epoch": 0.03649069248840288,
"grad_norm": 0.17752580344676971,
"learning_rate": 0.001,
"loss": 2.8721,
"num_input_tokens_seen": 2005401600,
"step": 7650
},
{
"epoch": 0.03672919374649701,
"grad_norm": 0.1641317456960678,
"learning_rate": 0.001,
"loss": 2.8601,
"num_input_tokens_seen": 2018508800,
"step": 7700
},
{
"epoch": 0.03696769500459115,
"grad_norm": 0.1706378310918808,
"learning_rate": 0.001,
"loss": 2.8385,
"num_input_tokens_seen": 2031616000,
"step": 7750
},
{
"epoch": 0.03720619626268529,
"grad_norm": 0.18265438079833984,
"learning_rate": 0.001,
"loss": 2.8421,
"num_input_tokens_seen": 2044723200,
"step": 7800
},
{
"epoch": 0.037444697520779426,
"grad_norm": 0.17270897328853607,
"learning_rate": 0.001,
"loss": 2.8576,
"num_input_tokens_seen": 2057830400,
"step": 7850
},
{
"epoch": 0.03768319877887356,
"grad_norm": 0.17359280586242676,
"learning_rate": 0.001,
"loss": 2.8522,
"num_input_tokens_seen": 2070937600,
"step": 7900
},
{
"epoch": 0.037921700036967695,
"grad_norm": 0.1679411083459854,
"learning_rate": 0.001,
"loss": 2.854,
"num_input_tokens_seen": 2084044800,
"step": 7950
},
{
"epoch": 0.038160201295061834,
"grad_norm": 0.16735835373401642,
"learning_rate": 0.001,
"loss": 2.8494,
"num_input_tokens_seen": 2097152000,
"step": 8000
},
{
"epoch": 0.038160201295061834,
"eval_loss": 2.7443442344665527,
"eval_runtime": 50.3387,
"eval_samples_per_second": 99.327,
"eval_steps_per_second": 24.832,
"num_input_tokens_seen": 2097152000,
"step": 8000
},
{
"epoch": 0.038398702553155965,
"grad_norm": 0.16059577465057373,
"learning_rate": 0.001,
"loss": 2.8495,
"num_input_tokens_seen": 2110259200,
"step": 8050
},
{
"epoch": 0.038637203811250104,
"grad_norm": 0.1842387169599533,
"learning_rate": 0.001,
"loss": 2.8526,
"num_input_tokens_seen": 2123366400,
"step": 8100
},
{
"epoch": 0.03887570506934424,
"grad_norm": 0.15922050178050995,
"learning_rate": 0.001,
"loss": 2.8312,
"num_input_tokens_seen": 2136473600,
"step": 8150
},
{
"epoch": 0.03911420632743838,
"grad_norm": 0.16642028093338013,
"learning_rate": 0.001,
"loss": 2.8452,
"num_input_tokens_seen": 2149580800,
"step": 8200
},
{
"epoch": 0.03935270758553251,
"grad_norm": 0.16174671053886414,
"learning_rate": 0.001,
"loss": 2.8471,
"num_input_tokens_seen": 2162688000,
"step": 8250
},
{
"epoch": 0.03959120884362665,
"grad_norm": 0.16786591708660126,
"learning_rate": 0.001,
"loss": 2.8435,
"num_input_tokens_seen": 2175795200,
"step": 8300
},
{
"epoch": 0.03982971010172079,
"grad_norm": 0.17107373476028442,
"learning_rate": 0.001,
"loss": 2.862,
"num_input_tokens_seen": 2188902400,
"step": 8350
},
{
"epoch": 0.04006821135981492,
"grad_norm": 0.17952118813991547,
"learning_rate": 0.001,
"loss": 2.8414,
"num_input_tokens_seen": 2202009600,
"step": 8400
},
{
"epoch": 0.04030671261790906,
"grad_norm": 0.16836482286453247,
"learning_rate": 0.001,
"loss": 2.8363,
"num_input_tokens_seen": 2215116800,
"step": 8450
},
{
"epoch": 0.0405452138760032,
"grad_norm": 0.16812962293624878,
"learning_rate": 0.001,
"loss": 2.844,
"num_input_tokens_seen": 2228224000,
"step": 8500
},
{
"epoch": 0.0405452138760032,
"eval_loss": 2.7306976318359375,
"eval_runtime": 50.272,
"eval_samples_per_second": 99.459,
"eval_steps_per_second": 24.865,
"num_input_tokens_seen": 2228224000,
"step": 8500
},
{
"epoch": 0.040783715134097336,
"grad_norm": 0.1696135401725769,
"learning_rate": 0.001,
"loss": 2.8406,
"num_input_tokens_seen": 2241331200,
"step": 8550
},
{
"epoch": 0.04102221639219147,
"grad_norm": 0.16062459349632263,
"learning_rate": 0.001,
"loss": 2.8453,
"num_input_tokens_seen": 2254438400,
"step": 8600
},
{
"epoch": 0.041260717650285605,
"grad_norm": 0.17326433956623077,
"learning_rate": 0.001,
"loss": 2.8449,
"num_input_tokens_seen": 2267545600,
"step": 8650
},
{
"epoch": 0.041499218908379744,
"grad_norm": 0.16410672664642334,
"learning_rate": 0.001,
"loss": 2.8412,
"num_input_tokens_seen": 2280652800,
"step": 8700
},
{
"epoch": 0.041737720166473875,
"grad_norm": 0.16255012154579163,
"learning_rate": 0.001,
"loss": 2.8524,
"num_input_tokens_seen": 2293760000,
"step": 8750
},
{
"epoch": 0.041976221424568014,
"grad_norm": 0.163652241230011,
"learning_rate": 0.001,
"loss": 2.8528,
"num_input_tokens_seen": 2306867200,
"step": 8800
},
{
"epoch": 0.04221472268266215,
"grad_norm": 0.15598778426647186,
"learning_rate": 0.001,
"loss": 2.8255,
"num_input_tokens_seen": 2319974400,
"step": 8850
},
{
"epoch": 0.04245322394075629,
"grad_norm": 0.1740003079175949,
"learning_rate": 0.001,
"loss": 2.8278,
"num_input_tokens_seen": 2333081600,
"step": 8900
},
{
"epoch": 0.04269172519885042,
"grad_norm": 0.17225052416324615,
"learning_rate": 0.001,
"loss": 2.8334,
"num_input_tokens_seen": 2346188800,
"step": 8950
},
{
"epoch": 0.04293022645694456,
"grad_norm": 0.18005919456481934,
"learning_rate": 0.001,
"loss": 2.8044,
"num_input_tokens_seen": 2359296000,
"step": 9000
},
{
"epoch": 0.04293022645694456,
"eval_loss": 2.7220215797424316,
"eval_runtime": 50.2706,
"eval_samples_per_second": 99.462,
"eval_steps_per_second": 24.865,
"num_input_tokens_seen": 2359296000,
"step": 9000
},
{
"epoch": 0.0431687277150387,
"grad_norm": 0.16554109752178192,
"learning_rate": 0.001,
"loss": 2.83,
"num_input_tokens_seen": 2372403200,
"step": 9050
},
{
"epoch": 0.04340722897313283,
"grad_norm": 0.17308101058006287,
"learning_rate": 0.001,
"loss": 2.8204,
"num_input_tokens_seen": 2385510400,
"step": 9100
},
{
"epoch": 0.04364573023122697,
"grad_norm": 0.16701756417751312,
"learning_rate": 0.001,
"loss": 2.836,
"num_input_tokens_seen": 2398617600,
"step": 9150
},
{
"epoch": 0.04388423148932111,
"grad_norm": 0.16220535337924957,
"learning_rate": 0.001,
"loss": 2.8194,
"num_input_tokens_seen": 2411724800,
"step": 9200
},
{
"epoch": 0.044122732747415246,
"grad_norm": 0.16643071174621582,
"learning_rate": 0.001,
"loss": 2.8157,
"num_input_tokens_seen": 2424832000,
"step": 9250
},
{
"epoch": 0.04436123400550938,
"grad_norm": 0.16293680667877197,
"learning_rate": 0.001,
"loss": 2.8147,
"num_input_tokens_seen": 2437939200,
"step": 9300
},
{
"epoch": 0.044599735263603515,
"grad_norm": 0.1914059966802597,
"learning_rate": 0.001,
"loss": 2.8164,
"num_input_tokens_seen": 2451046400,
"step": 9350
},
{
"epoch": 0.044838236521697654,
"grad_norm": 0.15867285430431366,
"learning_rate": 0.001,
"loss": 2.8063,
"num_input_tokens_seen": 2464153600,
"step": 9400
},
{
"epoch": 0.045076737779791785,
"grad_norm": 0.16319462656974792,
"learning_rate": 0.001,
"loss": 2.8096,
"num_input_tokens_seen": 2477260800,
"step": 9450
},
{
"epoch": 0.045315239037885924,
"grad_norm": 0.16578581929206848,
"learning_rate": 0.001,
"loss": 2.8106,
"num_input_tokens_seen": 2490368000,
"step": 9500
},
{
"epoch": 0.045315239037885924,
"eval_loss": 2.7105066776275635,
"eval_runtime": 50.0838,
"eval_samples_per_second": 99.833,
"eval_steps_per_second": 24.958,
"num_input_tokens_seen": 2490368000,
"step": 9500
},
{
"epoch": 0.04555374029598006,
"grad_norm": 0.17125573754310608,
"learning_rate": 0.001,
"loss": 2.8259,
"num_input_tokens_seen": 2503475200,
"step": 9550
},
{
"epoch": 0.0457922415540742,
"grad_norm": 0.1661599725484848,
"learning_rate": 0.001,
"loss": 2.8109,
"num_input_tokens_seen": 2516582400,
"step": 9600
},
{
"epoch": 0.04603074281216833,
"grad_norm": 0.16203565895557404,
"learning_rate": 0.001,
"loss": 2.8198,
"num_input_tokens_seen": 2529689600,
"step": 9650
},
{
"epoch": 0.04626924407026247,
"grad_norm": 0.1869373619556427,
"learning_rate": 0.001,
"loss": 2.8163,
"num_input_tokens_seen": 2542796800,
"step": 9700
},
{
"epoch": 0.04650774532835661,
"grad_norm": 0.17401213943958282,
"learning_rate": 0.001,
"loss": 2.8209,
"num_input_tokens_seen": 2555904000,
"step": 9750
},
{
"epoch": 0.04674624658645074,
"grad_norm": 0.15835829079151154,
"learning_rate": 0.001,
"loss": 2.8032,
"num_input_tokens_seen": 2569011200,
"step": 9800
},
{
"epoch": 0.04698474784454488,
"grad_norm": 0.16554060578346252,
"learning_rate": 0.001,
"loss": 2.8072,
"num_input_tokens_seen": 2582118400,
"step": 9850
},
{
"epoch": 0.04722324910263902,
"grad_norm": 0.16941213607788086,
"learning_rate": 0.001,
"loss": 2.8076,
"num_input_tokens_seen": 2595225600,
"step": 9900
},
{
"epoch": 0.047461750360733156,
"grad_norm": 0.16324704885482788,
"learning_rate": 0.001,
"loss": 2.8097,
"num_input_tokens_seen": 2608332800,
"step": 9950
},
{
"epoch": 0.04770025161882729,
"grad_norm": 0.16865754127502441,
"learning_rate": 0.001,
"loss": 2.8051,
"num_input_tokens_seen": 2621440000,
"step": 10000
},
{
"epoch": 0.04770025161882729,
"eval_loss": 2.7000486850738525,
"eval_runtime": 50.3365,
"eval_samples_per_second": 99.331,
"eval_steps_per_second": 24.833,
"num_input_tokens_seen": 2621440000,
"step": 10000
},
{
"epoch": 0.047938752876921426,
"grad_norm": 0.17076526582241058,
"learning_rate": 0.001,
"loss": 2.8148,
"num_input_tokens_seen": 2634547200,
"step": 10050
},
{
"epoch": 0.048177254135015564,
"grad_norm": 0.1610497534275055,
"learning_rate": 0.001,
"loss": 2.8012,
"num_input_tokens_seen": 2647654400,
"step": 10100
},
{
"epoch": 0.048415755393109695,
"grad_norm": 0.15984536707401276,
"learning_rate": 0.001,
"loss": 2.8086,
"num_input_tokens_seen": 2660761600,
"step": 10150
},
{
"epoch": 0.048654256651203834,
"grad_norm": 0.21775834262371063,
"learning_rate": 0.001,
"loss": 2.8021,
"num_input_tokens_seen": 2673868800,
"step": 10200
},
{
"epoch": 0.04889275790929797,
"grad_norm": 0.1841157227754593,
"learning_rate": 0.001,
"loss": 2.8152,
"num_input_tokens_seen": 2686976000,
"step": 10250
},
{
"epoch": 0.04913125916739211,
"grad_norm": 0.17025424540042877,
"learning_rate": 0.001,
"loss": 2.8131,
"num_input_tokens_seen": 2700083200,
"step": 10300
},
{
"epoch": 0.04936976042548624,
"grad_norm": 0.1992417722940445,
"learning_rate": 0.001,
"loss": 2.8039,
"num_input_tokens_seen": 2713190400,
"step": 10350
},
{
"epoch": 0.04960826168358038,
"grad_norm": 0.1680469959974289,
"learning_rate": 0.001,
"loss": 2.7921,
"num_input_tokens_seen": 2726297600,
"step": 10400
},
{
"epoch": 0.04984676294167452,
"grad_norm": 0.18296252191066742,
"learning_rate": 0.001,
"loss": 2.8036,
"num_input_tokens_seen": 2739404800,
"step": 10450
},
{
"epoch": 0.05008526419976865,
"grad_norm": 0.16041898727416992,
"learning_rate": 0.001,
"loss": 2.7979,
"num_input_tokens_seen": 2752512000,
"step": 10500
},
{
"epoch": 0.05008526419976865,
"eval_loss": 2.6893723011016846,
"eval_runtime": 50.642,
"eval_samples_per_second": 98.732,
"eval_steps_per_second": 24.683,
"num_input_tokens_seen": 2752512000,
"step": 10500
},
{
"epoch": 0.05032376545786279,
"grad_norm": 0.16704030334949493,
"learning_rate": 0.001,
"loss": 2.79,
"num_input_tokens_seen": 2765619200,
"step": 10550
},
{
"epoch": 0.05056226671595693,
"grad_norm": 0.16553758084774017,
"learning_rate": 0.001,
"loss": 2.7964,
"num_input_tokens_seen": 2778726400,
"step": 10600
},
{
"epoch": 0.050800767974051066,
"grad_norm": 0.16027161478996277,
"learning_rate": 0.001,
"loss": 2.7937,
"num_input_tokens_seen": 2791833600,
"step": 10650
},
{
"epoch": 0.0510392692321452,
"grad_norm": 0.16177843511104584,
"learning_rate": 0.001,
"loss": 2.7957,
"num_input_tokens_seen": 2804940800,
"step": 10700
},
{
"epoch": 0.051277770490239336,
"grad_norm": 0.16713912785053253,
"learning_rate": 0.001,
"loss": 2.7949,
"num_input_tokens_seen": 2818048000,
"step": 10750
},
{
"epoch": 0.051516271748333474,
"grad_norm": 0.1815747618675232,
"learning_rate": 0.001,
"loss": 2.7915,
"num_input_tokens_seen": 2831155200,
"step": 10800
},
{
"epoch": 0.05175477300642761,
"grad_norm": 0.16732683777809143,
"learning_rate": 0.001,
"loss": 2.7994,
"num_input_tokens_seen": 2844262400,
"step": 10850
},
{
"epoch": 0.051993274264521744,
"grad_norm": 0.18305908143520355,
"learning_rate": 0.001,
"loss": 2.7888,
"num_input_tokens_seen": 2857369600,
"step": 10900
},
{
"epoch": 0.05223177552261588,
"grad_norm": 0.16450954973697662,
"learning_rate": 0.001,
"loss": 2.7834,
"num_input_tokens_seen": 2870476800,
"step": 10950
},
{
"epoch": 0.05247027678071002,
"grad_norm": 0.16485372185707092,
"learning_rate": 0.001,
"loss": 2.7976,
"num_input_tokens_seen": 2883584000,
"step": 11000
},
{
"epoch": 0.05247027678071002,
"eval_loss": 2.6825835704803467,
"eval_runtime": 50.1016,
"eval_samples_per_second": 99.797,
"eval_steps_per_second": 24.949,
"num_input_tokens_seen": 2883584000,
"step": 11000
},
{
"epoch": 0.05270877803880415,
"grad_norm": 0.1733204573392868,
"learning_rate": 0.001,
"loss": 2.7959,
"num_input_tokens_seen": 2896691200,
"step": 11050
},
{
"epoch": 0.05294727929689829,
"grad_norm": 0.16432546079158783,
"learning_rate": 0.001,
"loss": 2.793,
"num_input_tokens_seen": 2909798400,
"step": 11100
},
{
"epoch": 0.05318578055499243,
"grad_norm": 0.18369582295417786,
"learning_rate": 0.001,
"loss": 2.8149,
"num_input_tokens_seen": 2922905600,
"step": 11150
},
{
"epoch": 0.05342428181308657,
"grad_norm": 0.17782896757125854,
"learning_rate": 0.001,
"loss": 2.7878,
"num_input_tokens_seen": 2936012800,
"step": 11200
},
{
"epoch": 0.0536627830711807,
"grad_norm": 0.18320836126804352,
"learning_rate": 0.001,
"loss": 2.8159,
"num_input_tokens_seen": 2949120000,
"step": 11250
},
{
"epoch": 0.05390128432927484,
"grad_norm": 0.1667925864458084,
"learning_rate": 0.001,
"loss": 2.795,
"num_input_tokens_seen": 2962227200,
"step": 11300
},
{
"epoch": 0.054139785587368976,
"grad_norm": 0.19831301271915436,
"learning_rate": 0.001,
"loss": 2.7907,
"num_input_tokens_seen": 2975334400,
"step": 11350
},
{
"epoch": 0.05437828684546311,
"grad_norm": 0.1610182225704193,
"learning_rate": 0.001,
"loss": 2.774,
"num_input_tokens_seen": 2988441600,
"step": 11400
},
{
"epoch": 0.054616788103557246,
"grad_norm": 0.15938150882720947,
"learning_rate": 0.001,
"loss": 2.7766,
"num_input_tokens_seen": 3001548800,
"step": 11450
},
{
"epoch": 0.054855289361651384,
"grad_norm": 0.15737415850162506,
"learning_rate": 0.001,
"loss": 2.783,
"num_input_tokens_seen": 3014656000,
"step": 11500
},
{
"epoch": 0.054855289361651384,
"eval_loss": 2.6739256381988525,
"eval_runtime": 51.2462,
"eval_samples_per_second": 97.568,
"eval_steps_per_second": 24.392,
"num_input_tokens_seen": 3014656000,
"step": 11500
},
{
"epoch": 0.05509379061974552,
"grad_norm": 0.16538532078266144,
"learning_rate": 0.001,
"loss": 2.7966,
"num_input_tokens_seen": 3027763200,
"step": 11550
},
{
"epoch": 0.055332291877839654,
"grad_norm": 0.18035660684108734,
"learning_rate": 0.001,
"loss": 2.7789,
"num_input_tokens_seen": 3040870400,
"step": 11600
},
{
"epoch": 0.05557079313593379,
"grad_norm": 0.17831085622310638,
"learning_rate": 0.001,
"loss": 2.7962,
"num_input_tokens_seen": 3053977600,
"step": 11650
},
{
"epoch": 0.05580929439402793,
"grad_norm": 0.17723870277404785,
"learning_rate": 0.001,
"loss": 2.7791,
"num_input_tokens_seen": 3067084800,
"step": 11700
},
{
"epoch": 0.05604779565212206,
"grad_norm": 0.17663581669330597,
"learning_rate": 0.001,
"loss": 2.7696,
"num_input_tokens_seen": 3080192000,
"step": 11750
},
{
"epoch": 0.0562862969102162,
"grad_norm": 0.16684900224208832,
"learning_rate": 0.001,
"loss": 2.7762,
"num_input_tokens_seen": 3093299200,
"step": 11800
},
{
"epoch": 0.05652479816831034,
"grad_norm": 0.17407995462417603,
"learning_rate": 0.001,
"loss": 2.7767,
"num_input_tokens_seen": 3106406400,
"step": 11850
},
{
"epoch": 0.05676329942640448,
"grad_norm": 0.1750691831111908,
"learning_rate": 0.001,
"loss": 2.7785,
"num_input_tokens_seen": 3119513600,
"step": 11900
},
{
"epoch": 0.05700180068449861,
"grad_norm": 0.16576959192752838,
"learning_rate": 0.001,
"loss": 2.773,
"num_input_tokens_seen": 3132620800,
"step": 11950
},
{
"epoch": 0.05724030194259275,
"grad_norm": 0.16957831382751465,
"learning_rate": 0.001,
"loss": 2.7781,
"num_input_tokens_seen": 3145728000,
"step": 12000
},
{
"epoch": 0.05724030194259275,
"eval_loss": 2.6683335304260254,
"eval_runtime": 50.6428,
"eval_samples_per_second": 98.731,
"eval_steps_per_second": 24.683,
"num_input_tokens_seen": 3145728000,
"step": 12000
},
{
"epoch": 0.057478803200686886,
"grad_norm": 0.1645338237285614,
"learning_rate": 0.001,
"loss": 2.7709,
"num_input_tokens_seen": 3158835200,
"step": 12050
},
{
"epoch": 0.05771730445878102,
"grad_norm": 0.15848694741725922,
"learning_rate": 0.001,
"loss": 2.7849,
"num_input_tokens_seen": 3171942400,
"step": 12100
},
{
"epoch": 0.057955805716875156,
"grad_norm": 0.20003071427345276,
"learning_rate": 0.001,
"loss": 2.7691,
"num_input_tokens_seen": 3185049600,
"step": 12150
},
{
"epoch": 0.058194306974969294,
"grad_norm": 0.19301050901412964,
"learning_rate": 0.001,
"loss": 2.7811,
"num_input_tokens_seen": 3198156800,
"step": 12200
},
{
"epoch": 0.05843280823306343,
"grad_norm": 0.171390563249588,
"learning_rate": 0.001,
"loss": 2.7712,
"num_input_tokens_seen": 3211264000,
"step": 12250
},
{
"epoch": 0.058671309491157564,
"grad_norm": 0.1654270589351654,
"learning_rate": 0.001,
"loss": 2.7788,
"num_input_tokens_seen": 3224371200,
"step": 12300
},
{
"epoch": 0.0589098107492517,
"grad_norm": 0.16559672355651855,
"learning_rate": 0.001,
"loss": 2.7839,
"num_input_tokens_seen": 3237478400,
"step": 12350
},
{
"epoch": 0.05914831200734584,
"grad_norm": 0.16773344576358795,
"learning_rate": 0.001,
"loss": 2.7896,
"num_input_tokens_seen": 3250585600,
"step": 12400
},
{
"epoch": 0.05938681326543997,
"grad_norm": 0.1639021933078766,
"learning_rate": 0.001,
"loss": 2.7704,
"num_input_tokens_seen": 3263692800,
"step": 12450
},
{
"epoch": 0.05962531452353411,
"grad_norm": 0.15584540367126465,
"learning_rate": 0.001,
"loss": 2.7687,
"num_input_tokens_seen": 3276800000,
"step": 12500
},
{
"epoch": 0.05962531452353411,
"eval_loss": 2.6606011390686035,
"eval_runtime": 51.1636,
"eval_samples_per_second": 97.726,
"eval_steps_per_second": 24.431,
"num_input_tokens_seen": 3276800000,
"step": 12500
},
{
"epoch": 0.05986381578162825,
"grad_norm": 0.18144413828849792,
"learning_rate": 0.001,
"loss": 2.7711,
"num_input_tokens_seen": 3289907200,
"step": 12550
},
{
"epoch": 0.06010231703972239,
"grad_norm": 0.18225054442882538,
"learning_rate": 0.001,
"loss": 2.7675,
"num_input_tokens_seen": 3303014400,
"step": 12600
},
{
"epoch": 0.06034081829781652,
"grad_norm": 0.16542398929595947,
"learning_rate": 0.001,
"loss": 2.7563,
"num_input_tokens_seen": 3316121600,
"step": 12650
},
{
"epoch": 0.06057931955591066,
"grad_norm": 0.1765596568584442,
"learning_rate": 0.001,
"loss": 2.7807,
"num_input_tokens_seen": 3329228800,
"step": 12700
},
{
"epoch": 0.060817820814004796,
"grad_norm": 0.17469234764575958,
"learning_rate": 0.001,
"loss": 2.7532,
"num_input_tokens_seen": 3342336000,
"step": 12750
},
{
"epoch": 0.06105632207209893,
"grad_norm": 0.1841767281293869,
"learning_rate": 0.001,
"loss": 2.7824,
"num_input_tokens_seen": 3355443200,
"step": 12800
},
{
"epoch": 0.061294823330193066,
"grad_norm": 0.1667831838130951,
"learning_rate": 0.001,
"loss": 2.7648,
"num_input_tokens_seen": 3368550400,
"step": 12850
},
{
"epoch": 0.061533324588287204,
"grad_norm": 0.16561101377010345,
"learning_rate": 0.001,
"loss": 2.7798,
"num_input_tokens_seen": 3381657600,
"step": 12900
},
{
"epoch": 0.06177182584638134,
"grad_norm": 0.17370566725730896,
"learning_rate": 0.001,
"loss": 2.7755,
"num_input_tokens_seen": 3394764800,
"step": 12950
},
{
"epoch": 0.062010327104475474,
"grad_norm": 0.16871176660060883,
"learning_rate": 0.001,
"loss": 2.7676,
"num_input_tokens_seen": 3407872000,
"step": 13000
},
{
"epoch": 0.062010327104475474,
"eval_loss": 2.653367757797241,
"eval_runtime": 50.8399,
"eval_samples_per_second": 98.348,
"eval_steps_per_second": 24.587,
"num_input_tokens_seen": 3407872000,
"step": 13000
},
{
"epoch": 0.06224882836256961,
"grad_norm": 0.17592230439186096,
"learning_rate": 0.001,
"loss": 2.7639,
"num_input_tokens_seen": 3420979200,
"step": 13050
},
{
"epoch": 0.06248732962066375,
"grad_norm": 0.1640375405550003,
"learning_rate": 0.001,
"loss": 2.7785,
"num_input_tokens_seen": 3434086400,
"step": 13100
},
{
"epoch": 0.06272583087875788,
"grad_norm": 0.16389331221580505,
"learning_rate": 0.001,
"loss": 2.7475,
"num_input_tokens_seen": 3447193600,
"step": 13150
},
{
"epoch": 0.06296433213685203,
"grad_norm": 0.1733655482530594,
"learning_rate": 0.001,
"loss": 2.7538,
"num_input_tokens_seen": 3460300800,
"step": 13200
},
{
"epoch": 0.06320283339494616,
"grad_norm": 0.19206473231315613,
"learning_rate": 0.001,
"loss": 2.7819,
"num_input_tokens_seen": 3473408000,
"step": 13250
},
{
"epoch": 0.06344133465304029,
"grad_norm": 0.1841450184583664,
"learning_rate": 0.001,
"loss": 2.7701,
"num_input_tokens_seen": 3486515200,
"step": 13300
},
{
"epoch": 0.06367983591113444,
"grad_norm": 0.1701631247997284,
"learning_rate": 0.001,
"loss": 2.7587,
"num_input_tokens_seen": 3499622400,
"step": 13350
},
{
"epoch": 0.06391833716922857,
"grad_norm": 0.17068499326705933,
"learning_rate": 0.001,
"loss": 2.7589,
"num_input_tokens_seen": 3512729600,
"step": 13400
},
{
"epoch": 0.0641568384273227,
"grad_norm": 0.17927715182304382,
"learning_rate": 0.001,
"loss": 2.764,
"num_input_tokens_seen": 3525836800,
"step": 13450
},
{
"epoch": 0.06439533968541684,
"grad_norm": 0.19105768203735352,
"learning_rate": 0.001,
"loss": 2.7593,
"num_input_tokens_seen": 3538944000,
"step": 13500
},
{
"epoch": 0.06439533968541684,
"eval_loss": 2.6473631858825684,
"eval_runtime": 51.0846,
"eval_samples_per_second": 97.877,
"eval_steps_per_second": 24.469,
"num_input_tokens_seen": 3538944000,
"step": 13500
},
{
"epoch": 0.06463384094351098,
"grad_norm": 0.17262668907642365,
"learning_rate": 0.001,
"loss": 2.7522,
"num_input_tokens_seen": 3552051200,
"step": 13550
},
{
"epoch": 0.06487234220160511,
"grad_norm": 0.16810455918312073,
"learning_rate": 0.001,
"loss": 2.7664,
"num_input_tokens_seen": 3565158400,
"step": 13600
},
{
"epoch": 0.06511084345969925,
"grad_norm": 0.17312487959861755,
"learning_rate": 0.001,
"loss": 2.7557,
"num_input_tokens_seen": 3578265600,
"step": 13650
},
{
"epoch": 0.06534934471779338,
"grad_norm": 0.16985322535037994,
"learning_rate": 0.001,
"loss": 2.7449,
"num_input_tokens_seen": 3591372800,
"step": 13700
},
{
"epoch": 0.06558784597588753,
"grad_norm": 0.1812393069267273,
"learning_rate": 0.001,
"loss": 2.749,
"num_input_tokens_seen": 3604480000,
"step": 13750
},
{
"epoch": 0.06582634723398166,
"grad_norm": 0.183237224817276,
"learning_rate": 0.001,
"loss": 2.7637,
"num_input_tokens_seen": 3617587200,
"step": 13800
},
{
"epoch": 0.06606484849207579,
"grad_norm": 0.17770566046237946,
"learning_rate": 0.001,
"loss": 2.7602,
"num_input_tokens_seen": 3630694400,
"step": 13850
},
{
"epoch": 0.06630334975016994,
"grad_norm": 0.1678437739610672,
"learning_rate": 0.001,
"loss": 2.76,
"num_input_tokens_seen": 3643801600,
"step": 13900
},
{
"epoch": 0.06654185100826407,
"grad_norm": 0.16213107109069824,
"learning_rate": 0.001,
"loss": 2.7467,
"num_input_tokens_seen": 3656908800,
"step": 13950
},
{
"epoch": 0.0667803522663582,
"grad_norm": 0.17652907967567444,
"learning_rate": 0.001,
"loss": 2.7516,
"num_input_tokens_seen": 3670016000,
"step": 14000
},
{
"epoch": 0.0667803522663582,
"eval_loss": 2.6438815593719482,
"eval_runtime": 50.3233,
"eval_samples_per_second": 99.358,
"eval_steps_per_second": 24.839,
"num_input_tokens_seen": 3670016000,
"step": 14000
},
{
"epoch": 0.06701885352445235,
"grad_norm": 0.1785530298948288,
"learning_rate": 0.001,
"loss": 2.7475,
"num_input_tokens_seen": 3683123200,
"step": 14050
},
{
"epoch": 0.06725735478254648,
"grad_norm": 0.15644113719463348,
"learning_rate": 0.001,
"loss": 2.7541,
"num_input_tokens_seen": 3696230400,
"step": 14100
},
{
"epoch": 0.06749585604064061,
"grad_norm": 0.183272585272789,
"learning_rate": 0.001,
"loss": 2.7513,
"num_input_tokens_seen": 3709337600,
"step": 14150
},
{
"epoch": 0.06773435729873475,
"grad_norm": 0.17523212730884552,
"learning_rate": 0.001,
"loss": 2.7649,
"num_input_tokens_seen": 3722444800,
"step": 14200
},
{
"epoch": 0.06797285855682889,
"grad_norm": 0.1778247356414795,
"learning_rate": 0.001,
"loss": 2.7457,
"num_input_tokens_seen": 3735552000,
"step": 14250
},
{
"epoch": 0.06821135981492302,
"grad_norm": 0.18277810513973236,
"learning_rate": 0.001,
"loss": 2.7477,
"num_input_tokens_seen": 3748659200,
"step": 14300
},
{
"epoch": 0.06844986107301716,
"grad_norm": 0.17541366815567017,
"learning_rate": 0.001,
"loss": 2.7419,
"num_input_tokens_seen": 3761766400,
"step": 14350
},
{
"epoch": 0.0686883623311113,
"grad_norm": 0.1701425164937973,
"learning_rate": 0.001,
"loss": 2.7437,
"num_input_tokens_seen": 3774873600,
"step": 14400
},
{
"epoch": 0.06892686358920544,
"grad_norm": 0.16685517132282257,
"learning_rate": 0.001,
"loss": 2.7357,
"num_input_tokens_seen": 3787980800,
"step": 14450
},
{
"epoch": 0.06916536484729957,
"grad_norm": 0.1738167405128479,
"learning_rate": 0.001,
"loss": 2.7475,
"num_input_tokens_seen": 3801088000,
"step": 14500
},
{
"epoch": 0.06916536484729957,
"eval_loss": 2.635887622833252,
"eval_runtime": 50.4516,
"eval_samples_per_second": 99.105,
"eval_steps_per_second": 24.776,
"num_input_tokens_seen": 3801088000,
"step": 14500
},
{
"epoch": 0.0694038661053937,
"grad_norm": 0.18279027938842773,
"learning_rate": 0.001,
"loss": 2.7521,
"num_input_tokens_seen": 3814195200,
"step": 14550
},
{
"epoch": 0.06964236736348785,
"grad_norm": 0.1878173053264618,
"learning_rate": 0.001,
"loss": 2.7401,
"num_input_tokens_seen": 3827302400,
"step": 14600
},
{
"epoch": 0.06988086862158198,
"grad_norm": 0.17670077085494995,
"learning_rate": 0.001,
"loss": 2.7513,
"num_input_tokens_seen": 3840409600,
"step": 14650
},
{
"epoch": 0.07011936987967611,
"grad_norm": 0.17042580246925354,
"learning_rate": 0.001,
"loss": 2.7383,
"num_input_tokens_seen": 3853516800,
"step": 14700
},
{
"epoch": 0.07035787113777026,
"grad_norm": 0.17193050682544708,
"learning_rate": 0.001,
"loss": 2.7408,
"num_input_tokens_seen": 3866624000,
"step": 14750
},
{
"epoch": 0.07059637239586439,
"grad_norm": 0.16576342284679413,
"learning_rate": 0.001,
"loss": 2.7312,
"num_input_tokens_seen": 3879731200,
"step": 14800
},
{
"epoch": 0.07083487365395852,
"grad_norm": 0.18535619974136353,
"learning_rate": 0.001,
"loss": 2.756,
"num_input_tokens_seen": 3892838400,
"step": 14850
},
{
"epoch": 0.07107337491205266,
"grad_norm": 0.1729886531829834,
"learning_rate": 0.001,
"loss": 2.751,
"num_input_tokens_seen": 3905945600,
"step": 14900
},
{
"epoch": 0.0713118761701468,
"grad_norm": 0.16047705709934235,
"learning_rate": 0.001,
"loss": 2.7361,
"num_input_tokens_seen": 3919052800,
"step": 14950
},
{
"epoch": 0.07155037742824093,
"grad_norm": 0.17655611038208008,
"learning_rate": 0.001,
"loss": 2.7471,
"num_input_tokens_seen": 3932160000,
"step": 15000
},
{
"epoch": 0.07155037742824093,
"eval_loss": 2.6311256885528564,
"eval_runtime": 51.0361,
"eval_samples_per_second": 97.97,
"eval_steps_per_second": 24.492,
"num_input_tokens_seen": 3932160000,
"step": 15000
},
{
"epoch": 0.07178887868633507,
"grad_norm": 0.19243250787258148,
"learning_rate": 0.001,
"loss": 2.7551,
"num_input_tokens_seen": 3945267200,
"step": 15050
},
{
"epoch": 0.0720273799444292,
"grad_norm": 0.17328651249408722,
"learning_rate": 0.001,
"loss": 2.7346,
"num_input_tokens_seen": 3958374400,
"step": 15100
},
{
"epoch": 0.07226588120252335,
"grad_norm": 0.16357752680778503,
"learning_rate": 0.001,
"loss": 2.7523,
"num_input_tokens_seen": 3971481600,
"step": 15150
},
{
"epoch": 0.07250438246061748,
"grad_norm": 0.1726733148097992,
"learning_rate": 0.001,
"loss": 2.725,
"num_input_tokens_seen": 3984588800,
"step": 15200
},
{
"epoch": 0.07274288371871161,
"grad_norm": 0.16912953555583954,
"learning_rate": 0.001,
"loss": 2.738,
"num_input_tokens_seen": 3997696000,
"step": 15250
},
{
"epoch": 0.07298138497680576,
"grad_norm": 0.19751113653182983,
"learning_rate": 0.001,
"loss": 2.7532,
"num_input_tokens_seen": 4010803200,
"step": 15300
},
{
"epoch": 0.07321988623489989,
"grad_norm": 0.16762405633926392,
"learning_rate": 0.001,
"loss": 2.7413,
"num_input_tokens_seen": 4023910400,
"step": 15350
},
{
"epoch": 0.07345838749299402,
"grad_norm": 0.18106459081172943,
"learning_rate": 0.001,
"loss": 2.7411,
"num_input_tokens_seen": 4037017600,
"step": 15400
},
{
"epoch": 0.07369688875108817,
"grad_norm": 0.184165820479393,
"learning_rate": 0.001,
"loss": 2.7449,
"num_input_tokens_seen": 4050124800,
"step": 15450
},
{
"epoch": 0.0739353900091823,
"grad_norm": 0.16832765936851501,
"learning_rate": 0.001,
"loss": 2.7442,
"num_input_tokens_seen": 4063232000,
"step": 15500
},
{
"epoch": 0.0739353900091823,
"eval_loss": 2.6253247261047363,
"eval_runtime": 50.6964,
"eval_samples_per_second": 98.626,
"eval_steps_per_second": 24.657,
"num_input_tokens_seen": 4063232000,
"step": 15500
},
{
"epoch": 0.07417389126727643,
"grad_norm": 0.1663861721754074,
"learning_rate": 0.001,
"loss": 2.7461,
"num_input_tokens_seen": 4076339200,
"step": 15550
},
{
"epoch": 0.07441239252537057,
"grad_norm": 0.17217928171157837,
"learning_rate": 0.001,
"loss": 2.7394,
"num_input_tokens_seen": 4089446400,
"step": 15600
},
{
"epoch": 0.0746508937834647,
"grad_norm": 0.17169134318828583,
"learning_rate": 0.001,
"loss": 2.7474,
"num_input_tokens_seen": 4102553600,
"step": 15650
},
{
"epoch": 0.07488939504155885,
"grad_norm": 0.17074033617973328,
"learning_rate": 0.001,
"loss": 2.7405,
"num_input_tokens_seen": 4115660800,
"step": 15700
},
{
"epoch": 0.07512789629965298,
"grad_norm": 0.20199435949325562,
"learning_rate": 0.001,
"loss": 2.7412,
"num_input_tokens_seen": 4128768000,
"step": 15750
},
{
"epoch": 0.07536639755774711,
"grad_norm": 0.17569150030612946,
"learning_rate": 0.001,
"loss": 2.7279,
"num_input_tokens_seen": 4141875200,
"step": 15800
},
{
"epoch": 0.07560489881584126,
"grad_norm": 0.1753721386194229,
"learning_rate": 0.001,
"loss": 2.7442,
"num_input_tokens_seen": 4154982400,
"step": 15850
},
{
"epoch": 0.07584340007393539,
"grad_norm": 0.17356647551059723,
"learning_rate": 0.001,
"loss": 2.7447,
"num_input_tokens_seen": 4168089600,
"step": 15900
},
{
"epoch": 0.07608190133202952,
"grad_norm": 0.16931213438510895,
"learning_rate": 0.001,
"loss": 2.7194,
"num_input_tokens_seen": 4181196800,
"step": 15950
},
{
"epoch": 0.07632040259012367,
"grad_norm": 0.2109583616256714,
"learning_rate": 0.001,
"loss": 2.7271,
"num_input_tokens_seen": 4194304000,
"step": 16000
},
{
"epoch": 0.07632040259012367,
"eval_loss": 2.6222195625305176,
"eval_runtime": 50.2121,
"eval_samples_per_second": 99.578,
"eval_steps_per_second": 24.894,
"num_input_tokens_seen": 4194304000,
"step": 16000
},
{
"epoch": 0.0765589038482178,
"grad_norm": 0.1729741096496582,
"learning_rate": 0.001,
"loss": 2.7273,
"num_input_tokens_seen": 4207411200,
"step": 16050
},
{
"epoch": 0.07679740510631193,
"grad_norm": 0.178414985537529,
"learning_rate": 0.001,
"loss": 2.7119,
"num_input_tokens_seen": 4220518400,
"step": 16100
},
{
"epoch": 0.07703590636440608,
"grad_norm": 0.16985353827476501,
"learning_rate": 0.001,
"loss": 2.7329,
"num_input_tokens_seen": 4233625600,
"step": 16150
},
{
"epoch": 0.07727440762250021,
"grad_norm": 0.1792905330657959,
"learning_rate": 0.001,
"loss": 2.7331,
"num_input_tokens_seen": 4246732800,
"step": 16200
},
{
"epoch": 0.07751290888059434,
"grad_norm": 0.17052733898162842,
"learning_rate": 0.001,
"loss": 2.7438,
"num_input_tokens_seen": 4259840000,
"step": 16250
},
{
"epoch": 0.07775141013868848,
"grad_norm": 0.18520629405975342,
"learning_rate": 0.001,
"loss": 2.7292,
"num_input_tokens_seen": 4272947200,
"step": 16300
},
{
"epoch": 0.07798991139678262,
"grad_norm": 0.18607158958911896,
"learning_rate": 0.001,
"loss": 2.7305,
"num_input_tokens_seen": 4286054400,
"step": 16350
},
{
"epoch": 0.07822841265487676,
"grad_norm": 0.1774805337190628,
"learning_rate": 0.001,
"loss": 2.7237,
"num_input_tokens_seen": 4299161600,
"step": 16400
},
{
"epoch": 0.07846691391297089,
"grad_norm": 0.17118123173713684,
"learning_rate": 0.001,
"loss": 2.736,
"num_input_tokens_seen": 4312268800,
"step": 16450
},
{
"epoch": 0.07870541517106502,
"grad_norm": 0.18550898134708405,
"learning_rate": 0.001,
"loss": 2.7237,
"num_input_tokens_seen": 4325376000,
"step": 16500
},
{
"epoch": 0.07870541517106502,
"eval_loss": 2.6178503036499023,
"eval_runtime": 50.4959,
"eval_samples_per_second": 99.018,
"eval_steps_per_second": 24.754,
"num_input_tokens_seen": 4325376000,
"step": 16500
},
{
"epoch": 0.07894391642915917,
"grad_norm": 0.1839551031589508,
"learning_rate": 0.001,
"loss": 2.7312,
"num_input_tokens_seen": 4338483200,
"step": 16550
},
{
"epoch": 0.0791824176872533,
"grad_norm": 0.17430303990840912,
"learning_rate": 0.001,
"loss": 2.7138,
"num_input_tokens_seen": 4351590400,
"step": 16600
},
{
"epoch": 0.07942091894534743,
"grad_norm": 0.17208248376846313,
"learning_rate": 0.001,
"loss": 2.7459,
"num_input_tokens_seen": 4364697600,
"step": 16650
},
{
"epoch": 0.07965942020344158,
"grad_norm": 0.16932401061058044,
"learning_rate": 0.001,
"loss": 2.7358,
"num_input_tokens_seen": 4377804800,
"step": 16700
},
{
"epoch": 0.07989792146153571,
"grad_norm": 0.17707890272140503,
"learning_rate": 0.001,
"loss": 2.7169,
"num_input_tokens_seen": 4390912000,
"step": 16750
},
{
"epoch": 0.08013642271962984,
"grad_norm": 0.1669357717037201,
"learning_rate": 0.001,
"loss": 2.7296,
"num_input_tokens_seen": 4404019200,
"step": 16800
},
{
"epoch": 0.08037492397772399,
"grad_norm": 0.19266557693481445,
"learning_rate": 0.001,
"loss": 2.7187,
"num_input_tokens_seen": 4417126400,
"step": 16850
},
{
"epoch": 0.08061342523581812,
"grad_norm": 0.17670407891273499,
"learning_rate": 0.001,
"loss": 2.7339,
"num_input_tokens_seen": 4430233600,
"step": 16900
},
{
"epoch": 0.08085192649391225,
"grad_norm": 0.17866192758083344,
"learning_rate": 0.001,
"loss": 2.7167,
"num_input_tokens_seen": 4443340800,
"step": 16950
},
{
"epoch": 0.0810904277520064,
"grad_norm": 0.18247559666633606,
"learning_rate": 0.001,
"loss": 2.7151,
"num_input_tokens_seen": 4456448000,
"step": 17000
},
{
"epoch": 0.0810904277520064,
"eval_loss": 2.6127233505249023,
"eval_runtime": 50.7555,
"eval_samples_per_second": 98.512,
"eval_steps_per_second": 24.628,
"num_input_tokens_seen": 4456448000,
"step": 17000
},
{
"epoch": 0.08132892901010053,
"grad_norm": 0.17702773213386536,
"learning_rate": 0.001,
"loss": 2.7303,
"num_input_tokens_seen": 4469555200,
"step": 17050
},
{
"epoch": 0.08156743026819467,
"grad_norm": 0.18900151550769806,
"learning_rate": 0.001,
"loss": 2.7286,
"num_input_tokens_seen": 4482662400,
"step": 17100
},
{
"epoch": 0.0818059315262888,
"grad_norm": 0.18566136062145233,
"learning_rate": 0.001,
"loss": 2.7304,
"num_input_tokens_seen": 4495769600,
"step": 17150
},
{
"epoch": 0.08204443278438293,
"grad_norm": 0.1759686917066574,
"learning_rate": 0.001,
"loss": 2.7179,
"num_input_tokens_seen": 4508876800,
"step": 17200
},
{
"epoch": 0.08228293404247708,
"grad_norm": 0.15799184143543243,
"learning_rate": 0.001,
"loss": 2.7367,
"num_input_tokens_seen": 4521984000,
"step": 17250
},
{
"epoch": 0.08252143530057121,
"grad_norm": 0.18740351498126984,
"learning_rate": 0.001,
"loss": 2.72,
"num_input_tokens_seen": 4535091200,
"step": 17300
},
{
"epoch": 0.08275993655866534,
"grad_norm": 0.17688381671905518,
"learning_rate": 0.001,
"loss": 2.7115,
"num_input_tokens_seen": 4548198400,
"step": 17350
},
{
"epoch": 0.08299843781675949,
"grad_norm": 0.1807299256324768,
"learning_rate": 0.001,
"loss": 2.7346,
"num_input_tokens_seen": 4561305600,
"step": 17400
},
{
"epoch": 0.08323693907485362,
"grad_norm": 0.17570430040359497,
"learning_rate": 0.001,
"loss": 2.7141,
"num_input_tokens_seen": 4574412800,
"step": 17450
},
{
"epoch": 0.08347544033294775,
"grad_norm": 0.16912159323692322,
"learning_rate": 0.001,
"loss": 2.7164,
"num_input_tokens_seen": 4587520000,
"step": 17500
},
{
"epoch": 0.08347544033294775,
"eval_loss": 2.6085972785949707,
"eval_runtime": 50.619,
"eval_samples_per_second": 98.777,
"eval_steps_per_second": 24.694,
"num_input_tokens_seen": 4587520000,
"step": 17500
},
{
"epoch": 0.0837139415910419,
"grad_norm": 0.17684704065322876,
"learning_rate": 0.001,
"loss": 2.7293,
"num_input_tokens_seen": 4600627200,
"step": 17550
},
{
"epoch": 0.08395244284913603,
"grad_norm": 0.18020550906658173,
"learning_rate": 0.001,
"loss": 2.7124,
"num_input_tokens_seen": 4613734400,
"step": 17600
},
{
"epoch": 0.08419094410723016,
"grad_norm": 0.17311082780361176,
"learning_rate": 0.001,
"loss": 2.7047,
"num_input_tokens_seen": 4626841600,
"step": 17650
},
{
"epoch": 0.0844294453653243,
"grad_norm": 0.17366532981395721,
"learning_rate": 0.001,
"loss": 2.7316,
"num_input_tokens_seen": 4639948800,
"step": 17700
},
{
"epoch": 0.08466794662341844,
"grad_norm": 0.16526220738887787,
"learning_rate": 0.001,
"loss": 2.7212,
"num_input_tokens_seen": 4653056000,
"step": 17750
},
{
"epoch": 0.08490644788151258,
"grad_norm": 0.1746092140674591,
"learning_rate": 0.001,
"loss": 2.7288,
"num_input_tokens_seen": 4666163200,
"step": 17800
},
{
"epoch": 0.08514494913960671,
"grad_norm": 0.19404995441436768,
"learning_rate": 0.001,
"loss": 2.7129,
"num_input_tokens_seen": 4679270400,
"step": 17850
},
{
"epoch": 0.08538345039770084,
"grad_norm": 0.18850015103816986,
"learning_rate": 0.001,
"loss": 2.7161,
"num_input_tokens_seen": 4692377600,
"step": 17900
},
{
"epoch": 0.08562195165579499,
"grad_norm": 0.19126516580581665,
"learning_rate": 0.001,
"loss": 2.7206,
"num_input_tokens_seen": 4705484800,
"step": 17950
},
{
"epoch": 0.08586045291388912,
"grad_norm": 0.1802307665348053,
"learning_rate": 0.001,
"loss": 2.7163,
"num_input_tokens_seen": 4718592000,
"step": 18000
},
{
"epoch": 0.08586045291388912,
"eval_loss": 2.603686809539795,
"eval_runtime": 50.6488,
"eval_samples_per_second": 98.719,
"eval_steps_per_second": 24.68,
"num_input_tokens_seen": 4718592000,
"step": 18000
},
{
"epoch": 0.08609895417198325,
"grad_norm": 0.18276441097259521,
"learning_rate": 0.001,
"loss": 2.7299,
"num_input_tokens_seen": 4731699200,
"step": 18050
},
{
"epoch": 0.0863374554300774,
"grad_norm": 0.17280028760433197,
"learning_rate": 0.001,
"loss": 2.7175,
"num_input_tokens_seen": 4744806400,
"step": 18100
},
{
"epoch": 0.08657595668817153,
"grad_norm": 0.17224080860614777,
"learning_rate": 0.001,
"loss": 2.7089,
"num_input_tokens_seen": 4757913600,
"step": 18150
},
{
"epoch": 0.08681445794626566,
"grad_norm": 0.17205391824245453,
"learning_rate": 0.001,
"loss": 2.7072,
"num_input_tokens_seen": 4771020800,
"step": 18200
},
{
"epoch": 0.0870529592043598,
"grad_norm": 0.1829432249069214,
"learning_rate": 0.001,
"loss": 2.6959,
"num_input_tokens_seen": 4784128000,
"step": 18250
},
{
"epoch": 0.08729146046245394,
"grad_norm": 0.1669514924287796,
"learning_rate": 0.001,
"loss": 2.7209,
"num_input_tokens_seen": 4797235200,
"step": 18300
},
{
"epoch": 0.08752996172054807,
"grad_norm": 0.18273359537124634,
"learning_rate": 0.001,
"loss": 2.6935,
"num_input_tokens_seen": 4810342400,
"step": 18350
},
{
"epoch": 0.08776846297864221,
"grad_norm": 0.21061965823173523,
"learning_rate": 0.001,
"loss": 2.7204,
"num_input_tokens_seen": 4823449600,
"step": 18400
},
{
"epoch": 0.08800696423673635,
"grad_norm": 0.17710614204406738,
"learning_rate": 0.001,
"loss": 2.7231,
"num_input_tokens_seen": 4836556800,
"step": 18450
},
{
"epoch": 0.08824546549483049,
"grad_norm": 0.17370648682117462,
"learning_rate": 0.001,
"loss": 2.7064,
"num_input_tokens_seen": 4849664000,
"step": 18500
},
{
"epoch": 0.08824546549483049,
"eval_loss": 2.600076913833618,
"eval_runtime": 50.3596,
"eval_samples_per_second": 99.286,
"eval_steps_per_second": 24.822,
"num_input_tokens_seen": 4849664000,
"step": 18500
},
{
"epoch": 0.08848396675292462,
"grad_norm": 0.19398675858974457,
"learning_rate": 0.001,
"loss": 2.7118,
"num_input_tokens_seen": 4862771200,
"step": 18550
},
{
"epoch": 0.08872246801101875,
"grad_norm": 0.18522043526172638,
"learning_rate": 0.001,
"loss": 2.6998,
"num_input_tokens_seen": 4875878400,
"step": 18600
},
{
"epoch": 0.0889609692691129,
"grad_norm": 0.2682952880859375,
"learning_rate": 0.001,
"loss": 2.7057,
"num_input_tokens_seen": 4888985600,
"step": 18650
},
{
"epoch": 0.08919947052720703,
"grad_norm": 0.18555712699890137,
"learning_rate": 0.001,
"loss": 2.7127,
"num_input_tokens_seen": 4902092800,
"step": 18700
},
{
"epoch": 0.08943797178530116,
"grad_norm": 0.1940859854221344,
"learning_rate": 0.001,
"loss": 2.7054,
"num_input_tokens_seen": 4915200000,
"step": 18750
},
{
"epoch": 0.08967647304339531,
"grad_norm": 0.1800539344549179,
"learning_rate": 0.001,
"loss": 2.702,
"num_input_tokens_seen": 4928307200,
"step": 18800
},
{
"epoch": 0.08991497430148944,
"grad_norm": 0.19734695553779602,
"learning_rate": 0.001,
"loss": 2.7157,
"num_input_tokens_seen": 4941414400,
"step": 18850
},
{
"epoch": 0.09015347555958357,
"grad_norm": 0.16387026011943817,
"learning_rate": 0.001,
"loss": 2.7183,
"num_input_tokens_seen": 4954521600,
"step": 18900
},
{
"epoch": 0.09039197681767772,
"grad_norm": 0.19447770714759827,
"learning_rate": 0.001,
"loss": 2.7154,
"num_input_tokens_seen": 4967628800,
"step": 18950
},
{
"epoch": 0.09063047807577185,
"grad_norm": 0.17366571724414825,
"learning_rate": 0.001,
"loss": 2.6996,
"num_input_tokens_seen": 4980736000,
"step": 19000
},
{
"epoch": 0.09063047807577185,
"eval_loss": 2.5983569622039795,
"eval_runtime": 50.8055,
"eval_samples_per_second": 98.415,
"eval_steps_per_second": 24.604,
"num_input_tokens_seen": 4980736000,
"step": 19000
},
{
"epoch": 0.09086897933386599,
"grad_norm": 0.1770928055047989,
"learning_rate": 0.001,
"loss": 2.7171,
"num_input_tokens_seen": 4993843200,
"step": 19050
},
{
"epoch": 0.09110748059196012,
"grad_norm": 0.18122689425945282,
"learning_rate": 0.001,
"loss": 2.7064,
"num_input_tokens_seen": 5006950400,
"step": 19100
},
{
"epoch": 0.09134598185005426,
"grad_norm": 0.19320747256278992,
"learning_rate": 0.001,
"loss": 2.7105,
"num_input_tokens_seen": 5020057600,
"step": 19150
},
{
"epoch": 0.0915844831081484,
"grad_norm": 0.19556616246700287,
"learning_rate": 0.001,
"loss": 2.7018,
"num_input_tokens_seen": 5033164800,
"step": 19200
},
{
"epoch": 0.09182298436624253,
"grad_norm": 0.18251653015613556,
"learning_rate": 0.001,
"loss": 2.7067,
"num_input_tokens_seen": 5046272000,
"step": 19250
},
{
"epoch": 0.09206148562433666,
"grad_norm": 0.17226757109165192,
"learning_rate": 0.001,
"loss": 2.6803,
"num_input_tokens_seen": 5059379200,
"step": 19300
},
{
"epoch": 0.09229998688243081,
"grad_norm": 0.18007858097553253,
"learning_rate": 0.001,
"loss": 2.6998,
"num_input_tokens_seen": 5072486400,
"step": 19350
},
{
"epoch": 0.09253848814052494,
"grad_norm": 0.1664605736732483,
"learning_rate": 0.001,
"loss": 2.6985,
"num_input_tokens_seen": 5085593600,
"step": 19400
},
{
"epoch": 0.09277698939861907,
"grad_norm": 0.17898677289485931,
"learning_rate": 0.001,
"loss": 2.7034,
"num_input_tokens_seen": 5098700800,
"step": 19450
},
{
"epoch": 0.09301549065671322,
"grad_norm": 0.16403160989284515,
"learning_rate": 0.001,
"loss": 2.7012,
"num_input_tokens_seen": 5111808000,
"step": 19500
},
{
"epoch": 0.09301549065671322,
"eval_loss": 2.594251871109009,
"eval_runtime": 50.1924,
"eval_samples_per_second": 99.617,
"eval_steps_per_second": 24.904,
"num_input_tokens_seen": 5111808000,
"step": 19500
},
{
"epoch": 0.09325399191480735,
"grad_norm": 0.17973001301288605,
"learning_rate": 0.001,
"loss": 2.7039,
"num_input_tokens_seen": 5124915200,
"step": 19550
},
{
"epoch": 0.09349249317290148,
"grad_norm": 0.1667868047952652,
"learning_rate": 0.001,
"loss": 2.7038,
"num_input_tokens_seen": 5138022400,
"step": 19600
},
{
"epoch": 0.09373099443099563,
"grad_norm": 0.18338319659233093,
"learning_rate": 0.001,
"loss": 2.7138,
"num_input_tokens_seen": 5151129600,
"step": 19650
},
{
"epoch": 0.09396949568908976,
"grad_norm": 0.17962965369224548,
"learning_rate": 0.001,
"loss": 2.6994,
"num_input_tokens_seen": 5164236800,
"step": 19700
},
{
"epoch": 0.0942079969471839,
"grad_norm": 0.17233812808990479,
"learning_rate": 0.001,
"loss": 2.7073,
"num_input_tokens_seen": 5177344000,
"step": 19750
},
{
"epoch": 0.09444649820527803,
"grad_norm": 0.16720129549503326,
"learning_rate": 0.001,
"loss": 2.7146,
"num_input_tokens_seen": 5190451200,
"step": 19800
},
{
"epoch": 0.09468499946337217,
"grad_norm": 0.1732376664876938,
"learning_rate": 0.001,
"loss": 2.7126,
"num_input_tokens_seen": 5203558400,
"step": 19850
},
{
"epoch": 0.09492350072146631,
"grad_norm": 0.17245380580425262,
"learning_rate": 0.001,
"loss": 2.7054,
"num_input_tokens_seen": 5216665600,
"step": 19900
},
{
"epoch": 0.09516200197956044,
"grad_norm": 0.17415107786655426,
"learning_rate": 0.001,
"loss": 2.7027,
"num_input_tokens_seen": 5229772800,
"step": 19950
},
{
"epoch": 0.09540050323765457,
"grad_norm": 0.1747124344110489,
"learning_rate": 0.001,
"loss": 2.6975,
"num_input_tokens_seen": 5242880000,
"step": 20000
},
{
"epoch": 0.09540050323765457,
"eval_loss": 2.5900332927703857,
"eval_runtime": 50.5898,
"eval_samples_per_second": 98.834,
"eval_steps_per_second": 24.709,
"num_input_tokens_seen": 5242880000,
"step": 20000
},
{
"epoch": 0.09563900449574872,
"grad_norm": 0.17750214040279388,
"learning_rate": 0.001,
"loss": 2.7,
"num_input_tokens_seen": 5255987200,
"step": 20050
},
{
"epoch": 0.09587750575384285,
"grad_norm": 0.16490615904331207,
"learning_rate": 0.001,
"loss": 2.7188,
"num_input_tokens_seen": 5269094400,
"step": 20100
},
{
"epoch": 0.09611600701193698,
"grad_norm": 0.20347309112548828,
"learning_rate": 0.001,
"loss": 2.7034,
"num_input_tokens_seen": 5282201600,
"step": 20150
},
{
"epoch": 0.09635450827003113,
"grad_norm": 0.19717667996883392,
"learning_rate": 0.001,
"loss": 2.6864,
"num_input_tokens_seen": 5295308800,
"step": 20200
},
{
"epoch": 0.09659300952812526,
"grad_norm": 0.17054997384548187,
"learning_rate": 0.001,
"loss": 2.7068,
"num_input_tokens_seen": 5308416000,
"step": 20250
},
{
"epoch": 0.09683151078621939,
"grad_norm": 0.1771887093782425,
"learning_rate": 0.001,
"loss": 2.7037,
"num_input_tokens_seen": 5321523200,
"step": 20300
},
{
"epoch": 0.09707001204431354,
"grad_norm": 0.17556501924991608,
"learning_rate": 0.001,
"loss": 2.705,
"num_input_tokens_seen": 5334630400,
"step": 20350
},
{
"epoch": 0.09730851330240767,
"grad_norm": 0.1696256399154663,
"learning_rate": 0.001,
"loss": 2.7109,
"num_input_tokens_seen": 5347737600,
"step": 20400
},
{
"epoch": 0.09754701456050181,
"grad_norm": 0.18629619479179382,
"learning_rate": 0.001,
"loss": 2.7043,
"num_input_tokens_seen": 5360844800,
"step": 20450
},
{
"epoch": 0.09778551581859594,
"grad_norm": 0.18701018393039703,
"learning_rate": 0.001,
"loss": 2.7002,
"num_input_tokens_seen": 5373952000,
"step": 20500
},
{
"epoch": 0.09778551581859594,
"eval_loss": 2.587498903274536,
"eval_runtime": 51.1986,
"eval_samples_per_second": 97.659,
"eval_steps_per_second": 24.415,
"num_input_tokens_seen": 5373952000,
"step": 20500
},
{
"epoch": 0.09802401707669008,
"grad_norm": 0.1792842447757721,
"learning_rate": 0.001,
"loss": 2.7083,
"num_input_tokens_seen": 5387059200,
"step": 20550
},
{
"epoch": 0.09826251833478422,
"grad_norm": 0.18761058151721954,
"learning_rate": 0.001,
"loss": 2.6898,
"num_input_tokens_seen": 5400166400,
"step": 20600
},
{
"epoch": 0.09850101959287835,
"grad_norm": 0.1827591508626938,
"learning_rate": 0.001,
"loss": 2.6976,
"num_input_tokens_seen": 5413273600,
"step": 20650
},
{
"epoch": 0.09873952085097248,
"grad_norm": 0.16178373992443085,
"learning_rate": 0.001,
"loss": 2.7029,
"num_input_tokens_seen": 5426380800,
"step": 20700
},
{
"epoch": 0.09897802210906663,
"grad_norm": 0.1880313903093338,
"learning_rate": 0.001,
"loss": 2.6867,
"num_input_tokens_seen": 5439488000,
"step": 20750
},
{
"epoch": 0.09921652336716076,
"grad_norm": 0.17611584067344666,
"learning_rate": 0.001,
"loss": 2.6741,
"num_input_tokens_seen": 5452595200,
"step": 20800
},
{
"epoch": 0.09945502462525489,
"grad_norm": 0.17712561786174774,
"learning_rate": 0.001,
"loss": 2.7099,
"num_input_tokens_seen": 5465702400,
"step": 20850
},
{
"epoch": 0.09969352588334904,
"grad_norm": 0.18022434413433075,
"learning_rate": 0.001,
"loss": 2.6988,
"num_input_tokens_seen": 5478809600,
"step": 20900
},
{
"epoch": 0.09993202714144317,
"grad_norm": 0.17434161901474,
"learning_rate": 0.001,
"loss": 2.6869,
"num_input_tokens_seen": 5491916800,
"step": 20950
},
{
"epoch": 0.1001705283995373,
"grad_norm": 0.17802472412586212,
"learning_rate": 0.001,
"loss": 2.6935,
"num_input_tokens_seen": 5505024000,
"step": 21000
},
{
"epoch": 0.1001705283995373,
"eval_loss": 2.5838851928710938,
"eval_runtime": 50.1977,
"eval_samples_per_second": 99.606,
"eval_steps_per_second": 24.902,
"num_input_tokens_seen": 5505024000,
"step": 21000
},
{
"epoch": 0.10040902965763145,
"grad_norm": 0.1723284274339676,
"learning_rate": 0.001,
"loss": 2.694,
"num_input_tokens_seen": 5518131200,
"step": 21050
},
{
"epoch": 0.10064753091572558,
"grad_norm": 0.1627894937992096,
"learning_rate": 0.001,
"loss": 2.6866,
"num_input_tokens_seen": 5531238400,
"step": 21100
},
{
"epoch": 0.10088603217381972,
"grad_norm": 0.20949719846248627,
"learning_rate": 0.001,
"loss": 2.6915,
"num_input_tokens_seen": 5544345600,
"step": 21150
},
{
"epoch": 0.10112453343191385,
"grad_norm": 0.1980736404657364,
"learning_rate": 0.001,
"loss": 2.7076,
"num_input_tokens_seen": 5557452800,
"step": 21200
},
{
"epoch": 0.10136303469000799,
"grad_norm": 0.20961201190948486,
"learning_rate": 0.001,
"loss": 2.6978,
"num_input_tokens_seen": 5570560000,
"step": 21250
},
{
"epoch": 0.10160153594810213,
"grad_norm": 0.18137700855731964,
"learning_rate": 0.001,
"loss": 2.7029,
"num_input_tokens_seen": 5583667200,
"step": 21300
},
{
"epoch": 0.10184003720619626,
"grad_norm": 0.17235560715198517,
"learning_rate": 0.001,
"loss": 2.6979,
"num_input_tokens_seen": 5596774400,
"step": 21350
},
{
"epoch": 0.1020785384642904,
"grad_norm": 0.17818449437618256,
"learning_rate": 0.001,
"loss": 2.6987,
"num_input_tokens_seen": 5609881600,
"step": 21400
},
{
"epoch": 0.10231703972238454,
"grad_norm": 0.1798463761806488,
"learning_rate": 0.001,
"loss": 2.693,
"num_input_tokens_seen": 5622988800,
"step": 21450
},
{
"epoch": 0.10255554098047867,
"grad_norm": 0.19028444588184357,
"learning_rate": 0.001,
"loss": 2.7079,
"num_input_tokens_seen": 5636096000,
"step": 21500
},
{
"epoch": 0.10255554098047867,
"eval_loss": 2.5816380977630615,
"eval_runtime": 50.7808,
"eval_samples_per_second": 98.462,
"eval_steps_per_second": 24.616,
"num_input_tokens_seen": 5636096000,
"step": 21500
},
{
"epoch": 0.1027940422385728,
"grad_norm": 0.1831275075674057,
"learning_rate": 0.001,
"loss": 2.7038,
"num_input_tokens_seen": 5649203200,
"step": 21550
},
{
"epoch": 0.10303254349666695,
"grad_norm": 0.17404012382030487,
"learning_rate": 0.001,
"loss": 2.7071,
"num_input_tokens_seen": 5662310400,
"step": 21600
},
{
"epoch": 0.10327104475476108,
"grad_norm": 0.1652098149061203,
"learning_rate": 0.001,
"loss": 2.7033,
"num_input_tokens_seen": 5675417600,
"step": 21650
},
{
"epoch": 0.10350954601285522,
"grad_norm": 0.1914501190185547,
"learning_rate": 0.001,
"loss": 2.6844,
"num_input_tokens_seen": 5688524800,
"step": 21700
},
{
"epoch": 0.10374804727094936,
"grad_norm": 0.19169588387012482,
"learning_rate": 0.001,
"loss": 2.6793,
"num_input_tokens_seen": 5701632000,
"step": 21750
},
{
"epoch": 0.10398654852904349,
"grad_norm": 0.17937491834163666,
"learning_rate": 0.001,
"loss": 2.6972,
"num_input_tokens_seen": 5714739200,
"step": 21800
},
{
"epoch": 0.10422504978713763,
"grad_norm": 0.17515376210212708,
"learning_rate": 0.001,
"loss": 2.6995,
"num_input_tokens_seen": 5727846400,
"step": 21850
},
{
"epoch": 0.10446355104523176,
"grad_norm": 0.18881027400493622,
"learning_rate": 0.001,
"loss": 2.7097,
"num_input_tokens_seen": 5740953600,
"step": 21900
},
{
"epoch": 0.1047020523033259,
"grad_norm": 0.19030135869979858,
"learning_rate": 0.001,
"loss": 2.6801,
"num_input_tokens_seen": 5754060800,
"step": 21950
},
{
"epoch": 0.10494055356142004,
"grad_norm": 0.17325563728809357,
"learning_rate": 0.001,
"loss": 2.6803,
"num_input_tokens_seen": 5767168000,
"step": 22000
},
{
"epoch": 0.10494055356142004,
"eval_loss": 2.577341318130493,
"eval_runtime": 50.8482,
"eval_samples_per_second": 98.332,
"eval_steps_per_second": 24.583,
"num_input_tokens_seen": 5767168000,
"step": 22000
},
{
"epoch": 0.10517905481951417,
"grad_norm": 0.19298380613327026,
"learning_rate": 0.001,
"loss": 2.6966,
"num_input_tokens_seen": 5780275200,
"step": 22050
},
{
"epoch": 0.1054175560776083,
"grad_norm": 0.1772100180387497,
"learning_rate": 0.001,
"loss": 2.6851,
"num_input_tokens_seen": 5793382400,
"step": 22100
},
{
"epoch": 0.10565605733570245,
"grad_norm": 0.18548481166362762,
"learning_rate": 0.001,
"loss": 2.7028,
"num_input_tokens_seen": 5806489600,
"step": 22150
},
{
"epoch": 0.10589455859379658,
"grad_norm": 0.20102089643478394,
"learning_rate": 0.001,
"loss": 2.6915,
"num_input_tokens_seen": 5819596800,
"step": 22200
},
{
"epoch": 0.10613305985189071,
"grad_norm": 0.1833849996328354,
"learning_rate": 0.001,
"loss": 2.6872,
"num_input_tokens_seen": 5832704000,
"step": 22250
},
{
"epoch": 0.10637156110998486,
"grad_norm": 0.17730027437210083,
"learning_rate": 0.001,
"loss": 2.6811,
"num_input_tokens_seen": 5845811200,
"step": 22300
},
{
"epoch": 0.10661006236807899,
"grad_norm": 0.1818256825208664,
"learning_rate": 0.001,
"loss": 2.7,
"num_input_tokens_seen": 5858918400,
"step": 22350
},
{
"epoch": 0.10684856362617313,
"grad_norm": 0.16850312054157257,
"learning_rate": 0.001,
"loss": 2.6927,
"num_input_tokens_seen": 5872025600,
"step": 22400
},
{
"epoch": 0.10708706488426727,
"grad_norm": 0.209822878241539,
"learning_rate": 0.001,
"loss": 2.6881,
"num_input_tokens_seen": 5885132800,
"step": 22450
},
{
"epoch": 0.1073255661423614,
"grad_norm": 0.2131560891866684,
"learning_rate": 0.001,
"loss": 2.6797,
"num_input_tokens_seen": 5898240000,
"step": 22500
},
{
"epoch": 0.1073255661423614,
"eval_loss": 2.575388193130493,
"eval_runtime": 50.9696,
"eval_samples_per_second": 98.098,
"eval_steps_per_second": 24.524,
"num_input_tokens_seen": 5898240000,
"step": 22500
},
{
"epoch": 0.10756406740045554,
"grad_norm": 0.18200135231018066,
"learning_rate": 0.001,
"loss": 2.6837,
"num_input_tokens_seen": 5911347200,
"step": 22550
},
{
"epoch": 0.10780256865854967,
"grad_norm": 0.1830984354019165,
"learning_rate": 0.001,
"loss": 2.7159,
"num_input_tokens_seen": 5924454400,
"step": 22600
},
{
"epoch": 0.1080410699166438,
"grad_norm": 0.1700614094734192,
"learning_rate": 0.001,
"loss": 2.6852,
"num_input_tokens_seen": 5937561600,
"step": 22650
},
{
"epoch": 0.10827957117473795,
"grad_norm": 0.18473868072032928,
"learning_rate": 0.001,
"loss": 2.6857,
"num_input_tokens_seen": 5950668800,
"step": 22700
},
{
"epoch": 0.10851807243283208,
"grad_norm": 0.19345365464687347,
"learning_rate": 0.001,
"loss": 2.69,
"num_input_tokens_seen": 5963776000,
"step": 22750
},
{
"epoch": 0.10875657369092621,
"grad_norm": 0.18807141482830048,
"learning_rate": 0.001,
"loss": 2.6897,
"num_input_tokens_seen": 5976883200,
"step": 22800
},
{
"epoch": 0.10899507494902036,
"grad_norm": 0.18426446616649628,
"learning_rate": 0.001,
"loss": 2.6855,
"num_input_tokens_seen": 5989990400,
"step": 22850
},
{
"epoch": 0.10923357620711449,
"grad_norm": 0.19184571504592896,
"learning_rate": 0.001,
"loss": 2.6914,
"num_input_tokens_seen": 6003097600,
"step": 22900
},
{
"epoch": 0.10947207746520862,
"grad_norm": 0.22897471487522125,
"learning_rate": 0.001,
"loss": 2.6812,
"num_input_tokens_seen": 6016204800,
"step": 22950
},
{
"epoch": 0.10971057872330277,
"grad_norm": 0.1939724087715149,
"learning_rate": 0.001,
"loss": 2.6836,
"num_input_tokens_seen": 6029312000,
"step": 23000
},
{
"epoch": 0.10971057872330277,
"eval_loss": 2.570572853088379,
"eval_runtime": 50.0606,
"eval_samples_per_second": 99.879,
"eval_steps_per_second": 24.97,
"num_input_tokens_seen": 6029312000,
"step": 23000
},
{
"epoch": 0.1099490799813969,
"grad_norm": 0.17564797401428223,
"learning_rate": 0.001,
"loss": 2.6912,
"num_input_tokens_seen": 6042419200,
"step": 23050
},
{
"epoch": 0.11018758123949104,
"grad_norm": 0.17937473952770233,
"learning_rate": 0.001,
"loss": 2.6708,
"num_input_tokens_seen": 6055526400,
"step": 23100
},
{
"epoch": 0.11042608249758518,
"grad_norm": 0.18281136453151703,
"learning_rate": 0.001,
"loss": 2.6855,
"num_input_tokens_seen": 6068633600,
"step": 23150
},
{
"epoch": 0.11066458375567931,
"grad_norm": 0.18834726512432098,
"learning_rate": 0.001,
"loss": 2.6887,
"num_input_tokens_seen": 6081740800,
"step": 23200
},
{
"epoch": 0.11090308501377345,
"grad_norm": 0.2104720175266266,
"learning_rate": 0.001,
"loss": 2.6914,
"num_input_tokens_seen": 6094848000,
"step": 23250
},
{
"epoch": 0.11114158627186758,
"grad_norm": 0.18674172461032867,
"learning_rate": 0.001,
"loss": 2.6855,
"num_input_tokens_seen": 6107955200,
"step": 23300
},
{
"epoch": 0.11138008752996172,
"grad_norm": 0.19519701600074768,
"learning_rate": 0.001,
"loss": 2.6851,
"num_input_tokens_seen": 6121062400,
"step": 23350
},
{
"epoch": 0.11161858878805586,
"grad_norm": 0.1752537339925766,
"learning_rate": 0.001,
"loss": 2.692,
"num_input_tokens_seen": 6134169600,
"step": 23400
},
{
"epoch": 0.11185709004614999,
"grad_norm": 0.1786031723022461,
"learning_rate": 0.001,
"loss": 2.6785,
"num_input_tokens_seen": 6147276800,
"step": 23450
},
{
"epoch": 0.11209559130424412,
"grad_norm": 0.19057604670524597,
"learning_rate": 0.001,
"loss": 2.6798,
"num_input_tokens_seen": 6160384000,
"step": 23500
},
{
"epoch": 0.11209559130424412,
"eval_loss": 2.5710463523864746,
"eval_runtime": 50.3332,
"eval_samples_per_second": 99.338,
"eval_steps_per_second": 24.835,
"num_input_tokens_seen": 6160384000,
"step": 23500
},
{
"epoch": 0.11233409256233827,
"grad_norm": 0.18272963166236877,
"learning_rate": 0.001,
"loss": 2.6847,
"num_input_tokens_seen": 6173491200,
"step": 23550
},
{
"epoch": 0.1125725938204324,
"grad_norm": 0.1666375696659088,
"learning_rate": 0.001,
"loss": 2.6747,
"num_input_tokens_seen": 6186598400,
"step": 23600
},
{
"epoch": 0.11281109507852653,
"grad_norm": 0.1688246876001358,
"learning_rate": 0.001,
"loss": 2.6963,
"num_input_tokens_seen": 6199705600,
"step": 23650
},
{
"epoch": 0.11304959633662068,
"grad_norm": 0.1970459669828415,
"learning_rate": 0.001,
"loss": 2.6904,
"num_input_tokens_seen": 6212812800,
"step": 23700
},
{
"epoch": 0.11328809759471481,
"grad_norm": 0.19660720229148865,
"learning_rate": 0.001,
"loss": 2.6833,
"num_input_tokens_seen": 6225920000,
"step": 23750
},
{
"epoch": 0.11352659885280895,
"grad_norm": 0.18711698055267334,
"learning_rate": 0.001,
"loss": 2.6872,
"num_input_tokens_seen": 6239027200,
"step": 23800
},
{
"epoch": 0.11376510011090309,
"grad_norm": 0.1878872513771057,
"learning_rate": 0.001,
"loss": 2.6884,
"num_input_tokens_seen": 6252134400,
"step": 23850
},
{
"epoch": 0.11400360136899722,
"grad_norm": 0.1969616860151291,
"learning_rate": 0.001,
"loss": 2.6982,
"num_input_tokens_seen": 6265241600,
"step": 23900
},
{
"epoch": 0.11424210262709136,
"grad_norm": 0.19693812727928162,
"learning_rate": 0.001,
"loss": 2.6782,
"num_input_tokens_seen": 6278348800,
"step": 23950
},
{
"epoch": 0.1144806038851855,
"grad_norm": 0.1731441468000412,
"learning_rate": 0.001,
"loss": 2.6917,
"num_input_tokens_seen": 6291456000,
"step": 24000
},
{
"epoch": 0.1144806038851855,
"eval_loss": 2.5664987564086914,
"eval_runtime": 50.3452,
"eval_samples_per_second": 99.314,
"eval_steps_per_second": 24.829,
"num_input_tokens_seen": 6291456000,
"step": 24000
},
{
"epoch": 0.11471910514327963,
"grad_norm": 0.1724429428577423,
"learning_rate": 0.001,
"loss": 2.6806,
"num_input_tokens_seen": 6304563200,
"step": 24050
},
{
"epoch": 0.11495760640137377,
"grad_norm": 0.20449388027191162,
"learning_rate": 0.001,
"loss": 2.6873,
"num_input_tokens_seen": 6317670400,
"step": 24100
},
{
"epoch": 0.1151961076594679,
"grad_norm": 0.19024738669395447,
"learning_rate": 0.001,
"loss": 2.6811,
"num_input_tokens_seen": 6330777600,
"step": 24150
},
{
"epoch": 0.11543460891756203,
"grad_norm": 0.20510025322437286,
"learning_rate": 0.001,
"loss": 2.6643,
"num_input_tokens_seen": 6343884800,
"step": 24200
},
{
"epoch": 0.11567311017565618,
"grad_norm": 0.1783556044101715,
"learning_rate": 0.001,
"loss": 2.6709,
"num_input_tokens_seen": 6356992000,
"step": 24250
},
{
"epoch": 0.11591161143375031,
"grad_norm": 0.1771089732646942,
"learning_rate": 0.001,
"loss": 2.6677,
"num_input_tokens_seen": 6370099200,
"step": 24300
},
{
"epoch": 0.11615011269184444,
"grad_norm": 0.17016734182834625,
"learning_rate": 0.001,
"loss": 2.6681,
"num_input_tokens_seen": 6383206400,
"step": 24350
},
{
"epoch": 0.11638861394993859,
"grad_norm": 0.1901489496231079,
"learning_rate": 0.001,
"loss": 2.6811,
"num_input_tokens_seen": 6396313600,
"step": 24400
},
{
"epoch": 0.11662711520803272,
"grad_norm": 0.18185457587242126,
"learning_rate": 0.001,
"loss": 2.6787,
"num_input_tokens_seen": 6409420800,
"step": 24450
},
{
"epoch": 0.11686561646612686,
"grad_norm": 0.1789853274822235,
"learning_rate": 0.001,
"loss": 2.6657,
"num_input_tokens_seen": 6422528000,
"step": 24500
},
{
"epoch": 0.11686561646612686,
"eval_loss": 2.564084768295288,
"eval_runtime": 50.4559,
"eval_samples_per_second": 99.096,
"eval_steps_per_second": 24.774,
"num_input_tokens_seen": 6422528000,
"step": 24500
},
{
"epoch": 0.117104117724221,
"grad_norm": 0.17294436693191528,
"learning_rate": 0.001,
"loss": 2.6812,
"num_input_tokens_seen": 6435635200,
"step": 24550
},
{
"epoch": 0.11734261898231513,
"grad_norm": 0.1840251386165619,
"learning_rate": 0.001,
"loss": 2.6599,
"num_input_tokens_seen": 6448742400,
"step": 24600
},
{
"epoch": 0.11758112024040927,
"grad_norm": 0.17588932812213898,
"learning_rate": 0.001,
"loss": 2.6742,
"num_input_tokens_seen": 6461849600,
"step": 24650
},
{
"epoch": 0.1178196214985034,
"grad_norm": 0.1805667132139206,
"learning_rate": 0.001,
"loss": 2.6647,
"num_input_tokens_seen": 6474956800,
"step": 24700
},
{
"epoch": 0.11805812275659754,
"grad_norm": 0.17930665612220764,
"learning_rate": 0.001,
"loss": 2.6763,
"num_input_tokens_seen": 6488064000,
"step": 24750
},
{
"epoch": 0.11829662401469168,
"grad_norm": 0.19195732474327087,
"learning_rate": 0.001,
"loss": 2.6716,
"num_input_tokens_seen": 6501171200,
"step": 24800
},
{
"epoch": 0.11853512527278581,
"grad_norm": 0.19274356961250305,
"learning_rate": 0.001,
"loss": 2.6702,
"num_input_tokens_seen": 6514278400,
"step": 24850
},
{
"epoch": 0.11877362653087994,
"grad_norm": 0.17423510551452637,
"learning_rate": 0.001,
"loss": 2.6733,
"num_input_tokens_seen": 6527385600,
"step": 24900
},
{
"epoch": 0.11901212778897409,
"grad_norm": 0.20267954468727112,
"learning_rate": 0.001,
"loss": 2.6649,
"num_input_tokens_seen": 6540492800,
"step": 24950
},
{
"epoch": 0.11925062904706822,
"grad_norm": 0.1756502240896225,
"learning_rate": 0.001,
"loss": 2.6582,
"num_input_tokens_seen": 6553600000,
"step": 25000
},
{
"epoch": 0.11925062904706822,
"eval_loss": 2.562955379486084,
"eval_runtime": 50.0071,
"eval_samples_per_second": 99.986,
"eval_steps_per_second": 24.996,
"num_input_tokens_seen": 6553600000,
"step": 25000
},
{
"epoch": 0.11948913030516237,
"grad_norm": 0.19173742830753326,
"learning_rate": 0.001,
"loss": 2.6871,
"num_input_tokens_seen": 6566707200,
"step": 25050
},
{
"epoch": 0.1197276315632565,
"grad_norm": 0.1746075600385666,
"learning_rate": 0.001,
"loss": 2.7003,
"num_input_tokens_seen": 6579814400,
"step": 25100
},
{
"epoch": 0.11996613282135063,
"grad_norm": 0.17817530035972595,
"learning_rate": 0.001,
"loss": 2.6944,
"num_input_tokens_seen": 6592921600,
"step": 25150
},
{
"epoch": 0.12020463407944477,
"grad_norm": 0.201807901263237,
"learning_rate": 0.001,
"loss": 2.6766,
"num_input_tokens_seen": 6606028800,
"step": 25200
},
{
"epoch": 0.1204431353375389,
"grad_norm": 0.18620917201042175,
"learning_rate": 0.001,
"loss": 2.6802,
"num_input_tokens_seen": 6619136000,
"step": 25250
},
{
"epoch": 0.12068163659563304,
"grad_norm": 0.17383818328380585,
"learning_rate": 0.001,
"loss": 2.6698,
"num_input_tokens_seen": 6632243200,
"step": 25300
},
{
"epoch": 0.12092013785372718,
"grad_norm": 0.1766287237405777,
"learning_rate": 0.001,
"loss": 2.6705,
"num_input_tokens_seen": 6645350400,
"step": 25350
},
{
"epoch": 0.12115863911182131,
"grad_norm": 0.19551052153110504,
"learning_rate": 0.001,
"loss": 2.678,
"num_input_tokens_seen": 6658457600,
"step": 25400
},
{
"epoch": 0.12139714036991545,
"grad_norm": 0.18625982105731964,
"learning_rate": 0.001,
"loss": 2.6688,
"num_input_tokens_seen": 6671564800,
"step": 25450
},
{
"epoch": 0.12163564162800959,
"grad_norm": 0.18274050951004028,
"learning_rate": 0.001,
"loss": 2.6818,
"num_input_tokens_seen": 6684672000,
"step": 25500
},
{
"epoch": 0.12163564162800959,
"eval_loss": 2.5602569580078125,
"eval_runtime": 50.4187,
"eval_samples_per_second": 99.17,
"eval_steps_per_second": 24.792,
"num_input_tokens_seen": 6684672000,
"step": 25500
},
{
"epoch": 0.12187414288610372,
"grad_norm": 0.18547837436199188,
"learning_rate": 0.001,
"loss": 2.6754,
"num_input_tokens_seen": 6697779200,
"step": 25550
},
{
"epoch": 0.12211264414419785,
"grad_norm": 0.18558937311172485,
"learning_rate": 0.001,
"loss": 2.6767,
"num_input_tokens_seen": 6710886400,
"step": 25600
},
{
"epoch": 0.122351145402292,
"grad_norm": 0.17276135087013245,
"learning_rate": 0.001,
"loss": 2.6775,
"num_input_tokens_seen": 6723993600,
"step": 25650
},
{
"epoch": 0.12258964666038613,
"grad_norm": 0.18483039736747742,
"learning_rate": 0.001,
"loss": 2.6818,
"num_input_tokens_seen": 6737100800,
"step": 25700
},
{
"epoch": 0.12282814791848028,
"grad_norm": 0.18036937713623047,
"learning_rate": 0.001,
"loss": 2.6669,
"num_input_tokens_seen": 6750208000,
"step": 25750
},
{
"epoch": 0.12306664917657441,
"grad_norm": 0.1728815734386444,
"learning_rate": 0.001,
"loss": 2.6789,
"num_input_tokens_seen": 6763315200,
"step": 25800
},
{
"epoch": 0.12330515043466854,
"grad_norm": 0.19193877279758453,
"learning_rate": 0.001,
"loss": 2.6487,
"num_input_tokens_seen": 6776422400,
"step": 25850
},
{
"epoch": 0.12354365169276268,
"grad_norm": 0.1584886610507965,
"learning_rate": 0.001,
"loss": 2.6638,
"num_input_tokens_seen": 6789529600,
"step": 25900
},
{
"epoch": 0.12378215295085682,
"grad_norm": 0.18792498111724854,
"learning_rate": 0.001,
"loss": 2.6754,
"num_input_tokens_seen": 6802636800,
"step": 25950
},
{
"epoch": 0.12402065420895095,
"grad_norm": 0.1689581423997879,
"learning_rate": 0.001,
"loss": 2.6682,
"num_input_tokens_seen": 6815744000,
"step": 26000
},
{
"epoch": 0.12402065420895095,
"eval_loss": 2.5587522983551025,
"eval_runtime": 50.7858,
"eval_samples_per_second": 98.453,
"eval_steps_per_second": 24.613,
"num_input_tokens_seen": 6815744000,
"step": 26000
},
{
"epoch": 0.1242591554670451,
"grad_norm": 0.18573056161403656,
"learning_rate": 0.001,
"loss": 2.6565,
"num_input_tokens_seen": 6828851200,
"step": 26050
},
{
"epoch": 0.12449765672513922,
"grad_norm": 0.19160890579223633,
"learning_rate": 0.001,
"loss": 2.6797,
"num_input_tokens_seen": 6841958400,
"step": 26100
},
{
"epoch": 0.12473615798323336,
"grad_norm": 0.18323373794555664,
"learning_rate": 0.001,
"loss": 2.6602,
"num_input_tokens_seen": 6855065600,
"step": 26150
},
{
"epoch": 0.1249746592413275,
"grad_norm": 0.17691807448863983,
"learning_rate": 0.001,
"loss": 2.6676,
"num_input_tokens_seen": 6868172800,
"step": 26200
},
{
"epoch": 0.12521316049942163,
"grad_norm": 0.20718660950660706,
"learning_rate": 0.001,
"loss": 2.6588,
"num_input_tokens_seen": 6881280000,
"step": 26250
},
{
"epoch": 0.12545166175751576,
"grad_norm": 0.17811058461666107,
"learning_rate": 0.001,
"loss": 2.6754,
"num_input_tokens_seen": 6894387200,
"step": 26300
},
{
"epoch": 0.1256901630156099,
"grad_norm": 0.17490555346012115,
"learning_rate": 0.001,
"loss": 2.6605,
"num_input_tokens_seen": 6907494400,
"step": 26350
},
{
"epoch": 0.12592866427370406,
"grad_norm": 0.17391368746757507,
"learning_rate": 0.001,
"loss": 2.684,
"num_input_tokens_seen": 6920601600,
"step": 26400
},
{
"epoch": 0.1261671655317982,
"grad_norm": 0.16951416432857513,
"learning_rate": 0.001,
"loss": 2.6685,
"num_input_tokens_seen": 6933708800,
"step": 26450
},
{
"epoch": 0.12640566678989232,
"grad_norm": 0.17574581503868103,
"learning_rate": 0.001,
"loss": 2.6665,
"num_input_tokens_seen": 6946816000,
"step": 26500
},
{
"epoch": 0.12640566678989232,
"eval_loss": 2.556109666824341,
"eval_runtime": 50.8743,
"eval_samples_per_second": 98.281,
"eval_steps_per_second": 24.57,
"num_input_tokens_seen": 6946816000,
"step": 26500
},
{
"epoch": 0.12664416804798645,
"grad_norm": 0.19910745322704315,
"learning_rate": 0.001,
"loss": 2.6755,
"num_input_tokens_seen": 6959923200,
"step": 26550
},
{
"epoch": 0.12688266930608058,
"grad_norm": 0.20141273736953735,
"learning_rate": 0.001,
"loss": 2.6798,
"num_input_tokens_seen": 6973030400,
"step": 26600
},
{
"epoch": 0.1271211705641747,
"grad_norm": 0.1732529103755951,
"learning_rate": 0.001,
"loss": 2.6606,
"num_input_tokens_seen": 6986137600,
"step": 26650
},
{
"epoch": 0.12735967182226887,
"grad_norm": 0.17546698451042175,
"learning_rate": 0.001,
"loss": 2.6717,
"num_input_tokens_seen": 6999244800,
"step": 26700
},
{
"epoch": 0.127598173080363,
"grad_norm": 0.2186097502708435,
"learning_rate": 0.001,
"loss": 2.6702,
"num_input_tokens_seen": 7012352000,
"step": 26750
},
{
"epoch": 0.12783667433845713,
"grad_norm": 0.1735202819108963,
"learning_rate": 0.001,
"loss": 2.6795,
"num_input_tokens_seen": 7025459200,
"step": 26800
},
{
"epoch": 0.12807517559655127,
"grad_norm": 0.40701860189437866,
"learning_rate": 0.001,
"loss": 2.6591,
"num_input_tokens_seen": 7038566400,
"step": 26850
},
{
"epoch": 0.1283136768546454,
"grad_norm": 0.19710049033164978,
"learning_rate": 0.001,
"loss": 2.6841,
"num_input_tokens_seen": 7051673600,
"step": 26900
},
{
"epoch": 0.12855217811273956,
"grad_norm": 0.18638554215431213,
"learning_rate": 0.001,
"loss": 2.6718,
"num_input_tokens_seen": 7064780800,
"step": 26950
},
{
"epoch": 0.1287906793708337,
"grad_norm": 0.17546561360359192,
"learning_rate": 0.001,
"loss": 2.6547,
"num_input_tokens_seen": 7077888000,
"step": 27000
},
{
"epoch": 0.1287906793708337,
"eval_loss": 2.551922559738159,
"eval_runtime": 50.1864,
"eval_samples_per_second": 99.629,
"eval_steps_per_second": 24.907,
"num_input_tokens_seen": 7077888000,
"step": 27000
},
{
"epoch": 0.12902918062892782,
"grad_norm": 0.1790401190519333,
"learning_rate": 0.001,
"loss": 2.672,
"num_input_tokens_seen": 7090995200,
"step": 27050
},
{
"epoch": 0.12926768188702195,
"grad_norm": 0.18173836171627045,
"learning_rate": 0.001,
"loss": 2.6563,
"num_input_tokens_seen": 7104102400,
"step": 27100
},
{
"epoch": 0.12950618314511608,
"grad_norm": 0.1827983856201172,
"learning_rate": 0.001,
"loss": 2.665,
"num_input_tokens_seen": 7117209600,
"step": 27150
},
{
"epoch": 0.12974468440321021,
"grad_norm": 0.20252254605293274,
"learning_rate": 0.001,
"loss": 2.675,
"num_input_tokens_seen": 7130316800,
"step": 27200
},
{
"epoch": 0.12998318566130437,
"grad_norm": 0.18492095172405243,
"learning_rate": 0.001,
"loss": 2.6801,
"num_input_tokens_seen": 7143424000,
"step": 27250
},
{
"epoch": 0.1302216869193985,
"grad_norm": 0.1962280571460724,
"learning_rate": 0.001,
"loss": 2.6551,
"num_input_tokens_seen": 7156531200,
"step": 27300
},
{
"epoch": 0.13046018817749264,
"grad_norm": 0.18813727796077728,
"learning_rate": 0.001,
"loss": 2.6728,
"num_input_tokens_seen": 7169638400,
"step": 27350
},
{
"epoch": 0.13069868943558677,
"grad_norm": 0.18111565709114075,
"learning_rate": 0.001,
"loss": 2.6743,
"num_input_tokens_seen": 7182745600,
"step": 27400
},
{
"epoch": 0.1309371906936809,
"grad_norm": 0.1727459728717804,
"learning_rate": 0.001,
"loss": 2.6596,
"num_input_tokens_seen": 7195852800,
"step": 27450
},
{
"epoch": 0.13117569195177506,
"grad_norm": 0.20097768306732178,
"learning_rate": 0.001,
"loss": 2.6651,
"num_input_tokens_seen": 7208960000,
"step": 27500
},
{
"epoch": 0.13117569195177506,
"eval_loss": 2.5501132011413574,
"eval_runtime": 50.3677,
"eval_samples_per_second": 99.27,
"eval_steps_per_second": 24.817,
"num_input_tokens_seen": 7208960000,
"step": 27500
},
{
"epoch": 0.1314141932098692,
"grad_norm": 0.17329637706279755,
"learning_rate": 0.001,
"loss": 2.663,
"num_input_tokens_seen": 7222067200,
"step": 27550
},
{
"epoch": 0.13165269446796332,
"grad_norm": 0.16942919790744781,
"learning_rate": 0.001,
"loss": 2.6609,
"num_input_tokens_seen": 7235174400,
"step": 27600
},
{
"epoch": 0.13189119572605745,
"grad_norm": 0.19828958809375763,
"learning_rate": 0.001,
"loss": 2.6625,
"num_input_tokens_seen": 7248281600,
"step": 27650
},
{
"epoch": 0.13212969698415158,
"grad_norm": 0.1928141862154007,
"learning_rate": 0.001,
"loss": 2.6597,
"num_input_tokens_seen": 7261388800,
"step": 27700
},
{
"epoch": 0.13236819824224572,
"grad_norm": 0.1870756894350052,
"learning_rate": 0.001,
"loss": 2.6718,
"num_input_tokens_seen": 7274496000,
"step": 27750
},
{
"epoch": 0.13260669950033988,
"grad_norm": 0.1786762923002243,
"learning_rate": 0.001,
"loss": 2.6631,
"num_input_tokens_seen": 7287603200,
"step": 27800
},
{
"epoch": 0.132845200758434,
"grad_norm": 0.1710624396800995,
"learning_rate": 0.001,
"loss": 2.6717,
"num_input_tokens_seen": 7300710400,
"step": 27850
},
{
"epoch": 0.13308370201652814,
"grad_norm": 0.1805214285850525,
"learning_rate": 0.001,
"loss": 2.6669,
"num_input_tokens_seen": 7313817600,
"step": 27900
},
{
"epoch": 0.13332220327462227,
"grad_norm": 0.18169906735420227,
"learning_rate": 0.001,
"loss": 2.6659,
"num_input_tokens_seen": 7326924800,
"step": 27950
},
{
"epoch": 0.1335607045327164,
"grad_norm": 0.16959500312805176,
"learning_rate": 0.001,
"loss": 2.6623,
"num_input_tokens_seen": 7340032000,
"step": 28000
},
{
"epoch": 0.1335607045327164,
"eval_loss": 2.548675060272217,
"eval_runtime": 50.3022,
"eval_samples_per_second": 99.399,
"eval_steps_per_second": 24.85,
"num_input_tokens_seen": 7340032000,
"step": 28000
},
{
"epoch": 0.13379920579081056,
"grad_norm": 0.19409704208374023,
"learning_rate": 0.001,
"loss": 2.6776,
"num_input_tokens_seen": 7353139200,
"step": 28050
},
{
"epoch": 0.1340377070489047,
"grad_norm": 0.1712968647480011,
"learning_rate": 0.001,
"loss": 2.6679,
"num_input_tokens_seen": 7366246400,
"step": 28100
},
{
"epoch": 0.13427620830699882,
"grad_norm": 0.20586130023002625,
"learning_rate": 0.001,
"loss": 2.6633,
"num_input_tokens_seen": 7379353600,
"step": 28150
},
{
"epoch": 0.13451470956509295,
"grad_norm": 0.1776891052722931,
"learning_rate": 0.001,
"loss": 2.6683,
"num_input_tokens_seen": 7392460800,
"step": 28200
},
{
"epoch": 0.1347532108231871,
"grad_norm": 0.19293451309204102,
"learning_rate": 0.001,
"loss": 2.6645,
"num_input_tokens_seen": 7405568000,
"step": 28250
},
{
"epoch": 0.13499171208128122,
"grad_norm": 0.17754724621772766,
"learning_rate": 0.001,
"loss": 2.6685,
"num_input_tokens_seen": 7418675200,
"step": 28300
},
{
"epoch": 0.13523021333937538,
"grad_norm": 0.17739038169384003,
"learning_rate": 0.001,
"loss": 2.6607,
"num_input_tokens_seen": 7431782400,
"step": 28350
},
{
"epoch": 0.1354687145974695,
"grad_norm": 0.175009086728096,
"learning_rate": 0.001,
"loss": 2.6679,
"num_input_tokens_seen": 7444889600,
"step": 28400
},
{
"epoch": 0.13570721585556364,
"grad_norm": 0.2229124754667282,
"learning_rate": 0.001,
"loss": 2.6687,
"num_input_tokens_seen": 7457996800,
"step": 28450
},
{
"epoch": 0.13594571711365777,
"grad_norm": 0.1791590005159378,
"learning_rate": 0.001,
"loss": 2.6741,
"num_input_tokens_seen": 7471104000,
"step": 28500
},
{
"epoch": 0.13594571711365777,
"eval_loss": 2.5456056594848633,
"eval_runtime": 50.6342,
"eval_samples_per_second": 98.747,
"eval_steps_per_second": 24.687,
"num_input_tokens_seen": 7471104000,
"step": 28500
},
{
"epoch": 0.1361842183717519,
"grad_norm": 0.18920041620731354,
"learning_rate": 0.001,
"loss": 2.6612,
"num_input_tokens_seen": 7484211200,
"step": 28550
},
{
"epoch": 0.13642271962984603,
"grad_norm": 0.19247522950172424,
"learning_rate": 0.001,
"loss": 2.6597,
"num_input_tokens_seen": 7497318400,
"step": 28600
},
{
"epoch": 0.1366612208879402,
"grad_norm": 0.22499197721481323,
"learning_rate": 0.001,
"loss": 2.6583,
"num_input_tokens_seen": 7510425600,
"step": 28650
},
{
"epoch": 0.13689972214603432,
"grad_norm": 0.18946559727191925,
"learning_rate": 0.001,
"loss": 2.6612,
"num_input_tokens_seen": 7523532800,
"step": 28700
},
{
"epoch": 0.13713822340412846,
"grad_norm": 0.19621454179286957,
"learning_rate": 0.001,
"loss": 2.6425,
"num_input_tokens_seen": 7536640000,
"step": 28750
},
{
"epoch": 0.1373767246622226,
"grad_norm": 0.21594376862049103,
"learning_rate": 0.001,
"loss": 2.6564,
"num_input_tokens_seen": 7549747200,
"step": 28800
},
{
"epoch": 0.13761522592031672,
"grad_norm": 0.18186470866203308,
"learning_rate": 0.001,
"loss": 2.6728,
"num_input_tokens_seen": 7562854400,
"step": 28850
},
{
"epoch": 0.13785372717841088,
"grad_norm": 0.19369743764400482,
"learning_rate": 0.001,
"loss": 2.6585,
"num_input_tokens_seen": 7575961600,
"step": 28900
},
{
"epoch": 0.138092228436505,
"grad_norm": 0.1897999793291092,
"learning_rate": 0.001,
"loss": 2.6564,
"num_input_tokens_seen": 7589068800,
"step": 28950
},
{
"epoch": 0.13833072969459914,
"grad_norm": 0.18076784908771515,
"learning_rate": 0.001,
"loss": 2.6453,
"num_input_tokens_seen": 7602176000,
"step": 29000
},
{
"epoch": 0.13833072969459914,
"eval_loss": 2.54413104057312,
"eval_runtime": 50.9152,
"eval_samples_per_second": 98.202,
"eval_steps_per_second": 24.551,
"num_input_tokens_seen": 7602176000,
"step": 29000
},
{
"epoch": 0.13856923095269327,
"grad_norm": 0.18520566821098328,
"learning_rate": 0.001,
"loss": 2.6644,
"num_input_tokens_seen": 7615283200,
"step": 29050
},
{
"epoch": 0.1388077322107874,
"grad_norm": 0.22739861905574799,
"learning_rate": 0.001,
"loss": 2.6597,
"num_input_tokens_seen": 7628390400,
"step": 29100
},
{
"epoch": 0.13904623346888154,
"grad_norm": 0.18451730906963348,
"learning_rate": 0.001,
"loss": 2.6432,
"num_input_tokens_seen": 7641497600,
"step": 29150
},
{
"epoch": 0.1392847347269757,
"grad_norm": 0.1865098923444748,
"learning_rate": 0.001,
"loss": 2.6651,
"num_input_tokens_seen": 7654604800,
"step": 29200
},
{
"epoch": 0.13952323598506983,
"grad_norm": 0.18676789104938507,
"learning_rate": 0.001,
"loss": 2.6597,
"num_input_tokens_seen": 7667712000,
"step": 29250
},
{
"epoch": 0.13976173724316396,
"grad_norm": 0.17463742196559906,
"learning_rate": 0.001,
"loss": 2.6571,
"num_input_tokens_seen": 7680819200,
"step": 29300
},
{
"epoch": 0.1400002385012581,
"grad_norm": 0.21621429920196533,
"learning_rate": 0.001,
"loss": 2.6342,
"num_input_tokens_seen": 7693926400,
"step": 29350
},
{
"epoch": 0.14023873975935222,
"grad_norm": 0.17493990063667297,
"learning_rate": 0.001,
"loss": 2.6536,
"num_input_tokens_seen": 7707033600,
"step": 29400
},
{
"epoch": 0.14047724101744638,
"grad_norm": 0.17649762332439423,
"learning_rate": 0.001,
"loss": 2.6526,
"num_input_tokens_seen": 7720140800,
"step": 29450
},
{
"epoch": 0.1407157422755405,
"grad_norm": 0.18224874138832092,
"learning_rate": 0.001,
"loss": 2.6635,
"num_input_tokens_seen": 7733248000,
"step": 29500
},
{
"epoch": 0.1407157422755405,
"eval_loss": 2.5433554649353027,
"eval_runtime": 51.2973,
"eval_samples_per_second": 97.471,
"eval_steps_per_second": 24.368,
"num_input_tokens_seen": 7733248000,
"step": 29500
},
{
"epoch": 0.14095424353363464,
"grad_norm": 0.21109874546527863,
"learning_rate": 0.001,
"loss": 2.6788,
"num_input_tokens_seen": 7746355200,
"step": 29550
},
{
"epoch": 0.14119274479172877,
"grad_norm": 0.17663723230361938,
"learning_rate": 0.001,
"loss": 2.6578,
"num_input_tokens_seen": 7759462400,
"step": 29600
},
{
"epoch": 0.1414312460498229,
"grad_norm": 0.18385198712348938,
"learning_rate": 0.001,
"loss": 2.676,
"num_input_tokens_seen": 7772569600,
"step": 29650
},
{
"epoch": 0.14166974730791704,
"grad_norm": 0.1829567402601242,
"learning_rate": 0.001,
"loss": 2.6586,
"num_input_tokens_seen": 7785676800,
"step": 29700
},
{
"epoch": 0.1419082485660112,
"grad_norm": 0.1907297968864441,
"learning_rate": 0.001,
"loss": 2.6508,
"num_input_tokens_seen": 7798784000,
"step": 29750
},
{
"epoch": 0.14214674982410533,
"grad_norm": 0.2106500118970871,
"learning_rate": 0.001,
"loss": 2.6578,
"num_input_tokens_seen": 7811891200,
"step": 29800
},
{
"epoch": 0.14238525108219946,
"grad_norm": 0.18974357843399048,
"learning_rate": 0.001,
"loss": 2.6506,
"num_input_tokens_seen": 7824998400,
"step": 29850
},
{
"epoch": 0.1426237523402936,
"grad_norm": 0.18876343965530396,
"learning_rate": 0.001,
"loss": 2.6663,
"num_input_tokens_seen": 7838105600,
"step": 29900
},
{
"epoch": 0.14286225359838772,
"grad_norm": 0.17305608093738556,
"learning_rate": 0.001,
"loss": 2.657,
"num_input_tokens_seen": 7851212800,
"step": 29950
},
{
"epoch": 0.14310075485648185,
"grad_norm": 0.18900860846042633,
"learning_rate": 0.001,
"loss": 2.6502,
"num_input_tokens_seen": 7864320000,
"step": 30000
},
{
"epoch": 0.14310075485648185,
"eval_loss": 2.540076971054077,
"eval_runtime": 50.1464,
"eval_samples_per_second": 99.708,
"eval_steps_per_second": 24.927,
"num_input_tokens_seen": 7864320000,
"step": 30000
},
{
"epoch": 0.143339256114576,
"grad_norm": 0.16919030249118805,
"learning_rate": 0.001,
"loss": 2.6729,
"num_input_tokens_seen": 7877427200,
"step": 30050
},
{
"epoch": 0.14357775737267015,
"grad_norm": 0.17828898131847382,
"learning_rate": 0.001,
"loss": 2.647,
"num_input_tokens_seen": 7890534400,
"step": 30100
},
{
"epoch": 0.14381625863076428,
"grad_norm": 0.1790715903043747,
"learning_rate": 0.001,
"loss": 2.6639,
"num_input_tokens_seen": 7903641600,
"step": 30150
},
{
"epoch": 0.1440547598888584,
"grad_norm": 0.18818187713623047,
"learning_rate": 0.001,
"loss": 2.6485,
"num_input_tokens_seen": 7916748800,
"step": 30200
},
{
"epoch": 0.14429326114695254,
"grad_norm": 0.2171814739704132,
"learning_rate": 0.001,
"loss": 2.6577,
"num_input_tokens_seen": 7929856000,
"step": 30250
},
{
"epoch": 0.1445317624050467,
"grad_norm": 0.1844399869441986,
"learning_rate": 0.001,
"loss": 2.6473,
"num_input_tokens_seen": 7942963200,
"step": 30300
},
{
"epoch": 0.14477026366314083,
"grad_norm": 0.19607801735401154,
"learning_rate": 0.001,
"loss": 2.6576,
"num_input_tokens_seen": 7956070400,
"step": 30350
},
{
"epoch": 0.14500876492123496,
"grad_norm": 0.1967996209859848,
"learning_rate": 0.001,
"loss": 2.64,
"num_input_tokens_seen": 7969177600,
"step": 30400
},
{
"epoch": 0.1452472661793291,
"grad_norm": 0.2087596207857132,
"learning_rate": 0.001,
"loss": 2.6485,
"num_input_tokens_seen": 7982284800,
"step": 30450
},
{
"epoch": 0.14548576743742322,
"grad_norm": 0.1938595473766327,
"learning_rate": 0.001,
"loss": 2.654,
"num_input_tokens_seen": 7995392000,
"step": 30500
},
{
"epoch": 0.14548576743742322,
"eval_loss": 2.537402391433716,
"eval_runtime": 50.7304,
"eval_samples_per_second": 98.56,
"eval_steps_per_second": 24.64,
"num_input_tokens_seen": 7995392000,
"step": 30500
},
{
"epoch": 0.14572426869551736,
"grad_norm": 0.18282300233840942,
"learning_rate": 0.001,
"loss": 2.6592,
"num_input_tokens_seen": 8008499200,
"step": 30550
},
{
"epoch": 0.14596276995361152,
"grad_norm": 0.1829262375831604,
"learning_rate": 0.001,
"loss": 2.6618,
"num_input_tokens_seen": 8021606400,
"step": 30600
},
{
"epoch": 0.14620127121170565,
"grad_norm": 0.19001947343349457,
"learning_rate": 0.001,
"loss": 2.649,
"num_input_tokens_seen": 8034713600,
"step": 30650
},
{
"epoch": 0.14643977246979978,
"grad_norm": 0.19943153858184814,
"learning_rate": 0.001,
"loss": 2.6578,
"num_input_tokens_seen": 8047820800,
"step": 30700
},
{
"epoch": 0.1466782737278939,
"grad_norm": 0.18482360243797302,
"learning_rate": 0.001,
"loss": 2.6616,
"num_input_tokens_seen": 8060928000,
"step": 30750
},
{
"epoch": 0.14691677498598804,
"grad_norm": 0.20858009159564972,
"learning_rate": 0.001,
"loss": 2.6684,
"num_input_tokens_seen": 8074035200,
"step": 30800
},
{
"epoch": 0.1471552762440822,
"grad_norm": 0.2759605646133423,
"learning_rate": 0.001,
"loss": 2.713,
"num_input_tokens_seen": 8087142400,
"step": 30850
},
{
"epoch": 0.14739377750217633,
"grad_norm": 0.22366145253181458,
"learning_rate": 0.001,
"loss": 2.7065,
"num_input_tokens_seen": 8100249600,
"step": 30900
},
{
"epoch": 0.14763227876027046,
"grad_norm": 0.22143268585205078,
"learning_rate": 0.001,
"loss": 2.672,
"num_input_tokens_seen": 8113356800,
"step": 30950
},
{
"epoch": 0.1478707800183646,
"grad_norm": 0.25140002369880676,
"learning_rate": 0.001,
"loss": 2.6658,
"num_input_tokens_seen": 8126464000,
"step": 31000
},
{
"epoch": 0.1478707800183646,
"eval_loss": 2.5451457500457764,
"eval_runtime": 50.0622,
"eval_samples_per_second": 99.876,
"eval_steps_per_second": 24.969,
"num_input_tokens_seen": 8126464000,
"step": 31000
},
{
"epoch": 0.14810928127645873,
"grad_norm": 0.20207786560058594,
"learning_rate": 0.001,
"loss": 2.6493,
"num_input_tokens_seen": 8139571200,
"step": 31050
},
{
"epoch": 0.14834778253455286,
"grad_norm": 0.20135898888111115,
"learning_rate": 0.001,
"loss": 2.6555,
"num_input_tokens_seen": 8152678400,
"step": 31100
},
{
"epoch": 0.14858628379264702,
"grad_norm": 0.19284267723560333,
"learning_rate": 0.001,
"loss": 2.6637,
"num_input_tokens_seen": 8165785600,
"step": 31150
},
{
"epoch": 0.14882478505074115,
"grad_norm": 0.17214693129062653,
"learning_rate": 0.001,
"loss": 2.6663,
"num_input_tokens_seen": 8178892800,
"step": 31200
},
{
"epoch": 0.14906328630883528,
"grad_norm": 0.19444549083709717,
"learning_rate": 0.001,
"loss": 2.6541,
"num_input_tokens_seen": 8192000000,
"step": 31250
},
{
"epoch": 0.1493017875669294,
"grad_norm": 0.19992901384830475,
"learning_rate": 0.001,
"loss": 2.6419,
"num_input_tokens_seen": 8205107200,
"step": 31300
},
{
"epoch": 0.14954028882502354,
"grad_norm": 0.16732315719127655,
"learning_rate": 0.001,
"loss": 2.6559,
"num_input_tokens_seen": 8218214400,
"step": 31350
},
{
"epoch": 0.1497787900831177,
"grad_norm": 0.4210798442363739,
"learning_rate": 0.001,
"loss": 2.6478,
"num_input_tokens_seen": 8231321600,
"step": 31400
},
{
"epoch": 0.15001729134121183,
"grad_norm": 0.2139436900615692,
"learning_rate": 0.001,
"loss": 2.6753,
"num_input_tokens_seen": 8244428800,
"step": 31450
},
{
"epoch": 0.15025579259930597,
"grad_norm": 0.19131046533584595,
"learning_rate": 0.001,
"loss": 2.6675,
"num_input_tokens_seen": 8257536000,
"step": 31500
},
{
"epoch": 0.15025579259930597,
"eval_loss": 2.5402350425720215,
"eval_runtime": 50.477,
"eval_samples_per_second": 99.055,
"eval_steps_per_second": 24.764,
"num_input_tokens_seen": 8257536000,
"step": 31500
},
{
"epoch": 0.1504942938574001,
"grad_norm": 0.20711492002010345,
"learning_rate": 0.001,
"loss": 2.6654,
"num_input_tokens_seen": 8270643200,
"step": 31550
},
{
"epoch": 0.15073279511549423,
"grad_norm": 0.1888076812028885,
"learning_rate": 0.001,
"loss": 2.6603,
"num_input_tokens_seen": 8283750400,
"step": 31600
},
{
"epoch": 0.15097129637358836,
"grad_norm": 0.18534335494041443,
"learning_rate": 0.001,
"loss": 2.6539,
"num_input_tokens_seen": 8296857600,
"step": 31650
},
{
"epoch": 0.15120979763168252,
"grad_norm": 0.2024192214012146,
"learning_rate": 0.001,
"loss": 2.6514,
"num_input_tokens_seen": 8309964800,
"step": 31700
},
{
"epoch": 0.15144829888977665,
"grad_norm": 0.18967773020267487,
"learning_rate": 0.001,
"loss": 2.6457,
"num_input_tokens_seen": 8323072000,
"step": 31750
},
{
"epoch": 0.15168680014787078,
"grad_norm": 0.18823806941509247,
"learning_rate": 0.001,
"loss": 2.6579,
"num_input_tokens_seen": 8336179200,
"step": 31800
},
{
"epoch": 0.1519253014059649,
"grad_norm": 0.20198485255241394,
"learning_rate": 0.001,
"loss": 2.6623,
"num_input_tokens_seen": 8349286400,
"step": 31850
},
{
"epoch": 0.15216380266405904,
"grad_norm": 0.19362477958202362,
"learning_rate": 0.001,
"loss": 2.6473,
"num_input_tokens_seen": 8362393600,
"step": 31900
},
{
"epoch": 0.15240230392215318,
"grad_norm": 0.18454812467098236,
"learning_rate": 0.001,
"loss": 2.6411,
"num_input_tokens_seen": 8375500800,
"step": 31950
},
{
"epoch": 0.15264080518024734,
"grad_norm": 0.1968630850315094,
"learning_rate": 0.001,
"loss": 2.6405,
"num_input_tokens_seen": 8388608000,
"step": 32000
},
{
"epoch": 0.15264080518024734,
"eval_loss": 2.5325138568878174,
"eval_runtime": 51.0134,
"eval_samples_per_second": 98.013,
"eval_steps_per_second": 24.503,
"num_input_tokens_seen": 8388608000,
"step": 32000
},
{
"epoch": 0.15287930643834147,
"grad_norm": 0.180119588971138,
"learning_rate": 0.001,
"loss": 2.6558,
"num_input_tokens_seen": 8401715200,
"step": 32050
},
{
"epoch": 0.1531178076964356,
"grad_norm": 0.1952589452266693,
"learning_rate": 0.001,
"loss": 2.6465,
"num_input_tokens_seen": 8414822400,
"step": 32100
},
{
"epoch": 0.15335630895452973,
"grad_norm": 0.1845589131116867,
"learning_rate": 0.001,
"loss": 2.6297,
"num_input_tokens_seen": 8427929600,
"step": 32150
},
{
"epoch": 0.15359481021262386,
"grad_norm": 0.20116594433784485,
"learning_rate": 0.001,
"loss": 2.6422,
"num_input_tokens_seen": 8441036800,
"step": 32200
},
{
"epoch": 0.15383331147071802,
"grad_norm": 0.1932612508535385,
"learning_rate": 0.001,
"loss": 2.6494,
"num_input_tokens_seen": 8454144000,
"step": 32250
},
{
"epoch": 0.15407181272881215,
"grad_norm": 0.17934490740299225,
"learning_rate": 0.001,
"loss": 2.6474,
"num_input_tokens_seen": 8467251200,
"step": 32300
},
{
"epoch": 0.15431031398690628,
"grad_norm": 0.19273313879966736,
"learning_rate": 0.001,
"loss": 2.6447,
"num_input_tokens_seen": 8480358400,
"step": 32350
},
{
"epoch": 0.15454881524500041,
"grad_norm": 0.1921055018901825,
"learning_rate": 0.001,
"loss": 2.665,
"num_input_tokens_seen": 8493465600,
"step": 32400
},
{
"epoch": 0.15478731650309455,
"grad_norm": 0.37117844820022583,
"learning_rate": 0.001,
"loss": 2.6351,
"num_input_tokens_seen": 8506572800,
"step": 32450
},
{
"epoch": 0.15502581776118868,
"grad_norm": 0.1884016990661621,
"learning_rate": 0.001,
"loss": 2.6436,
"num_input_tokens_seen": 8519680000,
"step": 32500
},
{
"epoch": 0.15502581776118868,
"eval_loss": 2.5313448905944824,
"eval_runtime": 50.7051,
"eval_samples_per_second": 98.609,
"eval_steps_per_second": 24.652,
"num_input_tokens_seen": 8519680000,
"step": 32500
},
{
"epoch": 0.15526431901928284,
"grad_norm": 0.22205407917499542,
"learning_rate": 0.001,
"loss": 2.6464,
"num_input_tokens_seen": 8532787200,
"step": 32550
},
{
"epoch": 0.15550282027737697,
"grad_norm": 0.18515361845493317,
"learning_rate": 0.001,
"loss": 2.642,
"num_input_tokens_seen": 8545894400,
"step": 32600
},
{
"epoch": 0.1557413215354711,
"grad_norm": 0.18903231620788574,
"learning_rate": 0.001,
"loss": 2.6446,
"num_input_tokens_seen": 8559001600,
"step": 32650
},
{
"epoch": 0.15597982279356523,
"grad_norm": 0.1857556253671646,
"learning_rate": 0.001,
"loss": 2.6561,
"num_input_tokens_seen": 8572108800,
"step": 32700
},
{
"epoch": 0.15621832405165936,
"grad_norm": 0.45706707239151,
"learning_rate": 0.001,
"loss": 2.6487,
"num_input_tokens_seen": 8585216000,
"step": 32750
},
{
"epoch": 0.15645682530975352,
"grad_norm": 0.20191136002540588,
"learning_rate": 0.001,
"loss": 2.6593,
"num_input_tokens_seen": 8598323200,
"step": 32800
},
{
"epoch": 0.15669532656784765,
"grad_norm": 0.21191105246543884,
"learning_rate": 0.001,
"loss": 2.659,
"num_input_tokens_seen": 8611430400,
"step": 32850
},
{
"epoch": 0.15693382782594179,
"grad_norm": 0.20596672594547272,
"learning_rate": 0.001,
"loss": 2.6354,
"num_input_tokens_seen": 8624537600,
"step": 32900
},
{
"epoch": 0.15717232908403592,
"grad_norm": 0.2952199876308441,
"learning_rate": 0.001,
"loss": 2.6501,
"num_input_tokens_seen": 8637644800,
"step": 32950
},
{
"epoch": 0.15741083034213005,
"grad_norm": 0.2217044234275818,
"learning_rate": 0.001,
"loss": 2.6495,
"num_input_tokens_seen": 8650752000,
"step": 33000
},
{
"epoch": 0.15741083034213005,
"eval_loss": 2.5319430828094482,
"eval_runtime": 50.8413,
"eval_samples_per_second": 98.345,
"eval_steps_per_second": 24.586,
"num_input_tokens_seen": 8650752000,
"step": 33000
},
{
"epoch": 0.15764933160022418,
"grad_norm": 0.2384626269340515,
"learning_rate": 0.001,
"loss": 2.6503,
"num_input_tokens_seen": 8663859200,
"step": 33050
},
{
"epoch": 0.15788783285831834,
"grad_norm": 0.18387843668460846,
"learning_rate": 0.001,
"loss": 2.6469,
"num_input_tokens_seen": 8676966400,
"step": 33100
},
{
"epoch": 0.15812633411641247,
"grad_norm": 0.23530641198158264,
"learning_rate": 0.001,
"loss": 2.6484,
"num_input_tokens_seen": 8690073600,
"step": 33150
},
{
"epoch": 0.1583648353745066,
"grad_norm": 0.2027565985918045,
"learning_rate": 0.001,
"loss": 2.6564,
"num_input_tokens_seen": 8703180800,
"step": 33200
},
{
"epoch": 0.15860333663260073,
"grad_norm": 0.21472220122814178,
"learning_rate": 0.001,
"loss": 2.6543,
"num_input_tokens_seen": 8716288000,
"step": 33250
},
{
"epoch": 0.15884183789069486,
"grad_norm": 0.19012615084648132,
"learning_rate": 0.001,
"loss": 2.6378,
"num_input_tokens_seen": 8729395200,
"step": 33300
},
{
"epoch": 0.159080339148789,
"grad_norm": 0.18018738925457,
"learning_rate": 0.001,
"loss": 2.6461,
"num_input_tokens_seen": 8742502400,
"step": 33350
},
{
"epoch": 0.15931884040688316,
"grad_norm": 0.20139184594154358,
"learning_rate": 0.001,
"loss": 2.6419,
"num_input_tokens_seen": 8755609600,
"step": 33400
},
{
"epoch": 0.1595573416649773,
"grad_norm": 0.20734767615795135,
"learning_rate": 0.001,
"loss": 2.6299,
"num_input_tokens_seen": 8768716800,
"step": 33450
},
{
"epoch": 0.15979584292307142,
"grad_norm": 0.18958640098571777,
"learning_rate": 0.001,
"loss": 2.6525,
"num_input_tokens_seen": 8781824000,
"step": 33500
},
{
"epoch": 0.15979584292307142,
"eval_loss": 2.5301430225372314,
"eval_runtime": 64.6928,
"eval_samples_per_second": 77.288,
"eval_steps_per_second": 19.322,
"num_input_tokens_seen": 8781824000,
"step": 33500
},
{
"epoch": 0.16003434418116555,
"grad_norm": 0.20421727001667023,
"learning_rate": 0.001,
"loss": 2.6445,
"num_input_tokens_seen": 8794931200,
"step": 33550
},
{
"epoch": 0.16027284543925968,
"grad_norm": 0.18347379565238953,
"learning_rate": 0.001,
"loss": 2.6525,
"num_input_tokens_seen": 8808038400,
"step": 33600
},
{
"epoch": 0.16051134669735384,
"grad_norm": 0.19450639188289642,
"learning_rate": 0.001,
"loss": 2.6356,
"num_input_tokens_seen": 8821145600,
"step": 33650
},
{
"epoch": 0.16074984795544797,
"grad_norm": 0.17953775823116302,
"learning_rate": 0.001,
"loss": 2.6424,
"num_input_tokens_seen": 8834252800,
"step": 33700
},
{
"epoch": 0.1609883492135421,
"grad_norm": 0.1990649551153183,
"learning_rate": 0.001,
"loss": 2.6608,
"num_input_tokens_seen": 8847360000,
"step": 33750
},
{
"epoch": 0.16122685047163623,
"grad_norm": 0.19343194365501404,
"learning_rate": 0.001,
"loss": 2.6604,
"num_input_tokens_seen": 8860467200,
"step": 33800
},
{
"epoch": 0.16146535172973037,
"grad_norm": 0.19385921955108643,
"learning_rate": 0.001,
"loss": 2.6354,
"num_input_tokens_seen": 8873574400,
"step": 33850
},
{
"epoch": 0.1617038529878245,
"grad_norm": 0.1828273981809616,
"learning_rate": 0.001,
"loss": 2.6578,
"num_input_tokens_seen": 8886681600,
"step": 33900
},
{
"epoch": 0.16194235424591866,
"grad_norm": 0.216063991189003,
"learning_rate": 0.001,
"loss": 2.6575,
"num_input_tokens_seen": 8899788800,
"step": 33950
},
{
"epoch": 0.1621808555040128,
"grad_norm": 0.20358648896217346,
"learning_rate": 0.001,
"loss": 2.6499,
"num_input_tokens_seen": 8912896000,
"step": 34000
},
{
"epoch": 0.1621808555040128,
"eval_loss": 2.5330910682678223,
"eval_runtime": 50.7961,
"eval_samples_per_second": 98.433,
"eval_steps_per_second": 24.608,
"num_input_tokens_seen": 8912896000,
"step": 34000
},
{
"epoch": 0.16241935676210692,
"grad_norm": 0.1935052126646042,
"learning_rate": 0.001,
"loss": 2.6583,
"num_input_tokens_seen": 8926003200,
"step": 34050
},
{
"epoch": 0.16265785802020105,
"grad_norm": 0.7825157642364502,
"learning_rate": 0.001,
"loss": 2.6481,
"num_input_tokens_seen": 8939110400,
"step": 34100
},
{
"epoch": 0.16289635927829518,
"grad_norm": 0.23290683329105377,
"learning_rate": 0.001,
"loss": 2.6925,
"num_input_tokens_seen": 8952217600,
"step": 34150
},
{
"epoch": 0.16313486053638934,
"grad_norm": 0.23564130067825317,
"learning_rate": 0.001,
"loss": 2.6495,
"num_input_tokens_seen": 8965324800,
"step": 34200
},
{
"epoch": 0.16337336179448347,
"grad_norm": 0.19592130184173584,
"learning_rate": 0.001,
"loss": 2.6536,
"num_input_tokens_seen": 8978432000,
"step": 34250
},
{
"epoch": 0.1636118630525776,
"grad_norm": 0.23535041511058807,
"learning_rate": 0.001,
"loss": 2.6608,
"num_input_tokens_seen": 8991539200,
"step": 34300
},
{
"epoch": 0.16385036431067174,
"grad_norm": 0.1991938352584839,
"learning_rate": 0.001,
"loss": 2.6458,
"num_input_tokens_seen": 9004641856,
"step": 34350
},
{
"epoch": 0.16408886556876587,
"grad_norm": 0.19363388419151306,
"learning_rate": 0.001,
"loss": 2.6531,
"num_input_tokens_seen": 9017749056,
"step": 34400
},
{
"epoch": 0.16432736682686,
"grad_norm": 0.18500390648841858,
"learning_rate": 0.001,
"loss": 2.6391,
"num_input_tokens_seen": 9030856256,
"step": 34450
},
{
"epoch": 0.16456586808495416,
"grad_norm": 0.2774065434932709,
"learning_rate": 0.001,
"loss": 2.6619,
"num_input_tokens_seen": 9043963456,
"step": 34500
},
{
"epoch": 0.16456586808495416,
"eval_loss": 2.5325100421905518,
"eval_runtime": 51.5954,
"eval_samples_per_second": 96.908,
"eval_steps_per_second": 24.227,
"num_input_tokens_seen": 9043963456,
"step": 34500
},
{
"epoch": 0.1648043693430483,
"grad_norm": 0.1957511603832245,
"learning_rate": 0.001,
"loss": 2.6456,
"num_input_tokens_seen": 9057070656,
"step": 34550
},
{
"epoch": 0.16504287060114242,
"grad_norm": 0.20958378911018372,
"learning_rate": 0.001,
"loss": 2.6452,
"num_input_tokens_seen": 9070177856,
"step": 34600
},
{
"epoch": 0.16528137185923655,
"grad_norm": 0.206208735704422,
"learning_rate": 0.001,
"loss": 2.6548,
"num_input_tokens_seen": 9083285056,
"step": 34650
},
{
"epoch": 0.16551987311733068,
"grad_norm": 0.22349481284618378,
"learning_rate": 0.001,
"loss": 2.6653,
"num_input_tokens_seen": 9096392256,
"step": 34700
},
{
"epoch": 0.16575837437542484,
"grad_norm": 0.22599968314170837,
"learning_rate": 0.001,
"loss": 2.6329,
"num_input_tokens_seen": 9109499456,
"step": 34750
},
{
"epoch": 0.16599687563351898,
"grad_norm": 0.19219790399074554,
"learning_rate": 0.001,
"loss": 2.6404,
"num_input_tokens_seen": 9122606656,
"step": 34800
},
{
"epoch": 0.1662353768916131,
"grad_norm": 0.2006351351737976,
"learning_rate": 0.001,
"loss": 2.6522,
"num_input_tokens_seen": 9135713856,
"step": 34850
},
{
"epoch": 0.16647387814970724,
"grad_norm": 0.18393316864967346,
"learning_rate": 0.001,
"loss": 2.6464,
"num_input_tokens_seen": 9148821056,
"step": 34900
},
{
"epoch": 0.16671237940780137,
"grad_norm": 0.19820146262645721,
"learning_rate": 0.001,
"loss": 2.6402,
"num_input_tokens_seen": 9161928256,
"step": 34950
},
{
"epoch": 0.1669508806658955,
"grad_norm": 0.1995670199394226,
"learning_rate": 0.001,
"loss": 2.652,
"num_input_tokens_seen": 9175035456,
"step": 35000
},
{
"epoch": 0.1669508806658955,
"eval_loss": 2.5248045921325684,
"eval_runtime": 50.8205,
"eval_samples_per_second": 98.386,
"eval_steps_per_second": 24.596,
"num_input_tokens_seen": 9175035456,
"step": 35000
},
{
"epoch": 0.16718938192398966,
"grad_norm": 0.2099646031856537,
"learning_rate": 0.001,
"loss": 2.6307,
"num_input_tokens_seen": 9188142656,
"step": 35050
},
{
"epoch": 0.1674278831820838,
"grad_norm": 0.18913927674293518,
"learning_rate": 0.001,
"loss": 2.6368,
"num_input_tokens_seen": 9201249856,
"step": 35100
},
{
"epoch": 0.16766638444017792,
"grad_norm": 0.19193056225776672,
"learning_rate": 0.001,
"loss": 2.6325,
"num_input_tokens_seen": 9214357056,
"step": 35150
},
{
"epoch": 0.16790488569827206,
"grad_norm": 0.19911837577819824,
"learning_rate": 0.001,
"loss": 2.6543,
"num_input_tokens_seen": 9227464256,
"step": 35200
},
{
"epoch": 0.1681433869563662,
"grad_norm": 0.1985558718442917,
"learning_rate": 0.001,
"loss": 2.6518,
"num_input_tokens_seen": 9240571456,
"step": 35250
},
{
"epoch": 0.16838188821446032,
"grad_norm": 0.2079145759344101,
"learning_rate": 0.001,
"loss": 2.646,
"num_input_tokens_seen": 9253678656,
"step": 35300
},
{
"epoch": 0.16862038947255448,
"grad_norm": 0.18524424731731415,
"learning_rate": 0.001,
"loss": 2.6378,
"num_input_tokens_seen": 9266785856,
"step": 35350
},
{
"epoch": 0.1688588907306486,
"grad_norm": 0.19140370190143585,
"learning_rate": 0.001,
"loss": 2.6488,
"num_input_tokens_seen": 9279893056,
"step": 35400
},
{
"epoch": 0.16909739198874274,
"grad_norm": 0.18006138503551483,
"learning_rate": 0.001,
"loss": 2.6632,
"num_input_tokens_seen": 9293000256,
"step": 35450
},
{
"epoch": 0.16933589324683687,
"grad_norm": 0.18754282593727112,
"learning_rate": 0.001,
"loss": 2.6436,
"num_input_tokens_seen": 9306107456,
"step": 35500
},
{
"epoch": 0.16933589324683687,
"eval_loss": 2.5230932235717773,
"eval_runtime": 50.9895,
"eval_samples_per_second": 98.059,
"eval_steps_per_second": 24.515,
"num_input_tokens_seen": 9306107456,
"step": 35500
},
{
"epoch": 0.169574394504931,
"grad_norm": 0.18708109855651855,
"learning_rate": 0.001,
"loss": 2.6509,
"num_input_tokens_seen": 9319214656,
"step": 35550
},
{
"epoch": 0.16981289576302516,
"grad_norm": 0.2019611895084381,
"learning_rate": 0.001,
"loss": 2.6333,
"num_input_tokens_seen": 9332321856,
"step": 35600
},
{
"epoch": 0.1700513970211193,
"grad_norm": 0.22504755854606628,
"learning_rate": 0.001,
"loss": 2.6359,
"num_input_tokens_seen": 9345429056,
"step": 35650
},
{
"epoch": 0.17028989827921343,
"grad_norm": 0.1972053200006485,
"learning_rate": 0.001,
"loss": 2.6362,
"num_input_tokens_seen": 9358536256,
"step": 35700
},
{
"epoch": 0.17052839953730756,
"grad_norm": 0.21156789362430573,
"learning_rate": 0.001,
"loss": 2.637,
"num_input_tokens_seen": 9371643456,
"step": 35750
},
{
"epoch": 0.1707669007954017,
"grad_norm": 0.2680750787258148,
"learning_rate": 0.001,
"loss": 2.6332,
"num_input_tokens_seen": 9384750656,
"step": 35800
},
{
"epoch": 0.17100540205349582,
"grad_norm": 0.24413707852363586,
"learning_rate": 0.001,
"loss": 2.6366,
"num_input_tokens_seen": 9397857856,
"step": 35850
},
{
"epoch": 0.17124390331158998,
"grad_norm": 0.19973772764205933,
"learning_rate": 0.001,
"loss": 2.6381,
"num_input_tokens_seen": 9410965056,
"step": 35900
},
{
"epoch": 0.1714824045696841,
"grad_norm": 0.20807349681854248,
"learning_rate": 0.001,
"loss": 2.6416,
"num_input_tokens_seen": 9424072256,
"step": 35950
},
{
"epoch": 0.17172090582777824,
"grad_norm": 0.20126542448997498,
"learning_rate": 0.001,
"loss": 2.6377,
"num_input_tokens_seen": 9437179456,
"step": 36000
},
{
"epoch": 0.17172090582777824,
"eval_loss": 2.52500581741333,
"eval_runtime": 51.4353,
"eval_samples_per_second": 97.21,
"eval_steps_per_second": 24.302,
"num_input_tokens_seen": 9437179456,
"step": 36000
},
{
"epoch": 0.17195940708587237,
"grad_norm": 0.19696597754955292,
"learning_rate": 0.001,
"loss": 2.6521,
"num_input_tokens_seen": 9450286656,
"step": 36050
},
{
"epoch": 0.1721979083439665,
"grad_norm": 0.18839424848556519,
"learning_rate": 0.001,
"loss": 2.6484,
"num_input_tokens_seen": 9463393856,
"step": 36100
},
{
"epoch": 0.17243640960206066,
"grad_norm": 0.33748558163642883,
"learning_rate": 0.001,
"loss": 2.6496,
"num_input_tokens_seen": 9476501056,
"step": 36150
},
{
"epoch": 0.1726749108601548,
"grad_norm": 0.19529207050800323,
"learning_rate": 0.001,
"loss": 2.6484,
"num_input_tokens_seen": 9489608256,
"step": 36200
},
{
"epoch": 0.17291341211824893,
"grad_norm": 0.21542242169380188,
"learning_rate": 0.001,
"loss": 2.6572,
"num_input_tokens_seen": 9502715456,
"step": 36250
},
{
"epoch": 0.17315191337634306,
"grad_norm": 0.37017494440078735,
"learning_rate": 0.001,
"loss": 2.6517,
"num_input_tokens_seen": 9515822656,
"step": 36300
},
{
"epoch": 0.1733904146344372,
"grad_norm": 0.27284151315689087,
"learning_rate": 0.001,
"loss": 2.66,
"num_input_tokens_seen": 9528929856,
"step": 36350
},
{
"epoch": 0.17362891589253132,
"grad_norm": 0.4666242003440857,
"learning_rate": 0.001,
"loss": 2.6514,
"num_input_tokens_seen": 9542037056,
"step": 36400
},
{
"epoch": 0.17386741715062548,
"grad_norm": 0.2031467854976654,
"learning_rate": 0.001,
"loss": 2.6577,
"num_input_tokens_seen": 9555144256,
"step": 36450
},
{
"epoch": 0.1741059184087196,
"grad_norm": 0.2086576223373413,
"learning_rate": 0.001,
"loss": 2.6372,
"num_input_tokens_seen": 9568251456,
"step": 36500
},
{
"epoch": 0.1741059184087196,
"eval_loss": 2.5252223014831543,
"eval_runtime": 51.1282,
"eval_samples_per_second": 97.793,
"eval_steps_per_second": 24.448,
"num_input_tokens_seen": 9568251456,
"step": 36500
},
{
"epoch": 0.17434441966681374,
"grad_norm": 0.19739161431789398,
"learning_rate": 0.001,
"loss": 2.6184,
"num_input_tokens_seen": 9581358656,
"step": 36550
},
{
"epoch": 0.17458292092490788,
"grad_norm": 0.22384846210479736,
"learning_rate": 0.001,
"loss": 2.6504,
"num_input_tokens_seen": 9594465856,
"step": 36600
},
{
"epoch": 0.174821422183002,
"grad_norm": 0.2055511772632599,
"learning_rate": 0.001,
"loss": 2.6333,
"num_input_tokens_seen": 9607573056,
"step": 36650
},
{
"epoch": 0.17505992344109614,
"grad_norm": 0.18193551898002625,
"learning_rate": 0.001,
"loss": 2.6518,
"num_input_tokens_seen": 9620680256,
"step": 36700
},
{
"epoch": 0.1752984246991903,
"grad_norm": 0.1968860775232315,
"learning_rate": 0.001,
"loss": 2.6211,
"num_input_tokens_seen": 9633787456,
"step": 36750
},
{
"epoch": 0.17553692595728443,
"grad_norm": 0.20429988205432892,
"learning_rate": 0.001,
"loss": 2.6269,
"num_input_tokens_seen": 9646894656,
"step": 36800
},
{
"epoch": 0.17577542721537856,
"grad_norm": 0.18364110589027405,
"learning_rate": 0.001,
"loss": 2.6337,
"num_input_tokens_seen": 9660001856,
"step": 36850
},
{
"epoch": 0.1760139284734727,
"grad_norm": 0.2051621973514557,
"learning_rate": 0.001,
"loss": 2.6297,
"num_input_tokens_seen": 9673109056,
"step": 36900
},
{
"epoch": 0.17625242973156682,
"grad_norm": 0.25841349363327026,
"learning_rate": 0.001,
"loss": 2.6678,
"num_input_tokens_seen": 9686216256,
"step": 36950
},
{
"epoch": 0.17649093098966098,
"grad_norm": 0.198688805103302,
"learning_rate": 0.001,
"loss": 2.6521,
"num_input_tokens_seen": 9699323456,
"step": 37000
},
{
"epoch": 0.17649093098966098,
"eval_loss": 2.5248863697052,
"eval_runtime": 51.363,
"eval_samples_per_second": 97.346,
"eval_steps_per_second": 24.337,
"num_input_tokens_seen": 9699323456,
"step": 37000
},
{
"epoch": 0.1767294322477551,
"grad_norm": 0.2030065804719925,
"learning_rate": 0.001,
"loss": 2.6481,
"num_input_tokens_seen": 9712430656,
"step": 37050
},
{
"epoch": 0.17696793350584925,
"grad_norm": 0.20191729068756104,
"learning_rate": 0.001,
"loss": 2.6332,
"num_input_tokens_seen": 9725537856,
"step": 37100
},
{
"epoch": 0.17720643476394338,
"grad_norm": 0.19462484121322632,
"learning_rate": 0.001,
"loss": 2.6444,
"num_input_tokens_seen": 9738645056,
"step": 37150
},
{
"epoch": 0.1774449360220375,
"grad_norm": 0.27893325686454773,
"learning_rate": 0.001,
"loss": 2.6309,
"num_input_tokens_seen": 9751752256,
"step": 37200
},
{
"epoch": 0.17768343728013164,
"grad_norm": 0.20646531879901886,
"learning_rate": 0.001,
"loss": 2.646,
"num_input_tokens_seen": 9764859456,
"step": 37250
},
{
"epoch": 0.1779219385382258,
"grad_norm": 0.20815566182136536,
"learning_rate": 0.001,
"loss": 2.6374,
"num_input_tokens_seen": 9777966656,
"step": 37300
},
{
"epoch": 0.17816043979631993,
"grad_norm": 0.2194615602493286,
"learning_rate": 0.001,
"loss": 2.6313,
"num_input_tokens_seen": 9791073856,
"step": 37350
},
{
"epoch": 0.17839894105441406,
"grad_norm": 0.23223313689231873,
"learning_rate": 0.001,
"loss": 2.6435,
"num_input_tokens_seen": 9804181056,
"step": 37400
},
{
"epoch": 0.1786374423125082,
"grad_norm": 0.1731143593788147,
"learning_rate": 0.001,
"loss": 2.6397,
"num_input_tokens_seen": 9817288256,
"step": 37450
},
{
"epoch": 0.17887594357060232,
"grad_norm": 0.1929951161146164,
"learning_rate": 0.001,
"loss": 2.6406,
"num_input_tokens_seen": 9830395456,
"step": 37500
},
{
"epoch": 0.17887594357060232,
"eval_loss": 2.518928050994873,
"eval_runtime": 51.8733,
"eval_samples_per_second": 96.389,
"eval_steps_per_second": 24.097,
"num_input_tokens_seen": 9830395456,
"step": 37500
},
{
"epoch": 0.17911444482869648,
"grad_norm": 0.19979524612426758,
"learning_rate": 0.001,
"loss": 2.6363,
"num_input_tokens_seen": 9843502656,
"step": 37550
},
{
"epoch": 0.17935294608679062,
"grad_norm": 0.17963503301143646,
"learning_rate": 0.001,
"loss": 2.6423,
"num_input_tokens_seen": 9856609856,
"step": 37600
},
{
"epoch": 0.17959144734488475,
"grad_norm": 0.18216437101364136,
"learning_rate": 0.001,
"loss": 2.6351,
"num_input_tokens_seen": 9869717056,
"step": 37650
},
{
"epoch": 0.17982994860297888,
"grad_norm": 0.16782627999782562,
"learning_rate": 0.001,
"loss": 2.623,
"num_input_tokens_seen": 9882824256,
"step": 37700
},
{
"epoch": 0.180068449861073,
"grad_norm": 0.21884289383888245,
"learning_rate": 0.001,
"loss": 2.6418,
"num_input_tokens_seen": 9895931456,
"step": 37750
},
{
"epoch": 0.18030695111916714,
"grad_norm": 0.18940453231334686,
"learning_rate": 0.001,
"loss": 2.6371,
"num_input_tokens_seen": 9909038656,
"step": 37800
},
{
"epoch": 0.1805454523772613,
"grad_norm": 0.2075282484292984,
"learning_rate": 0.001,
"loss": 2.6347,
"num_input_tokens_seen": 9922145856,
"step": 37850
},
{
"epoch": 0.18078395363535543,
"grad_norm": 0.18504877388477325,
"learning_rate": 0.001,
"loss": 2.6391,
"num_input_tokens_seen": 9935253056,
"step": 37900
},
{
"epoch": 0.18102245489344956,
"grad_norm": 0.17926527559757233,
"learning_rate": 0.001,
"loss": 2.6358,
"num_input_tokens_seen": 9948360256,
"step": 37950
},
{
"epoch": 0.1812609561515437,
"grad_norm": 0.20022514462471008,
"learning_rate": 0.001,
"loss": 2.6369,
"num_input_tokens_seen": 9961467456,
"step": 38000
},
{
"epoch": 0.1812609561515437,
"eval_loss": 2.5171313285827637,
"eval_runtime": 51.617,
"eval_samples_per_second": 96.867,
"eval_steps_per_second": 24.217,
"num_input_tokens_seen": 9961467456,
"step": 38000
},
{
"epoch": 0.18149945740963783,
"grad_norm": 0.19376301765441895,
"learning_rate": 0.001,
"loss": 2.6274,
"num_input_tokens_seen": 9974574656,
"step": 38050
},
{
"epoch": 0.18173795866773199,
"grad_norm": 0.2077150195837021,
"learning_rate": 0.001,
"loss": 2.6303,
"num_input_tokens_seen": 9987681856,
"step": 38100
},
{
"epoch": 0.18197645992582612,
"grad_norm": 0.19407787919044495,
"learning_rate": 0.001,
"loss": 2.6246,
"num_input_tokens_seen": 10000789056,
"step": 38150
},
{
"epoch": 0.18221496118392025,
"grad_norm": 0.20558005571365356,
"learning_rate": 0.001,
"loss": 2.6291,
"num_input_tokens_seen": 10013896256,
"step": 38200
},
{
"epoch": 0.18245346244201438,
"grad_norm": 0.22928735613822937,
"learning_rate": 0.001,
"loss": 2.6336,
"num_input_tokens_seen": 10027003456,
"step": 38250
},
{
"epoch": 0.1826919637001085,
"grad_norm": 0.23481298983097076,
"learning_rate": 0.001,
"loss": 2.6412,
"num_input_tokens_seen": 10040110656,
"step": 38300
},
{
"epoch": 0.18293046495820264,
"grad_norm": 0.19808940589427948,
"learning_rate": 0.001,
"loss": 2.6395,
"num_input_tokens_seen": 10053217856,
"step": 38350
},
{
"epoch": 0.1831689662162968,
"grad_norm": 0.20152992010116577,
"learning_rate": 0.001,
"loss": 2.6224,
"num_input_tokens_seen": 10066325056,
"step": 38400
},
{
"epoch": 0.18340746747439093,
"grad_norm": 0.18065959215164185,
"learning_rate": 0.001,
"loss": 2.626,
"num_input_tokens_seen": 10079432256,
"step": 38450
},
{
"epoch": 0.18364596873248507,
"grad_norm": 0.20382963120937347,
"learning_rate": 0.001,
"loss": 2.6382,
"num_input_tokens_seen": 10092539456,
"step": 38500
},
{
"epoch": 0.18364596873248507,
"eval_loss": 2.5152854919433594,
"eval_runtime": 51.2566,
"eval_samples_per_second": 97.548,
"eval_steps_per_second": 24.387,
"num_input_tokens_seen": 10092539456,
"step": 38500
},
{
"epoch": 0.1838844699905792,
"grad_norm": 0.17728358507156372,
"learning_rate": 0.001,
"loss": 2.6293,
"num_input_tokens_seen": 10105646656,
"step": 38550
},
{
"epoch": 0.18412297124867333,
"grad_norm": 0.20164869725704193,
"learning_rate": 0.001,
"loss": 2.6372,
"num_input_tokens_seen": 10118753856,
"step": 38600
},
{
"epoch": 0.18436147250676746,
"grad_norm": 0.20125731825828552,
"learning_rate": 0.001,
"loss": 2.6326,
"num_input_tokens_seen": 10131861056,
"step": 38650
},
{
"epoch": 0.18459997376486162,
"grad_norm": 0.21193954348564148,
"learning_rate": 0.001,
"loss": 2.6249,
"num_input_tokens_seen": 10144968256,
"step": 38700
},
{
"epoch": 0.18483847502295575,
"grad_norm": 0.1925983726978302,
"learning_rate": 0.001,
"loss": 2.6424,
"num_input_tokens_seen": 10158075456,
"step": 38750
},
{
"epoch": 0.18507697628104988,
"grad_norm": 0.19814860820770264,
"learning_rate": 0.001,
"loss": 2.6431,
"num_input_tokens_seen": 10171182656,
"step": 38800
},
{
"epoch": 0.185315477539144,
"grad_norm": 0.1909031718969345,
"learning_rate": 0.001,
"loss": 2.6068,
"num_input_tokens_seen": 10184289856,
"step": 38850
},
{
"epoch": 0.18555397879723814,
"grad_norm": 0.20779775083065033,
"learning_rate": 0.001,
"loss": 2.625,
"num_input_tokens_seen": 10197397056,
"step": 38900
},
{
"epoch": 0.1857924800553323,
"grad_norm": 0.1768522411584854,
"learning_rate": 0.001,
"loss": 2.6112,
"num_input_tokens_seen": 10210504256,
"step": 38950
},
{
"epoch": 0.18603098131342644,
"grad_norm": 0.20275786519050598,
"learning_rate": 0.001,
"loss": 2.6284,
"num_input_tokens_seen": 10223611456,
"step": 39000
},
{
"epoch": 0.18603098131342644,
"eval_loss": 2.5149083137512207,
"eval_runtime": 51.6703,
"eval_samples_per_second": 96.767,
"eval_steps_per_second": 24.192,
"num_input_tokens_seen": 10223611456,
"step": 39000
},
{
"epoch": 0.18626948257152057,
"grad_norm": 0.19634057581424713,
"learning_rate": 0.001,
"loss": 2.6342,
"num_input_tokens_seen": 10236718656,
"step": 39050
},
{
"epoch": 0.1865079838296147,
"grad_norm": 0.19488537311553955,
"learning_rate": 0.001,
"loss": 2.6305,
"num_input_tokens_seen": 10249825856,
"step": 39100
},
{
"epoch": 0.18674648508770883,
"grad_norm": 0.2082369476556778,
"learning_rate": 0.001,
"loss": 2.6067,
"num_input_tokens_seen": 10262933056,
"step": 39150
},
{
"epoch": 0.18698498634580296,
"grad_norm": 0.21019776165485382,
"learning_rate": 0.001,
"loss": 2.628,
"num_input_tokens_seen": 10276040256,
"step": 39200
},
{
"epoch": 0.18722348760389712,
"grad_norm": 0.19929739832878113,
"learning_rate": 0.001,
"loss": 2.6256,
"num_input_tokens_seen": 10289147456,
"step": 39250
},
{
"epoch": 0.18746198886199125,
"grad_norm": 0.204230397939682,
"learning_rate": 0.001,
"loss": 2.6113,
"num_input_tokens_seen": 10302254656,
"step": 39300
},
{
"epoch": 0.18770049012008538,
"grad_norm": 0.2217213660478592,
"learning_rate": 0.001,
"loss": 2.6253,
"num_input_tokens_seen": 10315361856,
"step": 39350
},
{
"epoch": 0.18793899137817952,
"grad_norm": 0.19329366087913513,
"learning_rate": 0.001,
"loss": 2.6317,
"num_input_tokens_seen": 10328469056,
"step": 39400
},
{
"epoch": 0.18817749263627365,
"grad_norm": 0.18244336545467377,
"learning_rate": 0.001,
"loss": 2.6476,
"num_input_tokens_seen": 10341576256,
"step": 39450
},
{
"epoch": 0.1884159938943678,
"grad_norm": 0.1864692121744156,
"learning_rate": 0.001,
"loss": 2.642,
"num_input_tokens_seen": 10354683456,
"step": 39500
},
{
"epoch": 0.1884159938943678,
"eval_loss": 2.514141321182251,
"eval_runtime": 51.1111,
"eval_samples_per_second": 97.826,
"eval_steps_per_second": 24.457,
"num_input_tokens_seen": 10354683456,
"step": 39500
},
{
"epoch": 0.18865449515246194,
"grad_norm": 0.25003623962402344,
"learning_rate": 0.001,
"loss": 2.6299,
"num_input_tokens_seen": 10367790656,
"step": 39550
},
{
"epoch": 0.18889299641055607,
"grad_norm": 0.19642098248004913,
"learning_rate": 0.001,
"loss": 2.6412,
"num_input_tokens_seen": 10380897856,
"step": 39600
},
{
"epoch": 0.1891314976686502,
"grad_norm": 0.21947956085205078,
"learning_rate": 0.001,
"loss": 2.6211,
"num_input_tokens_seen": 10394005056,
"step": 39650
},
{
"epoch": 0.18936999892674433,
"grad_norm": 0.19838476181030273,
"learning_rate": 0.001,
"loss": 2.6451,
"num_input_tokens_seen": 10407112256,
"step": 39700
},
{
"epoch": 0.18960850018483846,
"grad_norm": 0.21131113171577454,
"learning_rate": 0.001,
"loss": 2.6375,
"num_input_tokens_seen": 10420219456,
"step": 39750
},
{
"epoch": 0.18984700144293262,
"grad_norm": 0.17576864361763,
"learning_rate": 0.001,
"loss": 2.6325,
"num_input_tokens_seen": 10433326656,
"step": 39800
},
{
"epoch": 0.19008550270102675,
"grad_norm": 0.2113037258386612,
"learning_rate": 0.001,
"loss": 2.6254,
"num_input_tokens_seen": 10446433856,
"step": 39850
},
{
"epoch": 0.19032400395912089,
"grad_norm": 0.1972583681344986,
"learning_rate": 0.001,
"loss": 2.6277,
"num_input_tokens_seen": 10459541056,
"step": 39900
},
{
"epoch": 0.19056250521721502,
"grad_norm": 0.43353378772735596,
"learning_rate": 0.001,
"loss": 2.6295,
"num_input_tokens_seen": 10472648256,
"step": 39950
},
{
"epoch": 0.19080100647530915,
"grad_norm": 0.22195081412792206,
"learning_rate": 0.001,
"loss": 2.6422,
"num_input_tokens_seen": 10485755456,
"step": 40000
},
{
"epoch": 0.19080100647530915,
"eval_loss": 2.5144667625427246,
"eval_runtime": 51.0006,
"eval_samples_per_second": 98.038,
"eval_steps_per_second": 24.51,
"num_input_tokens_seen": 10485755456,
"step": 40000
},
{
"epoch": 0.19103950773340328,
"grad_norm": 0.18717694282531738,
"learning_rate": 0.001,
"loss": 2.6512,
"num_input_tokens_seen": 10498862656,
"step": 40050
},
{
"epoch": 0.19127800899149744,
"grad_norm": 0.2009858638048172,
"learning_rate": 0.001,
"loss": 2.6289,
"num_input_tokens_seen": 10511969856,
"step": 40100
},
{
"epoch": 0.19151651024959157,
"grad_norm": 0.2515949010848999,
"learning_rate": 0.001,
"loss": 2.6342,
"num_input_tokens_seen": 10525077056,
"step": 40150
},
{
"epoch": 0.1917550115076857,
"grad_norm": 0.19864948093891144,
"learning_rate": 0.001,
"loss": 2.6191,
"num_input_tokens_seen": 10538184256,
"step": 40200
},
{
"epoch": 0.19199351276577983,
"grad_norm": 0.17704185843467712,
"learning_rate": 0.001,
"loss": 2.6176,
"num_input_tokens_seen": 10551291456,
"step": 40250
},
{
"epoch": 0.19223201402387396,
"grad_norm": 0.2097242772579193,
"learning_rate": 0.001,
"loss": 2.6509,
"num_input_tokens_seen": 10564398656,
"step": 40300
},
{
"epoch": 0.19247051528196812,
"grad_norm": 0.18630579113960266,
"learning_rate": 0.001,
"loss": 2.6273,
"num_input_tokens_seen": 10577505856,
"step": 40350
},
{
"epoch": 0.19270901654006226,
"grad_norm": 0.24162743985652924,
"learning_rate": 0.001,
"loss": 2.6405,
"num_input_tokens_seen": 10590613056,
"step": 40400
},
{
"epoch": 0.1929475177981564,
"grad_norm": 0.19576874375343323,
"learning_rate": 0.001,
"loss": 2.6403,
"num_input_tokens_seen": 10603720256,
"step": 40450
},
{
"epoch": 0.19318601905625052,
"grad_norm": 0.18408045172691345,
"learning_rate": 0.001,
"loss": 2.6149,
"num_input_tokens_seen": 10616827456,
"step": 40500
},
{
"epoch": 0.19318601905625052,
"eval_loss": 2.511899709701538,
"eval_runtime": 51.5326,
"eval_samples_per_second": 97.026,
"eval_steps_per_second": 24.257,
"num_input_tokens_seen": 10616827456,
"step": 40500
},
{
"epoch": 0.19342452031434465,
"grad_norm": 0.20845313370227814,
"learning_rate": 0.001,
"loss": 2.6242,
"num_input_tokens_seen": 10629934656,
"step": 40550
},
{
"epoch": 0.19366302157243878,
"grad_norm": 0.20603816211223602,
"learning_rate": 0.001,
"loss": 2.6305,
"num_input_tokens_seen": 10643041856,
"step": 40600
},
{
"epoch": 0.19390152283053294,
"grad_norm": 0.2180013507604599,
"learning_rate": 0.001,
"loss": 2.6271,
"num_input_tokens_seen": 10656149056,
"step": 40650
},
{
"epoch": 0.19414002408862707,
"grad_norm": 0.22217005491256714,
"learning_rate": 0.001,
"loss": 2.6407,
"num_input_tokens_seen": 10669256256,
"step": 40700
},
{
"epoch": 0.1943785253467212,
"grad_norm": 0.21379347145557404,
"learning_rate": 0.001,
"loss": 2.6209,
"num_input_tokens_seen": 10682363456,
"step": 40750
},
{
"epoch": 0.19461702660481534,
"grad_norm": 0.2011626958847046,
"learning_rate": 0.001,
"loss": 2.6471,
"num_input_tokens_seen": 10695470656,
"step": 40800
},
{
"epoch": 0.19485552786290947,
"grad_norm": 0.1946493685245514,
"learning_rate": 0.001,
"loss": 2.6267,
"num_input_tokens_seen": 10708577856,
"step": 40850
},
{
"epoch": 0.19509402912100363,
"grad_norm": 0.19157454371452332,
"learning_rate": 0.001,
"loss": 2.6362,
"num_input_tokens_seen": 10721685056,
"step": 40900
},
{
"epoch": 0.19533253037909776,
"grad_norm": 0.1978122442960739,
"learning_rate": 0.001,
"loss": 2.6448,
"num_input_tokens_seen": 10734792256,
"step": 40950
},
{
"epoch": 0.1955710316371919,
"grad_norm": 0.19996555149555206,
"learning_rate": 0.001,
"loss": 2.626,
"num_input_tokens_seen": 10747899456,
"step": 41000
},
{
"epoch": 0.1955710316371919,
"eval_loss": 2.5084941387176514,
"eval_runtime": 51.6987,
"eval_samples_per_second": 96.714,
"eval_steps_per_second": 24.179,
"num_input_tokens_seen": 10747899456,
"step": 41000
},
{
"epoch": 0.19580953289528602,
"grad_norm": 0.20298945903778076,
"learning_rate": 0.001,
"loss": 2.6233,
"num_input_tokens_seen": 10761006656,
"step": 41050
},
{
"epoch": 0.19604803415338015,
"grad_norm": 0.2280716896057129,
"learning_rate": 0.001,
"loss": 2.6427,
"num_input_tokens_seen": 10774113856,
"step": 41100
},
{
"epoch": 0.19628653541147428,
"grad_norm": 0.19223643839359283,
"learning_rate": 0.001,
"loss": 2.6263,
"num_input_tokens_seen": 10787221056,
"step": 41150
},
{
"epoch": 0.19652503666956844,
"grad_norm": 0.19221842288970947,
"learning_rate": 0.001,
"loss": 2.6401,
"num_input_tokens_seen": 10800328256,
"step": 41200
},
{
"epoch": 0.19676353792766257,
"grad_norm": 0.19479979574680328,
"learning_rate": 0.001,
"loss": 2.6269,
"num_input_tokens_seen": 10813435456,
"step": 41250
},
{
"epoch": 0.1970020391857567,
"grad_norm": 0.24501195549964905,
"learning_rate": 0.001,
"loss": 2.618,
"num_input_tokens_seen": 10826542656,
"step": 41300
},
{
"epoch": 0.19724054044385084,
"grad_norm": 0.1994044929742813,
"learning_rate": 0.001,
"loss": 2.64,
"num_input_tokens_seen": 10839649856,
"step": 41350
},
{
"epoch": 0.19747904170194497,
"grad_norm": 0.20831650495529175,
"learning_rate": 0.001,
"loss": 2.6513,
"num_input_tokens_seen": 10852757056,
"step": 41400
},
{
"epoch": 0.19771754296003913,
"grad_norm": 0.21919438242912292,
"learning_rate": 0.001,
"loss": 2.6379,
"num_input_tokens_seen": 10865864256,
"step": 41450
},
{
"epoch": 0.19795604421813326,
"grad_norm": 0.23088768124580383,
"learning_rate": 0.001,
"loss": 2.6449,
"num_input_tokens_seen": 10878971456,
"step": 41500
},
{
"epoch": 0.19795604421813326,
"eval_loss": 2.5156567096710205,
"eval_runtime": 51.6776,
"eval_samples_per_second": 96.754,
"eval_steps_per_second": 24.188,
"num_input_tokens_seen": 10878971456,
"step": 41500
},
{
"epoch": 0.1981945454762274,
"grad_norm": 0.1982518881559372,
"learning_rate": 0.001,
"loss": 2.6304,
"num_input_tokens_seen": 10892078656,
"step": 41550
},
{
"epoch": 0.19843304673432152,
"grad_norm": 0.2099853903055191,
"learning_rate": 0.001,
"loss": 2.6305,
"num_input_tokens_seen": 10905185856,
"step": 41600
},
{
"epoch": 0.19867154799241565,
"grad_norm": 0.19403131306171417,
"learning_rate": 0.001,
"loss": 2.6419,
"num_input_tokens_seen": 10918293056,
"step": 41650
},
{
"epoch": 0.19891004925050979,
"grad_norm": 0.20865993201732635,
"learning_rate": 0.001,
"loss": 2.6116,
"num_input_tokens_seen": 10931400256,
"step": 41700
},
{
"epoch": 0.19914855050860394,
"grad_norm": 0.19042626023292542,
"learning_rate": 0.001,
"loss": 2.6271,
"num_input_tokens_seen": 10944507456,
"step": 41750
},
{
"epoch": 0.19938705176669808,
"grad_norm": 0.20514579117298126,
"learning_rate": 0.001,
"loss": 2.6348,
"num_input_tokens_seen": 10957614656,
"step": 41800
},
{
"epoch": 0.1996255530247922,
"grad_norm": 0.21224668622016907,
"learning_rate": 0.001,
"loss": 2.6314,
"num_input_tokens_seen": 10970721856,
"step": 41850
},
{
"epoch": 0.19986405428288634,
"grad_norm": 0.18857082724571228,
"learning_rate": 0.001,
"loss": 2.6217,
"num_input_tokens_seen": 10983829056,
"step": 41900
},
{
"epoch": 0.20010255554098047,
"grad_norm": 0.18431074917316437,
"learning_rate": 0.001,
"loss": 2.6267,
"num_input_tokens_seen": 10996936256,
"step": 41950
},
{
"epoch": 0.2003410567990746,
"grad_norm": 0.20570099353790283,
"learning_rate": 0.001,
"loss": 2.6016,
"num_input_tokens_seen": 11010043456,
"step": 42000
},
{
"epoch": 0.2003410567990746,
"eval_loss": 2.506241798400879,
"eval_runtime": 51.5548,
"eval_samples_per_second": 96.984,
"eval_steps_per_second": 24.246,
"num_input_tokens_seen": 11010043456,
"step": 42000
},
{
"epoch": 0.20057955805716876,
"grad_norm": 0.17952106893062592,
"learning_rate": 0.001,
"loss": 2.6165,
"num_input_tokens_seen": 11023150656,
"step": 42050
},
{
"epoch": 0.2008180593152629,
"grad_norm": 0.20292694866657257,
"learning_rate": 0.001,
"loss": 2.6357,
"num_input_tokens_seen": 11036257856,
"step": 42100
},
{
"epoch": 0.20105656057335702,
"grad_norm": 0.19588933885097504,
"learning_rate": 0.001,
"loss": 2.6102,
"num_input_tokens_seen": 11049365056,
"step": 42150
},
{
"epoch": 0.20129506183145116,
"grad_norm": 0.1982785314321518,
"learning_rate": 0.001,
"loss": 2.6019,
"num_input_tokens_seen": 11062472256,
"step": 42200
},
{
"epoch": 0.2015335630895453,
"grad_norm": 0.18049876391887665,
"learning_rate": 0.001,
"loss": 2.6081,
"num_input_tokens_seen": 11075579456,
"step": 42250
},
{
"epoch": 0.20177206434763945,
"grad_norm": 0.2069908082485199,
"learning_rate": 0.001,
"loss": 2.6173,
"num_input_tokens_seen": 11088686656,
"step": 42300
},
{
"epoch": 0.20201056560573358,
"grad_norm": 0.2415982335805893,
"learning_rate": 0.001,
"loss": 2.6173,
"num_input_tokens_seen": 11101793856,
"step": 42350
},
{
"epoch": 0.2022490668638277,
"grad_norm": 0.20267252624034882,
"learning_rate": 0.001,
"loss": 2.6299,
"num_input_tokens_seen": 11114901056,
"step": 42400
},
{
"epoch": 0.20248756812192184,
"grad_norm": 0.20683065056800842,
"learning_rate": 0.001,
"loss": 2.6282,
"num_input_tokens_seen": 11128008256,
"step": 42450
},
{
"epoch": 0.20272606938001597,
"grad_norm": 0.22137881815433502,
"learning_rate": 0.001,
"loss": 2.6271,
"num_input_tokens_seen": 11141115456,
"step": 42500
},
{
"epoch": 0.20272606938001597,
"eval_loss": 2.5125572681427,
"eval_runtime": 51.794,
"eval_samples_per_second": 96.536,
"eval_steps_per_second": 24.134,
"num_input_tokens_seen": 11141115456,
"step": 42500
},
{
"epoch": 0.2029645706381101,
"grad_norm": 0.20610037446022034,
"learning_rate": 0.001,
"loss": 2.6255,
"num_input_tokens_seen": 11154222656,
"step": 42550
},
{
"epoch": 0.20320307189620426,
"grad_norm": 0.21218810975551605,
"learning_rate": 0.001,
"loss": 2.6149,
"num_input_tokens_seen": 11167329856,
"step": 42600
},
{
"epoch": 0.2034415731542984,
"grad_norm": 0.19685466587543488,
"learning_rate": 0.001,
"loss": 2.6208,
"num_input_tokens_seen": 11180437056,
"step": 42650
},
{
"epoch": 0.20368007441239253,
"grad_norm": 0.20507460832595825,
"learning_rate": 0.001,
"loss": 2.6227,
"num_input_tokens_seen": 11193544256,
"step": 42700
},
{
"epoch": 0.20391857567048666,
"grad_norm": 0.20014505088329315,
"learning_rate": 0.001,
"loss": 2.6238,
"num_input_tokens_seen": 11206651456,
"step": 42750
},
{
"epoch": 0.2041570769285808,
"grad_norm": 0.1907282918691635,
"learning_rate": 0.001,
"loss": 2.6157,
"num_input_tokens_seen": 11219758656,
"step": 42800
},
{
"epoch": 0.20439557818667495,
"grad_norm": 0.18553833663463593,
"learning_rate": 0.001,
"loss": 2.6123,
"num_input_tokens_seen": 11232865856,
"step": 42850
},
{
"epoch": 0.20463407944476908,
"grad_norm": 0.20382866263389587,
"learning_rate": 0.001,
"loss": 2.6163,
"num_input_tokens_seen": 11245973056,
"step": 42900
},
{
"epoch": 0.2048725807028632,
"grad_norm": 0.18923860788345337,
"learning_rate": 0.001,
"loss": 2.5981,
"num_input_tokens_seen": 11259080256,
"step": 42950
},
{
"epoch": 0.20511108196095734,
"grad_norm": 0.19230851531028748,
"learning_rate": 0.001,
"loss": 2.618,
"num_input_tokens_seen": 11272187456,
"step": 43000
},
{
"epoch": 0.20511108196095734,
"eval_loss": 2.5047237873077393,
"eval_runtime": 51.2959,
"eval_samples_per_second": 97.474,
"eval_steps_per_second": 24.368,
"num_input_tokens_seen": 11272187456,
"step": 43000
},
{
"epoch": 0.20534958321905147,
"grad_norm": 0.22746357321739197,
"learning_rate": 0.001,
"loss": 2.6281,
"num_input_tokens_seen": 11285294656,
"step": 43050
},
{
"epoch": 0.2055880844771456,
"grad_norm": 0.21107150614261627,
"learning_rate": 0.001,
"loss": 2.6154,
"num_input_tokens_seen": 11298401856,
"step": 43100
},
{
"epoch": 0.20582658573523976,
"grad_norm": 0.18025045096874237,
"learning_rate": 0.001,
"loss": 2.6141,
"num_input_tokens_seen": 11311509056,
"step": 43150
},
{
"epoch": 0.2060650869933339,
"grad_norm": 0.2009642869234085,
"learning_rate": 0.001,
"loss": 2.6133,
"num_input_tokens_seen": 11324616256,
"step": 43200
},
{
"epoch": 0.20630358825142803,
"grad_norm": 0.1872788518667221,
"learning_rate": 0.001,
"loss": 2.6197,
"num_input_tokens_seen": 11337723456,
"step": 43250
},
{
"epoch": 0.20654208950952216,
"grad_norm": 0.216310054063797,
"learning_rate": 0.001,
"loss": 2.6353,
"num_input_tokens_seen": 11350830656,
"step": 43300
},
{
"epoch": 0.2067805907676163,
"grad_norm": 0.2705513536930084,
"learning_rate": 0.001,
"loss": 2.6333,
"num_input_tokens_seen": 11363937856,
"step": 43350
},
{
"epoch": 0.20701909202571045,
"grad_norm": 0.3040550649166107,
"learning_rate": 0.001,
"loss": 2.6094,
"num_input_tokens_seen": 11377045056,
"step": 43400
},
{
"epoch": 0.20725759328380458,
"grad_norm": 0.2075599879026413,
"learning_rate": 0.001,
"loss": 2.6225,
"num_input_tokens_seen": 11390152256,
"step": 43450
},
{
"epoch": 0.2074960945418987,
"grad_norm": 0.22293590009212494,
"learning_rate": 0.001,
"loss": 2.6271,
"num_input_tokens_seen": 11403259456,
"step": 43500
},
{
"epoch": 0.2074960945418987,
"eval_loss": 2.5097975730895996,
"eval_runtime": 51.7037,
"eval_samples_per_second": 96.705,
"eval_steps_per_second": 24.176,
"num_input_tokens_seen": 11403259456,
"step": 43500
},
{
"epoch": 0.20773459579999284,
"grad_norm": 0.21221335232257843,
"learning_rate": 0.001,
"loss": 2.618,
"num_input_tokens_seen": 11416366656,
"step": 43550
},
{
"epoch": 0.20797309705808698,
"grad_norm": 0.19894948601722717,
"learning_rate": 0.001,
"loss": 2.6305,
"num_input_tokens_seen": 11429473856,
"step": 43600
},
{
"epoch": 0.2082115983161811,
"grad_norm": 0.29371336102485657,
"learning_rate": 0.001,
"loss": 2.6211,
"num_input_tokens_seen": 11442581056,
"step": 43650
},
{
"epoch": 0.20845009957427527,
"grad_norm": 0.19441936910152435,
"learning_rate": 0.001,
"loss": 2.6355,
"num_input_tokens_seen": 11455688256,
"step": 43700
},
{
"epoch": 0.2086886008323694,
"grad_norm": 0.19868114590644836,
"learning_rate": 0.001,
"loss": 2.6206,
"num_input_tokens_seen": 11468795456,
"step": 43750
},
{
"epoch": 0.20892710209046353,
"grad_norm": 0.19971340894699097,
"learning_rate": 0.001,
"loss": 2.6124,
"num_input_tokens_seen": 11481902656,
"step": 43800
},
{
"epoch": 0.20916560334855766,
"grad_norm": 0.22261051833629608,
"learning_rate": 0.001,
"loss": 2.623,
"num_input_tokens_seen": 11495009856,
"step": 43850
},
{
"epoch": 0.2094041046066518,
"grad_norm": 0.20982281863689423,
"learning_rate": 0.001,
"loss": 2.6182,
"num_input_tokens_seen": 11508117056,
"step": 43900
},
{
"epoch": 0.20964260586474592,
"grad_norm": 0.2216535359621048,
"learning_rate": 0.001,
"loss": 2.6086,
"num_input_tokens_seen": 11521224256,
"step": 43950
},
{
"epoch": 0.20988110712284008,
"grad_norm": 0.19298988580703735,
"learning_rate": 0.001,
"loss": 2.6364,
"num_input_tokens_seen": 11534331456,
"step": 44000
},
{
"epoch": 0.20988110712284008,
"eval_loss": 2.5009121894836426,
"eval_runtime": 51.4356,
"eval_samples_per_second": 97.209,
"eval_steps_per_second": 24.302,
"num_input_tokens_seen": 11534331456,
"step": 44000
},
{
"epoch": 0.21011960838093421,
"grad_norm": 0.19737008213996887,
"learning_rate": 0.001,
"loss": 2.6272,
"num_input_tokens_seen": 11547438656,
"step": 44050
},
{
"epoch": 0.21035810963902835,
"grad_norm": 0.1984977424144745,
"learning_rate": 0.001,
"loss": 2.6417,
"num_input_tokens_seen": 11560545856,
"step": 44100
},
{
"epoch": 0.21059661089712248,
"grad_norm": 0.19575904309749603,
"learning_rate": 0.001,
"loss": 2.6277,
"num_input_tokens_seen": 11573653056,
"step": 44150
},
{
"epoch": 0.2108351121552166,
"grad_norm": 0.19875651597976685,
"learning_rate": 0.001,
"loss": 2.6362,
"num_input_tokens_seen": 11586760256,
"step": 44200
},
{
"epoch": 0.21107361341331077,
"grad_norm": 0.20936185121536255,
"learning_rate": 0.001,
"loss": 2.6217,
"num_input_tokens_seen": 11599867456,
"step": 44250
},
{
"epoch": 0.2113121146714049,
"grad_norm": 0.19474463164806366,
"learning_rate": 0.001,
"loss": 2.6235,
"num_input_tokens_seen": 11612974656,
"step": 44300
},
{
"epoch": 0.21155061592949903,
"grad_norm": 0.20833207666873932,
"learning_rate": 0.001,
"loss": 2.6,
"num_input_tokens_seen": 11626081856,
"step": 44350
},
{
"epoch": 0.21178911718759316,
"grad_norm": 0.19269512593746185,
"learning_rate": 0.001,
"loss": 2.6211,
"num_input_tokens_seen": 11639189056,
"step": 44400
},
{
"epoch": 0.2120276184456873,
"grad_norm": 0.21018226444721222,
"learning_rate": 0.001,
"loss": 2.6294,
"num_input_tokens_seen": 11652296256,
"step": 44450
},
{
"epoch": 0.21226611970378143,
"grad_norm": 0.19836543500423431,
"learning_rate": 0.001,
"loss": 2.6051,
"num_input_tokens_seen": 11665403456,
"step": 44500
},
{
"epoch": 0.21226611970378143,
"eval_loss": 2.499817132949829,
"eval_runtime": 50.9003,
"eval_samples_per_second": 98.231,
"eval_steps_per_second": 24.558,
"num_input_tokens_seen": 11665403456,
"step": 44500
},
{
"epoch": 0.21250462096187558,
"grad_norm": 0.18411967158317566,
"learning_rate": 0.001,
"loss": 2.6228,
"num_input_tokens_seen": 11678510656,
"step": 44550
},
{
"epoch": 0.21274312221996972,
"grad_norm": 0.19387467205524445,
"learning_rate": 0.001,
"loss": 2.5902,
"num_input_tokens_seen": 11691617856,
"step": 44600
},
{
"epoch": 0.21298162347806385,
"grad_norm": 0.22076952457427979,
"learning_rate": 0.001,
"loss": 2.613,
"num_input_tokens_seen": 11704725056,
"step": 44650
},
{
"epoch": 0.21322012473615798,
"grad_norm": 0.33861082792282104,
"learning_rate": 0.001,
"loss": 2.6142,
"num_input_tokens_seen": 11717832256,
"step": 44700
},
{
"epoch": 0.2134586259942521,
"grad_norm": 0.20097902417182922,
"learning_rate": 0.001,
"loss": 2.6549,
"num_input_tokens_seen": 11730939456,
"step": 44750
},
{
"epoch": 0.21369712725234627,
"grad_norm": 0.24534635245800018,
"learning_rate": 0.001,
"loss": 2.6293,
"num_input_tokens_seen": 11744046656,
"step": 44800
},
{
"epoch": 0.2139356285104404,
"grad_norm": 0.2439020723104477,
"learning_rate": 0.001,
"loss": 2.635,
"num_input_tokens_seen": 11757153856,
"step": 44850
},
{
"epoch": 0.21417412976853453,
"grad_norm": 0.24259154498577118,
"learning_rate": 0.001,
"loss": 2.6232,
"num_input_tokens_seen": 11770261056,
"step": 44900
},
{
"epoch": 0.21441263102662866,
"grad_norm": 0.23554636538028717,
"learning_rate": 0.001,
"loss": 2.6061,
"num_input_tokens_seen": 11783368256,
"step": 44950
},
{
"epoch": 0.2146511322847228,
"grad_norm": 0.20377275347709656,
"learning_rate": 0.001,
"loss": 2.6156,
"num_input_tokens_seen": 11796475456,
"step": 45000
},
{
"epoch": 0.2146511322847228,
"eval_loss": 2.503781318664551,
"eval_runtime": 51.1656,
"eval_samples_per_second": 97.722,
"eval_steps_per_second": 24.43,
"num_input_tokens_seen": 11796475456,
"step": 45000
},
{
"epoch": 0.21488963354281693,
"grad_norm": 0.226406991481781,
"learning_rate": 0.001,
"loss": 2.626,
"num_input_tokens_seen": 11809582656,
"step": 45050
},
{
"epoch": 0.21512813480091109,
"grad_norm": 0.20505741238594055,
"learning_rate": 0.001,
"loss": 2.6095,
"num_input_tokens_seen": 11822689856,
"step": 45100
},
{
"epoch": 0.21536663605900522,
"grad_norm": 0.2917146682739258,
"learning_rate": 0.001,
"loss": 2.6439,
"num_input_tokens_seen": 11835797056,
"step": 45150
},
{
"epoch": 0.21560513731709935,
"grad_norm": 0.24030283093452454,
"learning_rate": 0.001,
"loss": 2.6386,
"num_input_tokens_seen": 11848904256,
"step": 45200
},
{
"epoch": 0.21584363857519348,
"grad_norm": 0.1799454241991043,
"learning_rate": 0.001,
"loss": 2.6344,
"num_input_tokens_seen": 11862011456,
"step": 45250
},
{
"epoch": 0.2160821398332876,
"grad_norm": 0.2093718945980072,
"learning_rate": 0.001,
"loss": 2.6152,
"num_input_tokens_seen": 11875118656,
"step": 45300
},
{
"epoch": 0.21632064109138174,
"grad_norm": 0.19477079808712006,
"learning_rate": 0.001,
"loss": 2.622,
"num_input_tokens_seen": 11888225856,
"step": 45350
},
{
"epoch": 0.2165591423494759,
"grad_norm": 0.2764741778373718,
"learning_rate": 0.001,
"loss": 2.5951,
"num_input_tokens_seen": 11901333056,
"step": 45400
},
{
"epoch": 0.21679764360757003,
"grad_norm": 0.2127208709716797,
"learning_rate": 0.001,
"loss": 2.6231,
"num_input_tokens_seen": 11914440256,
"step": 45450
},
{
"epoch": 0.21703614486566417,
"grad_norm": 0.21089383959770203,
"learning_rate": 0.001,
"loss": 2.6099,
"num_input_tokens_seen": 11927547456,
"step": 45500
},
{
"epoch": 0.21703614486566417,
"eval_loss": 2.502464771270752,
"eval_runtime": 50.946,
"eval_samples_per_second": 98.143,
"eval_steps_per_second": 24.536,
"num_input_tokens_seen": 11927547456,
"step": 45500
},
{
"epoch": 0.2172746461237583,
"grad_norm": 0.19550016522407532,
"learning_rate": 0.001,
"loss": 2.6365,
"num_input_tokens_seen": 11940654656,
"step": 45550
},
{
"epoch": 0.21751314738185243,
"grad_norm": 0.18284358084201813,
"learning_rate": 0.001,
"loss": 2.6358,
"num_input_tokens_seen": 11953761856,
"step": 45600
},
{
"epoch": 0.2177516486399466,
"grad_norm": 0.21821847558021545,
"learning_rate": 0.001,
"loss": 2.607,
"num_input_tokens_seen": 11966869056,
"step": 45650
},
{
"epoch": 0.21799014989804072,
"grad_norm": 0.2195073515176773,
"learning_rate": 0.001,
"loss": 2.6195,
"num_input_tokens_seen": 11979976256,
"step": 45700
},
{
"epoch": 0.21822865115613485,
"grad_norm": 0.19679750502109528,
"learning_rate": 0.001,
"loss": 2.6259,
"num_input_tokens_seen": 11993083456,
"step": 45750
},
{
"epoch": 0.21846715241422898,
"grad_norm": 0.1985604166984558,
"learning_rate": 0.001,
"loss": 2.6224,
"num_input_tokens_seen": 12006190656,
"step": 45800
},
{
"epoch": 0.2187056536723231,
"grad_norm": 0.18398787081241608,
"learning_rate": 0.001,
"loss": 2.6215,
"num_input_tokens_seen": 12019297856,
"step": 45850
},
{
"epoch": 0.21894415493041725,
"grad_norm": 0.2306145578622818,
"learning_rate": 0.001,
"loss": 2.6346,
"num_input_tokens_seen": 12032405056,
"step": 45900
},
{
"epoch": 0.2191826561885114,
"grad_norm": 0.21335257589817047,
"learning_rate": 0.001,
"loss": 2.6232,
"num_input_tokens_seen": 12045512256,
"step": 45950
},
{
"epoch": 0.21942115744660554,
"grad_norm": 0.22988814115524292,
"learning_rate": 0.001,
"loss": 2.6132,
"num_input_tokens_seen": 12058619456,
"step": 46000
},
{
"epoch": 0.21942115744660554,
"eval_loss": 2.499041795730591,
"eval_runtime": 50.6868,
"eval_samples_per_second": 98.645,
"eval_steps_per_second": 24.661,
"num_input_tokens_seen": 12058619456,
"step": 46000
},
{
"epoch": 0.21965965870469967,
"grad_norm": 0.19492709636688232,
"learning_rate": 0.001,
"loss": 2.6196,
"num_input_tokens_seen": 12071726656,
"step": 46050
},
{
"epoch": 0.2198981599627938,
"grad_norm": 0.19643568992614746,
"learning_rate": 0.001,
"loss": 2.6108,
"num_input_tokens_seen": 12084833856,
"step": 46100
},
{
"epoch": 0.22013666122088793,
"grad_norm": 0.18720099329948425,
"learning_rate": 0.001,
"loss": 2.6181,
"num_input_tokens_seen": 12097941056,
"step": 46150
},
{
"epoch": 0.2203751624789821,
"grad_norm": 0.1929876208305359,
"learning_rate": 0.001,
"loss": 2.6152,
"num_input_tokens_seen": 12111048256,
"step": 46200
},
{
"epoch": 0.22061366373707622,
"grad_norm": 0.19732603430747986,
"learning_rate": 0.001,
"loss": 2.6267,
"num_input_tokens_seen": 12124155456,
"step": 46250
},
{
"epoch": 0.22085216499517035,
"grad_norm": 0.1964132934808731,
"learning_rate": 0.001,
"loss": 2.605,
"num_input_tokens_seen": 12137262656,
"step": 46300
},
{
"epoch": 0.22109066625326448,
"grad_norm": 0.1927288919687271,
"learning_rate": 0.001,
"loss": 2.6178,
"num_input_tokens_seen": 12150369856,
"step": 46350
},
{
"epoch": 0.22132916751135862,
"grad_norm": 0.17873398959636688,
"learning_rate": 0.001,
"loss": 2.6033,
"num_input_tokens_seen": 12163477056,
"step": 46400
},
{
"epoch": 0.22156766876945275,
"grad_norm": 0.24716190993785858,
"learning_rate": 0.001,
"loss": 2.6141,
"num_input_tokens_seen": 12176584256,
"step": 46450
},
{
"epoch": 0.2218061700275469,
"grad_norm": 0.2021339386701584,
"learning_rate": 0.001,
"loss": 2.6259,
"num_input_tokens_seen": 12189691456,
"step": 46500
},
{
"epoch": 0.2218061700275469,
"eval_loss": 2.4975087642669678,
"eval_runtime": 50.8921,
"eval_samples_per_second": 98.247,
"eval_steps_per_second": 24.562,
"num_input_tokens_seen": 12189691456,
"step": 46500
},
{
"epoch": 0.22204467128564104,
"grad_norm": 0.20796166360378265,
"learning_rate": 0.001,
"loss": 2.6211,
"num_input_tokens_seen": 12202798656,
"step": 46550
},
{
"epoch": 0.22228317254373517,
"grad_norm": 0.20472556352615356,
"learning_rate": 0.001,
"loss": 2.6123,
"num_input_tokens_seen": 12215905856,
"step": 46600
},
{
"epoch": 0.2225216738018293,
"grad_norm": 0.20017485320568085,
"learning_rate": 0.001,
"loss": 2.6037,
"num_input_tokens_seen": 12229013056,
"step": 46650
},
{
"epoch": 0.22276017505992343,
"grad_norm": 0.2037762850522995,
"learning_rate": 0.001,
"loss": 2.6155,
"num_input_tokens_seen": 12242120256,
"step": 46700
},
{
"epoch": 0.2229986763180176,
"grad_norm": 0.19346804916858673,
"learning_rate": 0.001,
"loss": 2.601,
"num_input_tokens_seen": 12255227456,
"step": 46750
},
{
"epoch": 0.22323717757611172,
"grad_norm": 0.18640096485614777,
"learning_rate": 0.001,
"loss": 2.6168,
"num_input_tokens_seen": 12268334656,
"step": 46800
},
{
"epoch": 0.22347567883420585,
"grad_norm": 0.20295055210590363,
"learning_rate": 0.001,
"loss": 2.6221,
"num_input_tokens_seen": 12281441856,
"step": 46850
},
{
"epoch": 0.22371418009229999,
"grad_norm": 0.20705671608448029,
"learning_rate": 0.001,
"loss": 2.6202,
"num_input_tokens_seen": 12294549056,
"step": 46900
},
{
"epoch": 0.22395268135039412,
"grad_norm": 0.18724282085895538,
"learning_rate": 0.001,
"loss": 2.6061,
"num_input_tokens_seen": 12307656256,
"step": 46950
},
{
"epoch": 0.22419118260848825,
"grad_norm": 0.18210910260677338,
"learning_rate": 0.001,
"loss": 2.6045,
"num_input_tokens_seen": 12320763456,
"step": 47000
},
{
"epoch": 0.22419118260848825,
"eval_loss": 2.497344493865967,
"eval_runtime": 51.17,
"eval_samples_per_second": 97.713,
"eval_steps_per_second": 24.428,
"num_input_tokens_seen": 12320763456,
"step": 47000
},
{
"epoch": 0.2244296838665824,
"grad_norm": 0.18894509971141815,
"learning_rate": 0.001,
"loss": 2.6069,
"num_input_tokens_seen": 12333870656,
"step": 47050
},
{
"epoch": 0.22466818512467654,
"grad_norm": 0.23441652953624725,
"learning_rate": 0.001,
"loss": 2.6092,
"num_input_tokens_seen": 12346977856,
"step": 47100
},
{
"epoch": 0.22490668638277067,
"grad_norm": 0.20195326209068298,
"learning_rate": 0.001,
"loss": 2.6135,
"num_input_tokens_seen": 12360085056,
"step": 47150
},
{
"epoch": 0.2251451876408648,
"grad_norm": 0.22025838494300842,
"learning_rate": 0.001,
"loss": 2.6034,
"num_input_tokens_seen": 12373192256,
"step": 47200
},
{
"epoch": 0.22538368889895893,
"grad_norm": 0.19111979007720947,
"learning_rate": 0.001,
"loss": 2.6151,
"num_input_tokens_seen": 12386299456,
"step": 47250
},
{
"epoch": 0.22562219015705307,
"grad_norm": 0.2010103464126587,
"learning_rate": 0.001,
"loss": 2.6031,
"num_input_tokens_seen": 12399406656,
"step": 47300
},
{
"epoch": 0.22586069141514722,
"grad_norm": 0.21569807827472687,
"learning_rate": 0.001,
"loss": 2.6012,
"num_input_tokens_seen": 12412513856,
"step": 47350
},
{
"epoch": 0.22609919267324136,
"grad_norm": 0.18600653111934662,
"learning_rate": 0.001,
"loss": 2.6087,
"num_input_tokens_seen": 12425621056,
"step": 47400
},
{
"epoch": 0.2263376939313355,
"grad_norm": 0.19476164877414703,
"learning_rate": 0.001,
"loss": 2.6179,
"num_input_tokens_seen": 12438728256,
"step": 47450
},
{
"epoch": 0.22657619518942962,
"grad_norm": 0.19705821573734283,
"learning_rate": 0.001,
"loss": 2.5983,
"num_input_tokens_seen": 12451835456,
"step": 47500
},
{
"epoch": 0.22657619518942962,
"eval_loss": 2.495936393737793,
"eval_runtime": 51.8116,
"eval_samples_per_second": 96.504,
"eval_steps_per_second": 24.126,
"num_input_tokens_seen": 12451835456,
"step": 47500
},
{
"epoch": 0.22681469644752375,
"grad_norm": 0.23161695897579193,
"learning_rate": 0.001,
"loss": 2.5974,
"num_input_tokens_seen": 12464942656,
"step": 47550
},
{
"epoch": 0.2270531977056179,
"grad_norm": 0.2022540420293808,
"learning_rate": 0.001,
"loss": 2.6251,
"num_input_tokens_seen": 12478049856,
"step": 47600
},
{
"epoch": 0.22729169896371204,
"grad_norm": 1.0341856479644775,
"learning_rate": 0.001,
"loss": 2.5831,
"num_input_tokens_seen": 12491157056,
"step": 47650
},
{
"epoch": 0.22753020022180617,
"grad_norm": 0.3812394440174103,
"learning_rate": 0.001,
"loss": 2.6407,
"num_input_tokens_seen": 12504264256,
"step": 47700
},
{
"epoch": 0.2277687014799003,
"grad_norm": 0.27030590176582336,
"learning_rate": 0.001,
"loss": 2.6327,
"num_input_tokens_seen": 12517371456,
"step": 47750
},
{
"epoch": 0.22800720273799444,
"grad_norm": 1.3918724060058594,
"learning_rate": 0.001,
"loss": 2.6344,
"num_input_tokens_seen": 12530478656,
"step": 47800
},
{
"epoch": 0.22824570399608857,
"grad_norm": 0.22610582411289215,
"learning_rate": 0.001,
"loss": 2.6444,
"num_input_tokens_seen": 12543585856,
"step": 47850
},
{
"epoch": 0.22848420525418273,
"grad_norm": 0.21421480178833008,
"learning_rate": 0.001,
"loss": 2.6169,
"num_input_tokens_seen": 12556693056,
"step": 47900
},
{
"epoch": 0.22872270651227686,
"grad_norm": 0.20389467477798462,
"learning_rate": 0.001,
"loss": 2.6158,
"num_input_tokens_seen": 12569800256,
"step": 47950
},
{
"epoch": 0.228961207770371,
"grad_norm": 0.2265746295452118,
"learning_rate": 0.001,
"loss": 2.6101,
"num_input_tokens_seen": 12582907456,
"step": 48000
},
{
"epoch": 0.228961207770371,
"eval_loss": 2.4971351623535156,
"eval_runtime": 54.0453,
"eval_samples_per_second": 92.515,
"eval_steps_per_second": 23.129,
"num_input_tokens_seen": 12582907456,
"step": 48000
},
{
"epoch": 0.22919970902846512,
"grad_norm": 0.20247948169708252,
"learning_rate": 0.001,
"loss": 2.6122,
"num_input_tokens_seen": 12596014656,
"step": 48050
},
{
"epoch": 0.22943821028655925,
"grad_norm": 0.20237554609775543,
"learning_rate": 0.001,
"loss": 2.6235,
"num_input_tokens_seen": 12609121856,
"step": 48100
},
{
"epoch": 0.2296767115446534,
"grad_norm": 0.19862660765647888,
"learning_rate": 0.001,
"loss": 2.6264,
"num_input_tokens_seen": 12622229056,
"step": 48150
},
{
"epoch": 0.22991521280274754,
"grad_norm": 0.20839153230190277,
"learning_rate": 0.001,
"loss": 2.5915,
"num_input_tokens_seen": 12635336256,
"step": 48200
},
{
"epoch": 0.23015371406084167,
"grad_norm": 0.19385166466236115,
"learning_rate": 0.001,
"loss": 2.5979,
"num_input_tokens_seen": 12648443456,
"step": 48250
},
{
"epoch": 0.2303922153189358,
"grad_norm": 0.197597935795784,
"learning_rate": 0.001,
"loss": 2.6093,
"num_input_tokens_seen": 12661550656,
"step": 48300
},
{
"epoch": 0.23063071657702994,
"grad_norm": 0.20289985835552216,
"learning_rate": 0.001,
"loss": 2.6039,
"num_input_tokens_seen": 12674657856,
"step": 48350
},
{
"epoch": 0.23086921783512407,
"grad_norm": 0.1986515372991562,
"learning_rate": 0.001,
"loss": 2.6048,
"num_input_tokens_seen": 12687765056,
"step": 48400
},
{
"epoch": 0.23110771909321823,
"grad_norm": 0.19720982015132904,
"learning_rate": 0.001,
"loss": 2.6171,
"num_input_tokens_seen": 12700872256,
"step": 48450
},
{
"epoch": 0.23134622035131236,
"grad_norm": 0.24635523557662964,
"learning_rate": 0.001,
"loss": 2.6242,
"num_input_tokens_seen": 12713979456,
"step": 48500
},
{
"epoch": 0.23134622035131236,
"eval_loss": 2.495468854904175,
"eval_runtime": 53.4259,
"eval_samples_per_second": 93.588,
"eval_steps_per_second": 23.397,
"num_input_tokens_seen": 12713979456,
"step": 48500
},
{
"epoch": 0.2315847216094065,
"grad_norm": 0.5883195996284485,
"learning_rate": 0.001,
"loss": 2.6399,
"num_input_tokens_seen": 12727086656,
"step": 48550
},
{
"epoch": 0.23182322286750062,
"grad_norm": 0.20890024304389954,
"learning_rate": 0.001,
"loss": 2.6325,
"num_input_tokens_seen": 12740193856,
"step": 48600
},
{
"epoch": 0.23206172412559475,
"grad_norm": 0.21251678466796875,
"learning_rate": 0.001,
"loss": 2.6233,
"num_input_tokens_seen": 12753301056,
"step": 48650
},
{
"epoch": 0.23230022538368889,
"grad_norm": 0.20996986329555511,
"learning_rate": 0.001,
"loss": 2.6174,
"num_input_tokens_seen": 12766408256,
"step": 48700
},
{
"epoch": 0.23253872664178304,
"grad_norm": 0.23039382696151733,
"learning_rate": 0.001,
"loss": 2.6305,
"num_input_tokens_seen": 12779515456,
"step": 48750
},
{
"epoch": 0.23277722789987718,
"grad_norm": 0.23922136425971985,
"learning_rate": 0.001,
"loss": 2.6108,
"num_input_tokens_seen": 12792622656,
"step": 48800
},
{
"epoch": 0.2330157291579713,
"grad_norm": 0.22746366262435913,
"learning_rate": 0.001,
"loss": 2.6219,
"num_input_tokens_seen": 12805729856,
"step": 48850
},
{
"epoch": 0.23325423041606544,
"grad_norm": 0.22131897509098053,
"learning_rate": 0.001,
"loss": 2.6205,
"num_input_tokens_seen": 12818837056,
"step": 48900
},
{
"epoch": 0.23349273167415957,
"grad_norm": 0.25431814789772034,
"learning_rate": 0.001,
"loss": 2.6252,
"num_input_tokens_seen": 12831944256,
"step": 48950
},
{
"epoch": 0.23373123293225373,
"grad_norm": 0.2622738778591156,
"learning_rate": 0.001,
"loss": 2.6288,
"num_input_tokens_seen": 12845051456,
"step": 49000
},
{
"epoch": 0.23373123293225373,
"eval_loss": 2.498055934906006,
"eval_runtime": 53.8861,
"eval_samples_per_second": 92.788,
"eval_steps_per_second": 23.197,
"num_input_tokens_seen": 12845051456,
"step": 49000
},
{
"epoch": 0.23396973419034786,
"grad_norm": 0.209337517619133,
"learning_rate": 0.001,
"loss": 2.6348,
"num_input_tokens_seen": 12858158656,
"step": 49050
},
{
"epoch": 0.234208235448442,
"grad_norm": 0.1974038928747177,
"learning_rate": 0.001,
"loss": 2.6158,
"num_input_tokens_seen": 12871265856,
"step": 49100
},
{
"epoch": 0.23444673670653612,
"grad_norm": 0.28099164366722107,
"learning_rate": 0.001,
"loss": 2.6101,
"num_input_tokens_seen": 12884373056,
"step": 49150
},
{
"epoch": 0.23468523796463026,
"grad_norm": 0.2172873318195343,
"learning_rate": 0.001,
"loss": 2.596,
"num_input_tokens_seen": 12897480256,
"step": 49200
},
{
"epoch": 0.2349237392227244,
"grad_norm": 0.2120896875858307,
"learning_rate": 0.001,
"loss": 2.5994,
"num_input_tokens_seen": 12910587456,
"step": 49250
},
{
"epoch": 0.23516224048081855,
"grad_norm": 0.20109935104846954,
"learning_rate": 0.001,
"loss": 2.6101,
"num_input_tokens_seen": 12923694656,
"step": 49300
},
{
"epoch": 0.23540074173891268,
"grad_norm": 0.20735585689544678,
"learning_rate": 0.001,
"loss": 2.6142,
"num_input_tokens_seen": 12936801856,
"step": 49350
},
{
"epoch": 0.2356392429970068,
"grad_norm": 0.21295137703418732,
"learning_rate": 0.001,
"loss": 2.6226,
"num_input_tokens_seen": 12949909056,
"step": 49400
},
{
"epoch": 0.23587774425510094,
"grad_norm": 0.20560845732688904,
"learning_rate": 0.001,
"loss": 2.6027,
"num_input_tokens_seen": 12963016256,
"step": 49450
},
{
"epoch": 0.23611624551319507,
"grad_norm": 0.33747321367263794,
"learning_rate": 0.001,
"loss": 2.6231,
"num_input_tokens_seen": 12976123456,
"step": 49500
},
{
"epoch": 0.23611624551319507,
"eval_loss": 2.5008058547973633,
"eval_runtime": 54.2104,
"eval_samples_per_second": 92.233,
"eval_steps_per_second": 23.058,
"num_input_tokens_seen": 12976123456,
"step": 49500
},
{
"epoch": 0.23635474677128923,
"grad_norm": 0.24593485891819,
"learning_rate": 0.001,
"loss": 2.6336,
"num_input_tokens_seen": 12989230656,
"step": 49550
},
{
"epoch": 0.23659324802938336,
"grad_norm": 0.25253933668136597,
"learning_rate": 0.001,
"loss": 2.643,
"num_input_tokens_seen": 13002337856,
"step": 49600
},
{
"epoch": 0.2368317492874775,
"grad_norm": 0.24231670796871185,
"learning_rate": 0.001,
"loss": 2.6074,
"num_input_tokens_seen": 13015445056,
"step": 49650
},
{
"epoch": 0.23707025054557163,
"grad_norm": 0.2178962677717209,
"learning_rate": 0.001,
"loss": 2.6184,
"num_input_tokens_seen": 13028552256,
"step": 49700
},
{
"epoch": 0.23730875180366576,
"grad_norm": 0.2651260793209076,
"learning_rate": 0.001,
"loss": 2.6335,
"num_input_tokens_seen": 13041659456,
"step": 49750
},
{
"epoch": 0.2375472530617599,
"grad_norm": 0.1909639537334442,
"learning_rate": 0.001,
"loss": 2.61,
"num_input_tokens_seen": 13054766656,
"step": 49800
},
{
"epoch": 0.23778575431985405,
"grad_norm": 0.21107855439186096,
"learning_rate": 0.001,
"loss": 2.6333,
"num_input_tokens_seen": 13067873856,
"step": 49850
},
{
"epoch": 0.23802425557794818,
"grad_norm": 0.19366736710071564,
"learning_rate": 0.001,
"loss": 2.6068,
"num_input_tokens_seen": 13080981056,
"step": 49900
},
{
"epoch": 0.2382627568360423,
"grad_norm": 0.2851523458957672,
"learning_rate": 0.001,
"loss": 2.6183,
"num_input_tokens_seen": 13094088256,
"step": 49950
},
{
"epoch": 0.23850125809413644,
"grad_norm": 0.23617912828922272,
"learning_rate": 0.001,
"loss": 2.617,
"num_input_tokens_seen": 13107195456,
"step": 50000
},
{
"epoch": 0.23850125809413644,
"eval_loss": 2.497406005859375,
"eval_runtime": 53.6538,
"eval_samples_per_second": 93.19,
"eval_steps_per_second": 23.298,
"num_input_tokens_seen": 13107195456,
"step": 50000
},
{
"epoch": 0.23873975935223057,
"grad_norm": 0.5069316029548645,
"learning_rate": 0.001,
"loss": 2.6591,
"num_input_tokens_seen": 13120302656,
"step": 50050
},
{
"epoch": 0.23897826061032473,
"grad_norm": 0.21306034922599792,
"learning_rate": 0.001,
"loss": 2.6455,
"num_input_tokens_seen": 13133409856,
"step": 50100
},
{
"epoch": 0.23921676186841886,
"grad_norm": 0.2045888900756836,
"learning_rate": 0.001,
"loss": 2.6227,
"num_input_tokens_seen": 13146517056,
"step": 50150
},
{
"epoch": 0.239455263126513,
"grad_norm": 0.2335623949766159,
"learning_rate": 0.001,
"loss": 2.6097,
"num_input_tokens_seen": 13159624256,
"step": 50200
},
{
"epoch": 0.23969376438460713,
"grad_norm": 0.19884036481380463,
"learning_rate": 0.001,
"loss": 2.6189,
"num_input_tokens_seen": 13172731456,
"step": 50250
},
{
"epoch": 0.23993226564270126,
"grad_norm": 0.21080589294433594,
"learning_rate": 0.001,
"loss": 2.6057,
"num_input_tokens_seen": 13185838656,
"step": 50300
},
{
"epoch": 0.2401707669007954,
"grad_norm": 0.21613669395446777,
"learning_rate": 0.001,
"loss": 2.6045,
"num_input_tokens_seen": 13198945856,
"step": 50350
},
{
"epoch": 0.24040926815888955,
"grad_norm": 0.2029023915529251,
"learning_rate": 0.001,
"loss": 2.6127,
"num_input_tokens_seen": 13212053056,
"step": 50400
},
{
"epoch": 0.24064776941698368,
"grad_norm": 0.2275777906179428,
"learning_rate": 0.001,
"loss": 2.6149,
"num_input_tokens_seen": 13225160256,
"step": 50450
},
{
"epoch": 0.2408862706750778,
"grad_norm": 0.3332397937774658,
"learning_rate": 0.001,
"loss": 2.6013,
"num_input_tokens_seen": 13238267456,
"step": 50500
},
{
"epoch": 0.2408862706750778,
"eval_loss": 2.5022270679473877,
"eval_runtime": 53.5942,
"eval_samples_per_second": 93.294,
"eval_steps_per_second": 23.323,
"num_input_tokens_seen": 13238267456,
"step": 50500
},
{
"epoch": 0.24112477193317194,
"grad_norm": 0.2197851538658142,
"learning_rate": 0.001,
"loss": 2.6326,
"num_input_tokens_seen": 13251374656,
"step": 50550
},
{
"epoch": 0.24136327319126608,
"grad_norm": 0.2201780080795288,
"learning_rate": 0.001,
"loss": 2.6265,
"num_input_tokens_seen": 13264481856,
"step": 50600
},
{
"epoch": 0.2416017744493602,
"grad_norm": 0.2196362316608429,
"learning_rate": 0.001,
"loss": 2.6272,
"num_input_tokens_seen": 13277589056,
"step": 50650
},
{
"epoch": 0.24184027570745437,
"grad_norm": 0.2234160453081131,
"learning_rate": 0.001,
"loss": 2.6178,
"num_input_tokens_seen": 13290696256,
"step": 50700
},
{
"epoch": 0.2420787769655485,
"grad_norm": 0.24019016325473785,
"learning_rate": 0.001,
"loss": 2.6142,
"num_input_tokens_seen": 13303803456,
"step": 50750
},
{
"epoch": 0.24231727822364263,
"grad_norm": 0.21481236815452576,
"learning_rate": 0.001,
"loss": 2.6149,
"num_input_tokens_seen": 13316910656,
"step": 50800
},
{
"epoch": 0.24255577948173676,
"grad_norm": 0.20477178692817688,
"learning_rate": 0.001,
"loss": 2.5977,
"num_input_tokens_seen": 13330017856,
"step": 50850
},
{
"epoch": 0.2427942807398309,
"grad_norm": 0.20742499828338623,
"learning_rate": 0.001,
"loss": 2.6153,
"num_input_tokens_seen": 13343125056,
"step": 50900
},
{
"epoch": 0.24303278199792505,
"grad_norm": 0.21933062374591827,
"learning_rate": 0.001,
"loss": 2.5966,
"num_input_tokens_seen": 13356232256,
"step": 50950
},
{
"epoch": 0.24327128325601918,
"grad_norm": 0.3282420337200165,
"learning_rate": 0.001,
"loss": 2.6063,
"num_input_tokens_seen": 13369339456,
"step": 51000
},
{
"epoch": 0.24327128325601918,
"eval_loss": 2.4981296062469482,
"eval_runtime": 53.5536,
"eval_samples_per_second": 93.364,
"eval_steps_per_second": 23.341,
"num_input_tokens_seen": 13369339456,
"step": 51000
},
{
"epoch": 0.24350978451411331,
"grad_norm": 0.20502831041812897,
"learning_rate": 0.001,
"loss": 2.6059,
"num_input_tokens_seen": 13382446656,
"step": 51050
},
{
"epoch": 0.24374828577220745,
"grad_norm": 0.20750559866428375,
"learning_rate": 0.001,
"loss": 2.6056,
"num_input_tokens_seen": 13395553856,
"step": 51100
},
{
"epoch": 0.24398678703030158,
"grad_norm": 0.19882823526859283,
"learning_rate": 0.001,
"loss": 2.5983,
"num_input_tokens_seen": 13408661056,
"step": 51150
},
{
"epoch": 0.2442252882883957,
"grad_norm": 0.20900660753250122,
"learning_rate": 0.001,
"loss": 2.6087,
"num_input_tokens_seen": 13421768256,
"step": 51200
},
{
"epoch": 0.24446378954648987,
"grad_norm": 0.21428415179252625,
"learning_rate": 0.001,
"loss": 2.5901,
"num_input_tokens_seen": 13434875456,
"step": 51250
},
{
"epoch": 0.244702290804584,
"grad_norm": 0.19987250864505768,
"learning_rate": 0.001,
"loss": 2.5982,
"num_input_tokens_seen": 13447982656,
"step": 51300
},
{
"epoch": 0.24494079206267813,
"grad_norm": 0.2045862078666687,
"learning_rate": 0.001,
"loss": 2.6058,
"num_input_tokens_seen": 13461089856,
"step": 51350
},
{
"epoch": 0.24517929332077226,
"grad_norm": 0.22261273860931396,
"learning_rate": 0.001,
"loss": 2.5972,
"num_input_tokens_seen": 13474197056,
"step": 51400
},
{
"epoch": 0.2454177945788664,
"grad_norm": 0.20395706593990326,
"learning_rate": 0.001,
"loss": 2.6064,
"num_input_tokens_seen": 13487304256,
"step": 51450
},
{
"epoch": 0.24565629583696055,
"grad_norm": 0.21490858495235443,
"learning_rate": 0.001,
"loss": 2.5922,
"num_input_tokens_seen": 13500411456,
"step": 51500
},
{
"epoch": 0.24565629583696055,
"eval_loss": 2.488300085067749,
"eval_runtime": 53.7972,
"eval_samples_per_second": 92.942,
"eval_steps_per_second": 23.235,
"num_input_tokens_seen": 13500411456,
"step": 51500
},
{
"epoch": 0.24589479709505468,
"grad_norm": 0.2039102464914322,
"learning_rate": 0.001,
"loss": 2.5894,
"num_input_tokens_seen": 13513518656,
"step": 51550
},
{
"epoch": 0.24613329835314882,
"grad_norm": 0.21426360309123993,
"learning_rate": 0.001,
"loss": 2.6089,
"num_input_tokens_seen": 13526625856,
"step": 51600
},
{
"epoch": 0.24637179961124295,
"grad_norm": 0.194682314991951,
"learning_rate": 0.001,
"loss": 2.5932,
"num_input_tokens_seen": 13539733056,
"step": 51650
},
{
"epoch": 0.24661030086933708,
"grad_norm": 0.1901472508907318,
"learning_rate": 0.001,
"loss": 2.6031,
"num_input_tokens_seen": 13552840256,
"step": 51700
},
{
"epoch": 0.2468488021274312,
"grad_norm": 0.20517823100090027,
"learning_rate": 0.001,
"loss": 2.5978,
"num_input_tokens_seen": 13565947456,
"step": 51750
},
{
"epoch": 0.24708730338552537,
"grad_norm": 0.23713302612304688,
"learning_rate": 0.001,
"loss": 2.6061,
"num_input_tokens_seen": 13579054656,
"step": 51800
},
{
"epoch": 0.2473258046436195,
"grad_norm": 0.2431441992521286,
"learning_rate": 0.001,
"loss": 2.6062,
"num_input_tokens_seen": 13592161856,
"step": 51850
},
{
"epoch": 0.24756430590171363,
"grad_norm": 0.20358557999134064,
"learning_rate": 0.001,
"loss": 2.6161,
"num_input_tokens_seen": 13605269056,
"step": 51900
},
{
"epoch": 0.24780280715980776,
"grad_norm": 0.21245016157627106,
"learning_rate": 0.001,
"loss": 2.6166,
"num_input_tokens_seen": 13618376256,
"step": 51950
},
{
"epoch": 0.2480413084179019,
"grad_norm": 0.24295999109745026,
"learning_rate": 0.001,
"loss": 2.6139,
"num_input_tokens_seen": 13631483456,
"step": 52000
},
{
"epoch": 0.2480413084179019,
"eval_loss": 2.4932186603546143,
"eval_runtime": 53.6797,
"eval_samples_per_second": 93.145,
"eval_steps_per_second": 23.286,
"num_input_tokens_seen": 13631483456,
"step": 52000
},
{
"epoch": 0.24827980967599603,
"grad_norm": 0.22135989367961884,
"learning_rate": 0.001,
"loss": 2.5947,
"num_input_tokens_seen": 13644590656,
"step": 52050
},
{
"epoch": 0.2485183109340902,
"grad_norm": 0.3656958341598511,
"learning_rate": 0.001,
"loss": 2.6263,
"num_input_tokens_seen": 13657697856,
"step": 52100
},
{
"epoch": 0.24875681219218432,
"grad_norm": 0.2960817813873291,
"learning_rate": 0.001,
"loss": 2.6086,
"num_input_tokens_seen": 13670805056,
"step": 52150
},
{
"epoch": 0.24899531345027845,
"grad_norm": 0.2150612622499466,
"learning_rate": 0.001,
"loss": 2.6314,
"num_input_tokens_seen": 13683912256,
"step": 52200
},
{
"epoch": 0.24923381470837258,
"grad_norm": 0.23089592158794403,
"learning_rate": 0.001,
"loss": 2.6072,
"num_input_tokens_seen": 13697019456,
"step": 52250
},
{
"epoch": 0.2494723159664667,
"grad_norm": 0.19151148200035095,
"learning_rate": 0.001,
"loss": 2.6177,
"num_input_tokens_seen": 13710126656,
"step": 52300
},
{
"epoch": 0.24971081722456087,
"grad_norm": 0.47803962230682373,
"learning_rate": 0.001,
"loss": 2.6018,
"num_input_tokens_seen": 13723233856,
"step": 52350
},
{
"epoch": 0.249949318482655,
"grad_norm": 0.2346401810646057,
"learning_rate": 0.001,
"loss": 2.6068,
"num_input_tokens_seen": 13736341056,
"step": 52400
},
{
"epoch": 0.2501878197407491,
"grad_norm": 0.21514126658439636,
"learning_rate": 0.001,
"loss": 2.6186,
"num_input_tokens_seen": 13749448256,
"step": 52450
},
{
"epoch": 0.25042632099884327,
"grad_norm": 0.20311090350151062,
"learning_rate": 0.001,
"loss": 2.595,
"num_input_tokens_seen": 13762555456,
"step": 52500
},
{
"epoch": 0.25042632099884327,
"eval_loss": 2.490104913711548,
"eval_runtime": 53.8709,
"eval_samples_per_second": 92.814,
"eval_steps_per_second": 23.204,
"num_input_tokens_seen": 13762555456,
"step": 52500
},
{
"epoch": 0.2506648222569374,
"grad_norm": 0.2120152711868286,
"learning_rate": 0.001,
"loss": 2.6027,
"num_input_tokens_seen": 13775662656,
"step": 52550
},
{
"epoch": 0.25090332351503153,
"grad_norm": 0.3172776401042938,
"learning_rate": 0.001,
"loss": 2.6089,
"num_input_tokens_seen": 13788769856,
"step": 52600
},
{
"epoch": 0.2511418247731257,
"grad_norm": 0.24425551295280457,
"learning_rate": 0.001,
"loss": 2.611,
"num_input_tokens_seen": 13801877056,
"step": 52650
},
{
"epoch": 0.2513803260312198,
"grad_norm": 0.24523352086544037,
"learning_rate": 0.001,
"loss": 2.6066,
"num_input_tokens_seen": 13814984256,
"step": 52700
},
{
"epoch": 0.25161882728931395,
"grad_norm": 0.21642154455184937,
"learning_rate": 0.001,
"loss": 2.6069,
"num_input_tokens_seen": 13828091456,
"step": 52750
},
{
"epoch": 0.2518573285474081,
"grad_norm": 0.21867206692695618,
"learning_rate": 0.001,
"loss": 2.6163,
"num_input_tokens_seen": 13841198656,
"step": 52800
},
{
"epoch": 0.2520958298055022,
"grad_norm": 0.2124466449022293,
"learning_rate": 0.001,
"loss": 2.6045,
"num_input_tokens_seen": 13854305856,
"step": 52850
},
{
"epoch": 0.2523343310635964,
"grad_norm": 0.20598042011260986,
"learning_rate": 0.001,
"loss": 2.5881,
"num_input_tokens_seen": 13867413056,
"step": 52900
},
{
"epoch": 0.2525728323216905,
"grad_norm": 0.1949404776096344,
"learning_rate": 0.001,
"loss": 2.6051,
"num_input_tokens_seen": 13880520256,
"step": 52950
},
{
"epoch": 0.25281133357978464,
"grad_norm": 0.18877142667770386,
"learning_rate": 0.001,
"loss": 2.608,
"num_input_tokens_seen": 13893627456,
"step": 53000
},
{
"epoch": 0.25281133357978464,
"eval_loss": 2.485513210296631,
"eval_runtime": 53.7202,
"eval_samples_per_second": 93.075,
"eval_steps_per_second": 23.269,
"num_input_tokens_seen": 13893627456,
"step": 53000
},
{
"epoch": 0.2530498348378788,
"grad_norm": 0.20486177504062653,
"learning_rate": 0.001,
"loss": 2.5977,
"num_input_tokens_seen": 13906734656,
"step": 53050
},
{
"epoch": 0.2532883360959729,
"grad_norm": 0.18098385632038116,
"learning_rate": 0.001,
"loss": 2.5931,
"num_input_tokens_seen": 13919841856,
"step": 53100
},
{
"epoch": 0.25352683735406706,
"grad_norm": 0.1933833658695221,
"learning_rate": 0.001,
"loss": 2.6058,
"num_input_tokens_seen": 13932949056,
"step": 53150
},
{
"epoch": 0.25376533861216116,
"grad_norm": 0.29640141129493713,
"learning_rate": 0.001,
"loss": 2.5864,
"num_input_tokens_seen": 13946056256,
"step": 53200
},
{
"epoch": 0.2540038398702553,
"grad_norm": 0.2559553384780884,
"learning_rate": 0.001,
"loss": 2.6137,
"num_input_tokens_seen": 13959163456,
"step": 53250
},
{
"epoch": 0.2542423411283494,
"grad_norm": 0.21698619425296783,
"learning_rate": 0.001,
"loss": 2.6184,
"num_input_tokens_seen": 13972270656,
"step": 53300
},
{
"epoch": 0.2544808423864436,
"grad_norm": 0.19658173620700836,
"learning_rate": 0.001,
"loss": 2.5938,
"num_input_tokens_seen": 13985377856,
"step": 53350
},
{
"epoch": 0.25471934364453774,
"grad_norm": 0.2056342512369156,
"learning_rate": 0.001,
"loss": 2.5952,
"num_input_tokens_seen": 13998485056,
"step": 53400
},
{
"epoch": 0.25495784490263185,
"grad_norm": 0.1932424008846283,
"learning_rate": 0.001,
"loss": 2.6101,
"num_input_tokens_seen": 14011592256,
"step": 53450
},
{
"epoch": 0.255196346160726,
"grad_norm": 0.19347251951694489,
"learning_rate": 0.001,
"loss": 2.5976,
"num_input_tokens_seen": 14024699456,
"step": 53500
},
{
"epoch": 0.255196346160726,
"eval_loss": 2.4863245487213135,
"eval_runtime": 53.2426,
"eval_samples_per_second": 93.91,
"eval_steps_per_second": 23.477,
"num_input_tokens_seen": 14024699456,
"step": 53500
},
{
"epoch": 0.2554348474188201,
"grad_norm": 0.1986820101737976,
"learning_rate": 0.001,
"loss": 2.6066,
"num_input_tokens_seen": 14037806656,
"step": 53550
},
{
"epoch": 0.25567334867691427,
"grad_norm": 0.21295565366744995,
"learning_rate": 0.001,
"loss": 2.6107,
"num_input_tokens_seen": 14050913856,
"step": 53600
},
{
"epoch": 0.25591184993500843,
"grad_norm": 0.21585114300251007,
"learning_rate": 0.001,
"loss": 2.6077,
"num_input_tokens_seen": 14064021056,
"step": 53650
},
{
"epoch": 0.25615035119310253,
"grad_norm": 0.19424305856227875,
"learning_rate": 0.001,
"loss": 2.5931,
"num_input_tokens_seen": 14077128256,
"step": 53700
},
{
"epoch": 0.2563888524511967,
"grad_norm": 0.20265349745750427,
"learning_rate": 0.001,
"loss": 2.5901,
"num_input_tokens_seen": 14090235456,
"step": 53750
},
{
"epoch": 0.2566273537092908,
"grad_norm": 1.037636160850525,
"learning_rate": 0.001,
"loss": 2.5775,
"num_input_tokens_seen": 14103342656,
"step": 53800
},
{
"epoch": 0.25686585496738495,
"grad_norm": 0.32030293345451355,
"learning_rate": 0.001,
"loss": 2.6242,
"num_input_tokens_seen": 14116449856,
"step": 53850
},
{
"epoch": 0.2571043562254791,
"grad_norm": 0.2339978665113449,
"learning_rate": 0.001,
"loss": 2.6122,
"num_input_tokens_seen": 14129557056,
"step": 53900
},
{
"epoch": 0.2573428574835732,
"grad_norm": 0.22179783880710602,
"learning_rate": 0.001,
"loss": 2.6025,
"num_input_tokens_seen": 14142664256,
"step": 53950
},
{
"epoch": 0.2575813587416674,
"grad_norm": 0.22616736590862274,
"learning_rate": 0.001,
"loss": 2.5916,
"num_input_tokens_seen": 14155771456,
"step": 54000
},
{
"epoch": 0.2575813587416674,
"eval_loss": 2.4871394634246826,
"eval_runtime": 53.8695,
"eval_samples_per_second": 92.817,
"eval_steps_per_second": 23.204,
"num_input_tokens_seen": 14155771456,
"step": 54000
},
{
"epoch": 0.2578198599997615,
"grad_norm": 0.2028844654560089,
"learning_rate": 0.001,
"loss": 2.6039,
"num_input_tokens_seen": 14168878656,
"step": 54050
},
{
"epoch": 0.25805836125785564,
"grad_norm": 0.19936658442020416,
"learning_rate": 0.001,
"loss": 2.5985,
"num_input_tokens_seen": 14181985856,
"step": 54100
},
{
"epoch": 0.2582968625159498,
"grad_norm": 0.2087993025779724,
"learning_rate": 0.001,
"loss": 2.62,
"num_input_tokens_seen": 14195093056,
"step": 54150
},
{
"epoch": 0.2585353637740439,
"grad_norm": 0.18972960114479065,
"learning_rate": 0.001,
"loss": 2.5936,
"num_input_tokens_seen": 14208200256,
"step": 54200
},
{
"epoch": 0.25877386503213806,
"grad_norm": 0.2162945419549942,
"learning_rate": 0.001,
"loss": 2.6125,
"num_input_tokens_seen": 14221307456,
"step": 54250
},
{
"epoch": 0.25901236629023217,
"grad_norm": 0.2538411319255829,
"learning_rate": 0.001,
"loss": 2.6197,
"num_input_tokens_seen": 14234414656,
"step": 54300
},
{
"epoch": 0.2592508675483263,
"grad_norm": 0.28060850501060486,
"learning_rate": 0.001,
"loss": 2.6194,
"num_input_tokens_seen": 14247521856,
"step": 54350
},
{
"epoch": 0.25948936880642043,
"grad_norm": 0.21557608246803284,
"learning_rate": 0.001,
"loss": 2.623,
"num_input_tokens_seen": 14260629056,
"step": 54400
},
{
"epoch": 0.2597278700645146,
"grad_norm": 0.21628426015377045,
"learning_rate": 0.001,
"loss": 2.6077,
"num_input_tokens_seen": 14273736256,
"step": 54450
},
{
"epoch": 0.25996637132260875,
"grad_norm": 0.19123327732086182,
"learning_rate": 0.001,
"loss": 2.5991,
"num_input_tokens_seen": 14286843456,
"step": 54500
},
{
"epoch": 0.25996637132260875,
"eval_loss": 2.4861645698547363,
"eval_runtime": 53.6448,
"eval_samples_per_second": 93.206,
"eval_steps_per_second": 23.301,
"num_input_tokens_seen": 14286843456,
"step": 54500
},
{
"epoch": 0.26020487258070285,
"grad_norm": 0.20462968945503235,
"learning_rate": 0.001,
"loss": 2.5887,
"num_input_tokens_seen": 14299950656,
"step": 54550
},
{
"epoch": 0.260443373838797,
"grad_norm": 0.20952938497066498,
"learning_rate": 0.001,
"loss": 2.608,
"num_input_tokens_seen": 14313057856,
"step": 54600
},
{
"epoch": 0.2606818750968911,
"grad_norm": 0.2095402032136917,
"learning_rate": 0.001,
"loss": 2.6079,
"num_input_tokens_seen": 14326165056,
"step": 54650
},
{
"epoch": 0.2609203763549853,
"grad_norm": 0.2343517541885376,
"learning_rate": 0.001,
"loss": 2.6124,
"num_input_tokens_seen": 14339272256,
"step": 54700
},
{
"epoch": 0.26115887761307943,
"grad_norm": 0.23840700089931488,
"learning_rate": 0.001,
"loss": 2.6015,
"num_input_tokens_seen": 14352379456,
"step": 54750
},
{
"epoch": 0.26139737887117354,
"grad_norm": 0.22024671733379364,
"learning_rate": 0.001,
"loss": 2.5812,
"num_input_tokens_seen": 14365486656,
"step": 54800
},
{
"epoch": 0.2616358801292677,
"grad_norm": 0.19884246587753296,
"learning_rate": 0.001,
"loss": 2.6118,
"num_input_tokens_seen": 14378593856,
"step": 54850
},
{
"epoch": 0.2618743813873618,
"grad_norm": 0.46560585498809814,
"learning_rate": 0.001,
"loss": 2.6024,
"num_input_tokens_seen": 14391701056,
"step": 54900
},
{
"epoch": 0.26211288264545596,
"grad_norm": 0.2956256568431854,
"learning_rate": 0.001,
"loss": 2.6073,
"num_input_tokens_seen": 14404808256,
"step": 54950
},
{
"epoch": 0.2623513839035501,
"grad_norm": 0.286327064037323,
"learning_rate": 0.001,
"loss": 2.5946,
"num_input_tokens_seen": 14417915456,
"step": 55000
},
{
"epoch": 0.2623513839035501,
"eval_loss": 2.4892399311065674,
"eval_runtime": 53.3184,
"eval_samples_per_second": 93.776,
"eval_steps_per_second": 23.444,
"num_input_tokens_seen": 14417915456,
"step": 55000
},
{
"epoch": 0.2625898851616442,
"grad_norm": 0.22046101093292236,
"learning_rate": 0.001,
"loss": 2.6077,
"num_input_tokens_seen": 14431022656,
"step": 55050
},
{
"epoch": 0.2628283864197384,
"grad_norm": 0.4682837724685669,
"learning_rate": 0.001,
"loss": 2.6065,
"num_input_tokens_seen": 14444129856,
"step": 55100
},
{
"epoch": 0.2630668876778325,
"grad_norm": 0.21442484855651855,
"learning_rate": 0.001,
"loss": 2.6079,
"num_input_tokens_seen": 14457237056,
"step": 55150
},
{
"epoch": 0.26330538893592664,
"grad_norm": 0.2513403296470642,
"learning_rate": 0.001,
"loss": 2.6037,
"num_input_tokens_seen": 14470344256,
"step": 55200
},
{
"epoch": 0.26354389019402075,
"grad_norm": 0.21526487171649933,
"learning_rate": 0.001,
"loss": 2.6049,
"num_input_tokens_seen": 14483451456,
"step": 55250
},
{
"epoch": 0.2637823914521149,
"grad_norm": 0.22567112743854523,
"learning_rate": 0.001,
"loss": 2.5953,
"num_input_tokens_seen": 14496558656,
"step": 55300
},
{
"epoch": 0.26402089271020907,
"grad_norm": 0.20226064324378967,
"learning_rate": 0.001,
"loss": 2.609,
"num_input_tokens_seen": 14509665856,
"step": 55350
},
{
"epoch": 0.26425939396830317,
"grad_norm": 0.31736019253730774,
"learning_rate": 0.001,
"loss": 2.6174,
"num_input_tokens_seen": 14522773056,
"step": 55400
},
{
"epoch": 0.26449789522639733,
"grad_norm": 0.2573414146900177,
"learning_rate": 0.001,
"loss": 2.612,
"num_input_tokens_seen": 14535880256,
"step": 55450
},
{
"epoch": 0.26473639648449143,
"grad_norm": 0.278160959482193,
"learning_rate": 0.001,
"loss": 2.6713,
"num_input_tokens_seen": 14548987456,
"step": 55500
},
{
"epoch": 0.26473639648449143,
"eval_loss": 2.5104730129241943,
"eval_runtime": 54.2403,
"eval_samples_per_second": 92.182,
"eval_steps_per_second": 23.046,
"num_input_tokens_seen": 14548987456,
"step": 55500
},
{
"epoch": 0.2649748977425856,
"grad_norm": 0.25843819975852966,
"learning_rate": 0.001,
"loss": 2.6223,
"num_input_tokens_seen": 14562094656,
"step": 55550
},
{
"epoch": 0.26521339900067975,
"grad_norm": 0.42813193798065186,
"learning_rate": 0.001,
"loss": 2.6114,
"num_input_tokens_seen": 14575201856,
"step": 55600
},
{
"epoch": 0.26545190025877385,
"grad_norm": 0.23324181139469147,
"learning_rate": 0.001,
"loss": 2.6149,
"num_input_tokens_seen": 14588309056,
"step": 55650
},
{
"epoch": 0.265690401516868,
"grad_norm": 0.2795487940311432,
"learning_rate": 0.001,
"loss": 2.6067,
"num_input_tokens_seen": 14601416256,
"step": 55700
},
{
"epoch": 0.2659289027749621,
"grad_norm": 0.6856834888458252,
"learning_rate": 0.001,
"loss": 2.6135,
"num_input_tokens_seen": 14614523456,
"step": 55750
},
{
"epoch": 0.2661674040330563,
"grad_norm": 0.348906934261322,
"learning_rate": 0.001,
"loss": 2.6384,
"num_input_tokens_seen": 14627630656,
"step": 55800
},
{
"epoch": 0.26640590529115044,
"grad_norm": 0.2510247528553009,
"learning_rate": 0.001,
"loss": 2.6224,
"num_input_tokens_seen": 14640737856,
"step": 55850
},
{
"epoch": 0.26664440654924454,
"grad_norm": 0.34429189562797546,
"learning_rate": 0.001,
"loss": 2.6139,
"num_input_tokens_seen": 14653845056,
"step": 55900
},
{
"epoch": 0.2668829078073387,
"grad_norm": 0.25697243213653564,
"learning_rate": 0.001,
"loss": 2.6143,
"num_input_tokens_seen": 14666952256,
"step": 55950
},
{
"epoch": 0.2671214090654328,
"grad_norm": 0.2812611758708954,
"learning_rate": 0.001,
"loss": 2.6172,
"num_input_tokens_seen": 14680059456,
"step": 56000
},
{
"epoch": 0.2671214090654328,
"eval_loss": 2.492490291595459,
"eval_runtime": 53.3814,
"eval_samples_per_second": 93.666,
"eval_steps_per_second": 23.416,
"num_input_tokens_seen": 14680059456,
"step": 56000
},
{
"epoch": 0.26735991032352696,
"grad_norm": 0.22615984082221985,
"learning_rate": 0.0009999685283773503,
"loss": 2.5961,
"num_input_tokens_seen": 14693166656,
"step": 56050
},
{
"epoch": 0.2675984115816211,
"grad_norm": 0.2738794982433319,
"learning_rate": 0.0009998741174712534,
"loss": 2.612,
"num_input_tokens_seen": 14706273856,
"step": 56100
},
{
"epoch": 0.2678369128397152,
"grad_norm": 0.23470066487789154,
"learning_rate": 0.0009997167791667668,
"loss": 2.6071,
"num_input_tokens_seen": 14719381056,
"step": 56150
},
{
"epoch": 0.2680754140978094,
"grad_norm": 0.23558543622493744,
"learning_rate": 0.0009994965332706573,
"loss": 2.5956,
"num_input_tokens_seen": 14732488256,
"step": 56200
},
{
"epoch": 0.2683139153559035,
"grad_norm": 0.2274416983127594,
"learning_rate": 0.0009992134075089082,
"loss": 2.5873,
"num_input_tokens_seen": 14745595456,
"step": 56250
},
{
"epoch": 0.26855241661399765,
"grad_norm": 0.21609161794185638,
"learning_rate": 0.000998867437523228,
"loss": 2.6043,
"num_input_tokens_seen": 14758702656,
"step": 56300
},
{
"epoch": 0.26879091787209175,
"grad_norm": 0.2368565797805786,
"learning_rate": 0.000998458666866564,
"loss": 2.5952,
"num_input_tokens_seen": 14771809856,
"step": 56350
},
{
"epoch": 0.2690294191301859,
"grad_norm": 0.22180891036987305,
"learning_rate": 0.0009979871469976197,
"loss": 2.5934,
"num_input_tokens_seen": 14784917056,
"step": 56400
},
{
"epoch": 0.26926792038828007,
"grad_norm": 0.3060019910335541,
"learning_rate": 0.0009974529372743762,
"loss": 2.6224,
"num_input_tokens_seen": 14798024256,
"step": 56450
},
{
"epoch": 0.2695064216463742,
"grad_norm": 0.2387322634458542,
"learning_rate": 0.0009968561049466214,
"loss": 2.5905,
"num_input_tokens_seen": 14811131456,
"step": 56500
},
{
"epoch": 0.2695064216463742,
"eval_loss": 2.4835996627807617,
"eval_runtime": 53.8478,
"eval_samples_per_second": 92.854,
"eval_steps_per_second": 23.214,
"num_input_tokens_seen": 14811131456,
"step": 56500
},
{
"epoch": 0.26974492290446833,
"grad_norm": 0.22091372311115265,
"learning_rate": 0.0009961967251474822,
"loss": 2.6139,
"num_input_tokens_seen": 14824238656,
"step": 56550
},
{
"epoch": 0.26998342416256244,
"grad_norm": 0.2304680198431015,
"learning_rate": 0.0009954748808839674,
"loss": 2.6167,
"num_input_tokens_seen": 14837345856,
"step": 56600
},
{
"epoch": 0.2702219254206566,
"grad_norm": 0.19777421653270721,
"learning_rate": 0.0009946906630265184,
"loss": 2.6082,
"num_input_tokens_seen": 14850453056,
"step": 56650
},
{
"epoch": 0.27046042667875075,
"grad_norm": 0.2113979458808899,
"learning_rate": 0.0009938441702975688,
"loss": 2.5981,
"num_input_tokens_seen": 14863560256,
"step": 56700
},
{
"epoch": 0.27069892793684486,
"grad_norm": 0.19911637902259827,
"learning_rate": 0.0009929355092591179,
"loss": 2.5904,
"num_input_tokens_seen": 14876667456,
"step": 56750
},
{
"epoch": 0.270937429194939,
"grad_norm": 0.20081694424152374,
"learning_rate": 0.0009919647942993148,
"loss": 2.6012,
"num_input_tokens_seen": 14889774656,
"step": 56800
},
{
"epoch": 0.2711759304530331,
"grad_norm": 0.22752800583839417,
"learning_rate": 0.0009909321476180592,
"loss": 2.6017,
"num_input_tokens_seen": 14902881856,
"step": 56850
},
{
"epoch": 0.2714144317111273,
"grad_norm": 0.23174402117729187,
"learning_rate": 0.0009898376992116178,
"loss": 2.6012,
"num_input_tokens_seen": 14915989056,
"step": 56900
},
{
"epoch": 0.27165293296922144,
"grad_norm": 0.22149533033370972,
"learning_rate": 0.0009886815868562597,
"loss": 2.5881,
"num_input_tokens_seen": 14929096256,
"step": 56950
},
{
"epoch": 0.27189143422731554,
"grad_norm": 0.22576771676540375,
"learning_rate": 0.0009874639560909118,
"loss": 2.6021,
"num_input_tokens_seen": 14942203456,
"step": 57000
},
{
"epoch": 0.27189143422731554,
"eval_loss": 2.482896566390991,
"eval_runtime": 53.3773,
"eval_samples_per_second": 93.673,
"eval_steps_per_second": 23.418,
"num_input_tokens_seen": 14942203456,
"step": 57000
},
{
"epoch": 0.2721299354854097,
"grad_norm": 0.22044019401073456,
"learning_rate": 0.0009861849601988384,
"loss": 2.6119,
"num_input_tokens_seen": 14955310656,
"step": 57050
},
{
"epoch": 0.2723684367435038,
"grad_norm": 0.2155238389968872,
"learning_rate": 0.0009848447601883434,
"loss": 2.5869,
"num_input_tokens_seen": 14968417856,
"step": 57100
},
{
"epoch": 0.27260693800159796,
"grad_norm": 0.21131549775600433,
"learning_rate": 0.0009834435247725033,
"loss": 2.5988,
"num_input_tokens_seen": 14981525056,
"step": 57150
},
{
"epoch": 0.27284543925969207,
"grad_norm": 0.21247337758541107,
"learning_rate": 0.0009819814303479266,
"loss": 2.6198,
"num_input_tokens_seen": 14994632256,
"step": 57200
},
{
"epoch": 0.27308394051778623,
"grad_norm": 0.21916711330413818,
"learning_rate": 0.00098045866097255,
"loss": 2.6019,
"num_input_tokens_seen": 15007739456,
"step": 57250
},
{
"epoch": 0.2733224417758804,
"grad_norm": 0.1925441473722458,
"learning_rate": 0.0009788754083424652,
"loss": 2.6143,
"num_input_tokens_seen": 15020846656,
"step": 57300
},
{
"epoch": 0.2735609430339745,
"grad_norm": 0.38578665256500244,
"learning_rate": 0.0009772318717677904,
"loss": 2.6037,
"num_input_tokens_seen": 15033953856,
"step": 57350
},
{
"epoch": 0.27379944429206865,
"grad_norm": 0.19650611281394958,
"learning_rate": 0.0009755282581475768,
"loss": 2.5745,
"num_input_tokens_seen": 15047061056,
"step": 57400
},
{
"epoch": 0.27403794555016275,
"grad_norm": 0.2376088798046112,
"learning_rate": 0.0009737647819437645,
"loss": 2.5968,
"num_input_tokens_seen": 15060168256,
"step": 57450
},
{
"epoch": 0.2742764468082569,
"grad_norm": 0.21746863424777985,
"learning_rate": 0.0009719416651541838,
"loss": 2.5965,
"num_input_tokens_seen": 15073275456,
"step": 57500
},
{
"epoch": 0.2742764468082569,
"eval_loss": 2.483751058578491,
"eval_runtime": 53.9622,
"eval_samples_per_second": 92.657,
"eval_steps_per_second": 23.164,
"num_input_tokens_seen": 15073275456,
"step": 57500
},
{
"epoch": 0.27451494806635107,
"grad_norm": 0.2898815870285034,
"learning_rate": 0.0009700591372846095,
"loss": 2.6105,
"num_input_tokens_seen": 15086382656,
"step": 57550
},
{
"epoch": 0.2747534493244452,
"grad_norm": 0.24887384474277496,
"learning_rate": 0.0009681174353198686,
"loss": 2.6103,
"num_input_tokens_seen": 15099489856,
"step": 57600
},
{
"epoch": 0.27499195058253934,
"grad_norm": 0.26613715291023254,
"learning_rate": 0.0009661168036940071,
"loss": 2.6296,
"num_input_tokens_seen": 15112597056,
"step": 57650
},
{
"epoch": 0.27523045184063344,
"grad_norm": 0.23983849585056305,
"learning_rate": 0.0009640574942595195,
"loss": 2.6008,
"num_input_tokens_seen": 15125704256,
"step": 57700
},
{
"epoch": 0.2754689530987276,
"grad_norm": 0.23169022798538208,
"learning_rate": 0.0009619397662556434,
"loss": 2.596,
"num_input_tokens_seen": 15138811456,
"step": 57750
},
{
"epoch": 0.27570745435682176,
"grad_norm": 0.21353812515735626,
"learning_rate": 0.0009597638862757254,
"loss": 2.6039,
"num_input_tokens_seen": 15151918656,
"step": 57800
},
{
"epoch": 0.27594595561491586,
"grad_norm": 0.2561227083206177,
"learning_rate": 0.00095753012823366,
"loss": 2.6046,
"num_input_tokens_seen": 15165025856,
"step": 57850
},
{
"epoch": 0.27618445687301,
"grad_norm": 0.20380394160747528,
"learning_rate": 0.000955238773329408,
"loss": 2.5968,
"num_input_tokens_seen": 15178133056,
"step": 57900
},
{
"epoch": 0.2764229581311041,
"grad_norm": 0.26447024941444397,
"learning_rate": 0.000952890110013597,
"loss": 2.5848,
"num_input_tokens_seen": 15191240256,
"step": 57950
},
{
"epoch": 0.2766614593891983,
"grad_norm": 0.23530781269073486,
"learning_rate": 0.0009504844339512095,
"loss": 2.582,
"num_input_tokens_seen": 15204347456,
"step": 58000
},
{
"epoch": 0.2766614593891983,
"eval_loss": 2.482050895690918,
"eval_runtime": 53.5775,
"eval_samples_per_second": 93.323,
"eval_steps_per_second": 23.331,
"num_input_tokens_seen": 15204347456,
"step": 58000
},
{
"epoch": 0.2768999606472924,
"grad_norm": 0.2281644344329834,
"learning_rate": 0.0009480220479843627,
"loss": 2.6212,
"num_input_tokens_seen": 15217454656,
"step": 58050
},
{
"epoch": 0.27713846190538655,
"grad_norm": 0.2181713730096817,
"learning_rate": 0.0009455032620941839,
"loss": 2.5927,
"num_input_tokens_seen": 15230561856,
"step": 58100
},
{
"epoch": 0.2773769631634807,
"grad_norm": 0.21573083102703094,
"learning_rate": 0.00094292839336179,
"loss": 2.6112,
"num_input_tokens_seen": 15243669056,
"step": 58150
},
{
"epoch": 0.2776154644215748,
"grad_norm": 0.2686486840248108,
"learning_rate": 0.000940297765928369,
"loss": 2.6133,
"num_input_tokens_seen": 15256776256,
"step": 58200
},
{
"epoch": 0.27785396567966897,
"grad_norm": 0.2320137470960617,
"learning_rate": 0.0009376117109543769,
"loss": 2.6094,
"num_input_tokens_seen": 15269883456,
"step": 58250
},
{
"epoch": 0.27809246693776307,
"grad_norm": 0.22277672588825226,
"learning_rate": 0.0009348705665778478,
"loss": 2.5885,
"num_input_tokens_seen": 15282990656,
"step": 58300
},
{
"epoch": 0.27833096819585723,
"grad_norm": 0.22681231796741486,
"learning_rate": 0.0009320746778718274,
"loss": 2.6005,
"num_input_tokens_seen": 15296097856,
"step": 58350
},
{
"epoch": 0.2785694694539514,
"grad_norm": 0.25187453627586365,
"learning_rate": 0.000929224396800933,
"loss": 2.5944,
"num_input_tokens_seen": 15309205056,
"step": 58400
},
{
"epoch": 0.2788079707120455,
"grad_norm": 0.24962358176708221,
"learning_rate": 0.0009263200821770461,
"loss": 2.5888,
"num_input_tokens_seen": 15322312256,
"step": 58450
},
{
"epoch": 0.27904647197013965,
"grad_norm": 0.18929679691791534,
"learning_rate": 0.0009233620996141421,
"loss": 2.5927,
"num_input_tokens_seen": 15335419456,
"step": 58500
},
{
"epoch": 0.27904647197013965,
"eval_loss": 2.4754066467285156,
"eval_runtime": 53.7558,
"eval_samples_per_second": 93.013,
"eval_steps_per_second": 23.253,
"num_input_tokens_seen": 15335419456,
"step": 58500
},
{
"epoch": 0.27928497322823376,
"grad_norm": 0.22240912914276123,
"learning_rate": 0.0009203508214822651,
"loss": 2.5944,
"num_input_tokens_seen": 15348526656,
"step": 58550
},
{
"epoch": 0.2795234744863279,
"grad_norm": 0.2096235305070877,
"learning_rate": 0.0009172866268606513,
"loss": 2.5964,
"num_input_tokens_seen": 15361633856,
"step": 58600
},
{
"epoch": 0.2797619757444221,
"grad_norm": 0.2913396954536438,
"learning_rate": 0.0009141699014900082,
"loss": 2.5975,
"num_input_tokens_seen": 15374741056,
"step": 58650
},
{
"epoch": 0.2800004770025162,
"grad_norm": 0.21000444889068604,
"learning_rate": 0.0009110010377239551,
"loss": 2.5987,
"num_input_tokens_seen": 15387848256,
"step": 58700
},
{
"epoch": 0.28023897826061034,
"grad_norm": 0.18561489880084991,
"learning_rate": 0.0009077804344796301,
"loss": 2.5955,
"num_input_tokens_seen": 15400955456,
"step": 58750
},
{
"epoch": 0.28047747951870444,
"grad_norm": 0.330816388130188,
"learning_rate": 0.0009045084971874737,
"loss": 2.5837,
"num_input_tokens_seen": 15414062656,
"step": 58800
},
{
"epoch": 0.2807159807767986,
"grad_norm": 0.21823953092098236,
"learning_rate": 0.000901185637740189,
"loss": 2.5921,
"num_input_tokens_seen": 15427169856,
"step": 58850
},
{
"epoch": 0.28095448203489276,
"grad_norm": 0.28721505403518677,
"learning_rate": 0.0008978122744408905,
"loss": 2.5893,
"num_input_tokens_seen": 15440277056,
"step": 58900
},
{
"epoch": 0.28119298329298686,
"grad_norm": 0.2468225359916687,
"learning_rate": 0.0008943888319504456,
"loss": 2.5999,
"num_input_tokens_seen": 15453384256,
"step": 58950
},
{
"epoch": 0.281431484551081,
"grad_norm": 0.20486761629581451,
"learning_rate": 0.000890915741234015,
"loss": 2.6026,
"num_input_tokens_seen": 15466491456,
"step": 59000
},
{
"epoch": 0.281431484551081,
"eval_loss": 2.4756667613983154,
"eval_runtime": 53.3408,
"eval_samples_per_second": 93.737,
"eval_steps_per_second": 23.434,
"num_input_tokens_seen": 15466491456,
"step": 59000
},
{
"epoch": 0.2816699858091751,
"grad_norm": 0.3338637351989746,
"learning_rate": 0.0008873934395068005,
"loss": 2.587,
"num_input_tokens_seen": 15479598656,
"step": 59050
},
{
"epoch": 0.2819084870672693,
"grad_norm": 0.20848780870437622,
"learning_rate": 0.0008838223701790055,
"loss": 2.5989,
"num_input_tokens_seen": 15492705856,
"step": 59100
},
{
"epoch": 0.2821469883253634,
"grad_norm": 0.21479378640651703,
"learning_rate": 0.0008802029828000156,
"loss": 2.6052,
"num_input_tokens_seen": 15505813056,
"step": 59150
},
{
"epoch": 0.28238548958345755,
"grad_norm": 0.1944151073694229,
"learning_rate": 0.0008765357330018055,
"loss": 2.6044,
"num_input_tokens_seen": 15518920256,
"step": 59200
},
{
"epoch": 0.2826239908415517,
"grad_norm": 0.2078033685684204,
"learning_rate": 0.0008728210824415827,
"loss": 2.5929,
"num_input_tokens_seen": 15532027456,
"step": 59250
},
{
"epoch": 0.2828624920996458,
"grad_norm": 0.19340284168720245,
"learning_rate": 0.0008690594987436704,
"loss": 2.5875,
"num_input_tokens_seen": 15545134656,
"step": 59300
},
{
"epoch": 0.28310099335773997,
"grad_norm": 0.22354012727737427,
"learning_rate": 0.0008652514554406388,
"loss": 2.5976,
"num_input_tokens_seen": 15558241856,
"step": 59350
},
{
"epoch": 0.2833394946158341,
"grad_norm": 0.26784005761146545,
"learning_rate": 0.0008613974319136957,
"loss": 2.5868,
"num_input_tokens_seen": 15571349056,
"step": 59400
},
{
"epoch": 0.28357799587392823,
"grad_norm": 0.20749828219413757,
"learning_rate": 0.0008574979133323377,
"loss": 2.5784,
"num_input_tokens_seen": 15584456256,
"step": 59450
},
{
"epoch": 0.2838164971320224,
"grad_norm": 0.21545729041099548,
"learning_rate": 0.0008535533905932737,
"loss": 2.5939,
"num_input_tokens_seen": 15597563456,
"step": 59500
},
{
"epoch": 0.2838164971320224,
"eval_loss": 2.469989538192749,
"eval_runtime": 54.0784,
"eval_samples_per_second": 92.458,
"eval_steps_per_second": 23.115,
"num_input_tokens_seen": 15597563456,
"step": 59500
},
{
"epoch": 0.2840549983901165,
"grad_norm": 0.20836423337459564,
"learning_rate": 0.0008495643602586287,
"loss": 2.5858,
"num_input_tokens_seen": 15610670656,
"step": 59550
},
{
"epoch": 0.28429349964821066,
"grad_norm": 0.20427604019641876,
"learning_rate": 0.0008455313244934324,
"loss": 2.5781,
"num_input_tokens_seen": 15623777856,
"step": 59600
},
{
"epoch": 0.28453200090630476,
"grad_norm": 0.2341683804988861,
"learning_rate": 0.0008414547910024035,
"loss": 2.5713,
"num_input_tokens_seen": 15636885056,
"step": 59650
},
{
"epoch": 0.2847705021643989,
"grad_norm": 0.20808522403240204,
"learning_rate": 0.0008373352729660373,
"loss": 2.5751,
"num_input_tokens_seen": 15649992256,
"step": 59700
},
{
"epoch": 0.2850090034224931,
"grad_norm": 0.21032562851905823,
"learning_rate": 0.000833173288976002,
"loss": 2.5784,
"num_input_tokens_seen": 15663099456,
"step": 59750
},
{
"epoch": 0.2852475046805872,
"grad_norm": 0.23485584557056427,
"learning_rate": 0.0008289693629698564,
"loss": 2.5974,
"num_input_tokens_seen": 15676206656,
"step": 59800
},
{
"epoch": 0.28548600593868134,
"grad_norm": 0.2229880541563034,
"learning_rate": 0.0008247240241650918,
"loss": 2.5834,
"num_input_tokens_seen": 15689313856,
"step": 59850
},
{
"epoch": 0.28572450719677545,
"grad_norm": 0.21837118268013,
"learning_rate": 0.000820437806992512,
"loss": 2.5734,
"num_input_tokens_seen": 15702421056,
"step": 59900
},
{
"epoch": 0.2859630084548696,
"grad_norm": 0.2157929688692093,
"learning_rate": 0.0008161112510289549,
"loss": 2.587,
"num_input_tokens_seen": 15715528256,
"step": 59950
},
{
"epoch": 0.2862015097129637,
"grad_norm": 0.24053893983364105,
"learning_rate": 0.0008117449009293668,
"loss": 2.5853,
"num_input_tokens_seen": 15728635456,
"step": 60000
},
{
"epoch": 0.2862015097129637,
"eval_loss": 2.470459461212158,
"eval_runtime": 53.5859,
"eval_samples_per_second": 93.308,
"eval_steps_per_second": 23.327,
"num_input_tokens_seen": 15728635456,
"step": 60000
},
{
"epoch": 0.28644001097105787,
"grad_norm": 0.25951045751571655,
"learning_rate": 0.0008073393063582386,
"loss": 2.5946,
"num_input_tokens_seen": 15741742656,
"step": 60050
},
{
"epoch": 0.286678512229152,
"grad_norm": 0.22712726891040802,
"learning_rate": 0.00080289502192041,
"loss": 2.5882,
"num_input_tokens_seen": 15754849856,
"step": 60100
},
{
"epoch": 0.28691701348724613,
"grad_norm": 0.2236946076154709,
"learning_rate": 0.0007984126070912518,
"loss": 2.5854,
"num_input_tokens_seen": 15767957056,
"step": 60150
},
{
"epoch": 0.2871555147453403,
"grad_norm": 0.3175867795944214,
"learning_rate": 0.0007938926261462366,
"loss": 2.5855,
"num_input_tokens_seen": 15781064256,
"step": 60200
},
{
"epoch": 0.2873940160034344,
"grad_norm": 0.22954128682613373,
"learning_rate": 0.000789335648089903,
"loss": 2.595,
"num_input_tokens_seen": 15794171456,
"step": 60250
},
{
"epoch": 0.28763251726152855,
"grad_norm": 0.23379147052764893,
"learning_rate": 0.000784742246584226,
"loss": 2.5872,
"num_input_tokens_seen": 15807278656,
"step": 60300
},
{
"epoch": 0.2878710185196227,
"grad_norm": 0.22107115387916565,
"learning_rate": 0.0007801129998764014,
"loss": 2.5704,
"num_input_tokens_seen": 15820385856,
"step": 60350
},
{
"epoch": 0.2881095197777168,
"grad_norm": 0.21197494864463806,
"learning_rate": 0.0007754484907260512,
"loss": 2.5751,
"num_input_tokens_seen": 15833493056,
"step": 60400
},
{
"epoch": 0.288348021035811,
"grad_norm": 0.21372662484645844,
"learning_rate": 0.0007707493063318629,
"loss": 2.5901,
"num_input_tokens_seen": 15846600256,
"step": 60450
},
{
"epoch": 0.2885865222939051,
"grad_norm": 0.23300603032112122,
"learning_rate": 0.0007660160382576683,
"loss": 2.5888,
"num_input_tokens_seen": 15859707456,
"step": 60500
},
{
"epoch": 0.2885865222939051,
"eval_loss": 2.463745355606079,
"eval_runtime": 53.032,
"eval_samples_per_second": 94.283,
"eval_steps_per_second": 23.571,
"num_input_tokens_seen": 15859707456,
"step": 60500
},
{
"epoch": 0.28882502355199924,
"grad_norm": 0.2108684778213501,
"learning_rate": 0.0007612492823579744,
"loss": 2.5965,
"num_input_tokens_seen": 15872814656,
"step": 60550
},
{
"epoch": 0.2890635248100934,
"grad_norm": 0.20625820755958557,
"learning_rate": 0.0007564496387029531,
"loss": 2.5615,
"num_input_tokens_seen": 15885921856,
"step": 60600
},
{
"epoch": 0.2893020260681875,
"grad_norm": 0.22595694661140442,
"learning_rate": 0.0007516177115029001,
"loss": 2.5871,
"num_input_tokens_seen": 15899029056,
"step": 60650
},
{
"epoch": 0.28954052732628166,
"grad_norm": 0.2095574140548706,
"learning_rate": 0.0007467541090321735,
"loss": 2.5867,
"num_input_tokens_seen": 15912136256,
"step": 60700
},
{
"epoch": 0.28977902858437576,
"grad_norm": 0.1979990303516388,
"learning_rate": 0.00074185944355262,
"loss": 2.586,
"num_input_tokens_seen": 15925243456,
"step": 60750
},
{
"epoch": 0.2900175298424699,
"grad_norm": 0.3573000431060791,
"learning_rate": 0.0007369343312364993,
"loss": 2.5807,
"num_input_tokens_seen": 15938350656,
"step": 60800
},
{
"epoch": 0.2902560311005641,
"grad_norm": 0.2209523618221283,
"learning_rate": 0.0007319793920889171,
"loss": 2.5867,
"num_input_tokens_seen": 15951457856,
"step": 60850
},
{
"epoch": 0.2904945323586582,
"grad_norm": 0.1979866325855255,
"learning_rate": 0.0007269952498697733,
"loss": 2.5679,
"num_input_tokens_seen": 15964565056,
"step": 60900
},
{
"epoch": 0.29073303361675235,
"grad_norm": 0.2013344019651413,
"learning_rate": 0.0007219825320152411,
"loss": 2.5842,
"num_input_tokens_seen": 15977672256,
"step": 60950
},
{
"epoch": 0.29097153487484645,
"grad_norm": 0.20511233806610107,
"learning_rate": 0.0007169418695587791,
"loss": 2.5864,
"num_input_tokens_seen": 15990779456,
"step": 61000
},
{
"epoch": 0.29097153487484645,
"eval_loss": 2.4598097801208496,
"eval_runtime": 53.5493,
"eval_samples_per_second": 93.372,
"eval_steps_per_second": 23.343,
"num_input_tokens_seen": 15990779456,
"step": 61000
},
{
"epoch": 0.2912100361329406,
"grad_norm": 0.19767510890960693,
"learning_rate": 0.0007118738970516943,
"loss": 2.5963,
"num_input_tokens_seen": 16003886656,
"step": 61050
},
{
"epoch": 0.2914485373910347,
"grad_norm": 0.21463529765605927,
"learning_rate": 0.0007067792524832604,
"loss": 2.5825,
"num_input_tokens_seen": 16016993856,
"step": 61100
},
{
"epoch": 0.29168703864912887,
"grad_norm": 0.2011532485485077,
"learning_rate": 0.0007016585772004026,
"loss": 2.5783,
"num_input_tokens_seen": 16030101056,
"step": 61150
},
{
"epoch": 0.29192553990722303,
"grad_norm": 0.19351401925086975,
"learning_rate": 0.0006965125158269618,
"loss": 2.5619,
"num_input_tokens_seen": 16043208256,
"step": 61200
},
{
"epoch": 0.29216404116531713,
"grad_norm": 0.1988568902015686,
"learning_rate": 0.000691341716182545,
"loss": 2.6007,
"num_input_tokens_seen": 16056315456,
"step": 61250
},
{
"epoch": 0.2924025424234113,
"grad_norm": 0.20459413528442383,
"learning_rate": 0.0006861468292009726,
"loss": 2.5762,
"num_input_tokens_seen": 16069422656,
"step": 61300
},
{
"epoch": 0.2926410436815054,
"grad_norm": 0.1914205551147461,
"learning_rate": 0.0006809285088483361,
"loss": 2.5734,
"num_input_tokens_seen": 16082529856,
"step": 61350
},
{
"epoch": 0.29287954493959956,
"grad_norm": 0.194325253367424,
"learning_rate": 0.0006756874120406714,
"loss": 2.5874,
"num_input_tokens_seen": 16095637056,
"step": 61400
},
{
"epoch": 0.2931180461976937,
"grad_norm": 0.20854853093624115,
"learning_rate": 0.0006704241985612625,
"loss": 2.5865,
"num_input_tokens_seen": 16108744256,
"step": 61450
},
{
"epoch": 0.2933565474557878,
"grad_norm": 0.190395787358284,
"learning_rate": 0.0006651395309775837,
"loss": 2.5716,
"num_input_tokens_seen": 16121851456,
"step": 61500
},
{
"epoch": 0.2933565474557878,
"eval_loss": 2.4551966190338135,
"eval_runtime": 53.3343,
"eval_samples_per_second": 93.748,
"eval_steps_per_second": 23.437,
"num_input_tokens_seen": 16121851456,
"step": 61500
},
{
"epoch": 0.293595048713882,
"grad_norm": 0.20652073621749878,
"learning_rate": 0.0006598340745578908,
"loss": 2.5765,
"num_input_tokens_seen": 16134958656,
"step": 61550
},
{
"epoch": 0.2938335499719761,
"grad_norm": 0.20701836049556732,
"learning_rate": 0.0006545084971874737,
"loss": 2.5653,
"num_input_tokens_seen": 16148065856,
"step": 61600
},
{
"epoch": 0.29407205123007024,
"grad_norm": 0.1792392134666443,
"learning_rate": 0.000649163469284578,
"loss": 2.577,
"num_input_tokens_seen": 16161173056,
"step": 61650
},
{
"epoch": 0.2943105524881644,
"grad_norm": 0.21742790937423706,
"learning_rate": 0.0006437996637160086,
"loss": 2.574,
"num_input_tokens_seen": 16174280256,
"step": 61700
},
{
"epoch": 0.2945490537462585,
"grad_norm": 0.20747682452201843,
"learning_rate": 0.0006384177557124247,
"loss": 2.564,
"num_input_tokens_seen": 16187387456,
"step": 61750
},
{
"epoch": 0.29478755500435266,
"grad_norm": 0.19990311563014984,
"learning_rate": 0.0006330184227833376,
"loss": 2.5866,
"num_input_tokens_seen": 16200494656,
"step": 61800
},
{
"epoch": 0.29502605626244677,
"grad_norm": 0.20410317182540894,
"learning_rate": 0.0006276023446318213,
"loss": 2.5559,
"num_input_tokens_seen": 16213601856,
"step": 61850
},
{
"epoch": 0.2952645575205409,
"grad_norm": 0.19365034997463226,
"learning_rate": 0.000622170203068947,
"loss": 2.5705,
"num_input_tokens_seen": 16226709056,
"step": 61900
},
{
"epoch": 0.29550305877863503,
"grad_norm": 0.2115161269903183,
"learning_rate": 0.0006167226819279528,
"loss": 2.5621,
"num_input_tokens_seen": 16239816256,
"step": 61950
},
{
"epoch": 0.2957415600367292,
"grad_norm": 0.22992485761642456,
"learning_rate": 0.0006112604669781572,
"loss": 2.5587,
"num_input_tokens_seen": 16252923456,
"step": 62000
},
{
"epoch": 0.2957415600367292,
"eval_loss": 2.452096462249756,
"eval_runtime": 53.6354,
"eval_samples_per_second": 93.222,
"eval_steps_per_second": 23.306,
"num_input_tokens_seen": 16252923456,
"step": 62000
},
{
"epoch": 0.29598006129482335,
"grad_norm": 0.1945638656616211,
"learning_rate": 0.0006057842458386314,
"loss": 2.5582,
"num_input_tokens_seen": 16266030656,
"step": 62050
},
{
"epoch": 0.29621856255291745,
"grad_norm": 0.201882466673851,
"learning_rate": 0.0006002947078916364,
"loss": 2.5764,
"num_input_tokens_seen": 16279137856,
"step": 62100
},
{
"epoch": 0.2964570638110116,
"grad_norm": 0.2137998789548874,
"learning_rate": 0.0005947925441958392,
"loss": 2.5689,
"num_input_tokens_seen": 16292245056,
"step": 62150
},
{
"epoch": 0.2966955650691057,
"grad_norm": 0.18265672028064728,
"learning_rate": 0.0005892784473993184,
"loss": 2.5741,
"num_input_tokens_seen": 16305352256,
"step": 62200
},
{
"epoch": 0.2969340663271999,
"grad_norm": 0.16944251954555511,
"learning_rate": 0.0005837531116523682,
"loss": 2.5537,
"num_input_tokens_seen": 16318459456,
"step": 62250
},
{
"epoch": 0.29717256758529403,
"grad_norm": 0.20273485779762268,
"learning_rate": 0.0005782172325201155,
"loss": 2.5512,
"num_input_tokens_seen": 16331566656,
"step": 62300
},
{
"epoch": 0.29741106884338814,
"grad_norm": 0.19320476055145264,
"learning_rate": 0.0005726715068949564,
"loss": 2.5823,
"num_input_tokens_seen": 16344673856,
"step": 62350
},
{
"epoch": 0.2976495701014823,
"grad_norm": 0.21321871876716614,
"learning_rate": 0.0005671166329088278,
"loss": 2.5608,
"num_input_tokens_seen": 16357781056,
"step": 62400
},
{
"epoch": 0.2978880713595764,
"grad_norm": 0.2007117122411728,
"learning_rate": 0.0005615533098453215,
"loss": 2.5685,
"num_input_tokens_seen": 16370888256,
"step": 62450
},
{
"epoch": 0.29812657261767056,
"grad_norm": 0.1896267682313919,
"learning_rate": 0.0005559822380516539,
"loss": 2.56,
"num_input_tokens_seen": 16383995456,
"step": 62500
},
{
"epoch": 0.29812657261767056,
"eval_loss": 2.448042154312134,
"eval_runtime": 54.1994,
"eval_samples_per_second": 92.252,
"eval_steps_per_second": 23.063,
"num_input_tokens_seen": 16383995456,
"step": 62500
},
{
"epoch": 0.2983650738757647,
"grad_norm": 0.18581034243106842,
"learning_rate": 0.0005504041188505022,
"loss": 2.5691,
"num_input_tokens_seen": 16397102656,
"step": 62550
},
{
"epoch": 0.2986035751338588,
"grad_norm": 0.19272533059120178,
"learning_rate": 0.0005448196544517168,
"loss": 2.5635,
"num_input_tokens_seen": 16410209856,
"step": 62600
},
{
"epoch": 0.298842076391953,
"grad_norm": 0.19940300285816193,
"learning_rate": 0.0005392295478639225,
"loss": 2.5755,
"num_input_tokens_seen": 16423317056,
"step": 62650
},
{
"epoch": 0.2990805776500471,
"grad_norm": 0.18894875049591064,
"learning_rate": 0.0005336345028060199,
"loss": 2.5718,
"num_input_tokens_seen": 16436424256,
"step": 62700
},
{
"epoch": 0.29931907890814125,
"grad_norm": 0.19226962327957153,
"learning_rate": 0.0005280352236185959,
"loss": 2.563,
"num_input_tokens_seen": 16449531456,
"step": 62750
},
{
"epoch": 0.2995575801662354,
"grad_norm": 0.20716702938079834,
"learning_rate": 0.0005224324151752575,
"loss": 2.5532,
"num_input_tokens_seen": 16462638656,
"step": 62800
},
{
"epoch": 0.2997960814243295,
"grad_norm": 0.20232325792312622,
"learning_rate": 0.000516826782793897,
"loss": 2.5691,
"num_input_tokens_seen": 16475745856,
"step": 62850
},
{
"epoch": 0.30003458268242367,
"grad_norm": 0.19828926026821136,
"learning_rate": 0.0005112190321479025,
"loss": 2.5602,
"num_input_tokens_seen": 16488853056,
"step": 62900
},
{
"epoch": 0.30027308394051777,
"grad_norm": 0.22366905212402344,
"learning_rate": 0.000505609869177323,
"loss": 2.5556,
"num_input_tokens_seen": 16501960256,
"step": 62950
},
{
"epoch": 0.30051158519861193,
"grad_norm": 0.1883884221315384,
"learning_rate": 0.0005,
"loss": 2.5567,
"num_input_tokens_seen": 16515067456,
"step": 63000
},
{
"epoch": 0.30051158519861193,
"eval_loss": 2.4441678524017334,
"eval_runtime": 54.2448,
"eval_samples_per_second": 92.175,
"eval_steps_per_second": 23.044,
"num_input_tokens_seen": 16515067456,
"step": 63000
},
{
"epoch": 0.30075008645670603,
"grad_norm": 0.20152603089809418,
"learning_rate": 0.0004943901308226771,
"loss": 2.5562,
"num_input_tokens_seen": 16528174656,
"step": 63050
},
{
"epoch": 0.3009885877148002,
"grad_norm": 0.18534454703330994,
"learning_rate": 0.0004887809678520976,
"loss": 2.5559,
"num_input_tokens_seen": 16541281856,
"step": 63100
},
{
"epoch": 0.30122708897289435,
"grad_norm": 0.18770301342010498,
"learning_rate": 0.0004831732172061032,
"loss": 2.5538,
"num_input_tokens_seen": 16554389056,
"step": 63150
},
{
"epoch": 0.30146559023098846,
"grad_norm": 0.19565705955028534,
"learning_rate": 0.0004775675848247427,
"loss": 2.5593,
"num_input_tokens_seen": 16567496256,
"step": 63200
},
{
"epoch": 0.3017040914890826,
"grad_norm": 0.1954822540283203,
"learning_rate": 0.00047196477638140405,
"loss": 2.5694,
"num_input_tokens_seen": 16580603456,
"step": 63250
},
{
"epoch": 0.3019425927471767,
"grad_norm": 0.18120840191841125,
"learning_rate": 0.0004663654971939802,
"loss": 2.5622,
"num_input_tokens_seen": 16593710656,
"step": 63300
},
{
"epoch": 0.3021810940052709,
"grad_norm": 0.18100927770137787,
"learning_rate": 0.0004607704521360776,
"loss": 2.5437,
"num_input_tokens_seen": 16606817856,
"step": 63350
},
{
"epoch": 0.30241959526336504,
"grad_norm": 0.20565176010131836,
"learning_rate": 0.0004551803455482833,
"loss": 2.5463,
"num_input_tokens_seen": 16619925056,
"step": 63400
},
{
"epoch": 0.30265809652145914,
"grad_norm": 0.18989761173725128,
"learning_rate": 0.0004495958811494978,
"loss": 2.5609,
"num_input_tokens_seen": 16633032256,
"step": 63450
},
{
"epoch": 0.3028965977795533,
"grad_norm": 0.1870686262845993,
"learning_rate": 0.0004440177619483461,
"loss": 2.5554,
"num_input_tokens_seen": 16646139456,
"step": 63500
},
{
"epoch": 0.3028965977795533,
"eval_loss": 2.4395649433135986,
"eval_runtime": 53.4665,
"eval_samples_per_second": 93.516,
"eval_steps_per_second": 23.379,
"num_input_tokens_seen": 16646139456,
"step": 63500
},
{
"epoch": 0.3031350990376474,
"grad_norm": 0.1891048699617386,
"learning_rate": 0.00043844669015467863,
"loss": 2.5627,
"num_input_tokens_seen": 16659246656,
"step": 63550
},
{
"epoch": 0.30337360029574156,
"grad_norm": 0.18591411411762238,
"learning_rate": 0.0004328833670911724,
"loss": 2.5545,
"num_input_tokens_seen": 16672353856,
"step": 63600
},
{
"epoch": 0.3036121015538357,
"grad_norm": 0.18640951812267303,
"learning_rate": 0.0004273284931050438,
"loss": 2.5672,
"num_input_tokens_seen": 16685461056,
"step": 63650
},
{
"epoch": 0.3038506028119298,
"grad_norm": 0.1919756680727005,
"learning_rate": 0.0004217827674798845,
"loss": 2.5492,
"num_input_tokens_seen": 16698568256,
"step": 63700
},
{
"epoch": 0.304089104070024,
"grad_norm": 0.18388938903808594,
"learning_rate": 0.00041624688834763184,
"loss": 2.5487,
"num_input_tokens_seen": 16711675456,
"step": 63750
},
{
"epoch": 0.3043276053281181,
"grad_norm": 0.1851562261581421,
"learning_rate": 0.0004107215526006817,
"loss": 2.5539,
"num_input_tokens_seen": 16724782656,
"step": 63800
},
{
"epoch": 0.30456610658621225,
"grad_norm": 0.17315496504306793,
"learning_rate": 0.0004052074558041608,
"loss": 2.5544,
"num_input_tokens_seen": 16737889856,
"step": 63850
},
{
"epoch": 0.30480460784430635,
"grad_norm": 0.17985352873802185,
"learning_rate": 0.00039970529210836363,
"loss": 2.5511,
"num_input_tokens_seen": 16750997056,
"step": 63900
},
{
"epoch": 0.3050431091024005,
"grad_norm": 0.20455212891101837,
"learning_rate": 0.0003942157541613686,
"loss": 2.5593,
"num_input_tokens_seen": 16764104256,
"step": 63950
},
{
"epoch": 0.30528161036049467,
"grad_norm": 0.1965632140636444,
"learning_rate": 0.00038873953302184284,
"loss": 2.5599,
"num_input_tokens_seen": 16777211456,
"step": 64000
},
{
"epoch": 0.30528161036049467,
"eval_loss": 2.437380790710449,
"eval_runtime": 53.2524,
"eval_samples_per_second": 93.893,
"eval_steps_per_second": 23.473,
"num_input_tokens_seen": 16777211456,
"step": 64000
},
{
"epoch": 0.3055201116185888,
"grad_norm": 0.1703004688024521,
"learning_rate": 0.00038327731807204744,
"loss": 2.5506,
"num_input_tokens_seen": 16790318656,
"step": 64050
},
{
"epoch": 0.30575861287668293,
"grad_norm": 0.19769616425037384,
"learning_rate": 0.00037782979693105293,
"loss": 2.542,
"num_input_tokens_seen": 16803425856,
"step": 64100
},
{
"epoch": 0.30599711413477704,
"grad_norm": 0.20674961805343628,
"learning_rate": 0.00037239765536817873,
"loss": 2.539,
"num_input_tokens_seen": 16816533056,
"step": 64150
},
{
"epoch": 0.3062356153928712,
"grad_norm": 0.19121839106082916,
"learning_rate": 0.0003669815772166625,
"loss": 2.5573,
"num_input_tokens_seen": 16829640256,
"step": 64200
},
{
"epoch": 0.30647411665096536,
"grad_norm": 0.1734025925397873,
"learning_rate": 0.00036158224428757535,
"loss": 2.5416,
"num_input_tokens_seen": 16842747456,
"step": 64250
},
{
"epoch": 0.30671261790905946,
"grad_norm": 0.1857634037733078,
"learning_rate": 0.0003562003362839914,
"loss": 2.5652,
"num_input_tokens_seen": 16855854656,
"step": 64300
},
{
"epoch": 0.3069511191671536,
"grad_norm": 0.17733143270015717,
"learning_rate": 0.000350836530715422,
"loss": 2.5299,
"num_input_tokens_seen": 16868961856,
"step": 64350
},
{
"epoch": 0.3071896204252477,
"grad_norm": 0.18323005735874176,
"learning_rate": 0.00034549150281252633,
"loss": 2.5691,
"num_input_tokens_seen": 16882069056,
"step": 64400
},
{
"epoch": 0.3074281216833419,
"grad_norm": 0.18570365011692047,
"learning_rate": 0.00034016592544210936,
"loss": 2.5436,
"num_input_tokens_seen": 16895176256,
"step": 64450
},
{
"epoch": 0.30766662294143604,
"grad_norm": 0.18571798503398895,
"learning_rate": 0.00033486046902241664,
"loss": 2.5382,
"num_input_tokens_seen": 16908283456,
"step": 64500
},
{
"epoch": 0.30766662294143604,
"eval_loss": 2.4323015213012695,
"eval_runtime": 53.7237,
"eval_samples_per_second": 93.069,
"eval_steps_per_second": 23.267,
"num_input_tokens_seen": 16908283456,
"step": 64500
},
{
"epoch": 0.30790512419953014,
"grad_norm": 0.1829528957605362,
"learning_rate": 0.0003295758014387375,
"loss": 2.5453,
"num_input_tokens_seen": 16921390656,
"step": 64550
},
{
"epoch": 0.3081436254576243,
"grad_norm": 0.1703086644411087,
"learning_rate": 0.0003243125879593286,
"loss": 2.5441,
"num_input_tokens_seen": 16934497856,
"step": 64600
},
{
"epoch": 0.3083821267157184,
"grad_norm": 0.17826180160045624,
"learning_rate": 0.000319071491151664,
"loss": 2.545,
"num_input_tokens_seen": 16947605056,
"step": 64650
},
{
"epoch": 0.30862062797381257,
"grad_norm": 0.17889030277729034,
"learning_rate": 0.00031385317079902743,
"loss": 2.5405,
"num_input_tokens_seen": 16960712256,
"step": 64700
},
{
"epoch": 0.30885912923190667,
"grad_norm": 0.1711336225271225,
"learning_rate": 0.0003086582838174551,
"loss": 2.5222,
"num_input_tokens_seen": 16973819456,
"step": 64750
},
{
"epoch": 0.30909763049000083,
"grad_norm": 0.17962214350700378,
"learning_rate": 0.0003034874841730382,
"loss": 2.5376,
"num_input_tokens_seen": 16986926656,
"step": 64800
},
{
"epoch": 0.309336131748095,
"grad_norm": 0.1699627935886383,
"learning_rate": 0.0002983414227995975,
"loss": 2.5616,
"num_input_tokens_seen": 17000033856,
"step": 64850
},
{
"epoch": 0.3095746330061891,
"grad_norm": 0.18442535400390625,
"learning_rate": 0.00029322074751673977,
"loss": 2.5377,
"num_input_tokens_seen": 17013141056,
"step": 64900
},
{
"epoch": 0.30981313426428325,
"grad_norm": 0.17972196638584137,
"learning_rate": 0.0002881261029483057,
"loss": 2.5474,
"num_input_tokens_seen": 17026248256,
"step": 64950
},
{
"epoch": 0.31005163552237736,
"grad_norm": 0.1810217946767807,
"learning_rate": 0.00028305813044122096,
"loss": 2.5286,
"num_input_tokens_seen": 17039355456,
"step": 65000
},
{
"epoch": 0.31005163552237736,
"eval_loss": 2.4292306900024414,
"eval_runtime": 53.3956,
"eval_samples_per_second": 93.641,
"eval_steps_per_second": 23.41,
"num_input_tokens_seen": 17039355456,
"step": 65000
}
],
"logging_steps": 50,
"max_steps": 70000,
"num_input_tokens_seen": 17039355456,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.5581938885892506e+18,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}