{
"best_global_step": 180,
"best_metric": 0.22945284843444824,
"best_model_checkpoint": "saves_multiple/p-tuning/llama-3-8b-instruct/train_copa_789_1760637874/checkpoint-180",
"epoch": 20.0,
"eval_steps": 90,
"global_step": 1800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05555555555555555,
"grad_norm": 215.8870849609375,
"learning_rate": 2.2222222222222223e-05,
"loss": 4.687,
"num_input_tokens_seen": 1632,
"step": 5
},
{
"epoch": 0.1111111111111111,
"grad_norm": 22.133590698242188,
"learning_rate": 5e-05,
"loss": 1.6103,
"num_input_tokens_seen": 3232,
"step": 10
},
{
"epoch": 0.16666666666666666,
"grad_norm": 7.232245922088623,
"learning_rate": 7.777777777777778e-05,
"loss": 0.3703,
"num_input_tokens_seen": 4832,
"step": 15
},
{
"epoch": 0.2222222222222222,
"grad_norm": 1.963056206703186,
"learning_rate": 0.00010555555555555555,
"loss": 0.2378,
"num_input_tokens_seen": 6432,
"step": 20
},
{
"epoch": 0.2777777777777778,
"grad_norm": 6.656666278839111,
"learning_rate": 0.00013333333333333334,
"loss": 0.2584,
"num_input_tokens_seen": 7968,
"step": 25
},
{
"epoch": 0.3333333333333333,
"grad_norm": 2.5032241344451904,
"learning_rate": 0.0001611111111111111,
"loss": 0.5879,
"num_input_tokens_seen": 9504,
"step": 30
},
{
"epoch": 0.3888888888888889,
"grad_norm": 3.4075732231140137,
"learning_rate": 0.00018888888888888888,
"loss": 0.28,
"num_input_tokens_seen": 11104,
"step": 35
},
{
"epoch": 0.4444444444444444,
"grad_norm": 1.5475021600723267,
"learning_rate": 0.00021666666666666668,
"loss": 0.2908,
"num_input_tokens_seen": 12704,
"step": 40
},
{
"epoch": 0.5,
"grad_norm": 7.280683517456055,
"learning_rate": 0.00024444444444444443,
"loss": 0.2511,
"num_input_tokens_seen": 14240,
"step": 45
},
{
"epoch": 0.5555555555555556,
"grad_norm": 0.17710517346858978,
"learning_rate": 0.0002722222222222222,
"loss": 0.2512,
"num_input_tokens_seen": 15808,
"step": 50
},
{
"epoch": 0.6111111111111112,
"grad_norm": 0.8657889366149902,
"learning_rate": 0.0003,
"loss": 0.4036,
"num_input_tokens_seen": 17344,
"step": 55
},
{
"epoch": 0.6666666666666666,
"grad_norm": 1.1960246562957764,
"learning_rate": 0.0003277777777777778,
"loss": 0.2508,
"num_input_tokens_seen": 18912,
"step": 60
},
{
"epoch": 0.7222222222222222,
"grad_norm": 0.656658411026001,
"learning_rate": 0.00035555555555555557,
"loss": 0.2354,
"num_input_tokens_seen": 20448,
"step": 65
},
{
"epoch": 0.7777777777777778,
"grad_norm": 1.5388158559799194,
"learning_rate": 0.00038333333333333334,
"loss": 0.2819,
"num_input_tokens_seen": 21984,
"step": 70
},
{
"epoch": 0.8333333333333334,
"grad_norm": 4.028906345367432,
"learning_rate": 0.0004111111111111111,
"loss": 0.2688,
"num_input_tokens_seen": 23552,
"step": 75
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.4602603018283844,
"learning_rate": 0.0004388888888888889,
"loss": 0.2861,
"num_input_tokens_seen": 25120,
"step": 80
},
{
"epoch": 0.9444444444444444,
"grad_norm": 2.0211222171783447,
"learning_rate": 0.00046666666666666666,
"loss": 0.2392,
"num_input_tokens_seen": 26656,
"step": 85
},
{
"epoch": 1.0,
"grad_norm": 0.11489757150411606,
"learning_rate": 0.0004944444444444445,
"loss": 0.245,
"num_input_tokens_seen": 28192,
"step": 90
},
{
"epoch": 1.0,
"eval_loss": 0.2327098846435547,
"eval_runtime": 1.1469,
"eval_samples_per_second": 34.877,
"eval_steps_per_second": 8.719,
"num_input_tokens_seen": 28192,
"step": 90
},
{
"epoch": 1.0555555555555556,
"grad_norm": 0.2278575897216797,
"learning_rate": 0.0005222222222222223,
"loss": 0.235,
"num_input_tokens_seen": 29792,
"step": 95
},
{
"epoch": 1.1111111111111112,
"grad_norm": 0.25667938590049744,
"learning_rate": 0.00055,
"loss": 0.2347,
"num_input_tokens_seen": 31328,
"step": 100
},
{
"epoch": 1.1666666666666667,
"grad_norm": 0.09546233713626862,
"learning_rate": 0.0005777777777777778,
"loss": 0.2524,
"num_input_tokens_seen": 32832,
"step": 105
},
{
"epoch": 1.2222222222222223,
"grad_norm": 0.16348819434642792,
"learning_rate": 0.0006055555555555556,
"loss": 0.2344,
"num_input_tokens_seen": 34304,
"step": 110
},
{
"epoch": 1.2777777777777777,
"grad_norm": 0.1358136087656021,
"learning_rate": 0.0006333333333333333,
"loss": 0.2314,
"num_input_tokens_seen": 35840,
"step": 115
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.12788349390029907,
"learning_rate": 0.0006611111111111111,
"loss": 0.2244,
"num_input_tokens_seen": 37376,
"step": 120
},
{
"epoch": 1.3888888888888888,
"grad_norm": 0.0501859076321125,
"learning_rate": 0.000688888888888889,
"loss": 0.2462,
"num_input_tokens_seen": 38944,
"step": 125
},
{
"epoch": 1.4444444444444444,
"grad_norm": 0.26624178886413574,
"learning_rate": 0.0007166666666666667,
"loss": 0.2241,
"num_input_tokens_seen": 40512,
"step": 130
},
{
"epoch": 1.5,
"grad_norm": 0.037180013954639435,
"learning_rate": 0.0007444444444444445,
"loss": 0.2453,
"num_input_tokens_seen": 42080,
"step": 135
},
{
"epoch": 1.5555555555555556,
"grad_norm": 0.10765981674194336,
"learning_rate": 0.0007722222222222223,
"loss": 0.2264,
"num_input_tokens_seen": 43680,
"step": 140
},
{
"epoch": 1.6111111111111112,
"grad_norm": 0.12236373126506805,
"learning_rate": 0.0008,
"loss": 0.2341,
"num_input_tokens_seen": 45216,
"step": 145
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.17313969135284424,
"learning_rate": 0.0008277777777777778,
"loss": 0.2343,
"num_input_tokens_seen": 46720,
"step": 150
},
{
"epoch": 1.7222222222222223,
"grad_norm": 0.9564811587333679,
"learning_rate": 0.0008555555555555556,
"loss": 0.2399,
"num_input_tokens_seen": 48288,
"step": 155
},
{
"epoch": 1.7777777777777777,
"grad_norm": 0.09225412458181381,
"learning_rate": 0.0008833333333333333,
"loss": 0.2303,
"num_input_tokens_seen": 49888,
"step": 160
},
{
"epoch": 1.8333333333333335,
"grad_norm": 0.09442174434661865,
"learning_rate": 0.0009111111111111111,
"loss": 0.2393,
"num_input_tokens_seen": 51424,
"step": 165
},
{
"epoch": 1.8888888888888888,
"grad_norm": 0.056451309472322464,
"learning_rate": 0.000938888888888889,
"loss": 0.2384,
"num_input_tokens_seen": 52992,
"step": 170
},
{
"epoch": 1.9444444444444444,
"grad_norm": 0.18257667124271393,
"learning_rate": 0.0009666666666666667,
"loss": 0.228,
"num_input_tokens_seen": 54592,
"step": 175
},
{
"epoch": 2.0,
"grad_norm": 0.30549758672714233,
"learning_rate": 0.0009944444444444445,
"loss": 0.2443,
"num_input_tokens_seen": 56192,
"step": 180
},
{
"epoch": 2.0,
"eval_loss": 0.22945284843444824,
"eval_runtime": 0.8098,
"eval_samples_per_second": 49.394,
"eval_steps_per_second": 12.349,
"num_input_tokens_seen": 56192,
"step": 180
},
{
"epoch": 2.0555555555555554,
"grad_norm": 0.19198457896709442,
"learning_rate": 0.000999984957239884,
"loss": 0.2365,
"num_input_tokens_seen": 57760,
"step": 185
},
{
"epoch": 2.111111111111111,
"grad_norm": 4.2871012687683105,
"learning_rate": 0.0009999238475781956,
"loss": 0.2684,
"num_input_tokens_seen": 59264,
"step": 190
},
{
"epoch": 2.1666666666666665,
"grad_norm": 1.6800514459609985,
"learning_rate": 0.000999815736583355,
"loss": 0.2517,
"num_input_tokens_seen": 60768,
"step": 195
},
{
"epoch": 2.2222222222222223,
"grad_norm": 0.0673963725566864,
"learning_rate": 0.000999660634419631,
"loss": 0.2444,
"num_input_tokens_seen": 62272,
"step": 200
},
{
"epoch": 2.2777777777777777,
"grad_norm": 0.046518296003341675,
"learning_rate": 0.0009994585556692623,
"loss": 0.2327,
"num_input_tokens_seen": 63808,
"step": 205
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.05228637158870697,
"learning_rate": 0.0009992095193310836,
"loss": 0.2406,
"num_input_tokens_seen": 65344,
"step": 210
},
{
"epoch": 2.388888888888889,
"grad_norm": 0.21179161965847015,
"learning_rate": 0.0009989135488187406,
"loss": 0.2357,
"num_input_tokens_seen": 66912,
"step": 215
},
{
"epoch": 2.4444444444444446,
"grad_norm": 0.337697297334671,
"learning_rate": 0.0009985706719584887,
"loss": 0.2345,
"num_input_tokens_seen": 68480,
"step": 220
},
{
"epoch": 2.5,
"grad_norm": 0.12051483988761902,
"learning_rate": 0.000998180920986577,
"loss": 0.2317,
"num_input_tokens_seen": 70048,
"step": 225
},
{
"epoch": 2.5555555555555554,
"grad_norm": 0.026494326069951057,
"learning_rate": 0.0009977443325462165,
"loss": 0.2303,
"num_input_tokens_seen": 71680,
"step": 230
},
{
"epoch": 2.611111111111111,
"grad_norm": 0.02274704910814762,
"learning_rate": 0.0009972609476841367,
"loss": 0.2404,
"num_input_tokens_seen": 73280,
"step": 235
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.12976308166980743,
"learning_rate": 0.0009967308118467252,
"loss": 0.2217,
"num_input_tokens_seen": 74752,
"step": 240
},
{
"epoch": 2.7222222222222223,
"grad_norm": 0.08644629269838333,
"learning_rate": 0.0009961539748757548,
"loss": 0.2598,
"num_input_tokens_seen": 76288,
"step": 245
},
{
"epoch": 2.7777777777777777,
"grad_norm": 0.0433812253177166,
"learning_rate": 0.0009955304910036994,
"loss": 0.2304,
"num_input_tokens_seen": 77888,
"step": 250
},
{
"epoch": 2.8333333333333335,
"grad_norm": 0.020187288522720337,
"learning_rate": 0.0009948604188486328,
"loss": 0.2377,
"num_input_tokens_seen": 79456,
"step": 255
},
{
"epoch": 2.888888888888889,
"grad_norm": 0.023410074412822723,
"learning_rate": 0.000994143821408719,
"loss": 0.2363,
"num_input_tokens_seen": 81088,
"step": 260
},
{
"epoch": 2.9444444444444446,
"grad_norm": 0.029354019090533257,
"learning_rate": 0.0009933807660562897,
"loss": 0.2357,
"num_input_tokens_seen": 82592,
"step": 265
},
{
"epoch": 3.0,
"grad_norm": 0.01759847067296505,
"learning_rate": 0.0009925713245315083,
"loss": 0.2401,
"num_input_tokens_seen": 84192,
"step": 270
},
{
"epoch": 3.0,
"eval_loss": 0.23134836554527283,
"eval_runtime": 0.813,
"eval_samples_per_second": 49.203,
"eval_steps_per_second": 12.301,
"num_input_tokens_seen": 84192,
"step": 270
},
{
"epoch": 3.0555555555555554,
"grad_norm": 0.06405995786190033,
"learning_rate": 0.0009917155729356273,
"loss": 0.2304,
"num_input_tokens_seen": 85696,
"step": 275
},
{
"epoch": 3.111111111111111,
"grad_norm": 0.0503934845328331,
"learning_rate": 0.000990813591723832,
"loss": 0.2375,
"num_input_tokens_seen": 87296,
"step": 280
},
{
"epoch": 3.1666666666666665,
"grad_norm": 0.016397392377257347,
"learning_rate": 0.000989865465697677,
"loss": 0.2318,
"num_input_tokens_seen": 88832,
"step": 285
},
{
"epoch": 3.2222222222222223,
"grad_norm": 0.06879417598247528,
"learning_rate": 0.0009888712839971133,
"loss": 0.2166,
"num_input_tokens_seen": 90368,
"step": 290
},
{
"epoch": 3.2777777777777777,
"grad_norm": 0.1464420109987259,
"learning_rate": 0.0009878311400921072,
"loss": 0.2354,
"num_input_tokens_seen": 91968,
"step": 295
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.40577006340026855,
"learning_rate": 0.0009867451317738534,
"loss": 0.2449,
"num_input_tokens_seen": 93568,
"step": 300
},
{
"epoch": 3.388888888888889,
"grad_norm": 0.03198301047086716,
"learning_rate": 0.0009856133611455802,
"loss": 0.2378,
"num_input_tokens_seen": 95168,
"step": 305
},
{
"epoch": 3.4444444444444446,
"grad_norm": 0.08572801202535629,
"learning_rate": 0.0009844359346129503,
"loss": 0.2425,
"num_input_tokens_seen": 96704,
"step": 310
},
{
"epoch": 3.5,
"grad_norm": 0.02046571485698223,
"learning_rate": 0.0009832129628740574,
"loss": 0.2357,
"num_input_tokens_seen": 98304,
"step": 315
},
{
"epoch": 3.5555555555555554,
"grad_norm": 0.06534157693386078,
"learning_rate": 0.0009819445609090174,
"loss": 0.2368,
"num_input_tokens_seen": 99840,
"step": 320
},
{
"epoch": 3.611111111111111,
"grad_norm": 0.012981094419956207,
"learning_rate": 0.0009806308479691594,
"loss": 0.2315,
"num_input_tokens_seen": 101376,
"step": 325
},
{
"epoch": 3.6666666666666665,
"grad_norm": 0.01108462642878294,
"learning_rate": 0.0009792719475658143,
"loss": 0.2294,
"num_input_tokens_seen": 102976,
"step": 330
},
{
"epoch": 3.7222222222222223,
"grad_norm": 0.090813547372818,
"learning_rate": 0.0009778679874587015,
"loss": 0.2325,
"num_input_tokens_seen": 104576,
"step": 335
},
{
"epoch": 3.7777777777777777,
"grad_norm": 0.04561146721243858,
"learning_rate": 0.0009764190996439181,
"loss": 0.2304,
"num_input_tokens_seen": 106112,
"step": 340
},
{
"epoch": 3.8333333333333335,
"grad_norm": 0.01101192831993103,
"learning_rate": 0.0009749254203415288,
"loss": 0.2314,
"num_input_tokens_seen": 107712,
"step": 345
},
{
"epoch": 3.888888888888889,
"grad_norm": 0.05584251880645752,
"learning_rate": 0.000973387089982759,
"loss": 0.234,
"num_input_tokens_seen": 109344,
"step": 350
},
{
"epoch": 3.9444444444444446,
"grad_norm": 0.09156426787376404,
"learning_rate": 0.0009718042531967918,
"loss": 0.2401,
"num_input_tokens_seen": 110944,
"step": 355
},
{
"epoch": 4.0,
"grad_norm": 0.010413792915642262,
"learning_rate": 0.0009701770587971706,
"loss": 0.2337,
"num_input_tokens_seen": 112544,
"step": 360
},
{
"epoch": 4.0,
"eval_loss": 0.23112532496452332,
"eval_runtime": 0.8133,
"eval_samples_per_second": 49.181,
"eval_steps_per_second": 12.295,
"num_input_tokens_seen": 112544,
"step": 360
},
{
"epoch": 4.055555555555555,
"grad_norm": 0.05406933277845383,
"learning_rate": 0.0009685056597678075,
"loss": 0.2357,
"num_input_tokens_seen": 114144,
"step": 365
},
{
"epoch": 4.111111111111111,
"grad_norm": 0.11962758749723434,
"learning_rate": 0.0009667902132486009,
"loss": 0.2285,
"num_input_tokens_seen": 115712,
"step": 370
},
{
"epoch": 4.166666666666667,
"grad_norm": 0.07900924980640411,
"learning_rate": 0.0009650308805206616,
"loss": 0.2343,
"num_input_tokens_seen": 117312,
"step": 375
},
{
"epoch": 4.222222222222222,
"grad_norm": 0.06815142184495926,
"learning_rate": 0.0009632278269911492,
"loss": 0.2245,
"num_input_tokens_seen": 118848,
"step": 380
},
{
"epoch": 4.277777777777778,
"grad_norm": 0.0701470896601677,
"learning_rate": 0.0009613812221777212,
"loss": 0.2225,
"num_input_tokens_seen": 120416,
"step": 385
},
{
"epoch": 4.333333333333333,
"grad_norm": 0.022089658305048943,
"learning_rate": 0.0009594912396925958,
"loss": 0.2649,
"num_input_tokens_seen": 122016,
"step": 390
},
{
"epoch": 4.388888888888889,
"grad_norm": 0.04512215778231621,
"learning_rate": 0.0009575580572262289,
"loss": 0.2293,
"num_input_tokens_seen": 123584,
"step": 395
},
{
"epoch": 4.444444444444445,
"grad_norm": 0.013591914437711239,
"learning_rate": 0.0009555818565306084,
"loss": 0.2338,
"num_input_tokens_seen": 125216,
"step": 400
},
{
"epoch": 4.5,
"grad_norm": 0.08284556865692139,
"learning_rate": 0.0009535628234021669,
"loss": 0.2345,
"num_input_tokens_seen": 126784,
"step": 405
},
{
"epoch": 4.555555555555555,
"grad_norm": 0.08072531223297119,
"learning_rate": 0.0009515011476643126,
"loss": 0.2291,
"num_input_tokens_seen": 128352,
"step": 410
},
{
"epoch": 4.611111111111111,
"grad_norm": 0.057940974831581116,
"learning_rate": 0.0009493970231495835,
"loss": 0.2256,
"num_input_tokens_seen": 129920,
"step": 415
},
{
"epoch": 4.666666666666667,
"grad_norm": 0.10934175550937653,
"learning_rate": 0.0009472506476814238,
"loss": 0.2505,
"num_input_tokens_seen": 131488,
"step": 420
},
{
"epoch": 4.722222222222222,
"grad_norm": 0.04054225981235504,
"learning_rate": 0.0009450622230555847,
"loss": 0.2353,
"num_input_tokens_seen": 133056,
"step": 425
},
{
"epoch": 4.777777777777778,
"grad_norm": 0.03740108013153076,
"learning_rate": 0.0009428319550211531,
"loss": 0.231,
"num_input_tokens_seen": 134624,
"step": 430
},
{
"epoch": 4.833333333333333,
"grad_norm": 0.07108583301305771,
"learning_rate": 0.000940560053261206,
"loss": 0.2312,
"num_input_tokens_seen": 136192,
"step": 435
},
{
"epoch": 4.888888888888889,
"grad_norm": 0.033905934542417526,
"learning_rate": 0.0009382467313730985,
"loss": 0.2328,
"num_input_tokens_seen": 137824,
"step": 440
},
{
"epoch": 4.944444444444445,
"grad_norm": 0.07359946519136429,
"learning_rate": 0.0009358922068483812,
"loss": 0.2322,
"num_input_tokens_seen": 139392,
"step": 445
},
{
"epoch": 5.0,
"grad_norm": 0.07416823506355286,
"learning_rate": 0.0009334967010523523,
"loss": 0.2353,
"num_input_tokens_seen": 140960,
"step": 450
},
{
"epoch": 5.0,
"eval_loss": 0.233585923910141,
"eval_runtime": 0.8135,
"eval_samples_per_second": 49.169,
"eval_steps_per_second": 12.292,
"num_input_tokens_seen": 140960,
"step": 450
},
{
"epoch": 5.055555555555555,
"grad_norm": 0.037136659026145935,
"learning_rate": 0.0009310604392032455,
"loss": 0.2314,
"num_input_tokens_seen": 142560,
"step": 455
},
{
"epoch": 5.111111111111111,
"grad_norm": 0.008749793283641338,
"learning_rate": 0.0009285836503510562,
"loss": 0.2301,
"num_input_tokens_seen": 144096,
"step": 460
},
{
"epoch": 5.166666666666667,
"grad_norm": 0.04284720867872238,
"learning_rate": 0.0009260665673560057,
"loss": 0.2352,
"num_input_tokens_seen": 145632,
"step": 465
},
{
"epoch": 5.222222222222222,
"grad_norm": 0.03149145096540451,
"learning_rate": 0.0009235094268666498,
"loss": 0.2297,
"num_input_tokens_seen": 147232,
"step": 470
},
{
"epoch": 5.277777777777778,
"grad_norm": 0.07827319204807281,
"learning_rate": 0.0009209124692976287,
"loss": 0.2324,
"num_input_tokens_seen": 148864,
"step": 475
},
{
"epoch": 5.333333333333333,
"grad_norm": 0.008484285324811935,
"learning_rate": 0.0009182759388070649,
"loss": 0.2366,
"num_input_tokens_seen": 150432,
"step": 480
},
{
"epoch": 5.388888888888889,
"grad_norm": 0.037567220628261566,
"learning_rate": 0.0009156000832736073,
"loss": 0.2271,
"num_input_tokens_seen": 152032,
"step": 485
},
{
"epoch": 5.444444444444445,
"grad_norm": 0.04004412144422531,
"learning_rate": 0.0009128851542731271,
"loss": 0.2315,
"num_input_tokens_seen": 153600,
"step": 490
},
{
"epoch": 5.5,
"grad_norm": 0.04386899247765541,
"learning_rate": 0.0009101314070550646,
"loss": 0.2342,
"num_input_tokens_seen": 155200,
"step": 495
},
{
"epoch": 5.555555555555555,
"grad_norm": 0.009182831272482872,
"learning_rate": 0.0009073391005184324,
"loss": 0.2372,
"num_input_tokens_seen": 156800,
"step": 500
},
{
"epoch": 5.611111111111111,
"grad_norm": 0.06390406936407089,
"learning_rate": 0.0009045084971874737,
"loss": 0.229,
"num_input_tokens_seen": 158336,
"step": 505
},
{
"epoch": 5.666666666666667,
"grad_norm": 0.007993469946086407,
"learning_rate": 0.0009016398631869811,
"loss": 0.2347,
"num_input_tokens_seen": 159904,
"step": 510
},
{
"epoch": 5.722222222222222,
"grad_norm": 0.03192123398184776,
"learning_rate": 0.0008987334682172759,
"loss": 0.2299,
"num_input_tokens_seen": 161504,
"step": 515
},
{
"epoch": 5.777777777777778,
"grad_norm": 0.03337305784225464,
"learning_rate": 0.0008957895855288517,
"loss": 0.2309,
"num_input_tokens_seen": 163008,
"step": 520
},
{
"epoch": 5.833333333333333,
"grad_norm": 0.007557485718280077,
"learning_rate": 0.000892808491896685,
"loss": 0.2309,
"num_input_tokens_seen": 164512,
"step": 525
},
{
"epoch": 5.888888888888889,
"grad_norm": 0.008209764957427979,
"learning_rate": 0.0008897904675942128,
"loss": 0.2319,
"num_input_tokens_seen": 166080,
"step": 530
},
{
"epoch": 5.944444444444445,
"grad_norm": 0.007890612818300724,
"learning_rate": 0.000886735796366982,
"loss": 0.2314,
"num_input_tokens_seen": 167680,
"step": 535
},
{
"epoch": 6.0,
"grad_norm": 0.0075538670644164085,
"learning_rate": 0.0008836447654059734,
"loss": 0.2335,
"num_input_tokens_seen": 169216,
"step": 540
},
{
"epoch": 6.0,
"eval_loss": 0.23072807490825653,
"eval_runtime": 0.8144,
"eval_samples_per_second": 49.116,
"eval_steps_per_second": 12.279,
"num_input_tokens_seen": 169216,
"step": 540
},
{
"epoch": 6.055555555555555,
"grad_norm": 0.008767198771238327,
"learning_rate": 0.0008805176653206003,
"loss": 0.2326,
"num_input_tokens_seen": 170784,
"step": 545
},
{
"epoch": 6.111111111111111,
"grad_norm": 0.032195769250392914,
"learning_rate": 0.000877354790111386,
"loss": 0.2273,
"num_input_tokens_seen": 172352,
"step": 550
},
{
"epoch": 6.166666666666667,
"grad_norm": 0.008529037237167358,
"learning_rate": 0.0008741564371423235,
"loss": 0.2227,
"num_input_tokens_seen": 173920,
"step": 555
},
{
"epoch": 6.222222222222222,
"grad_norm": 0.059331003576517105,
"learning_rate": 0.0008709229071129177,
"loss": 0.2277,
"num_input_tokens_seen": 175488,
"step": 560
},
{
"epoch": 6.277777777777778,
"grad_norm": 0.06853003799915314,
"learning_rate": 0.0008676545040299144,
"loss": 0.2339,
"num_input_tokens_seen": 177024,
"step": 565
},
{
"epoch": 6.333333333333333,
"grad_norm": 0.008488861843943596,
"learning_rate": 0.0008643515351787192,
"loss": 0.2281,
"num_input_tokens_seen": 178624,
"step": 570
},
{
"epoch": 6.388888888888889,
"grad_norm": 0.03643598034977913,
"learning_rate": 0.0008610143110945068,
"loss": 0.2325,
"num_input_tokens_seen": 180096,
"step": 575
},
{
"epoch": 6.444444444444445,
"grad_norm": 0.06390345096588135,
"learning_rate": 0.0008576431455330258,
"loss": 0.2229,
"num_input_tokens_seen": 181600,
"step": 580
},
{
"epoch": 6.5,
"grad_norm": 0.01061676349490881,
"learning_rate": 0.0008542383554411,
"loss": 0.2242,
"num_input_tokens_seen": 183232,
"step": 585
},
{
"epoch": 6.555555555555555,
"grad_norm": 0.03071434609591961,
"learning_rate": 0.0008508002609268301,
"loss": 0.2356,
"num_input_tokens_seen": 184768,
"step": 590
},
{
"epoch": 6.611111111111111,
"grad_norm": 0.04599233716726303,
"learning_rate": 0.0008473291852294987,
"loss": 0.2313,
"num_input_tokens_seen": 186208,
"step": 595
},
{
"epoch": 6.666666666666667,
"grad_norm": 0.028627226129174232,
"learning_rate": 0.0008438254546891792,
"loss": 0.2385,
"num_input_tokens_seen": 187776,
"step": 600
},
{
"epoch": 6.722222222222222,
"grad_norm": 0.012562990188598633,
"learning_rate": 0.0008402893987160552,
"loss": 0.2202,
"num_input_tokens_seen": 189344,
"step": 605
},
{
"epoch": 6.777777777777778,
"grad_norm": 0.012583953328430653,
"learning_rate": 0.0008367213497594501,
"loss": 0.2342,
"num_input_tokens_seen": 190912,
"step": 610
},
{
"epoch": 6.833333333333333,
"grad_norm": 0.028570545837283134,
"learning_rate": 0.0008331216432765713,
"loss": 0.241,
"num_input_tokens_seen": 192512,
"step": 615
},
{
"epoch": 6.888888888888889,
"grad_norm": 0.03644197806715965,
"learning_rate": 0.0008294906177009707,
"loss": 0.2338,
"num_input_tokens_seen": 194112,
"step": 620
},
{
"epoch": 6.944444444444445,
"grad_norm": 0.010511292144656181,
"learning_rate": 0.0008258286144107276,
"loss": 0.2335,
"num_input_tokens_seen": 195712,
"step": 625
},
{
"epoch": 7.0,
"grad_norm": 0.03820033371448517,
"learning_rate": 0.0008221359776963525,
"loss": 0.2306,
"num_input_tokens_seen": 197248,
"step": 630
},
{
"epoch": 7.0,
"eval_loss": 0.23276683688163757,
"eval_runtime": 0.8141,
"eval_samples_per_second": 49.134,
"eval_steps_per_second": 12.283,
"num_input_tokens_seen": 197248,
"step": 630
},
{
"epoch": 7.055555555555555,
"grad_norm": 0.010046997107565403,
"learning_rate": 0.000818413054728418,
"loss": 0.2401,
"num_input_tokens_seen": 198784,
"step": 635
},
{
"epoch": 7.111111111111111,
"grad_norm": 0.03139125183224678,
"learning_rate": 0.0008146601955249188,
"loss": 0.2324,
"num_input_tokens_seen": 200352,
"step": 640
},
{
"epoch": 7.166666666666667,
"grad_norm": 0.008107461966574192,
"learning_rate": 0.0008108777529183644,
"loss": 0.2297,
"num_input_tokens_seen": 201856,
"step": 645
},
{
"epoch": 7.222222222222222,
"grad_norm": 0.012090813368558884,
"learning_rate": 0.000807066082522607,
"loss": 0.2299,
"num_input_tokens_seen": 203488,
"step": 650
},
{
"epoch": 7.277777777777778,
"grad_norm": 0.033598825335502625,
"learning_rate": 0.0008032255426994069,
"loss": 0.2278,
"num_input_tokens_seen": 205056,
"step": 655
},
{
"epoch": 7.333333333333333,
"grad_norm": 0.009637514129281044,
"learning_rate": 0.0007993564945247409,
"loss": 0.2163,
"num_input_tokens_seen": 206656,
"step": 660
},
{
"epoch": 7.388888888888889,
"grad_norm": 0.035691387951374054,
"learning_rate": 0.0007954593017548556,
"loss": 0.2323,
"num_input_tokens_seen": 208288,
"step": 665
},
{
"epoch": 7.444444444444445,
"grad_norm": 0.03339530527591705,
"learning_rate": 0.0007915343307920673,
"loss": 0.2278,
"num_input_tokens_seen": 209856,
"step": 670
},
{
"epoch": 7.5,
"grad_norm": 0.05436629429459572,
"learning_rate": 0.0007875819506503144,
"loss": 0.2611,
"num_input_tokens_seen": 211424,
"step": 675
},
{
"epoch": 7.555555555555555,
"grad_norm": 0.028627492487430573,
"learning_rate": 0.0007836025329204635,
"loss": 0.2402,
"num_input_tokens_seen": 212928,
"step": 680
},
{
"epoch": 7.611111111111111,
"grad_norm": 0.009143758565187454,
"learning_rate": 0.0007795964517353734,
"loss": 0.2251,
"num_input_tokens_seen": 214496,
"step": 685
},
{
"epoch": 7.666666666666667,
"grad_norm": 0.0279055368155241,
"learning_rate": 0.0007755640837347215,
"loss": 0.227,
"num_input_tokens_seen": 216064,
"step": 690
},
{
"epoch": 7.722222222222222,
"grad_norm": 0.008520636707544327,
"learning_rate": 0.0007715058080295917,
"loss": 0.2359,
"num_input_tokens_seen": 217600,
"step": 695
},
{
"epoch": 7.777777777777778,
"grad_norm": 0.007954890839755535,
"learning_rate": 0.0007674220061668323,
"loss": 0.2393,
"num_input_tokens_seen": 219136,
"step": 700
},
{
"epoch": 7.833333333333333,
"grad_norm": 0.0061981771141290665,
"learning_rate": 0.0007633130620931837,
"loss": 0.2368,
"num_input_tokens_seen": 220704,
"step": 705
},
{
"epoch": 7.888888888888889,
"grad_norm": 0.008587339892983437,
"learning_rate": 0.0007591793621191819,
"loss": 0.2299,
"num_input_tokens_seen": 222272,
"step": 710
},
{
"epoch": 7.944444444444445,
"grad_norm": 0.014931959100067616,
"learning_rate": 0.0007550212948828377,
"loss": 0.2227,
"num_input_tokens_seen": 223872,
"step": 715
},
{
"epoch": 8.0,
"grad_norm": 0.03580380976200104,
"learning_rate": 0.0007508392513130979,
"loss": 0.2366,
"num_input_tokens_seen": 225440,
"step": 720
},
{
"epoch": 8.0,
"eval_loss": 0.23274831473827362,
"eval_runtime": 0.8134,
"eval_samples_per_second": 49.175,
"eval_steps_per_second": 12.294,
"num_input_tokens_seen": 225440,
"step": 720
},
{
"epoch": 8.055555555555555,
"grad_norm": 0.016410276293754578,
"learning_rate": 0.0007466336245930927,
"loss": 0.2335,
"num_input_tokens_seen": 226912,
"step": 725
},
{
"epoch": 8.11111111111111,
"grad_norm": 0.010216175578534603,
"learning_rate": 0.0007424048101231686,
"loss": 0.2378,
"num_input_tokens_seen": 228512,
"step": 730
},
{
"epoch": 8.166666666666666,
"grad_norm": 0.026450112462043762,
"learning_rate": 0.0007381532054837144,
"loss": 0.2351,
"num_input_tokens_seen": 230080,
"step": 735
},
{
"epoch": 8.222222222222221,
"grad_norm": 0.06160770729184151,
"learning_rate": 0.0007338792103977821,
"loss": 0.2347,
"num_input_tokens_seen": 231616,
"step": 740
},
{
"epoch": 8.277777777777779,
"grad_norm": 0.029110893607139587,
"learning_rate": 0.0007295832266935059,
"loss": 0.2294,
"num_input_tokens_seen": 233184,
"step": 745
},
{
"epoch": 8.333333333333334,
"grad_norm": 0.00839630514383316,
"learning_rate": 0.0007252656582663236,
"loss": 0.2356,
"num_input_tokens_seen": 234688,
"step": 750
},
{
"epoch": 8.38888888888889,
"grad_norm": 0.008437261916697025,
"learning_rate": 0.0007209269110410039,
"loss": 0.2282,
"num_input_tokens_seen": 236256,
"step": 755
},
{
"epoch": 8.444444444444445,
"grad_norm": 0.026610706001520157,
"learning_rate": 0.0007165673929334815,
"loss": 0.2304,
"num_input_tokens_seen": 237856,
"step": 760
},
{
"epoch": 8.5,
"grad_norm": 0.052499689161777496,
"learning_rate": 0.0007121875138125077,
"loss": 0.2314,
"num_input_tokens_seen": 239392,
"step": 765
},
{
"epoch": 8.555555555555555,
"grad_norm": 0.05292768031358719,
"learning_rate": 0.0007077876854611145,
"loss": 0.2302,
"num_input_tokens_seen": 240960,
"step": 770
},
{
"epoch": 8.61111111111111,
"grad_norm": 0.030896423384547234,
"learning_rate": 0.0007033683215379002,
"loss": 0.2295,
"num_input_tokens_seen": 242496,
"step": 775
},
{
"epoch": 8.666666666666666,
"grad_norm": 0.009958789683878422,
"learning_rate": 0.000698929837538139,
"loss": 0.2379,
"num_input_tokens_seen": 244064,
"step": 780
},
{
"epoch": 8.722222222222221,
"grad_norm": 0.02802702784538269,
"learning_rate": 0.0006944726507547168,
"loss": 0.2384,
"num_input_tokens_seen": 245664,
"step": 785
},
{
"epoch": 8.777777777777779,
"grad_norm": 0.02543003484606743,
"learning_rate": 0.0006899971802388996,
"loss": 0.2293,
"num_input_tokens_seen": 247264,
"step": 790
},
{
"epoch": 8.833333333333334,
"grad_norm": 0.00691555580124259,
"learning_rate": 0.0006855038467609335,
"loss": 0.2368,
"num_input_tokens_seen": 248832,
"step": 795
},
{
"epoch": 8.88888888888889,
"grad_norm": 0.028850693255662918,
"learning_rate": 0.0006809930727704874,
"loss": 0.2304,
"num_input_tokens_seen": 250432,
"step": 800
},
{
"epoch": 8.944444444444445,
"grad_norm": 0.008373413234949112,
"learning_rate": 0.0006764652823569344,
"loss": 0.2268,
"num_input_tokens_seen": 252064,
"step": 805
},
{
"epoch": 9.0,
"grad_norm": 0.030820351094007492,
"learning_rate": 0.0006719209012094805,
"loss": 0.2396,
"num_input_tokens_seen": 253632,
"step": 810
},
{
"epoch": 9.0,
"eval_loss": 0.23413518071174622,
"eval_runtime": 0.814,
"eval_samples_per_second": 49.14,
"eval_steps_per_second": 12.285,
"num_input_tokens_seen": 253632,
"step": 810
},
{
"epoch": 9.055555555555555,
"grad_norm": 0.007299968507140875,
"learning_rate": 0.0006673603565771424,
"loss": 0.227,
"num_input_tokens_seen": 255232,
"step": 815
},
{
"epoch": 9.11111111111111,
"grad_norm": 0.007571425288915634,
"learning_rate": 0.0006627840772285784,
"loss": 0.2285,
"num_input_tokens_seen": 256800,
"step": 820
},
{
"epoch": 9.166666666666666,
"grad_norm": 0.007662767544388771,
"learning_rate": 0.0006581924934117783,
"loss": 0.2347,
"num_input_tokens_seen": 258368,
"step": 825
},
{
"epoch": 9.222222222222221,
"grad_norm": 0.027202824130654335,
"learning_rate": 0.0006535860368136113,
"loss": 0.231,
"num_input_tokens_seen": 259872,
"step": 830
},
{
"epoch": 9.277777777777779,
"grad_norm": 0.028580373153090477,
"learning_rate": 0.0006489651405192409,
"loss": 0.2346,
"num_input_tokens_seen": 261376,
"step": 835
},
{
"epoch": 9.333333333333334,
"grad_norm": 0.026986513286828995,
"learning_rate": 0.0006443302389714074,
"loss": 0.2336,
"num_input_tokens_seen": 262944,
"step": 840
},
{
"epoch": 9.38888888888889,
"grad_norm": 0.057506296783685684,
"learning_rate": 0.0006396817679295822,
"loss": 0.2384,
"num_input_tokens_seen": 264544,
"step": 845
},
{
"epoch": 9.444444444444445,
"grad_norm": 0.008491387590765953,
"learning_rate": 0.0006350201644290005,
"loss": 0.2325,
"num_input_tokens_seen": 266080,
"step": 850
},
{
"epoch": 9.5,
"grad_norm": 0.026944199576973915,
"learning_rate": 0.0006303458667395708,
"loss": 0.2329,
"num_input_tokens_seen": 267680,
"step": 855
},
{
"epoch": 9.555555555555555,
"grad_norm": 0.02705678716301918,
"learning_rate": 0.0006256593143246718,
"loss": 0.2303,
"num_input_tokens_seen": 269280,
"step": 860
},
{
"epoch": 9.61111111111111,
"grad_norm": 0.007777373772114515,
"learning_rate": 0.0006209609477998338,
"loss": 0.2318,
"num_input_tokens_seen": 270912,
"step": 865
},
{
"epoch": 9.666666666666666,
"grad_norm": 0.027663791552186012,
"learning_rate": 0.0006162512088913149,
"loss": 0.2273,
"num_input_tokens_seen": 272512,
"step": 870
},
{
"epoch": 9.722222222222221,
"grad_norm": 0.028824860230088234,
"learning_rate": 0.0006115305403945697,
"loss": 0.2331,
"num_input_tokens_seen": 274048,
"step": 875
},
{
"epoch": 9.777777777777779,
"grad_norm": 0.051675502210855484,
"learning_rate": 0.0006067993861326201,
"loss": 0.2304,
"num_input_tokens_seen": 275648,
"step": 880
},
{
"epoch": 9.833333333333334,
"grad_norm": 0.00994098000228405,
"learning_rate": 0.0006020581909143279,
"loss": 0.2346,
"num_input_tokens_seen": 277248,
"step": 885
},
{
"epoch": 9.88888888888889,
"grad_norm": 0.02850501798093319,
"learning_rate": 0.0005973074004925755,
"loss": 0.23,
"num_input_tokens_seen": 278816,
"step": 890
},
{
"epoch": 9.944444444444445,
"grad_norm": 0.027266889810562134,
"learning_rate": 0.0005925474615223572,
"loss": 0.2304,
"num_input_tokens_seen": 280416,
"step": 895
},
{
"epoch": 10.0,
"grad_norm": 0.05180501565337181,
"learning_rate": 0.0005877788215187867,
"loss": 0.228,
"num_input_tokens_seen": 281984,
"step": 900
},
{
"epoch": 10.0,
"eval_loss": 0.23290684819221497,
"eval_runtime": 0.8154,
"eval_samples_per_second": 49.058,
"eval_steps_per_second": 12.264,
"num_input_tokens_seen": 281984,
"step": 900
},
{
"epoch": 10.055555555555555,
"grad_norm": 0.051713358610868454,
"learning_rate": 0.0005830019288150222,
"loss": 0.2385,
"num_input_tokens_seen": 283552,
"step": 905
},
{
"epoch": 10.11111111111111,
"grad_norm": 0.01033524889498949,
"learning_rate": 0.0005782172325201155,
"loss": 0.2296,
"num_input_tokens_seen": 285120,
"step": 910
},
{
"epoch": 10.166666666666666,
"grad_norm": 0.05586029216647148,
"learning_rate": 0.0005734251824767894,
"loss": 0.2321,
"num_input_tokens_seen": 286720,
"step": 915
},
{
"epoch": 10.222222222222221,
"grad_norm": 0.011709915474057198,
"learning_rate": 0.0005686262292191438,
"loss": 0.2324,
"num_input_tokens_seen": 288288,
"step": 920
},
{
"epoch": 10.277777777777779,
"grad_norm": 0.031557176262140274,
"learning_rate": 0.0005638208239302974,
"loss": 0.2286,
"num_input_tokens_seen": 289856,
"step": 925
},
{
"epoch": 10.333333333333334,
"grad_norm": 0.05073035880923271,
"learning_rate": 0.0005590094183999698,
"loss": 0.2289,
"num_input_tokens_seen": 291392,
"step": 930
},
{
"epoch": 10.38888888888889,
"grad_norm": 0.05272587388753891,
"learning_rate": 0.0005541924649820054,
"loss": 0.2294,
"num_input_tokens_seen": 292864,
"step": 935
},
{
"epoch": 10.444444444444445,
"grad_norm": 0.03139587864279747,
"learning_rate": 0.000549370416551844,
"loss": 0.2315,
"num_input_tokens_seen": 294496,
"step": 940
},
{
"epoch": 10.5,
"grad_norm": 0.03361722081899643,
"learning_rate": 0.0005445437264639432,
"loss": 0.2263,
"num_input_tokens_seen": 296096,
"step": 945
},
{
"epoch": 10.555555555555555,
"grad_norm": 0.03754409775137901,
"learning_rate": 0.0005397128485091551,
"loss": 0.2281,
"num_input_tokens_seen": 297632,
"step": 950
},
{
"epoch": 10.61111111111111,
"grad_norm": 0.030270714312791824,
"learning_rate": 0.0005348782368720626,
"loss": 0.2347,
"num_input_tokens_seen": 299168,
"step": 955
},
{
"epoch": 10.666666666666666,
"grad_norm": 0.03540631756186485,
"learning_rate": 0.0005300403460882783,
"loss": 0.231,
"num_input_tokens_seen": 300736,
"step": 960
},
{
"epoch": 10.722222222222221,
"grad_norm": 0.032770272344350815,
"learning_rate": 0.00052519963100171,
"loss": 0.2333,
"num_input_tokens_seen": 302304,
"step": 965
},
{
"epoch": 10.777777777777779,
"grad_norm": 0.030736278742551804,
"learning_rate": 0.000520356546721798,
"loss": 0.225,
"num_input_tokens_seen": 303936,
"step": 970
},
{
"epoch": 10.833333333333334,
"grad_norm": 0.03407590091228485,
"learning_rate": 0.0005155115485807269,
"loss": 0.2315,
"num_input_tokens_seen": 305504,
"step": 975
},
{
"epoch": 10.88888888888889,
"grad_norm": 0.03474384918808937,
"learning_rate": 0.0005106650920906171,
"loss": 0.2336,
"num_input_tokens_seen": 307040,
"step": 980
},
{
"epoch": 10.944444444444445,
"grad_norm": 0.03182826563715935,
"learning_rate": 0.0005058176329006986,
"loss": 0.2247,
"num_input_tokens_seen": 308608,
"step": 985
},
{
"epoch": 11.0,
"grad_norm": 0.08343902230262756,
"learning_rate": 0.0005009696267544715,
"loss": 0.2447,
"num_input_tokens_seen": 310176,
"step": 990
},
{
"epoch": 11.0,
"eval_loss": 0.23567497730255127,
"eval_runtime": 0.8148,
"eval_samples_per_second": 49.094,
"eval_steps_per_second": 12.274,
"num_input_tokens_seen": 310176,
"step": 990
},
{
"epoch": 11.055555555555555,
"grad_norm": 0.05768521502614021,
"learning_rate": 0.0004961215294468599,
"loss": 0.2235,
"num_input_tokens_seen": 311712,
"step": 995
},
{
"epoch": 11.11111111111111,
"grad_norm": 0.01326485350728035,
"learning_rate": 0.0004912737967813582,
"loss": 0.2318,
"num_input_tokens_seen": 313280,
"step": 1000
},
{
"epoch": 11.166666666666666,
"grad_norm": 0.028805643320083618,
"learning_rate": 0.0004864268845271786,
"loss": 0.2188,
"num_input_tokens_seen": 314816,
"step": 1005
},
{
"epoch": 11.222222222222221,
"grad_norm": 0.013591939583420753,
"learning_rate": 0.0004815812483764,
"loss": 0.2347,
"num_input_tokens_seen": 316384,
"step": 1010
},
{
"epoch": 11.277777777777779,
"grad_norm": 0.04025491699576378,
"learning_rate": 0.0004767373439011267,
"loss": 0.2427,
"num_input_tokens_seen": 317984,
"step": 1015
},
{
"epoch": 11.333333333333334,
"grad_norm": 0.020464973524212837,
"learning_rate": 0.00047189562651065565,
"loss": 0.2307,
"num_input_tokens_seen": 319552,
"step": 1020
},
{
"epoch": 11.38888888888889,
"grad_norm": 0.06203271448612213,
"learning_rate": 0.00046705655140866074,
"loss": 0.2357,
"num_input_tokens_seen": 321184,
"step": 1025
},
{
"epoch": 11.444444444444445,
"grad_norm": 0.029454471543431282,
"learning_rate": 0.0004622205735503961,
"loss": 0.2282,
"num_input_tokens_seen": 322688,
"step": 1030
},
{
"epoch": 11.5,
"grad_norm": 0.06667706370353699,
"learning_rate": 0.00045738814759992174,
"loss": 0.234,
"num_input_tokens_seen": 324224,
"step": 1035
},
{
"epoch": 11.555555555555555,
"grad_norm": 0.012882853858172894,
"learning_rate": 0.00045255972788735873,
"loss": 0.2319,
"num_input_tokens_seen": 325760,
"step": 1040
},
{
"epoch": 11.61111111111111,
"grad_norm": 0.03125486522912979,
"learning_rate": 0.00044773576836617336,
"loss": 0.2318,
"num_input_tokens_seen": 327264,
"step": 1045
},
{
"epoch": 11.666666666666666,
"grad_norm": 0.012210973538458347,
"learning_rate": 0.000442916722570498,
"loss": 0.2313,
"num_input_tokens_seen": 328832,
"step": 1050
},
{
"epoch": 11.722222222222221,
"grad_norm": 0.01437061931937933,
"learning_rate": 0.0004381030435724919,
"loss": 0.2272,
"num_input_tokens_seen": 330464,
"step": 1055
},
{
"epoch": 11.777777777777779,
"grad_norm": 0.012531839311122894,
"learning_rate": 0.00043329518393974364,
"loss": 0.2293,
"num_input_tokens_seen": 332064,
"step": 1060
},
{
"epoch": 11.833333333333334,
"grad_norm": 0.03327002376317978,
"learning_rate": 0.0004284935956927229,
"loss": 0.2299,
"num_input_tokens_seen": 333664,
"step": 1065
},
{
"epoch": 11.88888888888889,
"grad_norm": 0.014634636230766773,
"learning_rate": 0.00042369873026228263,
"loss": 0.2268,
"num_input_tokens_seen": 335232,
"step": 1070
},
{
"epoch": 11.944444444444445,
"grad_norm": 0.014177635312080383,
"learning_rate": 0.00041891103844721633,
"loss": 0.229,
"num_input_tokens_seen": 336800,
"step": 1075
},
{
"epoch": 12.0,
"grad_norm": 0.03053419105708599,
"learning_rate": 0.00041413097037187657,
"loss": 0.229,
"num_input_tokens_seen": 338400,
"step": 1080
},
{
"epoch": 12.0,
"eval_loss": 0.23775029182434082,
"eval_runtime": 0.8142,
"eval_samples_per_second": 49.127,
"eval_steps_per_second": 12.282,
"num_input_tokens_seen": 338400,
"step": 1080
},
{
"epoch": 12.055555555555555,
"grad_norm": 0.03754847124218941,
"learning_rate": 0.00040935897544385424,
"loss": 0.2332,
"num_input_tokens_seen": 339968,
"step": 1085
},
{
"epoch": 12.11111111111111,
"grad_norm": 0.02007185108959675,
"learning_rate": 0.0004045955023117276,
"loss": 0.2273,
"num_input_tokens_seen": 341536,
"step": 1090
},
{
"epoch": 12.166666666666666,
"grad_norm": 0.021748747676610947,
"learning_rate": 0.00039984099882288133,
"loss": 0.2319,
"num_input_tokens_seen": 343104,
"step": 1095
},
{
"epoch": 12.222222222222221,
"grad_norm": 0.03744516894221306,
"learning_rate": 0.0003950959119814013,
"loss": 0.2293,
"num_input_tokens_seen": 344640,
"step": 1100
},
{
"epoch": 12.277777777777779,
"grad_norm": 0.042804788798093796,
"learning_rate": 0.0003903606879060483,
"loss": 0.2314,
"num_input_tokens_seen": 346272,
"step": 1105
},
{
"epoch": 12.333333333333334,
"grad_norm": 0.023323964327573776,
"learning_rate": 0.0003856357717883161,
"loss": 0.2309,
"num_input_tokens_seen": 347808,
"step": 1110
},
{
"epoch": 12.38888888888889,
"grad_norm": 0.05394618213176727,
"learning_rate": 0.00038092160785057466,
"loss": 0.2305,
"num_input_tokens_seen": 349344,
"step": 1115
},
{
"epoch": 12.444444444444445,
"grad_norm": 0.017731385305523872,
"learning_rate": 0.00037621863930430713,
"loss": 0.2288,
"num_input_tokens_seen": 350912,
"step": 1120
},
{
"epoch": 12.5,
"grad_norm": 0.042858049273490906,
"learning_rate": 0.000371527308308439,
"loss": 0.2278,
"num_input_tokens_seen": 352480,
"step": 1125
},
{
"epoch": 12.555555555555555,
"grad_norm": 0.04008388891816139,
"learning_rate": 0.00036684805592776895,
"loss": 0.228,
"num_input_tokens_seen": 354048,
"step": 1130
},
{
"epoch": 12.61111111111111,
"grad_norm": 0.05050317198038101,
"learning_rate": 0.00036218132209150044,
"loss": 0.2243,
"num_input_tokens_seen": 355648,
"step": 1135
},
{
"epoch": 12.666666666666666,
"grad_norm": 0.08396083861589432,
"learning_rate": 0.0003575275455518811,
"loss": 0.2372,
"num_input_tokens_seen": 357248,
"step": 1140
},
{
"epoch": 12.722222222222221,
"grad_norm": 0.0884268507361412,
"learning_rate": 0.00035288716384295236,
"loss": 0.2273,
"num_input_tokens_seen": 358752,
"step": 1145
},
{
"epoch": 12.777777777777779,
"grad_norm": 0.04433482140302658,
"learning_rate": 0.00034826061323941484,
"loss": 0.2411,
"num_input_tokens_seen": 360288,
"step": 1150
},
{
"epoch": 12.833333333333334,
"grad_norm": 0.03444892540574074,
"learning_rate": 0.0003436483287156091,
"loss": 0.2358,
"num_input_tokens_seen": 361920,
"step": 1155
},
{
"epoch": 12.88888888888889,
"grad_norm": 0.0388001948595047,
"learning_rate": 0.000339050743904623,
"loss": 0.2316,
"num_input_tokens_seen": 363520,
"step": 1160
},
{
"epoch": 12.944444444444445,
"grad_norm": 0.04146473854780197,
"learning_rate": 0.000334468291057521,
"loss": 0.2319,
"num_input_tokens_seen": 365088,
"step": 1165
},
{
"epoch": 13.0,
"grad_norm": 0.016760699450969696,
"learning_rate": 0.00032990140100270637,
"loss": 0.2283,
"num_input_tokens_seen": 366688,
"step": 1170
},
{
"epoch": 13.0,
"eval_loss": 0.23198899626731873,
"eval_runtime": 0.8146,
"eval_samples_per_second": 49.103,
"eval_steps_per_second": 12.276,
"num_input_tokens_seen": 366688,
"step": 1170
},
{
"epoch": 13.055555555555555,
"grad_norm": 0.04751737415790558,
"learning_rate": 0.0003253505031054155,
"loss": 0.2336,
"num_input_tokens_seen": 368192,
"step": 1175
},
{
"epoch": 13.11111111111111,
"grad_norm": 0.018362808972597122,
"learning_rate": 0.00032081602522734986,
"loss": 0.2247,
"num_input_tokens_seen": 369792,
"step": 1180
},
{
"epoch": 13.166666666666666,
"grad_norm": 0.03477654606103897,
"learning_rate": 0.00031629839368645086,
"loss": 0.2294,
"num_input_tokens_seen": 371360,
"step": 1185
},
{
"epoch": 13.222222222222221,
"grad_norm": 0.07415346056222916,
"learning_rate": 0.0003117980332168179,
"loss": 0.2288,
"num_input_tokens_seen": 372928,
"step": 1190
},
{
"epoch": 13.277777777777779,
"grad_norm": 0.04998091980814934,
"learning_rate": 0.00030731536692877595,
"loss": 0.2358,
"num_input_tokens_seen": 374528,
"step": 1195
},
{
"epoch": 13.333333333333334,
"grad_norm": 0.0374421626329422,
"learning_rate": 0.0003028508162690967,
"loss": 0.2287,
"num_input_tokens_seen": 376000,
"step": 1200
},
{
"epoch": 13.38888888888889,
"grad_norm": 0.08486609905958176,
"learning_rate": 0.000298404800981375,
"loss": 0.23,
"num_input_tokens_seen": 377600,
"step": 1205
},
{
"epoch": 13.444444444444445,
"grad_norm": 0.031455185264348984,
"learning_rate": 0.0002939777390665658,
"loss": 0.2304,
"num_input_tokens_seen": 379200,
"step": 1210
},
{
"epoch": 13.5,
"grad_norm": 0.04608649015426636,
"learning_rate": 0.0002895700467436855,
"loss": 0.2258,
"num_input_tokens_seen": 380704,
"step": 1215
},
{
"epoch": 13.555555555555555,
"grad_norm": 0.078786201775074,
"learning_rate": 0.00028518213841067906,
"loss": 0.2228,
"num_input_tokens_seen": 382272,
"step": 1220
},
{
"epoch": 13.61111111111111,
"grad_norm": 0.046005699783563614,
"learning_rate": 0.00028081442660546124,
"loss": 0.2239,
"num_input_tokens_seen": 383776,
"step": 1225
},
{
"epoch": 13.666666666666666,
"grad_norm": 0.0584123358130455,
"learning_rate": 0.00027646732196712974,
"loss": 0.2389,
"num_input_tokens_seen": 385344,
"step": 1230
},
{
"epoch": 13.722222222222221,
"grad_norm": 0.046881191432476044,
"learning_rate": 0.00027214123319735785,
"loss": 0.2284,
"num_input_tokens_seen": 386912,
"step": 1235
},
{
"epoch": 13.777777777777779,
"grad_norm": 0.05353561043739319,
"learning_rate": 0.00026783656702197156,
"loss": 0.2259,
"num_input_tokens_seen": 388480,
"step": 1240
},
{
"epoch": 13.833333333333334,
"grad_norm": 0.05366063863039017,
"learning_rate": 0.00026355372815270835,
"loss": 0.2302,
"num_input_tokens_seen": 390048,
"step": 1245
},
{
"epoch": 13.88888888888889,
"grad_norm": 0.025699380785226822,
"learning_rate": 0.000259293119249168,
"loss": 0.2346,
"num_input_tokens_seen": 391584,
"step": 1250
},
{
"epoch": 13.944444444444445,
"grad_norm": 0.09284758567810059,
"learning_rate": 0.00025505514088095655,
"loss": 0.2374,
"num_input_tokens_seen": 393184,
"step": 1255
},
{
"epoch": 14.0,
"grad_norm": 0.03431705757975578,
"learning_rate": 0.0002508401914900249,
"loss": 0.2299,
"num_input_tokens_seen": 394752,
"step": 1260
},
{
"epoch": 14.0,
"eval_loss": 0.23503117263317108,
"eval_runtime": 0.8137,
"eval_samples_per_second": 49.157,
"eval_steps_per_second": 12.289,
"num_input_tokens_seen": 394752,
"step": 1260
},
{
"epoch": 14.055555555555555,
"grad_norm": 0.07217755913734436,
"learning_rate": 0.00024664866735320885,
"loss": 0.2287,
"num_input_tokens_seen": 396320,
"step": 1265
},
{
"epoch": 14.11111111111111,
"grad_norm": 0.03390711545944214,
"learning_rate": 0.00024248096254497287,
"loss": 0.2307,
"num_input_tokens_seen": 397920,
"step": 1270
},
{
"epoch": 14.166666666666666,
"grad_norm": 0.11280521005392075,
"learning_rate": 0.00023833746890035963,
"loss": 0.2273,
"num_input_tokens_seen": 399552,
"step": 1275
},
{
"epoch": 14.222222222222221,
"grad_norm": 0.06520415097475052,
"learning_rate": 0.0002342185759781511,
"loss": 0.2272,
"num_input_tokens_seen": 401120,
"step": 1280
},
{
"epoch": 14.277777777777779,
"grad_norm": 0.10993943363428116,
"learning_rate": 0.00023012467102424372,
"loss": 0.2273,
"num_input_tokens_seen": 402720,
"step": 1285
},
{
"epoch": 14.333333333333334,
"grad_norm": 0.03927115350961685,
"learning_rate": 0.00022605613893524008,
"loss": 0.2316,
"num_input_tokens_seen": 404320,
"step": 1290
},
{
"epoch": 14.38888888888889,
"grad_norm": 0.0648801177740097,
"learning_rate": 0.00022201336222226332,
"loss": 0.2286,
"num_input_tokens_seen": 405824,
"step": 1295
},
{
"epoch": 14.444444444444445,
"grad_norm": 0.08081918209791183,
"learning_rate": 0.0002179967209749929,
"loss": 0.2324,
"num_input_tokens_seen": 407360,
"step": 1300
},
{
"epoch": 14.5,
"grad_norm": 0.0945676863193512,
"learning_rate": 0.00021400659282593083,
"loss": 0.2218,
"num_input_tokens_seen": 408928,
"step": 1305
},
{
"epoch": 14.555555555555555,
"grad_norm": 0.05751676857471466,
"learning_rate": 0.0002100433529148979,
"loss": 0.2345,
"num_input_tokens_seen": 410496,
"step": 1310
},
{
"epoch": 14.61111111111111,
"grad_norm": 0.059024445712566376,
"learning_rate": 0.00020610737385376348,
"loss": 0.2312,
"num_input_tokens_seen": 412032,
"step": 1315
},
{
"epoch": 14.666666666666666,
"grad_norm": 0.05571332573890686,
"learning_rate": 0.00020219902569141402,
"loss": 0.2198,
"num_input_tokens_seen": 413536,
"step": 1320
},
{
"epoch": 14.722222222222221,
"grad_norm": 0.07701590657234192,
"learning_rate": 0.00019831867587896218,
"loss": 0.2254,
"num_input_tokens_seen": 415136,
"step": 1325
},
{
"epoch": 14.777777777777779,
"grad_norm": 0.13892430067062378,
"learning_rate": 0.0001944666892352001,
"loss": 0.2146,
"num_input_tokens_seen": 416704,
"step": 1330
},
{
"epoch": 14.833333333333334,
"grad_norm": 0.0762837678194046,
"learning_rate": 0.00019064342791230072,
"loss": 0.2589,
"num_input_tokens_seen": 418272,
"step": 1335
},
{
"epoch": 14.88888888888889,
"grad_norm": 0.055135730654001236,
"learning_rate": 0.00018684925136176834,
"loss": 0.2212,
"num_input_tokens_seen": 419808,
"step": 1340
},
{
"epoch": 14.944444444444445,
"grad_norm": 0.07139123976230621,
"learning_rate": 0.0001830845163006448,
"loss": 0.227,
"num_input_tokens_seen": 421344,
"step": 1345
},
{
"epoch": 15.0,
"grad_norm": 0.044596634805202484,
"learning_rate": 0.00017934957667797225,
"loss": 0.2301,
"num_input_tokens_seen": 422912,
"step": 1350
},
{
"epoch": 15.0,
"eval_loss": 0.23467722535133362,
"eval_runtime": 0.8152,
"eval_samples_per_second": 49.068,
"eval_steps_per_second": 12.267,
"num_input_tokens_seen": 422912,
"step": 1350
},
{
"epoch": 15.055555555555555,
"grad_norm": 0.07222557067871094,
"learning_rate": 0.000175644783641515,
"loss": 0.2299,
"num_input_tokens_seen": 424480,
"step": 1355
},
{
"epoch": 15.11111111111111,
"grad_norm": 0.10835325717926025,
"learning_rate": 0.00017197048550474643,
"loss": 0.2289,
"num_input_tokens_seen": 426048,
"step": 1360
},
{
"epoch": 15.166666666666666,
"grad_norm": 0.050258129835128784,
"learning_rate": 0.0001683270277141014,
"loss": 0.2294,
"num_input_tokens_seen": 427648,
"step": 1365
},
{
"epoch": 15.222222222222221,
"grad_norm": 0.07297654449939728,
"learning_rate": 0.00016471475281649818,
"loss": 0.2284,
"num_input_tokens_seen": 429216,
"step": 1370
},
{
"epoch": 15.277777777777779,
"grad_norm": 0.13614331185817719,
"learning_rate": 0.0001611340004271339,
"loss": 0.2362,
"num_input_tokens_seen": 430720,
"step": 1375
},
{
"epoch": 15.333333333333334,
"grad_norm": 0.05731874704360962,
"learning_rate": 0.0001575851071975541,
"loss": 0.2274,
"num_input_tokens_seen": 432320,
"step": 1380
},
{
"epoch": 15.38888888888889,
"grad_norm": 0.08178743720054626,
"learning_rate": 0.00015406840678400203,
"loss": 0.2222,
"num_input_tokens_seen": 433888,
"step": 1385
},
{
"epoch": 15.444444444444445,
"grad_norm": 0.055484700947999954,
"learning_rate": 0.00015058422981604997,
"loss": 0.2184,
"num_input_tokens_seen": 435456,
"step": 1390
},
{
"epoch": 15.5,
"grad_norm": 0.08678986132144928,
"learning_rate": 0.00014713290386551348,
"loss": 0.22,
"num_input_tokens_seen": 437024,
"step": 1395
},
{
"epoch": 15.555555555555555,
"grad_norm": 0.17864876985549927,
"learning_rate": 0.00014371475341565454,
"loss": 0.2566,
"num_input_tokens_seen": 438528,
"step": 1400
},
{
"epoch": 15.61111111111111,
"grad_norm": 0.08130117505788803,
"learning_rate": 0.00014033009983067452,
"loss": 0.2336,
"num_input_tokens_seen": 440096,
"step": 1405
},
{
"epoch": 15.666666666666666,
"grad_norm": 0.07935910671949387,
"learning_rate": 0.00013697926132550054,
"loss": 0.2329,
"num_input_tokens_seen": 441632,
"step": 1410
},
{
"epoch": 15.722222222222221,
"grad_norm": 0.08176440745592117,
"learning_rate": 0.0001336625529358682,
"loss": 0.2294,
"num_input_tokens_seen": 443136,
"step": 1415
},
{
"epoch": 15.777777777777779,
"grad_norm": 0.04429703205823898,
"learning_rate": 0.00013038028648870205,
"loss": 0.2289,
"num_input_tokens_seen": 444736,
"step": 1420
},
{
"epoch": 15.833333333333334,
"grad_norm": 0.04416218772530556,
"learning_rate": 0.0001271327705727991,
"loss": 0.2307,
"num_input_tokens_seen": 446336,
"step": 1425
},
{
"epoch": 15.88888888888889,
"grad_norm": 0.061331622302532196,
"learning_rate": 0.0001239203105098165,
"loss": 0.2248,
"num_input_tokens_seen": 447936,
"step": 1430
},
{
"epoch": 15.944444444444445,
"grad_norm": 0.12402181327342987,
"learning_rate": 0.00012074320832556557,
"loss": 0.2271,
"num_input_tokens_seen": 449440,
"step": 1435
},
{
"epoch": 16.0,
"grad_norm": 0.07083910703659058,
"learning_rate": 0.00011760176272161627,
"loss": 0.2232,
"num_input_tokens_seen": 451008,
"step": 1440
},
{
"epoch": 16.0,
"eval_loss": 0.23479977250099182,
"eval_runtime": 0.8141,
"eval_samples_per_second": 49.136,
"eval_steps_per_second": 12.284,
"num_input_tokens_seen": 451008,
"step": 1440
},
{
"epoch": 16.055555555555557,
"grad_norm": 0.11320801079273224,
"learning_rate": 0.00011449626904721472,
"loss": 0.2316,
"num_input_tokens_seen": 452576,
"step": 1445
},
{
"epoch": 16.11111111111111,
"grad_norm": 0.07996129244565964,
"learning_rate": 0.00011142701927151455,
"loss": 0.2261,
"num_input_tokens_seen": 454112,
"step": 1450
},
{
"epoch": 16.166666666666668,
"grad_norm": 0.07149044424295425,
"learning_rate": 0.00010839430195612793,
"loss": 0.2259,
"num_input_tokens_seen": 455680,
"step": 1455
},
{
"epoch": 16.22222222222222,
"grad_norm": 0.09157592058181763,
"learning_rate": 0.00010539840222799463,
"loss": 0.2243,
"num_input_tokens_seen": 457280,
"step": 1460
},
{
"epoch": 16.27777777777778,
"grad_norm": 0.05001309514045715,
"learning_rate": 0.00010243960175257604,
"loss": 0.2255,
"num_input_tokens_seen": 458848,
"step": 1465
},
{
"epoch": 16.333333333333332,
"grad_norm": 0.12440861761569977,
"learning_rate": 9.9518178707374e-05,
"loss": 0.2238,
"num_input_tokens_seen": 460416,
"step": 1470
},
{
"epoch": 16.38888888888889,
"grad_norm": 0.10305523872375488,
"learning_rate": 9.663440775577653e-05,
"loss": 0.2266,
"num_input_tokens_seen": 462048,
"step": 1475
},
{
"epoch": 16.444444444444443,
"grad_norm": 0.1269642859697342,
"learning_rate": 9.378856002123548e-05,
"loss": 0.218,
"num_input_tokens_seen": 463584,
"step": 1480
},
{
"epoch": 16.5,
"grad_norm": 0.09578834474086761,
"learning_rate": 9.098090306177625e-05,
"loss": 0.2233,
"num_input_tokens_seen": 465152,
"step": 1485
},
{
"epoch": 16.555555555555557,
"grad_norm": 0.1153278648853302,
"learning_rate": 8.821170084484247e-05,
"loss": 0.2287,
"num_input_tokens_seen": 466720,
"step": 1490
},
{
"epoch": 16.61111111111111,
"grad_norm": 0.08815108239650726,
"learning_rate": 8.548121372247918e-05,
"loss": 0.2262,
"num_input_tokens_seen": 468256,
"step": 1495
},
{
"epoch": 16.666666666666668,
"grad_norm": 0.11618749797344208,
"learning_rate": 8.278969840685458e-05,
"loss": 0.2108,
"num_input_tokens_seen": 469792,
"step": 1500
},
{
"epoch": 16.72222222222222,
"grad_norm": 0.13226836919784546,
"learning_rate": 8.013740794612512e-05,
"loss": 0.2431,
"num_input_tokens_seen": 471360,
"step": 1505
},
{
"epoch": 16.77777777777778,
"grad_norm": 0.14580771327018738,
"learning_rate": 7.752459170064491e-05,
"loss": 0.2221,
"num_input_tokens_seen": 472896,
"step": 1510
},
{
"epoch": 16.833333333333332,
"grad_norm": 0.13856364786624908,
"learning_rate": 7.4951495319521e-05,
"loss": 0.2229,
"num_input_tokens_seen": 474496,
"step": 1515
},
{
"epoch": 16.88888888888889,
"grad_norm": 0.16624583303928375,
"learning_rate": 7.241836071751879e-05,
"loss": 0.2221,
"num_input_tokens_seen": 476064,
"step": 1520
},
{
"epoch": 16.944444444444443,
"grad_norm": 0.1337256133556366,
"learning_rate": 6.992542605231739e-05,
"loss": 0.2251,
"num_input_tokens_seen": 477600,
"step": 1525
},
{
"epoch": 17.0,
"grad_norm": 0.08969806879758835,
"learning_rate": 6.747292570211916e-05,
"loss": 0.2318,
"num_input_tokens_seen": 479104,
"step": 1530
},
{
"epoch": 17.0,
"eval_loss": 0.24350161850452423,
"eval_runtime": 0.8143,
"eval_samples_per_second": 49.124,
"eval_steps_per_second": 12.281,
"num_input_tokens_seen": 479104,
"step": 1530
},
{
"epoch": 17.055555555555557,
"grad_norm": 0.08840487152338028,
"learning_rate": 6.506109024361429e-05,
"loss": 0.2191,
"num_input_tokens_seen": 480736,
"step": 1535
},
{
"epoch": 17.11111111111111,
"grad_norm": 0.16554512083530426,
"learning_rate": 6.269014643030213e-05,
"loss": 0.2099,
"num_input_tokens_seen": 482304,
"step": 1540
},
{
"epoch": 17.166666666666668,
"grad_norm": 0.10631433129310608,
"learning_rate": 6.0360317171172794e-05,
"loss": 0.2332,
"num_input_tokens_seen": 483872,
"step": 1545
},
{
"epoch": 17.22222222222222,
"grad_norm": 0.11119335144758224,
"learning_rate": 5.807182150975027e-05,
"loss": 0.2026,
"num_input_tokens_seen": 485408,
"step": 1550
},
{
"epoch": 17.27777777777778,
"grad_norm": 0.1245507299900055,
"learning_rate": 5.5824874603498056e-05,
"loss": 0.2208,
"num_input_tokens_seen": 486944,
"step": 1555
},
{
"epoch": 17.333333333333332,
"grad_norm": 0.1490304321050644,
"learning_rate": 5.361968770359071e-05,
"loss": 0.2278,
"num_input_tokens_seen": 488544,
"step": 1560
},
{
"epoch": 17.38888888888889,
"grad_norm": 0.13573351502418518,
"learning_rate": 5.145646813505339e-05,
"loss": 0.2334,
"num_input_tokens_seen": 490144,
"step": 1565
},
{
"epoch": 17.444444444444443,
"grad_norm": 0.10809604823589325,
"learning_rate": 4.933541927726887e-05,
"loss": 0.2179,
"num_input_tokens_seen": 491680,
"step": 1570
},
{
"epoch": 17.5,
"grad_norm": 0.2656223177909851,
"learning_rate": 4.725674054485712e-05,
"loss": 0.2309,
"num_input_tokens_seen": 493248,
"step": 1575
},
{
"epoch": 17.555555555555557,
"grad_norm": 0.14225415885448456,
"learning_rate": 4.522062736892635e-05,
"loss": 0.2227,
"num_input_tokens_seen": 494816,
"step": 1580
},
{
"epoch": 17.61111111111111,
"grad_norm": 0.11132286489009857,
"learning_rate": 4.322727117869951e-05,
"loss": 0.2249,
"num_input_tokens_seen": 496416,
"step": 1585
},
{
"epoch": 17.666666666666668,
"grad_norm": 0.20258449018001556,
"learning_rate": 4.127685938351694e-05,
"loss": 0.2353,
"num_input_tokens_seen": 497952,
"step": 1590
},
{
"epoch": 17.72222222222222,
"grad_norm": 0.18990765511989594,
"learning_rate": 3.936957535521624e-05,
"loss": 0.2128,
"num_input_tokens_seen": 499520,
"step": 1595
},
{
"epoch": 17.77777777777778,
"grad_norm": 0.21558478474617004,
"learning_rate": 3.750559841089196e-05,
"loss": 0.2246,
"num_input_tokens_seen": 501120,
"step": 1600
},
{
"epoch": 17.833333333333332,
"grad_norm": 0.21279899775981903,
"learning_rate": 3.56851037960379e-05,
"loss": 0.2251,
"num_input_tokens_seen": 502688,
"step": 1605
},
{
"epoch": 17.88888888888889,
"grad_norm": 0.19791842997074127,
"learning_rate": 3.3908262668069845e-05,
"loss": 0.219,
"num_input_tokens_seen": 504224,
"step": 1610
},
{
"epoch": 17.944444444444443,
"grad_norm": 0.22439955174922943,
"learning_rate": 3.217524208023431e-05,
"loss": 0.2137,
"num_input_tokens_seen": 505824,
"step": 1615
},
{
"epoch": 18.0,
"grad_norm": 0.2424042671918869,
"learning_rate": 3.048620496590304e-05,
"loss": 0.2203,
"num_input_tokens_seen": 507392,
"step": 1620
},
{
"epoch": 18.0,
"eval_loss": 0.24611559510231018,
"eval_runtime": 0.8158,
"eval_samples_per_second": 49.03,
"eval_steps_per_second": 12.257,
"num_input_tokens_seen": 507392,
"step": 1620
},
{
"epoch": 18.055555555555557,
"grad_norm": 0.1351904571056366,
"learning_rate": 2.884131012325386e-05,
"loss": 0.2125,
"num_input_tokens_seen": 508928,
"step": 1625
},
{
"epoch": 18.11111111111111,
"grad_norm": 0.2632122039794922,
"learning_rate": 2.724071220034158e-05,
"loss": 0.2346,
"num_input_tokens_seen": 510496,
"step": 1630
},
{
"epoch": 18.166666666666668,
"grad_norm": 0.22769327461719513,
"learning_rate": 2.5684561680557994e-05,
"loss": 0.2089,
"num_input_tokens_seen": 512032,
"step": 1635
},
{
"epoch": 18.22222222222222,
"grad_norm": 0.21476925909519196,
"learning_rate": 2.417300486848373e-05,
"loss": 0.2317,
"num_input_tokens_seen": 513504,
"step": 1640
},
{
"epoch": 18.27777777777778,
"grad_norm": 0.16752052307128906,
"learning_rate": 2.2706183876134045e-05,
"loss": 0.2126,
"num_input_tokens_seen": 515040,
"step": 1645
},
{
"epoch": 18.333333333333332,
"grad_norm": 0.32024410367012024,
"learning_rate": 2.1284236609596887e-05,
"loss": 0.2101,
"num_input_tokens_seen": 516640,
"step": 1650
},
{
"epoch": 18.38888888888889,
"grad_norm": 0.29665425419807434,
"learning_rate": 1.990729675606784e-05,
"loss": 0.209,
"num_input_tokens_seen": 518208,
"step": 1655
},
{
"epoch": 18.444444444444443,
"grad_norm": 0.24566873908042908,
"learning_rate": 1.8575493771281205e-05,
"loss": 0.2161,
"num_input_tokens_seen": 519776,
"step": 1660
},
{
"epoch": 18.5,
"grad_norm": 0.278112530708313,
"learning_rate": 1.728895286733906e-05,
"loss": 0.2034,
"num_input_tokens_seen": 521344,
"step": 1665
},
{
"epoch": 18.555555555555557,
"grad_norm": 0.611442506313324,
"learning_rate": 1.6047795000938782e-05,
"loss": 0.2009,
"num_input_tokens_seen": 522976,
"step": 1670
},
{
"epoch": 18.61111111111111,
"grad_norm": 0.32947102189064026,
"learning_rate": 1.4852136862001764e-05,
"loss": 0.2189,
"num_input_tokens_seen": 524544,
"step": 1675
},
{
"epoch": 18.666666666666668,
"grad_norm": 0.49668529629707336,
"learning_rate": 1.3702090862701855e-05,
"loss": 0.209,
"num_input_tokens_seen": 526112,
"step": 1680
},
{
"epoch": 18.72222222222222,
"grad_norm": 0.911090612411499,
"learning_rate": 1.2597765126897198e-05,
"loss": 0.2027,
"num_input_tokens_seen": 527712,
"step": 1685
},
{
"epoch": 18.77777777777778,
"grad_norm": 0.7248960137367249,
"learning_rate": 1.1539263479964535e-05,
"loss": 0.1916,
"num_input_tokens_seen": 529280,
"step": 1690
},
{
"epoch": 18.833333333333332,
"grad_norm": 0.508635938167572,
"learning_rate": 1.0526685439037842e-05,
"loss": 0.2805,
"num_input_tokens_seen": 530816,
"step": 1695
},
{
"epoch": 18.88888888888889,
"grad_norm": 0.5231343507766724,
"learning_rate": 9.560126203652263e-06,
"loss": 0.196,
"num_input_tokens_seen": 532448,
"step": 1700
},
{
"epoch": 18.944444444444443,
"grad_norm": 0.8021954298019409,
"learning_rate": 8.639676646793382e-06,
"loss": 0.2193,
"num_input_tokens_seen": 534016,
"step": 1705
},
{
"epoch": 19.0,
"grad_norm": 0.42625030875205994,
"learning_rate": 7.76542330635388e-06,
"loss": 0.2287,
"num_input_tokens_seen": 535584,
"step": 1710
},
{
"epoch": 19.0,
"eval_loss": 0.26315268874168396,
"eval_runtime": 0.819,
"eval_samples_per_second": 48.84,
"eval_steps_per_second": 12.21,
"num_input_tokens_seen": 535584,
"step": 1710
},
{
"epoch": 19.055555555555557,
"grad_norm": 0.2445189207792282,
"learning_rate": 6.9374483769975016e-06,
"loss": 0.2139,
"num_input_tokens_seen": 537152,
"step": 1715
},
{
"epoch": 19.11111111111111,
"grad_norm": 0.5057767033576965,
"learning_rate": 6.15582970243117e-06,
"loss": 0.2267,
"num_input_tokens_seen": 538720,
"step": 1720
},
{
"epoch": 19.166666666666668,
"grad_norm": 0.3208341896533966,
"learning_rate": 5.42064076808646e-06,
"loss": 0.1985,
"num_input_tokens_seen": 540256,
"step": 1725
},
{
"epoch": 19.22222222222222,
"grad_norm": 0.3510449528694153,
"learning_rate": 4.731950694210896e-06,
"loss": 0.2141,
"num_input_tokens_seen": 541824,
"step": 1730
},
{
"epoch": 19.27777777777778,
"grad_norm": 0.2291206270456314,
"learning_rate": 4.089824229369155e-06,
"loss": 0.2031,
"num_input_tokens_seen": 543424,
"step": 1735
},
{
"epoch": 19.333333333333332,
"grad_norm": 0.5885674953460693,
"learning_rate": 3.4943217443557664e-06,
"loss": 0.2143,
"num_input_tokens_seen": 545024,
"step": 1740
},
{
"epoch": 19.38888888888889,
"grad_norm": 0.4313626289367676,
"learning_rate": 2.9454992265193214e-06,
"loss": 0.2094,
"num_input_tokens_seen": 546560,
"step": 1745
},
{
"epoch": 19.444444444444443,
"grad_norm": 0.36777594685554504,
"learning_rate": 2.4434082744984598e-06,
"loss": 0.2101,
"num_input_tokens_seen": 548096,
"step": 1750
},
{
"epoch": 19.5,
"grad_norm": 0.4024662375450134,
"learning_rate": 1.9880960933710836e-06,
"loss": 0.1968,
"num_input_tokens_seen": 549664,
"step": 1755
},
{
"epoch": 19.555555555555557,
"grad_norm": 0.4908435344696045,
"learning_rate": 1.5796054902157964e-06,
"loss": 0.2048,
"num_input_tokens_seen": 551264,
"step": 1760
},
{
"epoch": 19.61111111111111,
"grad_norm": 0.3945781886577606,
"learning_rate": 1.2179748700879012e-06,
"loss": 0.2301,
"num_input_tokens_seen": 552800,
"step": 1765
},
{
"epoch": 19.666666666666668,
"grad_norm": 0.6240169405937195,
"learning_rate": 9.032382324080101e-07,
"loss": 0.1973,
"num_input_tokens_seen": 554336,
"step": 1770
},
{
"epoch": 19.72222222222222,
"grad_norm": 0.2913757264614105,
"learning_rate": 6.354251677661571e-07,
"loss": 0.2249,
"num_input_tokens_seen": 555872,
"step": 1775
},
{
"epoch": 19.77777777777778,
"grad_norm": 0.6081423759460449,
"learning_rate": 4.1456085513935646e-07,
"loss": 0.1961,
"num_input_tokens_seen": 557504,
"step": 1780
},
{
"epoch": 19.833333333333332,
"grad_norm": 0.2613461911678314,
"learning_rate": 2.4066605952444145e-07,
"loss": 0.2223,
"num_input_tokens_seen": 559040,
"step": 1785
},
{
"epoch": 19.88888888888889,
"grad_norm": 0.31355512142181396,
"learning_rate": 1.1375712998595855e-07,
"loss": 0.2048,
"num_input_tokens_seen": 560640,
"step": 1790
},
{
"epoch": 19.944444444444443,
"grad_norm": 0.3902145326137543,
"learning_rate": 3.384599811889766e-08,
"loss": 0.2315,
"num_input_tokens_seen": 562176,
"step": 1795
},
{
"epoch": 20.0,
"grad_norm": 0.2755837142467499,
"learning_rate": 9.40176926922387e-10,
"loss": 0.1829,
"num_input_tokens_seen": 563744,
"step": 1800
},
{
"epoch": 20.0,
"eval_loss": 0.25516048073768616,
"eval_runtime": 0.8152,
"eval_samples_per_second": 49.069,
"eval_steps_per_second": 12.267,
"num_input_tokens_seen": 563744,
"step": 1800
},
{
"epoch": 20.0,
"num_input_tokens_seen": 563744,
"step": 1800,
"total_flos": 2.538513752575181e+16,
"train_loss": 0.24851805749866698,
"train_runtime": 322.5049,
"train_samples_per_second": 22.325,
"train_steps_per_second": 5.581
}
],
"logging_steps": 5,
"max_steps": 1800,
"num_input_tokens_seen": 563744,
"num_train_epochs": 20,
"save_steps": 90,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.538513752575181e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}