{
"best_metric": 0.11874233186244965,
"best_model_checkpoint": "./weights/OurNewMoleculeModel-v1/checkpoint-256125",
"epoch": 25.0,
"eval_steps": 500,
"global_step": 256125,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04880429477794046,
"grad_norm": 1.3488572835922241,
"learning_rate": 4.99998163439129e-05,
"loss": 2.2985,
"step": 500
},
{
"epoch": 0.09760858955588092,
"grad_norm": 0.4086189568042755,
"learning_rate": 4.999926537834994e-05,
"loss": 2.0404,
"step": 1000
},
{
"epoch": 0.14641288433382138,
"grad_norm": 0.5561855435371399,
"learning_rate": 4.999834711140619e-05,
"loss": 2.0324,
"step": 1500
},
{
"epoch": 0.19521717911176184,
"grad_norm": 0.2902628779411316,
"learning_rate": 4.999706155657327e-05,
"loss": 2.0287,
"step": 2000
},
{
"epoch": 0.2440214738897023,
"grad_norm": 0.7554148435592651,
"learning_rate": 4.999540873273918e-05,
"loss": 2.0277,
"step": 2500
},
{
"epoch": 0.29282576866764276,
"grad_norm": 0.34928998351097107,
"learning_rate": 4.999338866418801e-05,
"loss": 2.0227,
"step": 3000
},
{
"epoch": 0.3416300634455832,
"grad_norm": 0.5614811182022095,
"learning_rate": 4.999100138059959e-05,
"loss": 2.0122,
"step": 3500
},
{
"epoch": 0.3904343582235237,
"grad_norm": 0.5667726993560791,
"learning_rate": 4.998824691704905e-05,
"loss": 1.9914,
"step": 4000
},
{
"epoch": 0.43923865300146414,
"grad_norm": 1.2578080892562866,
"learning_rate": 4.998512531400633e-05,
"loss": 1.9431,
"step": 4500
},
{
"epoch": 0.4880429477794046,
"grad_norm": 1.1142494678497314,
"learning_rate": 4.9981636617335516e-05,
"loss": 1.578,
"step": 5000
},
{
"epoch": 0.5368472425573451,
"grad_norm": 0.9630743861198425,
"learning_rate": 4.997778087829424e-05,
"loss": 1.2667,
"step": 5500
},
{
"epoch": 0.5856515373352855,
"grad_norm": 0.7279083132743835,
"learning_rate": 4.9973558153532925e-05,
"loss": 1.0208,
"step": 6000
},
{
"epoch": 0.634455832113226,
"grad_norm": 0.8263267874717712,
"learning_rate": 4.996896850509387e-05,
"loss": 0.885,
"step": 6500
},
{
"epoch": 0.6832601268911664,
"grad_norm": 0.7792947292327881,
"learning_rate": 4.996401200041044e-05,
"loss": 0.8054,
"step": 7000
},
{
"epoch": 0.7320644216691069,
"grad_norm": 0.6826034188270569,
"learning_rate": 4.9958688712306015e-05,
"loss": 0.7463,
"step": 7500
},
{
"epoch": 0.7808687164470474,
"grad_norm": 0.7101658582687378,
"learning_rate": 4.995299871899292e-05,
"loss": 0.6952,
"step": 8000
},
{
"epoch": 0.8296730112249878,
"grad_norm": 0.5552261471748352,
"learning_rate": 4.994694210407133e-05,
"loss": 0.6516,
"step": 8500
},
{
"epoch": 0.8784773060029283,
"grad_norm": 0.5594379305839539,
"learning_rate": 4.994051895652797e-05,
"loss": 0.6156,
"step": 9000
},
{
"epoch": 0.9272816007808687,
"grad_norm": 0.7451700568199158,
"learning_rate": 4.993372937073485e-05,
"loss": 0.5843,
"step": 9500
},
{
"epoch": 0.9760858955588092,
"grad_norm": 0.5584864020347595,
"learning_rate": 4.9926573446447875e-05,
"loss": 0.5583,
"step": 10000
},
{
"epoch": 1.0,
"eval_loss": 0.4663134217262268,
"eval_runtime": 27.1679,
"eval_samples_per_second": 289.901,
"eval_steps_per_second": 0.405,
"step": 10245
},
{
"epoch": 1.0248901903367496,
"grad_norm": 0.527858555316925,
"learning_rate": 4.9919051288805364e-05,
"loss": 0.5353,
"step": 10500
},
{
"epoch": 1.0736944851146901,
"grad_norm": 0.5612876415252686,
"learning_rate": 4.9911163008326527e-05,
"loss": 0.5154,
"step": 11000
},
{
"epoch": 1.1224987798926305,
"grad_norm": 0.4924549460411072,
"learning_rate": 4.990290872090982e-05,
"loss": 0.4931,
"step": 11500
},
{
"epoch": 1.171303074670571,
"grad_norm": 0.4243695139884949,
"learning_rate": 4.9894288547831245e-05,
"loss": 0.476,
"step": 12000
},
{
"epoch": 1.2201073694485114,
"grad_norm": 0.5059812068939209,
"learning_rate": 4.98853026157426e-05,
"loss": 0.4609,
"step": 12500
},
{
"epoch": 1.268911664226452,
"grad_norm": 0.4593505263328552,
"learning_rate": 4.987595105666956e-05,
"loss": 0.4468,
"step": 13000
},
{
"epoch": 1.3177159590043923,
"grad_norm": 0.46688178181648254,
"learning_rate": 4.9866234008009794e-05,
"loss": 0.434,
"step": 13500
},
{
"epoch": 1.3665202537823329,
"grad_norm": 0.4821254312992096,
"learning_rate": 4.9856151612530905e-05,
"loss": 0.4218,
"step": 14000
},
{
"epoch": 1.4153245485602732,
"grad_norm": 0.4354498088359833,
"learning_rate": 4.9845704018368364e-05,
"loss": 0.4105,
"step": 14500
},
{
"epoch": 1.4641288433382138,
"grad_norm": 0.4537793695926666,
"learning_rate": 4.9834891379023305e-05,
"loss": 0.3998,
"step": 15000
},
{
"epoch": 1.5129331381161544,
"grad_norm": 0.37507402896881104,
"learning_rate": 4.9823713853360294e-05,
"loss": 0.3899,
"step": 15500
},
{
"epoch": 1.5617374328940947,
"grad_norm": 0.40271782875061035,
"learning_rate": 4.981217160560499e-05,
"loss": 0.3812,
"step": 16000
},
{
"epoch": 1.610541727672035,
"grad_norm": 0.3701293170452118,
"learning_rate": 4.9800264805341694e-05,
"loss": 0.373,
"step": 16500
},
{
"epoch": 1.6593460224499756,
"grad_norm": 0.41362902522087097,
"learning_rate": 4.978799362751094e-05,
"loss": 0.3654,
"step": 17000
},
{
"epoch": 1.7081503172279162,
"grad_norm": 0.3652186989784241,
"learning_rate": 4.9775358252406836e-05,
"loss": 0.3581,
"step": 17500
},
{
"epoch": 1.7569546120058566,
"grad_norm": 0.366926908493042,
"learning_rate": 4.9762358865674464e-05,
"loss": 0.3515,
"step": 18000
},
{
"epoch": 1.805758906783797,
"grad_norm": 0.4293728470802307,
"learning_rate": 4.974899565830715e-05,
"loss": 0.3449,
"step": 18500
},
{
"epoch": 1.8545632015617375,
"grad_norm": 0.37214261293411255,
"learning_rate": 4.973526882664364e-05,
"loss": 0.3394,
"step": 19000
},
{
"epoch": 1.903367496339678,
"grad_norm": 0.4047256112098694,
"learning_rate": 4.9721178572365235e-05,
"loss": 0.3337,
"step": 19500
},
{
"epoch": 1.9521717911176184,
"grad_norm": 0.34720858931541443,
"learning_rate": 4.9706725102492814e-05,
"loss": 0.3287,
"step": 20000
},
{
"epoch": 2.0,
"eval_loss": 0.28212064504623413,
"eval_runtime": 23.8844,
"eval_samples_per_second": 329.755,
"eval_steps_per_second": 0.461,
"step": 20490
},
{
"epoch": 2.0009760858955588,
"grad_norm": 0.37098678946495056,
"learning_rate": 4.969190862938378e-05,
"loss": 0.3237,
"step": 20500
},
{
"epoch": 2.049780380673499,
"grad_norm": 0.3951970040798187,
"learning_rate": 4.967672937072898e-05,
"loss": 0.3191,
"step": 21000
},
{
"epoch": 2.09858467545144,
"grad_norm": 0.3509838581085205,
"learning_rate": 4.9661187549549476e-05,
"loss": 0.3144,
"step": 21500
},
{
"epoch": 2.1473889702293802,
"grad_norm": 0.35936230421066284,
"learning_rate": 4.9645283394193274e-05,
"loss": 0.3099,
"step": 22000
},
{
"epoch": 2.1961932650073206,
"grad_norm": 0.3251510560512543,
"learning_rate": 4.962901713833197e-05,
"loss": 0.3063,
"step": 22500
},
{
"epoch": 2.244997559785261,
"grad_norm": 0.33518901467323303,
"learning_rate": 4.9612389020957306e-05,
"loss": 0.3023,
"step": 23000
},
{
"epoch": 2.2938018545632017,
"grad_norm": 0.3487328886985779,
"learning_rate": 4.9595399286377686e-05,
"loss": 0.2985,
"step": 23500
},
{
"epoch": 2.342606149341142,
"grad_norm": 0.34018632769584656,
"learning_rate": 4.9578048184214565e-05,
"loss": 0.2952,
"step": 24000
},
{
"epoch": 2.3914104441190824,
"grad_norm": 0.34304648637771606,
"learning_rate": 4.956033596939879e-05,
"loss": 0.2915,
"step": 24500
},
{
"epoch": 2.440214738897023,
"grad_norm": 0.34716567397117615,
"learning_rate": 4.9542262902166834e-05,
"loss": 0.2883,
"step": 25000
},
{
"epoch": 2.4890190336749636,
"grad_norm": 0.3204454481601715,
"learning_rate": 4.952382924805702e-05,
"loss": 0.2853,
"step": 25500
},
{
"epoch": 2.537823328452904,
"grad_norm": 0.3337819278240204,
"learning_rate": 4.950503527790555e-05,
"loss": 0.2821,
"step": 26000
},
{
"epoch": 2.5866276232308443,
"grad_norm": 0.3394376039505005,
"learning_rate": 4.948588126784261e-05,
"loss": 0.2793,
"step": 26500
},
{
"epoch": 2.6354319180087846,
"grad_norm": 0.3065101206302643,
"learning_rate": 4.9466367499288213e-05,
"loss": 0.2767,
"step": 27000
},
{
"epoch": 2.6842362127867254,
"grad_norm": 0.30751967430114746,
"learning_rate": 4.9446494258948176e-05,
"loss": 0.2736,
"step": 27500
},
{
"epoch": 2.7330405075646658,
"grad_norm": 0.31060898303985596,
"learning_rate": 4.942626183880981e-05,
"loss": 0.2712,
"step": 28000
},
{
"epoch": 2.781844802342606,
"grad_norm": 0.38574928045272827,
"learning_rate": 4.940567053613768e-05,
"loss": 0.2688,
"step": 28500
},
{
"epoch": 2.8306490971205465,
"grad_norm": 0.31712907552719116,
"learning_rate": 4.938472065346925e-05,
"loss": 0.2669,
"step": 29000
},
{
"epoch": 2.879453391898487,
"grad_norm": 0.2964314818382263,
"learning_rate": 4.9363412498610385e-05,
"loss": 0.2641,
"step": 29500
},
{
"epoch": 2.9282576866764276,
"grad_norm": 0.30216559767723083,
"learning_rate": 4.934174638463087e-05,
"loss": 0.2616,
"step": 30000
},
{
"epoch": 2.977061981454368,
"grad_norm": 0.2843080461025238,
"learning_rate": 4.9319722629859813e-05,
"loss": 0.2598,
"step": 30500
},
{
"epoch": 3.0,
"eval_loss": 0.2259799689054489,
"eval_runtime": 24.7473,
"eval_samples_per_second": 318.256,
"eval_steps_per_second": 0.444,
"step": 30735
},
{
"epoch": 3.0258662762323083,
"grad_norm": 0.3090941905975342,
"learning_rate": 4.9297341557880936e-05,
"loss": 0.2577,
"step": 31000
},
{
"epoch": 3.074670571010249,
"grad_norm": 0.29751360416412354,
"learning_rate": 4.927460349752785e-05,
"loss": 0.2554,
"step": 31500
},
{
"epoch": 3.1234748657881894,
"grad_norm": 0.2908008396625519,
"learning_rate": 4.925150878287921e-05,
"loss": 0.2537,
"step": 32000
},
{
"epoch": 3.17227916056613,
"grad_norm": 0.29090872406959534,
"learning_rate": 4.92280577532538e-05,
"loss": 0.2518,
"step": 32500
},
{
"epoch": 3.22108345534407,
"grad_norm": 0.301048219203949,
"learning_rate": 4.9204250753205585e-05,
"loss": 0.2503,
"step": 33000
},
{
"epoch": 3.2698877501220105,
"grad_norm": 0.2861855924129486,
"learning_rate": 4.91800881325186e-05,
"loss": 0.2482,
"step": 33500
},
{
"epoch": 3.3186920448999513,
"grad_norm": 0.28286224603652954,
"learning_rate": 4.915557024620183e-05,
"loss": 0.2466,
"step": 34000
},
{
"epoch": 3.3674963396778916,
"grad_norm": 0.3069954514503479,
"learning_rate": 4.913069745448399e-05,
"loss": 0.2451,
"step": 34500
},
{
"epoch": 3.416300634455832,
"grad_norm": 0.2962004542350769,
"learning_rate": 4.910547012280827e-05,
"loss": 0.2436,
"step": 35000
},
{
"epoch": 3.465104929233773,
"grad_norm": 0.2845563590526581,
"learning_rate": 4.907988862182689e-05,
"loss": 0.2421,
"step": 35500
},
{
"epoch": 3.513909224011713,
"grad_norm": 0.26839151978492737,
"learning_rate": 4.905395332739574e-05,
"loss": 0.2406,
"step": 36000
},
{
"epoch": 3.5627135187896535,
"grad_norm": 0.27475783228874207,
"learning_rate": 4.902766462056877e-05,
"loss": 0.2389,
"step": 36500
},
{
"epoch": 3.611517813567594,
"grad_norm": 0.26468226313591003,
"learning_rate": 4.900102288759249e-05,
"loss": 0.2374,
"step": 37000
},
{
"epoch": 3.660322108345534,
"grad_norm": 0.276924729347229,
"learning_rate": 4.89740285199002e-05,
"loss": 0.2361,
"step": 37500
},
{
"epoch": 3.709126403123475,
"grad_norm": 0.2739529609680176,
"learning_rate": 4.894668191410629e-05,
"loss": 0.2348,
"step": 38000
},
{
"epoch": 3.7579306979014153,
"grad_norm": 0.26919183135032654,
"learning_rate": 4.8918983472000433e-05,
"loss": 0.2336,
"step": 38500
},
{
"epoch": 3.8067349926793557,
"grad_norm": 0.29099541902542114,
"learning_rate": 4.88909336005416e-05,
"loss": 0.2323,
"step": 39000
},
{
"epoch": 3.8555392874572965,
"grad_norm": 0.2892494797706604,
"learning_rate": 4.8862532711852184e-05,
"loss": 0.2308,
"step": 39500
},
{
"epoch": 3.904343582235237,
"grad_norm": 0.29746654629707336,
"learning_rate": 4.883378122321186e-05,
"loss": 0.2292,
"step": 40000
},
{
"epoch": 3.953147877013177,
"grad_norm": 0.26809337735176086,
"learning_rate": 4.8804679557051495e-05,
"loss": 0.2283,
"step": 40500
},
{
"epoch": 4.0,
"eval_loss": 0.19832605123519897,
"eval_runtime": 25.8272,
"eval_samples_per_second": 304.95,
"eval_steps_per_second": 0.426,
"step": 40980
},
{
"epoch": 4.0019521717911175,
"grad_norm": 0.2542949616909027,
"learning_rate": 4.877522814094696e-05,
"loss": 0.2272,
"step": 41000
},
{
"epoch": 4.050756466569058,
"grad_norm": 0.2937975525856018,
"learning_rate": 4.8745427407612776e-05,
"loss": 0.2258,
"step": 41500
},
{
"epoch": 4.099560761346998,
"grad_norm": 0.2632514536380768,
"learning_rate": 4.8715277794895855e-05,
"loss": 0.2256,
"step": 42000
},
{
"epoch": 4.148365056124939,
"grad_norm": 0.2573137879371643,
"learning_rate": 4.8684779745768974e-05,
"loss": 0.2237,
"step": 42500
},
{
"epoch": 4.19716935090288,
"grad_norm": 0.2653585970401764,
"learning_rate": 4.8653933708324325e-05,
"loss": 0.223,
"step": 43000
},
{
"epoch": 4.24597364568082,
"grad_norm": 0.25552433729171753,
"learning_rate": 4.862274013576691e-05,
"loss": 0.2218,
"step": 43500
},
{
"epoch": 4.2947779404587605,
"grad_norm": 0.2834942936897278,
"learning_rate": 4.859119948640789e-05,
"loss": 0.2211,
"step": 44000
},
{
"epoch": 4.343582235236701,
"grad_norm": 0.2516108751296997,
"learning_rate": 4.855931222365784e-05,
"loss": 0.2202,
"step": 44500
},
{
"epoch": 4.392386530014641,
"grad_norm": 0.301641583442688,
"learning_rate": 4.852707881601996e-05,
"loss": 0.2188,
"step": 45000
},
{
"epoch": 4.4411908247925815,
"grad_norm": 0.26468151807785034,
"learning_rate": 4.849449973708316e-05,
"loss": 0.2176,
"step": 45500
},
{
"epoch": 4.489995119570522,
"grad_norm": 0.274828165769577,
"learning_rate": 4.846157546551516e-05,
"loss": 0.2171,
"step": 46000
},
{
"epoch": 4.538799414348462,
"grad_norm": 0.27979806065559387,
"learning_rate": 4.842830648505535e-05,
"loss": 0.2161,
"step": 46500
},
{
"epoch": 4.5876037091264035,
"grad_norm": 0.26616737246513367,
"learning_rate": 4.839469328450783e-05,
"loss": 0.2149,
"step": 47000
},
{
"epoch": 4.636408003904344,
"grad_norm": 0.24560213088989258,
"learning_rate": 4.8360736357734083e-05,
"loss": 0.2145,
"step": 47500
},
{
"epoch": 4.685212298682284,
"grad_norm": 0.25653526186943054,
"learning_rate": 4.8326436203645833e-05,
"loss": 0.213,
"step": 48000
},
{
"epoch": 4.7340165934602245,
"grad_norm": 0.2549044191837311,
"learning_rate": 4.829179332619763e-05,
"loss": 0.2124,
"step": 48500
},
{
"epoch": 4.782820888238165,
"grad_norm": 0.24373945593833923,
"learning_rate": 4.8256808234379516e-05,
"loss": 0.2115,
"step": 49000
},
{
"epoch": 4.831625183016105,
"grad_norm": 0.24188542366027832,
"learning_rate": 4.822148144220948e-05,
"loss": 0.2104,
"step": 49500
},
{
"epoch": 4.880429477794046,
"grad_norm": 0.2541993260383606,
"learning_rate": 4.8185813468725974e-05,
"loss": 0.2102,
"step": 50000
},
{
"epoch": 4.929233772571987,
"grad_norm": 0.266525536775589,
"learning_rate": 4.814980483798022e-05,
"loss": 0.2092,
"step": 50500
},
{
"epoch": 4.978038067349927,
"grad_norm": 0.24894855916500092,
"learning_rate": 4.811345607902855e-05,
"loss": 0.2084,
"step": 51000
},
{
"epoch": 5.0,
"eval_loss": 0.18130482733249664,
"eval_runtime": 23.9311,
"eval_samples_per_second": 329.111,
"eval_steps_per_second": 0.46,
"step": 51225
},
{
"epoch": 5.0268423621278675,
"grad_norm": 0.23973380029201508,
"learning_rate": 4.8076767725924654e-05,
"loss": 0.2076,
"step": 51500
},
{
"epoch": 5.075646656905808,
"grad_norm": 0.23818284273147583,
"learning_rate": 4.803974031771166e-05,
"loss": 0.2067,
"step": 52000
},
{
"epoch": 5.124450951683748,
"grad_norm": 0.23774628341197968,
"learning_rate": 4.8002374398414295e-05,
"loss": 0.2061,
"step": 52500
},
{
"epoch": 5.1732552464616886,
"grad_norm": 0.2544199824333191,
"learning_rate": 4.796467051703083e-05,
"loss": 0.2051,
"step": 53000
},
{
"epoch": 5.222059541239629,
"grad_norm": 0.24035200476646423,
"learning_rate": 4.7926629227525066e-05,
"loss": 0.2042,
"step": 53500
},
{
"epoch": 5.270863836017569,
"grad_norm": 0.25180783867836,
"learning_rate": 4.788825108881814e-05,
"loss": 0.2037,
"step": 54000
},
{
"epoch": 5.31966813079551,
"grad_norm": 0.25087398290634155,
"learning_rate": 4.7849536664780346e-05,
"loss": 0.2032,
"step": 54500
},
{
"epoch": 5.368472425573451,
"grad_norm": 0.2356226146221161,
"learning_rate": 4.7810486524222885e-05,
"loss": 0.2024,
"step": 55000
},
{
"epoch": 5.417276720351391,
"grad_norm": 0.25190770626068115,
"learning_rate": 4.777110124088942e-05,
"loss": 0.2019,
"step": 55500
},
{
"epoch": 5.4660810151293315,
"grad_norm": 0.24268530309200287,
"learning_rate": 4.77313813934477e-05,
"loss": 0.2011,
"step": 56000
},
{
"epoch": 5.514885309907272,
"grad_norm": 0.23932518064975739,
"learning_rate": 4.7691327565481095e-05,
"loss": 0.2005,
"step": 56500
},
{
"epoch": 5.563689604685212,
"grad_norm": 0.23731377720832825,
"learning_rate": 4.765094034547992e-05,
"loss": 0.1996,
"step": 57000
},
{
"epoch": 5.612493899463153,
"grad_norm": 0.2333805412054062,
"learning_rate": 4.76102203268329e-05,
"loss": 0.1989,
"step": 57500
},
{
"epoch": 5.661298194241093,
"grad_norm": 0.24407994747161865,
"learning_rate": 4.756916810781838e-05,
"loss": 0.1987,
"step": 58000
},
{
"epoch": 5.710102489019034,
"grad_norm": 0.23789800703525543,
"learning_rate": 4.752778429159554e-05,
"loss": 0.1979,
"step": 58500
},
{
"epoch": 5.7589067837969745,
"grad_norm": 0.24565084278583527,
"learning_rate": 4.7486069486195564e-05,
"loss": 0.1969,
"step": 59000
},
{
"epoch": 5.807711078574915,
"grad_norm": 0.26797595620155334,
"learning_rate": 4.744402430451269e-05,
"loss": 0.1965,
"step": 59500
},
{
"epoch": 5.856515373352855,
"grad_norm": 0.25408676266670227,
"learning_rate": 4.74016493642952e-05,
"loss": 0.1955,
"step": 60000
},
{
"epoch": 5.905319668130796,
"grad_norm": 0.23447421193122864,
"learning_rate": 4.7358945288136344e-05,
"loss": 0.1949,
"step": 60500
},
{
"epoch": 5.954123962908736,
"grad_norm": 0.2329121083021164,
"learning_rate": 4.7315912703465225e-05,
"loss": 0.1948,
"step": 61000
},
{
"epoch": 6.0,
"eval_loss": 0.1711394339799881,
"eval_runtime": 27.596,
"eval_samples_per_second": 285.403,
"eval_steps_per_second": 0.399,
"step": 61470
},
{
"epoch": 6.002928257686676,
"grad_norm": 0.2361510992050171,
"learning_rate": 4.727255224253751e-05,
"loss": 0.1941,
"step": 61500
},
{
"epoch": 6.051732552464617,
"grad_norm": 0.23526506125926971,
"learning_rate": 4.7228864542426224e-05,
"loss": 0.1934,
"step": 62000
},
{
"epoch": 6.100536847242557,
"grad_norm": 0.24888668954372406,
"learning_rate": 4.7184850245012316e-05,
"loss": 0.1928,
"step": 62500
},
{
"epoch": 6.149341142020498,
"grad_norm": 0.24024108052253723,
"learning_rate": 4.714050999697528e-05,
"loss": 0.1924,
"step": 63000
},
{
"epoch": 6.1981454367984385,
"grad_norm": 0.24707584083080292,
"learning_rate": 4.709584444978364e-05,
"loss": 0.192,
"step": 63500
},
{
"epoch": 6.246949731576379,
"grad_norm": 0.2352433204650879,
"learning_rate": 4.705085425968536e-05,
"loss": 0.1915,
"step": 64000
},
{
"epoch": 6.295754026354319,
"grad_norm": 0.24224288761615753,
"learning_rate": 4.700554008769823e-05,
"loss": 0.1907,
"step": 64500
},
{
"epoch": 6.34455832113226,
"grad_norm": 0.2216614931821823,
"learning_rate": 4.6959902599600125e-05,
"loss": 0.1902,
"step": 65000
},
{
"epoch": 6.3933626159102,
"grad_norm": 0.22495177388191223,
"learning_rate": 4.691394246591925e-05,
"loss": 0.1899,
"step": 65500
},
{
"epoch": 6.44216691068814,
"grad_norm": 0.22609297931194305,
"learning_rate": 4.686766036192426e-05,
"loss": 0.1891,
"step": 66000
},
{
"epoch": 6.490971205466081,
"grad_norm": 0.24654404819011688,
"learning_rate": 4.682105696761436e-05,
"loss": 0.1889,
"step": 66500
},
{
"epoch": 6.539775500244021,
"grad_norm": 0.2228369563817978,
"learning_rate": 4.6774132967709336e-05,
"loss": 0.1881,
"step": 67000
},
{
"epoch": 6.588579795021962,
"grad_norm": 0.21981576085090637,
"learning_rate": 4.6726889051639436e-05,
"loss": 0.1878,
"step": 67500
},
{
"epoch": 6.637384089799903,
"grad_norm": 0.22510704398155212,
"learning_rate": 4.6679325913535266e-05,
"loss": 0.1871,
"step": 68000
},
{
"epoch": 6.686188384577843,
"grad_norm": 0.24267421662807465,
"learning_rate": 4.663144425221763e-05,
"loss": 0.1867,
"step": 68500
},
{
"epoch": 6.734992679355783,
"grad_norm": 0.2170720249414444,
"learning_rate": 4.65832447711872e-05,
"loss": 0.1862,
"step": 69000
},
{
"epoch": 6.783796974133724,
"grad_norm": 0.25550180673599243,
"learning_rate": 4.653472817861425e-05,
"loss": 0.1857,
"step": 69500
},
{
"epoch": 6.832601268911664,
"grad_norm": 0.23408746719360352,
"learning_rate": 4.648589518732815e-05,
"loss": 0.1853,
"step": 70000
},
{
"epoch": 6.881405563689604,
"grad_norm": 0.26076194643974304,
"learning_rate": 4.6436746514807e-05,
"loss": 0.1849,
"step": 70500
},
{
"epoch": 6.930209858467546,
"grad_norm": 0.21694616973400116,
"learning_rate": 4.638728288316704e-05,
"loss": 0.184,
"step": 71000
},
{
"epoch": 6.979014153245486,
"grad_norm": 0.21888791024684906,
"learning_rate": 4.633750501915203e-05,
"loss": 0.184,
"step": 71500
},
{
"epoch": 7.0,
"eval_loss": 0.16187380254268646,
"eval_runtime": 26.1,
"eval_samples_per_second": 301.762,
"eval_steps_per_second": 0.421,
"step": 71715
},
{
"epoch": 7.027818448023426,
"grad_norm": 0.22506974637508392,
"learning_rate": 4.628741365412258e-05,
"loss": 0.1836,
"step": 72000
},
{
"epoch": 7.076622742801367,
"grad_norm": 0.21344968676567078,
"learning_rate": 4.623700952404542e-05,
"loss": 0.1832,
"step": 72500
},
{
"epoch": 7.125427037579307,
"grad_norm": 0.22301891446113586,
"learning_rate": 4.618629336948258e-05,
"loss": 0.1826,
"step": 73000
},
{
"epoch": 7.174231332357247,
"grad_norm": 0.2228812873363495,
"learning_rate": 4.6135265935580494e-05,
"loss": 0.182,
"step": 73500
},
{
"epoch": 7.223035627135188,
"grad_norm": 0.24568694829940796,
"learning_rate": 4.6083927972059084e-05,
"loss": 0.1814,
"step": 74000
},
{
"epoch": 7.271839921913128,
"grad_norm": 0.23808668553829193,
"learning_rate": 4.603228023320069e-05,
"loss": 0.1816,
"step": 74500
},
{
"epoch": 7.320644216691068,
"grad_norm": 0.21967822313308716,
"learning_rate": 4.598032347783905e-05,
"loss": 0.1809,
"step": 75000
},
{
"epoch": 7.36944851146901,
"grad_norm": 0.20847086608409882,
"learning_rate": 4.5928058469348115e-05,
"loss": 0.1806,
"step": 75500
},
{
"epoch": 7.41825280624695,
"grad_norm": 0.22811928391456604,
"learning_rate": 4.587548597563084e-05,
"loss": 0.18,
"step": 76000
},
{
"epoch": 7.46705710102489,
"grad_norm": 0.22424574196338654,
"learning_rate": 4.582260676910791e-05,
"loss": 0.1794,
"step": 76500
},
{
"epoch": 7.515861395802831,
"grad_norm": 0.22317995131015778,
"learning_rate": 4.5769421626706376e-05,
"loss": 0.1793,
"step": 77000
},
{
"epoch": 7.564665690580771,
"grad_norm": 0.21519626677036285,
"learning_rate": 4.571593132984825e-05,
"loss": 0.1789,
"step": 77500
},
{
"epoch": 7.613469985358711,
"grad_norm": 0.2195836454629898,
"learning_rate": 4.566213666443901e-05,
"loss": 0.1784,
"step": 78000
},
{
"epoch": 7.662274280136652,
"grad_norm": 0.23087261617183685,
"learning_rate": 4.56080384208561e-05,
"loss": 0.1778,
"step": 78500
},
{
"epoch": 7.711078574914593,
"grad_norm": 0.2173396646976471,
"learning_rate": 4.5553637393937234e-05,
"loss": 0.1777,
"step": 79000
},
{
"epoch": 7.759882869692533,
"grad_norm": 0.22740761935710907,
"learning_rate": 4.54989343829688e-05,
"loss": 0.1774,
"step": 79500
},
{
"epoch": 7.808687164470474,
"grad_norm": 0.20074845850467682,
"learning_rate": 4.544393019167408e-05,
"loss": 0.1768,
"step": 80000
},
{
"epoch": 7.857491459248414,
"grad_norm": 0.21903088688850403,
"learning_rate": 4.538862562820143e-05,
"loss": 0.1766,
"step": 80500
},
{
"epoch": 7.906295754026354,
"grad_norm": 0.21944737434387207,
"learning_rate": 4.533302150511243e-05,
"loss": 0.1763,
"step": 81000
},
{
"epoch": 7.955100048804295,
"grad_norm": 0.22298942506313324,
"learning_rate": 4.5277118639369935e-05,
"loss": 0.1758,
"step": 81500
},
{
"epoch": 8.0,
"eval_loss": 0.15350797772407532,
"eval_runtime": 25.6287,
"eval_samples_per_second": 307.312,
"eval_steps_per_second": 0.429,
"step": 81960
},
{
"epoch": 8.003904343582235,
"grad_norm": 0.22214815020561218,
"learning_rate": 4.5220917852326076e-05,
"loss": 0.1758,
"step": 82000
},
{
"epoch": 8.052708638360176,
"grad_norm": 0.22404730319976807,
"learning_rate": 4.516441996971018e-05,
"loss": 0.1751,
"step": 82500
},
{
"epoch": 8.101512933138116,
"grad_norm": 0.21983228623867035,
"learning_rate": 4.510762582161664e-05,
"loss": 0.1747,
"step": 83000
},
{
"epoch": 8.150317227916057,
"grad_norm": 0.23077502846717834,
"learning_rate": 4.5050536242492756e-05,
"loss": 0.1745,
"step": 83500
},
{
"epoch": 8.199121522693996,
"grad_norm": 0.21954509615898132,
"learning_rate": 4.4993152071126424e-05,
"loss": 0.174,
"step": 84000
},
{
"epoch": 8.247925817471938,
"grad_norm": 0.2204139679670334,
"learning_rate": 4.493547415063382e-05,
"loss": 0.1739,
"step": 84500
},
{
"epoch": 8.296730112249879,
"grad_norm": 0.2210853546857834,
"learning_rate": 4.487750332844704e-05,
"loss": 0.1736,
"step": 85000
},
{
"epoch": 8.345534407027818,
"grad_norm": 0.21140769124031067,
"learning_rate": 4.4819240456301645e-05,
"loss": 0.1732,
"step": 85500
},
{
"epoch": 8.39433870180576,
"grad_norm": 0.22270390391349792,
"learning_rate": 4.476068639022412e-05,
"loss": 0.1726,
"step": 86000
},
{
"epoch": 8.443142996583699,
"grad_norm": 0.2249361127614975,
"learning_rate": 4.4701841990519324e-05,
"loss": 0.1724,
"step": 86500
},
{
"epoch": 8.49194729136164,
"grad_norm": 0.21904852986335754,
"learning_rate": 4.4642708121757815e-05,
"loss": 0.1723,
"step": 87000
},
{
"epoch": 8.54075158613958,
"grad_norm": 0.21276357769966125,
"learning_rate": 4.45832856527632e-05,
"loss": 0.1717,
"step": 87500
},
{
"epoch": 8.589555880917521,
"grad_norm": 0.21569614112377167,
"learning_rate": 4.452357545659934e-05,
"loss": 0.1714,
"step": 88000
},
{
"epoch": 8.63836017569546,
"grad_norm": 0.21162466704845428,
"learning_rate": 4.446357841055749e-05,
"loss": 0.171,
"step": 88500
},
{
"epoch": 8.687164470473402,
"grad_norm": 0.2211264669895172,
"learning_rate": 4.4403295396143495e-05,
"loss": 0.1709,
"step": 89000
},
{
"epoch": 8.735968765251343,
"grad_norm": 0.20906701683998108,
"learning_rate": 4.434272729906475e-05,
"loss": 0.1707,
"step": 89500
},
{
"epoch": 8.784773060029282,
"grad_norm": 0.2192634642124176,
"learning_rate": 4.428187500921721e-05,
"loss": 0.1701,
"step": 90000
},
{
"epoch": 8.833577354807224,
"grad_norm": 0.2148887813091278,
"learning_rate": 4.4220739420672376e-05,
"loss": 0.1697,
"step": 90500
},
{
"epoch": 8.882381649585163,
"grad_norm": 0.20213574171066284,
"learning_rate": 4.4159321431664084e-05,
"loss": 0.1695,
"step": 91000
},
{
"epoch": 8.931185944363104,
"grad_norm": 0.21166318655014038,
"learning_rate": 4.4097621944575324e-05,
"loss": 0.1695,
"step": 91500
},
{
"epoch": 8.979990239141044,
"grad_norm": 0.2028771936893463,
"learning_rate": 4.4035641865925015e-05,
"loss": 0.1693,
"step": 92000
},
{
"epoch": 9.0,
"eval_loss": 0.15039320290088654,
"eval_runtime": 24.9603,
"eval_samples_per_second": 315.541,
"eval_steps_per_second": 0.441,
"step": 92205
},
{
"epoch": 9.028794533918985,
"grad_norm": 0.20694176852703094,
"learning_rate": 4.3973382106354655e-05,
"loss": 0.1686,
"step": 92500
},
{
"epoch": 9.077598828696924,
"grad_norm": 0.21907255053520203,
"learning_rate": 4.391084358061494e-05,
"loss": 0.1684,
"step": 93000
},
{
"epoch": 9.126403123474866,
"grad_norm": 0.21821749210357666,
"learning_rate": 4.3848027207552364e-05,
"loss": 0.1683,
"step": 93500
},
{
"epoch": 9.175207418252807,
"grad_norm": 0.20274536311626434,
"learning_rate": 4.3784933910095646e-05,
"loss": 0.1677,
"step": 94000
},
{
"epoch": 9.224011713030746,
"grad_norm": 0.20460249483585358,
"learning_rate": 4.372156461524226e-05,
"loss": 0.1676,
"step": 94500
},
{
"epoch": 9.272816007808688,
"grad_norm": 0.21497923135757446,
"learning_rate": 4.3657920254044726e-05,
"loss": 0.1673,
"step": 95000
},
{
"epoch": 9.321620302586627,
"grad_norm": 0.20720575749874115,
"learning_rate": 4.3594001761597e-05,
"loss": 0.1673,
"step": 95500
},
{
"epoch": 9.370424597364568,
"grad_norm": 0.22322164475917816,
"learning_rate": 4.352981007702071e-05,
"loss": 0.1668,
"step": 96000
},
{
"epoch": 9.419228892142508,
"grad_norm": 0.20235677063465118,
"learning_rate": 4.346534614345132e-05,
"loss": 0.1665,
"step": 96500
},
{
"epoch": 9.468033186920449,
"grad_norm": 0.20581580698490143,
"learning_rate": 4.340061090802436e-05,
"loss": 0.1663,
"step": 97000
},
{
"epoch": 9.51683748169839,
"grad_norm": 0.2083093822002411,
"learning_rate": 4.333560532186142e-05,
"loss": 0.166,
"step": 97500
},
{
"epoch": 9.56564177647633,
"grad_norm": 0.20584595203399658,
"learning_rate": 4.327033034005622e-05,
"loss": 0.1657,
"step": 98000
},
{
"epoch": 9.614446071254271,
"grad_norm": 0.20942457020282745,
"learning_rate": 4.320478692166059e-05,
"loss": 0.1656,
"step": 98500
},
{
"epoch": 9.66325036603221,
"grad_norm": 0.20925435423851013,
"learning_rate": 4.313897602967034e-05,
"loss": 0.1654,
"step": 99000
},
{
"epoch": 9.712054660810152,
"grad_norm": 0.22049732506275177,
"learning_rate": 4.307289863101116e-05,
"loss": 0.165,
"step": 99500
},
{
"epoch": 9.760858955588091,
"grad_norm": 0.20315922796726227,
"learning_rate": 4.300655569652437e-05,
"loss": 0.1646,
"step": 100000
},
{
"epoch": 9.809663250366032,
"grad_norm": 0.20489932596683502,
"learning_rate": 4.293994820095264e-05,
"loss": 0.1643,
"step": 100500
},
{
"epoch": 9.858467545143974,
"grad_norm": 0.218128502368927,
"learning_rate": 4.287307712292576e-05,
"loss": 0.1643,
"step": 101000
},
{
"epoch": 9.907271839921913,
"grad_norm": 0.20896770060062408,
"learning_rate": 4.280594344494617e-05,
"loss": 0.164,
"step": 101500
},
{
"epoch": 9.956076134699854,
"grad_norm": 0.20507818460464478,
"learning_rate": 4.273854815337455e-05,
"loss": 0.1636,
"step": 102000
},
{
"epoch": 10.0,
"eval_loss": 0.14604029059410095,
"eval_runtime": 23.6994,
"eval_samples_per_second": 332.329,
"eval_steps_per_second": 0.464,
"step": 102450
},
{
"epoch": 10.004880429477794,
"grad_norm": 0.20058345794677734,
"learning_rate": 4.267089223841534e-05,
"loss": 0.1636,
"step": 102500
},
{
"epoch": 10.053684724255735,
"grad_norm": 0.2024383842945099,
"learning_rate": 4.2602976694102205e-05,
"loss": 0.1632,
"step": 103000
},
{
"epoch": 10.102489019033674,
"grad_norm": 0.21127928793430328,
"learning_rate": 4.253480251828337e-05,
"loss": 0.1629,
"step": 103500
},
{
"epoch": 10.151293313811616,
"grad_norm": 0.19965404272079468,
"learning_rate": 4.246637071260705e-05,
"loss": 0.1629,
"step": 104000
},
{
"epoch": 10.200097608589555,
"grad_norm": 0.20860032737255096,
"learning_rate": 4.239768228250664e-05,
"loss": 0.1624,
"step": 104500
},
{
"epoch": 10.248901903367496,
"grad_norm": 0.21451444923877716,
"learning_rate": 4.232873823718602e-05,
"loss": 0.1624,
"step": 105000
},
{
"epoch": 10.297706198145438,
"grad_norm": 0.21074171364307404,
"learning_rate": 4.225953958960466e-05,
"loss": 0.1623,
"step": 105500
},
{
"epoch": 10.346510492923377,
"grad_norm": 0.21716845035552979,
"learning_rate": 4.21900873564628e-05,
"loss": 0.1617,
"step": 106000
},
{
"epoch": 10.395314787701318,
"grad_norm": 0.21059440076351166,
"learning_rate": 4.2120382558186474e-05,
"loss": 0.1617,
"step": 106500
},
{
"epoch": 10.444119082479258,
"grad_norm": 0.22244805097579956,
"learning_rate": 4.205042621891251e-05,
"loss": 0.1614,
"step": 107000
},
{
"epoch": 10.492923377257199,
"grad_norm": 0.21420615911483765,
"learning_rate": 4.1980219366473514e-05,
"loss": 0.1611,
"step": 107500
},
{
"epoch": 10.541727672035138,
"grad_norm": 0.2058490365743637,
"learning_rate": 4.1909763032382756e-05,
"loss": 0.161,
"step": 108000
},
{
"epoch": 10.59053196681308,
"grad_norm": 0.20425471663475037,
"learning_rate": 4.1839058251819e-05,
"loss": 0.1609,
"step": 108500
},
{
"epoch": 10.63933626159102,
"grad_norm": 0.20022732019424438,
"learning_rate": 4.176810606361132e-05,
"loss": 0.1606,
"step": 109000
},
{
"epoch": 10.68814055636896,
"grad_norm": 0.20972158014774323,
"learning_rate": 4.169690751022382e-05,
"loss": 0.1604,
"step": 109500
},
{
"epoch": 10.736944851146902,
"grad_norm": 0.20773041248321533,
"learning_rate": 4.1625463637740297e-05,
"loss": 0.1602,
"step": 110000
},
{
"epoch": 10.785749145924841,
"grad_norm": 0.2000124752521515,
"learning_rate": 4.1553775495848934e-05,
"loss": 0.1601,
"step": 110500
},
{
"epoch": 10.834553440702782,
"grad_norm": 0.21309678256511688,
"learning_rate": 4.148184413782682e-05,
"loss": 0.1597,
"step": 111000
},
{
"epoch": 10.883357735480722,
"grad_norm": 0.2132243663072586,
"learning_rate": 4.14096706205245e-05,
"loss": 0.1597,
"step": 111500
},
{
"epoch": 10.932162030258663,
"grad_norm": 0.20744946599006653,
"learning_rate": 4.133725600435042e-05,
"loss": 0.1596,
"step": 112000
},
{
"epoch": 10.980966325036603,
"grad_norm": 0.20575416088104248,
"learning_rate": 4.12646013532554e-05,
"loss": 0.159,
"step": 112500
},
{
"epoch": 11.0,
"eval_loss": 0.13835683465003967,
"eval_runtime": 27.3854,
"eval_samples_per_second": 287.598,
"eval_steps_per_second": 0.402,
"step": 112695
},
{
"epoch": 11.029770619814544,
"grad_norm": 0.2070922553539276,
"learning_rate": 4.119170773471695e-05,
"loss": 0.1589,
"step": 113000
},
{
"epoch": 11.078574914592485,
"grad_norm": 0.20478574931621552,
"learning_rate": 4.11185762197236e-05,
"loss": 0.1586,
"step": 113500
},
{
"epoch": 11.127379209370424,
"grad_norm": 0.1970217078924179,
"learning_rate": 4.104520788275921e-05,
"loss": 0.1586,
"step": 114000
},
{
"epoch": 11.176183504148366,
"grad_norm": 0.19945302605628967,
"learning_rate": 4.097160380178707e-05,
"loss": 0.1582,
"step": 114500
},
{
"epoch": 11.224987798926305,
"grad_norm": 0.19257070124149323,
"learning_rate": 4.0897765058234224e-05,
"loss": 0.1581,
"step": 115000
},
{
"epoch": 11.273792093704246,
"grad_norm": 0.2013574242591858,
"learning_rate": 4.082369273697542e-05,
"loss": 0.158,
"step": 115500
},
{
"epoch": 11.322596388482186,
"grad_norm": 0.21071788668632507,
"learning_rate": 4.0749387926317295e-05,
"loss": 0.1575,
"step": 116000
},
{
"epoch": 11.371400683260127,
"grad_norm": 0.2010817974805832,
"learning_rate": 4.0674851717982286e-05,
"loss": 0.1574,
"step": 116500
},
{
"epoch": 11.420204978038067,
"grad_norm": 0.20782026648521423,
"learning_rate": 4.0600085207092695e-05,
"loss": 0.1573,
"step": 117000
},
{
"epoch": 11.469009272816008,
"grad_norm": 0.2070448100566864,
"learning_rate": 4.052508949215447e-05,
"loss": 0.1573,
"step": 117500
},
{
"epoch": 11.517813567593949,
"grad_norm": 0.2066112607717514,
"learning_rate": 4.044986567504121e-05,
"loss": 0.1571,
"step": 118000
},
{
"epoch": 11.566617862371888,
"grad_norm": 0.20482249557971954,
"learning_rate": 4.037441486097785e-05,
"loss": 0.1568,
"step": 118500
},
{
"epoch": 11.61542215714983,
"grad_norm": 0.20141823589801788,
"learning_rate": 4.02987381585245e-05,
"loss": 0.1568,
"step": 119000
},
{
"epoch": 11.66422645192777,
"grad_norm": 0.20818044245243073,
"learning_rate": 4.02228366795601e-05,
"loss": 0.1565,
"step": 119500
},
{
"epoch": 11.71303074670571,
"grad_norm": 0.20303422212600708,
"learning_rate": 4.014671153926619e-05,
"loss": 0.1562,
"step": 120000
},
{
"epoch": 11.76183504148365,
"grad_norm": 0.19013996422290802,
"learning_rate": 4.007036385611036e-05,
"loss": 0.156,
"step": 120500
},
{
"epoch": 11.810639336261591,
"grad_norm": 0.20407438278198242,
"learning_rate": 3.999379475182996e-05,
"loss": 0.1562,
"step": 121000
},
{
"epoch": 11.859443631039532,
"grad_norm": 0.1977386772632599,
"learning_rate": 3.991700535141556e-05,
"loss": 0.1556,
"step": 121500
},
{
"epoch": 11.908247925817472,
"grad_norm": 0.19012510776519775,
"learning_rate": 3.9839996783094435e-05,
"loss": 0.1555,
"step": 122000
},
{
"epoch": 11.957052220595413,
"grad_norm": 0.20828774571418762,
"learning_rate": 3.976277017831396e-05,
"loss": 0.1553,
"step": 122500
},
{
"epoch": 12.0,
"eval_loss": 0.13950450718402863,
"eval_runtime": 28.1831,
"eval_samples_per_second": 279.458,
"eval_steps_per_second": 0.39,
"step": 122940
},
{
"epoch": 12.005856515373353,
"grad_norm": 0.19804109632968903,
"learning_rate": 3.968532667172501e-05,
"loss": 0.1552,
"step": 123000
},
{
"epoch": 12.054660810151294,
"grad_norm": 0.2035941481590271,
"learning_rate": 3.960766740116531e-05,
"loss": 0.1549,
"step": 123500
},
{
"epoch": 12.103465104929233,
"grad_norm": 0.20041148364543915,
"learning_rate": 3.952979350764268e-05,
"loss": 0.1547,
"step": 124000
},
{
"epoch": 12.152269399707174,
"grad_norm": 0.19230812788009644,
"learning_rate": 3.945170613531828e-05,
"loss": 0.1548,
"step": 124500
},
{
"epoch": 12.201073694485114,
"grad_norm": 0.2065581977367401,
"learning_rate": 3.9373406431489826e-05,
"loss": 0.1544,
"step": 125000
},
{
"epoch": 12.249877989263055,
"grad_norm": 0.19001494348049164,
"learning_rate": 3.929489554657466e-05,
"loss": 0.1543,
"step": 125500
},
{
"epoch": 12.298682284040996,
"grad_norm": 0.20618636906147003,
"learning_rate": 3.921617463409298e-05,
"loss": 0.1537,
"step": 126000
},
{
"epoch": 12.347486578818936,
"grad_norm": 0.1987367868423462,
"learning_rate": 3.913724485065074e-05,
"loss": 0.1542,
"step": 126500
},
{
"epoch": 12.396290873596877,
"grad_norm": 0.1950555443763733,
"learning_rate": 3.905810735592276e-05,
"loss": 0.1537,
"step": 127000
},
{
"epoch": 12.445095168374817,
"grad_norm": 0.20843225717544556,
"learning_rate": 3.8978763312635645e-05,
"loss": 0.1535,
"step": 127500
},
{
"epoch": 12.493899463152758,
"grad_norm": 0.19434267282485962,
"learning_rate": 3.889921388655073e-05,
"loss": 0.1535,
"step": 128000
},
{
"epoch": 12.542703757930697,
"grad_norm": 0.19898554682731628,
"learning_rate": 3.881946024644691e-05,
"loss": 0.1533,
"step": 128500
},
{
"epoch": 12.591508052708638,
"grad_norm": 0.19874414801597595,
"learning_rate": 3.873950356410352e-05,
"loss": 0.1534,
"step": 129000
},
{
"epoch": 12.640312347486578,
"grad_norm": 0.19424794614315033,
"learning_rate": 3.865934501428304e-05,
"loss": 0.1528,
"step": 129500
},
{
"epoch": 12.68911664226452,
"grad_norm": 0.19256962835788727,
"learning_rate": 3.8578985774713955e-05,
"loss": 0.153,
"step": 130000
},
{
"epoch": 12.73792093704246,
"grad_norm": 0.21424148976802826,
"learning_rate": 3.8498427026073325e-05,
"loss": 0.1527,
"step": 130500
},
{
"epoch": 12.7867252318204,
"grad_norm": 0.20375344157218933,
"learning_rate": 3.841766995196951e-05,
"loss": 0.1526,
"step": 131000
},
{
"epoch": 12.835529526598341,
"grad_norm": 0.2020910084247589,
"learning_rate": 3.8336715738924787e-05,
"loss": 0.1522,
"step": 131500
},
{
"epoch": 12.88433382137628,
"grad_norm": 0.21570877730846405,
"learning_rate": 3.825556557635787e-05,
"loss": 0.1522,
"step": 132000
},
{
"epoch": 12.933138116154222,
"grad_norm": 0.202886700630188,
"learning_rate": 3.817422065656645e-05,
"loss": 0.1522,
"step": 132500
},
{
"epoch": 12.981942410932161,
"grad_norm": 0.19793546199798584,
"learning_rate": 3.809268217470971e-05,
"loss": 0.1519,
"step": 133000
},
{
"epoch": 13.0,
"eval_loss": 0.13297139108181,
"eval_runtime": 27.6372,
"eval_samples_per_second": 284.978,
"eval_steps_per_second": 0.398,
"step": 133185
},
{
"epoch": 13.030746705710103,
"grad_norm": 0.19757746160030365,
"learning_rate": 3.8010951328790745e-05,
"loss": 0.1519,
"step": 133500
},
{
"epoch": 13.079551000488044,
"grad_norm": 0.1974940001964569,
"learning_rate": 3.792902931963893e-05,
"loss": 0.1515,
"step": 134000
},
{
"epoch": 13.128355295265983,
"grad_norm": 0.19320930540561676,
"learning_rate": 3.784691735089232e-05,
"loss": 0.1517,
"step": 134500
},
{
"epoch": 13.177159590043924,
"grad_norm": 0.2007361203432083,
"learning_rate": 3.776461662897995e-05,
"loss": 0.1513,
"step": 135000
},
{
"epoch": 13.225963884821864,
"grad_norm": 0.1926342397928238,
"learning_rate": 3.76821283631041e-05,
"loss": 0.1514,
"step": 135500
},
{
"epoch": 13.274768179599805,
"grad_norm": 0.18830719590187073,
"learning_rate": 3.759945376522254e-05,
"loss": 0.1512,
"step": 136000
},
{
"epoch": 13.323572474377745,
"grad_norm": 0.1940852552652359,
"learning_rate": 3.7516594050030715e-05,
"loss": 0.151,
"step": 136500
},
{
"epoch": 13.372376769155686,
"grad_norm": 0.1951226443052292,
"learning_rate": 3.7433550434943934e-05,
"loss": 0.1508,
"step": 137000
},
{
"epoch": 13.421181063933625,
"grad_norm": 0.18908989429473877,
"learning_rate": 3.735032414007941e-05,
"loss": 0.1505,
"step": 137500
},
{
"epoch": 13.469985358711567,
"grad_norm": 0.19911529123783112,
"learning_rate": 3.7266916388238396e-05,
"loss": 0.1503,
"step": 138000
},
{
"epoch": 13.518789653489508,
"grad_norm": 0.20053178071975708,
"learning_rate": 3.718332840488821e-05,
"loss": 0.1504,
"step": 138500
},
{
"epoch": 13.567593948267447,
"grad_norm": 0.19537031650543213,
"learning_rate": 3.70995614181442e-05,
"loss": 0.1502,
"step": 139000
},
{
"epoch": 13.616398243045388,
"grad_norm": 0.19510440528392792,
"learning_rate": 3.7015616658751715e-05,
"loss": 0.1503,
"step": 139500
},
{
"epoch": 13.665202537823328,
"grad_norm": 0.196214497089386,
"learning_rate": 3.693149536006807e-05,
"loss": 0.1499,
"step": 140000
},
{
"epoch": 13.71400683260127,
"grad_norm": 0.1952546089887619,
"learning_rate": 3.6847198758044326e-05,
"loss": 0.1499,
"step": 140500
},
{
"epoch": 13.762811127379209,
"grad_norm": 0.19812558591365814,
"learning_rate": 3.6762728091207216e-05,
"loss": 0.1498,
"step": 141000
},
{
"epoch": 13.81161542215715,
"grad_norm": 0.18906739354133606,
"learning_rate": 3.66780846006409e-05,
"loss": 0.1493,
"step": 141500
},
{
"epoch": 13.860419716935091,
"grad_norm": 0.20462313294410706,
"learning_rate": 3.659326952996879e-05,
"loss": 0.1494,
"step": 142000
},
{
"epoch": 13.90922401171303,
"grad_norm": 0.1982060968875885,
"learning_rate": 3.650828412533519e-05,
"loss": 0.1493,
"step": 142500
},
{
"epoch": 13.958028306490972,
"grad_norm": 0.19797129929065704,
"learning_rate": 3.6423129635387033e-05,
"loss": 0.1494,
"step": 143000
},
{
"epoch": 14.0,
"eval_loss": 0.13158732652664185,
"eval_runtime": 27.7401,
"eval_samples_per_second": 283.921,
"eval_steps_per_second": 0.397,
"step": 143430
},
{
"epoch": 14.006832601268911,
"grad_norm": 0.19103878736495972,
"learning_rate": 3.6337807311255574e-05,
"loss": 0.149,
"step": 143500
},
{
"epoch": 14.055636896046853,
"grad_norm": 0.19477146863937378,
"learning_rate": 3.625231840653794e-05,
"loss": 0.1488,
"step": 144000
},
{
"epoch": 14.104441190824792,
"grad_norm": 0.1984102576971054,
"learning_rate": 3.616666417727875e-05,
"loss": 0.1487,
"step": 144500
},
{
"epoch": 14.153245485602733,
"grad_norm": 0.20152725279331207,
"learning_rate": 3.608084588195166e-05,
"loss": 0.1488,
"step": 145000
},
{
"epoch": 14.202049780380673,
"grad_norm": 0.1842581033706665,
"learning_rate": 3.599486478144085e-05,
"loss": 0.1486,
"step": 145500
},
{
"epoch": 14.250854075158614,
"grad_norm": 0.20297376811504364,
"learning_rate": 3.590872213902252e-05,
"loss": 0.1483,
"step": 146000
},
{
"epoch": 14.299658369936555,
"grad_norm": 0.1883450597524643,
"learning_rate": 3.582241922034631e-05,
"loss": 0.1482,
"step": 146500
},
{
"epoch": 14.348462664714495,
"grad_norm": 0.18912336230278015,
"learning_rate": 3.573595729341675e-05,
"loss": 0.1482,
"step": 147000
},
{
"epoch": 14.397266959492436,
"grad_norm": 0.1913149505853653,
"learning_rate": 3.564933762857454e-05,
"loss": 0.1478,
"step": 147500
},
{
"epoch": 14.446071254270375,
"grad_norm": 0.19658420979976654,
"learning_rate": 3.556256149847801e-05,
"loss": 0.1479,
"step": 148000
},
{
"epoch": 14.494875549048317,
"grad_norm": 0.1880834996700287,
"learning_rate": 3.547563017808432e-05,
"loss": 0.1478,
"step": 148500
},
{
"epoch": 14.543679843826256,
"grad_norm": 0.1877063512802124,
"learning_rate": 3.538854494463074e-05,
"loss": 0.1478,
"step": 149000
},
{
"epoch": 14.592484138604197,
"grad_norm": 0.19691213965415955,
"learning_rate": 3.530130707761594e-05,
"loss": 0.1474,
"step": 149500
},
{
"epoch": 14.641288433382137,
"grad_norm": 0.19889949262142181,
"learning_rate": 3.521391785878114e-05,
"loss": 0.1472,
"step": 150000
},
{
"epoch": 14.690092728160078,
"grad_norm": 0.1987435221672058,
"learning_rate": 3.512637857209131e-05,
"loss": 0.1471,
"step": 150500
},
{
"epoch": 14.73889702293802,
"grad_norm": 0.20512694120407104,
"learning_rate": 3.503869050371626e-05,
"loss": 0.1471,
"step": 151000
},
{
"epoch": 14.787701317715959,
"grad_norm": 0.19599127769470215,
"learning_rate": 3.4950854942011814e-05,
"loss": 0.1471,
"step": 151500
},
{
"epoch": 14.8365056124939,
"grad_norm": 0.1986822932958603,
"learning_rate": 3.4862873177500796e-05,
"loss": 0.1467,
"step": 152000
},
{
"epoch": 14.88530990727184,
"grad_norm": 0.18663661181926727,
"learning_rate": 3.4774746502854164e-05,
"loss": 0.1469,
"step": 152500
},
{
"epoch": 14.93411420204978,
"grad_norm": 0.1881023645401001,
"learning_rate": 3.46864762128719e-05,
"loss": 0.1467,
"step": 153000
},
{
"epoch": 14.98291849682772,
"grad_norm": 0.1909170150756836,
"learning_rate": 3.4598063604464106e-05,
"loss": 0.1465,
"step": 153500
},
{
"epoch": 15.0,
"eval_loss": 0.1301085352897644,
"eval_runtime": 23.6471,
"eval_samples_per_second": 333.064,
"eval_steps_per_second": 0.465,
"step": 153675
},
{
"epoch": 15.031722791605661,
"grad_norm": 0.19014447927474976,
"learning_rate": 3.450950997663189e-05,
"loss": 0.1461,
"step": 154000
},
{
"epoch": 15.080527086383603,
"grad_norm": 0.20832829177379608,
"learning_rate": 3.442081663044827e-05,
"loss": 0.1463,
"step": 154500
},
{
"epoch": 15.129331381161542,
"grad_norm": 0.19706888496875763,
"learning_rate": 3.433198486903906e-05,
"loss": 0.1461,
"step": 155000
},
{
"epoch": 15.178135675939483,
"grad_norm": 0.2018064558506012,
"learning_rate": 3.424301599756378e-05,
"loss": 0.1463,
"step": 155500
},
{
"epoch": 15.226939970717423,
"grad_norm": 0.19212935864925385,
"learning_rate": 3.41539113231964e-05,
"loss": 0.1464,
"step": 156000
},
{
"epoch": 15.275744265495364,
"grad_norm": 0.20076821744441986,
"learning_rate": 3.406467215510619e-05,
"loss": 0.1459,
"step": 156500
},
{
"epoch": 15.324548560273303,
"grad_norm": 0.19215160608291626,
"learning_rate": 3.3975299804438476e-05,
"loss": 0.1456,
"step": 157000
},
{
"epoch": 15.373352855051245,
"grad_norm": 0.19090279936790466,
"learning_rate": 3.388579558429534e-05,
"loss": 0.1458,
"step": 157500
},
{
"epoch": 15.422157149829186,
"grad_norm": 0.19182687997817993,
"learning_rate": 3.3796160809716386e-05,
"loss": 0.1454,
"step": 158000
},
{
"epoch": 15.470961444607125,
"grad_norm": 0.18930520117282867,
"learning_rate": 3.370639679765936e-05,
"loss": 0.1452,
"step": 158500
},
{
"epoch": 15.519765739385067,
"grad_norm": 0.20811304450035095,
"learning_rate": 3.3616504866980834e-05,
"loss": 0.1452,
"step": 159000
},
{
"epoch": 15.568570034163006,
"grad_norm": 0.18808256089687347,
"learning_rate": 3.3526486338416835e-05,
"loss": 0.1453,
"step": 159500
},
{
"epoch": 15.617374328940947,
"grad_norm": 0.18801531195640564,
"learning_rate": 3.343634253456343e-05,
"loss": 0.1451,
"step": 160000
},
{
"epoch": 15.666178623718887,
"grad_norm": 0.19010472297668457,
"learning_rate": 3.334607477985727e-05,
"loss": 0.145,
"step": 160500
},
{
"epoch": 15.714982918496828,
"grad_norm": 0.20773784816265106,
"learning_rate": 3.3255684400556165e-05,
"loss": 0.1449,
"step": 161000
},
{
"epoch": 15.763787213274767,
"grad_norm": 0.1926048994064331,
"learning_rate": 3.316517272471959e-05,
"loss": 0.1445,
"step": 161500
},
{
"epoch": 15.812591508052709,
"grad_norm": 0.20847058296203613,
"learning_rate": 3.307454108218916e-05,
"loss": 0.1448,
"step": 162000
},
{
"epoch": 15.86139580283065,
"grad_norm": 0.18687431514263153,
"learning_rate": 3.2983790804569105e-05,
"loss": 0.1445,
"step": 162500
},
{
"epoch": 15.91020009760859,
"grad_norm": 0.19642353057861328,
"learning_rate": 3.2892923225206695e-05,
"loss": 0.1443,
"step": 163000
},
{
"epoch": 15.95900439238653,
"grad_norm": 0.19062745571136475,
"learning_rate": 3.280193967917265e-05,
"loss": 0.1444,
"step": 163500
},
{
"epoch": 16.0,
"eval_loss": 0.1288023591041565,
"eval_runtime": 27.3979,
"eval_samples_per_second": 287.468,
"eval_steps_per_second": 0.401,
"step": 163920
},
{
"epoch": 16.00780868716447,
"grad_norm": 0.2008381485939026,
"learning_rate": 3.271084150324154e-05,
"loss": 0.1441,
"step": 164000
},
{
"epoch": 16.05661298194241,
"grad_norm": 0.1929151713848114,
"learning_rate": 3.261963003587214e-05,
"loss": 0.1443,
"step": 164500
},
{
"epoch": 16.105417276720353,
"grad_norm": 0.19287170469760895,
"learning_rate": 3.252830661718772e-05,
"loss": 0.144,
"step": 165000
},
{
"epoch": 16.15422157149829,
"grad_norm": 0.19933773577213287,
"learning_rate": 3.243687258895643e-05,
"loss": 0.1439,
"step": 165500
},
{
"epoch": 16.20302586627623,
"grad_norm": 0.2010374516248703,
"learning_rate": 3.234532929457155e-05,
"loss": 0.1439,
"step": 166000
},
{
"epoch": 16.251830161054173,
"grad_norm": 0.19827648997306824,
"learning_rate": 3.2253678079031724e-05,
"loss": 0.1439,
"step": 166500
},
{
"epoch": 16.300634455832114,
"grad_norm": 0.1934526264667511,
"learning_rate": 3.2161920288921254e-05,
"loss": 0.1438,
"step": 167000
},
{
"epoch": 16.349438750610055,
"grad_norm": 0.20245911180973053,
"learning_rate": 3.2070057272390263e-05,
"loss": 0.1436,
"step": 167500
},
{
"epoch": 16.398243045387993,
"grad_norm": 0.1878873109817505,
"learning_rate": 3.197809037913493e-05,
"loss": 0.1433,
"step": 168000
},
{
"epoch": 16.447047340165934,
"grad_norm": 0.19571448862552643,
"learning_rate": 3.188602096037764e-05,
"loss": 0.1435,
"step": 168500
},
{
"epoch": 16.495851634943875,
"grad_norm": 0.19554303586483002,
"learning_rate": 3.179385036884712e-05,
"loss": 0.1433,
"step": 169000
},
{
"epoch": 16.544655929721817,
"grad_norm": 0.18918287754058838,
"learning_rate": 3.170157995875859e-05,
"loss": 0.1435,
"step": 169500
},
{
"epoch": 16.593460224499758,
"grad_norm": 0.19676432013511658,
"learning_rate": 3.160921108579385e-05,
"loss": 0.1432,
"step": 170000
},
{
"epoch": 16.642264519277695,
"grad_norm": 0.20606379210948944,
"learning_rate": 3.151674510708136e-05,
"loss": 0.1431,
"step": 170500
},
{
"epoch": 16.691068814055637,
"grad_norm": 0.18640325963497162,
"learning_rate": 3.142418338117631e-05,
"loss": 0.1428,
"step": 171000
},
{
"epoch": 16.739873108833578,
"grad_norm": 0.18933062255382538,
"learning_rate": 3.1331527268040646e-05,
"loss": 0.1431,
"step": 171500
},
{
"epoch": 16.78867740361152,
"grad_norm": 0.1884533166885376,
"learning_rate": 3.12387781290231e-05,
"loss": 0.1427,
"step": 172000
},
{
"epoch": 16.837481698389457,
"grad_norm": 0.19676893949508667,
"learning_rate": 3.11459373268392e-05,
"loss": 0.1426,
"step": 172500
},
{
"epoch": 16.886285993167398,
"grad_norm": 0.18692608177661896,
"learning_rate": 3.105300622555122e-05,
"loss": 0.1429,
"step": 173000
},
{
"epoch": 16.93509028794534,
"grad_norm": 0.2095184326171875,
"learning_rate": 3.095998619054813e-05,
"loss": 0.1425,
"step": 173500
},
{
"epoch": 16.98389458272328,
"grad_norm": 0.19869489967823029,
"learning_rate": 3.086687858852562e-05,
"loss": 0.1425,
"step": 174000
},
{
"epoch": 17.0,
"eval_loss": 0.12800458073616028,
"eval_runtime": 25.827,
"eval_samples_per_second": 304.952,
"eval_steps_per_second": 0.426,
"step": 174165
},
{
"epoch": 17.032698877501222,
"grad_norm": 0.18050076067447662,
"learning_rate": 3.077368478746591e-05,
"loss": 0.142,
"step": 174500
},
{
"epoch": 17.08150317227916,
"grad_norm": 0.19508038461208344,
"learning_rate": 3.068040615661768e-05,
"loss": 0.1422,
"step": 175000
},
{
"epoch": 17.1303074670571,
"grad_norm": 0.19345000386238098,
"learning_rate": 3.0587044066476024e-05,
"loss": 0.142,
"step": 175500
},
{
"epoch": 17.179111761835042,
"grad_norm": 0.18671298027038574,
"learning_rate": 3.0493599888762235e-05,
"loss": 0.1417,
"step": 176000
},
{
"epoch": 17.227916056612983,
"grad_norm": 0.18719059228897095,
"learning_rate": 3.0400074996403666e-05,
"loss": 0.1419,
"step": 176500
},
{
"epoch": 17.27672035139092,
"grad_norm": 0.192045196890831,
"learning_rate": 3.0306470763513584e-05,
"loss": 0.142,
"step": 177000
},
{
"epoch": 17.325524646168862,
"grad_norm": 0.18663588166236877,
"learning_rate": 3.0212788565370952e-05,
"loss": 0.1419,
"step": 177500
},
{
"epoch": 17.374328940946803,
"grad_norm": 0.19223402440547943,
"learning_rate": 3.0119029778400266e-05,
"loss": 0.1416,
"step": 178000
},
{
"epoch": 17.423133235724745,
"grad_norm": 0.20375187695026398,
"learning_rate": 3.002519578015126e-05,
"loss": 0.1417,
"step": 178500
},
{
"epoch": 17.471937530502686,
"grad_norm": 0.19722655415534973,
"learning_rate": 2.9931287949278752e-05,
"loss": 0.1413,
"step": 179000
},
{
"epoch": 17.520741825280624,
"grad_norm": 0.20561105012893677,
"learning_rate": 2.9837307665522297e-05,
"loss": 0.1412,
"step": 179500
},
{
"epoch": 17.569546120058565,
"grad_norm": 0.1842418909072876,
"learning_rate": 2.9743256309686013e-05,
"loss": 0.1413,
"step": 180000
},
{
"epoch": 17.618350414836506,
"grad_norm": 0.19416528940200806,
"learning_rate": 2.9649135263618205e-05,
"loss": 0.1414,
"step": 180500
},
{
"epoch": 17.667154709614447,
"grad_norm": 0.18883706629276276,
"learning_rate": 2.9554945910191122e-05,
"loss": 0.1414,
"step": 181000
},
{
"epoch": 17.715959004392385,
"grad_norm": 0.18695645034313202,
"learning_rate": 2.9460689633280613e-05,
"loss": 0.1413,
"step": 181500
},
{
"epoch": 17.764763299170326,
"grad_norm": 0.1854555606842041,
"learning_rate": 2.9366367817745794e-05,
"loss": 0.1411,
"step": 182000
},
{
"epoch": 17.813567593948267,
"grad_norm": 0.1904602348804474,
"learning_rate": 2.927198184940872e-05,
"loss": 0.1411,
"step": 182500
},
{
"epoch": 17.86237188872621,
"grad_norm": 0.1872331202030182,
"learning_rate": 2.917753311503399e-05,
"loss": 0.1409,
"step": 183000
},
{
"epoch": 17.91117618350415,
"grad_norm": 0.19253146648406982,
"learning_rate": 2.90830230023084e-05,
"loss": 0.1409,
"step": 183500
},
{
"epoch": 17.959980478282088,
"grad_norm": 0.18223468959331512,
"learning_rate": 2.8988452899820563e-05,
"loss": 0.1407,
"step": 184000
},
{
"epoch": 18.0,
"eval_loss": 0.12518393993377686,
"eval_runtime": 25.7207,
"eval_samples_per_second": 306.212,
"eval_steps_per_second": 0.428,
"step": 184410
},
{
"epoch": 18.00878477306003,
"grad_norm": 0.19220831990242004,
"learning_rate": 2.889382419704047e-05,
"loss": 0.1408,
"step": 184500
},
{
"epoch": 18.05758906783797,
"grad_norm": 0.19996266067028046,
"learning_rate": 2.8799138284299105e-05,
"loss": 0.1406,
"step": 185000
},
{
"epoch": 18.10639336261591,
"grad_norm": 0.192152738571167,
"learning_rate": 2.8704396552767997e-05,
"loss": 0.1405,
"step": 185500
},
{
"epoch": 18.15519765739385,
"grad_norm": 0.19583114981651306,
"learning_rate": 2.8609600394438816e-05,
"loss": 0.1404,
"step": 186000
},
{
"epoch": 18.20400195217179,
"grad_norm": 0.1908300369977951,
"learning_rate": 2.851475120210289e-05,
"loss": 0.1405,
"step": 186500
},
{
"epoch": 18.25280624694973,
"grad_norm": 0.19682295620441437,
"learning_rate": 2.8419850369330714e-05,
"loss": 0.14,
"step": 187000
},
{
"epoch": 18.301610541727673,
"grad_norm": 0.18878893554210663,
"learning_rate": 2.8324899290451556e-05,
"loss": 0.1403,
"step": 187500
},
{
"epoch": 18.350414836505614,
"grad_norm": 0.19927945733070374,
"learning_rate": 2.822989936053291e-05,
"loss": 0.1402,
"step": 188000
},
{
"epoch": 18.39921913128355,
"grad_norm": 0.18962599337100983,
"learning_rate": 2.8134851975359994e-05,
"loss": 0.1399,
"step": 188500
},
{
"epoch": 18.448023426061493,
"grad_norm": 0.19572696089744568,
"learning_rate": 2.8039758531415278e-05,
"loss": 0.1399,
"step": 189000
},
{
"epoch": 18.496827720839434,
"grad_norm": 0.19577118754386902,
"learning_rate": 2.7944620425857952e-05,
"loss": 0.14,
"step": 189500
},
{
"epoch": 18.545632015617375,
"grad_norm": 0.1974543035030365,
"learning_rate": 2.78494390565034e-05,
"loss": 0.1398,
"step": 190000
},
{
"epoch": 18.594436310395317,
"grad_norm": 0.19602327048778534,
"learning_rate": 2.775421582180263e-05,
"loss": 0.1397,
"step": 190500
},
{
"epoch": 18.643240605173254,
"grad_norm": 0.18849612772464752,
"learning_rate": 2.7658952120821802e-05,
"loss": 0.1396,
"step": 191000
},
{
"epoch": 18.692044899951195,
"grad_norm": 0.19312690198421478,
"learning_rate": 2.756364935322158e-05,
"loss": 0.1395,
"step": 191500
},
{
"epoch": 18.740849194729137,
"grad_norm": 0.18100771307945251,
"learning_rate": 2.7468308919236652e-05,
"loss": 0.1394,
"step": 192000
},
{
"epoch": 18.789653489507078,
"grad_norm": 0.20045186579227448,
"learning_rate": 2.737293221965509e-05,
"loss": 0.1394,
"step": 192500
},
{
"epoch": 18.838457784285016,
"grad_norm": 0.1846308708190918,
"learning_rate": 2.7277520655797816e-05,
"loss": 0.1393,
"step": 193000
},
{
"epoch": 18.887262079062957,
"grad_norm": 0.18819710612297058,
"learning_rate": 2.7182075629497976e-05,
"loss": 0.1394,
"step": 193500
},
{
"epoch": 18.936066373840898,
"grad_norm": 0.18752720952033997,
"learning_rate": 2.7086598543080392e-05,
"loss": 0.1391,
"step": 194000
},
{
"epoch": 18.98487066861884,
"grad_norm": 0.19363176822662354,
"learning_rate": 2.6991090799340905e-05,
"loss": 0.1391,
"step": 194500
},
{
"epoch": 19.0,
"eval_loss": 0.1259300708770752,
"eval_runtime": 25.5672,
"eval_samples_per_second": 308.051,
"eval_steps_per_second": 0.43,
"step": 194655
},
{
"epoch": 19.03367496339678,
"grad_norm": 0.19123421609401703,
"learning_rate": 2.6895553801525803e-05,
"loss": 0.1391,
"step": 195000
},
{
"epoch": 19.08247925817472,
"grad_norm": 0.19878804683685303,
"learning_rate": 2.6799988953311162e-05,
"loss": 0.1389,
"step": 195500
},
{
"epoch": 19.13128355295266,
"grad_norm": 0.19207318127155304,
"learning_rate": 2.6704397658782283e-05,
"loss": 0.1391,
"step": 196000
},
{
"epoch": 19.1800878477306,
"grad_norm": 0.18511444330215454,
"learning_rate": 2.6608781322413018e-05,
"loss": 0.1389,
"step": 196500
},
{
"epoch": 19.228892142508542,
"grad_norm": 0.19707535207271576,
"learning_rate": 2.651314134904514e-05,
"loss": 0.1389,
"step": 197000
},
{
"epoch": 19.27769643728648,
"grad_norm": 0.1916116625070572,
"learning_rate": 2.6417479143867697e-05,
"loss": 0.1387,
"step": 197500
},
{
"epoch": 19.32650073206442,
"grad_norm": 0.18978238105773926,
"learning_rate": 2.632179611239642e-05,
"loss": 0.1387,
"step": 198000
},
{
"epoch": 19.375305026842362,
"grad_norm": 0.1835888773202896,
"learning_rate": 2.6226093660452982e-05,
"loss": 0.1385,
"step": 198500
},
{
"epoch": 19.424109321620303,
"grad_norm": 0.18811723589897156,
"learning_rate": 2.613037319414441e-05,
"loss": 0.1387,
"step": 199000
},
{
"epoch": 19.472913616398245,
"grad_norm": 0.1998414546251297,
"learning_rate": 2.6034636119842414e-05,
"loss": 0.1385,
"step": 199500
},
{
"epoch": 19.521717911176182,
"grad_norm": 0.18518772721290588,
"learning_rate": 2.5938883844162715e-05,
"loss": 0.1382,
"step": 200000
},
{
"epoch": 19.570522205954124,
"grad_norm": 0.19242486357688904,
"learning_rate": 2.584311777394437e-05,
"loss": 0.1384,
"step": 200500
},
{
"epoch": 19.619326500732065,
"grad_norm": 0.2028750330209732,
"learning_rate": 2.574733931622912e-05,
"loss": 0.1384,
"step": 201000
},
{
"epoch": 19.668130795510006,
"grad_norm": 0.18917541205883026,
"learning_rate": 2.5651549878240694e-05,
"loss": 0.1381,
"step": 201500
},
{
"epoch": 19.716935090287944,
"grad_norm": 0.19596756994724274,
"learning_rate": 2.5555750867364188e-05,
"loss": 0.138,
"step": 202000
},
{
"epoch": 19.765739385065885,
"grad_norm": 0.19332247972488403,
"learning_rate": 2.5459943691125292e-05,
"loss": 0.1381,
"step": 202500
},
{
"epoch": 19.814543679843826,
"grad_norm": 0.19187049567699432,
"learning_rate": 2.536412975716972e-05,
"loss": 0.1381,
"step": 203000
},
{
"epoch": 19.863347974621767,
"grad_norm": 0.19392500817775726,
"learning_rate": 2.5268310473242424e-05,
"loss": 0.1378,
"step": 203500
},
{
"epoch": 19.91215226939971,
"grad_norm": 0.19194450974464417,
"learning_rate": 2.517248724716701e-05,
"loss": 0.1377,
"step": 204000
},
{
"epoch": 19.960956564177646,
"grad_norm": 0.20554892718791962,
"learning_rate": 2.5076661486824953e-05,
"loss": 0.1379,
"step": 204500
},
{
"epoch": 20.0,
"eval_loss": 0.1231779009103775,
"eval_runtime": 29.7434,
"eval_samples_per_second": 264.798,
"eval_steps_per_second": 0.37,
"step": 204900
},
{
"epoch": 20.009760858955588,
"grad_norm": 0.19533833861351013,
"learning_rate": 2.4980834600135006e-05,
"loss": 0.1377,
"step": 205000
},
{
"epoch": 20.05856515373353,
"grad_norm": 0.18907921016216278,
"learning_rate": 2.488500799503244e-05,
"loss": 0.1377,
"step": 205500
},
{
"epoch": 20.10736944851147,
"grad_norm": 0.1802392452955246,
"learning_rate": 2.4789183079448417e-05,
"loss": 0.1378,
"step": 206000
},
{
"epoch": 20.156173743289408,
"grad_norm": 0.19577832520008087,
"learning_rate": 2.4693361261289247e-05,
"loss": 0.1375,
"step": 206500
},
{
"epoch": 20.20497803806735,
"grad_norm": 0.20748840272426605,
"learning_rate": 2.4597543948415748e-05,
"loss": 0.1376,
"step": 207000
},
{
"epoch": 20.25378233284529,
"grad_norm": 0.19364304840564728,
"learning_rate": 2.4501732548622546e-05,
"loss": 0.1375,
"step": 207500
},
{
"epoch": 20.30258662762323,
"grad_norm": 0.1987764686346054,
"learning_rate": 2.440592846961738e-05,
"loss": 0.1373,
"step": 208000
},
{
"epoch": 20.351390922401173,
"grad_norm": 0.1924201399087906,
"learning_rate": 2.4310133119000438e-05,
"loss": 0.1376,
"step": 208500
},
{
"epoch": 20.40019521717911,
"grad_norm": 0.19483359158039093,
"learning_rate": 2.4214347904243644e-05,
"loss": 0.1374,
"step": 209000
},
{
"epoch": 20.44899951195705,
"grad_norm": 0.19892901182174683,
"learning_rate": 2.4118574232670025e-05,
"loss": 0.1372,
"step": 209500
},
{
"epoch": 20.497803806734993,
"grad_norm": 0.18968260288238525,
"learning_rate": 2.4022813511433027e-05,
"loss": 0.137,
"step": 210000
},
{
"epoch": 20.546608101512934,
"grad_norm": 0.19339485466480255,
"learning_rate": 2.3927067147495765e-05,
"loss": 0.1372,
"step": 210500
},
{
"epoch": 20.595412396290875,
"grad_norm": 0.19323968887329102,
"learning_rate": 2.383133654761045e-05,
"loss": 0.137,
"step": 211000
},
{
"epoch": 20.644216691068813,
"grad_norm": 0.18963748216629028,
"learning_rate": 2.3735623118297692e-05,
"loss": 0.1369,
"step": 211500
},
{
"epoch": 20.693020985846754,
"grad_norm": 0.190143883228302,
"learning_rate": 2.3639928265825783e-05,
"loss": 0.1369,
"step": 212000
},
{
"epoch": 20.741825280624695,
"grad_norm": 0.19597776234149933,
"learning_rate": 2.3544253396190112e-05,
"loss": 0.1369,
"step": 212500
},
{
"epoch": 20.790629575402637,
"grad_norm": 0.18973353505134583,
"learning_rate": 2.3448599915092443e-05,
"loss": 0.1366,
"step": 213000
},
{
"epoch": 20.839433870180574,
"grad_norm": 0.20242229104042053,
"learning_rate": 2.3352969227920303e-05,
"loss": 0.1368,
"step": 213500
},
{
"epoch": 20.888238164958516,
"grad_norm": 0.19486981630325317,
"learning_rate": 2.325736273972633e-05,
"loss": 0.1368,
"step": 214000
},
{
"epoch": 20.937042459736457,
"grad_norm": 0.18778111040592194,
"learning_rate": 2.3161781855207575e-05,
"loss": 0.1365,
"step": 214500
},
{
"epoch": 20.985846754514398,
"grad_norm": 0.19285354018211365,
"learning_rate": 2.3066227978684964e-05,
"loss": 0.1363,
"step": 215000
},
{
"epoch": 21.0,
"eval_loss": 0.12139205634593964,
"eval_runtime": 26.2565,
"eval_samples_per_second": 299.963,
"eval_steps_per_second": 0.419,
"step": 215145
},
{
"epoch": 21.03465104929234,
"grad_norm": 0.1933123618364334,
"learning_rate": 2.297070251408259e-05,
"loss": 0.1364,
"step": 215500
},
{
"epoch": 21.083455344070277,
"grad_norm": 0.18427444994449615,
"learning_rate": 2.287520686490707e-05,
"loss": 0.1365,
"step": 216000
},
{
"epoch": 21.13225963884822,
"grad_norm": 0.17762655019760132,
"learning_rate": 2.2779742434227005e-05,
"loss": 0.1363,
"step": 216500
},
{
"epoch": 21.18106393362616,
"grad_norm": 0.18944330513477325,
"learning_rate": 2.2684310624652287e-05,
"loss": 0.1363,
"step": 217000
},
{
"epoch": 21.2298682284041,
"grad_norm": 0.19393311440944672,
"learning_rate": 2.2588912838313535e-05,
"loss": 0.1363,
"step": 217500
},
{
"epoch": 21.27867252318204,
"grad_norm": 0.1875392496585846,
"learning_rate": 2.2493550476841495e-05,
"loss": 0.1363,
"step": 218000
},
{
"epoch": 21.32747681795998,
"grad_norm": 0.19635601341724396,
"learning_rate": 2.2398224941346408e-05,
"loss": 0.1362,
"step": 218500
},
{
"epoch": 21.37628111273792,
"grad_norm": 0.19351017475128174,
"learning_rate": 2.2302937632397462e-05,
"loss": 0.1359,
"step": 219000
},
{
"epoch": 21.425085407515862,
"grad_norm": 0.18472112715244293,
"learning_rate": 2.2207689950002213e-05,
"loss": 0.1362,
"step": 219500
},
{
"epoch": 21.473889702293803,
"grad_norm": 0.192471444606781,
"learning_rate": 2.211248329358598e-05,
"loss": 0.1359,
"step": 220000
},
{
"epoch": 21.52269399707174,
"grad_norm": 0.19304192066192627,
"learning_rate": 2.2017319061971338e-05,
"loss": 0.1362,
"step": 220500
},
{
"epoch": 21.571498291849682,
"grad_norm": 0.18912473320960999,
"learning_rate": 2.1922198653357498e-05,
"loss": 0.1362,
"step": 221000
},
{
"epoch": 21.620302586627623,
"grad_norm": 0.19801722466945648,
"learning_rate": 2.182712346529983e-05,
"loss": 0.1363,
"step": 221500
},
{
"epoch": 21.669106881405565,
"grad_norm": 0.18331073224544525,
"learning_rate": 2.1732094894689313e-05,
"loss": 0.136,
"step": 222000
},
{
"epoch": 21.717911176183506,
"grad_norm": 0.1763552576303482,
"learning_rate": 2.1637114337731967e-05,
"loss": 0.1356,
"step": 222500
},
{
"epoch": 21.766715470961444,
"grad_norm": 0.1820065975189209,
"learning_rate": 2.1542183189928387e-05,
"loss": 0.1356,
"step": 223000
},
{
"epoch": 21.815519765739385,
"grad_norm": 0.18830101191997528,
"learning_rate": 2.1447302846053234e-05,
"loss": 0.1358,
"step": 223500
},
{
"epoch": 21.864324060517326,
"grad_norm": 0.19416014850139618,
"learning_rate": 2.135247470013471e-05,
"loss": 0.1354,
"step": 224000
},
{
"epoch": 21.913128355295267,
"grad_norm": 0.1934524029493332,
"learning_rate": 2.1257700145434132e-05,
"loss": 0.1356,
"step": 224500
},
{
"epoch": 21.961932650073205,
"grad_norm": 0.19462282955646515,
"learning_rate": 2.116298057442539e-05,
"loss": 0.1357,
"step": 225000
},
{
"epoch": 22.0,
"eval_loss": 0.12161369621753693,
"eval_runtime": 26.6058,
"eval_samples_per_second": 296.025,
"eval_steps_per_second": 0.413,
"step": 225390
},
{
"epoch": 22.010736944851146,
"grad_norm": 0.18952177464962006,
"learning_rate": 2.106831737877456e-05,
"loss": 0.1354,
"step": 225500
},
{
"epoch": 22.059541239629088,
"grad_norm": 0.2017366886138916,
"learning_rate": 2.0973711949319415e-05,
"loss": 0.1355,
"step": 226000
},
{
"epoch": 22.10834553440703,
"grad_norm": 0.19085553288459778,
"learning_rate": 2.087916567604897e-05,
"loss": 0.1353,
"step": 226500
},
{
"epoch": 22.15714982918497,
"grad_norm": 0.20396627485752106,
"learning_rate": 2.0784679948083138e-05,
"loss": 0.1352,
"step": 227000
},
{
"epoch": 22.205954123962908,
"grad_norm": 0.19046179950237274,
"learning_rate": 2.0690256153652248e-05,
"loss": 0.1353,
"step": 227500
},
{
"epoch": 22.25475841874085,
"grad_norm": 0.19359087944030762,
"learning_rate": 2.0595895680076645e-05,
"loss": 0.1353,
"step": 228000
},
{
"epoch": 22.30356271351879,
"grad_norm": 0.186729297041893,
"learning_rate": 2.0501599913746374e-05,
"loss": 0.1351,
"step": 228500
},
{
"epoch": 22.35236700829673,
"grad_norm": 0.1899571716785431,
"learning_rate": 2.0407370240100747e-05,
"loss": 0.1352,
"step": 229000
},
{
"epoch": 22.40117130307467,
"grad_norm": 0.1941409856081009,
"learning_rate": 2.0313208043608017e-05,
"loss": 0.1351,
"step": 229500
},
{
"epoch": 22.44997559785261,
"grad_norm": 0.20220808684825897,
"learning_rate": 2.021911470774504e-05,
"loss": 0.1352,
"step": 230000
},
{
"epoch": 22.49877989263055,
"grad_norm": 0.1803186982870102,
"learning_rate": 2.0125091614976908e-05,
"loss": 0.1348,
"step": 230500
},
{
"epoch": 22.547584187408493,
"grad_norm": 0.19634583592414856,
"learning_rate": 2.0031140146736696e-05,
"loss": 0.1351,
"step": 231000
},
{
"epoch": 22.596388482186434,
"grad_norm": 0.19138526916503906,
"learning_rate": 1.9937261683405135e-05,
"loss": 0.1351,
"step": 231500
},
{
"epoch": 22.64519277696437,
"grad_norm": 0.18439550697803497,
"learning_rate": 1.9843457604290306e-05,
"loss": 0.1348,
"step": 232000
},
{
"epoch": 22.693997071742313,
"grad_norm": 0.1892482042312622,
"learning_rate": 1.974972928760744e-05,
"loss": 0.1347,
"step": 232500
},
{
"epoch": 22.742801366520254,
"grad_norm": 0.18848678469657898,
"learning_rate": 1.9656078110458585e-05,
"loss": 0.1347,
"step": 233000
},
{
"epoch": 22.791605661298195,
"grad_norm": 0.1945199817419052,
"learning_rate": 1.9562505448812453e-05,
"loss": 0.1346,
"step": 233500
},
{
"epoch": 22.840409956076133,
"grad_norm": 0.1922951489686966,
"learning_rate": 1.946901267748417e-05,
"loss": 0.1346,
"step": 234000
},
{
"epoch": 22.889214250854074,
"grad_norm": 0.18175315856933594,
"learning_rate": 1.937560117011504e-05,
"loss": 0.1347,
"step": 234500
},
{
"epoch": 22.938018545632016,
"grad_norm": 0.18632791936397552,
"learning_rate": 1.9282272299152416e-05,
"loss": 0.1344,
"step": 235000
},
{
"epoch": 22.986822840409957,
"grad_norm": 0.1905319094657898,
"learning_rate": 1.9189027435829533e-05,
"loss": 0.1344,
"step": 235500
},
{
"epoch": 23.0,
"eval_loss": 0.11993886530399323,
"eval_runtime": 27.3709,
"eval_samples_per_second": 287.75,
"eval_steps_per_second": 0.402,
"step": 235635
},
{
"epoch": 23.035627135187898,
"grad_norm": 0.19479139149188995,
"learning_rate": 1.909586795014532e-05,
"loss": 0.1343,
"step": 236000
},
{
"epoch": 23.084431429965836,
"grad_norm": 0.1937461495399475,
"learning_rate": 1.9002795210844315e-05,
"loss": 0.1341,
"step": 236500
},
{
"epoch": 23.133235724743777,
"grad_norm": 0.1967599093914032,
"learning_rate": 1.890981058539652e-05,
"loss": 0.1342,
"step": 237000
},
{
"epoch": 23.182040019521718,
"grad_norm": 0.1805768460035324,
"learning_rate": 1.8816915439977333e-05,
"loss": 0.1342,
"step": 237500
},
{
"epoch": 23.23084431429966,
"grad_norm": 0.1909085512161255,
"learning_rate": 1.8724111139447474e-05,
"loss": 0.1342,
"step": 238000
},
{
"epoch": 23.279648609077597,
"grad_norm": 0.18977640569210052,
"learning_rate": 1.863139904733291e-05,
"loss": 0.134,
"step": 238500
},
{
"epoch": 23.32845290385554,
"grad_norm": 0.20453977584838867,
"learning_rate": 1.853878052580485e-05,
"loss": 0.1341,
"step": 239000
},
{
"epoch": 23.37725719863348,
"grad_norm": 0.18761217594146729,
"learning_rate": 1.8446256935659725e-05,
"loss": 0.1341,
"step": 239500
},
{
"epoch": 23.42606149341142,
"grad_norm": 0.19060368835926056,
"learning_rate": 1.835382963629916e-05,
"loss": 0.1341,
"step": 240000
},
{
"epoch": 23.474865788189362,
"grad_norm": 0.18985587358474731,
"learning_rate": 1.8261499985710057e-05,
"loss": 0.1341,
"step": 240500
},
{
"epoch": 23.5236700829673,
"grad_norm": 0.18299199640750885,
"learning_rate": 1.81692693404446e-05,
"loss": 0.1339,
"step": 241000
},
{
"epoch": 23.57247437774524,
"grad_norm": 0.19359715282917023,
"learning_rate": 1.807713905560034e-05,
"loss": 0.1337,
"step": 241500
},
{
"epoch": 23.621278672523182,
"grad_norm": 0.19137471914291382,
"learning_rate": 1.79851104848003e-05,
"loss": 0.1342,
"step": 242000
},
{
"epoch": 23.670082967301123,
"grad_norm": 0.19408565759658813,
"learning_rate": 1.7893184980173038e-05,
"loss": 0.134,
"step": 242500
},
{
"epoch": 23.718887262079065,
"grad_norm": 0.19133317470550537,
"learning_rate": 1.7801363892332846e-05,
"loss": 0.1339,
"step": 243000
},
{
"epoch": 23.767691556857002,
"grad_norm": 0.18910160660743713,
"learning_rate": 1.770964857035986e-05,
"loss": 0.1338,
"step": 243500
},
{
"epoch": 23.816495851634944,
"grad_norm": 0.1919185370206833,
"learning_rate": 1.7618040361780246e-05,
"loss": 0.134,
"step": 244000
},
{
"epoch": 23.865300146412885,
"grad_norm": 0.19021070003509521,
"learning_rate": 1.7526540612546433e-05,
"loss": 0.1336,
"step": 244500
},
{
"epoch": 23.914104441190826,
"grad_norm": 0.1913536936044693,
"learning_rate": 1.743515066701726e-05,
"loss": 0.1338,
"step": 245000
},
{
"epoch": 23.962908735968764,
"grad_norm": 0.19384372234344482,
"learning_rate": 1.734387186793834e-05,
"loss": 0.1336,
"step": 245500
},
{
"epoch": 24.0,
"eval_loss": 0.12013557553291321,
"eval_runtime": 25.8494,
"eval_samples_per_second": 304.688,
"eval_steps_per_second": 0.426,
"step": 245880
},
{
"epoch": 24.011713030746705,
"grad_norm": 0.1967899650335312,
"learning_rate": 1.7252705556422237e-05,
"loss": 0.1337,
"step": 246000
},
{
"epoch": 24.060517325524646,
"grad_norm": 0.19760966300964355,
"learning_rate": 1.7161653071928774e-05,
"loss": 0.1335,
"step": 246500
},
{
"epoch": 24.109321620302588,
"grad_norm": 0.20642146468162537,
"learning_rate": 1.707071575224541e-05,
"loss": 0.1334,
"step": 247000
},
{
"epoch": 24.15812591508053,
"grad_norm": 0.1874382495880127,
"learning_rate": 1.6979894933467533e-05,
"loss": 0.1334,
"step": 247500
},
{
"epoch": 24.206930209858466,
"grad_norm": 0.1970881223678589,
"learning_rate": 1.6889191949978827e-05,
"loss": 0.1336,
"step": 248000
},
{
"epoch": 24.255734504636408,
"grad_norm": 0.1981177181005478,
"learning_rate": 1.6798608134431705e-05,
"loss": 0.1335,
"step": 248500
},
{
"epoch": 24.30453879941435,
"grad_norm": 0.1884346753358841,
"learning_rate": 1.6708144817727685e-05,
"loss": 0.1331,
"step": 249000
},
{
"epoch": 24.35334309419229,
"grad_norm": 0.20689553022384644,
"learning_rate": 1.6617803328997877e-05,
"loss": 0.1336,
"step": 249500
},
{
"epoch": 24.402147388970228,
"grad_norm": 0.21094737946987152,
"learning_rate": 1.6527584995583428e-05,
"loss": 0.1334,
"step": 250000
},
{
"epoch": 24.45095168374817,
"grad_norm": 0.19117839634418488,
"learning_rate": 1.643749114301602e-05,
"loss": 0.133,
"step": 250500
},
{
"epoch": 24.49975597852611,
"grad_norm": 0.1867818832397461,
"learning_rate": 1.6347523094998413e-05,
"loss": 0.1333,
"step": 251000
},
{
"epoch": 24.54856027330405,
"grad_norm": 0.20169509947299957,
"learning_rate": 1.6257682173384987e-05,
"loss": 0.1332,
"step": 251500
},
{
"epoch": 24.597364568081993,
"grad_norm": 0.20369745790958405,
"learning_rate": 1.616796969816229e-05,
"loss": 0.1332,
"step": 252000
},
{
"epoch": 24.64616886285993,
"grad_norm": 0.19687892496585846,
"learning_rate": 1.607838698742972e-05,
"loss": 0.1334,
"step": 252500
},
{
"epoch": 24.69497315763787,
"grad_norm": 0.20486027002334595,
"learning_rate": 1.5988935357380068e-05,
"loss": 0.1331,
"step": 253000
},
{
"epoch": 24.743777452415813,
"grad_norm": 0.18960419297218323,
"learning_rate": 1.5899616122280248e-05,
"loss": 0.1329,
"step": 253500
},
{
"epoch": 24.792581747193754,
"grad_norm": 0.18826229870319366,
"learning_rate": 1.581043059445197e-05,
"loss": 0.1331,
"step": 254000
},
{
"epoch": 24.841386041971692,
"grad_norm": 0.18491852283477783,
"learning_rate": 1.572138008425242e-05,
"loss": 0.1329,
"step": 254500
},
{
"epoch": 24.890190336749633,
"grad_norm": 0.19611585140228271,
"learning_rate": 1.5632465900055073e-05,
"loss": 0.1329,
"step": 255000
},
{
"epoch": 24.938994631527574,
"grad_norm": 0.1943751573562622,
"learning_rate": 1.5543689348230415e-05,
"loss": 0.1329,
"step": 255500
},
{
"epoch": 24.987798926305516,
"grad_norm": 0.1918455809354782,
"learning_rate": 1.545505173312678e-05,
"loss": 0.1328,
"step": 256000
},
{
"epoch": 25.0,
"eval_loss": 0.11874233186244965,
"eval_runtime": 24.4939,
"eval_samples_per_second": 321.55,
"eval_steps_per_second": 0.449,
"step": 256125
}
],
"logging_steps": 500,
"max_steps": 409800,
"num_input_tokens_seen": 0,
"num_train_epochs": 40,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.917586395968569e+18,
"train_batch_size": 384,
"trial_name": null,
"trial_params": null
}