{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.4,
"eval_steps": 12500,
"global_step": 100000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.032,
"grad_norm": 3.001525402069092,
"learning_rate": 8e-05,
"loss": 13.1617,
"step": 500
},
{
"epoch": 0.064,
"grad_norm": 0.5208349227905273,
"learning_rate": 7.967871485943775e-05,
"loss": 1.4028,
"step": 1000
},
{
"epoch": 0.096,
"grad_norm": 0.43057793378829956,
"learning_rate": 7.93574297188755e-05,
"loss": 1.0426,
"step": 1500
},
{
"epoch": 0.128,
"grad_norm": 0.53807532787323,
"learning_rate": 7.903614457831325e-05,
"loss": 0.8407,
"step": 2000
},
{
"epoch": 0.16,
"grad_norm": 0.7076811194419861,
"learning_rate": 7.8714859437751e-05,
"loss": 0.7035,
"step": 2500
},
{
"epoch": 0.192,
"grad_norm": 0.5321575999259949,
"learning_rate": 7.839357429718876e-05,
"loss": 0.4479,
"step": 3000
},
{
"epoch": 0.224,
"grad_norm": 0.5099969506263733,
"learning_rate": 7.807228915662652e-05,
"loss": 0.4085,
"step": 3500
},
{
"epoch": 0.256,
"grad_norm": 0.3821808099746704,
"learning_rate": 7.775100401606426e-05,
"loss": 0.3811,
"step": 4000
},
{
"epoch": 0.288,
"grad_norm": 0.45157989859580994,
"learning_rate": 7.742971887550202e-05,
"loss": 0.3604,
"step": 4500
},
{
"epoch": 0.32,
"grad_norm": 0.3053194582462311,
"learning_rate": 7.710843373493976e-05,
"loss": 0.3442,
"step": 5000
},
{
"epoch": 0.352,
"grad_norm": 0.45309627056121826,
"learning_rate": 7.678714859437751e-05,
"loss": 0.3308,
"step": 5500
},
{
"epoch": 0.384,
"grad_norm": 0.40403613448143005,
"learning_rate": 7.646586345381526e-05,
"loss": 0.3254,
"step": 6000
},
{
"epoch": 0.416,
"grad_norm": 0.32140398025512695,
"learning_rate": 7.614457831325302e-05,
"loss": 0.3142,
"step": 6500
},
{
"epoch": 0.448,
"grad_norm": 0.36259984970092773,
"learning_rate": 7.582329317269078e-05,
"loss": 0.304,
"step": 7000
},
{
"epoch": 0.48,
"grad_norm": 27.76270294189453,
"learning_rate": 7.550200803212852e-05,
"loss": 0.7921,
"step": 7500
},
{
"epoch": 0.512,
"grad_norm": 0.35605108737945557,
"learning_rate": 7.518072289156628e-05,
"loss": 0.4819,
"step": 8000
},
{
"epoch": 0.544,
"grad_norm": 0.29919707775115967,
"learning_rate": 7.485943775100402e-05,
"loss": 0.2955,
"step": 8500
},
{
"epoch": 0.576,
"grad_norm": 0.27910181879997253,
"learning_rate": 7.453815261044178e-05,
"loss": 0.2891,
"step": 9000
},
{
"epoch": 0.608,
"grad_norm": 0.3084375858306885,
"learning_rate": 7.421686746987952e-05,
"loss": 0.2839,
"step": 9500
},
{
"epoch": 0.64,
"grad_norm": 1.0127158164978027,
"learning_rate": 7.389558232931728e-05,
"loss": 0.2975,
"step": 10000
},
{
"epoch": 0.672,
"grad_norm": 0.29704657196998596,
"learning_rate": 7.357429718875502e-05,
"loss": 0.2784,
"step": 10500
},
{
"epoch": 0.704,
"grad_norm": 0.26352909207344055,
"learning_rate": 7.325301204819278e-05,
"loss": 0.2722,
"step": 11000
},
{
"epoch": 0.736,
"grad_norm": 0.3012610077857971,
"learning_rate": 7.293172690763053e-05,
"loss": 0.2671,
"step": 11500
},
{
"epoch": 0.768,
"grad_norm": 239.10655212402344,
"learning_rate": 7.261044176706828e-05,
"loss": 0.268,
"step": 12000
},
{
"epoch": 0.8,
"grad_norm": 0.2739589214324951,
"learning_rate": 7.228915662650603e-05,
"loss": 0.2758,
"step": 12500
},
{
"epoch": 0.8,
"eval_loss": 0.1865103393793106,
"eval_runtime": 13.4887,
"eval_samples_per_second": 148.272,
"eval_steps_per_second": 6.227,
"step": 12500
},
{
"epoch": 0.832,
"grad_norm": 0.3086440861225128,
"learning_rate": 7.196787148594378e-05,
"loss": 0.258,
"step": 13000
},
{
"epoch": 0.864,
"grad_norm": 0.25051313638687134,
"learning_rate": 7.164658634538153e-05,
"loss": 0.255,
"step": 13500
},
{
"epoch": 0.896,
"grad_norm": 0.2483261674642563,
"learning_rate": 7.132530120481928e-05,
"loss": 0.2505,
"step": 14000
},
{
"epoch": 0.928,
"grad_norm": 0.2732410728931427,
"learning_rate": 7.100401606425703e-05,
"loss": 0.2456,
"step": 14500
},
{
"epoch": 0.96,
"grad_norm": 0.2605254054069519,
"learning_rate": 7.068273092369479e-05,
"loss": 0.2487,
"step": 15000
},
{
"epoch": 0.992,
"grad_norm": 0.294114887714386,
"learning_rate": 7.036144578313253e-05,
"loss": 0.2448,
"step": 15500
},
{
"epoch": 1.024,
"grad_norm": 0.27836671471595764,
"learning_rate": 7.004016064257029e-05,
"loss": 0.2342,
"step": 16000
},
{
"epoch": 1.056,
"grad_norm": 0.2619302272796631,
"learning_rate": 6.971887550200803e-05,
"loss": 0.2282,
"step": 16500
},
{
"epoch": 1.088,
"grad_norm": 0.2845689058303833,
"learning_rate": 6.939759036144579e-05,
"loss": 0.2254,
"step": 17000
},
{
"epoch": 1.12,
"grad_norm": 0.3349343538284302,
"learning_rate": 6.907630522088353e-05,
"loss": 0.2226,
"step": 17500
},
{
"epoch": 1.152,
"grad_norm": 0.293632835149765,
"learning_rate": 6.875502008032129e-05,
"loss": 0.2195,
"step": 18000
},
{
"epoch": 1.184,
"grad_norm": 0.2546076476573944,
"learning_rate": 6.843373493975903e-05,
"loss": 0.2212,
"step": 18500
},
{
"epoch": 1.216,
"grad_norm": 0.25859034061431885,
"learning_rate": 6.811244979919679e-05,
"loss": 0.2197,
"step": 19000
},
{
"epoch": 1.248,
"grad_norm": 0.25642773509025574,
"learning_rate": 6.779116465863455e-05,
"loss": 0.2167,
"step": 19500
},
{
"epoch": 1.28,
"grad_norm": 0.25777679681777954,
"learning_rate": 6.74698795180723e-05,
"loss": 0.2171,
"step": 20000
},
{
"epoch": 1.312,
"grad_norm": 0.28767868876457214,
"learning_rate": 6.714859437751005e-05,
"loss": 0.2122,
"step": 20500
},
{
"epoch": 1.3439999999999999,
"grad_norm": 0.24645094573497772,
"learning_rate": 6.682730923694779e-05,
"loss": 0.2121,
"step": 21000
},
{
"epoch": 1.376,
"grad_norm": 0.27973559498786926,
"learning_rate": 6.650602409638555e-05,
"loss": 0.209,
"step": 21500
},
{
"epoch": 1.408,
"grad_norm": 0.2312808781862259,
"learning_rate": 6.618473895582329e-05,
"loss": 0.207,
"step": 22000
},
{
"epoch": 1.44,
"grad_norm": 0.32198190689086914,
"learning_rate": 6.586345381526105e-05,
"loss": 0.2062,
"step": 22500
},
{
"epoch": 1.472,
"grad_norm": 0.26178357005119324,
"learning_rate": 6.55421686746988e-05,
"loss": 0.2054,
"step": 23000
},
{
"epoch": 1.504,
"grad_norm": 0.2309255599975586,
"learning_rate": 6.522088353413655e-05,
"loss": 0.2029,
"step": 23500
},
{
"epoch": 1.536,
"grad_norm": 0.22608640789985657,
"learning_rate": 6.48995983935743e-05,
"loss": 0.2038,
"step": 24000
},
{
"epoch": 1.568,
"grad_norm": 0.20961375534534454,
"learning_rate": 6.457831325301206e-05,
"loss": 0.2016,
"step": 24500
},
{
"epoch": 1.6,
"grad_norm": 0.1852613389492035,
"learning_rate": 6.42570281124498e-05,
"loss": 0.2003,
"step": 25000
},
{
"epoch": 1.6,
"eval_loss": 0.13480305671691895,
"eval_runtime": 13.4963,
"eval_samples_per_second": 148.189,
"eval_steps_per_second": 6.224,
"step": 25000
},
{
"epoch": 1.6320000000000001,
"grad_norm": 0.25356176495552063,
"learning_rate": 6.393574297188755e-05,
"loss": 0.1991,
"step": 25500
},
{
"epoch": 1.6640000000000001,
"grad_norm": 0.23418042063713074,
"learning_rate": 6.36144578313253e-05,
"loss": 0.1993,
"step": 26000
},
{
"epoch": 1.696,
"grad_norm": 0.4756409823894501,
"learning_rate": 6.329317269076305e-05,
"loss": 0.1962,
"step": 26500
},
{
"epoch": 1.728,
"grad_norm": 0.31074902415275574,
"learning_rate": 6.29718875502008e-05,
"loss": 0.1946,
"step": 27000
},
{
"epoch": 1.76,
"grad_norm": 0.2518517076969147,
"learning_rate": 6.265060240963856e-05,
"loss": 0.1936,
"step": 27500
},
{
"epoch": 1.792,
"grad_norm": 0.23294122517108917,
"learning_rate": 6.232931726907632e-05,
"loss": 0.1931,
"step": 28000
},
{
"epoch": 1.8239999999999998,
"grad_norm": 0.30960217118263245,
"learning_rate": 6.200803212851406e-05,
"loss": 0.1916,
"step": 28500
},
{
"epoch": 1.8559999999999999,
"grad_norm": 0.2612380385398865,
"learning_rate": 6.168674698795182e-05,
"loss": 0.1941,
"step": 29000
},
{
"epoch": 1.888,
"grad_norm": 0.23748044669628143,
"learning_rate": 6.136546184738956e-05,
"loss": 0.1912,
"step": 29500
},
{
"epoch": 1.92,
"grad_norm": 0.21203479170799255,
"learning_rate": 6.104417670682732e-05,
"loss": 0.1905,
"step": 30000
},
{
"epoch": 1.952,
"grad_norm": 0.23570066690444946,
"learning_rate": 6.072289156626506e-05,
"loss": 0.1882,
"step": 30500
},
{
"epoch": 1.984,
"grad_norm": 0.24766422808170319,
"learning_rate": 6.040160642570281e-05,
"loss": 0.1908,
"step": 31000
},
{
"epoch": 2.016,
"grad_norm": 0.21270865201950073,
"learning_rate": 6.008032128514057e-05,
"loss": 0.1835,
"step": 31500
},
{
"epoch": 2.048,
"grad_norm": 0.2123081386089325,
"learning_rate": 5.975903614457831e-05,
"loss": 0.1765,
"step": 32000
},
{
"epoch": 2.08,
"grad_norm": 0.3412201404571533,
"learning_rate": 5.943775100401607e-05,
"loss": 0.1931,
"step": 32500
},
{
"epoch": 2.112,
"grad_norm": 0.2280445396900177,
"learning_rate": 5.911646586345382e-05,
"loss": 0.1893,
"step": 33000
},
{
"epoch": 2.144,
"grad_norm": 0.2307668924331665,
"learning_rate": 5.8795180722891576e-05,
"loss": 0.1826,
"step": 33500
},
{
"epoch": 2.176,
"grad_norm": 0.22051069140434265,
"learning_rate": 5.847389558232932e-05,
"loss": 0.1751,
"step": 34000
},
{
"epoch": 2.208,
"grad_norm": 0.28806522488594055,
"learning_rate": 5.8152610441767076e-05,
"loss": 0.1771,
"step": 34500
},
{
"epoch": 2.24,
"grad_norm": 0.19143815338611603,
"learning_rate": 5.7831325301204826e-05,
"loss": 0.1774,
"step": 35000
},
{
"epoch": 2.2720000000000002,
"grad_norm": 0.2986365556716919,
"learning_rate": 5.751004016064257e-05,
"loss": 0.1752,
"step": 35500
},
{
"epoch": 2.304,
"grad_norm": 0.19981278479099274,
"learning_rate": 5.7188755020080326e-05,
"loss": 0.1756,
"step": 36000
},
{
"epoch": 2.336,
"grad_norm": 0.2546658515930176,
"learning_rate": 5.6867469879518076e-05,
"loss": 0.1741,
"step": 36500
},
{
"epoch": 2.368,
"grad_norm": 0.1719229370355606,
"learning_rate": 5.6546184738955826e-05,
"loss": 0.174,
"step": 37000
},
{
"epoch": 2.4,
"grad_norm": 0.22920982539653778,
"learning_rate": 5.6224899598393576e-05,
"loss": 0.1707,
"step": 37500
},
{
"epoch": 2.4,
"eval_loss": 0.11245977133512497,
"eval_runtime": 13.3722,
"eval_samples_per_second": 149.565,
"eval_steps_per_second": 6.282,
"step": 37500
},
{
"epoch": 2.432,
"grad_norm": 0.22950303554534912,
"learning_rate": 5.590361445783133e-05,
"loss": 0.1715,
"step": 38000
},
{
"epoch": 2.464,
"grad_norm": 0.19281500577926636,
"learning_rate": 5.5582329317269076e-05,
"loss": 0.1719,
"step": 38500
},
{
"epoch": 2.496,
"grad_norm": 0.3043808043003082,
"learning_rate": 5.526104417670683e-05,
"loss": 0.1709,
"step": 39000
},
{
"epoch": 2.528,
"grad_norm": 0.2020847350358963,
"learning_rate": 5.493975903614458e-05,
"loss": 0.1687,
"step": 39500
},
{
"epoch": 2.56,
"grad_norm": 0.2701270878314972,
"learning_rate": 5.461847389558234e-05,
"loss": 0.1689,
"step": 40000
},
{
"epoch": 2.592,
"grad_norm": 0.20151746273040771,
"learning_rate": 5.429718875502008e-05,
"loss": 0.1703,
"step": 40500
},
{
"epoch": 2.624,
"grad_norm": 0.28613436222076416,
"learning_rate": 5.397590361445783e-05,
"loss": 0.1668,
"step": 41000
},
{
"epoch": 2.656,
"grad_norm": 0.24828064441680908,
"learning_rate": 5.365461847389559e-05,
"loss": 0.1671,
"step": 41500
},
{
"epoch": 2.6879999999999997,
"grad_norm": 0.2424352616071701,
"learning_rate": 5.333333333333333e-05,
"loss": 0.1678,
"step": 42000
},
{
"epoch": 2.7199999999999998,
"grad_norm": 0.1992502510547638,
"learning_rate": 5.301204819277109e-05,
"loss": 0.165,
"step": 42500
},
{
"epoch": 2.752,
"grad_norm": 0.19437122344970703,
"learning_rate": 5.269076305220884e-05,
"loss": 0.1658,
"step": 43000
},
{
"epoch": 2.784,
"grad_norm": 0.2736295461654663,
"learning_rate": 5.23694779116466e-05,
"loss": 0.165,
"step": 43500
},
{
"epoch": 2.816,
"grad_norm": 0.20108367502689362,
"learning_rate": 5.204819277108434e-05,
"loss": 0.1661,
"step": 44000
},
{
"epoch": 2.848,
"grad_norm": 0.17667348682880402,
"learning_rate": 5.17269076305221e-05,
"loss": 0.163,
"step": 44500
},
{
"epoch": 2.88,
"grad_norm": 0.24210619926452637,
"learning_rate": 5.140562248995984e-05,
"loss": 0.1634,
"step": 45000
},
{
"epoch": 2.912,
"grad_norm": 0.2011321634054184,
"learning_rate": 5.108433734939759e-05,
"loss": 0.1634,
"step": 45500
},
{
"epoch": 2.944,
"grad_norm": 0.1979280710220337,
"learning_rate": 5.076305220883535e-05,
"loss": 0.162,
"step": 46000
},
{
"epoch": 2.976,
"grad_norm": 0.23149700462818146,
"learning_rate": 5.044176706827309e-05,
"loss": 0.1605,
"step": 46500
},
{
"epoch": 3.008,
"grad_norm": 0.1558334231376648,
"learning_rate": 5.012048192771085e-05,
"loss": 0.1575,
"step": 47000
},
{
"epoch": 3.04,
"grad_norm": 0.251295268535614,
"learning_rate": 4.97991967871486e-05,
"loss": 0.1518,
"step": 47500
},
{
"epoch": 3.072,
"grad_norm": 0.1870911866426468,
"learning_rate": 4.9477911646586354e-05,
"loss": 0.151,
"step": 48000
},
{
"epoch": 3.104,
"grad_norm": 0.19055207073688507,
"learning_rate": 4.91566265060241e-05,
"loss": 0.1506,
"step": 48500
},
{
"epoch": 3.136,
"grad_norm": 0.19039425253868103,
"learning_rate": 4.8835341365461854e-05,
"loss": 0.1502,
"step": 49000
},
{
"epoch": 3.168,
"grad_norm": 0.2273644655942917,
"learning_rate": 4.8514056224899604e-05,
"loss": 0.1515,
"step": 49500
},
{
"epoch": 3.2,
"grad_norm": 0.20844458043575287,
"learning_rate": 4.819277108433736e-05,
"loss": 0.15,
"step": 50000
},
{
"epoch": 3.2,
"eval_loss": 0.09431243687868118,
"eval_runtime": 13.3841,
"eval_samples_per_second": 149.431,
"eval_steps_per_second": 6.276,
"step": 50000
},
{
"epoch": 3.232,
"grad_norm": 0.21888093650341034,
"learning_rate": 4.7871485943775104e-05,
"loss": 0.1514,
"step": 50500
},
{
"epoch": 3.2640000000000002,
"grad_norm": 0.1797676682472229,
"learning_rate": 4.7550200803212854e-05,
"loss": 0.1505,
"step": 51000
},
{
"epoch": 3.296,
"grad_norm": 0.17472127079963684,
"learning_rate": 4.7228915662650604e-05,
"loss": 0.15,
"step": 51500
},
{
"epoch": 3.328,
"grad_norm": 0.1877003014087677,
"learning_rate": 4.6907630522088354e-05,
"loss": 0.1501,
"step": 52000
},
{
"epoch": 3.36,
"grad_norm": 0.2039192020893097,
"learning_rate": 4.658634538152611e-05,
"loss": 0.1509,
"step": 52500
},
{
"epoch": 3.392,
"grad_norm": 0.19264955818653107,
"learning_rate": 4.6265060240963854e-05,
"loss": 0.147,
"step": 53000
},
{
"epoch": 3.424,
"grad_norm": 0.2461443841457367,
"learning_rate": 4.594377510040161e-05,
"loss": 0.1473,
"step": 53500
},
{
"epoch": 3.456,
"grad_norm": 0.245579794049263,
"learning_rate": 4.562248995983936e-05,
"loss": 0.148,
"step": 54000
},
{
"epoch": 3.488,
"grad_norm": 0.2419605553150177,
"learning_rate": 4.530120481927712e-05,
"loss": 0.1483,
"step": 54500
},
{
"epoch": 3.52,
"grad_norm": 0.21280065178871155,
"learning_rate": 4.497991967871486e-05,
"loss": 0.1465,
"step": 55000
},
{
"epoch": 3.552,
"grad_norm": 0.19970615208148956,
"learning_rate": 4.465863453815261e-05,
"loss": 0.1479,
"step": 55500
},
{
"epoch": 3.584,
"grad_norm": 0.178068608045578,
"learning_rate": 4.433734939759037e-05,
"loss": 0.1469,
"step": 56000
},
{
"epoch": 3.616,
"grad_norm": 0.2007550597190857,
"learning_rate": 4.401606425702811e-05,
"loss": 0.1484,
"step": 56500
},
{
"epoch": 3.648,
"grad_norm": 0.2703693211078644,
"learning_rate": 4.369477911646587e-05,
"loss": 0.1461,
"step": 57000
},
{
"epoch": 3.68,
"grad_norm": 0.18838930130004883,
"learning_rate": 4.337349397590362e-05,
"loss": 0.1467,
"step": 57500
},
{
"epoch": 3.7119999999999997,
"grad_norm": 0.22705510258674622,
"learning_rate": 4.3052208835341375e-05,
"loss": 0.1471,
"step": 58000
},
{
"epoch": 3.7439999999999998,
"grad_norm": 0.18866540491580963,
"learning_rate": 4.273092369477912e-05,
"loss": 0.1464,
"step": 58500
},
{
"epoch": 3.776,
"grad_norm": 0.16808009147644043,
"learning_rate": 4.2409638554216875e-05,
"loss": 0.1469,
"step": 59000
},
{
"epoch": 3.808,
"grad_norm": 0.20790338516235352,
"learning_rate": 4.208835341365462e-05,
"loss": 0.1455,
"step": 59500
},
{
"epoch": 3.84,
"grad_norm": 0.20283524692058563,
"learning_rate": 4.1767068273092375e-05,
"loss": 0.1453,
"step": 60000
},
{
"epoch": 3.872,
"grad_norm": 0.18522211909294128,
"learning_rate": 4.1445783132530125e-05,
"loss": 0.1449,
"step": 60500
},
{
"epoch": 3.904,
"grad_norm": 0.2298567146062851,
"learning_rate": 4.112449799196787e-05,
"loss": 0.1442,
"step": 61000
},
{
"epoch": 3.936,
"grad_norm": 0.23237478733062744,
"learning_rate": 4.0803212851405625e-05,
"loss": 0.1445,
"step": 61500
},
{
"epoch": 3.968,
"grad_norm": 0.18708902597427368,
"learning_rate": 4.0481927710843375e-05,
"loss": 0.1436,
"step": 62000
},
{
"epoch": 4.0,
"grad_norm": 0.2235335260629654,
"learning_rate": 4.016064257028113e-05,
"loss": 0.145,
"step": 62500
},
{
"epoch": 4.0,
"eval_loss": 0.08151204138994217,
"eval_runtime": 13.1737,
"eval_samples_per_second": 151.818,
"eval_steps_per_second": 6.376,
"step": 62500
},
{
"epoch": 4.032,
"grad_norm": 0.27586033940315247,
"learning_rate": 3.9839357429718875e-05,
"loss": 0.1361,
"step": 63000
},
{
"epoch": 4.064,
"grad_norm": 0.20399342477321625,
"learning_rate": 3.9518072289156625e-05,
"loss": 0.1359,
"step": 63500
},
{
"epoch": 4.096,
"grad_norm": 0.2167077660560608,
"learning_rate": 3.919678714859438e-05,
"loss": 0.1349,
"step": 64000
},
{
"epoch": 4.128,
"grad_norm": 0.2217278927564621,
"learning_rate": 3.887550200803213e-05,
"loss": 0.1343,
"step": 64500
},
{
"epoch": 4.16,
"grad_norm": 0.17411163449287415,
"learning_rate": 3.855421686746988e-05,
"loss": 0.1353,
"step": 65000
},
{
"epoch": 4.192,
"grad_norm": 0.2262706607580185,
"learning_rate": 3.823293172690763e-05,
"loss": 0.1355,
"step": 65500
},
{
"epoch": 4.224,
"grad_norm": 0.21051813662052155,
"learning_rate": 3.791164658634539e-05,
"loss": 0.1351,
"step": 66000
},
{
"epoch": 4.256,
"grad_norm": 0.2202002853155136,
"learning_rate": 3.759036144578314e-05,
"loss": 0.1362,
"step": 66500
},
{
"epoch": 4.288,
"grad_norm": 0.1532248854637146,
"learning_rate": 3.726907630522089e-05,
"loss": 0.1338,
"step": 67000
},
{
"epoch": 4.32,
"grad_norm": 0.25249359011650085,
"learning_rate": 3.694779116465864e-05,
"loss": 0.1334,
"step": 67500
},
{
"epoch": 4.352,
"grad_norm": 0.2392909973859787,
"learning_rate": 3.662650602409639e-05,
"loss": 0.1354,
"step": 68000
},
{
"epoch": 4.384,
"grad_norm": 0.3180345892906189,
"learning_rate": 3.630522088353414e-05,
"loss": 0.1353,
"step": 68500
},
{
"epoch": 4.416,
"grad_norm": 0.27343523502349854,
"learning_rate": 3.598393574297189e-05,
"loss": 0.1354,
"step": 69000
},
{
"epoch": 4.448,
"grad_norm": 0.17806372046470642,
"learning_rate": 3.566265060240964e-05,
"loss": 0.1351,
"step": 69500
},
{
"epoch": 4.48,
"grad_norm": 0.17694541811943054,
"learning_rate": 3.5341365461847396e-05,
"loss": 0.135,
"step": 70000
},
{
"epoch": 4.5120000000000005,
"grad_norm": 0.1796264797449112,
"learning_rate": 3.5020080321285146e-05,
"loss": 0.1326,
"step": 70500
},
{
"epoch": 4.5440000000000005,
"grad_norm": 0.16896295547485352,
"learning_rate": 3.4698795180722896e-05,
"loss": 0.1327,
"step": 71000
},
{
"epoch": 4.576,
"grad_norm": 0.16427506506443024,
"learning_rate": 3.4377510040160646e-05,
"loss": 0.1341,
"step": 71500
},
{
"epoch": 4.608,
"grad_norm": 0.15947696566581726,
"learning_rate": 3.4056224899598396e-05,
"loss": 0.1336,
"step": 72000
},
{
"epoch": 4.64,
"grad_norm": 0.24063943326473236,
"learning_rate": 3.373493975903615e-05,
"loss": 0.1326,
"step": 72500
},
{
"epoch": 4.672,
"grad_norm": 0.2784833610057831,
"learning_rate": 3.3413654618473896e-05,
"loss": 0.1313,
"step": 73000
},
{
"epoch": 4.704,
"grad_norm": 0.1624738872051239,
"learning_rate": 3.3092369477911646e-05,
"loss": 0.1343,
"step": 73500
},
{
"epoch": 4.736,
"grad_norm": 0.163747176527977,
"learning_rate": 3.27710843373494e-05,
"loss": 0.1332,
"step": 74000
},
{
"epoch": 4.768,
"grad_norm": 0.19286634027957916,
"learning_rate": 3.244979919678715e-05,
"loss": 0.1314,
"step": 74500
},
{
"epoch": 4.8,
"grad_norm": 0.2070024311542511,
"learning_rate": 3.21285140562249e-05,
"loss": 0.1323,
"step": 75000
},
{
"epoch": 4.8,
"eval_loss": 0.07321055233478546,
"eval_runtime": 13.3287,
"eval_samples_per_second": 150.052,
"eval_steps_per_second": 6.302,
"step": 75000
},
{
"epoch": 4.832,
"grad_norm": 0.25665566325187683,
"learning_rate": 3.180722891566265e-05,
"loss": 0.133,
"step": 75500
},
{
"epoch": 4.864,
"grad_norm": 0.18933174014091492,
"learning_rate": 3.14859437751004e-05,
"loss": 0.1334,
"step": 76000
},
{
"epoch": 4.896,
"grad_norm": 0.20106372237205505,
"learning_rate": 3.116465863453816e-05,
"loss": 0.1318,
"step": 76500
},
{
"epoch": 4.928,
"grad_norm": 0.23300665616989136,
"learning_rate": 3.084337349397591e-05,
"loss": 0.1314,
"step": 77000
},
{
"epoch": 4.96,
"grad_norm": 0.2776864171028137,
"learning_rate": 3.052208835341366e-05,
"loss": 0.1307,
"step": 77500
},
{
"epoch": 4.992,
"grad_norm": 0.17158937454223633,
"learning_rate": 3.0200803212851406e-05,
"loss": 0.1328,
"step": 78000
},
{
"epoch": 5.024,
"grad_norm": 0.19341541826725006,
"learning_rate": 2.9879518072289156e-05,
"loss": 0.1255,
"step": 78500
},
{
"epoch": 5.056,
"grad_norm": 0.1820104718208313,
"learning_rate": 2.955823293172691e-05,
"loss": 0.1239,
"step": 79000
},
{
"epoch": 5.088,
"grad_norm": 0.19204023480415344,
"learning_rate": 2.923694779116466e-05,
"loss": 0.1251,
"step": 79500
},
{
"epoch": 5.12,
"grad_norm": 0.20374265313148499,
"learning_rate": 2.8915662650602413e-05,
"loss": 0.1238,
"step": 80000
},
{
"epoch": 5.152,
"grad_norm": 0.17607811093330383,
"learning_rate": 2.8594377510040163e-05,
"loss": 0.1242,
"step": 80500
},
{
"epoch": 5.184,
"grad_norm": 0.18394358456134796,
"learning_rate": 2.8273092369477913e-05,
"loss": 0.1246,
"step": 81000
},
{
"epoch": 5.216,
"grad_norm": 0.1720678061246872,
"learning_rate": 2.7951807228915666e-05,
"loss": 0.1234,
"step": 81500
},
{
"epoch": 5.248,
"grad_norm": 0.1649816781282425,
"learning_rate": 2.7630522088353417e-05,
"loss": 0.1246,
"step": 82000
},
{
"epoch": 5.28,
"grad_norm": 0.24089868366718292,
"learning_rate": 2.730923694779117e-05,
"loss": 0.1236,
"step": 82500
},
{
"epoch": 5.312,
"grad_norm": 0.16703809797763824,
"learning_rate": 2.6987951807228917e-05,
"loss": 0.1235,
"step": 83000
},
{
"epoch": 5.344,
"grad_norm": 0.2375577837228775,
"learning_rate": 2.6666666666666667e-05,
"loss": 0.1255,
"step": 83500
},
{
"epoch": 5.376,
"grad_norm": 0.20844422280788422,
"learning_rate": 2.634538152610442e-05,
"loss": 0.124,
"step": 84000
},
{
"epoch": 5.408,
"grad_norm": 0.15090999007225037,
"learning_rate": 2.602409638554217e-05,
"loss": 0.1236,
"step": 84500
},
{
"epoch": 5.44,
"grad_norm": 0.1982196420431137,
"learning_rate": 2.570281124497992e-05,
"loss": 0.1234,
"step": 85000
},
{
"epoch": 5.4719999999999995,
"grad_norm": 0.20548874139785767,
"learning_rate": 2.5381526104417673e-05,
"loss": 0.1237,
"step": 85500
},
{
"epoch": 5.504,
"grad_norm": 0.1553628295660019,
"learning_rate": 2.5060240963855423e-05,
"loss": 0.1241,
"step": 86000
},
{
"epoch": 5.536,
"grad_norm": 0.20827996730804443,
"learning_rate": 2.4738955823293177e-05,
"loss": 0.1236,
"step": 86500
},
{
"epoch": 5.568,
"grad_norm": 0.26525431871414185,
"learning_rate": 2.4417670682730927e-05,
"loss": 0.1244,
"step": 87000
},
{
"epoch": 5.6,
"grad_norm": 0.16824448108673096,
"learning_rate": 2.409638554216868e-05,
"loss": 0.124,
"step": 87500
},
{
"epoch": 5.6,
"eval_loss": 0.06600421667098999,
"eval_runtime": 13.2001,
"eval_samples_per_second": 151.514,
"eval_steps_per_second": 6.364,
"step": 87500
},
{
"epoch": 5.632,
"grad_norm": 0.19588659703731537,
"learning_rate": 2.3775100401606427e-05,
"loss": 0.1239,
"step": 88000
},
{
"epoch": 5.664,
"grad_norm": 0.20829927921295166,
"learning_rate": 2.3453815261044177e-05,
"loss": 0.1236,
"step": 88500
},
{
"epoch": 5.696,
"grad_norm": 0.17067208886146545,
"learning_rate": 2.3132530120481927e-05,
"loss": 0.1246,
"step": 89000
},
{
"epoch": 5.728,
"grad_norm": 0.17796407639980316,
"learning_rate": 2.281124497991968e-05,
"loss": 0.123,
"step": 89500
},
{
"epoch": 5.76,
"grad_norm": 0.17811580002307892,
"learning_rate": 2.248995983935743e-05,
"loss": 0.1247,
"step": 90000
},
{
"epoch": 5.792,
"grad_norm": 0.16586844623088837,
"learning_rate": 2.2168674698795184e-05,
"loss": 0.1226,
"step": 90500
},
{
"epoch": 5.824,
"grad_norm": 0.16634885966777802,
"learning_rate": 2.1847389558232934e-05,
"loss": 0.1239,
"step": 91000
},
{
"epoch": 5.856,
"grad_norm": 0.1891159564256668,
"learning_rate": 2.1526104417670687e-05,
"loss": 0.1234,
"step": 91500
},
{
"epoch": 5.888,
"grad_norm": 0.2127494215965271,
"learning_rate": 2.1204819277108437e-05,
"loss": 0.1227,
"step": 92000
},
{
"epoch": 5.92,
"grad_norm": 0.15109600126743317,
"learning_rate": 2.0883534136546187e-05,
"loss": 0.1218,
"step": 92500
},
{
"epoch": 5.952,
"grad_norm": 0.14382487535476685,
"learning_rate": 2.0562248995983934e-05,
"loss": 0.1216,
"step": 93000
},
{
"epoch": 5.984,
"grad_norm": 0.2122729867696762,
"learning_rate": 2.0240963855421687e-05,
"loss": 0.1248,
"step": 93500
},
{
"epoch": 6.016,
"grad_norm": 0.19725599884986877,
"learning_rate": 1.9919678714859437e-05,
"loss": 0.12,
"step": 94000
},
{
"epoch": 6.048,
"grad_norm": 0.18302911520004272,
"learning_rate": 1.959839357429719e-05,
"loss": 0.1174,
"step": 94500
},
{
"epoch": 6.08,
"grad_norm": 0.19952338933944702,
"learning_rate": 1.927710843373494e-05,
"loss": 0.1176,
"step": 95000
},
{
"epoch": 6.112,
"grad_norm": 0.16824807226657867,
"learning_rate": 1.8955823293172694e-05,
"loss": 0.1174,
"step": 95500
},
{
"epoch": 6.144,
"grad_norm": 0.22110256552696228,
"learning_rate": 1.8634538152610444e-05,
"loss": 0.1172,
"step": 96000
},
{
"epoch": 6.176,
"grad_norm": 0.20097705721855164,
"learning_rate": 1.8313253012048194e-05,
"loss": 0.1171,
"step": 96500
},
{
"epoch": 6.208,
"grad_norm": 0.1569439023733139,
"learning_rate": 1.7991967871485944e-05,
"loss": 0.1177,
"step": 97000
},
{
"epoch": 6.24,
"grad_norm": 0.1618974804878235,
"learning_rate": 1.7670682730923698e-05,
"loss": 0.1169,
"step": 97500
},
{
"epoch": 6.272,
"grad_norm": 0.2146083265542984,
"learning_rate": 1.7349397590361448e-05,
"loss": 0.1172,
"step": 98000
},
{
"epoch": 6.304,
"grad_norm": 0.16460789740085602,
"learning_rate": 1.7028112449799198e-05,
"loss": 0.1166,
"step": 98500
},
{
"epoch": 6.336,
"grad_norm": 0.20428918302059174,
"learning_rate": 1.6706827309236948e-05,
"loss": 0.1177,
"step": 99000
},
{
"epoch": 6.368,
"grad_norm": 0.1996203064918518,
"learning_rate": 1.63855421686747e-05,
"loss": 0.1172,
"step": 99500
},
{
"epoch": 6.4,
"grad_norm": 0.22684213519096375,
"learning_rate": 1.606425702811245e-05,
"loss": 0.118,
"step": 100000
},
{
"epoch": 6.4,
"eval_loss": 0.061658285558223724,
"eval_runtime": 13.1578,
"eval_samples_per_second": 152.001,
"eval_steps_per_second": 6.384,
"step": 100000
}
],
"logging_steps": 500,
"max_steps": 125000,
"num_input_tokens_seen": 0,
"num_train_epochs": 8,
"save_steps": 12500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.459712365560463e+18,
"train_batch_size": 24,
"trial_name": null,
"trial_params": null
}