{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.0679668152024774,
"eval_steps": 500,
"global_step": 14000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00024273862572313354,
"grad_norm": 3.890625,
"learning_rate": 7.929444129783965e-08,
"loss": 2.5681,
"step": 50
},
{
"epoch": 0.0004854772514462671,
"grad_norm": 6.0625,
"learning_rate": 1.6020713649971682e-07,
"loss": 2.6107,
"step": 100
},
{
"epoch": 0.0007282158771694006,
"grad_norm": 4.875,
"learning_rate": 2.41119831701594e-07,
"loss": 2.5511,
"step": 150
},
{
"epoch": 0.0009709545028925341,
"grad_norm": 4.15625,
"learning_rate": 3.2203252690347116e-07,
"loss": 2.5521,
"step": 200
},
{
"epoch": 0.0012136931286156677,
"grad_norm": 3.625,
"learning_rate": 4.0294522210534837e-07,
"loss": 2.5685,
"step": 250
},
{
"epoch": 0.0014564317543388013,
"grad_norm": 4.03125,
"learning_rate": 4.838579173072256e-07,
"loss": 2.5193,
"step": 300
},
{
"epoch": 0.0016991703800619347,
"grad_norm": 4.53125,
"learning_rate": 5.647706125091027e-07,
"loss": 2.5484,
"step": 350
},
{
"epoch": 0.0019419090057850683,
"grad_norm": 4.59375,
"learning_rate": 6.456833077109799e-07,
"loss": 2.5319,
"step": 400
},
{
"epoch": 0.0021846476315082017,
"grad_norm": 5.375,
"learning_rate": 7.265960029128571e-07,
"loss": 2.5113,
"step": 450
},
{
"epoch": 0.0024273862572313353,
"grad_norm": 4.09375,
"learning_rate": 8.075086981147342e-07,
"loss": 2.5203,
"step": 500
},
{
"epoch": 0.002670124882954469,
"grad_norm": 3.65625,
"learning_rate": 8.884213933166115e-07,
"loss": 2.5355,
"step": 550
},
{
"epoch": 0.0029128635086776026,
"grad_norm": 5.40625,
"learning_rate": 9.693340885184885e-07,
"loss": 2.4935,
"step": 600
},
{
"epoch": 0.003155602134400736,
"grad_norm": 4.03125,
"learning_rate": 1.0502467837203658e-06,
"loss": 2.5222,
"step": 650
},
{
"epoch": 0.0033983407601238694,
"grad_norm": 3.359375,
"learning_rate": 1.131159478922243e-06,
"loss": 2.5488,
"step": 700
},
{
"epoch": 0.003641079385847003,
"grad_norm": 4.3125,
"learning_rate": 1.2120721741241203e-06,
"loss": 2.5078,
"step": 750
},
{
"epoch": 0.0038838180115701366,
"grad_norm": 3.3125,
"learning_rate": 1.2929848693259976e-06,
"loss": 2.4933,
"step": 800
},
{
"epoch": 0.00412655663729327,
"grad_norm": 10.5625,
"learning_rate": 1.3738975645278747e-06,
"loss": 2.5072,
"step": 850
},
{
"epoch": 0.004369295263016403,
"grad_norm": 3.6875,
"learning_rate": 1.4548102597297518e-06,
"loss": 2.4601,
"step": 900
},
{
"epoch": 0.0046120338887395375,
"grad_norm": 3.953125,
"learning_rate": 1.5357229549316289e-06,
"loss": 2.4811,
"step": 950
},
{
"epoch": 0.004854772514462671,
"grad_norm": 3.515625,
"learning_rate": 1.6166356501335062e-06,
"loss": 2.5233,
"step": 1000
},
{
"epoch": 0.005097511140185805,
"grad_norm": 4.0625,
"learning_rate": 1.6975483453353833e-06,
"loss": 2.4694,
"step": 1050
},
{
"epoch": 0.005340249765908938,
"grad_norm": 2.9375,
"learning_rate": 1.7784610405372604e-06,
"loss": 2.4865,
"step": 1100
},
{
"epoch": 0.005582988391632071,
"grad_norm": 3.515625,
"learning_rate": 1.8593737357391375e-06,
"loss": 2.4639,
"step": 1150
},
{
"epoch": 0.005825727017355205,
"grad_norm": 15.4375,
"learning_rate": 1.940286430941015e-06,
"loss": 2.4299,
"step": 1200
},
{
"epoch": 0.006068465643078338,
"grad_norm": 3.390625,
"learning_rate": 2.021199126142892e-06,
"loss": 2.4692,
"step": 1250
},
{
"epoch": 0.006311204268801472,
"grad_norm": 3.125,
"learning_rate": 2.1021118213447693e-06,
"loss": 2.4785,
"step": 1300
},
{
"epoch": 0.0065539428945246055,
"grad_norm": 3.40625,
"learning_rate": 2.1830245165466464e-06,
"loss": 2.4727,
"step": 1350
},
{
"epoch": 0.006796681520247739,
"grad_norm": 2.921875,
"learning_rate": 2.2639372117485235e-06,
"loss": 2.4517,
"step": 1400
},
{
"epoch": 0.007039420145970873,
"grad_norm": 4.03125,
"learning_rate": 2.3448499069504006e-06,
"loss": 2.4343,
"step": 1450
},
{
"epoch": 0.007282158771694006,
"grad_norm": 3.296875,
"learning_rate": 2.4257626021522777e-06,
"loss": 2.4255,
"step": 1500
},
{
"epoch": 0.00752489739741714,
"grad_norm": 3.3125,
"learning_rate": 2.506675297354155e-06,
"loss": 2.4238,
"step": 1550
},
{
"epoch": 0.007767636023140273,
"grad_norm": 4.34375,
"learning_rate": 2.5875879925560323e-06,
"loss": 2.4245,
"step": 1600
},
{
"epoch": 0.008010374648863406,
"grad_norm": 3.765625,
"learning_rate": 2.6685006877579094e-06,
"loss": 2.4052,
"step": 1650
},
{
"epoch": 0.00825311327458654,
"grad_norm": 2.875,
"learning_rate": 2.749413382959787e-06,
"loss": 2.4148,
"step": 1700
},
{
"epoch": 0.008495851900309674,
"grad_norm": 3.09375,
"learning_rate": 2.8303260781616636e-06,
"loss": 2.4272,
"step": 1750
},
{
"epoch": 0.008738590526032807,
"grad_norm": 3.65625,
"learning_rate": 2.911238773363541e-06,
"loss": 2.4219,
"step": 1800
},
{
"epoch": 0.00898132915175594,
"grad_norm": 3.296875,
"learning_rate": 2.992151468565418e-06,
"loss": 2.4591,
"step": 1850
},
{
"epoch": 0.009224067777479075,
"grad_norm": 3.671875,
"learning_rate": 3.0730641637672954e-06,
"loss": 2.4347,
"step": 1900
},
{
"epoch": 0.009466806403202207,
"grad_norm": 3.484375,
"learning_rate": 3.153976858969173e-06,
"loss": 2.3938,
"step": 1950
},
{
"epoch": 0.009709545028925341,
"grad_norm": 3.921875,
"learning_rate": 3.2348895541710496e-06,
"loss": 2.3816,
"step": 2000
},
{
"epoch": 0.009952283654648475,
"grad_norm": 4.5625,
"learning_rate": 3.315802249372927e-06,
"loss": 2.4554,
"step": 2050
},
{
"epoch": 0.01019502228037161,
"grad_norm": 2.46875,
"learning_rate": 3.396714944574804e-06,
"loss": 2.3965,
"step": 2100
},
{
"epoch": 0.010437760906094742,
"grad_norm": 3.078125,
"learning_rate": 3.4776276397766813e-06,
"loss": 2.3747,
"step": 2150
},
{
"epoch": 0.010680499531817876,
"grad_norm": 4.1875,
"learning_rate": 3.5585403349785584e-06,
"loss": 2.4146,
"step": 2200
},
{
"epoch": 0.01092323815754101,
"grad_norm": 3.421875,
"learning_rate": 3.6394530301804355e-06,
"loss": 2.3946,
"step": 2250
},
{
"epoch": 0.011165976783264142,
"grad_norm": 3.375,
"learning_rate": 3.7203657253823126e-06,
"loss": 2.4002,
"step": 2300
},
{
"epoch": 0.011408715408987276,
"grad_norm": 4.28125,
"learning_rate": 3.80127842058419e-06,
"loss": 2.4002,
"step": 2350
},
{
"epoch": 0.01165145403471041,
"grad_norm": 3.0,
"learning_rate": 3.882191115786067e-06,
"loss": 2.4003,
"step": 2400
},
{
"epoch": 0.011894192660433543,
"grad_norm": 3.703125,
"learning_rate": 3.963103810987945e-06,
"loss": 2.4261,
"step": 2450
},
{
"epoch": 0.012136931286156677,
"grad_norm": 3.25,
"learning_rate": 4.044016506189822e-06,
"loss": 2.342,
"step": 2500
},
{
"epoch": 0.01237966991187981,
"grad_norm": 3.25,
"learning_rate": 4.124929201391699e-06,
"loss": 2.3825,
"step": 2550
},
{
"epoch": 0.012622408537602945,
"grad_norm": 3.90625,
"learning_rate": 4.205841896593576e-06,
"loss": 2.3988,
"step": 2600
},
{
"epoch": 0.012865147163326077,
"grad_norm": 3.34375,
"learning_rate": 4.286754591795453e-06,
"loss": 2.3655,
"step": 2650
},
{
"epoch": 0.013107885789049211,
"grad_norm": 3.171875,
"learning_rate": 4.36766728699733e-06,
"loss": 2.3807,
"step": 2700
},
{
"epoch": 0.013350624414772345,
"grad_norm": 4.5,
"learning_rate": 4.448579982199207e-06,
"loss": 2.3725,
"step": 2750
},
{
"epoch": 0.013593363040495477,
"grad_norm": 2.5625,
"learning_rate": 4.5294926774010845e-06,
"loss": 2.3701,
"step": 2800
},
{
"epoch": 0.013836101666218611,
"grad_norm": 3.203125,
"learning_rate": 4.610405372602962e-06,
"loss": 2.3917,
"step": 2850
},
{
"epoch": 0.014078840291941746,
"grad_norm": 3.40625,
"learning_rate": 4.6913180678048396e-06,
"loss": 2.3837,
"step": 2900
},
{
"epoch": 0.014321578917664878,
"grad_norm": 2.578125,
"learning_rate": 4.772230763006716e-06,
"loss": 2.3561,
"step": 2950
},
{
"epoch": 0.014564317543388012,
"grad_norm": 5.5,
"learning_rate": 4.853143458208594e-06,
"loss": 2.3751,
"step": 3000
},
{
"epoch": 0.014807056169111146,
"grad_norm": 3.046875,
"learning_rate": 4.93405615341047e-06,
"loss": 2.3576,
"step": 3050
},
{
"epoch": 0.01504979479483428,
"grad_norm": 3.671875,
"learning_rate": 5.014968848612348e-06,
"loss": 2.3693,
"step": 3100
},
{
"epoch": 0.015292533420557412,
"grad_norm": 3.359375,
"learning_rate": 5.095881543814225e-06,
"loss": 2.362,
"step": 3150
},
{
"epoch": 0.015535272046280546,
"grad_norm": 3.71875,
"learning_rate": 5.176794239016102e-06,
"loss": 2.3633,
"step": 3200
},
{
"epoch": 0.01577801067200368,
"grad_norm": 3.5625,
"learning_rate": 5.2577069342179785e-06,
"loss": 2.3481,
"step": 3250
},
{
"epoch": 0.016020749297726813,
"grad_norm": 4.375,
"learning_rate": 5.338619629419856e-06,
"loss": 2.3511,
"step": 3300
},
{
"epoch": 0.01626348792344995,
"grad_norm": 4.9375,
"learning_rate": 5.4195323246217335e-06,
"loss": 2.3887,
"step": 3350
},
{
"epoch": 0.01650622654917308,
"grad_norm": 4.15625,
"learning_rate": 5.500445019823611e-06,
"loss": 2.3176,
"step": 3400
},
{
"epoch": 0.016748965174896213,
"grad_norm": 3.5,
"learning_rate": 5.5813577150254886e-06,
"loss": 2.3562,
"step": 3450
},
{
"epoch": 0.01699170380061935,
"grad_norm": 3.828125,
"learning_rate": 5.662270410227365e-06,
"loss": 2.3405,
"step": 3500
},
{
"epoch": 0.01723444242634248,
"grad_norm": 4.96875,
"learning_rate": 5.743183105429242e-06,
"loss": 2.344,
"step": 3550
},
{
"epoch": 0.017477181052065614,
"grad_norm": 2.734375,
"learning_rate": 5.82409580063112e-06,
"loss": 2.3372,
"step": 3600
},
{
"epoch": 0.01771991967778875,
"grad_norm": 2.828125,
"learning_rate": 5.905008495832997e-06,
"loss": 2.3337,
"step": 3650
},
{
"epoch": 0.01796265830351188,
"grad_norm": 4.09375,
"learning_rate": 5.985921191034873e-06,
"loss": 2.3394,
"step": 3700
},
{
"epoch": 0.018205396929235014,
"grad_norm": 4.59375,
"learning_rate": 6.066833886236751e-06,
"loss": 2.3587,
"step": 3750
},
{
"epoch": 0.01844813555495815,
"grad_norm": 2.875,
"learning_rate": 6.147746581438628e-06,
"loss": 2.3127,
"step": 3800
},
{
"epoch": 0.018690874180681282,
"grad_norm": 4.15625,
"learning_rate": 6.228659276640505e-06,
"loss": 2.3136,
"step": 3850
},
{
"epoch": 0.018933612806404414,
"grad_norm": 3.671875,
"learning_rate": 6.309571971842383e-06,
"loss": 2.3206,
"step": 3900
},
{
"epoch": 0.01917635143212755,
"grad_norm": 3.546875,
"learning_rate": 6.39048466704426e-06,
"loss": 2.3229,
"step": 3950
},
{
"epoch": 0.019419090057850683,
"grad_norm": 3.625,
"learning_rate": 6.471397362246137e-06,
"loss": 2.3229,
"step": 4000
},
{
"epoch": 0.019661828683573815,
"grad_norm": 2.796875,
"learning_rate": 6.552310057448014e-06,
"loss": 2.3161,
"step": 4050
},
{
"epoch": 0.01990456730929695,
"grad_norm": 3.953125,
"learning_rate": 6.633222752649892e-06,
"loss": 2.328,
"step": 4100
},
{
"epoch": 0.020147305935020083,
"grad_norm": 3.96875,
"learning_rate": 6.714135447851768e-06,
"loss": 2.3185,
"step": 4150
},
{
"epoch": 0.02039004456074322,
"grad_norm": 3.421875,
"learning_rate": 6.795048143053645e-06,
"loss": 2.3551,
"step": 4200
},
{
"epoch": 0.02063278318646635,
"grad_norm": 3.828125,
"learning_rate": 6.875960838255523e-06,
"loss": 2.3208,
"step": 4250
},
{
"epoch": 0.020875521812189483,
"grad_norm": 3.828125,
"learning_rate": 6.9568735334574e-06,
"loss": 2.3159,
"step": 4300
},
{
"epoch": 0.02111826043791262,
"grad_norm": 3.96875,
"learning_rate": 7.0377862286592764e-06,
"loss": 2.2877,
"step": 4350
},
{
"epoch": 0.02136099906363575,
"grad_norm": 3.53125,
"learning_rate": 7.118698923861154e-06,
"loss": 2.3062,
"step": 4400
},
{
"epoch": 0.021603737689358884,
"grad_norm": 3.578125,
"learning_rate": 7.1996116190630315e-06,
"loss": 2.3151,
"step": 4450
},
{
"epoch": 0.02184647631508202,
"grad_norm": 2.421875,
"learning_rate": 7.280524314264909e-06,
"loss": 2.3383,
"step": 4500
},
{
"epoch": 0.022089214940805152,
"grad_norm": 3.046875,
"learning_rate": 7.3614370094667866e-06,
"loss": 2.3252,
"step": 4550
},
{
"epoch": 0.022331953566528284,
"grad_norm": 3.40625,
"learning_rate": 7.442349704668664e-06,
"loss": 2.3223,
"step": 4600
},
{
"epoch": 0.02257469219225142,
"grad_norm": 3.390625,
"learning_rate": 7.52326239987054e-06,
"loss": 2.3112,
"step": 4650
},
{
"epoch": 0.022817430817974552,
"grad_norm": 4.46875,
"learning_rate": 7.604175095072418e-06,
"loss": 2.3034,
"step": 4700
},
{
"epoch": 0.023060169443697685,
"grad_norm": 3.8125,
"learning_rate": 7.685087790274294e-06,
"loss": 2.3457,
"step": 4750
},
{
"epoch": 0.02330290806942082,
"grad_norm": 4.53125,
"learning_rate": 7.766000485476171e-06,
"loss": 2.3064,
"step": 4800
},
{
"epoch": 0.023545646695143953,
"grad_norm": 5.25,
"learning_rate": 7.846913180678048e-06,
"loss": 2.3134,
"step": 4850
},
{
"epoch": 0.023788385320867085,
"grad_norm": 3.984375,
"learning_rate": 7.927825875879927e-06,
"loss": 2.2627,
"step": 4900
},
{
"epoch": 0.02403112394659022,
"grad_norm": 4.0625,
"learning_rate": 8.008738571081803e-06,
"loss": 2.331,
"step": 4950
},
{
"epoch": 0.024273862572313353,
"grad_norm": 4.78125,
"learning_rate": 8.08965126628368e-06,
"loss": 2.2673,
"step": 5000
},
{
"epoch": 0.02451660119803649,
"grad_norm": 6.09375,
"learning_rate": 8.170563961485558e-06,
"loss": 2.2814,
"step": 5050
},
{
"epoch": 0.02475933982375962,
"grad_norm": 3.671875,
"learning_rate": 8.251476656687436e-06,
"loss": 2.2875,
"step": 5100
},
{
"epoch": 0.025002078449482754,
"grad_norm": 3.375,
"learning_rate": 8.332389351889311e-06,
"loss": 2.3154,
"step": 5150
},
{
"epoch": 0.02524481707520589,
"grad_norm": 4.0,
"learning_rate": 8.41330204709119e-06,
"loss": 2.3052,
"step": 5200
},
{
"epoch": 0.02548755570092902,
"grad_norm": 4.46875,
"learning_rate": 8.494214742293067e-06,
"loss": 2.2914,
"step": 5250
},
{
"epoch": 0.025730294326652154,
"grad_norm": 3.15625,
"learning_rate": 8.575127437494944e-06,
"loss": 2.3334,
"step": 5300
},
{
"epoch": 0.02597303295237529,
"grad_norm": 3.96875,
"learning_rate": 8.656040132696821e-06,
"loss": 2.2844,
"step": 5350
},
{
"epoch": 0.026215771578098422,
"grad_norm": 4.46875,
"learning_rate": 8.736952827898698e-06,
"loss": 2.2983,
"step": 5400
},
{
"epoch": 0.026458510203821554,
"grad_norm": 3.796875,
"learning_rate": 8.817865523100575e-06,
"loss": 2.2918,
"step": 5450
},
{
"epoch": 0.02670124882954469,
"grad_norm": 4.6875,
"learning_rate": 8.898778218302452e-06,
"loss": 2.2442,
"step": 5500
},
{
"epoch": 0.026943987455267823,
"grad_norm": 3.84375,
"learning_rate": 8.97969091350433e-06,
"loss": 2.2766,
"step": 5550
},
{
"epoch": 0.027186726080990955,
"grad_norm": 4.3125,
"learning_rate": 9.060603608706207e-06,
"loss": 2.2699,
"step": 5600
},
{
"epoch": 0.02742946470671409,
"grad_norm": 3.515625,
"learning_rate": 9.141516303908084e-06,
"loss": 2.2901,
"step": 5650
},
{
"epoch": 0.027672203332437223,
"grad_norm": 3.265625,
"learning_rate": 9.22242899910996e-06,
"loss": 2.2585,
"step": 5700
},
{
"epoch": 0.027914941958160355,
"grad_norm": 4.25,
"learning_rate": 9.303341694311838e-06,
"loss": 2.2746,
"step": 5750
},
{
"epoch": 0.02815768058388349,
"grad_norm": 4.125,
"learning_rate": 9.384254389513715e-06,
"loss": 2.2992,
"step": 5800
},
{
"epoch": 0.028400419209606623,
"grad_norm": 4.9375,
"learning_rate": 9.465167084715594e-06,
"loss": 2.2443,
"step": 5850
},
{
"epoch": 0.028643157835329756,
"grad_norm": 5.5,
"learning_rate": 9.54607977991747e-06,
"loss": 2.301,
"step": 5900
},
{
"epoch": 0.02888589646105289,
"grad_norm": 3.53125,
"learning_rate": 9.626992475119346e-06,
"loss": 2.2621,
"step": 5950
},
{
"epoch": 0.029128635086776024,
"grad_norm": 4.0625,
"learning_rate": 9.707905170321225e-06,
"loss": 2.2469,
"step": 6000
},
{
"epoch": 0.02937137371249916,
"grad_norm": 3.875,
"learning_rate": 9.788817865523102e-06,
"loss": 2.2495,
"step": 6050
},
{
"epoch": 0.029614112338222292,
"grad_norm": 4.65625,
"learning_rate": 9.869730560724978e-06,
"loss": 2.2683,
"step": 6100
},
{
"epoch": 0.029856850963945424,
"grad_norm": 4.4375,
"learning_rate": 9.950643255926856e-06,
"loss": 2.2682,
"step": 6150
},
{
"epoch": 0.03009958958966856,
"grad_norm": 6.0,
"learning_rate": 1.0031555951128734e-05,
"loss": 2.2343,
"step": 6200
},
{
"epoch": 0.030342328215391692,
"grad_norm": 3.203125,
"learning_rate": 1.011246864633061e-05,
"loss": 2.2693,
"step": 6250
},
{
"epoch": 0.030585066841114825,
"grad_norm": 6.03125,
"learning_rate": 1.0193381341532486e-05,
"loss": 2.2727,
"step": 6300
},
{
"epoch": 0.03082780546683796,
"grad_norm": 3.609375,
"learning_rate": 1.0274294036734365e-05,
"loss": 2.2442,
"step": 6350
},
{
"epoch": 0.031070544092561093,
"grad_norm": 5.5625,
"learning_rate": 1.0355206731936242e-05,
"loss": 2.2315,
"step": 6400
},
{
"epoch": 0.03131328271828423,
"grad_norm": 3.671875,
"learning_rate": 1.0436119427138119e-05,
"loss": 2.2454,
"step": 6450
},
{
"epoch": 0.03155602134400736,
"grad_norm": 4.0625,
"learning_rate": 1.0517032122339996e-05,
"loss": 2.2689,
"step": 6500
},
{
"epoch": 0.03179875996973049,
"grad_norm": 4.21875,
"learning_rate": 1.0597944817541875e-05,
"loss": 2.2182,
"step": 6550
},
{
"epoch": 0.032041498595453625,
"grad_norm": 3.53125,
"learning_rate": 1.067885751274375e-05,
"loss": 2.2385,
"step": 6600
},
{
"epoch": 0.03228423722117676,
"grad_norm": 3.953125,
"learning_rate": 1.0759770207945627e-05,
"loss": 2.2407,
"step": 6650
},
{
"epoch": 0.0325269758468999,
"grad_norm": 4.3125,
"learning_rate": 1.0840682903147506e-05,
"loss": 2.2354,
"step": 6700
},
{
"epoch": 0.03276971447262303,
"grad_norm": 3.953125,
"learning_rate": 1.0921595598349382e-05,
"loss": 2.2377,
"step": 6750
},
{
"epoch": 0.03301245309834616,
"grad_norm": 4.3125,
"learning_rate": 1.1002508293551259e-05,
"loss": 2.2248,
"step": 6800
},
{
"epoch": 0.033255191724069294,
"grad_norm": 2.75,
"learning_rate": 1.1083420988753136e-05,
"loss": 2.1842,
"step": 6850
},
{
"epoch": 0.033497930349792426,
"grad_norm": 3.484375,
"learning_rate": 1.1164333683955013e-05,
"loss": 2.2184,
"step": 6900
},
{
"epoch": 0.03374066897551556,
"grad_norm": 3.53125,
"learning_rate": 1.1245246379156892e-05,
"loss": 2.2141,
"step": 6950
},
{
"epoch": 0.0339834076012387,
"grad_norm": 4.84375,
"learning_rate": 1.1326159074358767e-05,
"loss": 2.1841,
"step": 7000
},
{
"epoch": 0.03422614622696183,
"grad_norm": 5.125,
"learning_rate": 1.1407071769560644e-05,
"loss": 2.2327,
"step": 7050
},
{
"epoch": 0.03446888485268496,
"grad_norm": 4.0625,
"learning_rate": 1.1487984464762523e-05,
"loss": 2.2382,
"step": 7100
},
{
"epoch": 0.034711623478408095,
"grad_norm": 4.84375,
"learning_rate": 1.1568897159964399e-05,
"loss": 2.2254,
"step": 7150
},
{
"epoch": 0.03495436210413123,
"grad_norm": 4.625,
"learning_rate": 1.1649809855166276e-05,
"loss": 2.2133,
"step": 7200
},
{
"epoch": 0.03519710072985436,
"grad_norm": 2.921875,
"learning_rate": 1.1730722550368154e-05,
"loss": 2.216,
"step": 7250
},
{
"epoch": 0.0354398393555775,
"grad_norm": 5.1875,
"learning_rate": 1.181163524557003e-05,
"loss": 2.2227,
"step": 7300
},
{
"epoch": 0.03568257798130063,
"grad_norm": 4.875,
"learning_rate": 1.1892547940771909e-05,
"loss": 2.2058,
"step": 7350
},
{
"epoch": 0.03592531660702376,
"grad_norm": 4.59375,
"learning_rate": 1.1973460635973786e-05,
"loss": 2.207,
"step": 7400
},
{
"epoch": 0.036168055232746896,
"grad_norm": 3.75,
"learning_rate": 1.2054373331175661e-05,
"loss": 2.2047,
"step": 7450
},
{
"epoch": 0.03641079385847003,
"grad_norm": 3.828125,
"learning_rate": 1.213528602637754e-05,
"loss": 2.2141,
"step": 7500
},
{
"epoch": 0.03665353248419317,
"grad_norm": 3.9375,
"learning_rate": 1.2216198721579417e-05,
"loss": 2.2137,
"step": 7550
},
{
"epoch": 0.0368962711099163,
"grad_norm": 4.03125,
"learning_rate": 1.2297111416781292e-05,
"loss": 2.1956,
"step": 7600
},
{
"epoch": 0.03713900973563943,
"grad_norm": 3.828125,
"learning_rate": 1.2378024111983171e-05,
"loss": 2.2152,
"step": 7650
},
{
"epoch": 0.037381748361362564,
"grad_norm": 3.984375,
"learning_rate": 1.2458936807185048e-05,
"loss": 2.2039,
"step": 7700
},
{
"epoch": 0.0376244869870857,
"grad_norm": 2.84375,
"learning_rate": 1.2539849502386925e-05,
"loss": 2.2054,
"step": 7750
},
{
"epoch": 0.03786722561280883,
"grad_norm": 3.484375,
"learning_rate": 1.2620762197588803e-05,
"loss": 2.1559,
"step": 7800
},
{
"epoch": 0.03810996423853197,
"grad_norm": 4.75,
"learning_rate": 1.2701674892790681e-05,
"loss": 2.1573,
"step": 7850
},
{
"epoch": 0.0383527028642551,
"grad_norm": 2.5,
"learning_rate": 1.2782587587992557e-05,
"loss": 2.1676,
"step": 7900
},
{
"epoch": 0.03859544148997823,
"grad_norm": 4.1875,
"learning_rate": 1.2863500283194434e-05,
"loss": 2.2064,
"step": 7950
},
{
"epoch": 0.038838180115701365,
"grad_norm": 5.3125,
"learning_rate": 1.2944412978396313e-05,
"loss": 2.1678,
"step": 8000
},
{
"epoch": 0.0390809187414245,
"grad_norm": 5.09375,
"learning_rate": 1.3025325673598188e-05,
"loss": 2.2002,
"step": 8050
},
{
"epoch": 0.03932365736714763,
"grad_norm": 3.25,
"learning_rate": 1.3106238368800065e-05,
"loss": 2.1444,
"step": 8100
},
{
"epoch": 0.03956639599287077,
"grad_norm": 3.65625,
"learning_rate": 1.3187151064001944e-05,
"loss": 2.2143,
"step": 8150
},
{
"epoch": 0.0398091346185939,
"grad_norm": 3.5625,
"learning_rate": 1.326806375920382e-05,
"loss": 2.1851,
"step": 8200
},
{
"epoch": 0.040051873244317034,
"grad_norm": 3.921875,
"learning_rate": 1.3348976454405698e-05,
"loss": 2.1899,
"step": 8250
},
{
"epoch": 0.040294611870040166,
"grad_norm": 4.21875,
"learning_rate": 1.3429889149607575e-05,
"loss": 2.1567,
"step": 8300
},
{
"epoch": 0.0405373504957633,
"grad_norm": 2.875,
"learning_rate": 1.351080184480945e-05,
"loss": 2.1853,
"step": 8350
},
{
"epoch": 0.04078008912148644,
"grad_norm": 4.5,
"learning_rate": 1.359171454001133e-05,
"loss": 2.1475,
"step": 8400
},
{
"epoch": 0.04102282774720957,
"grad_norm": 3.734375,
"learning_rate": 1.3672627235213205e-05,
"loss": 2.1564,
"step": 8450
},
{
"epoch": 0.0412655663729327,
"grad_norm": 4.03125,
"learning_rate": 1.3753539930415084e-05,
"loss": 2.1772,
"step": 8500
},
{
"epoch": 0.041508304998655834,
"grad_norm": 3.953125,
"learning_rate": 1.3834452625616961e-05,
"loss": 2.2006,
"step": 8550
},
{
"epoch": 0.04175104362437897,
"grad_norm": 3.890625,
"learning_rate": 1.3915365320818836e-05,
"loss": 2.1684,
"step": 8600
},
{
"epoch": 0.0419937822501021,
"grad_norm": 3.0625,
"learning_rate": 1.3996278016020715e-05,
"loss": 2.1765,
"step": 8650
},
{
"epoch": 0.04223652087582524,
"grad_norm": 3.578125,
"learning_rate": 1.4077190711222592e-05,
"loss": 2.1336,
"step": 8700
},
{
"epoch": 0.04247925950154837,
"grad_norm": 3.828125,
"learning_rate": 1.4158103406424468e-05,
"loss": 2.1553,
"step": 8750
},
{
"epoch": 0.0427219981272715,
"grad_norm": 4.21875,
"learning_rate": 1.4239016101626346e-05,
"loss": 2.1563,
"step": 8800
},
{
"epoch": 0.042964736752994635,
"grad_norm": 3.28125,
"learning_rate": 1.4319928796828223e-05,
"loss": 2.0883,
"step": 8850
},
{
"epoch": 0.04320747537871777,
"grad_norm": 4.1875,
"learning_rate": 1.44008414920301e-05,
"loss": 2.1586,
"step": 8900
},
{
"epoch": 0.0434502140044409,
"grad_norm": 3.640625,
"learning_rate": 1.4481754187231978e-05,
"loss": 2.1743,
"step": 8950
},
{
"epoch": 0.04369295263016404,
"grad_norm": 4.5,
"learning_rate": 1.4562666882433856e-05,
"loss": 2.142,
"step": 9000
},
{
"epoch": 0.04393569125588717,
"grad_norm": 3.8125,
"learning_rate": 1.4643579577635732e-05,
"loss": 2.1355,
"step": 9050
},
{
"epoch": 0.044178429881610304,
"grad_norm": 3.515625,
"learning_rate": 1.4724492272837609e-05,
"loss": 2.1328,
"step": 9100
},
{
"epoch": 0.044421168507333436,
"grad_norm": 3.125,
"learning_rate": 1.4805404968039488e-05,
"loss": 2.1121,
"step": 9150
},
{
"epoch": 0.04466390713305657,
"grad_norm": 4.3125,
"learning_rate": 1.4886317663241363e-05,
"loss": 2.185,
"step": 9200
},
{
"epoch": 0.04490664575877971,
"grad_norm": 3.421875,
"learning_rate": 1.496723035844324e-05,
"loss": 2.1548,
"step": 9250
},
{
"epoch": 0.04514938438450284,
"grad_norm": 3.765625,
"learning_rate": 1.5048143053645119e-05,
"loss": 2.1517,
"step": 9300
},
{
"epoch": 0.04539212301022597,
"grad_norm": 3.515625,
"learning_rate": 1.5129055748846995e-05,
"loss": 2.1235,
"step": 9350
},
{
"epoch": 0.045634861635949105,
"grad_norm": 5.59375,
"learning_rate": 1.5209968444048873e-05,
"loss": 2.1636,
"step": 9400
},
{
"epoch": 0.04587760026167224,
"grad_norm": 2.78125,
"learning_rate": 1.529088113925075e-05,
"loss": 2.1182,
"step": 9450
},
{
"epoch": 0.04612033888739537,
"grad_norm": 3.234375,
"learning_rate": 1.5371793834452626e-05,
"loss": 2.1404,
"step": 9500
},
{
"epoch": 0.04636307751311851,
"grad_norm": 2.984375,
"learning_rate": 1.5452706529654505e-05,
"loss": 2.1383,
"step": 9550
},
{
"epoch": 0.04660581613884164,
"grad_norm": 3.125,
"learning_rate": 1.5533619224856383e-05,
"loss": 2.1135,
"step": 9600
},
{
"epoch": 0.04684855476456477,
"grad_norm": 3.21875,
"learning_rate": 1.561453192005826e-05,
"loss": 2.1648,
"step": 9650
},
{
"epoch": 0.047091293390287906,
"grad_norm": 2.71875,
"learning_rate": 1.5695444615260134e-05,
"loss": 2.1479,
"step": 9700
},
{
"epoch": 0.04733403201601104,
"grad_norm": 4.90625,
"learning_rate": 1.5776357310462013e-05,
"loss": 2.1343,
"step": 9750
},
{
"epoch": 0.04757677064173417,
"grad_norm": 3.875,
"learning_rate": 1.585727000566389e-05,
"loss": 2.1563,
"step": 9800
},
{
"epoch": 0.04781950926745731,
"grad_norm": 2.90625,
"learning_rate": 1.5938182700865767e-05,
"loss": 2.1448,
"step": 9850
},
{
"epoch": 0.04806224789318044,
"grad_norm": 3.578125,
"learning_rate": 1.6019095396067646e-05,
"loss": 2.1041,
"step": 9900
},
{
"epoch": 0.048304986518903574,
"grad_norm": 3.546875,
"learning_rate": 1.610000809126952e-05,
"loss": 2.1272,
"step": 9950
},
{
"epoch": 0.048547725144626706,
"grad_norm": 2.796875,
"learning_rate": 1.61809207864714e-05,
"loss": 2.1146,
"step": 10000
},
{
"epoch": 0.04879046377034984,
"grad_norm": 3.65625,
"learning_rate": 1.6261833481673276e-05,
"loss": 2.1449,
"step": 10050
},
{
"epoch": 0.04903320239607298,
"grad_norm": 3.734375,
"learning_rate": 1.634274617687515e-05,
"loss": 2.1108,
"step": 10100
},
{
"epoch": 0.04927594102179611,
"grad_norm": 4.0,
"learning_rate": 1.642365887207703e-05,
"loss": 2.0933,
"step": 10150
},
{
"epoch": 0.04951867964751924,
"grad_norm": 3.421875,
"learning_rate": 1.6504571567278905e-05,
"loss": 2.0981,
"step": 10200
},
{
"epoch": 0.049761418273242375,
"grad_norm": 4.40625,
"learning_rate": 1.6585484262480784e-05,
"loss": 2.1161,
"step": 10250
},
{
"epoch": 0.05000415689896551,
"grad_norm": 4.8125,
"learning_rate": 1.6666396957682663e-05,
"loss": 2.1318,
"step": 10300
},
{
"epoch": 0.05024689552468864,
"grad_norm": 3.375,
"learning_rate": 1.674730965288454e-05,
"loss": 2.1162,
"step": 10350
},
{
"epoch": 0.05048963415041178,
"grad_norm": 3.953125,
"learning_rate": 1.6828222348086417e-05,
"loss": 2.1361,
"step": 10400
},
{
"epoch": 0.05073237277613491,
"grad_norm": 3.40625,
"learning_rate": 1.6909135043288293e-05,
"loss": 2.1388,
"step": 10450
},
{
"epoch": 0.05097511140185804,
"grad_norm": 4.03125,
"learning_rate": 1.699004773849017e-05,
"loss": 2.1265,
"step": 10500
},
{
"epoch": 0.051217850027581176,
"grad_norm": 3.546875,
"learning_rate": 1.7070960433692047e-05,
"loss": 2.0881,
"step": 10550
},
{
"epoch": 0.05146058865330431,
"grad_norm": 2.96875,
"learning_rate": 1.7151873128893926e-05,
"loss": 2.0963,
"step": 10600
},
{
"epoch": 0.05170332727902744,
"grad_norm": 3.640625,
"learning_rate": 1.72327858240958e-05,
"loss": 2.1338,
"step": 10650
},
{
"epoch": 0.05194606590475058,
"grad_norm": 3.734375,
"learning_rate": 1.731369851929768e-05,
"loss": 2.1344,
"step": 10700
},
{
"epoch": 0.05218880453047371,
"grad_norm": 3.1875,
"learning_rate": 1.739461121449956e-05,
"loss": 2.1065,
"step": 10750
},
{
"epoch": 0.052431543156196844,
"grad_norm": 3.9375,
"learning_rate": 1.7475523909701434e-05,
"loss": 2.1218,
"step": 10800
},
{
"epoch": 0.05267428178191998,
"grad_norm": 4.03125,
"learning_rate": 1.755643660490331e-05,
"loss": 2.1298,
"step": 10850
},
{
"epoch": 0.05291702040764311,
"grad_norm": 3.609375,
"learning_rate": 1.7637349300105188e-05,
"loss": 2.1081,
"step": 10900
},
{
"epoch": 0.05315975903336624,
"grad_norm": 3.703125,
"learning_rate": 1.7718261995307064e-05,
"loss": 2.12,
"step": 10950
},
{
"epoch": 0.05340249765908938,
"grad_norm": 2.5625,
"learning_rate": 1.7799174690508942e-05,
"loss": 2.1092,
"step": 11000
},
{
"epoch": 0.05364523628481251,
"grad_norm": 3.71875,
"learning_rate": 1.788008738571082e-05,
"loss": 2.1315,
"step": 11050
},
{
"epoch": 0.053887974910535645,
"grad_norm": 3.6875,
"learning_rate": 1.7961000080912697e-05,
"loss": 2.0988,
"step": 11100
},
{
"epoch": 0.05413071353625878,
"grad_norm": 2.625,
"learning_rate": 1.8041912776114575e-05,
"loss": 2.0957,
"step": 11150
},
{
"epoch": 0.05437345216198191,
"grad_norm": 3.96875,
"learning_rate": 1.812282547131645e-05,
"loss": 2.1168,
"step": 11200
},
{
"epoch": 0.05461619078770505,
"grad_norm": 3.5,
"learning_rate": 1.8203738166518326e-05,
"loss": 2.0957,
"step": 11250
},
{
"epoch": 0.05485892941342818,
"grad_norm": 2.578125,
"learning_rate": 1.8284650861720205e-05,
"loss": 2.0774,
"step": 11300
},
{
"epoch": 0.055101668039151314,
"grad_norm": 2.328125,
"learning_rate": 1.8365563556922084e-05,
"loss": 2.0846,
"step": 11350
},
{
"epoch": 0.055344406664874446,
"grad_norm": 5.125,
"learning_rate": 1.844647625212396e-05,
"loss": 2.0767,
"step": 11400
},
{
"epoch": 0.05558714529059758,
"grad_norm": 3.453125,
"learning_rate": 1.8527388947325838e-05,
"loss": 2.077,
"step": 11450
},
{
"epoch": 0.05582988391632071,
"grad_norm": 3.890625,
"learning_rate": 1.8608301642527717e-05,
"loss": 2.0567,
"step": 11500
},
{
"epoch": 0.05607262254204385,
"grad_norm": 3.890625,
"learning_rate": 1.8689214337729592e-05,
"loss": 2.0764,
"step": 11550
},
{
"epoch": 0.05631536116776698,
"grad_norm": 2.84375,
"learning_rate": 1.8770127032931468e-05,
"loss": 2.091,
"step": 11600
},
{
"epoch": 0.056558099793490114,
"grad_norm": 3.265625,
"learning_rate": 1.8851039728133343e-05,
"loss": 2.0924,
"step": 11650
},
{
"epoch": 0.05680083841921325,
"grad_norm": 3.890625,
"learning_rate": 1.8931952423335222e-05,
"loss": 2.0802,
"step": 11700
},
{
"epoch": 0.05704357704493638,
"grad_norm": 4.03125,
"learning_rate": 1.90128651185371e-05,
"loss": 2.0968,
"step": 11750
},
{
"epoch": 0.05728631567065951,
"grad_norm": 5.03125,
"learning_rate": 1.9093777813738976e-05,
"loss": 2.0843,
"step": 11800
},
{
"epoch": 0.05752905429638265,
"grad_norm": 3.59375,
"learning_rate": 1.9174690508940855e-05,
"loss": 2.0563,
"step": 11850
},
{
"epoch": 0.05777179292210578,
"grad_norm": 2.953125,
"learning_rate": 1.9255603204142734e-05,
"loss": 2.0981,
"step": 11900
},
{
"epoch": 0.058014531547828915,
"grad_norm": 3.171875,
"learning_rate": 1.933651589934461e-05,
"loss": 2.0896,
"step": 11950
},
{
"epoch": 0.05825727017355205,
"grad_norm": 3.296875,
"learning_rate": 1.9417428594546484e-05,
"loss": 2.0678,
"step": 12000
},
{
"epoch": 0.05850000879927518,
"grad_norm": 3.46875,
"learning_rate": 1.9498341289748363e-05,
"loss": 2.1155,
"step": 12050
},
{
"epoch": 0.05874274742499832,
"grad_norm": 4.40625,
"learning_rate": 1.957925398495024e-05,
"loss": 2.0639,
"step": 12100
},
{
"epoch": 0.05898548605072145,
"grad_norm": 4.09375,
"learning_rate": 1.9660166680152117e-05,
"loss": 2.0794,
"step": 12150
},
{
"epoch": 0.059228224676444584,
"grad_norm": 3.359375,
"learning_rate": 1.9741079375353996e-05,
"loss": 2.0526,
"step": 12200
},
{
"epoch": 0.059470963302167716,
"grad_norm": 2.90625,
"learning_rate": 1.982199207055587e-05,
"loss": 2.0403,
"step": 12250
},
{
"epoch": 0.05971370192789085,
"grad_norm": 3.65625,
"learning_rate": 1.990290476575775e-05,
"loss": 2.0693,
"step": 12300
},
{
"epoch": 0.05995644055361398,
"grad_norm": 4.3125,
"learning_rate": 1.9983817460959626e-05,
"loss": 2.0828,
"step": 12350
},
{
"epoch": 0.06019917917933712,
"grad_norm": 3.78125,
"learning_rate": 1.9999999505548666e-05,
"loss": 2.0928,
"step": 12400
},
{
"epoch": 0.06044191780506025,
"grad_norm": 4.59375,
"learning_rate": 1.99999974968402e-05,
"loss": 2.0529,
"step": 12450
},
{
"epoch": 0.060684656430783385,
"grad_norm": 4.0,
"learning_rate": 1.9999993942971703e-05,
"loss": 2.0478,
"step": 12500
},
{
"epoch": 0.06092739505650652,
"grad_norm": 3.359375,
"learning_rate": 1.9999988843943725e-05,
"loss": 2.0252,
"step": 12550
},
{
"epoch": 0.06117013368222965,
"grad_norm": 3.5,
"learning_rate": 1.9999982199757056e-05,
"loss": 2.1012,
"step": 12600
},
{
"epoch": 0.06141287230795278,
"grad_norm": 3.21875,
"learning_rate": 1.999997401041272e-05,
"loss": 2.0468,
"step": 12650
},
{
"epoch": 0.06165561093367592,
"grad_norm": 2.5,
"learning_rate": 1.9999964275911983e-05,
"loss": 2.0217,
"step": 12700
},
{
"epoch": 0.06189834955939905,
"grad_norm": 4.28125,
"learning_rate": 1.9999952996256348e-05,
"loss": 2.0882,
"step": 12750
},
{
"epoch": 0.062141088185122186,
"grad_norm": 3.3125,
"learning_rate": 1.9999940171447564e-05,
"loss": 2.0498,
"step": 12800
},
{
"epoch": 0.06238382681084532,
"grad_norm": 3.609375,
"learning_rate": 1.9999925801487606e-05,
"loss": 2.0406,
"step": 12850
},
{
"epoch": 0.06262656543656846,
"grad_norm": 3.625,
"learning_rate": 1.9999909886378694e-05,
"loss": 2.1089,
"step": 12900
},
{
"epoch": 0.06286930406229159,
"grad_norm": 3.84375,
"learning_rate": 1.999989242612329e-05,
"loss": 2.0344,
"step": 12950
},
{
"epoch": 0.06311204268801472,
"grad_norm": 2.484375,
"learning_rate": 1.999987342072409e-05,
"loss": 2.0811,
"step": 13000
},
{
"epoch": 0.06335478131373785,
"grad_norm": 4.46875,
"learning_rate": 1.999985287018404e-05,
"loss": 2.0513,
"step": 13050
},
{
"epoch": 0.06359751993946099,
"grad_norm": 4.0,
"learning_rate": 1.99998307745063e-05,
"loss": 2.0702,
"step": 13100
},
{
"epoch": 0.06384025856518412,
"grad_norm": 4.34375,
"learning_rate": 1.9999807133694295e-05,
"loss": 2.0912,
"step": 13150
},
{
"epoch": 0.06408299719090725,
"grad_norm": 3.484375,
"learning_rate": 1.9999781947751672e-05,
"loss": 2.066,
"step": 13200
},
{
"epoch": 0.06432573581663038,
"grad_norm": 3.953125,
"learning_rate": 1.9999755216682325e-05,
"loss": 2.0005,
"step": 13250
},
{
"epoch": 0.06456847444235352,
"grad_norm": 2.921875,
"learning_rate": 1.9999726940490384e-05,
"loss": 2.0364,
"step": 13300
},
{
"epoch": 0.06481121306807665,
"grad_norm": 3.359375,
"learning_rate": 1.9999697119180222e-05,
"loss": 2.0808,
"step": 13350
},
{
"epoch": 0.0650539516937998,
"grad_norm": 3.21875,
"learning_rate": 1.9999665752756444e-05,
"loss": 2.0613,
"step": 13400
},
{
"epoch": 0.06529669031952293,
"grad_norm": 3.140625,
"learning_rate": 1.9999632841223892e-05,
"loss": 2.0492,
"step": 13450
},
{
"epoch": 0.06553942894524606,
"grad_norm": 3.203125,
"learning_rate": 1.9999598384587658e-05,
"loss": 2.0875,
"step": 13500
},
{
"epoch": 0.06578216757096919,
"grad_norm": 4.25,
"learning_rate": 1.999956238285306e-05,
"loss": 2.0319,
"step": 13550
},
{
"epoch": 0.06602490619669232,
"grad_norm": 4.15625,
"learning_rate": 1.999952483602567e-05,
"loss": 2.0415,
"step": 13600
},
{
"epoch": 0.06626764482241546,
"grad_norm": 3.25,
"learning_rate": 1.999948574411128e-05,
"loss": 2.0247,
"step": 13650
},
{
"epoch": 0.06651038344813859,
"grad_norm": 3.71875,
"learning_rate": 1.9999445107115936e-05,
"loss": 2.0294,
"step": 13700
},
{
"epoch": 0.06675312207386172,
"grad_norm": 4.21875,
"learning_rate": 1.9999402925045914e-05,
"loss": 2.056,
"step": 13750
},
{
"epoch": 0.06699586069958485,
"grad_norm": 3.0,
"learning_rate": 1.9999359197907732e-05,
"loss": 2.0527,
"step": 13800
},
{
"epoch": 0.06723859932530798,
"grad_norm": 3.34375,
"learning_rate": 1.999931392570815e-05,
"loss": 2.0727,
"step": 13850
},
{
"epoch": 0.06748133795103112,
"grad_norm": 3.4375,
"learning_rate": 1.9999267108454163e-05,
"loss": 2.0542,
"step": 13900
},
{
"epoch": 0.06772407657675426,
"grad_norm": 3.109375,
"learning_rate": 1.9999218746153e-05,
"loss": 1.979,
"step": 13950
},
{
"epoch": 0.0679668152024774,
"grad_norm": 3.765625,
"learning_rate": 1.999916883881214e-05,
"loss": 2.052,
"step": 14000
}
],
"logging_steps": 50,
"max_steps": 411966,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.52714505134465e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}