zephyr-7b-sft-iter1 / trainer_state.json
billxbf's picture
Model save
19ae937 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 638,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 0.725390625,
"epoch": 0.015698587127158554,
"grad_norm": 27.6445930424515,
"learning_rate": 1.25e-06,
"loss": 1.2418,
"mean_token_accuracy": 0.7577933847904206,
"num_tokens": 177927.0,
"step": 5
},
{
"entropy": 1.057421875,
"epoch": 0.03139717425431711,
"grad_norm": 7.557791169901141,
"learning_rate": 2.8125e-06,
"loss": 1.0793,
"mean_token_accuracy": 0.7529823184013367,
"num_tokens": 365825.0,
"step": 10
},
{
"entropy": 0.96953125,
"epoch": 0.04709576138147567,
"grad_norm": 7.799882301858085,
"learning_rate": 4.3750000000000005e-06,
"loss": 0.9441,
"mean_token_accuracy": 0.76476891040802,
"num_tokens": 550560.0,
"step": 15
},
{
"entropy": 0.983203125,
"epoch": 0.06279434850863422,
"grad_norm": 6.069209273352368,
"learning_rate": 5.9375e-06,
"loss": 0.9834,
"mean_token_accuracy": 0.7617899596691131,
"num_tokens": 731067.0,
"step": 20
},
{
"entropy": 0.969921875,
"epoch": 0.07849293563579278,
"grad_norm": 6.613994091501819,
"learning_rate": 7.500000000000001e-06,
"loss": 0.9232,
"mean_token_accuracy": 0.7665803909301758,
"num_tokens": 899849.0,
"step": 25
},
{
"entropy": 0.88515625,
"epoch": 0.09419152276295134,
"grad_norm": 5.9986199691905515,
"learning_rate": 9.0625e-06,
"loss": 0.8711,
"mean_token_accuracy": 0.7756525576114655,
"num_tokens": 1093174.0,
"step": 30
},
{
"entropy": 0.96953125,
"epoch": 0.10989010989010989,
"grad_norm": 6.593322862351196,
"learning_rate": 1.0625e-05,
"loss": 0.9321,
"mean_token_accuracy": 0.7604309499263764,
"num_tokens": 1275951.0,
"step": 35
},
{
"entropy": 0.903515625,
"epoch": 0.12558869701726844,
"grad_norm": 5.878816195096992,
"learning_rate": 1.2187500000000001e-05,
"loss": 0.9051,
"mean_token_accuracy": 0.7668895900249482,
"num_tokens": 1453554.0,
"step": 40
},
{
"entropy": 0.882421875,
"epoch": 0.141287284144427,
"grad_norm": 6.045022529791159,
"learning_rate": 1.375e-05,
"loss": 0.8897,
"mean_token_accuracy": 0.7693747282028198,
"num_tokens": 1622977.0,
"step": 45
},
{
"entropy": 0.96015625,
"epoch": 0.15698587127158556,
"grad_norm": 5.829657500936277,
"learning_rate": 1.5312500000000003e-05,
"loss": 0.9579,
"mean_token_accuracy": 0.7548311114311218,
"num_tokens": 1798487.0,
"step": 50
},
{
"entropy": 1.0171875,
"epoch": 0.1726844583987441,
"grad_norm": 5.908848606028448,
"learning_rate": 1.6875e-05,
"loss": 1.0203,
"mean_token_accuracy": 0.7450043380260467,
"num_tokens": 1983138.0,
"step": 55
},
{
"entropy": 0.962890625,
"epoch": 0.18838304552590268,
"grad_norm": 10.060274999825255,
"learning_rate": 1.84375e-05,
"loss": 0.97,
"mean_token_accuracy": 0.7483593642711639,
"num_tokens": 2170951.0,
"step": 60
},
{
"entropy": 1.046875,
"epoch": 0.20408163265306123,
"grad_norm": 89.6713302535496,
"learning_rate": 2e-05,
"loss": 1.034,
"mean_token_accuracy": 0.7365632832050324,
"num_tokens": 2341116.0,
"step": 65
},
{
"entropy": 1.0328125,
"epoch": 0.21978021978021978,
"grad_norm": 6.1550408648286625,
"learning_rate": 1.999625580145365e-05,
"loss": 1.0563,
"mean_token_accuracy": 0.7371429681777955,
"num_tokens": 2520854.0,
"step": 70
},
{
"entropy": 1.06796875,
"epoch": 0.23547880690737832,
"grad_norm": 11.503641001521224,
"learning_rate": 1.998502600961916e-05,
"loss": 1.0691,
"mean_token_accuracy": 0.7334820926189423,
"num_tokens": 2699173.0,
"step": 75
},
{
"entropy": 1.047265625,
"epoch": 0.25117739403453687,
"grad_norm": 13.475655867100802,
"learning_rate": 1.9966319033810575e-05,
"loss": 1.056,
"mean_token_accuracy": 0.7318085134029388,
"num_tokens": 2874346.0,
"step": 80
},
{
"entropy": 1.021875,
"epoch": 0.2668759811616955,
"grad_norm": 14.212065936594165,
"learning_rate": 1.9940148882554223e-05,
"loss": 1.0187,
"mean_token_accuracy": 0.7380395472049713,
"num_tokens": 3066471.0,
"step": 85
},
{
"entropy": 0.9859375,
"epoch": 0.282574568288854,
"grad_norm": 9.014318953008775,
"learning_rate": 1.9906535153098558e-05,
"loss": 1.0016,
"mean_token_accuracy": 0.7432572603225708,
"num_tokens": 3247301.0,
"step": 90
},
{
"entropy": 1.029296875,
"epoch": 0.29827315541601257,
"grad_norm": 5.81288792016315,
"learning_rate": 1.9865503016738983e-05,
"loss": 1.0401,
"mean_token_accuracy": 0.7338248550891876,
"num_tokens": 3442754.0,
"step": 95
},
{
"entropy": 0.941015625,
"epoch": 0.3139717425431711,
"grad_norm": 6.044046817780057,
"learning_rate": 1.9817083199968552e-05,
"loss": 0.9458,
"mean_token_accuracy": 0.7531112670898438,
"num_tokens": 3631756.0,
"step": 100
},
{
"entropy": 1.04609375,
"epoch": 0.32967032967032966,
"grad_norm": 6.0210489321664635,
"learning_rate": 1.9761311961468782e-05,
"loss": 1.0348,
"mean_token_accuracy": 0.7363064765930176,
"num_tokens": 3804931.0,
"step": 105
},
{
"entropy": 1.10390625,
"epoch": 0.3453689167974882,
"grad_norm": 5.33683972080787,
"learning_rate": 1.9698231064957695e-05,
"loss": 1.0934,
"mean_token_accuracy": 0.7270130455493927,
"num_tokens": 3985420.0,
"step": 110
},
{
"entropy": 1.03046875,
"epoch": 0.36106750392464676,
"grad_norm": 10.704522004356988,
"learning_rate": 1.9627887747915496e-05,
"loss": 1.0708,
"mean_token_accuracy": 0.7329379975795746,
"num_tokens": 4169490.0,
"step": 115
},
{
"entropy": 1.00390625,
"epoch": 0.37676609105180536,
"grad_norm": 7.036082714775898,
"learning_rate": 1.955033468621126e-05,
"loss": 1.0103,
"mean_token_accuracy": 0.7396618247032165,
"num_tokens": 4347650.0,
"step": 120
},
{
"entropy": 0.99921875,
"epoch": 0.3924646781789639,
"grad_norm": 6.818913605192643,
"learning_rate": 1.9465629954657185e-05,
"loss": 1.0001,
"mean_token_accuracy": 0.7393137633800506,
"num_tokens": 4537162.0,
"step": 125
},
{
"entropy": 1.0328125,
"epoch": 0.40816326530612246,
"grad_norm": 5.755556717131576,
"learning_rate": 1.9373836983519807e-05,
"loss": 1.0505,
"mean_token_accuracy": 0.7324462294578552,
"num_tokens": 4712286.0,
"step": 130
},
{
"entropy": 1.044140625,
"epoch": 0.423861852433281,
"grad_norm": 9.446755565244738,
"learning_rate": 1.927502451102095e-05,
"loss": 1.0619,
"mean_token_accuracy": 0.7303735911846161,
"num_tokens": 4890250.0,
"step": 135
},
{
"entropy": 1.11796875,
"epoch": 0.43956043956043955,
"grad_norm": 10.157882077053817,
"learning_rate": 1.916926653186379e-05,
"loss": 1.0962,
"mean_token_accuracy": 0.7196236491203308,
"num_tokens": 5065862.0,
"step": 140
},
{
"entropy": 0.9890625,
"epoch": 0.4552590266875981,
"grad_norm": 4.921009913523743,
"learning_rate": 1.905664224182269e-05,
"loss": 0.9884,
"mean_token_accuracy": 0.7450262784957886,
"num_tokens": 5240079.0,
"step": 145
},
{
"entropy": 1.028125,
"epoch": 0.47095761381475665,
"grad_norm": 6.1485435101760295,
"learning_rate": 1.8937235978438272e-05,
"loss": 1.05,
"mean_token_accuracy": 0.7296487390995026,
"num_tokens": 5415445.0,
"step": 150
},
{
"entropy": 0.9984375,
"epoch": 0.48665620094191525,
"grad_norm": 4.858301358373238,
"learning_rate": 1.8811137157862084e-05,
"loss": 0.9776,
"mean_token_accuracy": 0.7445457696914672,
"num_tokens": 5592814.0,
"step": 155
},
{
"entropy": 1.03203125,
"epoch": 0.5023547880690737,
"grad_norm": 5.7540662569760395,
"learning_rate": 1.8678440207898264e-05,
"loss": 1.0429,
"mean_token_accuracy": 0.7321981191635132,
"num_tokens": 5780889.0,
"step": 160
},
{
"entropy": 1.049609375,
"epoch": 0.5180533751962323,
"grad_norm": 5.198684112846989,
"learning_rate": 1.8539244497292248e-05,
"loss": 1.04,
"mean_token_accuracy": 0.7295248687267304,
"num_tokens": 5965218.0,
"step": 165
},
{
"entropy": 0.9890625,
"epoch": 0.533751962323391,
"grad_norm": 5.232818269013678,
"learning_rate": 1.8393654261319504e-05,
"loss": 0.9886,
"mean_token_accuracy": 0.7405453681945801,
"num_tokens": 6136493.0,
"step": 170
},
{
"entropy": 1.086328125,
"epoch": 0.5494505494505495,
"grad_norm": 5.545204163745446,
"learning_rate": 1.8241778523729997e-05,
"loss": 1.0636,
"mean_token_accuracy": 0.7313859045505524,
"num_tokens": 6325624.0,
"step": 175
},
{
"entropy": 1.057421875,
"epoch": 0.565149136577708,
"grad_norm": 6.499024191424293,
"learning_rate": 1.8083731015106916e-05,
"loss": 1.0735,
"mean_token_accuracy": 0.7249915122985839,
"num_tokens": 6515331.0,
"step": 180
},
{
"entropy": 1.0359375,
"epoch": 0.5808477237048666,
"grad_norm": 5.040936506275316,
"learning_rate": 1.7919630087700672e-05,
"loss": 1.0775,
"mean_token_accuracy": 0.7272228896617889,
"num_tokens": 6696076.0,
"step": 185
},
{
"entropy": 1.06953125,
"epoch": 0.5965463108320251,
"grad_norm": 4.848467550570556,
"learning_rate": 1.7749598626802028e-05,
"loss": 1.0442,
"mean_token_accuracy": 0.7348412156105042,
"num_tokens": 6878935.0,
"step": 190
},
{
"entropy": 1.025,
"epoch": 0.6122448979591837,
"grad_norm": 5.087731121093467,
"learning_rate": 1.7573763958720736e-05,
"loss": 1.0188,
"mean_token_accuracy": 0.7360713183879852,
"num_tokens": 7061314.0,
"step": 195
},
{
"entropy": 1.015625,
"epoch": 0.6279434850863422,
"grad_norm": 9.74435890233205,
"learning_rate": 1.7392257755438516e-05,
"loss": 1.0072,
"mean_token_accuracy": 0.738730925321579,
"num_tokens": 7242353.0,
"step": 200
},
{
"entropy": 0.999609375,
"epoch": 0.6436420722135008,
"grad_norm": 15.98222584676736,
"learning_rate": 1.720521593600787e-05,
"loss": 0.9944,
"mean_token_accuracy": 0.7411578953266144,
"num_tokens": 7425295.0,
"step": 205
},
{
"entropy": 1.06953125,
"epoch": 0.6593406593406593,
"grad_norm": 5.835834203383889,
"learning_rate": 1.7012778564770484e-05,
"loss": 1.0383,
"mean_token_accuracy": 0.7306137382984161,
"num_tokens": 7584824.0,
"step": 210
},
{
"entropy": 0.99375,
"epoch": 0.6750392464678179,
"grad_norm": 4.8622795054153265,
"learning_rate": 1.6815089746471472e-05,
"loss": 1.0118,
"mean_token_accuracy": 0.742330664396286,
"num_tokens": 7768228.0,
"step": 215
},
{
"entropy": 1.03203125,
"epoch": 0.6907378335949764,
"grad_norm": 4.668555704536192,
"learning_rate": 1.6612297518348072e-05,
"loss": 1.0245,
"mean_token_accuracy": 0.7361457228660584,
"num_tokens": 7949122.0,
"step": 220
},
{
"entropy": 1.09453125,
"epoch": 0.706436420722135,
"grad_norm": 4.6567337129267266,
"learning_rate": 1.6404553739273426e-05,
"loss": 1.1086,
"mean_token_accuracy": 0.7151965975761414,
"num_tokens": 8123662.0,
"step": 225
},
{
"entropy": 1.0109375,
"epoch": 0.7221350078492935,
"grad_norm": 5.00422795457061,
"learning_rate": 1.6192013976038663e-05,
"loss": 1.0294,
"mean_token_accuracy": 0.7341724216938019,
"num_tokens": 8312360.0,
"step": 230
},
{
"entropy": 1.028125,
"epoch": 0.7378335949764521,
"grad_norm": 5.388442787487355,
"learning_rate": 1.597483738685829e-05,
"loss": 1.0036,
"mean_token_accuracy": 0.7365865647792816,
"num_tokens": 8492020.0,
"step": 235
},
{
"entropy": 1.063671875,
"epoch": 0.7535321821036107,
"grad_norm": 6.334463361903077,
"learning_rate": 1.5753186602186207e-05,
"loss": 1.0574,
"mean_token_accuracy": 0.7286219894886017,
"num_tokens": 8665809.0,
"step": 240
},
{
"entropy": 1.017578125,
"epoch": 0.7692307692307693,
"grad_norm": 5.970482979437965,
"learning_rate": 1.552722760293157e-05,
"loss": 1.0271,
"mean_token_accuracy": 0.7386983633041382,
"num_tokens": 8847168.0,
"step": 245
},
{
"entropy": 0.987109375,
"epoch": 0.7849293563579278,
"grad_norm": 5.1621550529599585,
"learning_rate": 1.5297129596165684e-05,
"loss": 0.9863,
"mean_token_accuracy": 0.7422987043857574,
"num_tokens": 9025797.0,
"step": 250
},
{
"entropy": 1.087109375,
"epoch": 0.8006279434850864,
"grad_norm": 5.452415452645883,
"learning_rate": 1.5063064888413048e-05,
"loss": 1.0758,
"mean_token_accuracy": 0.7265674948692322,
"num_tokens": 9207096.0,
"step": 255
},
{
"entropy": 1.03203125,
"epoch": 0.8163265306122449,
"grad_norm": 4.632036015662455,
"learning_rate": 1.4825208756621354e-05,
"loss": 1.0237,
"mean_token_accuracy": 0.7348480224609375,
"num_tokens": 9388986.0,
"step": 260
},
{
"entropy": 1.013671875,
"epoch": 0.8320251177394035,
"grad_norm": 5.170243707129603,
"learning_rate": 1.4583739316907188e-05,
"loss": 1.0082,
"mean_token_accuracy": 0.7390485882759095,
"num_tokens": 9574926.0,
"step": 265
},
{
"entropy": 1.047265625,
"epoch": 0.847723704866562,
"grad_norm": 4.549399122747884,
"learning_rate": 1.4338837391175582e-05,
"loss": 1.0387,
"mean_token_accuracy": 0.7376436531543732,
"num_tokens": 9741165.0,
"step": 270
},
{
"entropy": 0.97578125,
"epoch": 0.8634222919937206,
"grad_norm": 4.415042132554178,
"learning_rate": 1.4090686371713403e-05,
"loss": 0.977,
"mean_token_accuracy": 0.7484280169010162,
"num_tokens": 9932402.0,
"step": 275
},
{
"entropy": 0.996484375,
"epoch": 0.8791208791208791,
"grad_norm": 4.520054645554902,
"learning_rate": 1.3839472083857912e-05,
"loss": 0.9802,
"mean_token_accuracy": 0.7440161406993866,
"num_tokens": 10116131.0,
"step": 280
},
{
"entropy": 1.01328125,
"epoch": 0.8948194662480377,
"grad_norm": 4.353453268472847,
"learning_rate": 1.3585382646843396e-05,
"loss": 1.0053,
"mean_token_accuracy": 0.7423594474792481,
"num_tokens": 10301345.0,
"step": 285
},
{
"entropy": 0.94609375,
"epoch": 0.9105180533751962,
"grad_norm": 4.478156722785245,
"learning_rate": 1.332860833293e-05,
"loss": 0.9375,
"mean_token_accuracy": 0.752207487821579,
"num_tokens": 10487099.0,
"step": 290
},
{
"entropy": 0.991796875,
"epoch": 0.9262166405023547,
"grad_norm": 5.467454719870752,
"learning_rate": 1.3069341424920301e-05,
"loss": 0.9833,
"mean_token_accuracy": 0.7454055905342102,
"num_tokens": 10668291.0,
"step": 295
},
{
"entropy": 0.991796875,
"epoch": 0.9419152276295133,
"grad_norm": 5.779761643334634,
"learning_rate": 1.280777607217031e-05,
"loss": 1.0012,
"mean_token_accuracy": 0.7384802579879761,
"num_tokens": 10838776.0,
"step": 300
},
{
"entropy": 1.019140625,
"epoch": 0.957613814756672,
"grad_norm": 6.047537199685854,
"learning_rate": 1.2544108145202748e-05,
"loss": 1.0125,
"mean_token_accuracy": 0.7406556665897369,
"num_tokens": 11015732.0,
"step": 305
},
{
"entropy": 0.985546875,
"epoch": 0.9733124018838305,
"grad_norm": 4.609576682854215,
"learning_rate": 1.2278535089031377e-05,
"loss": 0.9779,
"mean_token_accuracy": 0.7456105411052704,
"num_tokens": 11215029.0,
"step": 310
},
{
"entropy": 0.979296875,
"epoch": 0.989010989010989,
"grad_norm": 4.594430034893244,
"learning_rate": 1.2011255775306378e-05,
"loss": 0.9851,
"mean_token_accuracy": 0.7425277471542359,
"num_tokens": 11397796.0,
"step": 315
},
{
"entropy": 0.9288194444444444,
"epoch": 1.0031397174254317,
"grad_norm": 5.271756092829798,
"learning_rate": 1.1742470353391329e-05,
"loss": 0.8636,
"mean_token_accuracy": 0.7742305133077834,
"num_tokens": 11566189.0,
"step": 320
},
{
"entropy": 0.527734375,
"epoch": 1.0188383045525902,
"grad_norm": 4.26050787407313,
"learning_rate": 1.1472380100483438e-05,
"loss": 0.5265,
"mean_token_accuracy": 0.8555980503559113,
"num_tokens": 11763172.0,
"step": 325
},
{
"entropy": 0.6,
"epoch": 1.0345368916797488,
"grad_norm": 4.175751994504683,
"learning_rate": 1.1201187270889166e-05,
"loss": 0.5485,
"mean_token_accuracy": 0.8497718095779419,
"num_tokens": 11945459.0,
"step": 330
},
{
"entropy": 0.5111328125,
"epoch": 1.0502354788069075,
"grad_norm": 4.128005949558796,
"learning_rate": 1.0929094944568182e-05,
"loss": 0.4993,
"mean_token_accuracy": 0.862428092956543,
"num_tokens": 12128965.0,
"step": 335
},
{
"entropy": 0.6125,
"epoch": 1.065934065934066,
"grad_norm": 3.837079491617288,
"learning_rate": 1.0656306875059024e-05,
"loss": 0.5625,
"mean_token_accuracy": 0.8477550685405731,
"num_tokens": 12328856.0,
"step": 340
},
{
"entropy": 0.550390625,
"epoch": 1.0816326530612246,
"grad_norm": 5.12821302444939,
"learning_rate": 1.0383027336900356e-05,
"loss": 0.5442,
"mean_token_accuracy": 0.8556217610836029,
"num_tokens": 12504085.0,
"step": 345
},
{
"entropy": 0.608203125,
"epoch": 1.097331240188383,
"grad_norm": 4.192313568974114,
"learning_rate": 1.0109460972662081e-05,
"loss": 0.5784,
"mean_token_accuracy": 0.8412099361419678,
"num_tokens": 12682650.0,
"step": 350
},
{
"entropy": 0.5048828125,
"epoch": 1.1130298273155417,
"grad_norm": 4.016027796950457,
"learning_rate": 9.835812639700862e-06,
"loss": 0.4762,
"mean_token_accuracy": 0.8649287343025207,
"num_tokens": 12870774.0,
"step": 355
},
{
"entropy": 0.5435546875,
"epoch": 1.1287284144427001,
"grad_norm": 4.461511861583663,
"learning_rate": 9.562287256754791e-06,
"loss": 0.5157,
"mean_token_accuracy": 0.8568434357643128,
"num_tokens": 13058291.0,
"step": 360
},
{
"entropy": 0.526953125,
"epoch": 1.1444270015698588,
"grad_norm": 3.960694420334278,
"learning_rate": 9.289089650492119e-06,
"loss": 0.4905,
"mean_token_accuracy": 0.8625445365905762,
"num_tokens": 13239182.0,
"step": 365
},
{
"entropy": 0.5201171875,
"epoch": 1.1601255886970172,
"grad_norm": 3.936087366602336,
"learning_rate": 9.016424402128891e-06,
"loss": 0.5054,
"mean_token_accuracy": 0.8619497656822205,
"num_tokens": 13421892.0,
"step": 370
},
{
"entropy": 0.58828125,
"epoch": 1.1758241758241759,
"grad_norm": 3.9961031282423227,
"learning_rate": 8.744495694230413e-06,
"loss": 0.5439,
"mean_token_accuracy": 0.8468229651451111,
"num_tokens": 13593687.0,
"step": 375
},
{
"entropy": 0.4953125,
"epoch": 1.1915227629513343,
"grad_norm": 4.322236217725683,
"learning_rate": 8.473507157811254e-06,
"loss": 0.4683,
"mean_token_accuracy": 0.8697527587413788,
"num_tokens": 13780570.0,
"step": 380
},
{
"entropy": 0.529296875,
"epoch": 1.207221350078493,
"grad_norm": 5.141990553056302,
"learning_rate": 8.203661719848249e-06,
"loss": 0.4863,
"mean_token_accuracy": 0.8653198599815368,
"num_tokens": 13968713.0,
"step": 385
},
{
"entropy": 0.5470703125,
"epoch": 1.2229199372056514,
"grad_norm": 3.842210785861936,
"learning_rate": 7.935161451320696e-06,
"loss": 0.5175,
"mean_token_accuracy": 0.858714509010315,
"num_tokens": 14142058.0,
"step": 390
},
{
"entropy": 0.553125,
"epoch": 1.23861852433281,
"grad_norm": 5.339381588516691,
"learning_rate": 7.668207415891625e-06,
"loss": 0.534,
"mean_token_accuracy": 0.8537694931030273,
"num_tokens": 14324073.0,
"step": 395
},
{
"entropy": 0.539453125,
"epoch": 1.2543171114599687,
"grad_norm": 4.116642693044476,
"learning_rate": 7.402999519343319e-06,
"loss": 0.501,
"mean_token_accuracy": 0.8621095418930054,
"num_tokens": 14493070.0,
"step": 400
},
{
"entropy": 0.50625,
"epoch": 1.2700156985871272,
"grad_norm": 3.8927696021633973,
"learning_rate": 7.139736359879916e-06,
"loss": 0.4792,
"mean_token_accuracy": 0.8644804239273072,
"num_tokens": 14667152.0,
"step": 405
},
{
"entropy": 0.5326171875,
"epoch": 1.2857142857142856,
"grad_norm": 3.9499973554079313,
"learning_rate": 6.878615079409221e-06,
"loss": 0.4959,
"mean_token_accuracy": 0.8623277962207794,
"num_tokens": 14841585.0,
"step": 410
},
{
"entropy": 0.52265625,
"epoch": 1.3014128728414442,
"grad_norm": 4.630623780872735,
"learning_rate": 6.619831215914974e-06,
"loss": 0.4879,
"mean_token_accuracy": 0.8631112694740295,
"num_tokens": 15018117.0,
"step": 415
},
{
"entropy": 0.534765625,
"epoch": 1.317111459968603,
"grad_norm": 4.099657499983856,
"learning_rate": 6.363578557030285e-06,
"loss": 0.5156,
"mean_token_accuracy": 0.8561935067176819,
"num_tokens": 15198528.0,
"step": 420
},
{
"entropy": 0.540234375,
"epoch": 1.3328100470957613,
"grad_norm": 4.639376931843806,
"learning_rate": 6.110048994921735e-06,
"loss": 0.5195,
"mean_token_accuracy": 0.8552192032337189,
"num_tokens": 15368254.0,
"step": 425
},
{
"entropy": 0.564453125,
"epoch": 1.34850863422292,
"grad_norm": 3.6122566097775257,
"learning_rate": 5.859432382592895e-06,
"loss": 0.5143,
"mean_token_accuracy": 0.8563300728797912,
"num_tokens": 15537168.0,
"step": 430
},
{
"entropy": 0.5,
"epoch": 1.3642072213500784,
"grad_norm": 4.097499019050656,
"learning_rate": 5.611916391714887e-06,
"loss": 0.4718,
"mean_token_accuracy": 0.868126118183136,
"num_tokens": 15712027.0,
"step": 435
},
{
"entropy": 0.512890625,
"epoch": 1.379905808477237,
"grad_norm": 3.8541646824269042,
"learning_rate": 5.367686372090359e-06,
"loss": 0.4784,
"mean_token_accuracy": 0.8659977853298187,
"num_tokens": 15895860.0,
"step": 440
},
{
"entropy": 0.5048828125,
"epoch": 1.3956043956043955,
"grad_norm": 3.8898545956491763,
"learning_rate": 5.126925212856202e-06,
"loss": 0.4616,
"mean_token_accuracy": 0.8711130917072296,
"num_tokens": 16076742.0,
"step": 445
},
{
"entropy": 0.4904296875,
"epoch": 1.4113029827315542,
"grad_norm": 4.486423301199419,
"learning_rate": 4.889813205528895e-06,
"loss": 0.4697,
"mean_token_accuracy": 0.8675071418285369,
"num_tokens": 16253720.0,
"step": 450
},
{
"entropy": 0.522265625,
"epoch": 1.4270015698587128,
"grad_norm": 4.353150769090539,
"learning_rate": 4.65652790899508e-06,
"loss": 0.4806,
"mean_token_accuracy": 0.8667589604854584,
"num_tokens": 16430030.0,
"step": 455
},
{
"entropy": 0.508203125,
"epoch": 1.4427001569858713,
"grad_norm": 3.724503033243514,
"learning_rate": 4.427244016548375e-06,
"loss": 0.4856,
"mean_token_accuracy": 0.8651303589344025,
"num_tokens": 16620012.0,
"step": 460
},
{
"entropy": 0.5498046875,
"epoch": 1.4583987441130297,
"grad_norm": 3.8968644379129165,
"learning_rate": 4.202133225072153e-06,
"loss": 0.5123,
"mean_token_accuracy": 0.8566905677318573,
"num_tokens": 16799772.0,
"step": 465
},
{
"entropy": 0.5142578125,
"epoch": 1.4740973312401884,
"grad_norm": 4.227950531214132,
"learning_rate": 3.9813641064660525e-06,
"loss": 0.4776,
"mean_token_accuracy": 0.8667101263999939,
"num_tokens": 16980171.0,
"step": 470
},
{
"entropy": 0.4716796875,
"epoch": 1.489795918367347,
"grad_norm": 3.7526226508520275,
"learning_rate": 3.7651019814126656e-06,
"loss": 0.4512,
"mean_token_accuracy": 0.8713797450065612,
"num_tokens": 17159904.0,
"step": 475
},
{
"entropy": 0.481640625,
"epoch": 1.5054945054945055,
"grad_norm": 3.6868580303818583,
"learning_rate": 3.5535087955788396e-06,
"loss": 0.4425,
"mean_token_accuracy": 0.8738309502601623,
"num_tokens": 17337452.0,
"step": 480
},
{
"entropy": 0.5099609375,
"epoch": 1.521193092621664,
"grad_norm": 3.695453877176934,
"learning_rate": 3.3467429983443477e-06,
"loss": 0.4807,
"mean_token_accuracy": 0.8654368996620179,
"num_tokens": 17524548.0,
"step": 485
},
{
"entropy": 0.4744140625,
"epoch": 1.5368916797488226,
"grad_norm": 3.54819349493318,
"learning_rate": 3.144959424148666e-06,
"loss": 0.4421,
"mean_token_accuracy": 0.8751842856407166,
"num_tokens": 17722299.0,
"step": 490
},
{
"entropy": 0.488671875,
"epoch": 1.5525902668759812,
"grad_norm": 3.8797108965135285,
"learning_rate": 2.9483091765448426e-06,
"loss": 0.453,
"mean_token_accuracy": 0.8750694751739502,
"num_tokens": 17902084.0,
"step": 495
},
{
"entropy": 0.481640625,
"epoch": 1.5682888540031397,
"grad_norm": 3.992381863433219,
"learning_rate": 2.756939515047108e-06,
"loss": 0.4511,
"mean_token_accuracy": 0.8725055634975434,
"num_tokens": 18079745.0,
"step": 500
},
{
"entropy": 0.503125,
"epoch": 1.5839874411302983,
"grad_norm": 3.6288480240477794,
"learning_rate": 2.570993744857151e-06,
"loss": 0.4688,
"mean_token_accuracy": 0.8687343716621398,
"num_tokens": 18258975.0,
"step": 505
},
{
"entropy": 0.49375,
"epoch": 1.599686028257457,
"grad_norm": 3.9138568832414684,
"learning_rate": 2.390611109551456e-06,
"loss": 0.4576,
"mean_token_accuracy": 0.8736713230609894,
"num_tokens": 18436950.0,
"step": 510
},
{
"entropy": 0.4779296875,
"epoch": 1.6153846153846154,
"grad_norm": 3.974668183681603,
"learning_rate": 2.215926686810206e-06,
"loss": 0.4624,
"mean_token_accuracy": 0.8699750483036042,
"num_tokens": 18612816.0,
"step": 515
},
{
"entropy": 0.46953125,
"epoch": 1.6310832025117739,
"grad_norm": 3.417678338297766,
"learning_rate": 2.047071287265735e-06,
"loss": 0.4274,
"mean_token_accuracy": 0.8788953125476837,
"num_tokens": 18779655.0,
"step": 520
},
{
"entropy": 0.512109375,
"epoch": 1.6467817896389325,
"grad_norm": 4.3168528799542365,
"learning_rate": 1.8841713565463548e-06,
"loss": 0.4806,
"mean_token_accuracy": 0.8664675652980804,
"num_tokens": 18963728.0,
"step": 525
},
{
"entropy": 0.521484375,
"epoch": 1.6624803767660912,
"grad_norm": 3.8378909961117746,
"learning_rate": 1.727348880588815e-06,
"loss": 0.4898,
"mean_token_accuracy": 0.864307701587677,
"num_tokens": 19155989.0,
"step": 530
},
{
"entropy": 0.5279296875,
"epoch": 1.6781789638932496,
"grad_norm": 3.7445661009980395,
"learning_rate": 1.5767212942904275e-06,
"loss": 0.4797,
"mean_token_accuracy": 0.8675626039505004,
"num_tokens": 19331660.0,
"step": 535
},
{
"entropy": 0.49140625,
"epoch": 1.693877551020408,
"grad_norm": 3.57312597751139,
"learning_rate": 1.4324013935691205e-06,
"loss": 0.4652,
"mean_token_accuracy": 0.8723213970661163,
"num_tokens": 19506633.0,
"step": 540
},
{
"entropy": 0.4623046875,
"epoch": 1.7095761381475667,
"grad_norm": 3.8004849150443754,
"learning_rate": 1.2944972508973908e-06,
"loss": 0.4387,
"mean_token_accuracy": 0.8761039733886719,
"num_tokens": 19687893.0,
"step": 545
},
{
"entropy": 0.4958984375,
"epoch": 1.7252747252747254,
"grad_norm": 3.7246541942244846,
"learning_rate": 1.1631121343733443e-06,
"loss": 0.457,
"mean_token_accuracy": 0.8720730483531952,
"num_tokens": 19861818.0,
"step": 550
},
{
"entropy": 0.5181640625,
"epoch": 1.7409733124018838,
"grad_norm": 3.771046430655077,
"learning_rate": 1.0383444303894453e-06,
"loss": 0.4931,
"mean_token_accuracy": 0.8654853940010071,
"num_tokens": 20051183.0,
"step": 555
},
{
"entropy": 0.4501953125,
"epoch": 1.7566718995290422,
"grad_norm": 3.136760621936483,
"learning_rate": 9.202875699568636e-07,
"loss": 0.4119,
"mean_token_accuracy": 0.8824858725070953,
"num_tokens": 20240768.0,
"step": 560
},
{
"entropy": 0.4869140625,
"epoch": 1.772370486656201,
"grad_norm": 3.821451067956206,
"learning_rate": 8.090299587406514e-07,
"loss": 0.4514,
"mean_token_accuracy": 0.8730863869190216,
"num_tokens": 20418509.0,
"step": 565
},
{
"entropy": 0.47734375,
"epoch": 1.7880690737833596,
"grad_norm": 3.8388982673994567,
"learning_rate": 7.04654910858038e-07,
"loss": 0.4506,
"mean_token_accuracy": 0.8735210478305817,
"num_tokens": 20598161.0,
"step": 570
},
{
"entropy": 0.45625,
"epoch": 1.803767660910518,
"grad_norm": 3.9183565034692465,
"learning_rate": 6.072405864895403e-07,
"loss": 0.4307,
"mean_token_accuracy": 0.8798979103565217,
"num_tokens": 20779725.0,
"step": 575
},
{
"entropy": 0.4697265625,
"epoch": 1.8194662480376766,
"grad_norm": 4.022575031872933,
"learning_rate": 5.16859933349495e-07,
"loss": 0.4312,
"mean_token_accuracy": 0.8792614817619324,
"num_tokens": 20948735.0,
"step": 580
},
{
"entropy": 0.4603515625,
"epoch": 1.8351648351648353,
"grad_norm": 3.4815300156968076,
"learning_rate": 4.335806320599234e-07,
"loss": 0.4257,
"mean_token_accuracy": 0.8804711699485779,
"num_tokens": 21121016.0,
"step": 585
},
{
"entropy": 0.4689453125,
"epoch": 1.8508634222919937,
"grad_norm": 3.4871494725317707,
"learning_rate": 3.574650454685902e-07,
"loss": 0.4385,
"mean_token_accuracy": 0.8778711676597595,
"num_tokens": 21311277.0,
"step": 590
},
{
"entropy": 0.4796875,
"epoch": 1.8665620094191522,
"grad_norm": 3.882125284807203,
"learning_rate": 2.8857017194923174e-07,
"loss": 0.4416,
"mean_token_accuracy": 0.8744481921195983,
"num_tokens": 21491007.0,
"step": 595
},
{
"entropy": 0.48984375,
"epoch": 1.8822605965463108,
"grad_norm": 3.5907652067766356,
"learning_rate": 2.2694760271890215e-07,
"loss": 0.46,
"mean_token_accuracy": 0.8710661828517914,
"num_tokens": 21670814.0,
"step": 600
},
{
"entropy": 0.4677734375,
"epoch": 1.8979591836734695,
"grad_norm": 3.5415128539129705,
"learning_rate": 1.7264348320442992e-07,
"loss": 0.4244,
"mean_token_accuracy": 0.8819806277751923,
"num_tokens": 21855006.0,
"step": 605
},
{
"entropy": 0.5015625,
"epoch": 1.913657770800628,
"grad_norm": 4.003861542545578,
"learning_rate": 1.256984784868842e-07,
"loss": 0.4683,
"mean_token_accuracy": 0.8715729892253876,
"num_tokens": 22033513.0,
"step": 610
},
{
"entropy": 0.470703125,
"epoch": 1.9293563579277864,
"grad_norm": 3.119159854744513,
"learning_rate": 8.614774284994797e-08,
"loss": 0.443,
"mean_token_accuracy": 0.8770529448986053,
"num_tokens": 22220738.0,
"step": 615
},
{
"entropy": 0.455078125,
"epoch": 1.945054945054945,
"grad_norm": 3.873386527152864,
"learning_rate": 5.402089345499795e-08,
"loss": 0.4261,
"mean_token_accuracy": 0.8815056264400483,
"num_tokens": 22411420.0,
"step": 620
},
{
"entropy": 0.448828125,
"epoch": 1.9607535321821037,
"grad_norm": 3.503514019607563,
"learning_rate": 2.9341988162595593e-08,
"loss": 0.4299,
"mean_token_accuracy": 0.8768543183803559,
"num_tokens": 22594533.0,
"step": 625
},
{
"entropy": 0.49296875,
"epoch": 1.9764521193092621,
"grad_norm": 3.6439296442719877,
"learning_rate": 1.2129507517003591e-08,
"loss": 0.4646,
"mean_token_accuracy": 0.8703228771686554,
"num_tokens": 22774333.0,
"step": 630
},
{
"entropy": 0.4900390625,
"epoch": 1.9921507064364206,
"grad_norm": 3.4854702032469973,
"learning_rate": 2.396340907225847e-09,
"loss": 0.46,
"mean_token_accuracy": 0.8727392196655274,
"num_tokens": 22959536.0,
"step": 635
},
{
"entropy": 0.5125,
"epoch": 2.0,
"mean_token_accuracy": 0.8682946562767029,
"num_tokens": 23058558.0,
"step": 638,
"total_flos": 77930701455360.0,
"train_loss": 0.7463928270489445,
"train_runtime": 1636.8404,
"train_samples_per_second": 24.899,
"train_steps_per_second": 0.39
}
],
"logging_steps": 5,
"max_steps": 638,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 77930701455360.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}