checkito24's picture
Upload folder using huggingface_hub
91bb64b verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 87,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 0.3709419071674347,
"epoch": 0.011494252873563218,
"grad_norm": 320.0,
"learning_rate": 5e-05,
"loss": 4.9724,
"mean_token_accuracy": 0.5486404895782471,
"num_tokens": 1657.0,
"step": 1
},
{
"entropy": 1.0495511293411255,
"epoch": 0.022988505747126436,
"grad_norm": 41.0,
"learning_rate": 5e-05,
"loss": 1.9782,
"mean_token_accuracy": 0.7108578085899353,
"num_tokens": 3326.0,
"step": 2
},
{
"entropy": 2.898812770843506,
"epoch": 0.034482758620689655,
"grad_norm": 133.0,
"learning_rate": 5e-05,
"loss": 2.3167,
"mean_token_accuracy": 0.6533090472221375,
"num_tokens": 4975.0,
"step": 3
},
{
"entropy": 2.2339625358581543,
"epoch": 0.04597701149425287,
"grad_norm": 30.5,
"learning_rate": 5e-05,
"loss": 1.384,
"mean_token_accuracy": 0.7507635951042175,
"num_tokens": 6614.0,
"step": 4
},
{
"entropy": 1.8993045091629028,
"epoch": 0.05747126436781609,
"grad_norm": 74.5,
"learning_rate": 5e-05,
"loss": 1.2395,
"mean_token_accuracy": 0.7583282589912415,
"num_tokens": 8267.0,
"step": 5
},
{
"entropy": 1.1725226640701294,
"epoch": 0.06896551724137931,
"grad_norm": 18.875,
"learning_rate": 5e-05,
"loss": 0.9074,
"mean_token_accuracy": 0.8194946050643921,
"num_tokens": 9931.0,
"step": 6
},
{
"entropy": 0.9498358368873596,
"epoch": 0.08045977011494253,
"grad_norm": 11.875,
"learning_rate": 5e-05,
"loss": 0.6491,
"mean_token_accuracy": 0.8492492437362671,
"num_tokens": 11598.0,
"step": 7
},
{
"entropy": 0.6594022512435913,
"epoch": 0.09195402298850575,
"grad_norm": 12.0625,
"learning_rate": 5e-05,
"loss": 0.4387,
"mean_token_accuracy": 0.888014554977417,
"num_tokens": 13252.0,
"step": 8
},
{
"entropy": 0.3986137807369232,
"epoch": 0.10344827586206896,
"grad_norm": 9.375,
"learning_rate": 5e-05,
"loss": 0.2872,
"mean_token_accuracy": 0.9253910779953003,
"num_tokens": 14916.0,
"step": 9
},
{
"entropy": 0.25546491146087646,
"epoch": 0.11494252873563218,
"grad_norm": 11.0,
"learning_rate": 5e-05,
"loss": 0.194,
"mean_token_accuracy": 0.9643934965133667,
"num_tokens": 16575.0,
"step": 10
},
{
"entropy": 0.1774519979953766,
"epoch": 0.12643678160919541,
"grad_norm": 6.375,
"learning_rate": 5e-05,
"loss": 0.1092,
"mean_token_accuracy": 0.978941023349762,
"num_tokens": 18239.0,
"step": 11
},
{
"entropy": 0.13791397213935852,
"epoch": 0.13793103448275862,
"grad_norm": 5.625,
"learning_rate": 5e-05,
"loss": 0.1203,
"mean_token_accuracy": 0.9824984669685364,
"num_tokens": 19898.0,
"step": 12
},
{
"entropy": 0.08871836215257645,
"epoch": 0.14942528735632185,
"grad_norm": 3.5625,
"learning_rate": 5e-05,
"loss": 0.0618,
"mean_token_accuracy": 0.9909420013427734,
"num_tokens": 21556.0,
"step": 13
},
{
"entropy": 0.06183680146932602,
"epoch": 0.16091954022988506,
"grad_norm": 3.796875,
"learning_rate": 5e-05,
"loss": 0.088,
"mean_token_accuracy": 0.9915305376052856,
"num_tokens": 23211.0,
"step": 14
},
{
"entropy": 0.06007755175232887,
"epoch": 0.1724137931034483,
"grad_norm": 2.984375,
"learning_rate": 5e-05,
"loss": 0.0707,
"mean_token_accuracy": 0.9921497702598572,
"num_tokens": 24869.0,
"step": 15
},
{
"entropy": 0.05882040038704872,
"epoch": 0.1839080459770115,
"grad_norm": 1.796875,
"learning_rate": 5e-05,
"loss": 0.0539,
"mean_token_accuracy": 0.9939903616905212,
"num_tokens": 26535.0,
"step": 16
},
{
"entropy": 0.04828764125704765,
"epoch": 0.19540229885057472,
"grad_norm": 1.7890625,
"learning_rate": 5e-05,
"loss": 0.0478,
"mean_token_accuracy": 0.9945717453956604,
"num_tokens": 28195.0,
"step": 17
},
{
"entropy": 0.044041331857442856,
"epoch": 0.20689655172413793,
"grad_norm": 2.046875,
"learning_rate": 5e-05,
"loss": 0.0495,
"mean_token_accuracy": 0.9951807260513306,
"num_tokens": 29857.0,
"step": 18
},
{
"entropy": 0.0408758781850338,
"epoch": 0.21839080459770116,
"grad_norm": 1.625,
"learning_rate": 5e-05,
"loss": 0.0329,
"mean_token_accuracy": 0.9952038526535034,
"num_tokens": 31527.0,
"step": 19
},
{
"entropy": 0.035789985209703445,
"epoch": 0.22988505747126436,
"grad_norm": 1.734375,
"learning_rate": 5e-05,
"loss": 0.046,
"mean_token_accuracy": 0.9945717453956604,
"num_tokens": 33187.0,
"step": 20
},
{
"entropy": 0.035998038947582245,
"epoch": 0.2413793103448276,
"grad_norm": 1.453125,
"learning_rate": 5e-05,
"loss": 0.0357,
"mean_token_accuracy": 0.9933494329452515,
"num_tokens": 34843.0,
"step": 21
},
{
"entropy": 0.03482215851545334,
"epoch": 0.25287356321839083,
"grad_norm": 1.90625,
"learning_rate": 5e-05,
"loss": 0.0454,
"mean_token_accuracy": 0.9927536249160767,
"num_tokens": 36501.0,
"step": 22
},
{
"entropy": 0.028973015025258064,
"epoch": 0.26436781609195403,
"grad_norm": 1.59375,
"learning_rate": 5e-05,
"loss": 0.0345,
"mean_token_accuracy": 0.9963768124580383,
"num_tokens": 38159.0,
"step": 23
},
{
"entropy": 0.029489481821656227,
"epoch": 0.27586206896551724,
"grad_norm": 0.83203125,
"learning_rate": 5e-05,
"loss": 0.0275,
"mean_token_accuracy": 0.996372401714325,
"num_tokens": 39815.0,
"step": 24
},
{
"entropy": 0.03362823650240898,
"epoch": 0.28735632183908044,
"grad_norm": 1.2421875,
"learning_rate": 5e-05,
"loss": 0.0309,
"mean_token_accuracy": 0.9957805871963501,
"num_tokens": 41476.0,
"step": 25
},
{
"entropy": 0.029244303703308105,
"epoch": 0.2988505747126437,
"grad_norm": 1.5078125,
"learning_rate": 5e-05,
"loss": 0.0346,
"mean_token_accuracy": 0.9957805871963501,
"num_tokens": 43137.0,
"step": 26
},
{
"entropy": 0.03644675388932228,
"epoch": 0.3103448275862069,
"grad_norm": 2.0,
"learning_rate": 5e-05,
"loss": 0.035,
"mean_token_accuracy": 0.9933934211730957,
"num_tokens": 44804.0,
"step": 27
},
{
"entropy": 0.03685041889548302,
"epoch": 0.3218390804597701,
"grad_norm": 2.1875,
"learning_rate": 5e-05,
"loss": 0.047,
"mean_token_accuracy": 0.9933574795722961,
"num_tokens": 46462.0,
"step": 28
},
{
"entropy": 0.02877631224691868,
"epoch": 0.3333333333333333,
"grad_norm": 0.8828125,
"learning_rate": 5e-05,
"loss": 0.0217,
"mean_token_accuracy": 0.9957343339920044,
"num_tokens": 48105.0,
"step": 29
},
{
"entropy": 0.040678467601537704,
"epoch": 0.3448275862068966,
"grad_norm": 1.0078125,
"learning_rate": 5e-05,
"loss": 0.0332,
"mean_token_accuracy": 0.9940083622932434,
"num_tokens": 49776.0,
"step": 30
},
{
"entropy": 0.027814431115984917,
"epoch": 0.3563218390804598,
"grad_norm": 0.83984375,
"learning_rate": 5e-05,
"loss": 0.0301,
"mean_token_accuracy": 0.9957395195960999,
"num_tokens": 51421.0,
"step": 31
},
{
"entropy": 0.027178222313523293,
"epoch": 0.367816091954023,
"grad_norm": 0.6640625,
"learning_rate": 5e-05,
"loss": 0.0214,
"mean_token_accuracy": 0.9963614344596863,
"num_tokens": 53072.0,
"step": 32
},
{
"entropy": 0.031195349991321564,
"epoch": 0.3793103448275862,
"grad_norm": 0.59375,
"learning_rate": 5e-05,
"loss": 0.0248,
"mean_token_accuracy": 0.9957652688026428,
"num_tokens": 54727.0,
"step": 33
},
{
"entropy": 0.029933765530586243,
"epoch": 0.39080459770114945,
"grad_norm": 0.58203125,
"learning_rate": 5e-05,
"loss": 0.0191,
"mean_token_accuracy": 0.9957957863807678,
"num_tokens": 56394.0,
"step": 34
},
{
"entropy": 0.03709409758448601,
"epoch": 0.40229885057471265,
"grad_norm": 1.890625,
"learning_rate": 5e-05,
"loss": 0.0453,
"mean_token_accuracy": 0.9921592473983765,
"num_tokens": 58054.0,
"step": 35
},
{
"entropy": 0.03175961226224899,
"epoch": 0.41379310344827586,
"grad_norm": 0.73828125,
"learning_rate": 5e-05,
"loss": 0.0288,
"mean_token_accuracy": 0.9951778054237366,
"num_tokens": 59715.0,
"step": 36
},
{
"entropy": 0.03274337574839592,
"epoch": 0.42528735632183906,
"grad_norm": 1.0546875,
"learning_rate": 5e-05,
"loss": 0.0242,
"mean_token_accuracy": 0.9939283728599548,
"num_tokens": 61364.0,
"step": 37
},
{
"entropy": 0.03900443762540817,
"epoch": 0.4367816091954023,
"grad_norm": 1.2421875,
"learning_rate": 5e-05,
"loss": 0.0329,
"mean_token_accuracy": 0.9927841424942017,
"num_tokens": 63029.0,
"step": 38
},
{
"entropy": 0.03216073289513588,
"epoch": 0.4482758620689655,
"grad_norm": 0.85546875,
"learning_rate": 5e-05,
"loss": 0.0407,
"mean_token_accuracy": 0.9933854341506958,
"num_tokens": 64694.0,
"step": 39
},
{
"entropy": 0.026994826272130013,
"epoch": 0.45977011494252873,
"grad_norm": 0.9921875,
"learning_rate": 5e-05,
"loss": 0.0336,
"mean_token_accuracy": 0.9939283728599548,
"num_tokens": 66343.0,
"step": 40
},
{
"entropy": 0.028900790959596634,
"epoch": 0.47126436781609193,
"grad_norm": 0.51171875,
"learning_rate": 5e-05,
"loss": 0.0191,
"mean_token_accuracy": 0.9969879388809204,
"num_tokens": 68005.0,
"step": 41
},
{
"entropy": 0.017634548246860504,
"epoch": 0.4827586206896552,
"grad_norm": 0.609375,
"learning_rate": 5e-05,
"loss": 0.0143,
"mean_token_accuracy": 0.9975772500038147,
"num_tokens": 69658.0,
"step": 42
},
{
"entropy": 0.02459569275379181,
"epoch": 0.4942528735632184,
"grad_norm": 0.8671875,
"learning_rate": 5e-05,
"loss": 0.0194,
"mean_token_accuracy": 0.9969861507415771,
"num_tokens": 71319.0,
"step": 43
},
{
"entropy": 0.03161758929491043,
"epoch": 0.5057471264367817,
"grad_norm": 0.7890625,
"learning_rate": 5e-05,
"loss": 0.0183,
"mean_token_accuracy": 0.9970042109489441,
"num_tokens": 72990.0,
"step": 44
},
{
"entropy": 0.02282997965812683,
"epoch": 0.5172413793103449,
"grad_norm": 0.67578125,
"learning_rate": 5e-05,
"loss": 0.0157,
"mean_token_accuracy": 0.9963877201080322,
"num_tokens": 74653.0,
"step": 45
},
{
"entropy": 0.027062978595495224,
"epoch": 0.5287356321839081,
"grad_norm": 1.109375,
"learning_rate": 5e-05,
"loss": 0.0234,
"mean_token_accuracy": 0.9957678318023682,
"num_tokens": 76309.0,
"step": 46
},
{
"entropy": 0.021824488416314125,
"epoch": 0.5402298850574713,
"grad_norm": 0.62890625,
"learning_rate": 5e-05,
"loss": 0.0175,
"mean_token_accuracy": 0.9975845217704773,
"num_tokens": 77967.0,
"step": 47
},
{
"entropy": 0.0219434704631567,
"epoch": 0.5517241379310345,
"grad_norm": 0.59375,
"learning_rate": 5e-05,
"loss": 0.0184,
"mean_token_accuracy": 0.9969567656517029,
"num_tokens": 79612.0,
"step": 48
},
{
"entropy": 0.018387747928500175,
"epoch": 0.5632183908045977,
"grad_norm": 0.68359375,
"learning_rate": 5e-05,
"loss": 0.022,
"mean_token_accuracy": 0.9952009320259094,
"num_tokens": 81281.0,
"step": 49
},
{
"entropy": 0.018757076933979988,
"epoch": 0.5747126436781609,
"grad_norm": 1.65625,
"learning_rate": 5e-05,
"loss": 0.0229,
"mean_token_accuracy": 0.9963658452033997,
"num_tokens": 82934.0,
"step": 50
},
{
"entropy": 0.016816047951579094,
"epoch": 0.5862068965517241,
"grad_norm": 1.125,
"learning_rate": 5e-05,
"loss": 0.0268,
"mean_token_accuracy": 0.996964156627655,
"num_tokens": 84583.0,
"step": 51
},
{
"entropy": 0.020629465579986572,
"epoch": 0.5977011494252874,
"grad_norm": 0.55078125,
"learning_rate": 5e-05,
"loss": 0.0212,
"mean_token_accuracy": 0.9952067136764526,
"num_tokens": 86254.0,
"step": 52
},
{
"entropy": 0.01931397244334221,
"epoch": 0.6091954022988506,
"grad_norm": 0.91015625,
"learning_rate": 5e-05,
"loss": 0.0251,
"mean_token_accuracy": 0.9957831501960754,
"num_tokens": 87916.0,
"step": 53
},
{
"entropy": 0.015544063411653042,
"epoch": 0.6206896551724138,
"grad_norm": 0.5625,
"learning_rate": 5e-05,
"loss": 0.015,
"mean_token_accuracy": 0.9969770312309265,
"num_tokens": 89572.0,
"step": 54
},
{
"entropy": 0.016354193910956383,
"epoch": 0.632183908045977,
"grad_norm": 0.8671875,
"learning_rate": 5e-05,
"loss": 0.0276,
"mean_token_accuracy": 0.9957957863807678,
"num_tokens": 91239.0,
"step": 55
},
{
"entropy": 0.02520756609737873,
"epoch": 0.6436781609195402,
"grad_norm": 0.7109375,
"learning_rate": 5e-05,
"loss": 0.0166,
"mean_token_accuracy": 0.9957882165908813,
"num_tokens": 92903.0,
"step": 56
},
{
"entropy": 0.017131350934505463,
"epoch": 0.6551724137931034,
"grad_norm": 0.494140625,
"learning_rate": 5e-05,
"loss": 0.0127,
"mean_token_accuracy": 0.9975830912590027,
"num_tokens": 94560.0,
"step": 57
},
{
"entropy": 0.01473468728363514,
"epoch": 0.6666666666666666,
"grad_norm": 0.6171875,
"learning_rate": 5e-05,
"loss": 0.0117,
"mean_token_accuracy": 0.995785653591156,
"num_tokens": 96223.0,
"step": 58
},
{
"entropy": 0.014780101366341114,
"epoch": 0.6781609195402298,
"grad_norm": 0.50390625,
"learning_rate": 5e-05,
"loss": 0.0166,
"mean_token_accuracy": 0.9969788789749146,
"num_tokens": 97880.0,
"step": 59
},
{
"entropy": 0.02004481479525566,
"epoch": 0.6896551724137931,
"grad_norm": 3.640625,
"learning_rate": 5e-05,
"loss": 0.0245,
"mean_token_accuracy": 0.9957678318023682,
"num_tokens": 99536.0,
"step": 60
},
{
"entropy": 0.02079402096569538,
"epoch": 0.7011494252873564,
"grad_norm": 0.8984375,
"learning_rate": 5e-05,
"loss": 0.0256,
"mean_token_accuracy": 0.9957652688026428,
"num_tokens": 101191.0,
"step": 61
},
{
"entropy": 0.01947801001369953,
"epoch": 0.7126436781609196,
"grad_norm": 0.67578125,
"learning_rate": 5e-05,
"loss": 0.0214,
"mean_token_accuracy": 0.9970059990882874,
"num_tokens": 102863.0,
"step": 62
},
{
"entropy": 0.021246658638119698,
"epoch": 0.7241379310344828,
"grad_norm": 12.75,
"learning_rate": 5e-05,
"loss": 0.0144,
"mean_token_accuracy": 0.9970005750656128,
"num_tokens": 104532.0,
"step": 63
},
{
"entropy": 0.01568767800927162,
"epoch": 0.735632183908046,
"grad_norm": 0.77734375,
"learning_rate": 5e-05,
"loss": 0.0137,
"mean_token_accuracy": 0.996995210647583,
"num_tokens": 106198.0,
"step": 64
},
{
"entropy": 0.018703024834394455,
"epoch": 0.7471264367816092,
"grad_norm": 0.796875,
"learning_rate": 5e-05,
"loss": 0.0228,
"mean_token_accuracy": 0.9963658452033997,
"num_tokens": 107851.0,
"step": 65
},
{
"entropy": 0.019862579181790352,
"epoch": 0.7586206896551724,
"grad_norm": 0.60546875,
"learning_rate": 5e-05,
"loss": 0.0229,
"mean_token_accuracy": 0.9957805871963501,
"num_tokens": 109512.0,
"step": 66
},
{
"entropy": 0.012604566290974617,
"epoch": 0.7701149425287356,
"grad_norm": 1.046875,
"learning_rate": 5e-05,
"loss": 0.0147,
"mean_token_accuracy": 0.9975669384002686,
"num_tokens": 111158.0,
"step": 67
},
{
"entropy": 0.01671748049557209,
"epoch": 0.7816091954022989,
"grad_norm": 0.625,
"learning_rate": 5e-05,
"loss": 0.0175,
"mean_token_accuracy": 0.9969987869262695,
"num_tokens": 112826.0,
"step": 68
},
{
"entropy": 0.021043848246335983,
"epoch": 0.7931034482758621,
"grad_norm": 0.486328125,
"learning_rate": 5e-05,
"loss": 0.0144,
"mean_token_accuracy": 0.9958033561706543,
"num_tokens": 114496.0,
"step": 69
},
{
"entropy": 0.016358301043510437,
"epoch": 0.8045977011494253,
"grad_norm": 0.73828125,
"learning_rate": 5e-05,
"loss": 0.0145,
"mean_token_accuracy": 0.9981883764266968,
"num_tokens": 116154.0,
"step": 70
},
{
"entropy": 0.017069363966584206,
"epoch": 0.8160919540229885,
"grad_norm": 0.67578125,
"learning_rate": 5e-05,
"loss": 0.0193,
"mean_token_accuracy": 0.9963877201080322,
"num_tokens": 117817.0,
"step": 71
},
{
"entropy": 0.014579691924154758,
"epoch": 0.8275862068965517,
"grad_norm": 0.484375,
"learning_rate": 5e-05,
"loss": 0.0147,
"mean_token_accuracy": 0.9975830912590027,
"num_tokens": 119474.0,
"step": 72
},
{
"entropy": 0.016145411878824234,
"epoch": 0.8390804597701149,
"grad_norm": 0.859375,
"learning_rate": 5e-05,
"loss": 0.0173,
"mean_token_accuracy": 0.996372401714325,
"num_tokens": 121130.0,
"step": 73
},
{
"entropy": 0.015459074638783932,
"epoch": 0.8505747126436781,
"grad_norm": 0.484375,
"learning_rate": 5e-05,
"loss": 0.0142,
"mean_token_accuracy": 0.9975845217704773,
"num_tokens": 122788.0,
"step": 74
},
{
"entropy": 0.016901882365345955,
"epoch": 0.8620689655172413,
"grad_norm": 0.3671875,
"learning_rate": 5e-05,
"loss": 0.0142,
"mean_token_accuracy": 0.9969861507415771,
"num_tokens": 124449.0,
"step": 75
},
{
"entropy": 0.01234087347984314,
"epoch": 0.8735632183908046,
"grad_norm": 0.345703125,
"learning_rate": 5e-05,
"loss": 0.0084,
"mean_token_accuracy": 0.9969915747642517,
"num_tokens": 126113.0,
"step": 76
},
{
"entropy": 0.013088205829262733,
"epoch": 0.8850574712643678,
"grad_norm": 0.7109375,
"learning_rate": 5e-05,
"loss": 0.0201,
"mean_token_accuracy": 0.9969770312309265,
"num_tokens": 127769.0,
"step": 77
},
{
"entropy": 0.014324485324323177,
"epoch": 0.896551724137931,
"grad_norm": 0.6484375,
"learning_rate": 5e-05,
"loss": 0.0176,
"mean_token_accuracy": 0.9969770312309265,
"num_tokens": 129425.0,
"step": 78
},
{
"entropy": 0.015041607432067394,
"epoch": 0.9080459770114943,
"grad_norm": 0.67578125,
"learning_rate": 5e-05,
"loss": 0.0179,
"mean_token_accuracy": 0.9963658452033997,
"num_tokens": 131078.0,
"step": 79
},
{
"entropy": 0.011612314730882645,
"epoch": 0.9195402298850575,
"grad_norm": 0.3515625,
"learning_rate": 5e-05,
"loss": 0.0071,
"mean_token_accuracy": 0.9981971383094788,
"num_tokens": 132744.0,
"step": 80
},
{
"entropy": 0.014071737416088581,
"epoch": 0.9310344827586207,
"grad_norm": 0.361328125,
"learning_rate": 5e-05,
"loss": 0.0131,
"mean_token_accuracy": 0.9975990653038025,
"num_tokens": 134412.0,
"step": 81
},
{
"entropy": 0.013796189799904823,
"epoch": 0.9425287356321839,
"grad_norm": 0.57421875,
"learning_rate": 5e-05,
"loss": 0.0158,
"mean_token_accuracy": 0.9969715476036072,
"num_tokens": 136065.0,
"step": 82
},
{
"entropy": 0.01726696267724037,
"epoch": 0.9540229885057471,
"grad_norm": 0.796875,
"learning_rate": 5e-05,
"loss": 0.0319,
"mean_token_accuracy": 0.9946042895317078,
"num_tokens": 137735.0,
"step": 83
},
{
"entropy": 0.010913386940956116,
"epoch": 0.9655172413793104,
"grad_norm": 0.400390625,
"learning_rate": 5e-05,
"loss": 0.0136,
"mean_token_accuracy": 0.9975874423980713,
"num_tokens": 139395.0,
"step": 84
},
{
"entropy": 0.01786569133400917,
"epoch": 0.9770114942528736,
"grad_norm": 0.546875,
"learning_rate": 5e-05,
"loss": 0.0187,
"mean_token_accuracy": 0.9957907199859619,
"num_tokens": 141060.0,
"step": 85
},
{
"entropy": 0.015070064924657345,
"epoch": 0.9885057471264368,
"grad_norm": 1.890625,
"learning_rate": 5e-05,
"loss": 0.0169,
"mean_token_accuracy": 0.9969879388809204,
"num_tokens": 142722.0,
"step": 86
},
{
"entropy": 0.018884530290961266,
"epoch": 1.0,
"grad_norm": 0.77734375,
"learning_rate": 5e-05,
"loss": 0.0208,
"mean_token_accuracy": 0.9951603412628174,
"num_tokens": 144377.0,
"step": 87
},
{
"epoch": 1.0,
"eval_entropy": 0.017042334967603285,
"eval_loss": 0.01563352532684803,
"eval_mean_token_accuracy": 0.9966080685456594,
"eval_num_tokens": 144377.0,
"eval_runtime": 159.7482,
"eval_samples_per_second": 0.275,
"eval_steps_per_second": 0.038,
"step": 87
}
],
"logging_steps": 1,
"max_steps": 696,
"num_input_tokens_seen": 0,
"num_train_epochs": 8,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 87243904817664.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}