Fahad-S's picture
Upload train_prev_f4_model_bf16_ckpt600/trainer_state.json with huggingface_hub
f96f789 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.1278437465320166,
"eval_steps": 500,
"global_step": 600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0035512151814449007,
"grad_norm": 3.59375,
"learning_rate": 1.4285714285714286e-06,
"loss": 0.4053,
"mean_token_accuracy": 0.03334063550209976,
"step": 1
},
{
"epoch": 0.007102430362889801,
"grad_norm": 5.0625,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.4467,
"mean_token_accuracy": 0.035705664253327996,
"step": 2
},
{
"epoch": 0.010653645544334702,
"grad_norm": 3.84375,
"learning_rate": 4.2857142857142855e-06,
"loss": 0.412,
"mean_token_accuracy": 0.03214258457956021,
"step": 3
},
{
"epoch": 0.014204860725779603,
"grad_norm": 5.4375,
"learning_rate": 5.7142857142857145e-06,
"loss": 0.4373,
"mean_token_accuracy": 0.031811027434741845,
"step": 4
},
{
"epoch": 0.017756075907224503,
"grad_norm": 3.671875,
"learning_rate": 7.1428571428571436e-06,
"loss": 0.4044,
"mean_token_accuracy": 0.030799477463006042,
"step": 5
},
{
"epoch": 0.021307291088669404,
"grad_norm": 3.40625,
"learning_rate": 8.571428571428571e-06,
"loss": 0.4079,
"mean_token_accuracy": 0.029206037786934758,
"step": 6
},
{
"epoch": 0.024858506270114305,
"grad_norm": 3.15625,
"learning_rate": 1e-05,
"loss": 0.3746,
"mean_token_accuracy": 0.02718283714784775,
"step": 7
},
{
"epoch": 0.028409721451559206,
"grad_norm": 2.59375,
"learning_rate": 9.999980365120307e-06,
"loss": 0.3386,
"mean_token_accuracy": 0.029688632937904913,
"step": 8
},
{
"epoch": 0.0319609366330041,
"grad_norm": 3.296875,
"learning_rate": 9.999921460635436e-06,
"loss": 0.3371,
"mean_token_accuracy": 0.028260885846975725,
"step": 9
},
{
"epoch": 0.03551215181444901,
"grad_norm": 3.0,
"learning_rate": 9.999823287008022e-06,
"loss": 0.3134,
"mean_token_accuracy": 0.028826194098655833,
"step": 10
},
{
"epoch": 0.039063366995893904,
"grad_norm": 2.640625,
"learning_rate": 9.999685845009114e-06,
"loss": 0.3141,
"mean_token_accuracy": 0.03309909115705523,
"step": 11
},
{
"epoch": 0.04261458217733881,
"grad_norm": 2.3125,
"learning_rate": 9.999509135718176e-06,
"loss": 0.3091,
"mean_token_accuracy": 0.033378408043063246,
"step": 12
},
{
"epoch": 0.046165797358783706,
"grad_norm": 2.0625,
"learning_rate": 9.999293160523074e-06,
"loss": 0.3064,
"mean_token_accuracy": 0.031830415049626026,
"step": 13
},
{
"epoch": 0.04971701254022861,
"grad_norm": 2.046875,
"learning_rate": 9.999037921120068e-06,
"loss": 0.2926,
"mean_token_accuracy": 0.0313498438699753,
"step": 14
},
{
"epoch": 0.05326822772167351,
"grad_norm": 2.453125,
"learning_rate": 9.998743419513795e-06,
"loss": 0.2893,
"mean_token_accuracy": 0.0330949738236086,
"step": 15
},
{
"epoch": 0.05681944290311841,
"grad_norm": 2.3125,
"learning_rate": 9.998409658017256e-06,
"loss": 0.2984,
"mean_token_accuracy": 0.027138729998114286,
"step": 16
},
{
"epoch": 0.06037065808456331,
"grad_norm": 2.84375,
"learning_rate": 9.998036639251798e-06,
"loss": 0.2999,
"mean_token_accuracy": 0.03013355429357034,
"step": 17
},
{
"epoch": 0.0639218732660082,
"grad_norm": 2.515625,
"learning_rate": 9.997624366147094e-06,
"loss": 0.2867,
"mean_token_accuracy": 0.031100922809855547,
"step": 18
},
{
"epoch": 0.06747308844745312,
"grad_norm": 1.5703125,
"learning_rate": 9.997172841941114e-06,
"loss": 0.279,
"mean_token_accuracy": 0.03025696396798594,
"step": 19
},
{
"epoch": 0.07102430362889801,
"grad_norm": 1.6953125,
"learning_rate": 9.99668207018011e-06,
"loss": 0.2848,
"mean_token_accuracy": 0.032818032945215236,
"step": 20
},
{
"epoch": 0.07457551881034291,
"grad_norm": 1.3671875,
"learning_rate": 9.996152054718579e-06,
"loss": 0.2815,
"mean_token_accuracy": 0.03186455541072064,
"step": 21
},
{
"epoch": 0.07812673399178781,
"grad_norm": 0.98046875,
"learning_rate": 9.995582799719237e-06,
"loss": 0.2878,
"mean_token_accuracy": 0.0334690267482074,
"step": 22
},
{
"epoch": 0.08167794917323272,
"grad_norm": 1.234375,
"learning_rate": 9.994974309652984e-06,
"loss": 0.2826,
"mean_token_accuracy": 0.03061204146433738,
"step": 23
},
{
"epoch": 0.08522916435467762,
"grad_norm": 1.0859375,
"learning_rate": 9.994326589298875e-06,
"loss": 0.2673,
"mean_token_accuracy": 0.03804276299706544,
"step": 24
},
{
"epoch": 0.08878037953612251,
"grad_norm": 1.46875,
"learning_rate": 9.993639643744071e-06,
"loss": 0.2909,
"mean_token_accuracy": 0.033314747761323815,
"step": 25
},
{
"epoch": 0.09233159471756741,
"grad_norm": 1.6171875,
"learning_rate": 9.99291347838381e-06,
"loss": 0.2774,
"mean_token_accuracy": 0.030899848927219864,
"step": 26
},
{
"epoch": 0.09588280989901232,
"grad_norm": 1.015625,
"learning_rate": 9.992148098921361e-06,
"loss": 0.2609,
"mean_token_accuracy": 0.033753670039004646,
"step": 27
},
{
"epoch": 0.09943402508045722,
"grad_norm": 1.046875,
"learning_rate": 9.99134351136798e-06,
"loss": 0.2721,
"mean_token_accuracy": 0.03772788319474785,
"step": 28
},
{
"epoch": 0.10298524026190212,
"grad_norm": 1.1171875,
"learning_rate": 9.990499722042852e-06,
"loss": 0.2751,
"mean_token_accuracy": 0.032295682049152674,
"step": 29
},
{
"epoch": 0.10653645544334701,
"grad_norm": 1.4453125,
"learning_rate": 9.989616737573064e-06,
"loss": 0.2781,
"mean_token_accuracy": 0.033647085841948865,
"step": 30
},
{
"epoch": 0.11008767062479193,
"grad_norm": 1.2890625,
"learning_rate": 9.98869456489353e-06,
"loss": 0.2779,
"mean_token_accuracy": 0.028794426627428038,
"step": 31
},
{
"epoch": 0.11363888580623682,
"grad_norm": 0.89453125,
"learning_rate": 9.987733211246952e-06,
"loss": 0.2607,
"mean_token_accuracy": 0.032419106319139246,
"step": 32
},
{
"epoch": 0.11719010098768172,
"grad_norm": 1.1640625,
"learning_rate": 9.986732684183753e-06,
"loss": 0.2605,
"mean_token_accuracy": 0.037521557434956776,
"step": 33
},
{
"epoch": 0.12074131616912662,
"grad_norm": 1.6328125,
"learning_rate": 9.985692991562026e-06,
"loss": 0.2634,
"mean_token_accuracy": 0.040194204961153446,
"step": 34
},
{
"epoch": 0.12429253135057153,
"grad_norm": 1.28125,
"learning_rate": 9.984614141547468e-06,
"loss": 0.2557,
"mean_token_accuracy": 0.039635664583329344,
"step": 35
},
{
"epoch": 0.1278437465320164,
"grad_norm": 1.546875,
"learning_rate": 9.983496142613314e-06,
"loss": 0.2685,
"mean_token_accuracy": 0.029371898828685516,
"step": 36
},
{
"epoch": 0.13139496171346132,
"grad_norm": 0.921875,
"learning_rate": 9.982339003540272e-06,
"loss": 0.2693,
"mean_token_accuracy": 0.03898763568940922,
"step": 37
},
{
"epoch": 0.13494617689490623,
"grad_norm": 1.1015625,
"learning_rate": 9.981142733416457e-06,
"loss": 0.2679,
"mean_token_accuracy": 0.03345580608220189,
"step": 38
},
{
"epoch": 0.13849739207635112,
"grad_norm": 0.96875,
"learning_rate": 9.97990734163732e-06,
"loss": 0.2602,
"mean_token_accuracy": 0.03686568816374347,
"step": 39
},
{
"epoch": 0.14204860725779603,
"grad_norm": 0.84375,
"learning_rate": 9.978632837905566e-06,
"loss": 0.262,
"mean_token_accuracy": 0.03514496624120511,
"step": 40
},
{
"epoch": 0.14559982243924094,
"grad_norm": 1.2265625,
"learning_rate": 9.977319232231088e-06,
"loss": 0.264,
"mean_token_accuracy": 0.03551164961390896,
"step": 41
},
{
"epoch": 0.14915103762068582,
"grad_norm": 1.2578125,
"learning_rate": 9.975966534930879e-06,
"loss": 0.2641,
"mean_token_accuracy": 0.03131604355075979,
"step": 42
},
{
"epoch": 0.15270225280213073,
"grad_norm": 1.0703125,
"learning_rate": 9.974574756628961e-06,
"loss": 0.2663,
"mean_token_accuracy": 0.035479316422424745,
"step": 43
},
{
"epoch": 0.15625346798357562,
"grad_norm": 1.15625,
"learning_rate": 9.973143908256291e-06,
"loss": 0.2637,
"mean_token_accuracy": 0.03412504114021431,
"step": 44
},
{
"epoch": 0.15980468316502053,
"grad_norm": 1.5546875,
"learning_rate": 9.971674001050687e-06,
"loss": 0.2602,
"mean_token_accuracy": 0.038116581108624814,
"step": 45
},
{
"epoch": 0.16335589834646544,
"grad_norm": 1.125,
"learning_rate": 9.970165046556726e-06,
"loss": 0.2529,
"mean_token_accuracy": 0.03723000747777405,
"step": 46
},
{
"epoch": 0.16690711352791032,
"grad_norm": 1.0234375,
"learning_rate": 9.968617056625665e-06,
"loss": 0.2597,
"mean_token_accuracy": 0.031160971375356894,
"step": 47
},
{
"epoch": 0.17045832870935523,
"grad_norm": 0.98828125,
"learning_rate": 9.967030043415345e-06,
"loss": 0.2654,
"mean_token_accuracy": 0.03424461206304841,
"step": 48
},
{
"epoch": 0.17400954389080014,
"grad_norm": 1.25,
"learning_rate": 9.965404019390087e-06,
"loss": 0.2778,
"mean_token_accuracy": 0.030317287382786162,
"step": 49
},
{
"epoch": 0.17756075907224503,
"grad_norm": 1.1484375,
"learning_rate": 9.963738997320609e-06,
"loss": 0.2656,
"mean_token_accuracy": 0.028600392921362072,
"step": 50
},
{
"epoch": 0.18111197425368994,
"grad_norm": 1.15625,
"learning_rate": 9.962034990283912e-06,
"loss": 0.28,
"mean_token_accuracy": 0.03231685267383,
"step": 51
},
{
"epoch": 0.18466318943513482,
"grad_norm": 1.0,
"learning_rate": 9.960292011663186e-06,
"loss": 0.2533,
"mean_token_accuracy": 0.03403068816260202,
"step": 52
},
{
"epoch": 0.18821440461657973,
"grad_norm": 1.25,
"learning_rate": 9.958510075147703e-06,
"loss": 0.259,
"mean_token_accuracy": 0.032833737386681605,
"step": 53
},
{
"epoch": 0.19176561979802464,
"grad_norm": 1.34375,
"learning_rate": 9.956689194732702e-06,
"loss": 0.2656,
"mean_token_accuracy": 0.035690422784682596,
"step": 54
},
{
"epoch": 0.19531683497946953,
"grad_norm": 0.99609375,
"learning_rate": 9.954829384719296e-06,
"loss": 0.2532,
"mean_token_accuracy": 0.036441230346099474,
"step": 55
},
{
"epoch": 0.19886805016091444,
"grad_norm": 1.0546875,
"learning_rate": 9.95293065971434e-06,
"loss": 0.2651,
"mean_token_accuracy": 0.034422322398313554,
"step": 56
},
{
"epoch": 0.20241926534235935,
"grad_norm": 1.140625,
"learning_rate": 9.950993034630328e-06,
"loss": 0.2657,
"mean_token_accuracy": 0.03684557715678238,
"step": 57
},
{
"epoch": 0.20597048052380423,
"grad_norm": 1.2890625,
"learning_rate": 9.949016524685277e-06,
"loss": 0.2558,
"mean_token_accuracy": 0.03600433477549814,
"step": 58
},
{
"epoch": 0.20952169570524914,
"grad_norm": 0.9921875,
"learning_rate": 9.947001145402598e-06,
"loss": 0.2528,
"mean_token_accuracy": 0.03322864700021455,
"step": 59
},
{
"epoch": 0.21307291088669403,
"grad_norm": 0.90625,
"learning_rate": 9.944946912610986e-06,
"loss": 0.2468,
"mean_token_accuracy": 0.03672743546849233,
"step": 60
},
{
"epoch": 0.21662412606813894,
"grad_norm": 0.87890625,
"learning_rate": 9.942853842444283e-06,
"loss": 0.27,
"mean_token_accuracy": 0.032799736989545636,
"step": 61
},
{
"epoch": 0.22017534124958385,
"grad_norm": 1.59375,
"learning_rate": 9.940721951341365e-06,
"loss": 0.2626,
"mean_token_accuracy": 0.03218006519819028,
"step": 62
},
{
"epoch": 0.22372655643102873,
"grad_norm": 1.21875,
"learning_rate": 9.938551256046e-06,
"loss": 0.2527,
"mean_token_accuracy": 0.03779199095151853,
"step": 63
},
{
"epoch": 0.22727777161247364,
"grad_norm": 1.140625,
"learning_rate": 9.936341773606723e-06,
"loss": 0.2505,
"mean_token_accuracy": 0.03585068928805413,
"step": 64
},
{
"epoch": 0.23082898679391856,
"grad_norm": 0.91015625,
"learning_rate": 9.934093521376707e-06,
"loss": 0.2453,
"mean_token_accuracy": 0.03667068052163813,
"step": 65
},
{
"epoch": 0.23438020197536344,
"grad_norm": 1.125,
"learning_rate": 9.931806517013612e-06,
"loss": 0.2549,
"mean_token_accuracy": 0.03179566226390307,
"step": 66
},
{
"epoch": 0.23793141715680835,
"grad_norm": 1.5546875,
"learning_rate": 9.929480778479465e-06,
"loss": 0.2578,
"mean_token_accuracy": 0.03751813623966882,
"step": 67
},
{
"epoch": 0.24148263233825323,
"grad_norm": 1.828125,
"learning_rate": 9.9271163240405e-06,
"loss": 0.2516,
"mean_token_accuracy": 0.038412457124650246,
"step": 68
},
{
"epoch": 0.24503384751969814,
"grad_norm": 1.015625,
"learning_rate": 9.92471317226703e-06,
"loss": 0.2507,
"mean_token_accuracy": 0.034873580965722795,
"step": 69
},
{
"epoch": 0.24858506270114306,
"grad_norm": 1.7109375,
"learning_rate": 9.922271342033295e-06,
"loss": 0.239,
"mean_token_accuracy": 0.03720474839792587,
"step": 70
},
{
"epoch": 0.25213627788258797,
"grad_norm": 1.1171875,
"learning_rate": 9.919790852517313e-06,
"loss": 0.2463,
"mean_token_accuracy": 0.0361522939929273,
"step": 71
},
{
"epoch": 0.2556874930640328,
"grad_norm": 1.0390625,
"learning_rate": 9.917271723200725e-06,
"loss": 0.2585,
"mean_token_accuracy": 0.033959298038098495,
"step": 72
},
{
"epoch": 0.25923870824547773,
"grad_norm": 1.03125,
"learning_rate": 9.914713973868654e-06,
"loss": 0.2585,
"mean_token_accuracy": 0.035365104355150834,
"step": 73
},
{
"epoch": 0.26278992342692264,
"grad_norm": 1.078125,
"learning_rate": 9.91211762460954e-06,
"loss": 0.2674,
"mean_token_accuracy": 0.03293743663743953,
"step": 74
},
{
"epoch": 0.26634113860836756,
"grad_norm": 0.9609375,
"learning_rate": 9.909482695814986e-06,
"loss": 0.2446,
"mean_token_accuracy": 0.0343262117858103,
"step": 75
},
{
"epoch": 0.26989235378981247,
"grad_norm": 1.1640625,
"learning_rate": 9.906809208179593e-06,
"loss": 0.2521,
"mean_token_accuracy": 0.03826202965865377,
"step": 76
},
{
"epoch": 0.2734435689712574,
"grad_norm": 1.515625,
"learning_rate": 9.904097182700806e-06,
"loss": 0.2589,
"mean_token_accuracy": 0.03630730328222853,
"step": 77
},
{
"epoch": 0.27699478415270223,
"grad_norm": 1.1328125,
"learning_rate": 9.901346640678744e-06,
"loss": 0.2413,
"mean_token_accuracy": 0.03989503123011673,
"step": 78
},
{
"epoch": 0.28054599933414714,
"grad_norm": 1.484375,
"learning_rate": 9.898557603716031e-06,
"loss": 0.2658,
"mean_token_accuracy": 0.03677298163893283,
"step": 79
},
{
"epoch": 0.28409721451559206,
"grad_norm": 1.09375,
"learning_rate": 9.895730093717629e-06,
"loss": 0.252,
"mean_token_accuracy": 0.03291640393581474,
"step": 80
},
{
"epoch": 0.28764842969703697,
"grad_norm": 0.98046875,
"learning_rate": 9.892864132890663e-06,
"loss": 0.2562,
"mean_token_accuracy": 0.03596020668192068,
"step": 81
},
{
"epoch": 0.2911996448784819,
"grad_norm": 1.2734375,
"learning_rate": 9.889959743744253e-06,
"loss": 0.2475,
"mean_token_accuracy": 0.03590105600960669,
"step": 82
},
{
"epoch": 0.29475086005992673,
"grad_norm": 1.3203125,
"learning_rate": 9.887016949089334e-06,
"loss": 0.2616,
"mean_token_accuracy": 0.03479768028228136,
"step": 83
},
{
"epoch": 0.29830207524137164,
"grad_norm": 1.0546875,
"learning_rate": 9.884035772038471e-06,
"loss": 0.2488,
"mean_token_accuracy": 0.042390721162519185,
"step": 84
},
{
"epoch": 0.30185329042281656,
"grad_norm": 1.203125,
"learning_rate": 9.881016236005686e-06,
"loss": 0.2452,
"mean_token_accuracy": 0.03633292374070152,
"step": 85
},
{
"epoch": 0.30540450560426147,
"grad_norm": 0.828125,
"learning_rate": 9.877958364706269e-06,
"loss": 0.2486,
"mean_token_accuracy": 0.03563892778765876,
"step": 86
},
{
"epoch": 0.3089557207857064,
"grad_norm": 1.0546875,
"learning_rate": 9.874862182156596e-06,
"loss": 0.2606,
"mean_token_accuracy": 0.031386454902531113,
"step": 87
},
{
"epoch": 0.31250693596715123,
"grad_norm": 1.078125,
"learning_rate": 9.871727712673931e-06,
"loss": 0.2539,
"mean_token_accuracy": 0.03635412478615763,
"step": 88
},
{
"epoch": 0.31605815114859614,
"grad_norm": 1.0390625,
"learning_rate": 9.868554980876253e-06,
"loss": 0.2501,
"mean_token_accuracy": 0.03346586779662175,
"step": 89
},
{
"epoch": 0.31960936633004106,
"grad_norm": 1.7734375,
"learning_rate": 9.865344011682038e-06,
"loss": 0.261,
"mean_token_accuracy": 0.03928510249170358,
"step": 90
},
{
"epoch": 0.32316058151148597,
"grad_norm": 1.234375,
"learning_rate": 9.86209483031009e-06,
"loss": 0.2564,
"mean_token_accuracy": 0.03208838186401408,
"step": 91
},
{
"epoch": 0.3267117966929309,
"grad_norm": 1.75,
"learning_rate": 9.858807462279319e-06,
"loss": 0.2611,
"mean_token_accuracy": 0.034783691704433295,
"step": 92
},
{
"epoch": 0.3302630118743758,
"grad_norm": 1.3828125,
"learning_rate": 9.855481933408557e-06,
"loss": 0.2563,
"mean_token_accuracy": 0.035772691921010846,
"step": 93
},
{
"epoch": 0.33381422705582064,
"grad_norm": 2.015625,
"learning_rate": 9.852118269816348e-06,
"loss": 0.2562,
"mean_token_accuracy": 0.03401207756360236,
"step": 94
},
{
"epoch": 0.33736544223726556,
"grad_norm": 1.328125,
"learning_rate": 9.848716497920742e-06,
"loss": 0.25,
"mean_token_accuracy": 0.033619258623730275,
"step": 95
},
{
"epoch": 0.34091665741871047,
"grad_norm": 1.7265625,
"learning_rate": 9.845276644439093e-06,
"loss": 0.2491,
"mean_token_accuracy": 0.03523858997687057,
"step": 96
},
{
"epoch": 0.3444678726001554,
"grad_norm": 1.2578125,
"learning_rate": 9.841798736387846e-06,
"loss": 0.2603,
"mean_token_accuracy": 0.036500092544883955,
"step": 97
},
{
"epoch": 0.3480190877816003,
"grad_norm": 1.1484375,
"learning_rate": 9.838282801082322e-06,
"loss": 0.2508,
"mean_token_accuracy": 0.03253259204575443,
"step": 98
},
{
"epoch": 0.35157030296304514,
"grad_norm": 1.0703125,
"learning_rate": 9.834728866136506e-06,
"loss": 0.2622,
"mean_token_accuracy": 0.0348993868137768,
"step": 99
},
{
"epoch": 0.35512151814449006,
"grad_norm": 1.015625,
"learning_rate": 9.831136959462835e-06,
"loss": 0.2535,
"mean_token_accuracy": 0.03625574364923523,
"step": 100
},
{
"epoch": 0.35867273332593497,
"grad_norm": 2.1875,
"learning_rate": 9.82750710927197e-06,
"loss": 0.2343,
"mean_token_accuracy": 0.038484412667457946,
"step": 101
},
{
"epoch": 0.3622239485073799,
"grad_norm": 1.109375,
"learning_rate": 9.823839344072582e-06,
"loss": 0.2504,
"mean_token_accuracy": 0.04149091762883472,
"step": 102
},
{
"epoch": 0.3657751636888248,
"grad_norm": 1.0234375,
"learning_rate": 9.820133692671116e-06,
"loss": 0.248,
"mean_token_accuracy": 0.03215024942983291,
"step": 103
},
{
"epoch": 0.36932637887026964,
"grad_norm": 1.078125,
"learning_rate": 9.816390184171587e-06,
"loss": 0.2525,
"mean_token_accuracy": 0.032526576575037325,
"step": 104
},
{
"epoch": 0.37287759405171456,
"grad_norm": 1.3125,
"learning_rate": 9.812608847975327e-06,
"loss": 0.2373,
"mean_token_accuracy": 0.03747400016436586,
"step": 105
},
{
"epoch": 0.37642880923315947,
"grad_norm": 1.0234375,
"learning_rate": 9.808789713780768e-06,
"loss": 0.2594,
"mean_token_accuracy": 0.03837547679722775,
"step": 106
},
{
"epoch": 0.3799800244146044,
"grad_norm": 1.6015625,
"learning_rate": 9.804932811583208e-06,
"loss": 0.2468,
"mean_token_accuracy": 0.035400711036345456,
"step": 107
},
{
"epoch": 0.3835312395960493,
"grad_norm": 1.25,
"learning_rate": 9.801038171674571e-06,
"loss": 0.2549,
"mean_token_accuracy": 0.036432786924706306,
"step": 108
},
{
"epoch": 0.3870824547774942,
"grad_norm": 1.171875,
"learning_rate": 9.797105824643171e-06,
"loss": 0.264,
"mean_token_accuracy": 0.02840231164736906,
"step": 109
},
{
"epoch": 0.39063366995893906,
"grad_norm": 1.0859375,
"learning_rate": 9.793135801373472e-06,
"loss": 0.2612,
"mean_token_accuracy": 0.031225431059283437,
"step": 110
},
{
"epoch": 0.39418488514038397,
"grad_norm": 1.5546875,
"learning_rate": 9.789128133045846e-06,
"loss": 0.2458,
"mean_token_accuracy": 0.04181941410206491,
"step": 111
},
{
"epoch": 0.3977361003218289,
"grad_norm": 1.1953125,
"learning_rate": 9.785082851136327e-06,
"loss": 0.2472,
"mean_token_accuracy": 0.036744315055329935,
"step": 112
},
{
"epoch": 0.4012873155032738,
"grad_norm": 1.234375,
"learning_rate": 9.780999987416363e-06,
"loss": 0.2613,
"mean_token_accuracy": 0.03466283130183001,
"step": 113
},
{
"epoch": 0.4048385306847187,
"grad_norm": 1.015625,
"learning_rate": 9.776879573952573e-06,
"loss": 0.2608,
"mean_token_accuracy": 0.03970765234407736,
"step": 114
},
{
"epoch": 0.40838974586616356,
"grad_norm": 1.5234375,
"learning_rate": 9.772721643106483e-06,
"loss": 0.2509,
"mean_token_accuracy": 0.034389628966891905,
"step": 115
},
{
"epoch": 0.41194096104760847,
"grad_norm": 1.1796875,
"learning_rate": 9.768526227534286e-06,
"loss": 0.2492,
"mean_token_accuracy": 0.036198436089762254,
"step": 116
},
{
"epoch": 0.4154921762290534,
"grad_norm": 1.25,
"learning_rate": 9.764293360186568e-06,
"loss": 0.2477,
"mean_token_accuracy": 0.041034682388271904,
"step": 117
},
{
"epoch": 0.4190433914104983,
"grad_norm": 1.2734375,
"learning_rate": 9.760023074308067e-06,
"loss": 0.2589,
"mean_token_accuracy": 0.036038057656696765,
"step": 118
},
{
"epoch": 0.4225946065919432,
"grad_norm": 1.2734375,
"learning_rate": 9.755715403437405e-06,
"loss": 0.2536,
"mean_token_accuracy": 0.02993579488611431,
"step": 119
},
{
"epoch": 0.42614582177338806,
"grad_norm": 1.0390625,
"learning_rate": 9.75137038140682e-06,
"loss": 0.2483,
"mean_token_accuracy": 0.03487629652772739,
"step": 120
},
{
"epoch": 0.42969703695483297,
"grad_norm": 1.390625,
"learning_rate": 9.746988042341907e-06,
"loss": 0.2428,
"mean_token_accuracy": 0.03707023007882526,
"step": 121
},
{
"epoch": 0.4332482521362779,
"grad_norm": 1.3359375,
"learning_rate": 9.742568420661347e-06,
"loss": 0.2385,
"mean_token_accuracy": 0.038770213703173795,
"step": 122
},
{
"epoch": 0.4367994673177228,
"grad_norm": 1.109375,
"learning_rate": 9.738111551076633e-06,
"loss": 0.2562,
"mean_token_accuracy": 0.03519319925180753,
"step": 123
},
{
"epoch": 0.4403506824991677,
"grad_norm": 0.98046875,
"learning_rate": 9.733617468591806e-06,
"loss": 0.2484,
"mean_token_accuracy": 0.030770529268920654,
"step": 124
},
{
"epoch": 0.4439018976806126,
"grad_norm": 1.203125,
"learning_rate": 9.729086208503174e-06,
"loss": 0.2516,
"mean_token_accuracy": 0.036209895533829695,
"step": 125
},
{
"epoch": 0.44745311286205747,
"grad_norm": 1.1953125,
"learning_rate": 9.724517806399035e-06,
"loss": 0.2423,
"mean_token_accuracy": 0.03420353115507169,
"step": 126
},
{
"epoch": 0.4510043280435024,
"grad_norm": 2.0625,
"learning_rate": 9.7199122981594e-06,
"loss": 0.2584,
"mean_token_accuracy": 0.03600568573529017,
"step": 127
},
{
"epoch": 0.4545555432249473,
"grad_norm": 1.21875,
"learning_rate": 9.715269719955708e-06,
"loss": 0.2667,
"mean_token_accuracy": 0.03318146305173286,
"step": 128
},
{
"epoch": 0.4581067584063922,
"grad_norm": 1.4296875,
"learning_rate": 9.710590108250546e-06,
"loss": 0.2584,
"mean_token_accuracy": 0.03327207771144458,
"step": 129
},
{
"epoch": 0.4616579735878371,
"grad_norm": 1.28125,
"learning_rate": 9.705873499797358e-06,
"loss": 0.2487,
"mean_token_accuracy": 0.034562985882075736,
"step": 130
},
{
"epoch": 0.46520918876928197,
"grad_norm": 1.421875,
"learning_rate": 9.701119931640161e-06,
"loss": 0.2529,
"mean_token_accuracy": 0.03666910440369975,
"step": 131
},
{
"epoch": 0.4687604039507269,
"grad_norm": 1.984375,
"learning_rate": 9.69632944111325e-06,
"loss": 0.2563,
"mean_token_accuracy": 0.03438336936960695,
"step": 132
},
{
"epoch": 0.4723116191321718,
"grad_norm": 2.875,
"learning_rate": 9.691502065840905e-06,
"loss": 0.2461,
"mean_token_accuracy": 0.03849026275565848,
"step": 133
},
{
"epoch": 0.4758628343136167,
"grad_norm": 1.4921875,
"learning_rate": 9.686637843737104e-06,
"loss": 0.2565,
"mean_token_accuracy": 0.03409404997728416,
"step": 134
},
{
"epoch": 0.4794140494950616,
"grad_norm": 1.1484375,
"learning_rate": 9.681736813005207e-06,
"loss": 0.2568,
"mean_token_accuracy": 0.03154798768446199,
"step": 135
},
{
"epoch": 0.48296526467650647,
"grad_norm": 1.15625,
"learning_rate": 9.676799012137678e-06,
"loss": 0.2441,
"mean_token_accuracy": 0.03744548839313211,
"step": 136
},
{
"epoch": 0.4865164798579514,
"grad_norm": 1.3828125,
"learning_rate": 9.671824479915768e-06,
"loss": 0.2354,
"mean_token_accuracy": 0.04052274335481343,
"step": 137
},
{
"epoch": 0.4900676950393963,
"grad_norm": 1.5390625,
"learning_rate": 9.666813255409212e-06,
"loss": 0.2568,
"mean_token_accuracy": 0.0329462843874353,
"step": 138
},
{
"epoch": 0.4936189102208412,
"grad_norm": 1.390625,
"learning_rate": 9.661765377975924e-06,
"loss": 0.2613,
"mean_token_accuracy": 0.03440686877729604,
"step": 139
},
{
"epoch": 0.4971701254022861,
"grad_norm": 1.125,
"learning_rate": 9.656680887261693e-06,
"loss": 0.233,
"mean_token_accuracy": 0.03317031992082775,
"step": 140
},
{
"epoch": 0.500721340583731,
"grad_norm": 1.1171875,
"learning_rate": 9.651559823199865e-06,
"loss": 0.2564,
"mean_token_accuracy": 0.0330515707100858,
"step": 141
},
{
"epoch": 0.5042725557651759,
"grad_norm": 1.3046875,
"learning_rate": 9.646402226011028e-06,
"loss": 0.256,
"mean_token_accuracy": 0.03438013891536684,
"step": 142
},
{
"epoch": 0.5078237709466208,
"grad_norm": 1.171875,
"learning_rate": 9.641208136202705e-06,
"loss": 0.2457,
"mean_token_accuracy": 0.037212962459307164,
"step": 143
},
{
"epoch": 0.5113749861280656,
"grad_norm": 12.75,
"learning_rate": 9.635977594569025e-06,
"loss": 0.2567,
"mean_token_accuracy": 0.03464826566778356,
"step": 144
},
{
"epoch": 0.5149262013095106,
"grad_norm": 1.0234375,
"learning_rate": 9.630710642190412e-06,
"loss": 0.2432,
"mean_token_accuracy": 0.03332022005270119,
"step": 145
},
{
"epoch": 0.5184774164909555,
"grad_norm": 1.3984375,
"learning_rate": 9.625407320433257e-06,
"loss": 0.2496,
"mean_token_accuracy": 0.04005811481329147,
"step": 146
},
{
"epoch": 0.5220286316724004,
"grad_norm": 2.125,
"learning_rate": 9.620067670949593e-06,
"loss": 0.2492,
"mean_token_accuracy": 0.03430858852516394,
"step": 147
},
{
"epoch": 0.5255798468538453,
"grad_norm": 1.75,
"learning_rate": 9.614691735676768e-06,
"loss": 0.2512,
"mean_token_accuracy": 0.031662787096138345,
"step": 148
},
{
"epoch": 0.5291310620352903,
"grad_norm": 1.0625,
"learning_rate": 9.609279556837122e-06,
"loss": 0.2487,
"mean_token_accuracy": 0.03723479399923235,
"step": 149
},
{
"epoch": 0.5326822772167351,
"grad_norm": 1.09375,
"learning_rate": 9.603831176937645e-06,
"loss": 0.2581,
"mean_token_accuracy": 0.03453483120392775,
"step": 150
},
{
"epoch": 0.53623349239818,
"grad_norm": 0.91796875,
"learning_rate": 9.598346638769653e-06,
"loss": 0.2648,
"mean_token_accuracy": 0.03409193667539512,
"step": 151
},
{
"epoch": 0.5397847075796249,
"grad_norm": 1.25,
"learning_rate": 9.592825985408443e-06,
"loss": 0.2482,
"mean_token_accuracy": 0.037982173162163235,
"step": 152
},
{
"epoch": 0.5433359227610698,
"grad_norm": 1.0,
"learning_rate": 9.58726926021296e-06,
"loss": 0.2446,
"mean_token_accuracy": 0.032505186056368984,
"step": 153
},
{
"epoch": 0.5468871379425148,
"grad_norm": 1.25,
"learning_rate": 9.581676506825458e-06,
"loss": 0.2392,
"mean_token_accuracy": 0.03936906753733638,
"step": 154
},
{
"epoch": 0.5504383531239596,
"grad_norm": 5.5,
"learning_rate": 9.576047769171154e-06,
"loss": 0.253,
"mean_token_accuracy": 0.037195249842625344,
"step": 155
},
{
"epoch": 0.5539895683054045,
"grad_norm": 1.078125,
"learning_rate": 9.57038309145788e-06,
"loss": 0.2476,
"mean_token_accuracy": 0.033277792263106676,
"step": 156
},
{
"epoch": 0.5575407834868494,
"grad_norm": 1.140625,
"learning_rate": 9.564682518175745e-06,
"loss": 0.244,
"mean_token_accuracy": 0.03346487059025094,
"step": 157
},
{
"epoch": 0.5610919986682943,
"grad_norm": 1.0546875,
"learning_rate": 9.558946094096773e-06,
"loss": 0.26,
"mean_token_accuracy": 0.0327607999824977,
"step": 158
},
{
"epoch": 0.5646432138497393,
"grad_norm": 1.2109375,
"learning_rate": 9.553173864274567e-06,
"loss": 0.2459,
"mean_token_accuracy": 0.03798125943285413,
"step": 159
},
{
"epoch": 0.5681944290311841,
"grad_norm": 1.3984375,
"learning_rate": 9.547365874043939e-06,
"loss": 0.2391,
"mean_token_accuracy": 0.03454116692591924,
"step": 160
},
{
"epoch": 0.571745644212629,
"grad_norm": 1.625,
"learning_rate": 9.541522169020568e-06,
"loss": 0.2422,
"mean_token_accuracy": 0.03469104680698365,
"step": 161
},
{
"epoch": 0.5752968593940739,
"grad_norm": 1.09375,
"learning_rate": 9.535642795100628e-06,
"loss": 0.2408,
"mean_token_accuracy": 0.03519366278487723,
"step": 162
},
{
"epoch": 0.5788480745755188,
"grad_norm": 1.0390625,
"learning_rate": 9.529727798460443e-06,
"loss": 0.244,
"mean_token_accuracy": 0.033296961744781584,
"step": 163
},
{
"epoch": 0.5823992897569638,
"grad_norm": 1.2578125,
"learning_rate": 9.52377722555611e-06,
"loss": 0.2477,
"mean_token_accuracy": 0.03645497989964497,
"step": 164
},
{
"epoch": 0.5859505049384086,
"grad_norm": 1.328125,
"learning_rate": 9.517791123123141e-06,
"loss": 0.2481,
"mean_token_accuracy": 0.0361881392163923,
"step": 165
},
{
"epoch": 0.5895017201198535,
"grad_norm": 1.3828125,
"learning_rate": 9.5117695381761e-06,
"loss": 0.2482,
"mean_token_accuracy": 0.03308462850691285,
"step": 166
},
{
"epoch": 0.5930529353012984,
"grad_norm": 1.53125,
"learning_rate": 9.50571251800822e-06,
"loss": 0.2495,
"mean_token_accuracy": 0.0373438170299778,
"step": 167
},
{
"epoch": 0.5966041504827433,
"grad_norm": 1.2890625,
"learning_rate": 9.49962011019105e-06,
"loss": 0.2569,
"mean_token_accuracy": 0.030896698255673982,
"step": 168
},
{
"epoch": 0.6001553656641883,
"grad_norm": 1.7265625,
"learning_rate": 9.493492362574069e-06,
"loss": 0.2317,
"mean_token_accuracy": 0.03890984639656381,
"step": 169
},
{
"epoch": 0.6037065808456331,
"grad_norm": 1.671875,
"learning_rate": 9.487329323284306e-06,
"loss": 0.2605,
"mean_token_accuracy": 0.03156319166737376,
"step": 170
},
{
"epoch": 0.607257796027078,
"grad_norm": 2.09375,
"learning_rate": 9.481131040725982e-06,
"loss": 0.2543,
"mean_token_accuracy": 0.03919281046910328,
"step": 171
},
{
"epoch": 0.6108090112085229,
"grad_norm": 1.078125,
"learning_rate": 9.474897563580105e-06,
"loss": 0.2299,
"mean_token_accuracy": 0.04073215187599999,
"step": 172
},
{
"epoch": 0.6143602263899678,
"grad_norm": 1.140625,
"learning_rate": 9.468628940804109e-06,
"loss": 0.2372,
"mean_token_accuracy": 0.03550426434594556,
"step": 173
},
{
"epoch": 0.6179114415714128,
"grad_norm": 1.7109375,
"learning_rate": 9.46232522163145e-06,
"loss": 0.2577,
"mean_token_accuracy": 0.03367227574563003,
"step": 174
},
{
"epoch": 0.6214626567528576,
"grad_norm": 1.5078125,
"learning_rate": 9.45598645557124e-06,
"loss": 0.2492,
"mean_token_accuracy": 0.032757934073742945,
"step": 175
},
{
"epoch": 0.6250138719343025,
"grad_norm": 1.5859375,
"learning_rate": 9.44961269240784e-06,
"loss": 0.2462,
"mean_token_accuracy": 0.03420929316052934,
"step": 176
},
{
"epoch": 0.6285650871157474,
"grad_norm": 1.1640625,
"learning_rate": 9.443203982200479e-06,
"loss": 0.2592,
"mean_token_accuracy": 0.033935571223992156,
"step": 177
},
{
"epoch": 0.6321163022971923,
"grad_norm": 1.7421875,
"learning_rate": 9.436760375282858e-06,
"loss": 0.2395,
"mean_token_accuracy": 0.03958856422104873,
"step": 178
},
{
"epoch": 0.6356675174786373,
"grad_norm": 1.2734375,
"learning_rate": 9.430281922262758e-06,
"loss": 0.2387,
"mean_token_accuracy": 0.04119610415727948,
"step": 179
},
{
"epoch": 0.6392187326600821,
"grad_norm": 1.21875,
"learning_rate": 9.423768674021638e-06,
"loss": 0.2539,
"mean_token_accuracy": 0.03369366040897148,
"step": 180
},
{
"epoch": 0.642769947841527,
"grad_norm": 1.0703125,
"learning_rate": 9.417220681714232e-06,
"loss": 0.2444,
"mean_token_accuracy": 0.03464862687542336,
"step": 181
},
{
"epoch": 0.6463211630229719,
"grad_norm": 1.890625,
"learning_rate": 9.410637996768161e-06,
"loss": 0.2387,
"mean_token_accuracy": 0.035173144740838325,
"step": 182
},
{
"epoch": 0.6498723782044168,
"grad_norm": 1.3828125,
"learning_rate": 9.404020670883511e-06,
"loss": 0.2466,
"mean_token_accuracy": 0.0379961929138517,
"step": 183
},
{
"epoch": 0.6534235933858618,
"grad_norm": 1.0390625,
"learning_rate": 9.397368756032445e-06,
"loss": 0.2415,
"mean_token_accuracy": 0.03510807668862981,
"step": 184
},
{
"epoch": 0.6569748085673066,
"grad_norm": 1.375,
"learning_rate": 9.390682304458782e-06,
"loss": 0.235,
"mean_token_accuracy": 0.03696706932532834,
"step": 185
},
{
"epoch": 0.6605260237487516,
"grad_norm": 1.6328125,
"learning_rate": 9.38396136867759e-06,
"loss": 0.2437,
"mean_token_accuracy": 0.03457535789129906,
"step": 186
},
{
"epoch": 0.6640772389301964,
"grad_norm": 1.9140625,
"learning_rate": 9.377206001474773e-06,
"loss": 0.2503,
"mean_token_accuracy": 0.03181831787514966,
"step": 187
},
{
"epoch": 0.6676284541116413,
"grad_norm": 1.3359375,
"learning_rate": 9.370416255906663e-06,
"loss": 0.2533,
"mean_token_accuracy": 0.03447212269020383,
"step": 188
},
{
"epoch": 0.6711796692930863,
"grad_norm": 1.4765625,
"learning_rate": 9.363592185299593e-06,
"loss": 0.2508,
"mean_token_accuracy": 0.03599807388127374,
"step": 189
},
{
"epoch": 0.6747308844745311,
"grad_norm": 1.1484375,
"learning_rate": 9.356733843249487e-06,
"loss": 0.2452,
"mean_token_accuracy": 0.030227956107410137,
"step": 190
},
{
"epoch": 0.6782820996559761,
"grad_norm": 1.171875,
"learning_rate": 9.349841283621432e-06,
"loss": 0.2523,
"mean_token_accuracy": 0.030009737140062498,
"step": 191
},
{
"epoch": 0.6818333148374209,
"grad_norm": 1.2734375,
"learning_rate": 9.34291456054926e-06,
"loss": 0.2391,
"mean_token_accuracy": 0.036572398468706524,
"step": 192
},
{
"epoch": 0.6853845300188658,
"grad_norm": 1.796875,
"learning_rate": 9.33595372843512e-06,
"loss": 0.237,
"mean_token_accuracy": 0.034027565063297516,
"step": 193
},
{
"epoch": 0.6889357452003108,
"grad_norm": 1.4609375,
"learning_rate": 9.328958841949056e-06,
"loss": 0.2549,
"mean_token_accuracy": 0.032021424787672004,
"step": 194
},
{
"epoch": 0.6924869603817556,
"grad_norm": 1.3203125,
"learning_rate": 9.321929956028565e-06,
"loss": 0.2503,
"mean_token_accuracy": 0.037128334486624226,
"step": 195
},
{
"epoch": 0.6960381755632006,
"grad_norm": 1.234375,
"learning_rate": 9.31486712587818e-06,
"loss": 0.2519,
"mean_token_accuracy": 0.0347771145261504,
"step": 196
},
{
"epoch": 0.6995893907446454,
"grad_norm": 1.296875,
"learning_rate": 9.307770406969032e-06,
"loss": 0.2512,
"mean_token_accuracy": 0.03815152426250279,
"step": 197
},
{
"epoch": 0.7031406059260903,
"grad_norm": 1.234375,
"learning_rate": 9.300639855038405e-06,
"loss": 0.2442,
"mean_token_accuracy": 0.03452787006244762,
"step": 198
},
{
"epoch": 0.7066918211075353,
"grad_norm": 1.1953125,
"learning_rate": 9.293475526089316e-06,
"loss": 0.2431,
"mean_token_accuracy": 0.03716798722598469,
"step": 199
},
{
"epoch": 0.7102430362889801,
"grad_norm": 1.2578125,
"learning_rate": 9.286277476390056e-06,
"loss": 0.2421,
"mean_token_accuracy": 0.03371156241337303,
"step": 200
},
{
"epoch": 0.7137942514704251,
"grad_norm": 1.1328125,
"learning_rate": 9.279045762473764e-06,
"loss": 0.25,
"mean_token_accuracy": 0.0350483679867466,
"step": 201
},
{
"epoch": 0.7173454666518699,
"grad_norm": 1.1953125,
"learning_rate": 9.27178044113797e-06,
"loss": 0.2437,
"mean_token_accuracy": 0.03318321451479278,
"step": 202
},
{
"epoch": 0.7208966818333148,
"grad_norm": 1.1875,
"learning_rate": 9.264481569444157e-06,
"loss": 0.2437,
"mean_token_accuracy": 0.03552469089845545,
"step": 203
},
{
"epoch": 0.7244478970147598,
"grad_norm": 1.046875,
"learning_rate": 9.257149204717317e-06,
"loss": 0.2507,
"mean_token_accuracy": 0.03375845828668389,
"step": 204
},
{
"epoch": 0.7279991121962046,
"grad_norm": 2.375,
"learning_rate": 9.249783404545488e-06,
"loss": 0.2443,
"mean_token_accuracy": 0.03998926315762219,
"step": 205
},
{
"epoch": 0.7315503273776496,
"grad_norm": 1.1328125,
"learning_rate": 9.242384226779308e-06,
"loss": 0.2457,
"mean_token_accuracy": 0.035812200483633205,
"step": 206
},
{
"epoch": 0.7351015425590944,
"grad_norm": 1.140625,
"learning_rate": 9.234951729531564e-06,
"loss": 0.2365,
"mean_token_accuracy": 0.03771704700193368,
"step": 207
},
{
"epoch": 0.7386527577405393,
"grad_norm": 1.234375,
"learning_rate": 9.227485971176734e-06,
"loss": 0.2432,
"mean_token_accuracy": 0.04016880454582861,
"step": 208
},
{
"epoch": 0.7422039729219843,
"grad_norm": 1.2578125,
"learning_rate": 9.219987010350522e-06,
"loss": 0.2356,
"mean_token_accuracy": 0.038527078770130174,
"step": 209
},
{
"epoch": 0.7457551881034291,
"grad_norm": 1.0703125,
"learning_rate": 9.212454905949406e-06,
"loss": 0.2366,
"mean_token_accuracy": 0.03399550302310672,
"step": 210
},
{
"epoch": 0.7493064032848741,
"grad_norm": 1.2578125,
"learning_rate": 9.204889717130172e-06,
"loss": 0.2525,
"mean_token_accuracy": 0.035145990557793994,
"step": 211
},
{
"epoch": 0.7528576184663189,
"grad_norm": 1.046875,
"learning_rate": 9.197291503309448e-06,
"loss": 0.2378,
"mean_token_accuracy": 0.038536792555532884,
"step": 212
},
{
"epoch": 0.7564088336477638,
"grad_norm": 1.3125,
"learning_rate": 9.189660324163243e-06,
"loss": 0.2474,
"mean_token_accuracy": 0.037968925902532646,
"step": 213
},
{
"epoch": 0.7599600488292088,
"grad_norm": 1.3125,
"learning_rate": 9.181996239626468e-06,
"loss": 0.2373,
"mean_token_accuracy": 0.03542459754316951,
"step": 214
},
{
"epoch": 0.7635112640106536,
"grad_norm": 1.015625,
"learning_rate": 9.174299309892474e-06,
"loss": 0.2371,
"mean_token_accuracy": 0.03846056985275936,
"step": 215
},
{
"epoch": 0.7670624791920986,
"grad_norm": 1.46875,
"learning_rate": 9.166569595412576e-06,
"loss": 0.2308,
"mean_token_accuracy": 0.0358878808474401,
"step": 216
},
{
"epoch": 0.7706136943735434,
"grad_norm": 1.390625,
"learning_rate": 9.158807156895581e-06,
"loss": 0.24,
"mean_token_accuracy": 0.03474290976009797,
"step": 217
},
{
"epoch": 0.7741649095549884,
"grad_norm": 1.109375,
"learning_rate": 9.151012055307308e-06,
"loss": 0.2446,
"mean_token_accuracy": 0.03521274150625686,
"step": 218
},
{
"epoch": 0.7777161247364333,
"grad_norm": 1.1640625,
"learning_rate": 9.14318435187011e-06,
"loss": 0.242,
"mean_token_accuracy": 0.03299982330281637,
"step": 219
},
{
"epoch": 0.7812673399178781,
"grad_norm": 2.5,
"learning_rate": 9.135324108062391e-06,
"loss": 0.2262,
"mean_token_accuracy": 0.04036388936947333,
"step": 220
},
{
"epoch": 0.7848185550993231,
"grad_norm": 1.1875,
"learning_rate": 9.127431385618129e-06,
"loss": 0.2368,
"mean_token_accuracy": 0.04137472144429921,
"step": 221
},
{
"epoch": 0.7883697702807679,
"grad_norm": 1.59375,
"learning_rate": 9.119506246526386e-06,
"loss": 0.2386,
"mean_token_accuracy": 0.036027361149535864,
"step": 222
},
{
"epoch": 0.7919209854622129,
"grad_norm": 1.546875,
"learning_rate": 9.111548753030824e-06,
"loss": 0.2461,
"mean_token_accuracy": 0.034415613237797515,
"step": 223
},
{
"epoch": 0.7954722006436578,
"grad_norm": 1.03125,
"learning_rate": 9.103558967629211e-06,
"loss": 0.2404,
"mean_token_accuracy": 0.03685819863312645,
"step": 224
},
{
"epoch": 0.7990234158251026,
"grad_norm": 1.1875,
"learning_rate": 9.09553695307294e-06,
"loss": 0.248,
"mean_token_accuracy": 0.028797292045055656,
"step": 225
},
{
"epoch": 0.8025746310065476,
"grad_norm": 1.453125,
"learning_rate": 9.087482772366529e-06,
"loss": 0.2471,
"mean_token_accuracy": 0.03305003515924909,
"step": 226
},
{
"epoch": 0.8061258461879924,
"grad_norm": 1.2421875,
"learning_rate": 9.07939648876712e-06,
"loss": 0.2403,
"mean_token_accuracy": 0.03300378163476125,
"step": 227
},
{
"epoch": 0.8096770613694374,
"grad_norm": 1.34375,
"learning_rate": 9.071278165784001e-06,
"loss": 0.2469,
"mean_token_accuracy": 0.03578633150755195,
"step": 228
},
{
"epoch": 0.8132282765508823,
"grad_norm": 1.28125,
"learning_rate": 9.063127867178085e-06,
"loss": 0.2369,
"mean_token_accuracy": 0.036125565729889786,
"step": 229
},
{
"epoch": 0.8167794917323271,
"grad_norm": 1.3828125,
"learning_rate": 9.054945656961429e-06,
"loss": 0.2361,
"mean_token_accuracy": 0.03855715526515269,
"step": 230
},
{
"epoch": 0.8203307069137721,
"grad_norm": 1.1640625,
"learning_rate": 9.046731599396716e-06,
"loss": 0.2513,
"mean_token_accuracy": 0.034193019520898815,
"step": 231
},
{
"epoch": 0.8238819220952169,
"grad_norm": 1.9921875,
"learning_rate": 9.03848575899676e-06,
"loss": 0.2429,
"mean_token_accuracy": 0.04017591796582565,
"step": 232
},
{
"epoch": 0.8274331372766619,
"grad_norm": 1.2421875,
"learning_rate": 9.030208200523994e-06,
"loss": 0.2382,
"mean_token_accuracy": 0.03354914677765919,
"step": 233
},
{
"epoch": 0.8309843524581068,
"grad_norm": 1.234375,
"learning_rate": 9.021898988989966e-06,
"loss": 0.2425,
"mean_token_accuracy": 0.031292581817979226,
"step": 234
},
{
"epoch": 0.8345355676395516,
"grad_norm": 1.1484375,
"learning_rate": 9.013558189654819e-06,
"loss": 0.2354,
"mean_token_accuracy": 0.037720333613833645,
"step": 235
},
{
"epoch": 0.8380867828209966,
"grad_norm": 1.3515625,
"learning_rate": 9.005185868026793e-06,
"loss": 0.2349,
"mean_token_accuracy": 0.03620595469328691,
"step": 236
},
{
"epoch": 0.8416379980024414,
"grad_norm": 1.1171875,
"learning_rate": 8.996782089861699e-06,
"loss": 0.2573,
"mean_token_accuracy": 0.03309179725329159,
"step": 237
},
{
"epoch": 0.8451892131838864,
"grad_norm": 1.6640625,
"learning_rate": 8.988346921162407e-06,
"loss": 0.2476,
"mean_token_accuracy": 0.03174106139340438,
"step": 238
},
{
"epoch": 0.8487404283653313,
"grad_norm": 1.4453125,
"learning_rate": 8.979880428178323e-06,
"loss": 0.24,
"mean_token_accuracy": 0.03508038215659326,
"step": 239
},
{
"epoch": 0.8522916435467761,
"grad_norm": 1.4296875,
"learning_rate": 8.971382677404878e-06,
"loss": 0.2536,
"mean_token_accuracy": 0.03149323346224264,
"step": 240
},
{
"epoch": 0.8558428587282211,
"grad_norm": 1.4296875,
"learning_rate": 8.962853735582996e-06,
"loss": 0.2386,
"mean_token_accuracy": 0.037929220459773205,
"step": 241
},
{
"epoch": 0.8593940739096659,
"grad_norm": 0.8828125,
"learning_rate": 8.95429366969858e-06,
"loss": 0.2478,
"mean_token_accuracy": 0.03517991015723965,
"step": 242
},
{
"epoch": 0.8629452890911109,
"grad_norm": 1.1015625,
"learning_rate": 8.94570254698197e-06,
"loss": 0.2465,
"mean_token_accuracy": 0.040985416919284035,
"step": 243
},
{
"epoch": 0.8664965042725558,
"grad_norm": 1.7578125,
"learning_rate": 8.93708043490743e-06,
"loss": 0.249,
"mean_token_accuracy": 0.037392959638964385,
"step": 244
},
{
"epoch": 0.8700477194540006,
"grad_norm": 1.2421875,
"learning_rate": 8.928427401192618e-06,
"loss": 0.2442,
"mean_token_accuracy": 0.03466749745348352,
"step": 245
},
{
"epoch": 0.8735989346354456,
"grad_norm": 1.3828125,
"learning_rate": 8.919743513798044e-06,
"loss": 0.2417,
"mean_token_accuracy": 0.03530099252020591,
"step": 246
},
{
"epoch": 0.8771501498168904,
"grad_norm": 2.375,
"learning_rate": 8.911028840926537e-06,
"loss": 0.2465,
"mean_token_accuracy": 0.03762459074278013,
"step": 247
},
{
"epoch": 0.8807013649983354,
"grad_norm": 1.7265625,
"learning_rate": 8.902283451022725e-06,
"loss": 0.2322,
"mean_token_accuracy": 0.03646585380920442,
"step": 248
},
{
"epoch": 0.8842525801797803,
"grad_norm": 1.578125,
"learning_rate": 8.89350741277247e-06,
"loss": 0.2554,
"mean_token_accuracy": 0.03502001665037824,
"step": 249
},
{
"epoch": 0.8878037953612252,
"grad_norm": 1.265625,
"learning_rate": 8.884700795102365e-06,
"loss": 0.2463,
"mean_token_accuracy": 0.03513136104447767,
"step": 250
},
{
"epoch": 0.8913550105426701,
"grad_norm": 1.3515625,
"learning_rate": 8.875863667179155e-06,
"loss": 0.2436,
"mean_token_accuracy": 0.0331545490953431,
"step": 251
},
{
"epoch": 0.8949062257241149,
"grad_norm": 1.53125,
"learning_rate": 8.866996098409217e-06,
"loss": 0.2436,
"mean_token_accuracy": 0.03208865020133089,
"step": 252
},
{
"epoch": 0.8984574409055599,
"grad_norm": 1.5625,
"learning_rate": 8.858098158438013e-06,
"loss": 0.2451,
"mean_token_accuracy": 0.03713261121083633,
"step": 253
},
{
"epoch": 0.9020086560870048,
"grad_norm": 1.109375,
"learning_rate": 8.849169917149532e-06,
"loss": 0.2367,
"mean_token_accuracy": 0.03933425937066204,
"step": 254
},
{
"epoch": 0.9055598712684497,
"grad_norm": 1.1875,
"learning_rate": 8.840211444665754e-06,
"loss": 0.2475,
"mean_token_accuracy": 0.04025235302106012,
"step": 255
},
{
"epoch": 0.9091110864498946,
"grad_norm": 1.5546875,
"learning_rate": 8.831222811346088e-06,
"loss": 0.2392,
"mean_token_accuracy": 0.03604988591541769,
"step": 256
},
{
"epoch": 0.9126623016313394,
"grad_norm": 1.40625,
"learning_rate": 8.822204087786831e-06,
"loss": 0.2451,
"mean_token_accuracy": 0.03125222894232138,
"step": 257
},
{
"epoch": 0.9162135168127844,
"grad_norm": 1.53125,
"learning_rate": 8.813155344820602e-06,
"loss": 0.2326,
"mean_token_accuracy": 0.0366273350919073,
"step": 258
},
{
"epoch": 0.9197647319942293,
"grad_norm": 1.1875,
"learning_rate": 8.804076653515792e-06,
"loss": 0.2392,
"mean_token_accuracy": 0.03683656148496084,
"step": 259
},
{
"epoch": 0.9233159471756742,
"grad_norm": 1.0078125,
"learning_rate": 8.794968085176006e-06,
"loss": 0.2438,
"mean_token_accuracy": 0.03510568725687335,
"step": 260
},
{
"epoch": 0.9268671623571191,
"grad_norm": 1.3828125,
"learning_rate": 8.785829711339502e-06,
"loss": 0.238,
"mean_token_accuracy": 0.03469807377041434,
"step": 261
},
{
"epoch": 0.9304183775385639,
"grad_norm": 1.1640625,
"learning_rate": 8.776661603778629e-06,
"loss": 0.2419,
"mean_token_accuracy": 0.03890717753165518,
"step": 262
},
{
"epoch": 0.9339695927200089,
"grad_norm": 1.1875,
"learning_rate": 8.767463834499261e-06,
"loss": 0.2481,
"mean_token_accuracy": 0.032666244380379794,
"step": 263
},
{
"epoch": 0.9375208079014538,
"grad_norm": 1.296875,
"learning_rate": 8.758236475740236e-06,
"loss": 0.2329,
"mean_token_accuracy": 0.037105117426108336,
"step": 264
},
{
"epoch": 0.9410720230828987,
"grad_norm": 2.0625,
"learning_rate": 8.748979599972787e-06,
"loss": 0.2443,
"mean_token_accuracy": 0.03263842041269527,
"step": 265
},
{
"epoch": 0.9446232382643436,
"grad_norm": 1.4375,
"learning_rate": 8.739693279899969e-06,
"loss": 0.2349,
"mean_token_accuracy": 0.034980818134499714,
"step": 266
},
{
"epoch": 0.9481744534457884,
"grad_norm": 1.1328125,
"learning_rate": 8.730377588456092e-06,
"loss": 0.2357,
"mean_token_accuracy": 0.036848886411462445,
"step": 267
},
{
"epoch": 0.9517256686272334,
"grad_norm": 1.2734375,
"learning_rate": 8.72103259880615e-06,
"loss": 0.2417,
"mean_token_accuracy": 0.037047853220428806,
"step": 268
},
{
"epoch": 0.9552768838086783,
"grad_norm": 1.8203125,
"learning_rate": 8.711658384345244e-06,
"loss": 0.2464,
"mean_token_accuracy": 0.03403061416611308,
"step": 269
},
{
"epoch": 0.9588280989901232,
"grad_norm": 1.5078125,
"learning_rate": 8.702255018698e-06,
"loss": 0.2276,
"mean_token_accuracy": 0.035586197598604485,
"step": 270
},
{
"epoch": 0.9623793141715681,
"grad_norm": 1.1015625,
"learning_rate": 8.692822575718e-06,
"loss": 0.2336,
"mean_token_accuracy": 0.03349671917021624,
"step": 271
},
{
"epoch": 0.9659305293530129,
"grad_norm": 1.3046875,
"learning_rate": 8.683361129487198e-06,
"loss": 0.236,
"mean_token_accuracy": 0.0383108137830277,
"step": 272
},
{
"epoch": 0.9694817445344579,
"grad_norm": 1.1484375,
"learning_rate": 8.673870754315336e-06,
"loss": 0.2401,
"mean_token_accuracy": 0.04072669624656555,
"step": 273
},
{
"epoch": 0.9730329597159028,
"grad_norm": 1.3515625,
"learning_rate": 8.664351524739368e-06,
"loss": 0.2529,
"mean_token_accuracy": 0.036624281186959706,
"step": 274
},
{
"epoch": 0.9765841748973477,
"grad_norm": 1.0390625,
"learning_rate": 8.65480351552286e-06,
"loss": 0.2461,
"mean_token_accuracy": 0.03385563577285211,
"step": 275
},
{
"epoch": 0.9801353900787926,
"grad_norm": 2.078125,
"learning_rate": 8.645226801655418e-06,
"loss": 0.2356,
"mean_token_accuracy": 0.03583552375130239,
"step": 276
},
{
"epoch": 0.9836866052602374,
"grad_norm": 2.015625,
"learning_rate": 8.635621458352094e-06,
"loss": 0.2505,
"mean_token_accuracy": 0.03417470180647797,
"step": 277
},
{
"epoch": 0.9872378204416824,
"grad_norm": 1.171875,
"learning_rate": 8.625987561052789e-06,
"loss": 0.2445,
"mean_token_accuracy": 0.034865413577790605,
"step": 278
},
{
"epoch": 0.9907890356231273,
"grad_norm": 1.4609375,
"learning_rate": 8.616325185421673e-06,
"loss": 0.2543,
"mean_token_accuracy": 0.03070924551502685,
"step": 279
},
{
"epoch": 0.9943402508045722,
"grad_norm": 1.28125,
"learning_rate": 8.606634407346575e-06,
"loss": 0.2412,
"mean_token_accuracy": 0.03835893170617055,
"step": 280
},
{
"epoch": 0.9978914659860171,
"grad_norm": 1.421875,
"learning_rate": 8.596915302938403e-06,
"loss": 0.2416,
"mean_token_accuracy": 0.03124076440144563,
"step": 281
},
{
"epoch": 1.0,
"grad_norm": 0.78125,
"learning_rate": 8.587167948530533e-06,
"loss": 0.1443,
"mean_token_accuracy": 0.03148232989540128,
"step": 282
},
{
"epoch": 1.003551215181445,
"grad_norm": 1.390625,
"learning_rate": 8.577392420678217e-06,
"loss": 0.2333,
"mean_token_accuracy": 0.039055653414834524,
"step": 283
},
{
"epoch": 1.0071024303628897,
"grad_norm": 1.1875,
"learning_rate": 8.567588796157983e-06,
"loss": 0.248,
"mean_token_accuracy": 0.03278845629756688,
"step": 284
},
{
"epoch": 1.0106536455443347,
"grad_norm": 1.78125,
"learning_rate": 8.557757151967025e-06,
"loss": 0.2505,
"mean_token_accuracy": 0.03427708127492224,
"step": 285
},
{
"epoch": 1.0142048607257796,
"grad_norm": 0.96484375,
"learning_rate": 8.547897565322601e-06,
"loss": 0.2414,
"mean_token_accuracy": 0.03930300644788076,
"step": 286
},
{
"epoch": 1.0177560759072246,
"grad_norm": 1.1484375,
"learning_rate": 8.538010113661434e-06,
"loss": 0.2325,
"mean_token_accuracy": 0.03667710912122857,
"step": 287
},
{
"epoch": 1.0213072910886694,
"grad_norm": 1.546875,
"learning_rate": 8.528094874639092e-06,
"loss": 0.2494,
"mean_token_accuracy": 0.03501396441060933,
"step": 288
},
{
"epoch": 1.0248585062701143,
"grad_norm": 1.203125,
"learning_rate": 8.518151926129384e-06,
"loss": 0.2265,
"mean_token_accuracy": 0.04366327696516237,
"step": 289
},
{
"epoch": 1.0284097214515593,
"grad_norm": 1.046875,
"learning_rate": 8.508181346223749e-06,
"loss": 0.247,
"mean_token_accuracy": 0.03554195544529648,
"step": 290
},
{
"epoch": 1.031960936633004,
"grad_norm": 1.1796875,
"learning_rate": 8.498183213230646e-06,
"loss": 0.2376,
"mean_token_accuracy": 0.03931035470304778,
"step": 291
},
{
"epoch": 1.035512151814449,
"grad_norm": 1.171875,
"learning_rate": 8.488157605674924e-06,
"loss": 0.2407,
"mean_token_accuracy": 0.03499374365674157,
"step": 292
},
{
"epoch": 1.039063366995894,
"grad_norm": 1.1953125,
"learning_rate": 8.478104602297226e-06,
"loss": 0.2348,
"mean_token_accuracy": 0.03476892200342263,
"step": 293
},
{
"epoch": 1.0426145821773387,
"grad_norm": 1.1484375,
"learning_rate": 8.468024282053357e-06,
"loss": 0.2398,
"mean_token_accuracy": 0.035643573090055725,
"step": 294
},
{
"epoch": 1.0461657973587837,
"grad_norm": 1.4765625,
"learning_rate": 8.457916724113667e-06,
"loss": 0.2437,
"mean_token_accuracy": 0.03672185743562295,
"step": 295
},
{
"epoch": 1.0497170125402286,
"grad_norm": 1.2265625,
"learning_rate": 8.447782007862427e-06,
"loss": 0.2341,
"mean_token_accuracy": 0.039112065329391044,
"step": 296
},
{
"epoch": 1.0532682277216736,
"grad_norm": 1.28125,
"learning_rate": 8.437620212897213e-06,
"loss": 0.2379,
"mean_token_accuracy": 0.03804202094033826,
"step": 297
},
{
"epoch": 1.0568194429031184,
"grad_norm": 1.3671875,
"learning_rate": 8.427431419028273e-06,
"loss": 0.2438,
"mean_token_accuracy": 0.03412439540261403,
"step": 298
},
{
"epoch": 1.0603706580845633,
"grad_norm": 1.1875,
"learning_rate": 8.417215706277905e-06,
"loss": 0.2335,
"mean_token_accuracy": 0.03501426196999091,
"step": 299
},
{
"epoch": 1.0639218732660083,
"grad_norm": 1.578125,
"learning_rate": 8.406973154879826e-06,
"loss": 0.2435,
"mean_token_accuracy": 0.03277427892317064,
"step": 300
},
{
"epoch": 1.067473088447453,
"grad_norm": 1.1328125,
"learning_rate": 8.396703845278537e-06,
"loss": 0.2401,
"mean_token_accuracy": 0.035917978868383216,
"step": 301
},
{
"epoch": 1.071024303628898,
"grad_norm": 1.203125,
"learning_rate": 8.386407858128707e-06,
"loss": 0.2385,
"mean_token_accuracy": 0.03660639774170704,
"step": 302
},
{
"epoch": 1.074575518810343,
"grad_norm": 1.015625,
"learning_rate": 8.376085274294518e-06,
"loss": 0.2433,
"mean_token_accuracy": 0.033684822363284184,
"step": 303
},
{
"epoch": 1.0781267339917877,
"grad_norm": 1.4765625,
"learning_rate": 8.365736174849053e-06,
"loss": 0.2377,
"mean_token_accuracy": 0.034216828673379496,
"step": 304
},
{
"epoch": 1.0816779491732327,
"grad_norm": 1.2578125,
"learning_rate": 8.355360641073637e-06,
"loss": 0.2432,
"mean_token_accuracy": 0.03404876044623961,
"step": 305
},
{
"epoch": 1.0852291643546776,
"grad_norm": 1.6171875,
"learning_rate": 8.344958754457214e-06,
"loss": 0.2401,
"mean_token_accuracy": 0.03517156572706881,
"step": 306
},
{
"epoch": 1.0887803795361226,
"grad_norm": 1.375,
"learning_rate": 8.3345305966957e-06,
"loss": 0.2451,
"mean_token_accuracy": 0.033717118327331264,
"step": 307
},
{
"epoch": 1.0923315947175674,
"grad_norm": 1.5078125,
"learning_rate": 8.324076249691347e-06,
"loss": 0.2415,
"mean_token_accuracy": 0.03640562181681162,
"step": 308
},
{
"epoch": 1.0958828098990123,
"grad_norm": 3.5,
"learning_rate": 8.31359579555209e-06,
"loss": 0.232,
"mean_token_accuracy": 0.039631859472137876,
"step": 309
},
{
"epoch": 1.0994340250804573,
"grad_norm": 1.5625,
"learning_rate": 8.30308931659091e-06,
"loss": 0.2337,
"mean_token_accuracy": 0.03287116249339306,
"step": 310
},
{
"epoch": 1.102985240261902,
"grad_norm": 1.953125,
"learning_rate": 8.292556895325195e-06,
"loss": 0.246,
"mean_token_accuracy": 0.037367168624768965,
"step": 311
},
{
"epoch": 1.106536455443347,
"grad_norm": 1.2265625,
"learning_rate": 8.281998614476066e-06,
"loss": 0.2463,
"mean_token_accuracy": 0.03246536147344159,
"step": 312
},
{
"epoch": 1.110087670624792,
"grad_norm": 1.453125,
"learning_rate": 8.271414556967758e-06,
"loss": 0.2441,
"mean_token_accuracy": 0.033534199588757474,
"step": 313
},
{
"epoch": 1.1136388858062367,
"grad_norm": 1.3125,
"learning_rate": 8.260804805926948e-06,
"loss": 0.2406,
"mean_token_accuracy": 0.034469886544684414,
"step": 314
},
{
"epoch": 1.1171901009876817,
"grad_norm": 1.125,
"learning_rate": 8.250169444682109e-06,
"loss": 0.2364,
"mean_token_accuracy": 0.03434639961778885,
"step": 315
},
{
"epoch": 1.1207413161691266,
"grad_norm": 3.078125,
"learning_rate": 8.239508556762857e-06,
"loss": 0.244,
"mean_token_accuracy": 0.035720634419703856,
"step": 316
},
{
"epoch": 1.1242925313505716,
"grad_norm": 1.2265625,
"learning_rate": 8.228822225899294e-06,
"loss": 0.2398,
"mean_token_accuracy": 0.03444605955883162,
"step": 317
},
{
"epoch": 1.1278437465320164,
"grad_norm": 1.15625,
"learning_rate": 8.218110536021347e-06,
"loss": 0.2409,
"mean_token_accuracy": 0.04050879927308415,
"step": 318
},
{
"epoch": 1.1313949617134613,
"grad_norm": 1.5859375,
"learning_rate": 8.207373571258113e-06,
"loss": 0.2353,
"mean_token_accuracy": 0.03475078833071166,
"step": 319
},
{
"epoch": 1.1349461768949063,
"grad_norm": 6.1875,
"learning_rate": 8.196611415937196e-06,
"loss": 0.2444,
"mean_token_accuracy": 0.032825655180204194,
"step": 320
},
{
"epoch": 1.138497392076351,
"grad_norm": 1.1640625,
"learning_rate": 8.18582415458405e-06,
"loss": 0.2377,
"mean_token_accuracy": 0.036314319742814405,
"step": 321
},
{
"epoch": 1.142048607257796,
"grad_norm": 1.5703125,
"learning_rate": 8.1750118719213e-06,
"loss": 0.2484,
"mean_token_accuracy": 0.03831162358619622,
"step": 322
},
{
"epoch": 1.145599822439241,
"grad_norm": 1.109375,
"learning_rate": 8.164174652868097e-06,
"loss": 0.2499,
"mean_token_accuracy": 0.033986524165811716,
"step": 323
},
{
"epoch": 1.149151037620686,
"grad_norm": 1.109375,
"learning_rate": 8.153312582539438e-06,
"loss": 0.246,
"mean_token_accuracy": 0.03575117406217032,
"step": 324
},
{
"epoch": 1.1527022528021307,
"grad_norm": 1.6015625,
"learning_rate": 8.142425746245503e-06,
"loss": 0.2345,
"mean_token_accuracy": 0.037550230876149726,
"step": 325
},
{
"epoch": 1.1562534679835756,
"grad_norm": 1.015625,
"learning_rate": 8.131514229490975e-06,
"loss": 0.2277,
"mean_token_accuracy": 0.03635891172598349,
"step": 326
},
{
"epoch": 1.1598046831650206,
"grad_norm": 1.375,
"learning_rate": 8.120578117974388e-06,
"loss": 0.2428,
"mean_token_accuracy": 0.0431224472959002,
"step": 327
},
{
"epoch": 1.1633558983464654,
"grad_norm": 1.4296875,
"learning_rate": 8.109617497587429e-06,
"loss": 0.2272,
"mean_token_accuracy": 0.036638571738876635,
"step": 328
},
{
"epoch": 1.1669071135279103,
"grad_norm": 1.78125,
"learning_rate": 8.098632454414286e-06,
"loss": 0.2368,
"mean_token_accuracy": 0.03733357025339501,
"step": 329
},
{
"epoch": 1.1704583287093553,
"grad_norm": 1.359375,
"learning_rate": 8.08762307473096e-06,
"loss": 0.2413,
"mean_token_accuracy": 0.03179363884191844,
"step": 330
},
{
"epoch": 1.1740095438908003,
"grad_norm": 1.6875,
"learning_rate": 8.07658944500459e-06,
"loss": 0.2333,
"mean_token_accuracy": 0.035859118954249425,
"step": 331
},
{
"epoch": 1.177560759072245,
"grad_norm": 1.2734375,
"learning_rate": 8.065531651892771e-06,
"loss": 0.2468,
"mean_token_accuracy": 0.036549534866935574,
"step": 332
},
{
"epoch": 1.18111197425369,
"grad_norm": 1.5625,
"learning_rate": 8.054449782242876e-06,
"loss": 0.2491,
"mean_token_accuracy": 0.029785508802888216,
"step": 333
},
{
"epoch": 1.1846631894351347,
"grad_norm": 1.484375,
"learning_rate": 8.043343923091382e-06,
"loss": 0.2329,
"mean_token_accuracy": 0.037331359108065953,
"step": 334
},
{
"epoch": 1.1882144046165797,
"grad_norm": 1.8671875,
"learning_rate": 8.03221416166317e-06,
"loss": 0.2433,
"mean_token_accuracy": 0.03565247428196017,
"step": 335
},
{
"epoch": 1.1917656197980246,
"grad_norm": 1.96875,
"learning_rate": 8.021060585370845e-06,
"loss": 0.2376,
"mean_token_accuracy": 0.0353173619696463,
"step": 336
},
{
"epoch": 1.1953168349794696,
"grad_norm": 1.578125,
"learning_rate": 8.009883281814066e-06,
"loss": 0.2388,
"mean_token_accuracy": 0.03581908489650232,
"step": 337
},
{
"epoch": 1.1988680501609144,
"grad_norm": 1.25,
"learning_rate": 7.998682338778834e-06,
"loss": 0.2296,
"mean_token_accuracy": 0.041877999477947014,
"step": 338
},
{
"epoch": 1.2024192653423593,
"grad_norm": 1.2265625,
"learning_rate": 7.987457844236817e-06,
"loss": 0.2356,
"mean_token_accuracy": 0.0347092972297105,
"step": 339
},
{
"epoch": 1.2059704805238043,
"grad_norm": 1.2734375,
"learning_rate": 7.976209886344654e-06,
"loss": 0.2387,
"mean_token_accuracy": 0.03283250893218792,
"step": 340
},
{
"epoch": 1.209521695705249,
"grad_norm": 1.40625,
"learning_rate": 7.964938553443267e-06,
"loss": 0.2292,
"mean_token_accuracy": 0.03533021227485733,
"step": 341
},
{
"epoch": 1.213072910886694,
"grad_norm": 1.1640625,
"learning_rate": 7.953643934057162e-06,
"loss": 0.2424,
"mean_token_accuracy": 0.03509292240414652,
"step": 342
},
{
"epoch": 1.216624126068139,
"grad_norm": 1.015625,
"learning_rate": 7.942326116893733e-06,
"loss": 0.2383,
"mean_token_accuracy": 0.03585474247302045,
"step": 343
},
{
"epoch": 1.220175341249584,
"grad_norm": 1.234375,
"learning_rate": 7.930985190842576e-06,
"loss": 0.2338,
"mean_token_accuracy": 0.04019446158054052,
"step": 344
},
{
"epoch": 1.2237265564310287,
"grad_norm": 1.390625,
"learning_rate": 7.919621244974773e-06,
"loss": 0.2458,
"mean_token_accuracy": 0.03536258387248381,
"step": 345
},
{
"epoch": 1.2272777716124736,
"grad_norm": 1.1171875,
"learning_rate": 7.908234368542214e-06,
"loss": 0.2292,
"mean_token_accuracy": 0.03392878967861179,
"step": 346
},
{
"epoch": 1.2308289867939186,
"grad_norm": 1.671875,
"learning_rate": 7.896824650976873e-06,
"loss": 0.2201,
"mean_token_accuracy": 0.037319375238439534,
"step": 347
},
{
"epoch": 1.2343802019753634,
"grad_norm": 2.40625,
"learning_rate": 7.885392181890126e-06,
"loss": 0.2346,
"mean_token_accuracy": 0.04020543451770209,
"step": 348
},
{
"epoch": 1.2379314171568083,
"grad_norm": 1.4921875,
"learning_rate": 7.873937051072037e-06,
"loss": 0.2415,
"mean_token_accuracy": 0.03590130691372906,
"step": 349
},
{
"epoch": 1.2414826323382533,
"grad_norm": 1.3828125,
"learning_rate": 7.862459348490645e-06,
"loss": 0.2419,
"mean_token_accuracy": 0.038676922078593634,
"step": 350
},
{
"epoch": 1.2450338475196983,
"grad_norm": 1.875,
"learning_rate": 7.85095916429128e-06,
"loss": 0.2302,
"mean_token_accuracy": 0.03200000841388828,
"step": 351
},
{
"epoch": 1.248585062701143,
"grad_norm": 1.171875,
"learning_rate": 7.839436588795834e-06,
"loss": 0.2371,
"mean_token_accuracy": 0.03455797864080523,
"step": 352
},
{
"epoch": 1.252136277882588,
"grad_norm": 1.375,
"learning_rate": 7.82789171250206e-06,
"loss": 0.2276,
"mean_token_accuracy": 0.03702645877638133,
"step": 353
},
{
"epoch": 1.2556874930640327,
"grad_norm": 1.453125,
"learning_rate": 7.816324626082864e-06,
"loss": 0.245,
"mean_token_accuracy": 0.035418079471128294,
"step": 354
},
{
"epoch": 1.2592387082454777,
"grad_norm": 1.2109375,
"learning_rate": 7.804735420385578e-06,
"loss": 0.2496,
"mean_token_accuracy": 0.04052014215631061,
"step": 355
},
{
"epoch": 1.2627899234269226,
"grad_norm": 1.328125,
"learning_rate": 7.793124186431271e-06,
"loss": 0.2253,
"mean_token_accuracy": 0.03850708331447095,
"step": 356
},
{
"epoch": 1.2663411386083676,
"grad_norm": 1.125,
"learning_rate": 7.781491015414018e-06,
"loss": 0.2423,
"mean_token_accuracy": 0.03744576991448412,
"step": 357
},
{
"epoch": 1.2698923537898126,
"grad_norm": 1.078125,
"learning_rate": 7.769835998700182e-06,
"loss": 0.2287,
"mean_token_accuracy": 0.04161575323450961,
"step": 358
},
{
"epoch": 1.2734435689712573,
"grad_norm": 1.3046875,
"learning_rate": 7.758159227827701e-06,
"loss": 0.2354,
"mean_token_accuracy": 0.03756350834009936,
"step": 359
},
{
"epoch": 1.2769947841527023,
"grad_norm": 1.171875,
"learning_rate": 7.746460794505375e-06,
"loss": 0.2447,
"mean_token_accuracy": 0.032854993114597164,
"step": 360
},
{
"epoch": 1.280545999334147,
"grad_norm": 1.15625,
"learning_rate": 7.734740790612137e-06,
"loss": 0.2426,
"mean_token_accuracy": 0.037274191450705985,
"step": 361
},
{
"epoch": 1.284097214515592,
"grad_norm": 1.2734375,
"learning_rate": 7.722999308196329e-06,
"loss": 0.2315,
"mean_token_accuracy": 0.03713756873185048,
"step": 362
},
{
"epoch": 1.287648429697037,
"grad_norm": 1.296875,
"learning_rate": 7.711236439474991e-06,
"loss": 0.2365,
"mean_token_accuracy": 0.036791833699680865,
"step": 363
},
{
"epoch": 1.291199644878482,
"grad_norm": 1.2421875,
"learning_rate": 7.69945227683313e-06,
"loss": 0.2352,
"mean_token_accuracy": 0.0381344523448206,
"step": 364
},
{
"epoch": 1.2947508600599267,
"grad_norm": 2.546875,
"learning_rate": 7.68764691282299e-06,
"loss": 0.2415,
"mean_token_accuracy": 0.04074793899053475,
"step": 365
},
{
"epoch": 1.2983020752413716,
"grad_norm": 1.296875,
"learning_rate": 7.675820440163334e-06,
"loss": 0.2369,
"mean_token_accuracy": 0.03905295207732706,
"step": 366
},
{
"epoch": 1.3018532904228166,
"grad_norm": 1.078125,
"learning_rate": 7.663972951738708e-06,
"loss": 0.2301,
"mean_token_accuracy": 0.035785467212917865,
"step": 367
},
{
"epoch": 1.3054045056042614,
"grad_norm": 1.0546875,
"learning_rate": 7.652104540598712e-06,
"loss": 0.232,
"mean_token_accuracy": 0.038330111019604374,
"step": 368
},
{
"epoch": 1.3089557207857063,
"grad_norm": 1.0234375,
"learning_rate": 7.640215299957283e-06,
"loss": 0.2441,
"mean_token_accuracy": 0.029918354761321098,
"step": 369
},
{
"epoch": 1.3125069359671513,
"grad_norm": 1.390625,
"learning_rate": 7.628305323191942e-06,
"loss": 0.2349,
"mean_token_accuracy": 0.036909411177475704,
"step": 370
},
{
"epoch": 1.3160581511485963,
"grad_norm": 1.5390625,
"learning_rate": 7.616374703843071e-06,
"loss": 0.2493,
"mean_token_accuracy": 0.033028718306013616,
"step": 371
},
{
"epoch": 1.319609366330041,
"grad_norm": 1.1484375,
"learning_rate": 7.604423535613183e-06,
"loss": 0.2347,
"mean_token_accuracy": 0.034142533426347654,
"step": 372
},
{
"epoch": 1.323160581511486,
"grad_norm": 1.125,
"learning_rate": 7.592451912366176e-06,
"loss": 0.2303,
"mean_token_accuracy": 0.03645164545378066,
"step": 373
},
{
"epoch": 1.326711796692931,
"grad_norm": 1.359375,
"learning_rate": 7.580459928126607e-06,
"loss": 0.2433,
"mean_token_accuracy": 0.031131474817811977,
"step": 374
},
{
"epoch": 1.3302630118743757,
"grad_norm": 1.171875,
"learning_rate": 7.568447677078937e-06,
"loss": 0.2337,
"mean_token_accuracy": 0.033356625943270046,
"step": 375
},
{
"epoch": 1.3338142270558206,
"grad_norm": 1.3359375,
"learning_rate": 7.556415253566814e-06,
"loss": 0.2391,
"mean_token_accuracy": 0.03734725382673787,
"step": 376
},
{
"epoch": 1.3373654422372656,
"grad_norm": 1.4921875,
"learning_rate": 7.544362752092309e-06,
"loss": 0.2344,
"mean_token_accuracy": 0.04189994388252671,
"step": 377
},
{
"epoch": 1.3409166574187106,
"grad_norm": 1.1640625,
"learning_rate": 7.532290267315189e-06,
"loss": 0.2373,
"mean_token_accuracy": 0.034419633655488724,
"step": 378
},
{
"epoch": 1.3444678726001553,
"grad_norm": 1.28125,
"learning_rate": 7.52019789405217e-06,
"loss": 0.2396,
"mean_token_accuracy": 0.03570301646141161,
"step": 379
},
{
"epoch": 1.3480190877816003,
"grad_norm": 1.609375,
"learning_rate": 7.508085727276169e-06,
"loss": 0.2516,
"mean_token_accuracy": 0.03470832618040731,
"step": 380
},
{
"epoch": 1.351570302963045,
"grad_norm": 1.609375,
"learning_rate": 7.495953862115561e-06,
"loss": 0.2352,
"mean_token_accuracy": 0.030847028268908616,
"step": 381
},
{
"epoch": 1.35512151814449,
"grad_norm": 1.234375,
"learning_rate": 7.483802393853431e-06,
"loss": 0.2212,
"mean_token_accuracy": 0.03855792362446664,
"step": 382
},
{
"epoch": 1.358672733325935,
"grad_norm": 1.4375,
"learning_rate": 7.471631417926826e-06,
"loss": 0.2462,
"mean_token_accuracy": 0.03374627606171998,
"step": 383
},
{
"epoch": 1.36222394850738,
"grad_norm": 1.2578125,
"learning_rate": 7.459441029926006e-06,
"loss": 0.2379,
"mean_token_accuracy": 0.039975615829462186,
"step": 384
},
{
"epoch": 1.365775163688825,
"grad_norm": 1.1953125,
"learning_rate": 7.447231325593689e-06,
"loss": 0.2409,
"mean_token_accuracy": 0.03599957966071088,
"step": 385
},
{
"epoch": 1.3693263788702696,
"grad_norm": 1.0546875,
"learning_rate": 7.435002400824309e-06,
"loss": 0.2424,
"mean_token_accuracy": 0.04002546479568991,
"step": 386
},
{
"epoch": 1.3728775940517146,
"grad_norm": 1.40625,
"learning_rate": 7.422754351663252e-06,
"loss": 0.2276,
"mean_token_accuracy": 0.03706626188068185,
"step": 387
},
{
"epoch": 1.3764288092331594,
"grad_norm": 1.2890625,
"learning_rate": 7.410487274306104e-06,
"loss": 0.2335,
"mean_token_accuracy": 0.03697552310222818,
"step": 388
},
{
"epoch": 1.3799800244146043,
"grad_norm": 1.3125,
"learning_rate": 7.398201265097902e-06,
"loss": 0.2342,
"mean_token_accuracy": 0.038530107736733044,
"step": 389
},
{
"epoch": 1.3835312395960493,
"grad_norm": 1.15625,
"learning_rate": 7.385896420532372e-06,
"loss": 0.2302,
"mean_token_accuracy": 0.03841527171971393,
"step": 390
},
{
"epoch": 1.3870824547774943,
"grad_norm": 1.2421875,
"learning_rate": 7.37357283725117e-06,
"loss": 0.2331,
"mean_token_accuracy": 0.0332794542555348,
"step": 391
},
{
"epoch": 1.390633669958939,
"grad_norm": 1.0703125,
"learning_rate": 7.361230612043125e-06,
"loss": 0.2351,
"mean_token_accuracy": 0.03882583613449242,
"step": 392
},
{
"epoch": 1.394184885140384,
"grad_norm": 1.625,
"learning_rate": 7.3488698418434824e-06,
"loss": 0.236,
"mean_token_accuracy": 0.034570411104141385,
"step": 393
},
{
"epoch": 1.397736100321829,
"grad_norm": 1.4375,
"learning_rate": 7.3364906237331345e-06,
"loss": 0.2342,
"mean_token_accuracy": 0.03536622403044021,
"step": 394
},
{
"epoch": 1.4012873155032737,
"grad_norm": 2.421875,
"learning_rate": 7.324093054937864e-06,
"loss": 0.2403,
"mean_token_accuracy": 0.03640507650561631,
"step": 395
},
{
"epoch": 1.4048385306847186,
"grad_norm": 1.375,
"learning_rate": 7.311677232827583e-06,
"loss": 0.2459,
"mean_token_accuracy": 0.034755626278638374,
"step": 396
},
{
"epoch": 1.4083897458661636,
"grad_norm": 2.28125,
"learning_rate": 7.299243254915558e-06,
"loss": 0.2456,
"mean_token_accuracy": 0.03809268196710036,
"step": 397
},
{
"epoch": 1.4119409610476086,
"grad_norm": 1.1328125,
"learning_rate": 7.286791218857654e-06,
"loss": 0.2353,
"mean_token_accuracy": 0.03845041626118473,
"step": 398
},
{
"epoch": 1.4154921762290533,
"grad_norm": 2.625,
"learning_rate": 7.274321222451561e-06,
"loss": 0.2312,
"mean_token_accuracy": 0.0348738961838535,
"step": 399
},
{
"epoch": 1.4190433914104983,
"grad_norm": 2.078125,
"learning_rate": 7.261833363636036e-06,
"loss": 0.2518,
"mean_token_accuracy": 0.03649535452859709,
"step": 400
},
{
"epoch": 1.4225946065919433,
"grad_norm": 1.21875,
"learning_rate": 7.249327740490114e-06,
"loss": 0.2357,
"mean_token_accuracy": 0.03557684525367222,
"step": 401
},
{
"epoch": 1.426145821773388,
"grad_norm": 1.09375,
"learning_rate": 7.236804451232364e-06,
"loss": 0.2359,
"mean_token_accuracy": 0.037769879712868715,
"step": 402
},
{
"epoch": 1.429697036954833,
"grad_norm": 1.4453125,
"learning_rate": 7.224263594220093e-06,
"loss": 0.2443,
"mean_token_accuracy": 0.035131411799739,
"step": 403
},
{
"epoch": 1.433248252136278,
"grad_norm": 1.15625,
"learning_rate": 7.211705267948592e-06,
"loss": 0.2288,
"mean_token_accuracy": 0.038788905261753825,
"step": 404
},
{
"epoch": 1.436799467317723,
"grad_norm": 1.015625,
"learning_rate": 7.199129571050345e-06,
"loss": 0.2534,
"mean_token_accuracy": 0.03436974439318874,
"step": 405
},
{
"epoch": 1.4403506824991676,
"grad_norm": 0.98828125,
"learning_rate": 7.186536602294278e-06,
"loss": 0.22,
"mean_token_accuracy": 0.03988837570796022,
"step": 406
},
{
"epoch": 1.4439018976806126,
"grad_norm": 1.3984375,
"learning_rate": 7.173926460584956e-06,
"loss": 0.24,
"mean_token_accuracy": 0.0312625703154481,
"step": 407
},
{
"epoch": 1.4474531128620574,
"grad_norm": 1.53125,
"learning_rate": 7.161299244961828e-06,
"loss": 0.2339,
"mean_token_accuracy": 0.04129007174924482,
"step": 408
},
{
"epoch": 1.4510043280435023,
"grad_norm": 1.3203125,
"learning_rate": 7.148655054598436e-06,
"loss": 0.2311,
"mean_token_accuracy": 0.036289898944232846,
"step": 409
},
{
"epoch": 1.4545555432249473,
"grad_norm": 1.421875,
"learning_rate": 7.135993988801644e-06,
"loss": 0.2335,
"mean_token_accuracy": 0.034655624454899225,
"step": 410
},
{
"epoch": 1.4581067584063923,
"grad_norm": 1.125,
"learning_rate": 7.1233161470108525e-06,
"loss": 0.2359,
"mean_token_accuracy": 0.037607920974551234,
"step": 411
},
{
"epoch": 1.4616579735878372,
"grad_norm": 1.3125,
"learning_rate": 7.110621628797222e-06,
"loss": 0.2433,
"mean_token_accuracy": 0.03281604757466994,
"step": 412
},
{
"epoch": 1.465209188769282,
"grad_norm": 1.1796875,
"learning_rate": 7.097910533862886e-06,
"loss": 0.2352,
"mean_token_accuracy": 0.034316202265472384,
"step": 413
},
{
"epoch": 1.468760403950727,
"grad_norm": 1.1875,
"learning_rate": 7.085182962040173e-06,
"loss": 0.249,
"mean_token_accuracy": 0.032907980152231175,
"step": 414
},
{
"epoch": 1.4723116191321717,
"grad_norm": 1.0703125,
"learning_rate": 7.072439013290824e-06,
"loss": 0.238,
"mean_token_accuracy": 0.03209046665506321,
"step": 415
},
{
"epoch": 1.4758628343136166,
"grad_norm": 1.359375,
"learning_rate": 7.059678787705191e-06,
"loss": 0.2456,
"mean_token_accuracy": 0.03140619180339854,
"step": 416
},
{
"epoch": 1.4794140494950616,
"grad_norm": 1.125,
"learning_rate": 7.046902385501477e-06,
"loss": 0.2361,
"mean_token_accuracy": 0.03729598738209461,
"step": 417
},
{
"epoch": 1.4829652646765066,
"grad_norm": 1.3125,
"learning_rate": 7.03410990702493e-06,
"loss": 0.2285,
"mean_token_accuracy": 0.03452415266292519,
"step": 418
},
{
"epoch": 1.4865164798579513,
"grad_norm": 1.1171875,
"learning_rate": 7.02130145274706e-06,
"loss": 0.2397,
"mean_token_accuracy": 0.03802148198155919,
"step": 419
},
{
"epoch": 1.4900676950393963,
"grad_norm": 1.125,
"learning_rate": 7.008477123264849e-06,
"loss": 0.2368,
"mean_token_accuracy": 0.038115493043733295,
"step": 420
},
{
"epoch": 1.4936189102208413,
"grad_norm": 1.3203125,
"learning_rate": 6.995637019299963e-06,
"loss": 0.2386,
"mean_token_accuracy": 0.03685344383484335,
"step": 421
},
{
"epoch": 1.497170125402286,
"grad_norm": 1.484375,
"learning_rate": 6.982781241697963e-06,
"loss": 0.2389,
"mean_token_accuracy": 0.03893219211749965,
"step": 422
},
{
"epoch": 1.500721340583731,
"grad_norm": 1.265625,
"learning_rate": 6.969909891427509e-06,
"loss": 0.2317,
"mean_token_accuracy": 0.0351432016796025,
"step": 423
},
{
"epoch": 1.504272555765176,
"grad_norm": 1.7734375,
"learning_rate": 6.957023069579561e-06,
"loss": 0.2241,
"mean_token_accuracy": 0.03948885342106223,
"step": 424
},
{
"epoch": 1.507823770946621,
"grad_norm": 1.328125,
"learning_rate": 6.944120877366605e-06,
"loss": 0.2438,
"mean_token_accuracy": 0.03480056252737995,
"step": 425
},
{
"epoch": 1.5113749861280656,
"grad_norm": 1.6328125,
"learning_rate": 6.931203416121831e-06,
"loss": 0.2336,
"mean_token_accuracy": 0.031314958760049194,
"step": 426
},
{
"epoch": 1.5149262013095106,
"grad_norm": 1.2265625,
"learning_rate": 6.918270787298361e-06,
"loss": 0.2466,
"mean_token_accuracy": 0.03130312504072208,
"step": 427
},
{
"epoch": 1.5184774164909554,
"grad_norm": 1.5703125,
"learning_rate": 6.90532309246844e-06,
"loss": 0.249,
"mean_token_accuracy": 0.03627348578811507,
"step": 428
},
{
"epoch": 1.5220286316724003,
"grad_norm": 1.1015625,
"learning_rate": 6.89236043332264e-06,
"loss": 0.2229,
"mean_token_accuracy": 0.04060299464981654,
"step": 429
},
{
"epoch": 1.5255798468538453,
"grad_norm": 1.1328125,
"learning_rate": 6.87938291166906e-06,
"loss": 0.2348,
"mean_token_accuracy": 0.03792013142447104,
"step": 430
},
{
"epoch": 1.5291310620352903,
"grad_norm": 1.15625,
"learning_rate": 6.866390629432533e-06,
"loss": 0.228,
"mean_token_accuracy": 0.04107047704019351,
"step": 431
},
{
"epoch": 1.5326822772167352,
"grad_norm": 1.2109375,
"learning_rate": 6.8533836886538175e-06,
"loss": 0.2524,
"mean_token_accuracy": 0.03478304122472764,
"step": 432
},
{
"epoch": 1.53623349239818,
"grad_norm": 1.2265625,
"learning_rate": 6.840362191488801e-06,
"loss": 0.2379,
"mean_token_accuracy": 0.036657528744399315,
"step": 433
},
{
"epoch": 1.539784707579625,
"grad_norm": 1.1171875,
"learning_rate": 6.8273262402076935e-06,
"loss": 0.2321,
"mean_token_accuracy": 0.03788031428484828,
"step": 434
},
{
"epoch": 1.5433359227610697,
"grad_norm": 1.1953125,
"learning_rate": 6.814275937194233e-06,
"loss": 0.2263,
"mean_token_accuracy": 0.0349160894365923,
"step": 435
},
{
"epoch": 1.5468871379425146,
"grad_norm": 1.3515625,
"learning_rate": 6.801211384944867e-06,
"loss": 0.2405,
"mean_token_accuracy": 0.03533229600725463,
"step": 436
},
{
"epoch": 1.5504383531239596,
"grad_norm": 1.1171875,
"learning_rate": 6.788132686067963e-06,
"loss": 0.2356,
"mean_token_accuracy": 0.04051366758358199,
"step": 437
},
{
"epoch": 1.5539895683054046,
"grad_norm": 1.34375,
"learning_rate": 6.77503994328299e-06,
"loss": 0.235,
"mean_token_accuracy": 0.03521855714279809,
"step": 438
},
{
"epoch": 1.5575407834868495,
"grad_norm": 1.390625,
"learning_rate": 6.761933259419725e-06,
"loss": 0.2383,
"mean_token_accuracy": 0.03321351679187501,
"step": 439
},
{
"epoch": 1.5610919986682943,
"grad_norm": 3.125,
"learning_rate": 6.748812737417428e-06,
"loss": 0.2343,
"mean_token_accuracy": 0.03764536406015395,
"step": 440
},
{
"epoch": 1.5646432138497393,
"grad_norm": 1.109375,
"learning_rate": 6.7356784803240464e-06,
"loss": 0.2445,
"mean_token_accuracy": 0.0358848099149327,
"step": 441
},
{
"epoch": 1.568194429031184,
"grad_norm": 1.234375,
"learning_rate": 6.722530591295406e-06,
"loss": 0.2342,
"mean_token_accuracy": 0.035192674804420676,
"step": 442
},
{
"epoch": 1.571745644212629,
"grad_norm": 1.0859375,
"learning_rate": 6.709369173594396e-06,
"loss": 0.2384,
"mean_token_accuracy": 0.037970394078001846,
"step": 443
},
{
"epoch": 1.575296859394074,
"grad_norm": 1.265625,
"learning_rate": 6.6961943305901515e-06,
"loss": 0.2388,
"mean_token_accuracy": 0.0376486132190621,
"step": 444
},
{
"epoch": 1.578848074575519,
"grad_norm": 1.296875,
"learning_rate": 6.683006165757262e-06,
"loss": 0.2249,
"mean_token_accuracy": 0.03808089874291909,
"step": 445
},
{
"epoch": 1.5823992897569639,
"grad_norm": 1.3671875,
"learning_rate": 6.669804782674937e-06,
"loss": 0.2401,
"mean_token_accuracy": 0.03246638694690773,
"step": 446
},
{
"epoch": 1.5859505049384086,
"grad_norm": 1.421875,
"learning_rate": 6.656590285026203e-06,
"loss": 0.2311,
"mean_token_accuracy": 0.037845788236154476,
"step": 447
},
{
"epoch": 1.5895017201198534,
"grad_norm": 1.2734375,
"learning_rate": 6.643362776597089e-06,
"loss": 0.2588,
"mean_token_accuracy": 0.034229919638164574,
"step": 448
},
{
"epoch": 1.5930529353012983,
"grad_norm": 1.2109375,
"learning_rate": 6.630122361275811e-06,
"loss": 0.2437,
"mean_token_accuracy": 0.034587128982821014,
"step": 449
},
{
"epoch": 1.5966041504827433,
"grad_norm": 1.5,
"learning_rate": 6.6168691430519524e-06,
"loss": 0.25,
"mean_token_accuracy": 0.030672385284560733,
"step": 450
},
{
"epoch": 1.6001553656641883,
"grad_norm": 1.609375,
"learning_rate": 6.6036032260156526e-06,
"loss": 0.2306,
"mean_token_accuracy": 0.04033701937441947,
"step": 451
},
{
"epoch": 1.6037065808456332,
"grad_norm": 1.625,
"learning_rate": 6.590324714356784e-06,
"loss": 0.2359,
"mean_token_accuracy": 0.034981218981556594,
"step": 452
},
{
"epoch": 1.607257796027078,
"grad_norm": 2.15625,
"learning_rate": 6.5770337123641405e-06,
"loss": 0.2212,
"mean_token_accuracy": 0.0414577160445333,
"step": 453
},
{
"epoch": 1.610809011208523,
"grad_norm": 1.3671875,
"learning_rate": 6.563730324424609e-06,
"loss": 0.2406,
"mean_token_accuracy": 0.0381101651910285,
"step": 454
},
{
"epoch": 1.6143602263899677,
"grad_norm": 1.2578125,
"learning_rate": 6.55041465502236e-06,
"loss": 0.2272,
"mean_token_accuracy": 0.04102694254106609,
"step": 455
},
{
"epoch": 1.6179114415714126,
"grad_norm": 1.2734375,
"learning_rate": 6.53708680873802e-06,
"loss": 0.2494,
"mean_token_accuracy": 0.03622519101918442,
"step": 456
},
{
"epoch": 1.6214626567528576,
"grad_norm": 1.46875,
"learning_rate": 6.523746890247853e-06,
"loss": 0.2255,
"mean_token_accuracy": 0.03894408120322623,
"step": 457
},
{
"epoch": 1.6250138719343026,
"grad_norm": 1.03125,
"learning_rate": 6.510395004322937e-06,
"loss": 0.2312,
"mean_token_accuracy": 0.03699832962593064,
"step": 458
},
{
"epoch": 1.6285650871157475,
"grad_norm": 1.546875,
"learning_rate": 6.49703125582834e-06,
"loss": 0.2341,
"mean_token_accuracy": 0.03479354368937493,
"step": 459
},
{
"epoch": 1.6321163022971923,
"grad_norm": 1.6484375,
"learning_rate": 6.4836557497222995e-06,
"loss": 0.2397,
"mean_token_accuracy": 0.04207373945610016,
"step": 460
},
{
"epoch": 1.6356675174786373,
"grad_norm": 1.8671875,
"learning_rate": 6.470268591055398e-06,
"loss": 0.2337,
"mean_token_accuracy": 0.03901815911376616,
"step": 461
},
{
"epoch": 1.639218732660082,
"grad_norm": 1.0703125,
"learning_rate": 6.456869884969738e-06,
"loss": 0.2213,
"mean_token_accuracy": 0.03514585681114113,
"step": 462
},
{
"epoch": 1.642769947841527,
"grad_norm": 1.3828125,
"learning_rate": 6.443459736698106e-06,
"loss": 0.2382,
"mean_token_accuracy": 0.03269572283170419,
"step": 463
},
{
"epoch": 1.646321163022972,
"grad_norm": 1.28125,
"learning_rate": 6.430038251563166e-06,
"loss": 0.2268,
"mean_token_accuracy": 0.03920681574527407,
"step": 464
},
{
"epoch": 1.649872378204417,
"grad_norm": 6.90625,
"learning_rate": 6.416605534976614e-06,
"loss": 0.2331,
"mean_token_accuracy": 0.035078568678727606,
"step": 465
},
{
"epoch": 1.6534235933858619,
"grad_norm": 1.4609375,
"learning_rate": 6.403161692438364e-06,
"loss": 0.2333,
"mean_token_accuracy": 0.03780714001550223,
"step": 466
},
{
"epoch": 1.6569748085673066,
"grad_norm": 1.953125,
"learning_rate": 6.3897068295357e-06,
"loss": 0.2311,
"mean_token_accuracy": 0.033330076843412826,
"step": 467
},
{
"epoch": 1.6605260237487516,
"grad_norm": 1.3984375,
"learning_rate": 6.376241051942477e-06,
"loss": 0.2281,
"mean_token_accuracy": 0.038188748992979527,
"step": 468
},
{
"epoch": 1.6640772389301963,
"grad_norm": 1.1484375,
"learning_rate": 6.362764465418258e-06,
"loss": 0.2361,
"mean_token_accuracy": 0.040052126856608083,
"step": 469
},
{
"epoch": 1.6676284541116413,
"grad_norm": 1.265625,
"learning_rate": 6.349277175807506e-06,
"loss": 0.2286,
"mean_token_accuracy": 0.0381559070374351,
"step": 470
},
{
"epoch": 1.6711796692930863,
"grad_norm": 1.421875,
"learning_rate": 6.3357792890387485e-06,
"loss": 0.2314,
"mean_token_accuracy": 0.032606342934741406,
"step": 471
},
{
"epoch": 1.6747308844745312,
"grad_norm": 1.421875,
"learning_rate": 6.322270911123734e-06,
"loss": 0.2369,
"mean_token_accuracy": 0.034345271826168755,
"step": 472
},
{
"epoch": 1.6782820996559762,
"grad_norm": 1.5625,
"learning_rate": 6.308752148156614e-06,
"loss": 0.2256,
"mean_token_accuracy": 0.03452475197263993,
"step": 473
},
{
"epoch": 1.681833314837421,
"grad_norm": 1.15625,
"learning_rate": 6.295223106313104e-06,
"loss": 0.2374,
"mean_token_accuracy": 0.03493335935854702,
"step": 474
},
{
"epoch": 1.6853845300188657,
"grad_norm": 1.1796875,
"learning_rate": 6.281683891849645e-06,
"loss": 0.2407,
"mean_token_accuracy": 0.03363906976665021,
"step": 475
},
{
"epoch": 1.6889357452003106,
"grad_norm": 1.2734375,
"learning_rate": 6.268134611102578e-06,
"loss": 0.2352,
"mean_token_accuracy": 0.03696349471283611,
"step": 476
},
{
"epoch": 1.6924869603817556,
"grad_norm": 1.5,
"learning_rate": 6.254575370487299e-06,
"loss": 0.2299,
"mean_token_accuracy": 0.03581606224179268,
"step": 477
},
{
"epoch": 1.6960381755632006,
"grad_norm": 1.1640625,
"learning_rate": 6.2410062764974366e-06,
"loss": 0.2385,
"mean_token_accuracy": 0.03706932210479863,
"step": 478
},
{
"epoch": 1.6995893907446455,
"grad_norm": 1.15625,
"learning_rate": 6.227427435703997e-06,
"loss": 0.2432,
"mean_token_accuracy": 0.03006288245160249,
"step": 479
},
{
"epoch": 1.7031406059260903,
"grad_norm": 1.25,
"learning_rate": 6.213838954754543e-06,
"loss": 0.2429,
"mean_token_accuracy": 0.044262315965170274,
"step": 480
},
{
"epoch": 1.7066918211075353,
"grad_norm": 1.5546875,
"learning_rate": 6.2002409403723525e-06,
"loss": 0.2393,
"mean_token_accuracy": 0.03908459947706433,
"step": 481
},
{
"epoch": 1.71024303628898,
"grad_norm": 3.46875,
"learning_rate": 6.186633499355576e-06,
"loss": 0.245,
"mean_token_accuracy": 0.036892217398417415,
"step": 482
},
{
"epoch": 1.713794251470425,
"grad_norm": 1.3984375,
"learning_rate": 6.173016738576396e-06,
"loss": 0.2362,
"mean_token_accuracy": 0.03681937377768918,
"step": 483
},
{
"epoch": 1.71734546665187,
"grad_norm": 1.109375,
"learning_rate": 6.159390764980202e-06,
"loss": 0.2312,
"mean_token_accuracy": 0.03445882866799366,
"step": 484
},
{
"epoch": 1.720896681833315,
"grad_norm": 1.25,
"learning_rate": 6.145755685584731e-06,
"loss": 0.2322,
"mean_token_accuracy": 0.039366260476526804,
"step": 485
},
{
"epoch": 1.7244478970147599,
"grad_norm": 1.609375,
"learning_rate": 6.132111607479243e-06,
"loss": 0.2364,
"mean_token_accuracy": 0.03817261819494888,
"step": 486
},
{
"epoch": 1.7279991121962046,
"grad_norm": 1.375,
"learning_rate": 6.118458637823669e-06,
"loss": 0.2247,
"mean_token_accuracy": 0.03410290294414153,
"step": 487
},
{
"epoch": 1.7315503273776496,
"grad_norm": 1.3125,
"learning_rate": 6.104796883847777e-06,
"loss": 0.2359,
"mean_token_accuracy": 0.036821881629293784,
"step": 488
},
{
"epoch": 1.7351015425590943,
"grad_norm": 1.53125,
"learning_rate": 6.091126452850324e-06,
"loss": 0.2207,
"mean_token_accuracy": 0.04196582403710636,
"step": 489
},
{
"epoch": 1.7386527577405393,
"grad_norm": 1.2265625,
"learning_rate": 6.077447452198219e-06,
"loss": 0.245,
"mean_token_accuracy": 0.030956470383898704,
"step": 490
},
{
"epoch": 1.7422039729219843,
"grad_norm": 2.265625,
"learning_rate": 6.063759989325673e-06,
"loss": 0.2277,
"mean_token_accuracy": 0.04123112729212153,
"step": 491
},
{
"epoch": 1.7457551881034292,
"grad_norm": 1.125,
"learning_rate": 6.050064171733362e-06,
"loss": 0.2345,
"mean_token_accuracy": 0.03712779658235377,
"step": 492
},
{
"epoch": 1.7493064032848742,
"grad_norm": 1.09375,
"learning_rate": 6.0363601069875755e-06,
"loss": 0.234,
"mean_token_accuracy": 0.04036831553457887,
"step": 493
},
{
"epoch": 1.752857618466319,
"grad_norm": 1.4375,
"learning_rate": 6.022647902719384e-06,
"loss": 0.252,
"mean_token_accuracy": 0.03416740952343389,
"step": 494
},
{
"epoch": 1.7564088336477637,
"grad_norm": 1.2421875,
"learning_rate": 6.008927666623775e-06,
"loss": 0.2324,
"mean_token_accuracy": 0.03657872789517569,
"step": 495
},
{
"epoch": 1.7599600488292086,
"grad_norm": 1.484375,
"learning_rate": 5.9951995064588245e-06,
"loss": 0.2284,
"mean_token_accuracy": 0.03967171028489247,
"step": 496
},
{
"epoch": 1.7635112640106536,
"grad_norm": 1.4296875,
"learning_rate": 5.981463530044841e-06,
"loss": 0.2325,
"mean_token_accuracy": 0.03946096594881965,
"step": 497
},
{
"epoch": 1.7670624791920986,
"grad_norm": 0.984375,
"learning_rate": 5.967719845263524e-06,
"loss": 0.2344,
"mean_token_accuracy": 0.03674850361494464,
"step": 498
},
{
"epoch": 1.7706136943735435,
"grad_norm": 1.4453125,
"learning_rate": 5.953968560057112e-06,
"loss": 0.2397,
"mean_token_accuracy": 0.03331250947303488,
"step": 499
},
{
"epoch": 1.7741649095549885,
"grad_norm": 1.2421875,
"learning_rate": 5.940209782427535e-06,
"loss": 0.2466,
"mean_token_accuracy": 0.035536976964067435,
"step": 500
},
{
"epoch": 1.7777161247364333,
"grad_norm": 1.2421875,
"learning_rate": 5.926443620435572e-06,
"loss": 0.2372,
"mean_token_accuracy": 0.034023440719465725,
"step": 501
},
{
"epoch": 1.781267339917878,
"grad_norm": 1.4375,
"learning_rate": 5.912670182199998e-06,
"loss": 0.2423,
"mean_token_accuracy": 0.0336604088297463,
"step": 502
},
{
"epoch": 1.784818555099323,
"grad_norm": 1.3125,
"learning_rate": 5.898889575896731e-06,
"loss": 0.2378,
"mean_token_accuracy": 0.034576399257275625,
"step": 503
},
{
"epoch": 1.788369770280768,
"grad_norm": 1.7109375,
"learning_rate": 5.8851019097579935e-06,
"loss": 0.2299,
"mean_token_accuracy": 0.04091495179090998,
"step": 504
},
{
"epoch": 1.791920985462213,
"grad_norm": 1.203125,
"learning_rate": 5.871307292071449e-06,
"loss": 0.2423,
"mean_token_accuracy": 0.03323508650282747,
"step": 505
},
{
"epoch": 1.7954722006436579,
"grad_norm": 1.234375,
"learning_rate": 5.857505831179361e-06,
"loss": 0.2452,
"mean_token_accuracy": 0.039159927426226204,
"step": 506
},
{
"epoch": 1.7990234158251026,
"grad_norm": 1.078125,
"learning_rate": 5.843697635477742e-06,
"loss": 0.2474,
"mean_token_accuracy": 0.032113108623889275,
"step": 507
},
{
"epoch": 1.8025746310065476,
"grad_norm": 1.28125,
"learning_rate": 5.8298828134154935e-06,
"loss": 0.2273,
"mean_token_accuracy": 0.03509632355417125,
"step": 508
},
{
"epoch": 1.8061258461879923,
"grad_norm": 1.8984375,
"learning_rate": 5.816061473493565e-06,
"loss": 0.2457,
"mean_token_accuracy": 0.03150586118135834,
"step": 509
},
{
"epoch": 1.8096770613694373,
"grad_norm": 1.1171875,
"learning_rate": 5.802233724264094e-06,
"loss": 0.2423,
"mean_token_accuracy": 0.03352252102922648,
"step": 510
},
{
"epoch": 1.8132282765508823,
"grad_norm": 1.484375,
"learning_rate": 5.788399674329559e-06,
"loss": 0.2355,
"mean_token_accuracy": 0.03722988534354954,
"step": 511
},
{
"epoch": 1.8167794917323272,
"grad_norm": 1.1796875,
"learning_rate": 5.774559432341918e-06,
"loss": 0.2347,
"mean_token_accuracy": 0.03498390710228705,
"step": 512
},
{
"epoch": 1.8203307069137722,
"grad_norm": 1.046875,
"learning_rate": 5.760713107001773e-06,
"loss": 0.239,
"mean_token_accuracy": 0.034972945799381705,
"step": 513
},
{
"epoch": 1.823881922095217,
"grad_norm": 1.609375,
"learning_rate": 5.746860807057491e-06,
"loss": 0.2354,
"mean_token_accuracy": 0.03691285277818679,
"step": 514
},
{
"epoch": 1.827433137276662,
"grad_norm": 1.9375,
"learning_rate": 5.7330026413043726e-06,
"loss": 0.2316,
"mean_token_accuracy": 0.03540732314104389,
"step": 515
},
{
"epoch": 1.8309843524581066,
"grad_norm": 1.390625,
"learning_rate": 5.719138718583781e-06,
"loss": 0.2379,
"mean_token_accuracy": 0.03562885835162888,
"step": 516
},
{
"epoch": 1.8345355676395516,
"grad_norm": 1.140625,
"learning_rate": 5.705269147782303e-06,
"loss": 0.2338,
"mean_token_accuracy": 0.033621748105360894,
"step": 517
},
{
"epoch": 1.8380867828209966,
"grad_norm": 1.046875,
"learning_rate": 5.6913940378308755e-06,
"loss": 0.234,
"mean_token_accuracy": 0.04016883431177121,
"step": 518
},
{
"epoch": 1.8416379980024415,
"grad_norm": 1.6015625,
"learning_rate": 5.677513497703947e-06,
"loss": 0.2357,
"mean_token_accuracy": 0.04463333530293312,
"step": 519
},
{
"epoch": 1.8451892131838865,
"grad_norm": 1.234375,
"learning_rate": 5.663627636418611e-06,
"loss": 0.232,
"mean_token_accuracy": 0.03623782243448659,
"step": 520
},
{
"epoch": 1.8487404283653313,
"grad_norm": 1.203125,
"learning_rate": 5.649736563033754e-06,
"loss": 0.2433,
"mean_token_accuracy": 0.03492143240146106,
"step": 521
},
{
"epoch": 1.852291643546776,
"grad_norm": 1.65625,
"learning_rate": 5.635840386649197e-06,
"loss": 0.2308,
"mean_token_accuracy": 0.03824477005036897,
"step": 522
},
{
"epoch": 1.855842858728221,
"grad_norm": 1.1953125,
"learning_rate": 5.621939216404842e-06,
"loss": 0.2439,
"mean_token_accuracy": 0.034680439697694965,
"step": 523
},
{
"epoch": 1.859394073909666,
"grad_norm": 1.1953125,
"learning_rate": 5.608033161479811e-06,
"loss": 0.2264,
"mean_token_accuracy": 0.039163380019090255,
"step": 524
},
{
"epoch": 1.862945289091111,
"grad_norm": 2.015625,
"learning_rate": 5.594122331091591e-06,
"loss": 0.2239,
"mean_token_accuracy": 0.036949697489035316,
"step": 525
},
{
"epoch": 1.8664965042725559,
"grad_norm": 1.453125,
"learning_rate": 5.580206834495169e-06,
"loss": 0.2358,
"mean_token_accuracy": 0.03440156889701029,
"step": 526
},
{
"epoch": 1.8700477194540006,
"grad_norm": 1.25,
"learning_rate": 5.566286780982193e-06,
"loss": 0.2363,
"mean_token_accuracy": 0.03501361182497931,
"step": 527
},
{
"epoch": 1.8735989346354456,
"grad_norm": 1.5078125,
"learning_rate": 5.552362279880091e-06,
"loss": 0.2435,
"mean_token_accuracy": 0.03163332929398166,
"step": 528
},
{
"epoch": 1.8771501498168903,
"grad_norm": 1.078125,
"learning_rate": 5.538433440551221e-06,
"loss": 0.2309,
"mean_token_accuracy": 0.040504791504645254,
"step": 529
},
{
"epoch": 1.8807013649983353,
"grad_norm": 1.3203125,
"learning_rate": 5.524500372392021e-06,
"loss": 0.2386,
"mean_token_accuracy": 0.03375892240728717,
"step": 530
},
{
"epoch": 1.8842525801797803,
"grad_norm": 1.0546875,
"learning_rate": 5.5105631848321375e-06,
"loss": 0.2396,
"mean_token_accuracy": 0.040157406461730716,
"step": 531
},
{
"epoch": 1.8878037953612252,
"grad_norm": 1.2109375,
"learning_rate": 5.496621987333567e-06,
"loss": 0.2455,
"mean_token_accuracy": 0.034050565504003316,
"step": 532
},
{
"epoch": 1.8913550105426702,
"grad_norm": 1.3828125,
"learning_rate": 5.482676889389808e-06,
"loss": 0.2376,
"mean_token_accuracy": 0.03857235643590684,
"step": 533
},
{
"epoch": 1.894906225724115,
"grad_norm": 1.2578125,
"learning_rate": 5.468728000524987e-06,
"loss": 0.2264,
"mean_token_accuracy": 0.040069550173939206,
"step": 534
},
{
"epoch": 1.89845744090556,
"grad_norm": 1.7734375,
"learning_rate": 5.454775430293008e-06,
"loss": 0.2318,
"mean_token_accuracy": 0.03648939702179632,
"step": 535
},
{
"epoch": 1.9020086560870046,
"grad_norm": 0.8984375,
"learning_rate": 5.440819288276683e-06,
"loss": 0.2312,
"mean_token_accuracy": 0.038116528681712225,
"step": 536
},
{
"epoch": 1.9055598712684496,
"grad_norm": 1.34375,
"learning_rate": 5.426859684086881e-06,
"loss": 0.2456,
"mean_token_accuracy": 0.034208514596684836,
"step": 537
},
{
"epoch": 1.9091110864498946,
"grad_norm": 2.0,
"learning_rate": 5.412896727361663e-06,
"loss": 0.2265,
"mean_token_accuracy": 0.040587535226222826,
"step": 538
},
{
"epoch": 1.9126623016313395,
"grad_norm": 1.2734375,
"learning_rate": 5.398930527765416e-06,
"loss": 0.2385,
"mean_token_accuracy": 0.03604849764087703,
"step": 539
},
{
"epoch": 1.9162135168127845,
"grad_norm": 1.421875,
"learning_rate": 5.384961194988002e-06,
"loss": 0.2318,
"mean_token_accuracy": 0.03515218906613882,
"step": 540
},
{
"epoch": 1.9197647319942293,
"grad_norm": 1.6015625,
"learning_rate": 5.370988838743889e-06,
"loss": 0.236,
"mean_token_accuracy": 0.03593129891669378,
"step": 541
},
{
"epoch": 1.9233159471756742,
"grad_norm": 1.09375,
"learning_rate": 5.357013568771288e-06,
"loss": 0.2232,
"mean_token_accuracy": 0.03748265598733269,
"step": 542
},
{
"epoch": 1.926867162357119,
"grad_norm": 1.28125,
"learning_rate": 5.343035494831298e-06,
"loss": 0.2288,
"mean_token_accuracy": 0.03536356821859954,
"step": 543
},
{
"epoch": 1.930418377538564,
"grad_norm": 2.171875,
"learning_rate": 5.32905472670704e-06,
"loss": 0.2246,
"mean_token_accuracy": 0.03547816792342928,
"step": 544
},
{
"epoch": 1.933969592720009,
"grad_norm": 1.34375,
"learning_rate": 5.315071374202792e-06,
"loss": 0.2534,
"mean_token_accuracy": 0.03712530972188688,
"step": 545
},
{
"epoch": 1.9375208079014539,
"grad_norm": 1.2578125,
"learning_rate": 5.301085547143135e-06,
"loss": 0.2312,
"mean_token_accuracy": 0.03489181573604583,
"step": 546
},
{
"epoch": 1.9410720230828988,
"grad_norm": 1.1328125,
"learning_rate": 5.287097355372079e-06,
"loss": 0.2396,
"mean_token_accuracy": 0.030296869514131686,
"step": 547
},
{
"epoch": 1.9446232382643436,
"grad_norm": 1.5859375,
"learning_rate": 5.273106908752211e-06,
"loss": 0.2213,
"mean_token_accuracy": 0.03895932896557497,
"step": 548
},
{
"epoch": 1.9481744534457883,
"grad_norm": 1.2265625,
"learning_rate": 5.259114317163822e-06,
"loss": 0.2386,
"mean_token_accuracy": 0.03215844640726573,
"step": 549
},
{
"epoch": 1.9517256686272333,
"grad_norm": 1.1171875,
"learning_rate": 5.245119690504056e-06,
"loss": 0.2353,
"mean_token_accuracy": 0.03430458562797867,
"step": 550
},
{
"epoch": 1.9552768838086783,
"grad_norm": 1.1015625,
"learning_rate": 5.231123138686036e-06,
"loss": 0.2239,
"mean_token_accuracy": 0.042256227920006495,
"step": 551
},
{
"epoch": 1.9588280989901232,
"grad_norm": 1.2578125,
"learning_rate": 5.217124771638008e-06,
"loss": 0.2488,
"mean_token_accuracy": 0.033073368220357224,
"step": 552
},
{
"epoch": 1.9623793141715682,
"grad_norm": 1.140625,
"learning_rate": 5.2031246993024705e-06,
"loss": 0.2335,
"mean_token_accuracy": 0.03448270119224617,
"step": 553
},
{
"epoch": 1.965930529353013,
"grad_norm": 1.03125,
"learning_rate": 5.1891230316353215e-06,
"loss": 0.2315,
"mean_token_accuracy": 0.038373887560737785,
"step": 554
},
{
"epoch": 1.969481744534458,
"grad_norm": 1.25,
"learning_rate": 5.1751198786049815e-06,
"loss": 0.2251,
"mean_token_accuracy": 0.035272060780698666,
"step": 555
},
{
"epoch": 1.9730329597159026,
"grad_norm": 1.6015625,
"learning_rate": 5.161115350191543e-06,
"loss": 0.2358,
"mean_token_accuracy": 0.037838590164028574,
"step": 556
},
{
"epoch": 1.9765841748973476,
"grad_norm": 1.0859375,
"learning_rate": 5.147109556385898e-06,
"loss": 0.225,
"mean_token_accuracy": 0.035712803082788014,
"step": 557
},
{
"epoch": 1.9801353900787926,
"grad_norm": 1.3984375,
"learning_rate": 5.133102607188875e-06,
"loss": 0.236,
"mean_token_accuracy": 0.037219416080915835,
"step": 558
},
{
"epoch": 1.9836866052602375,
"grad_norm": 1.265625,
"learning_rate": 5.119094612610381e-06,
"loss": 0.2321,
"mean_token_accuracy": 0.03555746503116097,
"step": 559
},
{
"epoch": 1.9872378204416825,
"grad_norm": 1.734375,
"learning_rate": 5.10508568266853e-06,
"loss": 0.2416,
"mean_token_accuracy": 0.03534104762002244,
"step": 560
},
{
"epoch": 1.9907890356231273,
"grad_norm": 1.484375,
"learning_rate": 5.091075927388785e-06,
"loss": 0.2424,
"mean_token_accuracy": 0.034718068922302336,
"step": 561
},
{
"epoch": 1.9943402508045722,
"grad_norm": 1.078125,
"learning_rate": 5.077065456803089e-06,
"loss": 0.2403,
"mean_token_accuracy": 0.036199005553498864,
"step": 562
},
{
"epoch": 1.997891465986017,
"grad_norm": 0.9140625,
"learning_rate": 5.063054380949003e-06,
"loss": 0.2283,
"mean_token_accuracy": 0.0378040480427444,
"step": 563
},
{
"epoch": 2.0,
"grad_norm": 1.0078125,
"learning_rate": 5.049042809868845e-06,
"loss": 0.1326,
"mean_token_accuracy": 0.03690299116026022,
"step": 564
},
{
"epoch": 2.003551215181445,
"grad_norm": 1.5546875,
"learning_rate": 5.035030853608817e-06,
"loss": 0.2468,
"mean_token_accuracy": 0.03732422069515451,
"step": 565
},
{
"epoch": 2.00710243036289,
"grad_norm": 1.140625,
"learning_rate": 5.0210186222181515e-06,
"loss": 0.2215,
"mean_token_accuracy": 0.0390621348105924,
"step": 566
},
{
"epoch": 2.010653645544335,
"grad_norm": 1.578125,
"learning_rate": 5.007006225748238e-06,
"loss": 0.2289,
"mean_token_accuracy": 0.03769652777918964,
"step": 567
},
{
"epoch": 2.0142048607257794,
"grad_norm": 1.328125,
"learning_rate": 4.992993774251764e-06,
"loss": 0.2419,
"mean_token_accuracy": 0.03473068901075749,
"step": 568
},
{
"epoch": 2.0177560759072244,
"grad_norm": 1.0390625,
"learning_rate": 4.97898137778185e-06,
"loss": 0.2346,
"mean_token_accuracy": 0.03907662617348251,
"step": 569
},
{
"epoch": 2.0213072910886694,
"grad_norm": 1.125,
"learning_rate": 4.964969146391184e-06,
"loss": 0.2347,
"mean_token_accuracy": 0.03834367445597309,
"step": 570
},
{
"epoch": 2.0248585062701143,
"grad_norm": 1.328125,
"learning_rate": 4.950957190131157e-06,
"loss": 0.2287,
"mean_token_accuracy": 0.041130597212031716,
"step": 571
},
{
"epoch": 2.0284097214515593,
"grad_norm": 3.09375,
"learning_rate": 4.936945619050998e-06,
"loss": 0.242,
"mean_token_accuracy": 0.03198737775164773,
"step": 572
},
{
"epoch": 2.0319609366330043,
"grad_norm": 1.125,
"learning_rate": 4.922934543196912e-06,
"loss": 0.2301,
"mean_token_accuracy": 0.03535122560788295,
"step": 573
},
{
"epoch": 2.0355121518144492,
"grad_norm": 1.4921875,
"learning_rate": 4.908924072611218e-06,
"loss": 0.2334,
"mean_token_accuracy": 0.0340686052768433,
"step": 574
},
{
"epoch": 2.0390633669958937,
"grad_norm": 1.0078125,
"learning_rate": 4.894914317331471e-06,
"loss": 0.2274,
"mean_token_accuracy": 0.038621567571681226,
"step": 575
},
{
"epoch": 2.0426145821773387,
"grad_norm": 1.25,
"learning_rate": 4.88090538738962e-06,
"loss": 0.239,
"mean_token_accuracy": 0.032639294491673354,
"step": 576
},
{
"epoch": 2.0461657973587837,
"grad_norm": 1.453125,
"learning_rate": 4.866897392811127e-06,
"loss": 0.2341,
"mean_token_accuracy": 0.03567746441694908,
"step": 577
},
{
"epoch": 2.0497170125402286,
"grad_norm": 1.21875,
"learning_rate": 4.852890443614105e-06,
"loss": 0.2354,
"mean_token_accuracy": 0.03639252596258302,
"step": 578
},
{
"epoch": 2.0532682277216736,
"grad_norm": 1.2265625,
"learning_rate": 4.838884649808458e-06,
"loss": 0.2366,
"mean_token_accuracy": 0.04135367330673034,
"step": 579
},
{
"epoch": 2.0568194429031186,
"grad_norm": 1.2109375,
"learning_rate": 4.82488012139502e-06,
"loss": 0.2342,
"mean_token_accuracy": 0.040144713548215805,
"step": 580
},
{
"epoch": 2.060370658084563,
"grad_norm": 1.34375,
"learning_rate": 4.810876968364679e-06,
"loss": 0.2303,
"mean_token_accuracy": 0.039767392045177985,
"step": 581
},
{
"epoch": 2.063921873266008,
"grad_norm": 1.171875,
"learning_rate": 4.796875300697532e-06,
"loss": 0.2284,
"mean_token_accuracy": 0.03720286238967674,
"step": 582
},
{
"epoch": 2.067473088447453,
"grad_norm": 1.1015625,
"learning_rate": 4.782875228361994e-06,
"loss": 0.2258,
"mean_token_accuracy": 0.0378811931987002,
"step": 583
},
{
"epoch": 2.071024303628898,
"grad_norm": 1.296875,
"learning_rate": 4.7688768613139655e-06,
"loss": 0.2382,
"mean_token_accuracy": 0.03471687817364,
"step": 584
},
{
"epoch": 2.074575518810343,
"grad_norm": 1.1796875,
"learning_rate": 4.754880309495946e-06,
"loss": 0.2217,
"mean_token_accuracy": 0.03663280401087832,
"step": 585
},
{
"epoch": 2.078126733991788,
"grad_norm": 2.53125,
"learning_rate": 4.74088568283618e-06,
"loss": 0.2388,
"mean_token_accuracy": 0.038270618082606234,
"step": 586
},
{
"epoch": 2.081677949173233,
"grad_norm": 1.5234375,
"learning_rate": 4.726893091247792e-06,
"loss": 0.2363,
"mean_token_accuracy": 0.03145957482593076,
"step": 587
},
{
"epoch": 2.0852291643546774,
"grad_norm": 1.640625,
"learning_rate": 4.712902644627923e-06,
"loss": 0.2259,
"mean_token_accuracy": 0.03767043779953383,
"step": 588
},
{
"epoch": 2.0887803795361224,
"grad_norm": 1.375,
"learning_rate": 4.698914452856866e-06,
"loss": 0.2192,
"mean_token_accuracy": 0.03968908742535859,
"step": 589
},
{
"epoch": 2.0923315947175674,
"grad_norm": 2.1875,
"learning_rate": 4.684928625797208e-06,
"loss": 0.2311,
"mean_token_accuracy": 0.03802780578553211,
"step": 590
},
{
"epoch": 2.0958828098990123,
"grad_norm": 1.4453125,
"learning_rate": 4.6709452732929614e-06,
"loss": 0.2412,
"mean_token_accuracy": 0.03319291988009354,
"step": 591
},
{
"epoch": 2.0994340250804573,
"grad_norm": 1.6875,
"learning_rate": 4.656964505168703e-06,
"loss": 0.2434,
"mean_token_accuracy": 0.03837345182910212,
"step": 592
},
{
"epoch": 2.1029852402619023,
"grad_norm": 1.546875,
"learning_rate": 4.642986431228713e-06,
"loss": 0.228,
"mean_token_accuracy": 0.03522277422234765,
"step": 593
},
{
"epoch": 2.1065364554433472,
"grad_norm": 1.046875,
"learning_rate": 4.629011161256114e-06,
"loss": 0.2266,
"mean_token_accuracy": 0.03698047510260949,
"step": 594
},
{
"epoch": 2.1100876706247917,
"grad_norm": 1.5390625,
"learning_rate": 4.615038805011999e-06,
"loss": 0.2394,
"mean_token_accuracy": 0.034128702951420564,
"step": 595
},
{
"epoch": 2.1136388858062367,
"grad_norm": 1.4921875,
"learning_rate": 4.601069472234584e-06,
"loss": 0.235,
"mean_token_accuracy": 0.03288589773364947,
"step": 596
},
{
"epoch": 2.1171901009876817,
"grad_norm": 1.3203125,
"learning_rate": 4.587103272638339e-06,
"loss": 0.2261,
"mean_token_accuracy": 0.0330345298134489,
"step": 597
},
{
"epoch": 2.1207413161691266,
"grad_norm": 1.234375,
"learning_rate": 4.57314031591312e-06,
"loss": 0.2354,
"mean_token_accuracy": 0.03702639001130592,
"step": 598
},
{
"epoch": 2.1242925313505716,
"grad_norm": 1.34375,
"learning_rate": 4.559180711723318e-06,
"loss": 0.2499,
"mean_token_accuracy": 0.03753973091443186,
"step": 599
},
{
"epoch": 2.1278437465320166,
"grad_norm": 1.2578125,
"learning_rate": 4.545224569706994e-06,
"loss": 0.2327,
"mean_token_accuracy": 0.035053080224315636,
"step": 600
}
],
"logging_steps": 1,
"max_steps": 1124,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.283809149801565e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}