POLAR-G3-4b-tool-v2 / trainer_state.json
CocoRoF's picture
Upload content from checkpoint-364
27189b7 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9917808219178084,
"eval_steps": 100000.0,
"global_step": 364,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005479452054794521,
"grad_norm": 432.0,
"learning_rate": 0.0,
"loss": 5.7373,
"mean_token_accuracy": 0.6561740338802338,
"step": 1
},
{
"epoch": 0.010958904109589041,
"grad_norm": 334.0,
"learning_rate": 2.0000000000000003e-06,
"loss": 5.8256,
"mean_token_accuracy": 0.6489620804786682,
"step": 2
},
{
"epoch": 0.01643835616438356,
"grad_norm": 181.0,
"learning_rate": 4.000000000000001e-06,
"loss": 5.088,
"mean_token_accuracy": 0.6650368273258209,
"step": 3
},
{
"epoch": 0.021917808219178082,
"grad_norm": 75.5,
"learning_rate": 6e-06,
"loss": 3.994,
"mean_token_accuracy": 0.6906364560127258,
"step": 4
},
{
"epoch": 0.0273972602739726,
"grad_norm": 44.75,
"learning_rate": 8.000000000000001e-06,
"loss": 3.4044,
"mean_token_accuracy": 0.7045449316501617,
"step": 5
},
{
"epoch": 0.03287671232876712,
"grad_norm": 28.125,
"learning_rate": 1e-05,
"loss": 2.8925,
"mean_token_accuracy": 0.7223854660987854,
"step": 6
},
{
"epoch": 0.038356164383561646,
"grad_norm": 20.0,
"learning_rate": 1.2e-05,
"loss": 2.3515,
"mean_token_accuracy": 0.7618080377578735,
"step": 7
},
{
"epoch": 0.043835616438356165,
"grad_norm": 19.0,
"learning_rate": 1.4e-05,
"loss": 2.0269,
"mean_token_accuracy": 0.7856993675231934,
"step": 8
},
{
"epoch": 0.049315068493150684,
"grad_norm": 12.8125,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.7614,
"mean_token_accuracy": 0.8198637962341309,
"step": 9
},
{
"epoch": 0.0547945205479452,
"grad_norm": 15.875,
"learning_rate": 1.8e-05,
"loss": 1.4779,
"mean_token_accuracy": 0.8413160443305969,
"step": 10
},
{
"epoch": 0.06027397260273973,
"grad_norm": 8.3125,
"learning_rate": 2e-05,
"loss": 1.3764,
"mean_token_accuracy": 0.8465317487716675,
"step": 11
},
{
"epoch": 0.06575342465753424,
"grad_norm": 13.6875,
"learning_rate": 1.9943502824858758e-05,
"loss": 1.2839,
"mean_token_accuracy": 0.8562129735946655,
"step": 12
},
{
"epoch": 0.07123287671232877,
"grad_norm": 10.6875,
"learning_rate": 1.9887005649717518e-05,
"loss": 1.186,
"mean_token_accuracy": 0.8632737696170807,
"step": 13
},
{
"epoch": 0.07671232876712329,
"grad_norm": 6.46875,
"learning_rate": 1.9830508474576275e-05,
"loss": 1.2342,
"mean_token_accuracy": 0.8569013476371765,
"step": 14
},
{
"epoch": 0.0821917808219178,
"grad_norm": 3.671875,
"learning_rate": 1.977401129943503e-05,
"loss": 1.1773,
"mean_token_accuracy": 0.861639678478241,
"step": 15
},
{
"epoch": 0.08767123287671233,
"grad_norm": 4.1875,
"learning_rate": 1.9717514124293785e-05,
"loss": 1.0634,
"mean_token_accuracy": 0.8734889626502991,
"step": 16
},
{
"epoch": 0.09315068493150686,
"grad_norm": 3.109375,
"learning_rate": 1.9661016949152545e-05,
"loss": 1.1534,
"mean_token_accuracy": 0.8641785085201263,
"step": 17
},
{
"epoch": 0.09863013698630137,
"grad_norm": 2.546875,
"learning_rate": 1.96045197740113e-05,
"loss": 1.1272,
"mean_token_accuracy": 0.8654375374317169,
"step": 18
},
{
"epoch": 0.10410958904109589,
"grad_norm": 2.375,
"learning_rate": 1.9548022598870058e-05,
"loss": 1.0411,
"mean_token_accuracy": 0.8728566467761993,
"step": 19
},
{
"epoch": 0.1095890410958904,
"grad_norm": 2.390625,
"learning_rate": 1.9491525423728814e-05,
"loss": 1.0268,
"mean_token_accuracy": 0.8728775084018707,
"step": 20
},
{
"epoch": 0.11506849315068493,
"grad_norm": 2.515625,
"learning_rate": 1.9435028248587574e-05,
"loss": 1.0674,
"mean_token_accuracy": 0.8690488934516907,
"step": 21
},
{
"epoch": 0.12054794520547946,
"grad_norm": 2.59375,
"learning_rate": 1.937853107344633e-05,
"loss": 0.9995,
"mean_token_accuracy": 0.8791483938694,
"step": 22
},
{
"epoch": 0.12602739726027398,
"grad_norm": 2.328125,
"learning_rate": 1.9322033898305087e-05,
"loss": 1.0724,
"mean_token_accuracy": 0.8647687137126923,
"step": 23
},
{
"epoch": 0.13150684931506848,
"grad_norm": 2.6875,
"learning_rate": 1.9265536723163844e-05,
"loss": 1.0266,
"mean_token_accuracy": 0.8738211989402771,
"step": 24
},
{
"epoch": 0.136986301369863,
"grad_norm": 2.046875,
"learning_rate": 1.92090395480226e-05,
"loss": 1.0761,
"mean_token_accuracy": 0.8665441274642944,
"step": 25
},
{
"epoch": 0.14246575342465753,
"grad_norm": 2.1875,
"learning_rate": 1.9152542372881357e-05,
"loss": 0.9683,
"mean_token_accuracy": 0.8777200281620026,
"step": 26
},
{
"epoch": 0.14794520547945206,
"grad_norm": 1.984375,
"learning_rate": 1.9096045197740114e-05,
"loss": 0.9868,
"mean_token_accuracy": 0.8754189312458038,
"step": 27
},
{
"epoch": 0.15342465753424658,
"grad_norm": 2.140625,
"learning_rate": 1.9039548022598874e-05,
"loss": 1.0158,
"mean_token_accuracy": 0.8761765658855438,
"step": 28
},
{
"epoch": 0.1589041095890411,
"grad_norm": 1.8046875,
"learning_rate": 1.898305084745763e-05,
"loss": 0.8789,
"mean_token_accuracy": 0.8906848430633545,
"step": 29
},
{
"epoch": 0.1643835616438356,
"grad_norm": 1.84375,
"learning_rate": 1.8926553672316387e-05,
"loss": 0.9747,
"mean_token_accuracy": 0.8772170841693878,
"step": 30
},
{
"epoch": 0.16986301369863013,
"grad_norm": 1.9296875,
"learning_rate": 1.8870056497175144e-05,
"loss": 0.9559,
"mean_token_accuracy": 0.879201203584671,
"step": 31
},
{
"epoch": 0.17534246575342466,
"grad_norm": 2.046875,
"learning_rate": 1.88135593220339e-05,
"loss": 0.9505,
"mean_token_accuracy": 0.8790359497070312,
"step": 32
},
{
"epoch": 0.18082191780821918,
"grad_norm": 1.875,
"learning_rate": 1.8757062146892657e-05,
"loss": 0.9744,
"mean_token_accuracy": 0.8767738342285156,
"step": 33
},
{
"epoch": 0.1863013698630137,
"grad_norm": 1.8046875,
"learning_rate": 1.8700564971751413e-05,
"loss": 0.9881,
"mean_token_accuracy": 0.8765853643417358,
"step": 34
},
{
"epoch": 0.1917808219178082,
"grad_norm": 1.78125,
"learning_rate": 1.864406779661017e-05,
"loss": 0.9423,
"mean_token_accuracy": 0.8802583813667297,
"step": 35
},
{
"epoch": 0.19726027397260273,
"grad_norm": 1.9296875,
"learning_rate": 1.858757062146893e-05,
"loss": 0.9277,
"mean_token_accuracy": 0.8835765421390533,
"step": 36
},
{
"epoch": 0.20273972602739726,
"grad_norm": 1.8125,
"learning_rate": 1.8531073446327686e-05,
"loss": 0.9619,
"mean_token_accuracy": 0.8785396814346313,
"step": 37
},
{
"epoch": 0.20821917808219179,
"grad_norm": 1.8046875,
"learning_rate": 1.8474576271186443e-05,
"loss": 0.9415,
"mean_token_accuracy": 0.8792627155780792,
"step": 38
},
{
"epoch": 0.2136986301369863,
"grad_norm": 1.890625,
"learning_rate": 1.84180790960452e-05,
"loss": 1.1318,
"mean_token_accuracy": 0.8636864423751831,
"step": 39
},
{
"epoch": 0.2191780821917808,
"grad_norm": 1.828125,
"learning_rate": 1.8361581920903956e-05,
"loss": 0.9542,
"mean_token_accuracy": 0.8771731555461884,
"step": 40
},
{
"epoch": 0.22465753424657534,
"grad_norm": 1.921875,
"learning_rate": 1.8305084745762713e-05,
"loss": 0.9122,
"mean_token_accuracy": 0.881610095500946,
"step": 41
},
{
"epoch": 0.23013698630136986,
"grad_norm": 1.78125,
"learning_rate": 1.824858757062147e-05,
"loss": 0.8768,
"mean_token_accuracy": 0.8854578733444214,
"step": 42
},
{
"epoch": 0.2356164383561644,
"grad_norm": 1.75,
"learning_rate": 1.8192090395480226e-05,
"loss": 0.8985,
"mean_token_accuracy": 0.8832479119300842,
"step": 43
},
{
"epoch": 0.2410958904109589,
"grad_norm": 1.71875,
"learning_rate": 1.8135593220338986e-05,
"loss": 0.9174,
"mean_token_accuracy": 0.88409024477005,
"step": 44
},
{
"epoch": 0.2465753424657534,
"grad_norm": 1.9375,
"learning_rate": 1.8079096045197743e-05,
"loss": 0.9281,
"mean_token_accuracy": 0.8829980492591858,
"step": 45
},
{
"epoch": 0.25205479452054796,
"grad_norm": 1.796875,
"learning_rate": 1.80225988700565e-05,
"loss": 0.9484,
"mean_token_accuracy": 0.8815726637840271,
"step": 46
},
{
"epoch": 0.25753424657534246,
"grad_norm": 1.796875,
"learning_rate": 1.7966101694915256e-05,
"loss": 0.9032,
"mean_token_accuracy": 0.8823907375335693,
"step": 47
},
{
"epoch": 0.26301369863013696,
"grad_norm": 1.7421875,
"learning_rate": 1.7909604519774012e-05,
"loss": 0.954,
"mean_token_accuracy": 0.8775873780250549,
"step": 48
},
{
"epoch": 0.2684931506849315,
"grad_norm": 1.8515625,
"learning_rate": 1.785310734463277e-05,
"loss": 0.8923,
"mean_token_accuracy": 0.8820536136627197,
"step": 49
},
{
"epoch": 0.273972602739726,
"grad_norm": 1.78125,
"learning_rate": 1.7796610169491526e-05,
"loss": 0.9235,
"mean_token_accuracy": 0.8839817941188812,
"step": 50
},
{
"epoch": 0.27945205479452057,
"grad_norm": 1.796875,
"learning_rate": 1.7740112994350286e-05,
"loss": 0.9095,
"mean_token_accuracy": 0.8809832036495209,
"step": 51
},
{
"epoch": 0.28493150684931506,
"grad_norm": 1.6484375,
"learning_rate": 1.7683615819209042e-05,
"loss": 0.8763,
"mean_token_accuracy": 0.8849304616451263,
"step": 52
},
{
"epoch": 0.29041095890410956,
"grad_norm": 1.78125,
"learning_rate": 1.76271186440678e-05,
"loss": 0.9113,
"mean_token_accuracy": 0.8810350298881531,
"step": 53
},
{
"epoch": 0.2958904109589041,
"grad_norm": 1.8359375,
"learning_rate": 1.7570621468926555e-05,
"loss": 0.9356,
"mean_token_accuracy": 0.8820917904376984,
"step": 54
},
{
"epoch": 0.3013698630136986,
"grad_norm": 1.765625,
"learning_rate": 1.7514124293785312e-05,
"loss": 0.9315,
"mean_token_accuracy": 0.8792105317115784,
"step": 55
},
{
"epoch": 0.30684931506849317,
"grad_norm": 1.796875,
"learning_rate": 1.745762711864407e-05,
"loss": 0.9129,
"mean_token_accuracy": 0.884558379650116,
"step": 56
},
{
"epoch": 0.31232876712328766,
"grad_norm": 1.6953125,
"learning_rate": 1.7401129943502825e-05,
"loss": 0.9175,
"mean_token_accuracy": 0.8841174840927124,
"step": 57
},
{
"epoch": 0.3178082191780822,
"grad_norm": 1.75,
"learning_rate": 1.734463276836158e-05,
"loss": 0.9441,
"mean_token_accuracy": 0.8799735009670258,
"step": 58
},
{
"epoch": 0.3232876712328767,
"grad_norm": 1.671875,
"learning_rate": 1.728813559322034e-05,
"loss": 0.8381,
"mean_token_accuracy": 0.8915592730045319,
"step": 59
},
{
"epoch": 0.3287671232876712,
"grad_norm": 1.75,
"learning_rate": 1.7231638418079098e-05,
"loss": 0.878,
"mean_token_accuracy": 0.8879745006561279,
"step": 60
},
{
"epoch": 0.33424657534246577,
"grad_norm": 1.796875,
"learning_rate": 1.7175141242937855e-05,
"loss": 0.894,
"mean_token_accuracy": 0.8848598599433899,
"step": 61
},
{
"epoch": 0.33972602739726027,
"grad_norm": 1.8125,
"learning_rate": 1.711864406779661e-05,
"loss": 0.9141,
"mean_token_accuracy": 0.882304459810257,
"step": 62
},
{
"epoch": 0.3452054794520548,
"grad_norm": 1.7265625,
"learning_rate": 1.7062146892655368e-05,
"loss": 0.9312,
"mean_token_accuracy": 0.8793206214904785,
"step": 63
},
{
"epoch": 0.3506849315068493,
"grad_norm": 1.6953125,
"learning_rate": 1.7005649717514125e-05,
"loss": 0.8844,
"mean_token_accuracy": 0.8880196511745453,
"step": 64
},
{
"epoch": 0.3561643835616438,
"grad_norm": 2.125,
"learning_rate": 1.694915254237288e-05,
"loss": 0.7884,
"mean_token_accuracy": 0.9000090658664703,
"step": 65
},
{
"epoch": 0.36164383561643837,
"grad_norm": 1.8359375,
"learning_rate": 1.689265536723164e-05,
"loss": 0.856,
"mean_token_accuracy": 0.8874339759349823,
"step": 66
},
{
"epoch": 0.36712328767123287,
"grad_norm": 1.9921875,
"learning_rate": 1.6836158192090398e-05,
"loss": 0.9166,
"mean_token_accuracy": 0.8831603229045868,
"step": 67
},
{
"epoch": 0.3726027397260274,
"grad_norm": 1.8359375,
"learning_rate": 1.6779661016949154e-05,
"loss": 0.9728,
"mean_token_accuracy": 0.8779590725898743,
"step": 68
},
{
"epoch": 0.3780821917808219,
"grad_norm": 1.8203125,
"learning_rate": 1.672316384180791e-05,
"loss": 0.9378,
"mean_token_accuracy": 0.8816089332103729,
"step": 69
},
{
"epoch": 0.3835616438356164,
"grad_norm": 1.796875,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.9222,
"mean_token_accuracy": 0.8854541778564453,
"step": 70
},
{
"epoch": 0.38904109589041097,
"grad_norm": 1.828125,
"learning_rate": 1.6610169491525424e-05,
"loss": 0.9116,
"mean_token_accuracy": 0.8790720105171204,
"step": 71
},
{
"epoch": 0.39452054794520547,
"grad_norm": 1.7890625,
"learning_rate": 1.655367231638418e-05,
"loss": 0.8784,
"mean_token_accuracy": 0.8863929808139801,
"step": 72
},
{
"epoch": 0.4,
"grad_norm": 1.7578125,
"learning_rate": 1.6497175141242937e-05,
"loss": 0.8976,
"mean_token_accuracy": 0.8833265900611877,
"step": 73
},
{
"epoch": 0.4054794520547945,
"grad_norm": 1.7421875,
"learning_rate": 1.6440677966101697e-05,
"loss": 0.8872,
"mean_token_accuracy": 0.8866287767887115,
"step": 74
},
{
"epoch": 0.410958904109589,
"grad_norm": 1.75,
"learning_rate": 1.6384180790960454e-05,
"loss": 0.9294,
"mean_token_accuracy": 0.8842478394508362,
"step": 75
},
{
"epoch": 0.41643835616438357,
"grad_norm": 1.6953125,
"learning_rate": 1.632768361581921e-05,
"loss": 0.8754,
"mean_token_accuracy": 0.887272983789444,
"step": 76
},
{
"epoch": 0.42191780821917807,
"grad_norm": 1.6953125,
"learning_rate": 1.6271186440677967e-05,
"loss": 0.8779,
"mean_token_accuracy": 0.8887339234352112,
"step": 77
},
{
"epoch": 0.4273972602739726,
"grad_norm": 1.7265625,
"learning_rate": 1.6214689265536724e-05,
"loss": 0.8863,
"mean_token_accuracy": 0.8841427862644196,
"step": 78
},
{
"epoch": 0.4328767123287671,
"grad_norm": 1.671875,
"learning_rate": 1.615819209039548e-05,
"loss": 0.8632,
"mean_token_accuracy": 0.8868177235126495,
"step": 79
},
{
"epoch": 0.4383561643835616,
"grad_norm": 1.8515625,
"learning_rate": 1.6101694915254237e-05,
"loss": 0.9242,
"mean_token_accuracy": 0.8839016258716583,
"step": 80
},
{
"epoch": 0.4438356164383562,
"grad_norm": 1.796875,
"learning_rate": 1.6045197740112997e-05,
"loss": 0.908,
"mean_token_accuracy": 0.8833514153957367,
"step": 81
},
{
"epoch": 0.44931506849315067,
"grad_norm": 1.796875,
"learning_rate": 1.5988700564971753e-05,
"loss": 0.8747,
"mean_token_accuracy": 0.8873944878578186,
"step": 82
},
{
"epoch": 0.4547945205479452,
"grad_norm": 1.75,
"learning_rate": 1.593220338983051e-05,
"loss": 0.9303,
"mean_token_accuracy": 0.8823438286781311,
"step": 83
},
{
"epoch": 0.4602739726027397,
"grad_norm": 1.6953125,
"learning_rate": 1.5875706214689266e-05,
"loss": 0.8441,
"mean_token_accuracy": 0.888810396194458,
"step": 84
},
{
"epoch": 0.4657534246575342,
"grad_norm": 1.734375,
"learning_rate": 1.5819209039548023e-05,
"loss": 0.8512,
"mean_token_accuracy": 0.8866813480854034,
"step": 85
},
{
"epoch": 0.4712328767123288,
"grad_norm": 1.7109375,
"learning_rate": 1.576271186440678e-05,
"loss": 0.8743,
"mean_token_accuracy": 0.8884658217430115,
"step": 86
},
{
"epoch": 0.4767123287671233,
"grad_norm": 1.703125,
"learning_rate": 1.5706214689265536e-05,
"loss": 0.8523,
"mean_token_accuracy": 0.8898887932300568,
"step": 87
},
{
"epoch": 0.4821917808219178,
"grad_norm": 1.734375,
"learning_rate": 1.5649717514124293e-05,
"loss": 0.8947,
"mean_token_accuracy": 0.8856352865695953,
"step": 88
},
{
"epoch": 0.4876712328767123,
"grad_norm": 1.734375,
"learning_rate": 1.5593220338983053e-05,
"loss": 0.8895,
"mean_token_accuracy": 0.8845047950744629,
"step": 89
},
{
"epoch": 0.4931506849315068,
"grad_norm": 1.7109375,
"learning_rate": 1.553672316384181e-05,
"loss": 0.8215,
"mean_token_accuracy": 0.8922451138496399,
"step": 90
},
{
"epoch": 0.4986301369863014,
"grad_norm": 1.765625,
"learning_rate": 1.5480225988700566e-05,
"loss": 0.8677,
"mean_token_accuracy": 0.8868069648742676,
"step": 91
},
{
"epoch": 0.5041095890410959,
"grad_norm": 1.921875,
"learning_rate": 1.5423728813559326e-05,
"loss": 0.8843,
"mean_token_accuracy": 0.8863621056079865,
"step": 92
},
{
"epoch": 0.5095890410958904,
"grad_norm": 1.828125,
"learning_rate": 1.536723163841808e-05,
"loss": 0.8593,
"mean_token_accuracy": 0.8890643417835236,
"step": 93
},
{
"epoch": 0.5150684931506849,
"grad_norm": 1.7734375,
"learning_rate": 1.5310734463276836e-05,
"loss": 0.8961,
"mean_token_accuracy": 0.885326474905014,
"step": 94
},
{
"epoch": 0.5205479452054794,
"grad_norm": 1.65625,
"learning_rate": 1.5254237288135594e-05,
"loss": 0.8115,
"mean_token_accuracy": 0.8935891091823578,
"step": 95
},
{
"epoch": 0.5260273972602739,
"grad_norm": 1.71875,
"learning_rate": 1.5197740112994352e-05,
"loss": 0.8157,
"mean_token_accuracy": 0.8906770646572113,
"step": 96
},
{
"epoch": 0.5315068493150685,
"grad_norm": 1.703125,
"learning_rate": 1.5141242937853109e-05,
"loss": 0.9447,
"mean_token_accuracy": 0.8807980120182037,
"step": 97
},
{
"epoch": 0.536986301369863,
"grad_norm": 1.6484375,
"learning_rate": 1.5084745762711865e-05,
"loss": 0.8652,
"mean_token_accuracy": 0.886509358882904,
"step": 98
},
{
"epoch": 0.5424657534246575,
"grad_norm": 1.7578125,
"learning_rate": 1.5028248587570622e-05,
"loss": 0.8307,
"mean_token_accuracy": 0.8903360962867737,
"step": 99
},
{
"epoch": 0.547945205479452,
"grad_norm": 6.25,
"learning_rate": 1.497175141242938e-05,
"loss": 0.918,
"mean_token_accuracy": 0.8805244266986847,
"step": 100
},
{
"epoch": 0.5534246575342465,
"grad_norm": 1.75,
"learning_rate": 1.4915254237288137e-05,
"loss": 0.8078,
"mean_token_accuracy": 0.8928936421871185,
"step": 101
},
{
"epoch": 0.5589041095890411,
"grad_norm": 1.7109375,
"learning_rate": 1.4858757062146894e-05,
"loss": 0.8592,
"mean_token_accuracy": 0.889348953962326,
"step": 102
},
{
"epoch": 0.5643835616438356,
"grad_norm": 1.6953125,
"learning_rate": 1.480225988700565e-05,
"loss": 0.7756,
"mean_token_accuracy": 0.8963395059108734,
"step": 103
},
{
"epoch": 0.5698630136986301,
"grad_norm": 1.7890625,
"learning_rate": 1.4745762711864408e-05,
"loss": 0.8789,
"mean_token_accuracy": 0.886920839548111,
"step": 104
},
{
"epoch": 0.5753424657534246,
"grad_norm": 1.9140625,
"learning_rate": 1.4689265536723165e-05,
"loss": 0.8967,
"mean_token_accuracy": 0.8798324763774872,
"step": 105
},
{
"epoch": 0.5808219178082191,
"grad_norm": 1.6953125,
"learning_rate": 1.4632768361581922e-05,
"loss": 0.843,
"mean_token_accuracy": 0.8907739818096161,
"step": 106
},
{
"epoch": 0.5863013698630137,
"grad_norm": 1.78125,
"learning_rate": 1.4576271186440678e-05,
"loss": 0.8541,
"mean_token_accuracy": 0.8892745971679688,
"step": 107
},
{
"epoch": 0.5917808219178082,
"grad_norm": 1.8046875,
"learning_rate": 1.4519774011299436e-05,
"loss": 0.8462,
"mean_token_accuracy": 0.8868447542190552,
"step": 108
},
{
"epoch": 0.5972602739726027,
"grad_norm": 1.703125,
"learning_rate": 1.4463276836158193e-05,
"loss": 0.8057,
"mean_token_accuracy": 0.8942738175392151,
"step": 109
},
{
"epoch": 0.6027397260273972,
"grad_norm": 1.78125,
"learning_rate": 1.440677966101695e-05,
"loss": 0.9267,
"mean_token_accuracy": 0.879928857088089,
"step": 110
},
{
"epoch": 0.6082191780821918,
"grad_norm": 1.8046875,
"learning_rate": 1.4350282485875708e-05,
"loss": 0.8554,
"mean_token_accuracy": 0.8891916871070862,
"step": 111
},
{
"epoch": 0.6136986301369863,
"grad_norm": 1.7265625,
"learning_rate": 1.4293785310734465e-05,
"loss": 0.8786,
"mean_token_accuracy": 0.8868878483772278,
"step": 112
},
{
"epoch": 0.6191780821917808,
"grad_norm": 1.765625,
"learning_rate": 1.4237288135593221e-05,
"loss": 0.8238,
"mean_token_accuracy": 0.8923602402210236,
"step": 113
},
{
"epoch": 0.6246575342465753,
"grad_norm": 1.65625,
"learning_rate": 1.4180790960451978e-05,
"loss": 0.7824,
"mean_token_accuracy": 0.898999810218811,
"step": 114
},
{
"epoch": 0.6301369863013698,
"grad_norm": 1.8359375,
"learning_rate": 1.4124293785310736e-05,
"loss": 0.8753,
"mean_token_accuracy": 0.8899329602718353,
"step": 115
},
{
"epoch": 0.6356164383561644,
"grad_norm": 1.7578125,
"learning_rate": 1.4067796610169493e-05,
"loss": 0.8015,
"mean_token_accuracy": 0.8944090604782104,
"step": 116
},
{
"epoch": 0.6410958904109589,
"grad_norm": 1.7265625,
"learning_rate": 1.4011299435028249e-05,
"loss": 0.8078,
"mean_token_accuracy": 0.8907613754272461,
"step": 117
},
{
"epoch": 0.6465753424657534,
"grad_norm": 1.84375,
"learning_rate": 1.3954802259887006e-05,
"loss": 0.8241,
"mean_token_accuracy": 0.8921016752719879,
"step": 118
},
{
"epoch": 0.6520547945205479,
"grad_norm": 1.7109375,
"learning_rate": 1.3898305084745764e-05,
"loss": 0.8554,
"mean_token_accuracy": 0.8893947899341583,
"step": 119
},
{
"epoch": 0.6575342465753424,
"grad_norm": 1.7265625,
"learning_rate": 1.384180790960452e-05,
"loss": 0.7452,
"mean_token_accuracy": 0.9025129973888397,
"step": 120
},
{
"epoch": 0.663013698630137,
"grad_norm": 1.8125,
"learning_rate": 1.3785310734463277e-05,
"loss": 0.8917,
"mean_token_accuracy": 0.8841381669044495,
"step": 121
},
{
"epoch": 0.6684931506849315,
"grad_norm": 1.6953125,
"learning_rate": 1.3728813559322034e-05,
"loss": 0.7997,
"mean_token_accuracy": 0.8949976563453674,
"step": 122
},
{
"epoch": 0.673972602739726,
"grad_norm": 1.65625,
"learning_rate": 1.3672316384180792e-05,
"loss": 0.77,
"mean_token_accuracy": 0.8981111347675323,
"step": 123
},
{
"epoch": 0.6794520547945205,
"grad_norm": 1.8828125,
"learning_rate": 1.3615819209039549e-05,
"loss": 0.8735,
"mean_token_accuracy": 0.88605797290802,
"step": 124
},
{
"epoch": 0.684931506849315,
"grad_norm": 1.7890625,
"learning_rate": 1.3559322033898305e-05,
"loss": 0.8578,
"mean_token_accuracy": 0.8875480592250824,
"step": 125
},
{
"epoch": 0.6904109589041096,
"grad_norm": 1.7890625,
"learning_rate": 1.3502824858757064e-05,
"loss": 0.842,
"mean_token_accuracy": 0.8861750662326813,
"step": 126
},
{
"epoch": 0.6958904109589041,
"grad_norm": 1.75,
"learning_rate": 1.344632768361582e-05,
"loss": 0.7661,
"mean_token_accuracy": 0.8968884646892548,
"step": 127
},
{
"epoch": 0.7013698630136986,
"grad_norm": 1.8203125,
"learning_rate": 1.3389830508474577e-05,
"loss": 0.8218,
"mean_token_accuracy": 0.89415243268013,
"step": 128
},
{
"epoch": 0.7068493150684931,
"grad_norm": 1.8046875,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.84,
"mean_token_accuracy": 0.8905225694179535,
"step": 129
},
{
"epoch": 0.7123287671232876,
"grad_norm": 1.75,
"learning_rate": 1.3276836158192092e-05,
"loss": 0.7886,
"mean_token_accuracy": 0.8948017656803131,
"step": 130
},
{
"epoch": 0.7178082191780822,
"grad_norm": 1.7265625,
"learning_rate": 1.3220338983050848e-05,
"loss": 0.7962,
"mean_token_accuracy": 0.8944378197193146,
"step": 131
},
{
"epoch": 0.7232876712328767,
"grad_norm": 1.6875,
"learning_rate": 1.3163841807909605e-05,
"loss": 0.7744,
"mean_token_accuracy": 0.8996008336544037,
"step": 132
},
{
"epoch": 0.7287671232876712,
"grad_norm": 1.8984375,
"learning_rate": 1.3107344632768361e-05,
"loss": 0.8511,
"mean_token_accuracy": 0.8891786336898804,
"step": 133
},
{
"epoch": 0.7342465753424657,
"grad_norm": 1.7265625,
"learning_rate": 1.305084745762712e-05,
"loss": 0.7581,
"mean_token_accuracy": 0.8997050821781158,
"step": 134
},
{
"epoch": 0.7397260273972602,
"grad_norm": 1.7265625,
"learning_rate": 1.2994350282485876e-05,
"loss": 0.8179,
"mean_token_accuracy": 0.889685183763504,
"step": 135
},
{
"epoch": 0.7452054794520548,
"grad_norm": 1.71875,
"learning_rate": 1.2937853107344633e-05,
"loss": 0.7822,
"mean_token_accuracy": 0.8955155909061432,
"step": 136
},
{
"epoch": 0.7506849315068493,
"grad_norm": 1.7890625,
"learning_rate": 1.288135593220339e-05,
"loss": 0.8004,
"mean_token_accuracy": 0.8967688679695129,
"step": 137
},
{
"epoch": 0.7561643835616438,
"grad_norm": 1.71875,
"learning_rate": 1.282485875706215e-05,
"loss": 0.7105,
"mean_token_accuracy": 0.903778463602066,
"step": 138
},
{
"epoch": 0.7616438356164383,
"grad_norm": 1.9609375,
"learning_rate": 1.2768361581920904e-05,
"loss": 0.8719,
"mean_token_accuracy": 0.8871429562568665,
"step": 139
},
{
"epoch": 0.7671232876712328,
"grad_norm": 1.8203125,
"learning_rate": 1.2711864406779661e-05,
"loss": 0.8294,
"mean_token_accuracy": 0.8909505307674408,
"step": 140
},
{
"epoch": 0.7726027397260274,
"grad_norm": 1.7265625,
"learning_rate": 1.265536723163842e-05,
"loss": 0.8538,
"mean_token_accuracy": 0.8904447853565216,
"step": 141
},
{
"epoch": 0.7780821917808219,
"grad_norm": 1.75,
"learning_rate": 1.2598870056497177e-05,
"loss": 0.7883,
"mean_token_accuracy": 0.8976991772651672,
"step": 142
},
{
"epoch": 0.7835616438356164,
"grad_norm": 1.6953125,
"learning_rate": 1.2542372881355932e-05,
"loss": 0.8273,
"mean_token_accuracy": 0.8918053209781647,
"step": 143
},
{
"epoch": 0.7890410958904109,
"grad_norm": 1.765625,
"learning_rate": 1.2485875706214689e-05,
"loss": 0.8064,
"mean_token_accuracy": 0.8940227329730988,
"step": 144
},
{
"epoch": 0.7945205479452054,
"grad_norm": 1.7109375,
"learning_rate": 1.2429378531073449e-05,
"loss": 0.8038,
"mean_token_accuracy": 0.893746942281723,
"step": 145
},
{
"epoch": 0.8,
"grad_norm": 1.6953125,
"learning_rate": 1.2372881355932205e-05,
"loss": 0.7326,
"mean_token_accuracy": 0.9016251862049103,
"step": 146
},
{
"epoch": 0.8054794520547945,
"grad_norm": 1.765625,
"learning_rate": 1.2316384180790962e-05,
"loss": 0.8565,
"mean_token_accuracy": 0.8860533237457275,
"step": 147
},
{
"epoch": 0.810958904109589,
"grad_norm": 1.7109375,
"learning_rate": 1.2259887005649717e-05,
"loss": 0.814,
"mean_token_accuracy": 0.8916102349758148,
"step": 148
},
{
"epoch": 0.8164383561643835,
"grad_norm": 1.734375,
"learning_rate": 1.2203389830508477e-05,
"loss": 0.826,
"mean_token_accuracy": 0.8912359774112701,
"step": 149
},
{
"epoch": 0.821917808219178,
"grad_norm": 1.7890625,
"learning_rate": 1.2146892655367234e-05,
"loss": 0.7332,
"mean_token_accuracy": 0.901479035615921,
"step": 150
},
{
"epoch": 0.8273972602739726,
"grad_norm": 1.7890625,
"learning_rate": 1.209039548022599e-05,
"loss": 0.8137,
"mean_token_accuracy": 0.8948657810688019,
"step": 151
},
{
"epoch": 0.8328767123287671,
"grad_norm": 1.78125,
"learning_rate": 1.2033898305084745e-05,
"loss": 0.7541,
"mean_token_accuracy": 0.8997522294521332,
"step": 152
},
{
"epoch": 0.8383561643835616,
"grad_norm": 1.796875,
"learning_rate": 1.1977401129943505e-05,
"loss": 0.7506,
"mean_token_accuracy": 0.9004031419754028,
"step": 153
},
{
"epoch": 0.8438356164383561,
"grad_norm": 1.9140625,
"learning_rate": 1.1920903954802262e-05,
"loss": 0.8034,
"mean_token_accuracy": 0.8948807418346405,
"step": 154
},
{
"epoch": 0.8493150684931506,
"grad_norm": 1.7109375,
"learning_rate": 1.1864406779661018e-05,
"loss": 0.8141,
"mean_token_accuracy": 0.8910720944404602,
"step": 155
},
{
"epoch": 0.8547945205479452,
"grad_norm": 1.8828125,
"learning_rate": 1.1807909604519776e-05,
"loss": 0.8654,
"mean_token_accuracy": 0.8876812160015106,
"step": 156
},
{
"epoch": 0.8602739726027397,
"grad_norm": 1.7109375,
"learning_rate": 1.1751412429378533e-05,
"loss": 0.7872,
"mean_token_accuracy": 0.8944331705570221,
"step": 157
},
{
"epoch": 0.8657534246575342,
"grad_norm": 1.7265625,
"learning_rate": 1.169491525423729e-05,
"loss": 0.8064,
"mean_token_accuracy": 0.8923972845077515,
"step": 158
},
{
"epoch": 0.8712328767123287,
"grad_norm": 1.796875,
"learning_rate": 1.1638418079096046e-05,
"loss": 0.8012,
"mean_token_accuracy": 0.8913676142692566,
"step": 159
},
{
"epoch": 0.8767123287671232,
"grad_norm": 1.796875,
"learning_rate": 1.1581920903954804e-05,
"loss": 0.7828,
"mean_token_accuracy": 0.8984484672546387,
"step": 160
},
{
"epoch": 0.8821917808219178,
"grad_norm": 1.6953125,
"learning_rate": 1.1525423728813561e-05,
"loss": 0.7947,
"mean_token_accuracy": 0.8937750458717346,
"step": 161
},
{
"epoch": 0.8876712328767123,
"grad_norm": 1.875,
"learning_rate": 1.1468926553672318e-05,
"loss": 0.8615,
"mean_token_accuracy": 0.8844136893749237,
"step": 162
},
{
"epoch": 0.8931506849315068,
"grad_norm": 1.765625,
"learning_rate": 1.1412429378531074e-05,
"loss": 0.7851,
"mean_token_accuracy": 0.8954149484634399,
"step": 163
},
{
"epoch": 0.8986301369863013,
"grad_norm": 1.859375,
"learning_rate": 1.1355932203389833e-05,
"loss": 0.8646,
"mean_token_accuracy": 0.8902953565120697,
"step": 164
},
{
"epoch": 0.9041095890410958,
"grad_norm": 1.9140625,
"learning_rate": 1.1299435028248589e-05,
"loss": 0.8723,
"mean_token_accuracy": 0.8843137621879578,
"step": 165
},
{
"epoch": 0.9095890410958904,
"grad_norm": 1.7421875,
"learning_rate": 1.1242937853107346e-05,
"loss": 0.8073,
"mean_token_accuracy": 0.8933804333209991,
"step": 166
},
{
"epoch": 0.915068493150685,
"grad_norm": 1.7734375,
"learning_rate": 1.1186440677966102e-05,
"loss": 0.8571,
"mean_token_accuracy": 0.8872124254703522,
"step": 167
},
{
"epoch": 0.9205479452054794,
"grad_norm": 1.6953125,
"learning_rate": 1.112994350282486e-05,
"loss": 0.7582,
"mean_token_accuracy": 0.8991103768348694,
"step": 168
},
{
"epoch": 0.9260273972602739,
"grad_norm": 1.875,
"learning_rate": 1.1073446327683617e-05,
"loss": 0.7952,
"mean_token_accuracy": 0.8964874744415283,
"step": 169
},
{
"epoch": 0.9315068493150684,
"grad_norm": 1.6875,
"learning_rate": 1.1016949152542374e-05,
"loss": 0.7878,
"mean_token_accuracy": 0.894893229007721,
"step": 170
},
{
"epoch": 0.936986301369863,
"grad_norm": 1.828125,
"learning_rate": 1.096045197740113e-05,
"loss": 0.8082,
"mean_token_accuracy": 0.8924291729927063,
"step": 171
},
{
"epoch": 0.9424657534246575,
"grad_norm": 1.828125,
"learning_rate": 1.0903954802259889e-05,
"loss": 0.8655,
"mean_token_accuracy": 0.8870832324028015,
"step": 172
},
{
"epoch": 0.947945205479452,
"grad_norm": 1.9296875,
"learning_rate": 1.0847457627118645e-05,
"loss": 0.7696,
"mean_token_accuracy": 0.8987171053886414,
"step": 173
},
{
"epoch": 0.9534246575342465,
"grad_norm": 1.8984375,
"learning_rate": 1.0790960451977402e-05,
"loss": 0.7408,
"mean_token_accuracy": 0.9013040661811829,
"step": 174
},
{
"epoch": 0.958904109589041,
"grad_norm": 1.953125,
"learning_rate": 1.073446327683616e-05,
"loss": 0.8175,
"mean_token_accuracy": 0.8909050524234772,
"step": 175
},
{
"epoch": 0.9643835616438357,
"grad_norm": 1.7421875,
"learning_rate": 1.0677966101694917e-05,
"loss": 0.7626,
"mean_token_accuracy": 0.895355612039566,
"step": 176
},
{
"epoch": 0.9698630136986301,
"grad_norm": 2.453125,
"learning_rate": 1.0621468926553673e-05,
"loss": 0.8223,
"mean_token_accuracy": 0.8929450511932373,
"step": 177
},
{
"epoch": 0.9753424657534246,
"grad_norm": 1.9453125,
"learning_rate": 1.056497175141243e-05,
"loss": 0.8457,
"mean_token_accuracy": 0.8877719342708588,
"step": 178
},
{
"epoch": 0.9808219178082191,
"grad_norm": 1.7890625,
"learning_rate": 1.0508474576271188e-05,
"loss": 0.8065,
"mean_token_accuracy": 0.8943466544151306,
"step": 179
},
{
"epoch": 0.9863013698630136,
"grad_norm": 1.765625,
"learning_rate": 1.0451977401129945e-05,
"loss": 0.7862,
"mean_token_accuracy": 0.8981420993804932,
"step": 180
},
{
"epoch": 0.9917808219178083,
"grad_norm": 1.6796875,
"learning_rate": 1.0395480225988701e-05,
"loss": 0.7166,
"mean_token_accuracy": 0.9056210517883301,
"step": 181
},
{
"epoch": 0.9972602739726028,
"grad_norm": 1.7734375,
"learning_rate": 1.0338983050847458e-05,
"loss": 0.716,
"mean_token_accuracy": 0.9040741622447968,
"step": 182
},
{
"epoch": 1.0,
"grad_norm": 1.203125,
"learning_rate": 1.0282485875706216e-05,
"loss": 0.366,
"mean_token_accuracy": 0.8993819952011108,
"step": 183
},
{
"epoch": 1.0054794520547945,
"grad_norm": 1.96875,
"learning_rate": 1.0225988700564973e-05,
"loss": 0.6589,
"mean_token_accuracy": 0.9100659489631653,
"step": 184
},
{
"epoch": 1.010958904109589,
"grad_norm": 1.921875,
"learning_rate": 1.016949152542373e-05,
"loss": 0.7632,
"mean_token_accuracy": 0.8995705246925354,
"step": 185
},
{
"epoch": 1.0164383561643835,
"grad_norm": 1.7890625,
"learning_rate": 1.0112994350282486e-05,
"loss": 0.7087,
"mean_token_accuracy": 0.9034123718738556,
"step": 186
},
{
"epoch": 1.021917808219178,
"grad_norm": 1.8125,
"learning_rate": 1.0056497175141244e-05,
"loss": 0.7332,
"mean_token_accuracy": 0.8997578024864197,
"step": 187
},
{
"epoch": 1.0273972602739727,
"grad_norm": 1.96875,
"learning_rate": 1e-05,
"loss": 0.7192,
"mean_token_accuracy": 0.9052003026008606,
"step": 188
},
{
"epoch": 1.0328767123287672,
"grad_norm": 1.8828125,
"learning_rate": 9.943502824858759e-06,
"loss": 0.6996,
"mean_token_accuracy": 0.9051499664783478,
"step": 189
},
{
"epoch": 1.0383561643835617,
"grad_norm": 1.9296875,
"learning_rate": 9.887005649717516e-06,
"loss": 0.6603,
"mean_token_accuracy": 0.9084216058254242,
"step": 190
},
{
"epoch": 1.0438356164383562,
"grad_norm": 2.09375,
"learning_rate": 9.830508474576272e-06,
"loss": 0.6934,
"mean_token_accuracy": 0.905331015586853,
"step": 191
},
{
"epoch": 1.0493150684931507,
"grad_norm": 1.9609375,
"learning_rate": 9.774011299435029e-06,
"loss": 0.6852,
"mean_token_accuracy": 0.907427966594696,
"step": 192
},
{
"epoch": 1.0547945205479452,
"grad_norm": 1.8671875,
"learning_rate": 9.717514124293787e-06,
"loss": 0.6364,
"mean_token_accuracy": 0.9119535982608795,
"step": 193
},
{
"epoch": 1.0602739726027397,
"grad_norm": 1.7890625,
"learning_rate": 9.661016949152544e-06,
"loss": 0.7319,
"mean_token_accuracy": 0.902245968580246,
"step": 194
},
{
"epoch": 1.0657534246575342,
"grad_norm": 1.84375,
"learning_rate": 9.6045197740113e-06,
"loss": 0.7186,
"mean_token_accuracy": 0.9031082093715668,
"step": 195
},
{
"epoch": 1.0712328767123287,
"grad_norm": 1.921875,
"learning_rate": 9.548022598870057e-06,
"loss": 0.7056,
"mean_token_accuracy": 0.9083144962787628,
"step": 196
},
{
"epoch": 1.0767123287671232,
"grad_norm": 1.9296875,
"learning_rate": 9.491525423728815e-06,
"loss": 0.6955,
"mean_token_accuracy": 0.9058070778846741,
"step": 197
},
{
"epoch": 1.0821917808219177,
"grad_norm": 1.8203125,
"learning_rate": 9.435028248587572e-06,
"loss": 0.658,
"mean_token_accuracy": 0.9093527495861053,
"step": 198
},
{
"epoch": 1.0876712328767124,
"grad_norm": 1.8203125,
"learning_rate": 9.378531073446328e-06,
"loss": 0.6913,
"mean_token_accuracy": 0.907717764377594,
"step": 199
},
{
"epoch": 1.093150684931507,
"grad_norm": 1.7890625,
"learning_rate": 9.322033898305085e-06,
"loss": 0.6677,
"mean_token_accuracy": 0.9069054424762726,
"step": 200
},
{
"epoch": 1.0986301369863014,
"grad_norm": 1.84375,
"learning_rate": 9.265536723163843e-06,
"loss": 0.6996,
"mean_token_accuracy": 0.9076306223869324,
"step": 201
},
{
"epoch": 1.104109589041096,
"grad_norm": 1.8359375,
"learning_rate": 9.2090395480226e-06,
"loss": 0.7397,
"mean_token_accuracy": 0.8992961049079895,
"step": 202
},
{
"epoch": 1.1095890410958904,
"grad_norm": 1.7890625,
"learning_rate": 9.152542372881356e-06,
"loss": 0.7211,
"mean_token_accuracy": 0.9030121862888336,
"step": 203
},
{
"epoch": 1.115068493150685,
"grad_norm": 2.359375,
"learning_rate": 9.096045197740113e-06,
"loss": 0.7143,
"mean_token_accuracy": 0.9033773839473724,
"step": 204
},
{
"epoch": 1.1205479452054794,
"grad_norm": 2.03125,
"learning_rate": 9.039548022598871e-06,
"loss": 0.7024,
"mean_token_accuracy": 0.905937910079956,
"step": 205
},
{
"epoch": 1.126027397260274,
"grad_norm": 1.953125,
"learning_rate": 8.983050847457628e-06,
"loss": 0.671,
"mean_token_accuracy": 0.9082026183605194,
"step": 206
},
{
"epoch": 1.1315068493150684,
"grad_norm": 1.875,
"learning_rate": 8.926553672316384e-06,
"loss": 0.713,
"mean_token_accuracy": 0.903482049703598,
"step": 207
},
{
"epoch": 1.1369863013698631,
"grad_norm": 1.765625,
"learning_rate": 8.870056497175143e-06,
"loss": 0.7127,
"mean_token_accuracy": 0.9049177765846252,
"step": 208
},
{
"epoch": 1.1424657534246576,
"grad_norm": 1.953125,
"learning_rate": 8.8135593220339e-06,
"loss": 0.7038,
"mean_token_accuracy": 0.9026915431022644,
"step": 209
},
{
"epoch": 1.1479452054794521,
"grad_norm": 1.9609375,
"learning_rate": 8.757062146892656e-06,
"loss": 0.7828,
"mean_token_accuracy": 0.8936010599136353,
"step": 210
},
{
"epoch": 1.1534246575342466,
"grad_norm": 1.8203125,
"learning_rate": 8.700564971751413e-06,
"loss": 0.6303,
"mean_token_accuracy": 0.9128701090812683,
"step": 211
},
{
"epoch": 1.158904109589041,
"grad_norm": 1.8359375,
"learning_rate": 8.64406779661017e-06,
"loss": 0.6587,
"mean_token_accuracy": 0.9105681478977203,
"step": 212
},
{
"epoch": 1.1643835616438356,
"grad_norm": 1.8359375,
"learning_rate": 8.587570621468927e-06,
"loss": 0.6522,
"mean_token_accuracy": 0.9126417934894562,
"step": 213
},
{
"epoch": 1.16986301369863,
"grad_norm": 1.921875,
"learning_rate": 8.531073446327684e-06,
"loss": 0.6384,
"mean_token_accuracy": 0.9142054915428162,
"step": 214
},
{
"epoch": 1.1753424657534246,
"grad_norm": 1.84375,
"learning_rate": 8.47457627118644e-06,
"loss": 0.6443,
"mean_token_accuracy": 0.9108563363552094,
"step": 215
},
{
"epoch": 1.180821917808219,
"grad_norm": 1.8828125,
"learning_rate": 8.418079096045199e-06,
"loss": 0.6724,
"mean_token_accuracy": 0.9071345031261444,
"step": 216
},
{
"epoch": 1.1863013698630138,
"grad_norm": 1.984375,
"learning_rate": 8.361581920903955e-06,
"loss": 0.7334,
"mean_token_accuracy": 0.9011849761009216,
"step": 217
},
{
"epoch": 1.191780821917808,
"grad_norm": 1.890625,
"learning_rate": 8.305084745762712e-06,
"loss": 0.6668,
"mean_token_accuracy": 0.9077614843845367,
"step": 218
},
{
"epoch": 1.1972602739726028,
"grad_norm": 1.8828125,
"learning_rate": 8.248587570621469e-06,
"loss": 0.7365,
"mean_token_accuracy": 0.9032963216304779,
"step": 219
},
{
"epoch": 1.2027397260273973,
"grad_norm": 1.890625,
"learning_rate": 8.192090395480227e-06,
"loss": 0.6649,
"mean_token_accuracy": 0.9089824557304382,
"step": 220
},
{
"epoch": 1.2082191780821918,
"grad_norm": 1.7890625,
"learning_rate": 8.135593220338983e-06,
"loss": 0.6773,
"mean_token_accuracy": 0.9071504771709442,
"step": 221
},
{
"epoch": 1.2136986301369863,
"grad_norm": 1.9765625,
"learning_rate": 8.07909604519774e-06,
"loss": 0.69,
"mean_token_accuracy": 0.9098499119281769,
"step": 222
},
{
"epoch": 1.2191780821917808,
"grad_norm": 1.8671875,
"learning_rate": 8.022598870056498e-06,
"loss": 0.6418,
"mean_token_accuracy": 0.9123164415359497,
"step": 223
},
{
"epoch": 1.2246575342465753,
"grad_norm": 1.8359375,
"learning_rate": 7.966101694915255e-06,
"loss": 0.6542,
"mean_token_accuracy": 0.9111972451210022,
"step": 224
},
{
"epoch": 1.2301369863013698,
"grad_norm": 1.8828125,
"learning_rate": 7.909604519774012e-06,
"loss": 0.7119,
"mean_token_accuracy": 0.9055051803588867,
"step": 225
},
{
"epoch": 1.2356164383561643,
"grad_norm": 2.0,
"learning_rate": 7.853107344632768e-06,
"loss": 0.6844,
"mean_token_accuracy": 0.9083731472492218,
"step": 226
},
{
"epoch": 1.2410958904109588,
"grad_norm": 1.9296875,
"learning_rate": 7.796610169491526e-06,
"loss": 0.6986,
"mean_token_accuracy": 0.9047116041183472,
"step": 227
},
{
"epoch": 1.2465753424657535,
"grad_norm": 1.8359375,
"learning_rate": 7.740112994350283e-06,
"loss": 0.6755,
"mean_token_accuracy": 0.9082626402378082,
"step": 228
},
{
"epoch": 1.252054794520548,
"grad_norm": 1.890625,
"learning_rate": 7.68361581920904e-06,
"loss": 0.6935,
"mean_token_accuracy": 0.9040849804878235,
"step": 229
},
{
"epoch": 1.2575342465753425,
"grad_norm": 2.109375,
"learning_rate": 7.627118644067797e-06,
"loss": 0.6887,
"mean_token_accuracy": 0.9099703729152679,
"step": 230
},
{
"epoch": 1.263013698630137,
"grad_norm": 2.0,
"learning_rate": 7.5706214689265545e-06,
"loss": 0.7033,
"mean_token_accuracy": 0.9058608114719391,
"step": 231
},
{
"epoch": 1.2684931506849315,
"grad_norm": 1.9375,
"learning_rate": 7.514124293785311e-06,
"loss": 0.7058,
"mean_token_accuracy": 0.9063239991664886,
"step": 232
},
{
"epoch": 1.273972602739726,
"grad_norm": 1.9765625,
"learning_rate": 7.4576271186440685e-06,
"loss": 0.7218,
"mean_token_accuracy": 0.9018439948558807,
"step": 233
},
{
"epoch": 1.2794520547945205,
"grad_norm": 1.9140625,
"learning_rate": 7.401129943502825e-06,
"loss": 0.7134,
"mean_token_accuracy": 0.9037202000617981,
"step": 234
},
{
"epoch": 1.284931506849315,
"grad_norm": 1.96875,
"learning_rate": 7.3446327683615825e-06,
"loss": 0.6849,
"mean_token_accuracy": 0.9080476462841034,
"step": 235
},
{
"epoch": 1.2904109589041095,
"grad_norm": 2.015625,
"learning_rate": 7.288135593220339e-06,
"loss": 0.747,
"mean_token_accuracy": 0.9032059609889984,
"step": 236
},
{
"epoch": 1.2958904109589042,
"grad_norm": 1.875,
"learning_rate": 7.2316384180790965e-06,
"loss": 0.6309,
"mean_token_accuracy": 0.9128157496452332,
"step": 237
},
{
"epoch": 1.3013698630136985,
"grad_norm": 1.859375,
"learning_rate": 7.175141242937854e-06,
"loss": 0.7132,
"mean_token_accuracy": 0.9027237892150879,
"step": 238
},
{
"epoch": 1.3068493150684932,
"grad_norm": 1.9140625,
"learning_rate": 7.1186440677966106e-06,
"loss": 0.6806,
"mean_token_accuracy": 0.9067763984203339,
"step": 239
},
{
"epoch": 1.3123287671232877,
"grad_norm": 1.90625,
"learning_rate": 7.062146892655368e-06,
"loss": 0.7434,
"mean_token_accuracy": 0.9005565345287323,
"step": 240
},
{
"epoch": 1.3178082191780822,
"grad_norm": 1.875,
"learning_rate": 7.0056497175141246e-06,
"loss": 0.6809,
"mean_token_accuracy": 0.9072897434234619,
"step": 241
},
{
"epoch": 1.3232876712328767,
"grad_norm": 1.984375,
"learning_rate": 6.949152542372882e-06,
"loss": 0.7208,
"mean_token_accuracy": 0.906402200460434,
"step": 242
},
{
"epoch": 1.3287671232876712,
"grad_norm": 1.9921875,
"learning_rate": 6.892655367231639e-06,
"loss": 0.7418,
"mean_token_accuracy": 0.9020512700080872,
"step": 243
},
{
"epoch": 1.3342465753424657,
"grad_norm": 2.09375,
"learning_rate": 6.836158192090396e-06,
"loss": 0.7158,
"mean_token_accuracy": 0.9037717282772064,
"step": 244
},
{
"epoch": 1.3397260273972602,
"grad_norm": 1.96875,
"learning_rate": 6.779661016949153e-06,
"loss": 0.7274,
"mean_token_accuracy": 0.9033022224903107,
"step": 245
},
{
"epoch": 1.345205479452055,
"grad_norm": 2.015625,
"learning_rate": 6.72316384180791e-06,
"loss": 0.6736,
"mean_token_accuracy": 0.9079640805721283,
"step": 246
},
{
"epoch": 1.3506849315068492,
"grad_norm": 1.9921875,
"learning_rate": 6.666666666666667e-06,
"loss": 0.6624,
"mean_token_accuracy": 0.9096674621105194,
"step": 247
},
{
"epoch": 1.356164383561644,
"grad_norm": 2.015625,
"learning_rate": 6.610169491525424e-06,
"loss": 0.7013,
"mean_token_accuracy": 0.9066566228866577,
"step": 248
},
{
"epoch": 1.3616438356164384,
"grad_norm": 1.9375,
"learning_rate": 6.553672316384181e-06,
"loss": 0.7107,
"mean_token_accuracy": 0.9048294425010681,
"step": 249
},
{
"epoch": 1.367123287671233,
"grad_norm": 2.046875,
"learning_rate": 6.497175141242938e-06,
"loss": 0.6964,
"mean_token_accuracy": 0.9053798615932465,
"step": 250
},
{
"epoch": 1.3726027397260274,
"grad_norm": 1.859375,
"learning_rate": 6.440677966101695e-06,
"loss": 0.6184,
"mean_token_accuracy": 0.9159266352653503,
"step": 251
},
{
"epoch": 1.378082191780822,
"grad_norm": 1.8984375,
"learning_rate": 6.384180790960452e-06,
"loss": 0.6435,
"mean_token_accuracy": 0.9122322797775269,
"step": 252
},
{
"epoch": 1.3835616438356164,
"grad_norm": 1.8359375,
"learning_rate": 6.32768361581921e-06,
"loss": 0.6622,
"mean_token_accuracy": 0.910215824842453,
"step": 253
},
{
"epoch": 1.389041095890411,
"grad_norm": 1.9140625,
"learning_rate": 6.271186440677966e-06,
"loss": 0.6991,
"mean_token_accuracy": 0.907020092010498,
"step": 254
},
{
"epoch": 1.3945205479452054,
"grad_norm": 1.9921875,
"learning_rate": 6.2146892655367244e-06,
"loss": 0.6726,
"mean_token_accuracy": 0.9085971713066101,
"step": 255
},
{
"epoch": 1.4,
"grad_norm": 1.84375,
"learning_rate": 6.158192090395481e-06,
"loss": 0.6498,
"mean_token_accuracy": 0.9123066365718842,
"step": 256
},
{
"epoch": 1.4054794520547946,
"grad_norm": 1.8203125,
"learning_rate": 6.1016949152542385e-06,
"loss": 0.6304,
"mean_token_accuracy": 0.914670318365097,
"step": 257
},
{
"epoch": 1.410958904109589,
"grad_norm": 1.8671875,
"learning_rate": 6.045197740112995e-06,
"loss": 0.6677,
"mean_token_accuracy": 0.9098580479621887,
"step": 258
},
{
"epoch": 1.4164383561643836,
"grad_norm": 1.90625,
"learning_rate": 5.9887005649717525e-06,
"loss": 0.6927,
"mean_token_accuracy": 0.9073116779327393,
"step": 259
},
{
"epoch": 1.4219178082191781,
"grad_norm": 2.25,
"learning_rate": 5.932203389830509e-06,
"loss": 0.7589,
"mean_token_accuracy": 0.8991564214229584,
"step": 260
},
{
"epoch": 1.4273972602739726,
"grad_norm": 1.921875,
"learning_rate": 5.8757062146892665e-06,
"loss": 0.7304,
"mean_token_accuracy": 0.9021161198616028,
"step": 261
},
{
"epoch": 1.4328767123287671,
"grad_norm": 1.828125,
"learning_rate": 5.819209039548023e-06,
"loss": 0.6267,
"mean_token_accuracy": 0.9146751463413239,
"step": 262
},
{
"epoch": 1.4383561643835616,
"grad_norm": 1.9140625,
"learning_rate": 5.7627118644067805e-06,
"loss": 0.7622,
"mean_token_accuracy": 0.8948341906070709,
"step": 263
},
{
"epoch": 1.4438356164383561,
"grad_norm": 1.8515625,
"learning_rate": 5.706214689265537e-06,
"loss": 0.6191,
"mean_token_accuracy": 0.913863331079483,
"step": 264
},
{
"epoch": 1.4493150684931506,
"grad_norm": 2.09375,
"learning_rate": 5.6497175141242946e-06,
"loss": 0.6872,
"mean_token_accuracy": 0.9074727296829224,
"step": 265
},
{
"epoch": 1.4547945205479453,
"grad_norm": 2.046875,
"learning_rate": 5.593220338983051e-06,
"loss": 0.6691,
"mean_token_accuracy": 0.910420149564743,
"step": 266
},
{
"epoch": 1.4602739726027396,
"grad_norm": 1.9296875,
"learning_rate": 5.536723163841809e-06,
"loss": 0.6667,
"mean_token_accuracy": 0.9060676395893097,
"step": 267
},
{
"epoch": 1.4657534246575343,
"grad_norm": 1.9375,
"learning_rate": 5.480225988700565e-06,
"loss": 0.6477,
"mean_token_accuracy": 0.9133667647838593,
"step": 268
},
{
"epoch": 1.4712328767123288,
"grad_norm": 1.921875,
"learning_rate": 5.423728813559323e-06,
"loss": 0.6444,
"mean_token_accuracy": 0.9106875658035278,
"step": 269
},
{
"epoch": 1.4767123287671233,
"grad_norm": 1.9140625,
"learning_rate": 5.36723163841808e-06,
"loss": 0.6404,
"mean_token_accuracy": 0.9118073582649231,
"step": 270
},
{
"epoch": 1.4821917808219178,
"grad_norm": 1.890625,
"learning_rate": 5.310734463276837e-06,
"loss": 0.6769,
"mean_token_accuracy": 0.9056595265865326,
"step": 271
},
{
"epoch": 1.4876712328767123,
"grad_norm": 1.9765625,
"learning_rate": 5.254237288135594e-06,
"loss": 0.6808,
"mean_token_accuracy": 0.9079870283603668,
"step": 272
},
{
"epoch": 1.4931506849315068,
"grad_norm": 1.953125,
"learning_rate": 5.197740112994351e-06,
"loss": 0.7428,
"mean_token_accuracy": 0.9004494547843933,
"step": 273
},
{
"epoch": 1.4986301369863013,
"grad_norm": 1.8828125,
"learning_rate": 5.141242937853108e-06,
"loss": 0.6059,
"mean_token_accuracy": 0.9166204333305359,
"step": 274
},
{
"epoch": 1.504109589041096,
"grad_norm": 2.203125,
"learning_rate": 5.084745762711865e-06,
"loss": 0.6387,
"mean_token_accuracy": 0.9125949144363403,
"step": 275
},
{
"epoch": 1.5095890410958903,
"grad_norm": 1.9296875,
"learning_rate": 5.028248587570622e-06,
"loss": 0.674,
"mean_token_accuracy": 0.910666286945343,
"step": 276
},
{
"epoch": 1.515068493150685,
"grad_norm": 1.9296875,
"learning_rate": 4.9717514124293796e-06,
"loss": 0.653,
"mean_token_accuracy": 0.9097437858581543,
"step": 277
},
{
"epoch": 1.5205479452054793,
"grad_norm": 1.9921875,
"learning_rate": 4.915254237288136e-06,
"loss": 0.6891,
"mean_token_accuracy": 0.9078112840652466,
"step": 278
},
{
"epoch": 1.526027397260274,
"grad_norm": 2.140625,
"learning_rate": 4.8587570621468936e-06,
"loss": 0.7249,
"mean_token_accuracy": 0.9042028188705444,
"step": 279
},
{
"epoch": 1.5315068493150685,
"grad_norm": 1.9140625,
"learning_rate": 4.80225988700565e-06,
"loss": 0.6809,
"mean_token_accuracy": 0.9092828631401062,
"step": 280
},
{
"epoch": 1.536986301369863,
"grad_norm": 1.890625,
"learning_rate": 4.745762711864408e-06,
"loss": 0.6141,
"mean_token_accuracy": 0.9147942662239075,
"step": 281
},
{
"epoch": 1.5424657534246575,
"grad_norm": 2.046875,
"learning_rate": 4.689265536723164e-06,
"loss": 0.7209,
"mean_token_accuracy": 0.9041316211223602,
"step": 282
},
{
"epoch": 1.547945205479452,
"grad_norm": 1.84375,
"learning_rate": 4.632768361581922e-06,
"loss": 0.6145,
"mean_token_accuracy": 0.9166280925273895,
"step": 283
},
{
"epoch": 1.5534246575342465,
"grad_norm": 2.125,
"learning_rate": 4.576271186440678e-06,
"loss": 0.673,
"mean_token_accuracy": 0.9096376001834869,
"step": 284
},
{
"epoch": 1.558904109589041,
"grad_norm": 1.9609375,
"learning_rate": 4.519774011299436e-06,
"loss": 0.6784,
"mean_token_accuracy": 0.9063901305198669,
"step": 285
},
{
"epoch": 1.5643835616438357,
"grad_norm": 1.9453125,
"learning_rate": 4.463276836158192e-06,
"loss": 0.6594,
"mean_token_accuracy": 0.9109133183956146,
"step": 286
},
{
"epoch": 1.56986301369863,
"grad_norm": 1.9453125,
"learning_rate": 4.40677966101695e-06,
"loss": 0.6409,
"mean_token_accuracy": 0.9116988480091095,
"step": 287
},
{
"epoch": 1.5753424657534247,
"grad_norm": 1.90625,
"learning_rate": 4.350282485875706e-06,
"loss": 0.6547,
"mean_token_accuracy": 0.9099021852016449,
"step": 288
},
{
"epoch": 1.580821917808219,
"grad_norm": 1.90625,
"learning_rate": 4.293785310734464e-06,
"loss": 0.6769,
"mean_token_accuracy": 0.9094822108745575,
"step": 289
},
{
"epoch": 1.5863013698630137,
"grad_norm": 2.0,
"learning_rate": 4.23728813559322e-06,
"loss": 0.7131,
"mean_token_accuracy": 0.903436928987503,
"step": 290
},
{
"epoch": 1.5917808219178082,
"grad_norm": 1.9375,
"learning_rate": 4.180790960451978e-06,
"loss": 0.6988,
"mean_token_accuracy": 0.9046348929405212,
"step": 291
},
{
"epoch": 1.5972602739726027,
"grad_norm": 2.015625,
"learning_rate": 4.124293785310734e-06,
"loss": 0.7204,
"mean_token_accuracy": 0.9052576124668121,
"step": 292
},
{
"epoch": 1.6027397260273972,
"grad_norm": 1.984375,
"learning_rate": 4.067796610169492e-06,
"loss": 0.6905,
"mean_token_accuracy": 0.9082843363285065,
"step": 293
},
{
"epoch": 1.6082191780821917,
"grad_norm": 2.015625,
"learning_rate": 4.011299435028249e-06,
"loss": 0.7316,
"mean_token_accuracy": 0.9009381234645844,
"step": 294
},
{
"epoch": 1.6136986301369864,
"grad_norm": 1.9921875,
"learning_rate": 3.954802259887006e-06,
"loss": 0.7316,
"mean_token_accuracy": 0.9005630910396576,
"step": 295
},
{
"epoch": 1.6191780821917807,
"grad_norm": 1.984375,
"learning_rate": 3.898305084745763e-06,
"loss": 0.6994,
"mean_token_accuracy": 0.905254602432251,
"step": 296
},
{
"epoch": 1.6246575342465754,
"grad_norm": 1.953125,
"learning_rate": 3.84180790960452e-06,
"loss": 0.6493,
"mean_token_accuracy": 0.9093045294284821,
"step": 297
},
{
"epoch": 1.6301369863013697,
"grad_norm": 1.90625,
"learning_rate": 3.7853107344632772e-06,
"loss": 0.6282,
"mean_token_accuracy": 0.9144234955310822,
"step": 298
},
{
"epoch": 1.6356164383561644,
"grad_norm": 1.9296875,
"learning_rate": 3.7288135593220342e-06,
"loss": 0.6878,
"mean_token_accuracy": 0.9078341126441956,
"step": 299
},
{
"epoch": 1.641095890410959,
"grad_norm": 1.890625,
"learning_rate": 3.6723163841807913e-06,
"loss": 0.6357,
"mean_token_accuracy": 0.9131599366664886,
"step": 300
},
{
"epoch": 1.6465753424657534,
"grad_norm": 1.875,
"learning_rate": 3.6158192090395483e-06,
"loss": 0.7303,
"mean_token_accuracy": 0.9041744768619537,
"step": 301
},
{
"epoch": 1.652054794520548,
"grad_norm": 1.9375,
"learning_rate": 3.5593220338983053e-06,
"loss": 0.6885,
"mean_token_accuracy": 0.9045538306236267,
"step": 302
},
{
"epoch": 1.6575342465753424,
"grad_norm": 1.9453125,
"learning_rate": 3.5028248587570623e-06,
"loss": 0.6915,
"mean_token_accuracy": 0.9064763784408569,
"step": 303
},
{
"epoch": 1.6630136986301371,
"grad_norm": 2.078125,
"learning_rate": 3.4463276836158193e-06,
"loss": 0.7073,
"mean_token_accuracy": 0.9064249396324158,
"step": 304
},
{
"epoch": 1.6684931506849314,
"grad_norm": 1.953125,
"learning_rate": 3.3898305084745763e-06,
"loss": 0.6497,
"mean_token_accuracy": 0.9118121564388275,
"step": 305
},
{
"epoch": 1.6739726027397261,
"grad_norm": 1.90625,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.6628,
"mean_token_accuracy": 0.9104187488555908,
"step": 306
},
{
"epoch": 1.6794520547945204,
"grad_norm": 1.90625,
"learning_rate": 3.2768361581920903e-06,
"loss": 0.6534,
"mean_token_accuracy": 0.909776359796524,
"step": 307
},
{
"epoch": 1.6849315068493151,
"grad_norm": 1.875,
"learning_rate": 3.2203389830508473e-06,
"loss": 0.6717,
"mean_token_accuracy": 0.9090818762779236,
"step": 308
},
{
"epoch": 1.6904109589041096,
"grad_norm": 1.984375,
"learning_rate": 3.163841807909605e-06,
"loss": 0.6735,
"mean_token_accuracy": 0.9090877175331116,
"step": 309
},
{
"epoch": 1.6958904109589041,
"grad_norm": 1.9609375,
"learning_rate": 3.1073446327683622e-06,
"loss": 0.6932,
"mean_token_accuracy": 0.9055996835231781,
"step": 310
},
{
"epoch": 1.7013698630136986,
"grad_norm": 1.9140625,
"learning_rate": 3.0508474576271192e-06,
"loss": 0.6919,
"mean_token_accuracy": 0.9036900401115417,
"step": 311
},
{
"epoch": 1.7068493150684931,
"grad_norm": 1.9453125,
"learning_rate": 2.9943502824858762e-06,
"loss": 0.6737,
"mean_token_accuracy": 0.9093412756919861,
"step": 312
},
{
"epoch": 1.7123287671232876,
"grad_norm": 1.84375,
"learning_rate": 2.9378531073446333e-06,
"loss": 0.6038,
"mean_token_accuracy": 0.9156779944896698,
"step": 313
},
{
"epoch": 1.7178082191780821,
"grad_norm": 1.8984375,
"learning_rate": 2.8813559322033903e-06,
"loss": 0.6681,
"mean_token_accuracy": 0.9089544415473938,
"step": 314
},
{
"epoch": 1.7232876712328768,
"grad_norm": 2.015625,
"learning_rate": 2.8248587570621473e-06,
"loss": 0.6544,
"mean_token_accuracy": 0.911058783531189,
"step": 315
},
{
"epoch": 1.7287671232876711,
"grad_norm": 1.9453125,
"learning_rate": 2.7683615819209043e-06,
"loss": 0.6405,
"mean_token_accuracy": 0.9108243882656097,
"step": 316
},
{
"epoch": 1.7342465753424658,
"grad_norm": 1.921875,
"learning_rate": 2.7118644067796613e-06,
"loss": 0.6556,
"mean_token_accuracy": 0.9137302935123444,
"step": 317
},
{
"epoch": 1.7397260273972601,
"grad_norm": 2.28125,
"learning_rate": 2.6553672316384183e-06,
"loss": 0.6323,
"mean_token_accuracy": 0.9138603806495667,
"step": 318
},
{
"epoch": 1.7452054794520548,
"grad_norm": 1.8984375,
"learning_rate": 2.5988700564971753e-06,
"loss": 0.662,
"mean_token_accuracy": 0.908368855714798,
"step": 319
},
{
"epoch": 1.7506849315068493,
"grad_norm": 1.9296875,
"learning_rate": 2.5423728813559323e-06,
"loss": 0.602,
"mean_token_accuracy": 0.9159910678863525,
"step": 320
},
{
"epoch": 1.7561643835616438,
"grad_norm": 1.9140625,
"learning_rate": 2.4858757062146898e-06,
"loss": 0.6691,
"mean_token_accuracy": 0.9087992608547211,
"step": 321
},
{
"epoch": 1.7616438356164383,
"grad_norm": 1.796875,
"learning_rate": 2.4293785310734468e-06,
"loss": 0.596,
"mean_token_accuracy": 0.917241632938385,
"step": 322
},
{
"epoch": 1.7671232876712328,
"grad_norm": 2.015625,
"learning_rate": 2.372881355932204e-06,
"loss": 0.7606,
"mean_token_accuracy": 0.8987223207950592,
"step": 323
},
{
"epoch": 1.7726027397260276,
"grad_norm": 1.8984375,
"learning_rate": 2.316384180790961e-06,
"loss": 0.6812,
"mean_token_accuracy": 0.9057367146015167,
"step": 324
},
{
"epoch": 1.7780821917808218,
"grad_norm": 1.953125,
"learning_rate": 2.259887005649718e-06,
"loss": 0.6828,
"mean_token_accuracy": 0.9079809784889221,
"step": 325
},
{
"epoch": 1.7835616438356166,
"grad_norm": 1.984375,
"learning_rate": 2.203389830508475e-06,
"loss": 0.7241,
"mean_token_accuracy": 0.9030264317989349,
"step": 326
},
{
"epoch": 1.7890410958904108,
"grad_norm": 1.90625,
"learning_rate": 2.146892655367232e-06,
"loss": 0.6908,
"mean_token_accuracy": 0.9060869216918945,
"step": 327
},
{
"epoch": 1.7945205479452055,
"grad_norm": 1.984375,
"learning_rate": 2.090395480225989e-06,
"loss": 0.6877,
"mean_token_accuracy": 0.9075476229190826,
"step": 328
},
{
"epoch": 1.8,
"grad_norm": 2.0,
"learning_rate": 2.033898305084746e-06,
"loss": 0.6607,
"mean_token_accuracy": 0.9095008373260498,
"step": 329
},
{
"epoch": 1.8054794520547945,
"grad_norm": 1.859375,
"learning_rate": 1.977401129943503e-06,
"loss": 0.5915,
"mean_token_accuracy": 0.9170799255371094,
"step": 330
},
{
"epoch": 1.810958904109589,
"grad_norm": 2.03125,
"learning_rate": 1.92090395480226e-06,
"loss": 0.7045,
"mean_token_accuracy": 0.9037127196788788,
"step": 331
},
{
"epoch": 1.8164383561643835,
"grad_norm": 1.9765625,
"learning_rate": 1.8644067796610171e-06,
"loss": 0.6665,
"mean_token_accuracy": 0.9105578064918518,
"step": 332
},
{
"epoch": 1.821917808219178,
"grad_norm": 1.8671875,
"learning_rate": 1.8079096045197741e-06,
"loss": 0.6302,
"mean_token_accuracy": 0.9144696891307831,
"step": 333
},
{
"epoch": 1.8273972602739725,
"grad_norm": 1.984375,
"learning_rate": 1.7514124293785311e-06,
"loss": 0.7235,
"mean_token_accuracy": 0.9047011733055115,
"step": 334
},
{
"epoch": 1.8328767123287673,
"grad_norm": 1.875,
"learning_rate": 1.6949152542372882e-06,
"loss": 0.6352,
"mean_token_accuracy": 0.9123874604701996,
"step": 335
},
{
"epoch": 1.8383561643835615,
"grad_norm": 1.9609375,
"learning_rate": 1.6384180790960452e-06,
"loss": 0.6611,
"mean_token_accuracy": 0.9092899858951569,
"step": 336
},
{
"epoch": 1.8438356164383563,
"grad_norm": 2.015625,
"learning_rate": 1.5819209039548026e-06,
"loss": 0.7016,
"mean_token_accuracy": 0.9032659232616425,
"step": 337
},
{
"epoch": 1.8493150684931505,
"grad_norm": 2.03125,
"learning_rate": 1.5254237288135596e-06,
"loss": 0.7062,
"mean_token_accuracy": 0.9050180912017822,
"step": 338
},
{
"epoch": 1.8547945205479452,
"grad_norm": 2.0,
"learning_rate": 1.4689265536723166e-06,
"loss": 0.7284,
"mean_token_accuracy": 0.9015350937843323,
"step": 339
},
{
"epoch": 1.8602739726027397,
"grad_norm": 1.96875,
"learning_rate": 1.4124293785310736e-06,
"loss": 0.6355,
"mean_token_accuracy": 0.9123442471027374,
"step": 340
},
{
"epoch": 1.8657534246575342,
"grad_norm": 1.9921875,
"learning_rate": 1.3559322033898307e-06,
"loss": 0.6472,
"mean_token_accuracy": 0.9134511053562164,
"step": 341
},
{
"epoch": 1.8712328767123287,
"grad_norm": 2.421875,
"learning_rate": 1.2994350282485877e-06,
"loss": 0.6786,
"mean_token_accuracy": 0.9099195599555969,
"step": 342
},
{
"epoch": 1.8767123287671232,
"grad_norm": 2.015625,
"learning_rate": 1.2429378531073449e-06,
"loss": 0.747,
"mean_token_accuracy": 0.9031140804290771,
"step": 343
},
{
"epoch": 1.882191780821918,
"grad_norm": 2.0,
"learning_rate": 1.186440677966102e-06,
"loss": 0.6904,
"mean_token_accuracy": 0.9059503078460693,
"step": 344
},
{
"epoch": 1.8876712328767122,
"grad_norm": 1.9375,
"learning_rate": 1.129943502824859e-06,
"loss": 0.6317,
"mean_token_accuracy": 0.913758397102356,
"step": 345
},
{
"epoch": 1.893150684931507,
"grad_norm": 1.921875,
"learning_rate": 1.073446327683616e-06,
"loss": 0.6718,
"mean_token_accuracy": 0.9079194068908691,
"step": 346
},
{
"epoch": 1.8986301369863012,
"grad_norm": 2.046875,
"learning_rate": 1.016949152542373e-06,
"loss": 0.6781,
"mean_token_accuracy": 0.9061055779457092,
"step": 347
},
{
"epoch": 1.904109589041096,
"grad_norm": 1.9375,
"learning_rate": 9.6045197740113e-07,
"loss": 0.7169,
"mean_token_accuracy": 0.9037725031375885,
"step": 348
},
{
"epoch": 1.9095890410958904,
"grad_norm": 2.078125,
"learning_rate": 9.039548022598871e-07,
"loss": 0.6882,
"mean_token_accuracy": 0.9081202149391174,
"step": 349
},
{
"epoch": 1.915068493150685,
"grad_norm": 1.984375,
"learning_rate": 8.474576271186441e-07,
"loss": 0.6714,
"mean_token_accuracy": 0.9084700644016266,
"step": 350
},
{
"epoch": 1.9205479452054794,
"grad_norm": 1.8984375,
"learning_rate": 7.909604519774013e-07,
"loss": 0.5951,
"mean_token_accuracy": 0.915435403585434,
"step": 351
},
{
"epoch": 1.926027397260274,
"grad_norm": 1.921875,
"learning_rate": 7.344632768361583e-07,
"loss": 0.6639,
"mean_token_accuracy": 0.9109577238559723,
"step": 352
},
{
"epoch": 1.9315068493150684,
"grad_norm": 1.9296875,
"learning_rate": 6.779661016949153e-07,
"loss": 0.6354,
"mean_token_accuracy": 0.9138354063034058,
"step": 353
},
{
"epoch": 1.936986301369863,
"grad_norm": 1.984375,
"learning_rate": 6.214689265536724e-07,
"loss": 0.6613,
"mean_token_accuracy": 0.9091556370258331,
"step": 354
},
{
"epoch": 1.9424657534246577,
"grad_norm": 1.8828125,
"learning_rate": 5.649717514124295e-07,
"loss": 0.6825,
"mean_token_accuracy": 0.9077677130699158,
"step": 355
},
{
"epoch": 1.947945205479452,
"grad_norm": 1.890625,
"learning_rate": 5.084745762711865e-07,
"loss": 0.6799,
"mean_token_accuracy": 0.9061026573181152,
"step": 356
},
{
"epoch": 1.9534246575342467,
"grad_norm": 1.96875,
"learning_rate": 4.5197740112994353e-07,
"loss": 0.631,
"mean_token_accuracy": 0.914902925491333,
"step": 357
},
{
"epoch": 1.958904109589041,
"grad_norm": 1.9375,
"learning_rate": 3.9548022598870065e-07,
"loss": 0.7215,
"mean_token_accuracy": 0.9018444120883942,
"step": 358
},
{
"epoch": 1.9643835616438357,
"grad_norm": 1.90625,
"learning_rate": 3.3898305084745766e-07,
"loss": 0.7234,
"mean_token_accuracy": 0.9011828899383545,
"step": 359
},
{
"epoch": 1.9698630136986301,
"grad_norm": 2.046875,
"learning_rate": 2.8248587570621473e-07,
"loss": 0.6796,
"mean_token_accuracy": 0.9087766408920288,
"step": 360
},
{
"epoch": 1.9753424657534246,
"grad_norm": 1.921875,
"learning_rate": 2.2598870056497177e-07,
"loss": 0.6692,
"mean_token_accuracy": 0.9075906872749329,
"step": 361
},
{
"epoch": 1.9808219178082191,
"grad_norm": 1.9453125,
"learning_rate": 1.6949152542372883e-07,
"loss": 0.7053,
"mean_token_accuracy": 0.9050938785076141,
"step": 362
},
{
"epoch": 1.9863013698630136,
"grad_norm": 1.9375,
"learning_rate": 1.1299435028248588e-07,
"loss": 0.6466,
"mean_token_accuracy": 0.9130730330944061,
"step": 363
},
{
"epoch": 1.9917808219178084,
"grad_norm": 1.8671875,
"learning_rate": 5.649717514124294e-08,
"loss": 0.6636,
"mean_token_accuracy": 0.9109528958797455,
"step": 364
}
],
"logging_steps": 1.0,
"max_steps": 364,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.568355335831552e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}