arunos728's picture
Upload checkpoint-10000/trainer_state.json
48d39a8 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 38.46153846153846,
"eval_steps": 500,
"global_step": 10000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.038461538461538464,
"grad_norm": 0.8129774928092957,
"learning_rate": 2.7e-07,
"loss": 1.3508,
"step": 10
},
{
"epoch": 0.07692307692307693,
"grad_norm": 0.6433653235435486,
"learning_rate": 5.7e-07,
"loss": 1.3552,
"step": 20
},
{
"epoch": 0.11538461538461539,
"grad_norm": 0.6810219883918762,
"learning_rate": 8.7e-07,
"loss": 1.3478,
"step": 30
},
{
"epoch": 0.15384615384615385,
"grad_norm": 0.6231173276901245,
"learning_rate": 1.17e-06,
"loss": 1.3472,
"step": 40
},
{
"epoch": 0.19230769230769232,
"grad_norm": 0.829264223575592,
"learning_rate": 1.4700000000000001e-06,
"loss": 1.3416,
"step": 50
},
{
"epoch": 0.23076923076923078,
"grad_norm": 0.9087441563606262,
"learning_rate": 1.77e-06,
"loss": 1.3256,
"step": 60
},
{
"epoch": 0.2692307692307692,
"grad_norm": 1.0764439105987549,
"learning_rate": 2.07e-06,
"loss": 1.3159,
"step": 70
},
{
"epoch": 0.3076923076923077,
"grad_norm": 1.0241045951843262,
"learning_rate": 2.37e-06,
"loss": 1.3131,
"step": 80
},
{
"epoch": 0.34615384615384615,
"grad_norm": 1.0223745107650757,
"learning_rate": 2.67e-06,
"loss": 1.2953,
"step": 90
},
{
"epoch": 0.38461538461538464,
"grad_norm": 1.3292489051818848,
"learning_rate": 2.9700000000000004e-06,
"loss": 1.2714,
"step": 100
},
{
"epoch": 0.4230769230769231,
"grad_norm": 1.2786927223205566,
"learning_rate": 3.27e-06,
"loss": 1.2593,
"step": 110
},
{
"epoch": 0.46153846153846156,
"grad_norm": 1.0608361959457397,
"learning_rate": 3.57e-06,
"loss": 1.2464,
"step": 120
},
{
"epoch": 0.5,
"grad_norm": 0.9192166328430176,
"learning_rate": 3.87e-06,
"loss": 1.2222,
"step": 130
},
{
"epoch": 0.5384615384615384,
"grad_norm": 0.8084186315536499,
"learning_rate": 4.170000000000001e-06,
"loss": 1.2077,
"step": 140
},
{
"epoch": 0.5769230769230769,
"grad_norm": 0.8090039491653442,
"learning_rate": 4.4699999999999996e-06,
"loss": 1.1963,
"step": 150
},
{
"epoch": 0.6153846153846154,
"grad_norm": 0.7544081807136536,
"learning_rate": 4.77e-06,
"loss": 1.1736,
"step": 160
},
{
"epoch": 0.6538461538461539,
"grad_norm": 0.7331106662750244,
"learning_rate": 5.070000000000001e-06,
"loss": 1.1634,
"step": 170
},
{
"epoch": 0.6923076923076923,
"grad_norm": 0.8381819128990173,
"learning_rate": 5.37e-06,
"loss": 1.154,
"step": 180
},
{
"epoch": 0.7307692307692307,
"grad_norm": 0.8138782382011414,
"learning_rate": 5.67e-06,
"loss": 1.1325,
"step": 190
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.7330286502838135,
"learning_rate": 5.9700000000000004e-06,
"loss": 1.1344,
"step": 200
},
{
"epoch": 0.8076923076923077,
"grad_norm": 0.8166984915733337,
"learning_rate": 6.27e-06,
"loss": 1.1302,
"step": 210
},
{
"epoch": 0.8461538461538461,
"grad_norm": 0.7504987120628357,
"learning_rate": 6.57e-06,
"loss": 1.1089,
"step": 220
},
{
"epoch": 0.8846153846153846,
"grad_norm": 0.7398339509963989,
"learning_rate": 6.87e-06,
"loss": 1.0961,
"step": 230
},
{
"epoch": 0.9230769230769231,
"grad_norm": 0.6914257407188416,
"learning_rate": 7.17e-06,
"loss": 1.0939,
"step": 240
},
{
"epoch": 0.9615384615384616,
"grad_norm": 0.753593921661377,
"learning_rate": 7.4700000000000005e-06,
"loss": 1.0844,
"step": 250
},
{
"epoch": 1.0,
"grad_norm": 0.775452733039856,
"learning_rate": 7.77e-06,
"loss": 1.0757,
"step": 260
},
{
"epoch": 1.0384615384615385,
"grad_norm": 0.8942003846168518,
"learning_rate": 8.07e-06,
"loss": 1.0727,
"step": 270
},
{
"epoch": 1.0769230769230769,
"grad_norm": 0.8823027610778809,
"learning_rate": 8.370000000000001e-06,
"loss": 1.073,
"step": 280
},
{
"epoch": 1.1153846153846154,
"grad_norm": 0.7825962901115417,
"learning_rate": 8.67e-06,
"loss": 1.0681,
"step": 290
},
{
"epoch": 1.1538461538461537,
"grad_norm": 1.042375087738037,
"learning_rate": 8.97e-06,
"loss": 1.0604,
"step": 300
},
{
"epoch": 1.1923076923076923,
"grad_norm": 0.9922687411308289,
"learning_rate": 9.27e-06,
"loss": 1.0616,
"step": 310
},
{
"epoch": 1.2307692307692308,
"grad_norm": 0.8586735129356384,
"learning_rate": 9.57e-06,
"loss": 1.0532,
"step": 320
},
{
"epoch": 1.2692307692307692,
"grad_norm": 0.9278848171234131,
"learning_rate": 9.87e-06,
"loss": 1.0478,
"step": 330
},
{
"epoch": 1.3076923076923077,
"grad_norm": 0.8375242948532104,
"learning_rate": 1.0170000000000001e-05,
"loss": 1.0513,
"step": 340
},
{
"epoch": 1.3461538461538463,
"grad_norm": 0.7845088839530945,
"learning_rate": 1.047e-05,
"loss": 1.0438,
"step": 350
},
{
"epoch": 1.3846153846153846,
"grad_norm": 0.9647091031074524,
"learning_rate": 1.077e-05,
"loss": 1.0285,
"step": 360
},
{
"epoch": 1.4230769230769231,
"grad_norm": 1.1311838626861572,
"learning_rate": 1.107e-05,
"loss": 1.023,
"step": 370
},
{
"epoch": 1.4615384615384617,
"grad_norm": 1.1281001567840576,
"learning_rate": 1.137e-05,
"loss": 1.0075,
"step": 380
},
{
"epoch": 1.5,
"grad_norm": 1.461525321006775,
"learning_rate": 1.167e-05,
"loss": 0.9779,
"step": 390
},
{
"epoch": 1.5384615384615383,
"grad_norm": 1.534185767173767,
"learning_rate": 1.197e-05,
"loss": 0.9208,
"step": 400
},
{
"epoch": 1.5769230769230769,
"grad_norm": 1.9967076778411865,
"learning_rate": 1.227e-05,
"loss": 0.8734,
"step": 410
},
{
"epoch": 1.6153846153846154,
"grad_norm": 1.6220159530639648,
"learning_rate": 1.257e-05,
"loss": 0.8197,
"step": 420
},
{
"epoch": 1.6538461538461537,
"grad_norm": 1.832636833190918,
"learning_rate": 1.287e-05,
"loss": 0.771,
"step": 430
},
{
"epoch": 1.6923076923076923,
"grad_norm": 2.2742176055908203,
"learning_rate": 1.3170000000000001e-05,
"loss": 0.7211,
"step": 440
},
{
"epoch": 1.7307692307692308,
"grad_norm": 1.9380594491958618,
"learning_rate": 1.3470000000000001e-05,
"loss": 0.6881,
"step": 450
},
{
"epoch": 1.7692307692307692,
"grad_norm": 2.589594602584839,
"learning_rate": 1.377e-05,
"loss": 0.6551,
"step": 460
},
{
"epoch": 1.8076923076923077,
"grad_norm": 2.73272442817688,
"learning_rate": 1.4069999999999999e-05,
"loss": 0.64,
"step": 470
},
{
"epoch": 1.8461538461538463,
"grad_norm": 2.789524555206299,
"learning_rate": 1.437e-05,
"loss": 0.6142,
"step": 480
},
{
"epoch": 1.8846153846153846,
"grad_norm": 2.5425634384155273,
"learning_rate": 1.467e-05,
"loss": 0.5954,
"step": 490
},
{
"epoch": 1.9230769230769231,
"grad_norm": 2.916409969329834,
"learning_rate": 1.497e-05,
"loss": 0.574,
"step": 500
},
{
"epoch": 1.9615384615384617,
"grad_norm": 2.5941741466522217,
"learning_rate": 1.527e-05,
"loss": 0.5612,
"step": 510
},
{
"epoch": 2.0,
"grad_norm": 3.0018510818481445,
"learning_rate": 1.5570000000000002e-05,
"loss": 0.545,
"step": 520
},
{
"epoch": 2.0384615384615383,
"grad_norm": 2.7710626125335693,
"learning_rate": 1.5870000000000002e-05,
"loss": 0.5413,
"step": 530
},
{
"epoch": 2.076923076923077,
"grad_norm": 3.6336352825164795,
"learning_rate": 1.6170000000000003e-05,
"loss": 0.5231,
"step": 540
},
{
"epoch": 2.1153846153846154,
"grad_norm": 3.880500555038452,
"learning_rate": 1.6470000000000003e-05,
"loss": 0.5153,
"step": 550
},
{
"epoch": 2.1538461538461537,
"grad_norm": 3.505535840988159,
"learning_rate": 1.677e-05,
"loss": 0.5065,
"step": 560
},
{
"epoch": 2.1923076923076925,
"grad_norm": 3.137913465499878,
"learning_rate": 1.7069999999999998e-05,
"loss": 0.4962,
"step": 570
},
{
"epoch": 2.230769230769231,
"grad_norm": 3.880664110183716,
"learning_rate": 1.7369999999999998e-05,
"loss": 0.4966,
"step": 580
},
{
"epoch": 2.269230769230769,
"grad_norm": 3.5915846824645996,
"learning_rate": 1.767e-05,
"loss": 0.498,
"step": 590
},
{
"epoch": 2.3076923076923075,
"grad_norm": 3.1810109615325928,
"learning_rate": 1.797e-05,
"loss": 0.4988,
"step": 600
},
{
"epoch": 2.3461538461538463,
"grad_norm": 3.6019811630249023,
"learning_rate": 1.827e-05,
"loss": 0.4974,
"step": 610
},
{
"epoch": 2.3846153846153846,
"grad_norm": 3.5532901287078857,
"learning_rate": 1.857e-05,
"loss": 0.4819,
"step": 620
},
{
"epoch": 2.423076923076923,
"grad_norm": 3.921673536300659,
"learning_rate": 1.887e-05,
"loss": 0.4798,
"step": 630
},
{
"epoch": 2.4615384615384617,
"grad_norm": 3.2884128093719482,
"learning_rate": 1.917e-05,
"loss": 0.4826,
"step": 640
},
{
"epoch": 2.5,
"grad_norm": 3.231353759765625,
"learning_rate": 1.947e-05,
"loss": 0.4825,
"step": 650
},
{
"epoch": 2.5384615384615383,
"grad_norm": 4.012421131134033,
"learning_rate": 1.9770000000000002e-05,
"loss": 0.4782,
"step": 660
},
{
"epoch": 2.5769230769230766,
"grad_norm": 3.7715535163879395,
"learning_rate": 2.0070000000000003e-05,
"loss": 0.4854,
"step": 670
},
{
"epoch": 2.6153846153846154,
"grad_norm": 3.2601428031921387,
"learning_rate": 2.0370000000000003e-05,
"loss": 0.4762,
"step": 680
},
{
"epoch": 2.6538461538461537,
"grad_norm": 3.3098268508911133,
"learning_rate": 2.067e-05,
"loss": 0.4699,
"step": 690
},
{
"epoch": 2.6923076923076925,
"grad_norm": 3.7813925743103027,
"learning_rate": 2.097e-05,
"loss": 0.4739,
"step": 700
},
{
"epoch": 2.730769230769231,
"grad_norm": 3.242644786834717,
"learning_rate": 2.1269999999999998e-05,
"loss": 0.476,
"step": 710
},
{
"epoch": 2.769230769230769,
"grad_norm": 3.092524766921997,
"learning_rate": 2.157e-05,
"loss": 0.4687,
"step": 720
},
{
"epoch": 2.8076923076923075,
"grad_norm": 4.034473896026611,
"learning_rate": 2.187e-05,
"loss": 0.4655,
"step": 730
},
{
"epoch": 2.8461538461538463,
"grad_norm": 3.64107084274292,
"learning_rate": 2.217e-05,
"loss": 0.4637,
"step": 740
},
{
"epoch": 2.8846153846153846,
"grad_norm": 4.293366432189941,
"learning_rate": 2.247e-05,
"loss": 0.4662,
"step": 750
},
{
"epoch": 2.9230769230769234,
"grad_norm": 3.5577127933502197,
"learning_rate": 2.277e-05,
"loss": 0.4571,
"step": 760
},
{
"epoch": 2.9615384615384617,
"grad_norm": 3.0843167304992676,
"learning_rate": 2.307e-05,
"loss": 0.4625,
"step": 770
},
{
"epoch": 3.0,
"grad_norm": 3.5912671089172363,
"learning_rate": 2.337e-05,
"loss": 0.457,
"step": 780
},
{
"epoch": 3.0384615384615383,
"grad_norm": 4.415438175201416,
"learning_rate": 2.3670000000000002e-05,
"loss": 0.4594,
"step": 790
},
{
"epoch": 3.076923076923077,
"grad_norm": 3.8929693698883057,
"learning_rate": 2.3970000000000003e-05,
"loss": 0.4613,
"step": 800
},
{
"epoch": 3.1153846153846154,
"grad_norm": 4.137643814086914,
"learning_rate": 2.4270000000000003e-05,
"loss": 0.4628,
"step": 810
},
{
"epoch": 3.1538461538461537,
"grad_norm": 3.0388357639312744,
"learning_rate": 2.457e-05,
"loss": 0.457,
"step": 820
},
{
"epoch": 3.1923076923076925,
"grad_norm": 3.6971359252929688,
"learning_rate": 2.487e-05,
"loss": 0.4546,
"step": 830
},
{
"epoch": 3.230769230769231,
"grad_norm": 3.4712138175964355,
"learning_rate": 2.517e-05,
"loss": 0.4511,
"step": 840
},
{
"epoch": 3.269230769230769,
"grad_norm": 3.3456554412841797,
"learning_rate": 2.547e-05,
"loss": 0.4505,
"step": 850
},
{
"epoch": 3.3076923076923075,
"grad_norm": 3.5414817333221436,
"learning_rate": 2.577e-05,
"loss": 0.4545,
"step": 860
},
{
"epoch": 3.3461538461538463,
"grad_norm": 3.727144241333008,
"learning_rate": 2.607e-05,
"loss": 0.4537,
"step": 870
},
{
"epoch": 3.3846153846153846,
"grad_norm": 3.6448793411254883,
"learning_rate": 2.637e-05,
"loss": 0.4434,
"step": 880
},
{
"epoch": 3.423076923076923,
"grad_norm": 3.6482105255126953,
"learning_rate": 2.667e-05,
"loss": 0.4438,
"step": 890
},
{
"epoch": 3.4615384615384617,
"grad_norm": 3.7402617931365967,
"learning_rate": 2.697e-05,
"loss": 0.4499,
"step": 900
},
{
"epoch": 3.5,
"grad_norm": 3.1410586833953857,
"learning_rate": 2.727e-05,
"loss": 0.4423,
"step": 910
},
{
"epoch": 3.5384615384615383,
"grad_norm": 4.063307762145996,
"learning_rate": 2.7570000000000002e-05,
"loss": 0.4476,
"step": 920
},
{
"epoch": 3.5769230769230766,
"grad_norm": 3.781724214553833,
"learning_rate": 2.7870000000000003e-05,
"loss": 0.4445,
"step": 930
},
{
"epoch": 3.6153846153846154,
"grad_norm": 4.0924787521362305,
"learning_rate": 2.817e-05,
"loss": 0.4449,
"step": 940
},
{
"epoch": 3.6538461538461537,
"grad_norm": 3.8442327976226807,
"learning_rate": 2.847e-05,
"loss": 0.452,
"step": 950
},
{
"epoch": 3.6923076923076925,
"grad_norm": 3.5104334354400635,
"learning_rate": 2.877e-05,
"loss": 0.4363,
"step": 960
},
{
"epoch": 3.730769230769231,
"grad_norm": 4.416885852813721,
"learning_rate": 2.907e-05,
"loss": 0.4423,
"step": 970
},
{
"epoch": 3.769230769230769,
"grad_norm": 3.5241246223449707,
"learning_rate": 2.9370000000000002e-05,
"loss": 0.4426,
"step": 980
},
{
"epoch": 3.8076923076923075,
"grad_norm": 3.1697614192962646,
"learning_rate": 2.967e-05,
"loss": 0.4437,
"step": 990
},
{
"epoch": 3.8461538461538463,
"grad_norm": 3.7319610118865967,
"learning_rate": 2.997e-05,
"loss": 0.4419,
"step": 1000
},
{
"epoch": 3.8846153846153846,
"grad_norm": 4.392916679382324,
"learning_rate": 2.9999983391181253e-05,
"loss": 0.4371,
"step": 1010
},
{
"epoch": 3.9230769230769234,
"grad_norm": 3.606473207473755,
"learning_rate": 2.9999925978027876e-05,
"loss": 0.4403,
"step": 1020
},
{
"epoch": 3.9615384615384617,
"grad_norm": 3.2601609230041504,
"learning_rate": 2.9999827555649637e-05,
"loss": 0.4357,
"step": 1030
},
{
"epoch": 4.0,
"grad_norm": 3.616961717605591,
"learning_rate": 2.999968812431563e-05,
"loss": 0.4394,
"step": 1040
},
{
"epoch": 4.038461538461538,
"grad_norm": 4.0653204917907715,
"learning_rate": 2.999950768440706e-05,
"loss": 0.435,
"step": 1050
},
{
"epoch": 4.076923076923077,
"grad_norm": 4.050017833709717,
"learning_rate": 2.999928623641723e-05,
"loss": 0.4448,
"step": 1060
},
{
"epoch": 4.115384615384615,
"grad_norm": 3.190156936645508,
"learning_rate": 2.9999023780951575e-05,
"loss": 0.4356,
"step": 1070
},
{
"epoch": 4.153846153846154,
"grad_norm": 3.323148250579834,
"learning_rate": 2.999872031872764e-05,
"loss": 0.4384,
"step": 1080
},
{
"epoch": 4.1923076923076925,
"grad_norm": 3.506814479827881,
"learning_rate": 2.999837585057508e-05,
"loss": 0.4287,
"step": 1090
},
{
"epoch": 4.230769230769231,
"grad_norm": 4.119077205657959,
"learning_rate": 2.999799037743565e-05,
"loss": 0.427,
"step": 1100
},
{
"epoch": 4.269230769230769,
"grad_norm": 3.2304561138153076,
"learning_rate": 2.999756390036323e-05,
"loss": 0.4326,
"step": 1110
},
{
"epoch": 4.3076923076923075,
"grad_norm": 3.1747395992279053,
"learning_rate": 2.9997096420523788e-05,
"loss": 0.4325,
"step": 1120
},
{
"epoch": 4.346153846153846,
"grad_norm": 3.7520697116851807,
"learning_rate": 2.9996587939195395e-05,
"loss": 0.4372,
"step": 1130
},
{
"epoch": 4.384615384615385,
"grad_norm": 3.395843744277954,
"learning_rate": 2.999603845776822e-05,
"loss": 0.4344,
"step": 1140
},
{
"epoch": 4.423076923076923,
"grad_norm": 4.427037715911865,
"learning_rate": 2.999544797774452e-05,
"loss": 0.4342,
"step": 1150
},
{
"epoch": 4.461538461538462,
"grad_norm": 3.310183525085449,
"learning_rate": 2.9994816500738648e-05,
"loss": 0.433,
"step": 1160
},
{
"epoch": 4.5,
"grad_norm": 3.460261583328247,
"learning_rate": 2.999414402847704e-05,
"loss": 0.4279,
"step": 1170
},
{
"epoch": 4.538461538461538,
"grad_norm": 3.523247718811035,
"learning_rate": 2.999343056279821e-05,
"loss": 0.4281,
"step": 1180
},
{
"epoch": 4.576923076923077,
"grad_norm": 3.0562286376953125,
"learning_rate": 2.9992676105652746e-05,
"loss": 0.4281,
"step": 1190
},
{
"epoch": 4.615384615384615,
"grad_norm": 3.179258108139038,
"learning_rate": 2.9991880659103298e-05,
"loss": 0.4321,
"step": 1200
},
{
"epoch": 4.653846153846154,
"grad_norm": 3.0872421264648438,
"learning_rate": 2.9991044225324593e-05,
"loss": 0.4255,
"step": 1210
},
{
"epoch": 4.6923076923076925,
"grad_norm": 3.896038293838501,
"learning_rate": 2.9990166806603407e-05,
"loss": 0.4221,
"step": 1220
},
{
"epoch": 4.730769230769231,
"grad_norm": 3.345339775085449,
"learning_rate": 2.9989248405338573e-05,
"loss": 0.4236,
"step": 1230
},
{
"epoch": 4.769230769230769,
"grad_norm": 3.4517908096313477,
"learning_rate": 2.9988289024040962e-05,
"loss": 0.419,
"step": 1240
},
{
"epoch": 4.8076923076923075,
"grad_norm": 3.4892146587371826,
"learning_rate": 2.998728866533348e-05,
"loss": 0.4235,
"step": 1250
},
{
"epoch": 4.846153846153846,
"grad_norm": 3.3247246742248535,
"learning_rate": 2.9986247331951083e-05,
"loss": 0.4143,
"step": 1260
},
{
"epoch": 4.884615384615385,
"grad_norm": 3.2198266983032227,
"learning_rate": 2.998516502674072e-05,
"loss": 0.4167,
"step": 1270
},
{
"epoch": 4.923076923076923,
"grad_norm": 3.2690086364746094,
"learning_rate": 2.9984041752661386e-05,
"loss": 0.4142,
"step": 1280
},
{
"epoch": 4.961538461538462,
"grad_norm": 3.7173590660095215,
"learning_rate": 2.9982877512784067e-05,
"loss": 0.4141,
"step": 1290
},
{
"epoch": 5.0,
"grad_norm": 4.059051990509033,
"learning_rate": 2.998167231029174e-05,
"loss": 0.4073,
"step": 1300
},
{
"epoch": 5.038461538461538,
"grad_norm": 3.8667192459106445,
"learning_rate": 2.99804261484794e-05,
"loss": 0.4082,
"step": 1310
},
{
"epoch": 5.076923076923077,
"grad_norm": 3.3809549808502197,
"learning_rate": 2.997913903075399e-05,
"loss": 0.408,
"step": 1320
},
{
"epoch": 5.115384615384615,
"grad_norm": 3.0045559406280518,
"learning_rate": 2.997781096063445e-05,
"loss": 0.4067,
"step": 1330
},
{
"epoch": 5.153846153846154,
"grad_norm": 3.27519154548645,
"learning_rate": 2.9976441941751663e-05,
"loss": 0.4023,
"step": 1340
},
{
"epoch": 5.1923076923076925,
"grad_norm": 3.353156089782715,
"learning_rate": 2.997503197784849e-05,
"loss": 0.4034,
"step": 1350
},
{
"epoch": 5.230769230769231,
"grad_norm": 3.568828821182251,
"learning_rate": 2.9973581072779702e-05,
"loss": 0.4025,
"step": 1360
},
{
"epoch": 5.269230769230769,
"grad_norm": 3.06388258934021,
"learning_rate": 2.9972089230512035e-05,
"loss": 0.3948,
"step": 1370
},
{
"epoch": 5.3076923076923075,
"grad_norm": 3.4603991508483887,
"learning_rate": 2.997055645512411e-05,
"loss": 0.4045,
"step": 1380
},
{
"epoch": 5.346153846153846,
"grad_norm": 2.7319891452789307,
"learning_rate": 2.9968982750806492e-05,
"loss": 0.3956,
"step": 1390
},
{
"epoch": 5.384615384615385,
"grad_norm": 3.0891315937042236,
"learning_rate": 2.9967368121861623e-05,
"loss": 0.4019,
"step": 1400
},
{
"epoch": 5.423076923076923,
"grad_norm": 3.29032039642334,
"learning_rate": 2.9965712572703834e-05,
"loss": 0.3962,
"step": 1410
},
{
"epoch": 5.461538461538462,
"grad_norm": 3.2606821060180664,
"learning_rate": 2.996401610785934e-05,
"loss": 0.4016,
"step": 1420
},
{
"epoch": 5.5,
"grad_norm": 3.013518810272217,
"learning_rate": 2.99622787319662e-05,
"loss": 0.398,
"step": 1430
},
{
"epoch": 5.538461538461538,
"grad_norm": 2.9983019828796387,
"learning_rate": 2.9960500449774338e-05,
"loss": 0.4002,
"step": 1440
},
{
"epoch": 5.576923076923077,
"grad_norm": 3.4363009929656982,
"learning_rate": 2.9958681266145517e-05,
"loss": 0.4017,
"step": 1450
},
{
"epoch": 5.615384615384615,
"grad_norm": 3.917628049850464,
"learning_rate": 2.995682118605331e-05,
"loss": 0.3938,
"step": 1460
},
{
"epoch": 5.653846153846154,
"grad_norm": 2.4763236045837402,
"learning_rate": 2.9954920214583107e-05,
"loss": 0.3976,
"step": 1470
},
{
"epoch": 5.6923076923076925,
"grad_norm": 3.2113523483276367,
"learning_rate": 2.9952978356932084e-05,
"loss": 0.3984,
"step": 1480
},
{
"epoch": 5.730769230769231,
"grad_norm": 3.3912851810455322,
"learning_rate": 2.9950995618409215e-05,
"loss": 0.3925,
"step": 1490
},
{
"epoch": 5.769230769230769,
"grad_norm": 2.927689790725708,
"learning_rate": 2.9948972004435228e-05,
"loss": 0.4036,
"step": 1500
},
{
"epoch": 5.8076923076923075,
"grad_norm": 3.0390853881835938,
"learning_rate": 2.9946907520542602e-05,
"loss": 0.3954,
"step": 1510
},
{
"epoch": 5.846153846153846,
"grad_norm": 3.3175177574157715,
"learning_rate": 2.9944802172375566e-05,
"loss": 0.3984,
"step": 1520
},
{
"epoch": 5.884615384615385,
"grad_norm": 3.478698968887329,
"learning_rate": 2.9942655965690053e-05,
"loss": 0.3971,
"step": 1530
},
{
"epoch": 5.923076923076923,
"grad_norm": 2.9315621852874756,
"learning_rate": 2.9940468906353712e-05,
"loss": 0.3989,
"step": 1540
},
{
"epoch": 5.961538461538462,
"grad_norm": 3.185990810394287,
"learning_rate": 2.9938241000345887e-05,
"loss": 0.3948,
"step": 1550
},
{
"epoch": 6.0,
"grad_norm": 4.063650608062744,
"learning_rate": 2.993597225375758e-05,
"loss": 0.3937,
"step": 1560
},
{
"epoch": 6.038461538461538,
"grad_norm": 4.225824356079102,
"learning_rate": 2.993366267279146e-05,
"loss": 0.3931,
"step": 1570
},
{
"epoch": 6.076923076923077,
"grad_norm": 3.3525872230529785,
"learning_rate": 2.993131226376183e-05,
"loss": 0.3918,
"step": 1580
},
{
"epoch": 6.115384615384615,
"grad_norm": 3.352668523788452,
"learning_rate": 2.9928921033094626e-05,
"loss": 0.3901,
"step": 1590
},
{
"epoch": 6.153846153846154,
"grad_norm": 3.500871181488037,
"learning_rate": 2.9926488987327376e-05,
"loss": 0.3905,
"step": 1600
},
{
"epoch": 6.1923076923076925,
"grad_norm": 3.547381639480591,
"learning_rate": 2.99240161331092e-05,
"loss": 0.3847,
"step": 1610
},
{
"epoch": 6.230769230769231,
"grad_norm": 3.7568917274475098,
"learning_rate": 2.992150247720079e-05,
"loss": 0.383,
"step": 1620
},
{
"epoch": 6.269230769230769,
"grad_norm": 2.754397392272949,
"learning_rate": 2.991894802647438e-05,
"loss": 0.3833,
"step": 1630
},
{
"epoch": 6.3076923076923075,
"grad_norm": 2.8265631198883057,
"learning_rate": 2.9916352787913746e-05,
"loss": 0.3875,
"step": 1640
},
{
"epoch": 6.346153846153846,
"grad_norm": 3.522829532623291,
"learning_rate": 2.991371676861417e-05,
"loss": 0.3867,
"step": 1650
},
{
"epoch": 6.384615384615385,
"grad_norm": 4.345142841339111,
"learning_rate": 2.991103997578243e-05,
"loss": 0.3754,
"step": 1660
},
{
"epoch": 6.423076923076923,
"grad_norm": 3.212238073348999,
"learning_rate": 2.9908322416736767e-05,
"loss": 0.382,
"step": 1670
},
{
"epoch": 6.461538461538462,
"grad_norm": 3.8595285415649414,
"learning_rate": 2.990556409890689e-05,
"loss": 0.3831,
"step": 1680
},
{
"epoch": 6.5,
"grad_norm": 3.5283541679382324,
"learning_rate": 2.990276502983394e-05,
"loss": 0.3785,
"step": 1690
},
{
"epoch": 6.538461538461538,
"grad_norm": 2.779311180114746,
"learning_rate": 2.9899925217170455e-05,
"loss": 0.3722,
"step": 1700
},
{
"epoch": 6.576923076923077,
"grad_norm": 3.0596842765808105,
"learning_rate": 2.989704466868038e-05,
"loss": 0.373,
"step": 1710
},
{
"epoch": 6.615384615384615,
"grad_norm": 3.0654571056365967,
"learning_rate": 2.9894123392239018e-05,
"loss": 0.3717,
"step": 1720
},
{
"epoch": 6.653846153846154,
"grad_norm": 3.738708734512329,
"learning_rate": 2.9891161395833037e-05,
"loss": 0.3652,
"step": 1730
},
{
"epoch": 6.6923076923076925,
"grad_norm": 3.2960824966430664,
"learning_rate": 2.988815868756042e-05,
"loss": 0.3606,
"step": 1740
},
{
"epoch": 6.730769230769231,
"grad_norm": 3.8682546615600586,
"learning_rate": 2.9885115275630447e-05,
"loss": 0.3616,
"step": 1750
},
{
"epoch": 6.769230769230769,
"grad_norm": 5.847049236297607,
"learning_rate": 2.9882031168363703e-05,
"loss": 0.3594,
"step": 1760
},
{
"epoch": 6.8076923076923075,
"grad_norm": 4.252906322479248,
"learning_rate": 2.9878906374192013e-05,
"loss": 0.3599,
"step": 1770
},
{
"epoch": 6.846153846153846,
"grad_norm": 3.371454954147339,
"learning_rate": 2.9875740901658446e-05,
"loss": 0.3522,
"step": 1780
},
{
"epoch": 6.884615384615385,
"grad_norm": 3.9505934715270996,
"learning_rate": 2.987253475941728e-05,
"loss": 0.3518,
"step": 1790
},
{
"epoch": 6.923076923076923,
"grad_norm": 4.2967047691345215,
"learning_rate": 2.9869287956233986e-05,
"loss": 0.3453,
"step": 1800
},
{
"epoch": 6.961538461538462,
"grad_norm": 3.76397705078125,
"learning_rate": 2.9866000500985207e-05,
"loss": 0.3433,
"step": 1810
},
{
"epoch": 7.0,
"grad_norm": 3.6324551105499268,
"learning_rate": 2.9862672402658712e-05,
"loss": 0.3322,
"step": 1820
},
{
"epoch": 7.038461538461538,
"grad_norm": 4.136635780334473,
"learning_rate": 2.98593036703534e-05,
"loss": 0.3367,
"step": 1830
},
{
"epoch": 7.076923076923077,
"grad_norm": 4.801270008087158,
"learning_rate": 2.9855894313279256e-05,
"loss": 0.3327,
"step": 1840
},
{
"epoch": 7.115384615384615,
"grad_norm": 3.6734988689422607,
"learning_rate": 2.9852444340757326e-05,
"loss": 0.33,
"step": 1850
},
{
"epoch": 7.153846153846154,
"grad_norm": 3.806365966796875,
"learning_rate": 2.9848953762219707e-05,
"loss": 0.3315,
"step": 1860
},
{
"epoch": 7.1923076923076925,
"grad_norm": 4.397286891937256,
"learning_rate": 2.984542258720951e-05,
"loss": 0.3221,
"step": 1870
},
{
"epoch": 7.230769230769231,
"grad_norm": 4.844172954559326,
"learning_rate": 2.984185082538083e-05,
"loss": 0.3104,
"step": 1880
},
{
"epoch": 7.269230769230769,
"grad_norm": 3.5059876441955566,
"learning_rate": 2.983823848649873e-05,
"loss": 0.3044,
"step": 1890
},
{
"epoch": 7.3076923076923075,
"grad_norm": 3.5109610557556152,
"learning_rate": 2.9834585580439203e-05,
"loss": 0.3145,
"step": 1900
},
{
"epoch": 7.346153846153846,
"grad_norm": 3.935485601425171,
"learning_rate": 2.9830892117189157e-05,
"loss": 0.3047,
"step": 1910
},
{
"epoch": 7.384615384615385,
"grad_norm": 4.239902019500732,
"learning_rate": 2.982715810684638e-05,
"loss": 0.3004,
"step": 1920
},
{
"epoch": 7.423076923076923,
"grad_norm": 3.725754737854004,
"learning_rate": 2.982338355961951e-05,
"loss": 0.2974,
"step": 1930
},
{
"epoch": 7.461538461538462,
"grad_norm": 4.275974750518799,
"learning_rate": 2.981956848582802e-05,
"loss": 0.2943,
"step": 1940
},
{
"epoch": 7.5,
"grad_norm": 4.536103248596191,
"learning_rate": 2.981571289590217e-05,
"loss": 0.2879,
"step": 1950
},
{
"epoch": 7.538461538461538,
"grad_norm": 5.503205299377441,
"learning_rate": 2.9811816800383003e-05,
"loss": 0.2769,
"step": 1960
},
{
"epoch": 7.576923076923077,
"grad_norm": 4.398034572601318,
"learning_rate": 2.9807880209922288e-05,
"loss": 0.2841,
"step": 1970
},
{
"epoch": 7.615384615384615,
"grad_norm": 4.218934059143066,
"learning_rate": 2.9803903135282518e-05,
"loss": 0.2787,
"step": 1980
},
{
"epoch": 7.653846153846154,
"grad_norm": 3.5821478366851807,
"learning_rate": 2.9799885587336862e-05,
"loss": 0.2669,
"step": 1990
},
{
"epoch": 7.6923076923076925,
"grad_norm": 4.140866279602051,
"learning_rate": 2.9795827577069145e-05,
"loss": 0.2558,
"step": 2000
},
{
"epoch": 7.730769230769231,
"grad_norm": 3.9685962200164795,
"learning_rate": 2.9791729115573808e-05,
"loss": 0.2493,
"step": 2010
},
{
"epoch": 7.769230769230769,
"grad_norm": 6.242632865905762,
"learning_rate": 2.9787590214055887e-05,
"loss": 0.2538,
"step": 2020
},
{
"epoch": 7.8076923076923075,
"grad_norm": 5.438572406768799,
"learning_rate": 2.9783410883830983e-05,
"loss": 0.2555,
"step": 2030
},
{
"epoch": 7.846153846153846,
"grad_norm": 4.109585762023926,
"learning_rate": 2.9779191136325233e-05,
"loss": 0.2533,
"step": 2040
},
{
"epoch": 7.884615384615385,
"grad_norm": 4.627237319946289,
"learning_rate": 2.977493098307525e-05,
"loss": 0.2371,
"step": 2050
},
{
"epoch": 7.923076923076923,
"grad_norm": 4.566162109375,
"learning_rate": 2.9770630435728142e-05,
"loss": 0.2214,
"step": 2060
},
{
"epoch": 7.961538461538462,
"grad_norm": 5.047729969024658,
"learning_rate": 2.976628950604144e-05,
"loss": 0.222,
"step": 2070
},
{
"epoch": 8.0,
"grad_norm": 5.952073574066162,
"learning_rate": 2.9761908205883073e-05,
"loss": 0.2164,
"step": 2080
},
{
"epoch": 8.038461538461538,
"grad_norm": 6.023576736450195,
"learning_rate": 2.9757486547231357e-05,
"loss": 0.2171,
"step": 2090
},
{
"epoch": 8.076923076923077,
"grad_norm": 5.403975486755371,
"learning_rate": 2.9753024542174934e-05,
"loss": 0.2039,
"step": 2100
},
{
"epoch": 8.115384615384615,
"grad_norm": 6.422732830047607,
"learning_rate": 2.9748522202912755e-05,
"loss": 0.2025,
"step": 2110
},
{
"epoch": 8.153846153846153,
"grad_norm": 4.933073043823242,
"learning_rate": 2.974397954175404e-05,
"loss": 0.1943,
"step": 2120
},
{
"epoch": 8.192307692307692,
"grad_norm": 3.9752209186553955,
"learning_rate": 2.973939657111826e-05,
"loss": 0.1947,
"step": 2130
},
{
"epoch": 8.23076923076923,
"grad_norm": 5.760082721710205,
"learning_rate": 2.9734773303535078e-05,
"loss": 0.1982,
"step": 2140
},
{
"epoch": 8.26923076923077,
"grad_norm": 4.883352279663086,
"learning_rate": 2.9730109751644325e-05,
"loss": 0.1932,
"step": 2150
},
{
"epoch": 8.307692307692308,
"grad_norm": 5.062183856964111,
"learning_rate": 2.9725405928195985e-05,
"loss": 0.1808,
"step": 2160
},
{
"epoch": 8.346153846153847,
"grad_norm": 5.349424362182617,
"learning_rate": 2.9720661846050123e-05,
"loss": 0.1648,
"step": 2170
},
{
"epoch": 8.384615384615385,
"grad_norm": 4.347816467285156,
"learning_rate": 2.971587751817688e-05,
"loss": 0.1733,
"step": 2180
},
{
"epoch": 8.423076923076923,
"grad_norm": 6.218843460083008,
"learning_rate": 2.9711052957656425e-05,
"loss": 0.1568,
"step": 2190
},
{
"epoch": 8.461538461538462,
"grad_norm": 4.51317024230957,
"learning_rate": 2.9706188177678924e-05,
"loss": 0.1544,
"step": 2200
},
{
"epoch": 8.5,
"grad_norm": 6.40587854385376,
"learning_rate": 2.97012831915445e-05,
"loss": 0.1501,
"step": 2210
},
{
"epoch": 8.538461538461538,
"grad_norm": 3.9102859497070312,
"learning_rate": 2.96963380126632e-05,
"loss": 0.1443,
"step": 2220
},
{
"epoch": 8.576923076923077,
"grad_norm": 5.443079948425293,
"learning_rate": 2.9691352654554953e-05,
"loss": 0.1472,
"step": 2230
},
{
"epoch": 8.615384615384615,
"grad_norm": 6.355679988861084,
"learning_rate": 2.9686327130849536e-05,
"loss": 0.1383,
"step": 2240
},
{
"epoch": 8.653846153846153,
"grad_norm": 6.52142333984375,
"learning_rate": 2.9681261455286538e-05,
"loss": 0.1482,
"step": 2250
},
{
"epoch": 8.692307692307692,
"grad_norm": 5.2175703048706055,
"learning_rate": 2.9676155641715318e-05,
"loss": 0.1405,
"step": 2260
},
{
"epoch": 8.73076923076923,
"grad_norm": 5.178348541259766,
"learning_rate": 2.9671009704094988e-05,
"loss": 0.1307,
"step": 2270
},
{
"epoch": 8.76923076923077,
"grad_norm": 5.276549816131592,
"learning_rate": 2.9665823656494335e-05,
"loss": 0.1402,
"step": 2280
},
{
"epoch": 8.807692307692308,
"grad_norm": 4.72429084777832,
"learning_rate": 2.9660597513091824e-05,
"loss": 0.129,
"step": 2290
},
{
"epoch": 8.846153846153847,
"grad_norm": 4.0139312744140625,
"learning_rate": 2.965533128817552e-05,
"loss": 0.1181,
"step": 2300
},
{
"epoch": 8.884615384615385,
"grad_norm": 4.549901485443115,
"learning_rate": 2.9650024996143084e-05,
"loss": 0.1293,
"step": 2310
},
{
"epoch": 8.923076923076923,
"grad_norm": 4.0546183586120605,
"learning_rate": 2.964467865150172e-05,
"loss": 0.1098,
"step": 2320
},
{
"epoch": 8.961538461538462,
"grad_norm": 5.741022109985352,
"learning_rate": 2.9639292268868133e-05,
"loss": 0.1255,
"step": 2330
},
{
"epoch": 9.0,
"grad_norm": 5.321122169494629,
"learning_rate": 2.9633865862968478e-05,
"loss": 0.112,
"step": 2340
},
{
"epoch": 9.038461538461538,
"grad_norm": 4.528491020202637,
"learning_rate": 2.9628399448638352e-05,
"loss": 0.1198,
"step": 2350
},
{
"epoch": 9.076923076923077,
"grad_norm": 3.827465295791626,
"learning_rate": 2.9622893040822714e-05,
"loss": 0.1039,
"step": 2360
},
{
"epoch": 9.115384615384615,
"grad_norm": 4.378634452819824,
"learning_rate": 2.9617346654575875e-05,
"loss": 0.1049,
"step": 2370
},
{
"epoch": 9.153846153846153,
"grad_norm": 4.433044910430908,
"learning_rate": 2.9611760305061447e-05,
"loss": 0.099,
"step": 2380
},
{
"epoch": 9.192307692307692,
"grad_norm": 4.418703556060791,
"learning_rate": 2.9606134007552292e-05,
"loss": 0.1097,
"step": 2390
},
{
"epoch": 9.23076923076923,
"grad_norm": 4.773401737213135,
"learning_rate": 2.9600467777430497e-05,
"loss": 0.1039,
"step": 2400
},
{
"epoch": 9.26923076923077,
"grad_norm": 5.771136283874512,
"learning_rate": 2.9594761630187312e-05,
"loss": 0.0954,
"step": 2410
},
{
"epoch": 9.307692307692308,
"grad_norm": 4.874659061431885,
"learning_rate": 2.9589015581423132e-05,
"loss": 0.0973,
"step": 2420
},
{
"epoch": 9.346153846153847,
"grad_norm": 5.661646366119385,
"learning_rate": 2.958322964684743e-05,
"loss": 0.0958,
"step": 2430
},
{
"epoch": 9.384615384615385,
"grad_norm": 5.317569732666016,
"learning_rate": 2.9577403842278735e-05,
"loss": 0.1032,
"step": 2440
},
{
"epoch": 9.423076923076923,
"grad_norm": 4.762622356414795,
"learning_rate": 2.957153818364457e-05,
"loss": 0.0972,
"step": 2450
},
{
"epoch": 9.461538461538462,
"grad_norm": 5.307681083679199,
"learning_rate": 2.9565632686981428e-05,
"loss": 0.1038,
"step": 2460
},
{
"epoch": 9.5,
"grad_norm": 3.2064321041107178,
"learning_rate": 2.9559687368434702e-05,
"loss": 0.0902,
"step": 2470
},
{
"epoch": 9.538461538461538,
"grad_norm": 4.020598888397217,
"learning_rate": 2.9553702244258674e-05,
"loss": 0.0886,
"step": 2480
},
{
"epoch": 9.576923076923077,
"grad_norm": 5.292928218841553,
"learning_rate": 2.954767733081644e-05,
"loss": 0.0839,
"step": 2490
},
{
"epoch": 9.615384615384615,
"grad_norm": 4.200097560882568,
"learning_rate": 2.9541612644579887e-05,
"loss": 0.0943,
"step": 2500
},
{
"epoch": 9.653846153846153,
"grad_norm": 4.739330291748047,
"learning_rate": 2.9535508202129634e-05,
"loss": 0.0903,
"step": 2510
},
{
"epoch": 9.692307692307692,
"grad_norm": 4.430630683898926,
"learning_rate": 2.9529364020154994e-05,
"loss": 0.0819,
"step": 2520
},
{
"epoch": 9.73076923076923,
"grad_norm": 4.4350481033325195,
"learning_rate": 2.9523180115453922e-05,
"loss": 0.0798,
"step": 2530
},
{
"epoch": 9.76923076923077,
"grad_norm": 4.632157325744629,
"learning_rate": 2.9516956504932984e-05,
"loss": 0.0917,
"step": 2540
},
{
"epoch": 9.807692307692308,
"grad_norm": 3.786306619644165,
"learning_rate": 2.9510693205607286e-05,
"loss": 0.0978,
"step": 2550
},
{
"epoch": 9.846153846153847,
"grad_norm": 4.0172929763793945,
"learning_rate": 2.9504390234600456e-05,
"loss": 0.0884,
"step": 2560
},
{
"epoch": 9.884615384615385,
"grad_norm": 4.171654224395752,
"learning_rate": 2.9498047609144577e-05,
"loss": 0.0821,
"step": 2570
},
{
"epoch": 9.923076923076923,
"grad_norm": 5.381162643432617,
"learning_rate": 2.9491665346580134e-05,
"loss": 0.0872,
"step": 2580
},
{
"epoch": 9.961538461538462,
"grad_norm": 3.150331497192383,
"learning_rate": 2.9485243464356e-05,
"loss": 0.0859,
"step": 2590
},
{
"epoch": 10.0,
"grad_norm": 4.065011978149414,
"learning_rate": 2.9478781980029352e-05,
"loss": 0.0823,
"step": 2600
},
{
"epoch": 10.038461538461538,
"grad_norm": 3.450819492340088,
"learning_rate": 2.9472280911265642e-05,
"loss": 0.079,
"step": 2610
},
{
"epoch": 10.076923076923077,
"grad_norm": 3.7067441940307617,
"learning_rate": 2.9465740275838543e-05,
"loss": 0.0802,
"step": 2620
},
{
"epoch": 10.115384615384615,
"grad_norm": 3.485081434249878,
"learning_rate": 2.94591600916299e-05,
"loss": 0.0772,
"step": 2630
},
{
"epoch": 10.153846153846153,
"grad_norm": 2.740015983581543,
"learning_rate": 2.9452540376629692e-05,
"loss": 0.0712,
"step": 2640
},
{
"epoch": 10.192307692307692,
"grad_norm": 3.398298501968384,
"learning_rate": 2.944588114893596e-05,
"loss": 0.0826,
"step": 2650
},
{
"epoch": 10.23076923076923,
"grad_norm": 3.7506258487701416,
"learning_rate": 2.9439182426754784e-05,
"loss": 0.0896,
"step": 2660
},
{
"epoch": 10.26923076923077,
"grad_norm": 3.0388355255126953,
"learning_rate": 2.9432444228400208e-05,
"loss": 0.0794,
"step": 2670
},
{
"epoch": 10.307692307692308,
"grad_norm": 2.9595770835876465,
"learning_rate": 2.9425666572294218e-05,
"loss": 0.0823,
"step": 2680
},
{
"epoch": 10.346153846153847,
"grad_norm": 3.7701125144958496,
"learning_rate": 2.941884947696666e-05,
"loss": 0.0777,
"step": 2690
},
{
"epoch": 10.384615384615385,
"grad_norm": 3.5903477668762207,
"learning_rate": 2.9411992961055214e-05,
"loss": 0.0864,
"step": 2700
},
{
"epoch": 10.423076923076923,
"grad_norm": 3.5986275672912598,
"learning_rate": 2.9405097043305334e-05,
"loss": 0.0809,
"step": 2710
},
{
"epoch": 10.461538461538462,
"grad_norm": 3.5043976306915283,
"learning_rate": 2.9398161742570196e-05,
"loss": 0.0787,
"step": 2720
},
{
"epoch": 10.5,
"grad_norm": 3.9481887817382812,
"learning_rate": 2.9391187077810644e-05,
"loss": 0.0828,
"step": 2730
},
{
"epoch": 10.538461538461538,
"grad_norm": 3.953305721282959,
"learning_rate": 2.9384173068095145e-05,
"loss": 0.079,
"step": 2740
},
{
"epoch": 10.576923076923077,
"grad_norm": 4.111423492431641,
"learning_rate": 2.937711973259974e-05,
"loss": 0.0702,
"step": 2750
},
{
"epoch": 10.615384615384615,
"grad_norm": 3.0147547721862793,
"learning_rate": 2.9370027090607974e-05,
"loss": 0.0767,
"step": 2760
},
{
"epoch": 10.653846153846153,
"grad_norm": 3.451683759689331,
"learning_rate": 2.936289516151086e-05,
"loss": 0.0724,
"step": 2770
},
{
"epoch": 10.692307692307692,
"grad_norm": 3.6806628704071045,
"learning_rate": 2.935572396480682e-05,
"loss": 0.0753,
"step": 2780
},
{
"epoch": 10.73076923076923,
"grad_norm": 3.9452641010284424,
"learning_rate": 2.9348513520101636e-05,
"loss": 0.0692,
"step": 2790
},
{
"epoch": 10.76923076923077,
"grad_norm": 3.2497544288635254,
"learning_rate": 2.9341263847108383e-05,
"loss": 0.0745,
"step": 2800
},
{
"epoch": 10.807692307692308,
"grad_norm": 3.549978494644165,
"learning_rate": 2.933397496564739e-05,
"loss": 0.0749,
"step": 2810
},
{
"epoch": 10.846153846153847,
"grad_norm": 3.3335323333740234,
"learning_rate": 2.9326646895646178e-05,
"loss": 0.0692,
"step": 2820
},
{
"epoch": 10.884615384615385,
"grad_norm": 3.405339479446411,
"learning_rate": 2.931927965713942e-05,
"loss": 0.0746,
"step": 2830
},
{
"epoch": 10.923076923076923,
"grad_norm": 3.82893443107605,
"learning_rate": 2.931187327026886e-05,
"loss": 0.0721,
"step": 2840
},
{
"epoch": 10.961538461538462,
"grad_norm": 3.7267136573791504,
"learning_rate": 2.9304427755283278e-05,
"loss": 0.0734,
"step": 2850
},
{
"epoch": 11.0,
"grad_norm": 3.892670154571533,
"learning_rate": 2.9296943132538425e-05,
"loss": 0.0695,
"step": 2860
},
{
"epoch": 11.038461538461538,
"grad_norm": 3.6918320655822754,
"learning_rate": 2.928941942249697e-05,
"loss": 0.0715,
"step": 2870
},
{
"epoch": 11.076923076923077,
"grad_norm": 3.9371860027313232,
"learning_rate": 2.928185664572846e-05,
"loss": 0.0698,
"step": 2880
},
{
"epoch": 11.115384615384615,
"grad_norm": 3.618164539337158,
"learning_rate": 2.927425482290923e-05,
"loss": 0.0722,
"step": 2890
},
{
"epoch": 11.153846153846153,
"grad_norm": 3.7436001300811768,
"learning_rate": 2.926661397482238e-05,
"loss": 0.0763,
"step": 2900
},
{
"epoch": 11.192307692307692,
"grad_norm": 3.660889148712158,
"learning_rate": 2.9258934122357685e-05,
"loss": 0.0917,
"step": 2910
},
{
"epoch": 11.23076923076923,
"grad_norm": 3.8613357543945312,
"learning_rate": 2.9251215286511574e-05,
"loss": 0.072,
"step": 2920
},
{
"epoch": 11.26923076923077,
"grad_norm": 3.738790273666382,
"learning_rate": 2.924345748838706e-05,
"loss": 0.0709,
"step": 2930
},
{
"epoch": 11.307692307692308,
"grad_norm": 3.2637104988098145,
"learning_rate": 2.923566074919365e-05,
"loss": 0.0707,
"step": 2940
},
{
"epoch": 11.346153846153847,
"grad_norm": 2.518782615661621,
"learning_rate": 2.9227825090247346e-05,
"loss": 0.0679,
"step": 2950
},
{
"epoch": 11.384615384615385,
"grad_norm": 3.1399145126342773,
"learning_rate": 2.9219950532970526e-05,
"loss": 0.0806,
"step": 2960
},
{
"epoch": 11.423076923076923,
"grad_norm": 3.190922498703003,
"learning_rate": 2.921203709889194e-05,
"loss": 0.0766,
"step": 2970
},
{
"epoch": 11.461538461538462,
"grad_norm": 3.0159084796905518,
"learning_rate": 2.9204084809646607e-05,
"loss": 0.0671,
"step": 2980
},
{
"epoch": 11.5,
"grad_norm": 3.547037124633789,
"learning_rate": 2.9196093686975793e-05,
"loss": 0.0661,
"step": 2990
},
{
"epoch": 11.538461538461538,
"grad_norm": 3.7780942916870117,
"learning_rate": 2.918806375272691e-05,
"loss": 0.0772,
"step": 3000
},
{
"epoch": 11.576923076923077,
"grad_norm": 3.440263509750366,
"learning_rate": 2.9179995028853498e-05,
"loss": 0.0697,
"step": 3010
},
{
"epoch": 11.615384615384615,
"grad_norm": 3.568605661392212,
"learning_rate": 2.917188753741514e-05,
"loss": 0.0779,
"step": 3020
},
{
"epoch": 11.653846153846153,
"grad_norm": 3.969608783721924,
"learning_rate": 2.916374130057741e-05,
"loss": 0.0684,
"step": 3030
},
{
"epoch": 11.692307692307692,
"grad_norm": 3.102283000946045,
"learning_rate": 2.91555563406118e-05,
"loss": 0.0624,
"step": 3040
},
{
"epoch": 11.73076923076923,
"grad_norm": 2.713280200958252,
"learning_rate": 2.9147332679895683e-05,
"loss": 0.0723,
"step": 3050
},
{
"epoch": 11.76923076923077,
"grad_norm": 3.4983530044555664,
"learning_rate": 2.9139070340912236e-05,
"loss": 0.0714,
"step": 3060
},
{
"epoch": 11.807692307692308,
"grad_norm": 3.3425889015197754,
"learning_rate": 2.9130769346250376e-05,
"loss": 0.0776,
"step": 3070
},
{
"epoch": 11.846153846153847,
"grad_norm": 3.1763546466827393,
"learning_rate": 2.9122429718604704e-05,
"loss": 0.0705,
"step": 3080
},
{
"epoch": 11.884615384615385,
"grad_norm": 2.6937384605407715,
"learning_rate": 2.911405148077545e-05,
"loss": 0.0691,
"step": 3090
},
{
"epoch": 11.923076923076923,
"grad_norm": 2.8746800422668457,
"learning_rate": 2.9105634655668385e-05,
"loss": 0.0715,
"step": 3100
},
{
"epoch": 11.961538461538462,
"grad_norm": 3.535343647003174,
"learning_rate": 2.9097179266294794e-05,
"loss": 0.0754,
"step": 3110
},
{
"epoch": 12.0,
"grad_norm": 3.433488607406616,
"learning_rate": 2.9088685335771396e-05,
"loss": 0.0619,
"step": 3120
},
{
"epoch": 12.038461538461538,
"grad_norm": 3.0864665508270264,
"learning_rate": 2.9080152887320255e-05,
"loss": 0.0732,
"step": 3130
},
{
"epoch": 12.076923076923077,
"grad_norm": 3.4718246459960938,
"learning_rate": 2.9071581944268778e-05,
"loss": 0.0702,
"step": 3140
},
{
"epoch": 12.115384615384615,
"grad_norm": 3.164069175720215,
"learning_rate": 2.906297253004958e-05,
"loss": 0.0692,
"step": 3150
},
{
"epoch": 12.153846153846153,
"grad_norm": 2.8084299564361572,
"learning_rate": 2.9054324668200483e-05,
"loss": 0.0681,
"step": 3160
},
{
"epoch": 12.192307692307692,
"grad_norm": 2.749694347381592,
"learning_rate": 2.9045638382364404e-05,
"loss": 0.0712,
"step": 3170
},
{
"epoch": 12.23076923076923,
"grad_norm": 2.85776948928833,
"learning_rate": 2.9036913696289318e-05,
"loss": 0.0653,
"step": 3180
},
{
"epoch": 12.26923076923077,
"grad_norm": 2.447458505630493,
"learning_rate": 2.9028150633828186e-05,
"loss": 0.068,
"step": 3190
},
{
"epoch": 12.307692307692308,
"grad_norm": 2.5854172706604004,
"learning_rate": 2.9019349218938887e-05,
"loss": 0.0759,
"step": 3200
},
{
"epoch": 12.346153846153847,
"grad_norm": 2.299642324447632,
"learning_rate": 2.9010509475684146e-05,
"loss": 0.0663,
"step": 3210
},
{
"epoch": 12.384615384615385,
"grad_norm": 2.9513206481933594,
"learning_rate": 2.900163142823149e-05,
"loss": 0.0608,
"step": 3220
},
{
"epoch": 12.423076923076923,
"grad_norm": 3.0884900093078613,
"learning_rate": 2.8992715100853166e-05,
"loss": 0.0658,
"step": 3230
},
{
"epoch": 12.461538461538462,
"grad_norm": 3.2073280811309814,
"learning_rate": 2.898376051792606e-05,
"loss": 0.0596,
"step": 3240
},
{
"epoch": 12.5,
"grad_norm": 2.7891628742218018,
"learning_rate": 2.897476770393167e-05,
"loss": 0.0592,
"step": 3250
},
{
"epoch": 12.538461538461538,
"grad_norm": 3.262420892715454,
"learning_rate": 2.8965736683456e-05,
"loss": 0.0762,
"step": 3260
},
{
"epoch": 12.576923076923077,
"grad_norm": 3.5874862670898438,
"learning_rate": 2.895666748118952e-05,
"loss": 0.0682,
"step": 3270
},
{
"epoch": 12.615384615384615,
"grad_norm": 3.6109778881073,
"learning_rate": 2.8947560121927077e-05,
"loss": 0.0615,
"step": 3280
},
{
"epoch": 12.653846153846153,
"grad_norm": 3.210784435272217,
"learning_rate": 2.8938414630567852e-05,
"loss": 0.0658,
"step": 3290
},
{
"epoch": 12.692307692307692,
"grad_norm": 3.263617992401123,
"learning_rate": 2.892923103211526e-05,
"loss": 0.0607,
"step": 3300
},
{
"epoch": 12.73076923076923,
"grad_norm": 3.226895332336426,
"learning_rate": 2.892000935167691e-05,
"loss": 0.058,
"step": 3310
},
{
"epoch": 12.76923076923077,
"grad_norm": 3.4264261722564697,
"learning_rate": 2.8910749614464536e-05,
"loss": 0.0587,
"step": 3320
},
{
"epoch": 12.807692307692308,
"grad_norm": 3.2235159873962402,
"learning_rate": 2.890145184579389e-05,
"loss": 0.068,
"step": 3330
},
{
"epoch": 12.846153846153847,
"grad_norm": 3.570420503616333,
"learning_rate": 2.8892116071084727e-05,
"loss": 0.0631,
"step": 3340
},
{
"epoch": 12.884615384615385,
"grad_norm": 3.176832914352417,
"learning_rate": 2.8882742315860692e-05,
"loss": 0.0683,
"step": 3350
},
{
"epoch": 12.923076923076923,
"grad_norm": 3.430499792098999,
"learning_rate": 2.8873330605749275e-05,
"loss": 0.0616,
"step": 3360
},
{
"epoch": 12.961538461538462,
"grad_norm": 3.4460554122924805,
"learning_rate": 2.886388096648174e-05,
"loss": 0.0755,
"step": 3370
},
{
"epoch": 13.0,
"grad_norm": 3.3051247596740723,
"learning_rate": 2.8854393423893024e-05,
"loss": 0.0663,
"step": 3380
},
{
"epoch": 13.038461538461538,
"grad_norm": 3.3149800300598145,
"learning_rate": 2.8844868003921723e-05,
"loss": 0.0687,
"step": 3390
},
{
"epoch": 13.076923076923077,
"grad_norm": 3.245965003967285,
"learning_rate": 2.8835304732609962e-05,
"loss": 0.0726,
"step": 3400
},
{
"epoch": 13.115384615384615,
"grad_norm": 2.479734420776367,
"learning_rate": 2.882570363610336e-05,
"loss": 0.064,
"step": 3410
},
{
"epoch": 13.153846153846153,
"grad_norm": 3.601377487182617,
"learning_rate": 2.8816064740650954e-05,
"loss": 0.0675,
"step": 3420
},
{
"epoch": 13.192307692307692,
"grad_norm": 3.4560110569000244,
"learning_rate": 2.880638807260511e-05,
"loss": 0.0607,
"step": 3430
},
{
"epoch": 13.23076923076923,
"grad_norm": 2.8063180446624756,
"learning_rate": 2.8796673658421472e-05,
"loss": 0.0647,
"step": 3440
},
{
"epoch": 13.26923076923077,
"grad_norm": 2.8318352699279785,
"learning_rate": 2.8786921524658877e-05,
"loss": 0.057,
"step": 3450
},
{
"epoch": 13.307692307692308,
"grad_norm": 2.682603597640991,
"learning_rate": 2.8777131697979283e-05,
"loss": 0.0701,
"step": 3460
},
{
"epoch": 13.346153846153847,
"grad_norm": 2.213459014892578,
"learning_rate": 2.876730420514771e-05,
"loss": 0.0572,
"step": 3470
},
{
"epoch": 13.384615384615385,
"grad_norm": 3.3933420181274414,
"learning_rate": 2.8757439073032136e-05,
"loss": 0.0645,
"step": 3480
},
{
"epoch": 13.423076923076923,
"grad_norm": 2.8533976078033447,
"learning_rate": 2.874753632860347e-05,
"loss": 0.064,
"step": 3490
},
{
"epoch": 13.461538461538462,
"grad_norm": 2.714489459991455,
"learning_rate": 2.873759599893543e-05,
"loss": 0.0729,
"step": 3500
},
{
"epoch": 13.5,
"grad_norm": 2.61195969581604,
"learning_rate": 2.8727618111204494e-05,
"loss": 0.0665,
"step": 3510
},
{
"epoch": 13.538461538461538,
"grad_norm": 2.846728801727295,
"learning_rate": 2.871760269268983e-05,
"loss": 0.0595,
"step": 3520
},
{
"epoch": 13.576923076923077,
"grad_norm": 2.8665387630462646,
"learning_rate": 2.870754977077321e-05,
"loss": 0.0626,
"step": 3530
},
{
"epoch": 13.615384615384615,
"grad_norm": 2.974710464477539,
"learning_rate": 2.869745937293894e-05,
"loss": 0.0685,
"step": 3540
},
{
"epoch": 13.653846153846153,
"grad_norm": 2.7472543716430664,
"learning_rate": 2.8687331526773775e-05,
"loss": 0.0639,
"step": 3550
},
{
"epoch": 13.692307692307692,
"grad_norm": 2.984053611755371,
"learning_rate": 2.867716625996687e-05,
"loss": 0.0642,
"step": 3560
},
{
"epoch": 13.73076923076923,
"grad_norm": 2.6401329040527344,
"learning_rate": 2.8666963600309672e-05,
"loss": 0.0698,
"step": 3570
},
{
"epoch": 13.76923076923077,
"grad_norm": 3.483025550842285,
"learning_rate": 2.8656723575695862e-05,
"loss": 0.0633,
"step": 3580
},
{
"epoch": 13.807692307692308,
"grad_norm": 3.1316492557525635,
"learning_rate": 2.8646446214121276e-05,
"loss": 0.0599,
"step": 3590
},
{
"epoch": 13.846153846153847,
"grad_norm": 2.7110755443573,
"learning_rate": 2.8636131543683828e-05,
"loss": 0.0657,
"step": 3600
},
{
"epoch": 13.884615384615385,
"grad_norm": 2.8898301124572754,
"learning_rate": 2.8625779592583436e-05,
"loss": 0.0569,
"step": 3610
},
{
"epoch": 13.923076923076923,
"grad_norm": 3.3383960723876953,
"learning_rate": 2.861539038912193e-05,
"loss": 0.0618,
"step": 3620
},
{
"epoch": 13.961538461538462,
"grad_norm": 2.7277491092681885,
"learning_rate": 2.860496396170301e-05,
"loss": 0.0626,
"step": 3630
},
{
"epoch": 14.0,
"grad_norm": 3.5116193294525146,
"learning_rate": 2.859450033883212e-05,
"loss": 0.0632,
"step": 3640
},
{
"epoch": 14.038461538461538,
"grad_norm": 3.129054069519043,
"learning_rate": 2.8583999549116413e-05,
"loss": 0.065,
"step": 3650
},
{
"epoch": 14.076923076923077,
"grad_norm": 2.848735809326172,
"learning_rate": 2.857346162126464e-05,
"loss": 0.0536,
"step": 3660
},
{
"epoch": 14.115384615384615,
"grad_norm": 2.7207109928131104,
"learning_rate": 2.8562886584087092e-05,
"loss": 0.0624,
"step": 3670
},
{
"epoch": 14.153846153846153,
"grad_norm": 2.6145403385162354,
"learning_rate": 2.8552274466495525e-05,
"loss": 0.0638,
"step": 3680
},
{
"epoch": 14.192307692307692,
"grad_norm": 2.5214338302612305,
"learning_rate": 2.8541625297503056e-05,
"loss": 0.0645,
"step": 3690
},
{
"epoch": 14.23076923076923,
"grad_norm": 2.453113555908203,
"learning_rate": 2.8530939106224106e-05,
"loss": 0.059,
"step": 3700
},
{
"epoch": 14.26923076923077,
"grad_norm": 2.9459733963012695,
"learning_rate": 2.8520215921874325e-05,
"loss": 0.0619,
"step": 3710
},
{
"epoch": 14.307692307692308,
"grad_norm": 3.1279542446136475,
"learning_rate": 2.850945577377048e-05,
"loss": 0.0652,
"step": 3720
},
{
"epoch": 14.346153846153847,
"grad_norm": 3.1692466735839844,
"learning_rate": 2.8498658691330406e-05,
"loss": 0.056,
"step": 3730
},
{
"epoch": 14.384615384615385,
"grad_norm": 2.7388417720794678,
"learning_rate": 2.8487824704072913e-05,
"loss": 0.0641,
"step": 3740
},
{
"epoch": 14.423076923076923,
"grad_norm": 3.1448442935943604,
"learning_rate": 2.8476953841617713e-05,
"loss": 0.063,
"step": 3750
},
{
"epoch": 14.461538461538462,
"grad_norm": 2.5632693767547607,
"learning_rate": 2.846604613368532e-05,
"loss": 0.0562,
"step": 3760
},
{
"epoch": 14.5,
"grad_norm": 2.959221601486206,
"learning_rate": 2.8455101610097002e-05,
"loss": 0.0675,
"step": 3770
},
{
"epoch": 14.538461538461538,
"grad_norm": 2.6775829792022705,
"learning_rate": 2.8444120300774666e-05,
"loss": 0.0606,
"step": 3780
},
{
"epoch": 14.576923076923077,
"grad_norm": 2.562490940093994,
"learning_rate": 2.8433102235740788e-05,
"loss": 0.0573,
"step": 3790
},
{
"epoch": 14.615384615384615,
"grad_norm": 2.8279271125793457,
"learning_rate": 2.842204744511834e-05,
"loss": 0.0595,
"step": 3800
},
{
"epoch": 14.653846153846153,
"grad_norm": 2.3390533924102783,
"learning_rate": 2.8410955959130693e-05,
"loss": 0.0533,
"step": 3810
},
{
"epoch": 14.692307692307692,
"grad_norm": 2.221428871154785,
"learning_rate": 2.8399827808101554e-05,
"loss": 0.0623,
"step": 3820
},
{
"epoch": 14.73076923076923,
"grad_norm": 2.7612838745117188,
"learning_rate": 2.8388663022454857e-05,
"loss": 0.0616,
"step": 3830
},
{
"epoch": 14.76923076923077,
"grad_norm": 2.730405330657959,
"learning_rate": 2.83774616327147e-05,
"loss": 0.0575,
"step": 3840
},
{
"epoch": 14.807692307692308,
"grad_norm": 2.5589001178741455,
"learning_rate": 2.836622366950526e-05,
"loss": 0.0576,
"step": 3850
},
{
"epoch": 14.846153846153847,
"grad_norm": 2.984184741973877,
"learning_rate": 2.835494916355069e-05,
"loss": 0.0635,
"step": 3860
},
{
"epoch": 14.884615384615385,
"grad_norm": 2.6934828758239746,
"learning_rate": 2.8343638145675072e-05,
"loss": 0.0616,
"step": 3870
},
{
"epoch": 14.923076923076923,
"grad_norm": 2.6913018226623535,
"learning_rate": 2.8332290646802282e-05,
"loss": 0.0601,
"step": 3880
},
{
"epoch": 14.961538461538462,
"grad_norm": 2.394535779953003,
"learning_rate": 2.8320906697955963e-05,
"loss": 0.0562,
"step": 3890
},
{
"epoch": 15.0,
"grad_norm": 2.9435136318206787,
"learning_rate": 2.8309486330259385e-05,
"loss": 0.0626,
"step": 3900
},
{
"epoch": 15.038461538461538,
"grad_norm": 2.9360132217407227,
"learning_rate": 2.82980295749354e-05,
"loss": 0.0624,
"step": 3910
},
{
"epoch": 15.076923076923077,
"grad_norm": 3.197340965270996,
"learning_rate": 2.828653646330634e-05,
"loss": 0.0612,
"step": 3920
},
{
"epoch": 15.115384615384615,
"grad_norm": 2.8745882511138916,
"learning_rate": 2.8275007026793938e-05,
"loss": 0.0597,
"step": 3930
},
{
"epoch": 15.153846153846153,
"grad_norm": 2.903532028198242,
"learning_rate": 2.826344129691923e-05,
"loss": 0.0632,
"step": 3940
},
{
"epoch": 15.192307692307692,
"grad_norm": 2.7144417762756348,
"learning_rate": 2.8251839305302478e-05,
"loss": 0.0636,
"step": 3950
},
{
"epoch": 15.23076923076923,
"grad_norm": 2.7780587673187256,
"learning_rate": 2.8240201083663088e-05,
"loss": 0.0569,
"step": 3960
},
{
"epoch": 15.26923076923077,
"grad_norm": 2.372857093811035,
"learning_rate": 2.8228526663819504e-05,
"loss": 0.0615,
"step": 3970
},
{
"epoch": 15.307692307692308,
"grad_norm": 2.953294277191162,
"learning_rate": 2.8216816077689158e-05,
"loss": 0.0569,
"step": 3980
},
{
"epoch": 15.346153846153847,
"grad_norm": 3.1452503204345703,
"learning_rate": 2.8205069357288337e-05,
"loss": 0.0555,
"step": 3990
},
{
"epoch": 15.384615384615385,
"grad_norm": 2.4111666679382324,
"learning_rate": 2.8193286534732128e-05,
"loss": 0.0612,
"step": 4000
},
{
"epoch": 15.423076923076923,
"grad_norm": 2.9192652702331543,
"learning_rate": 2.8181467642234317e-05,
"loss": 0.0578,
"step": 4010
},
{
"epoch": 15.461538461538462,
"grad_norm": 2.613917589187622,
"learning_rate": 2.8169612712107306e-05,
"loss": 0.0553,
"step": 4020
},
{
"epoch": 15.5,
"grad_norm": 2.7946548461914062,
"learning_rate": 2.8157721776762017e-05,
"loss": 0.0632,
"step": 4030
},
{
"epoch": 15.538461538461538,
"grad_norm": 2.815171003341675,
"learning_rate": 2.814579486870782e-05,
"loss": 0.0602,
"step": 4040
},
{
"epoch": 15.576923076923077,
"grad_norm": 2.570457696914673,
"learning_rate": 2.813383202055242e-05,
"loss": 0.0551,
"step": 4050
},
{
"epoch": 15.615384615384615,
"grad_norm": 2.0170300006866455,
"learning_rate": 2.8121833265001792e-05,
"loss": 0.0642,
"step": 4060
},
{
"epoch": 15.653846153846153,
"grad_norm": 2.3233728408813477,
"learning_rate": 2.8109798634860072e-05,
"loss": 0.0588,
"step": 4070
},
{
"epoch": 15.692307692307692,
"grad_norm": 2.1397790908813477,
"learning_rate": 2.8097728163029482e-05,
"loss": 0.0529,
"step": 4080
},
{
"epoch": 15.73076923076923,
"grad_norm": 2.950807571411133,
"learning_rate": 2.8085621882510233e-05,
"loss": 0.0569,
"step": 4090
},
{
"epoch": 15.76923076923077,
"grad_norm": 2.839064121246338,
"learning_rate": 2.8073479826400425e-05,
"loss": 0.0615,
"step": 4100
},
{
"epoch": 15.807692307692308,
"grad_norm": 2.629194736480713,
"learning_rate": 2.806130202789598e-05,
"loss": 0.0596,
"step": 4110
},
{
"epoch": 15.846153846153847,
"grad_norm": 2.859807252883911,
"learning_rate": 2.804908852029054e-05,
"loss": 0.0568,
"step": 4120
},
{
"epoch": 15.884615384615385,
"grad_norm": 2.6774954795837402,
"learning_rate": 2.8036839336975367e-05,
"loss": 0.0573,
"step": 4130
},
{
"epoch": 15.923076923076923,
"grad_norm": 3.763676166534424,
"learning_rate": 2.8024554511439253e-05,
"loss": 0.0612,
"step": 4140
},
{
"epoch": 15.961538461538462,
"grad_norm": 2.945574998855591,
"learning_rate": 2.801223407726844e-05,
"loss": 0.0553,
"step": 4150
},
{
"epoch": 16.0,
"grad_norm": 2.359447717666626,
"learning_rate": 2.7999878068146537e-05,
"loss": 0.049,
"step": 4160
},
{
"epoch": 16.03846153846154,
"grad_norm": 2.7345235347747803,
"learning_rate": 2.7987486517854396e-05,
"loss": 0.0595,
"step": 4170
},
{
"epoch": 16.076923076923077,
"grad_norm": 2.2802300453186035,
"learning_rate": 2.7975059460270037e-05,
"loss": 0.0622,
"step": 4180
},
{
"epoch": 16.115384615384617,
"grad_norm": 2.078882932662964,
"learning_rate": 2.7962596929368566e-05,
"loss": 0.0585,
"step": 4190
},
{
"epoch": 16.153846153846153,
"grad_norm": 2.182929039001465,
"learning_rate": 2.795009895922207e-05,
"loss": 0.0536,
"step": 4200
},
{
"epoch": 16.192307692307693,
"grad_norm": 3.1578166484832764,
"learning_rate": 2.7937565583999513e-05,
"loss": 0.0517,
"step": 4210
},
{
"epoch": 16.23076923076923,
"grad_norm": 2.627866268157959,
"learning_rate": 2.792499683796667e-05,
"loss": 0.0542,
"step": 4220
},
{
"epoch": 16.26923076923077,
"grad_norm": 2.430981159210205,
"learning_rate": 2.791239275548601e-05,
"loss": 0.0484,
"step": 4230
},
{
"epoch": 16.307692307692307,
"grad_norm": 2.130453586578369,
"learning_rate": 2.789975337101662e-05,
"loss": 0.0603,
"step": 4240
},
{
"epoch": 16.346153846153847,
"grad_norm": 2.2407262325286865,
"learning_rate": 2.788707871911409e-05,
"loss": 0.0606,
"step": 4250
},
{
"epoch": 16.384615384615383,
"grad_norm": 2.404421091079712,
"learning_rate": 2.7874368834430426e-05,
"loss": 0.0561,
"step": 4260
},
{
"epoch": 16.423076923076923,
"grad_norm": 2.6223599910736084,
"learning_rate": 2.7861623751713982e-05,
"loss": 0.053,
"step": 4270
},
{
"epoch": 16.46153846153846,
"grad_norm": 2.6188464164733887,
"learning_rate": 2.7848843505809317e-05,
"loss": 0.0559,
"step": 4280
},
{
"epoch": 16.5,
"grad_norm": 2.1133105754852295,
"learning_rate": 2.7836028131657142e-05,
"loss": 0.0524,
"step": 4290
},
{
"epoch": 16.53846153846154,
"grad_norm": 2.4061028957366943,
"learning_rate": 2.7823177664294197e-05,
"loss": 0.0545,
"step": 4300
},
{
"epoch": 16.576923076923077,
"grad_norm": 2.635202169418335,
"learning_rate": 2.7810292138853168e-05,
"loss": 0.0526,
"step": 4310
},
{
"epoch": 16.615384615384617,
"grad_norm": 2.4512650966644287,
"learning_rate": 2.779737159056259e-05,
"loss": 0.0527,
"step": 4320
},
{
"epoch": 16.653846153846153,
"grad_norm": 2.650878429412842,
"learning_rate": 2.7784416054746753e-05,
"loss": 0.0518,
"step": 4330
},
{
"epoch": 16.692307692307693,
"grad_norm": 2.4242632389068604,
"learning_rate": 2.7771425566825593e-05,
"loss": 0.0527,
"step": 4340
},
{
"epoch": 16.73076923076923,
"grad_norm": 2.464691162109375,
"learning_rate": 2.7758400162314605e-05,
"loss": 0.0547,
"step": 4350
},
{
"epoch": 16.76923076923077,
"grad_norm": 2.6380252838134766,
"learning_rate": 2.7745339876824756e-05,
"loss": 0.0503,
"step": 4360
},
{
"epoch": 16.807692307692307,
"grad_norm": 2.551723003387451,
"learning_rate": 2.7732244746062363e-05,
"loss": 0.0537,
"step": 4370
},
{
"epoch": 16.846153846153847,
"grad_norm": 2.8268020153045654,
"learning_rate": 2.7719114805829015e-05,
"loss": 0.0516,
"step": 4380
},
{
"epoch": 16.884615384615383,
"grad_norm": 2.501936435699463,
"learning_rate": 2.7705950092021465e-05,
"loss": 0.0583,
"step": 4390
},
{
"epoch": 16.923076923076923,
"grad_norm": 2.710240602493286,
"learning_rate": 2.7692750640631533e-05,
"loss": 0.0658,
"step": 4400
},
{
"epoch": 16.96153846153846,
"grad_norm": 2.378331422805786,
"learning_rate": 2.767951648774603e-05,
"loss": 0.0548,
"step": 4410
},
{
"epoch": 17.0,
"grad_norm": 2.894601345062256,
"learning_rate": 2.766624766954661e-05,
"loss": 0.0567,
"step": 4420
},
{
"epoch": 17.03846153846154,
"grad_norm": 2.357337236404419,
"learning_rate": 2.7652944222309727e-05,
"loss": 0.0538,
"step": 4430
},
{
"epoch": 17.076923076923077,
"grad_norm": 2.6987788677215576,
"learning_rate": 2.7639606182406484e-05,
"loss": 0.0617,
"step": 4440
},
{
"epoch": 17.115384615384617,
"grad_norm": 1.861840009689331,
"learning_rate": 2.7626233586302583e-05,
"loss": 0.0487,
"step": 4450
},
{
"epoch": 17.153846153846153,
"grad_norm": 1.9388928413391113,
"learning_rate": 2.7612826470558192e-05,
"loss": 0.0568,
"step": 4460
},
{
"epoch": 17.192307692307693,
"grad_norm": 2.5828299522399902,
"learning_rate": 2.7599384871827846e-05,
"loss": 0.0535,
"step": 4470
},
{
"epoch": 17.23076923076923,
"grad_norm": 2.6794981956481934,
"learning_rate": 2.7585908826860368e-05,
"loss": 0.0528,
"step": 4480
},
{
"epoch": 17.26923076923077,
"grad_norm": 2.3347725868225098,
"learning_rate": 2.757239837249875e-05,
"loss": 0.0551,
"step": 4490
},
{
"epoch": 17.307692307692307,
"grad_norm": 3.0247833728790283,
"learning_rate": 2.7558853545680057e-05,
"loss": 0.0524,
"step": 4500
},
{
"epoch": 17.346153846153847,
"grad_norm": 2.5298354625701904,
"learning_rate": 2.754527438343533e-05,
"loss": 0.0624,
"step": 4510
},
{
"epoch": 17.384615384615383,
"grad_norm": 2.508488893508911,
"learning_rate": 2.7531660922889477e-05,
"loss": 0.0548,
"step": 4520
},
{
"epoch": 17.423076923076923,
"grad_norm": 2.869568347930908,
"learning_rate": 2.751801320126118e-05,
"loss": 0.0567,
"step": 4530
},
{
"epoch": 17.46153846153846,
"grad_norm": 3.067972421646118,
"learning_rate": 2.750433125586279e-05,
"loss": 0.0527,
"step": 4540
},
{
"epoch": 17.5,
"grad_norm": 2.691254138946533,
"learning_rate": 2.7490615124100225e-05,
"loss": 0.049,
"step": 4550
},
{
"epoch": 17.53846153846154,
"grad_norm": 2.420832395553589,
"learning_rate": 2.747686484347286e-05,
"loss": 0.0488,
"step": 4560
},
{
"epoch": 17.576923076923077,
"grad_norm": 2.2831807136535645,
"learning_rate": 2.7463080451573447e-05,
"loss": 0.0547,
"step": 4570
},
{
"epoch": 17.615384615384617,
"grad_norm": 2.23234486579895,
"learning_rate": 2.744926198608798e-05,
"loss": 0.0521,
"step": 4580
},
{
"epoch": 17.653846153846153,
"grad_norm": 2.116222620010376,
"learning_rate": 2.743540948479561e-05,
"loss": 0.0526,
"step": 4590
},
{
"epoch": 17.692307692307693,
"grad_norm": 2.2791247367858887,
"learning_rate": 2.7421522985568562e-05,
"loss": 0.054,
"step": 4600
},
{
"epoch": 17.73076923076923,
"grad_norm": 2.1204259395599365,
"learning_rate": 2.7407602526371983e-05,
"loss": 0.0544,
"step": 4610
},
{
"epoch": 17.76923076923077,
"grad_norm": 2.725435733795166,
"learning_rate": 2.7393648145263873e-05,
"loss": 0.0545,
"step": 4620
},
{
"epoch": 17.807692307692307,
"grad_norm": 2.7043561935424805,
"learning_rate": 2.7379659880394996e-05,
"loss": 0.0554,
"step": 4630
},
{
"epoch": 17.846153846153847,
"grad_norm": 3.0216851234436035,
"learning_rate": 2.7365637770008717e-05,
"loss": 0.0551,
"step": 4640
},
{
"epoch": 17.884615384615383,
"grad_norm": 2.6223690509796143,
"learning_rate": 2.7351581852440953e-05,
"loss": 0.0545,
"step": 4650
},
{
"epoch": 17.923076923076923,
"grad_norm": 2.883230209350586,
"learning_rate": 2.7337492166120053e-05,
"loss": 0.0632,
"step": 4660
},
{
"epoch": 17.96153846153846,
"grad_norm": 2.9789953231811523,
"learning_rate": 2.732336874956667e-05,
"loss": 0.0558,
"step": 4670
},
{
"epoch": 18.0,
"grad_norm": 2.189687967300415,
"learning_rate": 2.7309211641393696e-05,
"loss": 0.0439,
"step": 4680
},
{
"epoch": 18.03846153846154,
"grad_norm": 2.24808931350708,
"learning_rate": 2.7295020880306123e-05,
"loss": 0.0567,
"step": 4690
},
{
"epoch": 18.076923076923077,
"grad_norm": 2.401685953140259,
"learning_rate": 2.7280796505100946e-05,
"loss": 0.0578,
"step": 4700
},
{
"epoch": 18.115384615384617,
"grad_norm": 1.9197814464569092,
"learning_rate": 2.7266538554667065e-05,
"loss": 0.0525,
"step": 4710
},
{
"epoch": 18.153846153846153,
"grad_norm": 2.022991180419922,
"learning_rate": 2.725224706798517e-05,
"loss": 0.0567,
"step": 4720
},
{
"epoch": 18.192307692307693,
"grad_norm": 2.132225513458252,
"learning_rate": 2.7237922084127643e-05,
"loss": 0.0524,
"step": 4730
},
{
"epoch": 18.23076923076923,
"grad_norm": 2.158095598220825,
"learning_rate": 2.7223563642258446e-05,
"loss": 0.0496,
"step": 4740
},
{
"epoch": 18.26923076923077,
"grad_norm": 2.268658399581909,
"learning_rate": 2.7209171781633e-05,
"loss": 0.0536,
"step": 4750
},
{
"epoch": 18.307692307692307,
"grad_norm": 2.788698673248291,
"learning_rate": 2.7194746541598113e-05,
"loss": 0.0527,
"step": 4760
},
{
"epoch": 18.346153846153847,
"grad_norm": 2.46181321144104,
"learning_rate": 2.7180287961591835e-05,
"loss": 0.0547,
"step": 4770
},
{
"epoch": 18.384615384615383,
"grad_norm": 2.1883716583251953,
"learning_rate": 2.7165796081143377e-05,
"loss": 0.0554,
"step": 4780
},
{
"epoch": 18.423076923076923,
"grad_norm": 2.5119552612304688,
"learning_rate": 2.715127093987298e-05,
"loss": 0.0502,
"step": 4790
},
{
"epoch": 18.46153846153846,
"grad_norm": 2.3125741481781006,
"learning_rate": 2.713671257749183e-05,
"loss": 0.0521,
"step": 4800
},
{
"epoch": 18.5,
"grad_norm": 2.01935076713562,
"learning_rate": 2.712212103380193e-05,
"loss": 0.049,
"step": 4810
},
{
"epoch": 18.53846153846154,
"grad_norm": 2.192258834838867,
"learning_rate": 2.7107496348696004e-05,
"loss": 0.0468,
"step": 4820
},
{
"epoch": 18.576923076923077,
"grad_norm": 2.0872395038604736,
"learning_rate": 2.7092838562157386e-05,
"loss": 0.0608,
"step": 4830
},
{
"epoch": 18.615384615384617,
"grad_norm": 2.0910723209381104,
"learning_rate": 2.7078147714259905e-05,
"loss": 0.0597,
"step": 4840
},
{
"epoch": 18.653846153846153,
"grad_norm": 2.1819539070129395,
"learning_rate": 2.7063423845167773e-05,
"loss": 0.0514,
"step": 4850
},
{
"epoch": 18.692307692307693,
"grad_norm": 2.541072368621826,
"learning_rate": 2.7048666995135494e-05,
"loss": 0.0518,
"step": 4860
},
{
"epoch": 18.73076923076923,
"grad_norm": 2.253460168838501,
"learning_rate": 2.7033877204507722e-05,
"loss": 0.0519,
"step": 4870
},
{
"epoch": 18.76923076923077,
"grad_norm": 2.454606294631958,
"learning_rate": 2.701905451371919e-05,
"loss": 0.0559,
"step": 4880
},
{
"epoch": 18.807692307692307,
"grad_norm": 2.596813917160034,
"learning_rate": 2.7004198963294558e-05,
"loss": 0.0557,
"step": 4890
},
{
"epoch": 18.846153846153847,
"grad_norm": 2.3953702449798584,
"learning_rate": 2.6989310593848345e-05,
"loss": 0.0553,
"step": 4900
},
{
"epoch": 18.884615384615383,
"grad_norm": 2.1888365745544434,
"learning_rate": 2.6974389446084776e-05,
"loss": 0.0488,
"step": 4910
},
{
"epoch": 18.923076923076923,
"grad_norm": 2.3826677799224854,
"learning_rate": 2.6959435560797706e-05,
"loss": 0.0495,
"step": 4920
},
{
"epoch": 18.96153846153846,
"grad_norm": 2.1377201080322266,
"learning_rate": 2.6944448978870478e-05,
"loss": 0.0566,
"step": 4930
},
{
"epoch": 19.0,
"grad_norm": 2.3483967781066895,
"learning_rate": 2.6929429741275845e-05,
"loss": 0.0488,
"step": 4940
},
{
"epoch": 19.03846153846154,
"grad_norm": 1.9696838855743408,
"learning_rate": 2.691437788907582e-05,
"loss": 0.051,
"step": 4950
},
{
"epoch": 19.076923076923077,
"grad_norm": 2.692110776901245,
"learning_rate": 2.689929346342159e-05,
"loss": 0.0461,
"step": 4960
},
{
"epoch": 19.115384615384617,
"grad_norm": 2.2084898948669434,
"learning_rate": 2.688417650555341e-05,
"loss": 0.0599,
"step": 4970
},
{
"epoch": 19.153846153846153,
"grad_norm": 2.121561288833618,
"learning_rate": 2.686902705680046e-05,
"loss": 0.0578,
"step": 4980
},
{
"epoch": 19.192307692307693,
"grad_norm": 2.3033506870269775,
"learning_rate": 2.6853845158580756e-05,
"loss": 0.0519,
"step": 4990
},
{
"epoch": 19.23076923076923,
"grad_norm": 2.0581743717193604,
"learning_rate": 2.6838630852401028e-05,
"loss": 0.0538,
"step": 5000
},
{
"epoch": 19.26923076923077,
"grad_norm": 2.1499931812286377,
"learning_rate": 2.6823384179856602e-05,
"loss": 0.0515,
"step": 5010
},
{
"epoch": 19.307692307692307,
"grad_norm": 2.4993667602539062,
"learning_rate": 2.6808105182631303e-05,
"loss": 0.0537,
"step": 5020
},
{
"epoch": 19.346153846153847,
"grad_norm": 2.7367970943450928,
"learning_rate": 2.6792793902497328e-05,
"loss": 0.0507,
"step": 5030
},
{
"epoch": 19.384615384615383,
"grad_norm": 2.3506581783294678,
"learning_rate": 2.6777450381315133e-05,
"loss": 0.0501,
"step": 5040
},
{
"epoch": 19.423076923076923,
"grad_norm": 1.9951107501983643,
"learning_rate": 2.676207466103331e-05,
"loss": 0.0519,
"step": 5050
},
{
"epoch": 19.46153846153846,
"grad_norm": 2.0051987171173096,
"learning_rate": 2.6746666783688503e-05,
"loss": 0.0505,
"step": 5060
},
{
"epoch": 19.5,
"grad_norm": 1.8392553329467773,
"learning_rate": 2.673122679140525e-05,
"loss": 0.0421,
"step": 5070
},
{
"epoch": 19.53846153846154,
"grad_norm": 1.9367773532867432,
"learning_rate": 2.671575472639591e-05,
"loss": 0.0514,
"step": 5080
},
{
"epoch": 19.576923076923077,
"grad_norm": 2.1854703426361084,
"learning_rate": 2.6700250630960506e-05,
"loss": 0.0535,
"step": 5090
},
{
"epoch": 19.615384615384617,
"grad_norm": 2.0213778018951416,
"learning_rate": 2.6684714547486654e-05,
"loss": 0.0546,
"step": 5100
},
{
"epoch": 19.653846153846153,
"grad_norm": 2.09897780418396,
"learning_rate": 2.6669146518449407e-05,
"loss": 0.0477,
"step": 5110
},
{
"epoch": 19.692307692307693,
"grad_norm": 2.4380099773406982,
"learning_rate": 2.665354658641117e-05,
"loss": 0.0611,
"step": 5120
},
{
"epoch": 19.73076923076923,
"grad_norm": 2.0932047367095947,
"learning_rate": 2.6637914794021552e-05,
"loss": 0.0462,
"step": 5130
},
{
"epoch": 19.76923076923077,
"grad_norm": 2.186757802963257,
"learning_rate": 2.6622251184017274e-05,
"loss": 0.0487,
"step": 5140
},
{
"epoch": 19.807692307692307,
"grad_norm": 1.9822323322296143,
"learning_rate": 2.660655579922206e-05,
"loss": 0.0498,
"step": 5150
},
{
"epoch": 19.846153846153847,
"grad_norm": 2.4814295768737793,
"learning_rate": 2.6590828682546487e-05,
"loss": 0.0541,
"step": 5160
},
{
"epoch": 19.884615384615383,
"grad_norm": 2.3453330993652344,
"learning_rate": 2.657506987698789e-05,
"loss": 0.0403,
"step": 5170
},
{
"epoch": 19.923076923076923,
"grad_norm": 2.3614869117736816,
"learning_rate": 2.655927942563024e-05,
"loss": 0.0491,
"step": 5180
},
{
"epoch": 19.96153846153846,
"grad_norm": 2.508418083190918,
"learning_rate": 2.6543457371644027e-05,
"loss": 0.0506,
"step": 5190
},
{
"epoch": 20.0,
"grad_norm": 1.809973955154419,
"learning_rate": 2.652760375828615e-05,
"loss": 0.0553,
"step": 5200
},
{
"epoch": 20.03846153846154,
"grad_norm": 2.3840510845184326,
"learning_rate": 2.651171862889978e-05,
"loss": 0.0511,
"step": 5210
},
{
"epoch": 20.076923076923077,
"grad_norm": 2.154362440109253,
"learning_rate": 2.649580202691425e-05,
"loss": 0.0522,
"step": 5220
},
{
"epoch": 20.115384615384617,
"grad_norm": 2.4772872924804688,
"learning_rate": 2.6479853995844942e-05,
"loss": 0.0524,
"step": 5230
},
{
"epoch": 20.153846153846153,
"grad_norm": 2.3529508113861084,
"learning_rate": 2.646387457929317e-05,
"loss": 0.0494,
"step": 5240
},
{
"epoch": 20.192307692307693,
"grad_norm": 2.08630633354187,
"learning_rate": 2.6447863820946047e-05,
"loss": 0.0462,
"step": 5250
},
{
"epoch": 20.23076923076923,
"grad_norm": 1.6905722618103027,
"learning_rate": 2.6431821764576367e-05,
"loss": 0.0512,
"step": 5260
},
{
"epoch": 20.26923076923077,
"grad_norm": 1.8629543781280518,
"learning_rate": 2.641574845404251e-05,
"loss": 0.0502,
"step": 5270
},
{
"epoch": 20.307692307692307,
"grad_norm": 2.0543575286865234,
"learning_rate": 2.639964393328829e-05,
"loss": 0.0545,
"step": 5280
},
{
"epoch": 20.346153846153847,
"grad_norm": 2.485801935195923,
"learning_rate": 2.6383508246342844e-05,
"loss": 0.0514,
"step": 5290
},
{
"epoch": 20.384615384615383,
"grad_norm": 1.7313271760940552,
"learning_rate": 2.636734143732054e-05,
"loss": 0.0488,
"step": 5300
},
{
"epoch": 20.423076923076923,
"grad_norm": 1.8892971277236938,
"learning_rate": 2.63511435504208e-05,
"loss": 0.05,
"step": 5310
},
{
"epoch": 20.46153846153846,
"grad_norm": 1.8958619832992554,
"learning_rate": 2.633491462992804e-05,
"loss": 0.0509,
"step": 5320
},
{
"epoch": 20.5,
"grad_norm": 2.0657594203948975,
"learning_rate": 2.63186547202115e-05,
"loss": 0.0457,
"step": 5330
},
{
"epoch": 20.53846153846154,
"grad_norm": 2.0821566581726074,
"learning_rate": 2.6302363865725158e-05,
"loss": 0.0515,
"step": 5340
},
{
"epoch": 20.576923076923077,
"grad_norm": 1.938544750213623,
"learning_rate": 2.628604211100759e-05,
"loss": 0.0505,
"step": 5350
},
{
"epoch": 20.615384615384617,
"grad_norm": 2.1884799003601074,
"learning_rate": 2.6269689500681846e-05,
"loss": 0.0514,
"step": 5360
},
{
"epoch": 20.653846153846153,
"grad_norm": 2.3804664611816406,
"learning_rate": 2.6253306079455337e-05,
"loss": 0.0541,
"step": 5370
},
{
"epoch": 20.692307692307693,
"grad_norm": 2.18882155418396,
"learning_rate": 2.6236891892119713e-05,
"loss": 0.0492,
"step": 5380
},
{
"epoch": 20.73076923076923,
"grad_norm": 2.1746103763580322,
"learning_rate": 2.6220446983550738e-05,
"loss": 0.0487,
"step": 5390
},
{
"epoch": 20.76923076923077,
"grad_norm": 1.9990772008895874,
"learning_rate": 2.6203971398708162e-05,
"loss": 0.0492,
"step": 5400
},
{
"epoch": 20.807692307692307,
"grad_norm": 2.243596315383911,
"learning_rate": 2.6187465182635598e-05,
"loss": 0.0521,
"step": 5410
},
{
"epoch": 20.846153846153847,
"grad_norm": 2.5800552368164062,
"learning_rate": 2.6170928380460424e-05,
"loss": 0.0468,
"step": 5420
},
{
"epoch": 20.884615384615383,
"grad_norm": 2.2908005714416504,
"learning_rate": 2.615436103739362e-05,
"loss": 0.051,
"step": 5430
},
{
"epoch": 20.923076923076923,
"grad_norm": 2.4502663612365723,
"learning_rate": 2.6137763198729665e-05,
"loss": 0.0476,
"step": 5440
},
{
"epoch": 20.96153846153846,
"grad_norm": 2.216867208480835,
"learning_rate": 2.6121134909846416e-05,
"loss": 0.0553,
"step": 5450
},
{
"epoch": 21.0,
"grad_norm": 2.4995601177215576,
"learning_rate": 2.6104476216204985e-05,
"loss": 0.0453,
"step": 5460
},
{
"epoch": 21.03846153846154,
"grad_norm": 1.9124618768692017,
"learning_rate": 2.6087787163349605e-05,
"loss": 0.0479,
"step": 5470
},
{
"epoch": 21.076923076923077,
"grad_norm": 1.9513589143753052,
"learning_rate": 2.60710677969075e-05,
"loss": 0.0476,
"step": 5480
},
{
"epoch": 21.115384615384617,
"grad_norm": 1.7153360843658447,
"learning_rate": 2.6054318162588792e-05,
"loss": 0.044,
"step": 5490
},
{
"epoch": 21.153846153846153,
"grad_norm": 2.320129871368408,
"learning_rate": 2.6037538306186337e-05,
"loss": 0.0456,
"step": 5500
},
{
"epoch": 21.192307692307693,
"grad_norm": 1.6431825160980225,
"learning_rate": 2.602072827357562e-05,
"loss": 0.0483,
"step": 5510
},
{
"epoch": 21.23076923076923,
"grad_norm": 2.124098539352417,
"learning_rate": 2.6003888110714624e-05,
"loss": 0.0402,
"step": 5520
},
{
"epoch": 21.26923076923077,
"grad_norm": 2.669579267501831,
"learning_rate": 2.5987017863643714e-05,
"loss": 0.0547,
"step": 5530
},
{
"epoch": 21.307692307692307,
"grad_norm": 2.1754443645477295,
"learning_rate": 2.5970117578485506e-05,
"loss": 0.0481,
"step": 5540
},
{
"epoch": 21.346153846153847,
"grad_norm": 2.0452888011932373,
"learning_rate": 2.5953187301444733e-05,
"loss": 0.0436,
"step": 5550
},
{
"epoch": 21.384615384615383,
"grad_norm": 2.2750635147094727,
"learning_rate": 2.5936227078808123e-05,
"loss": 0.0498,
"step": 5560
},
{
"epoch": 21.423076923076923,
"grad_norm": 1.7035436630249023,
"learning_rate": 2.5919236956944277e-05,
"loss": 0.0455,
"step": 5570
},
{
"epoch": 21.46153846153846,
"grad_norm": 2.014969825744629,
"learning_rate": 2.5902216982303544e-05,
"loss": 0.0502,
"step": 5580
},
{
"epoch": 21.5,
"grad_norm": 2.2059478759765625,
"learning_rate": 2.588516720141788e-05,
"loss": 0.0466,
"step": 5590
},
{
"epoch": 21.53846153846154,
"grad_norm": 1.8128308057785034,
"learning_rate": 2.5868087660900735e-05,
"loss": 0.0494,
"step": 5600
},
{
"epoch": 21.576923076923077,
"grad_norm": 2.221346139907837,
"learning_rate": 2.5850978407446924e-05,
"loss": 0.0532,
"step": 5610
},
{
"epoch": 21.615384615384617,
"grad_norm": 1.9685431718826294,
"learning_rate": 2.5833839487832488e-05,
"loss": 0.0506,
"step": 5620
},
{
"epoch": 21.653846153846153,
"grad_norm": 1.538164734840393,
"learning_rate": 2.5816670948914583e-05,
"loss": 0.0446,
"step": 5630
},
{
"epoch": 21.692307692307693,
"grad_norm": 1.8389371633529663,
"learning_rate": 2.5799472837631338e-05,
"loss": 0.053,
"step": 5640
},
{
"epoch": 21.73076923076923,
"grad_norm": 2.2778820991516113,
"learning_rate": 2.578224520100173e-05,
"loss": 0.0423,
"step": 5650
},
{
"epoch": 21.76923076923077,
"grad_norm": 2.5849103927612305,
"learning_rate": 2.576498808612546e-05,
"loss": 0.0489,
"step": 5660
},
{
"epoch": 21.807692307692307,
"grad_norm": 2.3590989112854004,
"learning_rate": 2.5747701540182825e-05,
"loss": 0.0514,
"step": 5670
},
{
"epoch": 21.846153846153847,
"grad_norm": 1.8261209726333618,
"learning_rate": 2.573038561043458e-05,
"loss": 0.0459,
"step": 5680
},
{
"epoch": 21.884615384615383,
"grad_norm": 2.492357015609741,
"learning_rate": 2.5713040344221815e-05,
"loss": 0.053,
"step": 5690
},
{
"epoch": 21.923076923076923,
"grad_norm": 2.3282649517059326,
"learning_rate": 2.5695665788965823e-05,
"loss": 0.0422,
"step": 5700
},
{
"epoch": 21.96153846153846,
"grad_norm": 2.3369557857513428,
"learning_rate": 2.5678261992167978e-05,
"loss": 0.0542,
"step": 5710
},
{
"epoch": 22.0,
"grad_norm": 2.2342748641967773,
"learning_rate": 2.5660829001409594e-05,
"loss": 0.0553,
"step": 5720
},
{
"epoch": 22.03846153846154,
"grad_norm": 1.9400802850723267,
"learning_rate": 2.5643366864351806e-05,
"loss": 0.0554,
"step": 5730
},
{
"epoch": 22.076923076923077,
"grad_norm": 2.1915154457092285,
"learning_rate": 2.5625875628735423e-05,
"loss": 0.0463,
"step": 5740
},
{
"epoch": 22.115384615384617,
"grad_norm": 1.7244971990585327,
"learning_rate": 2.560835534238082e-05,
"loss": 0.0466,
"step": 5750
},
{
"epoch": 22.153846153846153,
"grad_norm": 2.1806528568267822,
"learning_rate": 2.5590806053187793e-05,
"loss": 0.0483,
"step": 5760
},
{
"epoch": 22.192307692307693,
"grad_norm": 1.9255647659301758,
"learning_rate": 2.557322780913542e-05,
"loss": 0.0489,
"step": 5770
},
{
"epoch": 22.23076923076923,
"grad_norm": 2.2142374515533447,
"learning_rate": 2.555562065828196e-05,
"loss": 0.0527,
"step": 5780
},
{
"epoch": 22.26923076923077,
"grad_norm": 2.4573400020599365,
"learning_rate": 2.5537984648764684e-05,
"loss": 0.0451,
"step": 5790
},
{
"epoch": 22.307692307692307,
"grad_norm": 2.1143500804901123,
"learning_rate": 2.5520319828799766e-05,
"loss": 0.0479,
"step": 5800
},
{
"epoch": 22.346153846153847,
"grad_norm": 1.8646248579025269,
"learning_rate": 2.550262624668216e-05,
"loss": 0.0435,
"step": 5810
},
{
"epoch": 22.384615384615383,
"grad_norm": 1.5074269771575928,
"learning_rate": 2.5484903950785432e-05,
"loss": 0.0446,
"step": 5820
},
{
"epoch": 22.423076923076923,
"grad_norm": 2.103839874267578,
"learning_rate": 2.546715298956167e-05,
"loss": 0.0527,
"step": 5830
},
{
"epoch": 22.46153846153846,
"grad_norm": 1.9880903959274292,
"learning_rate": 2.5449373411541322e-05,
"loss": 0.0492,
"step": 5840
},
{
"epoch": 22.5,
"grad_norm": 1.8882006406784058,
"learning_rate": 2.5431565265333074e-05,
"loss": 0.0552,
"step": 5850
},
{
"epoch": 22.53846153846154,
"grad_norm": 2.3421194553375244,
"learning_rate": 2.541372859962372e-05,
"loss": 0.0487,
"step": 5860
},
{
"epoch": 22.576923076923077,
"grad_norm": 2.0303971767425537,
"learning_rate": 2.5395863463178023e-05,
"loss": 0.0467,
"step": 5870
},
{
"epoch": 22.615384615384617,
"grad_norm": 1.9958126544952393,
"learning_rate": 2.537796990483858e-05,
"loss": 0.0593,
"step": 5880
},
{
"epoch": 22.653846153846153,
"grad_norm": 2.40570068359375,
"learning_rate": 2.53600479735257e-05,
"loss": 0.046,
"step": 5890
},
{
"epoch": 22.692307692307693,
"grad_norm": 1.7980605363845825,
"learning_rate": 2.5342097718237262e-05,
"loss": 0.0496,
"step": 5900
},
{
"epoch": 22.73076923076923,
"grad_norm": 2.364152431488037,
"learning_rate": 2.5324119188048567e-05,
"loss": 0.0485,
"step": 5910
},
{
"epoch": 22.76923076923077,
"grad_norm": 2.065056085586548,
"learning_rate": 2.530611243211224e-05,
"loss": 0.0483,
"step": 5920
},
{
"epoch": 22.807692307692307,
"grad_norm": 1.8822139501571655,
"learning_rate": 2.5288077499658064e-05,
"loss": 0.0445,
"step": 5930
},
{
"epoch": 22.846153846153847,
"grad_norm": 1.9192858934402466,
"learning_rate": 2.527001443999285e-05,
"loss": 0.0462,
"step": 5940
},
{
"epoch": 22.884615384615383,
"grad_norm": 2.175851821899414,
"learning_rate": 2.5251923302500318e-05,
"loss": 0.0448,
"step": 5950
},
{
"epoch": 22.923076923076923,
"grad_norm": 2.1367673873901367,
"learning_rate": 2.523380413664095e-05,
"loss": 0.0461,
"step": 5960
},
{
"epoch": 22.96153846153846,
"grad_norm": 2.0076870918273926,
"learning_rate": 2.5215656991951844e-05,
"loss": 0.0429,
"step": 5970
},
{
"epoch": 23.0,
"grad_norm": 2.2983851432800293,
"learning_rate": 2.5197481918046606e-05,
"loss": 0.0473,
"step": 5980
},
{
"epoch": 23.03846153846154,
"grad_norm": 1.547244906425476,
"learning_rate": 2.5179278964615192e-05,
"loss": 0.0441,
"step": 5990
},
{
"epoch": 23.076923076923077,
"grad_norm": 1.8851597309112549,
"learning_rate": 2.516104818142379e-05,
"loss": 0.0494,
"step": 6000
},
{
"epoch": 23.115384615384617,
"grad_norm": 1.918543815612793,
"learning_rate": 2.5142789618314654e-05,
"loss": 0.0417,
"step": 6010
},
{
"epoch": 23.153846153846153,
"grad_norm": 1.7837193012237549,
"learning_rate": 2.5124503325206006e-05,
"loss": 0.0469,
"step": 6020
},
{
"epoch": 23.192307692307693,
"grad_norm": 1.9671348333358765,
"learning_rate": 2.5106189352091867e-05,
"loss": 0.0429,
"step": 6030
},
{
"epoch": 23.23076923076923,
"grad_norm": 1.8569093942642212,
"learning_rate": 2.5087847749041944e-05,
"loss": 0.0498,
"step": 6040
},
{
"epoch": 23.26923076923077,
"grad_norm": 2.0214736461639404,
"learning_rate": 2.506947856620148e-05,
"loss": 0.0395,
"step": 6050
},
{
"epoch": 23.307692307692307,
"grad_norm": 1.9467337131500244,
"learning_rate": 2.505108185379111e-05,
"loss": 0.046,
"step": 6060
},
{
"epoch": 23.346153846153847,
"grad_norm": 1.8502717018127441,
"learning_rate": 2.503265766210676e-05,
"loss": 0.044,
"step": 6070
},
{
"epoch": 23.384615384615383,
"grad_norm": 2.310283660888672,
"learning_rate": 2.5014206041519456e-05,
"loss": 0.0521,
"step": 6080
},
{
"epoch": 23.423076923076923,
"grad_norm": 1.6830577850341797,
"learning_rate": 2.499572704247523e-05,
"loss": 0.0458,
"step": 6090
},
{
"epoch": 23.46153846153846,
"grad_norm": 1.6736561059951782,
"learning_rate": 2.497722071549495e-05,
"loss": 0.0422,
"step": 6100
},
{
"epoch": 23.5,
"grad_norm": 1.5868667364120483,
"learning_rate": 2.4958687111174216e-05,
"loss": 0.0432,
"step": 6110
},
{
"epoch": 23.53846153846154,
"grad_norm": 2.2007408142089844,
"learning_rate": 2.494012628018319e-05,
"loss": 0.0495,
"step": 6120
},
{
"epoch": 23.576923076923077,
"grad_norm": 1.7047786712646484,
"learning_rate": 2.4921538273266475e-05,
"loss": 0.0491,
"step": 6130
},
{
"epoch": 23.615384615384617,
"grad_norm": 2.0366289615631104,
"learning_rate": 2.490292314124298e-05,
"loss": 0.0413,
"step": 6140
},
{
"epoch": 23.653846153846153,
"grad_norm": 2.104093551635742,
"learning_rate": 2.4884280935005755e-05,
"loss": 0.0395,
"step": 6150
},
{
"epoch": 23.692307692307693,
"grad_norm": 2.0423264503479004,
"learning_rate": 2.486561170552188e-05,
"loss": 0.051,
"step": 6160
},
{
"epoch": 23.73076923076923,
"grad_norm": 2.2798447608947754,
"learning_rate": 2.4846915503832326e-05,
"loss": 0.0504,
"step": 6170
},
{
"epoch": 23.76923076923077,
"grad_norm": 2.168039083480835,
"learning_rate": 2.4828192381051787e-05,
"loss": 0.0494,
"step": 6180
},
{
"epoch": 23.807692307692307,
"grad_norm": 2.1671855449676514,
"learning_rate": 2.480944238836857e-05,
"loss": 0.0532,
"step": 6190
},
{
"epoch": 23.846153846153847,
"grad_norm": 1.7856274843215942,
"learning_rate": 2.4790665577044428e-05,
"loss": 0.046,
"step": 6200
},
{
"epoch": 23.884615384615383,
"grad_norm": 2.0235636234283447,
"learning_rate": 2.4771861998414458e-05,
"loss": 0.0465,
"step": 6210
},
{
"epoch": 23.923076923076923,
"grad_norm": 2.179847478866577,
"learning_rate": 2.475303170388692e-05,
"loss": 0.0447,
"step": 6220
},
{
"epoch": 23.96153846153846,
"grad_norm": 1.6464632749557495,
"learning_rate": 2.4734174744943122e-05,
"loss": 0.0449,
"step": 6230
},
{
"epoch": 24.0,
"grad_norm": 1.7687687873840332,
"learning_rate": 2.471529117313727e-05,
"loss": 0.0447,
"step": 6240
},
{
"epoch": 24.03846153846154,
"grad_norm": 1.8370416164398193,
"learning_rate": 2.4696381040096335e-05,
"loss": 0.0476,
"step": 6250
},
{
"epoch": 24.076923076923077,
"grad_norm": 1.8568719625473022,
"learning_rate": 2.4677444397519883e-05,
"loss": 0.0478,
"step": 6260
},
{
"epoch": 24.115384615384617,
"grad_norm": 1.9673206806182861,
"learning_rate": 2.4658481297179987e-05,
"loss": 0.0466,
"step": 6270
},
{
"epoch": 24.153846153846153,
"grad_norm": 2.064671516418457,
"learning_rate": 2.4639491790921028e-05,
"loss": 0.0482,
"step": 6280
},
{
"epoch": 24.192307692307693,
"grad_norm": 2.0179781913757324,
"learning_rate": 2.4620475930659596e-05,
"loss": 0.0491,
"step": 6290
},
{
"epoch": 24.23076923076923,
"grad_norm": 1.8786386251449585,
"learning_rate": 2.4601433768384327e-05,
"loss": 0.042,
"step": 6300
},
{
"epoch": 24.26923076923077,
"grad_norm": 2.0720064640045166,
"learning_rate": 2.4582365356155766e-05,
"loss": 0.0452,
"step": 6310
},
{
"epoch": 24.307692307692307,
"grad_norm": 2.04176926612854,
"learning_rate": 2.4563270746106224e-05,
"loss": 0.0432,
"step": 6320
},
{
"epoch": 24.346153846153847,
"grad_norm": 1.7975056171417236,
"learning_rate": 2.4544149990439632e-05,
"loss": 0.0431,
"step": 6330
},
{
"epoch": 24.384615384615383,
"grad_norm": 1.9778639078140259,
"learning_rate": 2.4525003141431413e-05,
"loss": 0.0431,
"step": 6340
},
{
"epoch": 24.423076923076923,
"grad_norm": 1.931883454322815,
"learning_rate": 2.450583025142831e-05,
"loss": 0.0411,
"step": 6350
},
{
"epoch": 24.46153846153846,
"grad_norm": 2.43259596824646,
"learning_rate": 2.4486631372848286e-05,
"loss": 0.0464,
"step": 6360
},
{
"epoch": 24.5,
"grad_norm": 2.0310606956481934,
"learning_rate": 2.4467406558180328e-05,
"loss": 0.0453,
"step": 6370
},
{
"epoch": 24.53846153846154,
"grad_norm": 1.961578130722046,
"learning_rate": 2.4448155859984357e-05,
"loss": 0.0425,
"step": 6380
},
{
"epoch": 24.576923076923077,
"grad_norm": 1.9241677522659302,
"learning_rate": 2.442887933089104e-05,
"loss": 0.0489,
"step": 6390
},
{
"epoch": 24.615384615384617,
"grad_norm": 1.6629705429077148,
"learning_rate": 2.440957702360167e-05,
"loss": 0.0449,
"step": 6400
},
{
"epoch": 24.653846153846153,
"grad_norm": 1.9020706415176392,
"learning_rate": 2.4390248990888026e-05,
"loss": 0.0468,
"step": 6410
},
{
"epoch": 24.692307692307693,
"grad_norm": 1.9919699430465698,
"learning_rate": 2.4370895285592202e-05,
"loss": 0.047,
"step": 6420
},
{
"epoch": 24.73076923076923,
"grad_norm": 1.7475947141647339,
"learning_rate": 2.43515159606265e-05,
"loss": 0.041,
"step": 6430
},
{
"epoch": 24.76923076923077,
"grad_norm": 1.8714221715927124,
"learning_rate": 2.4332111068973243e-05,
"loss": 0.045,
"step": 6440
},
{
"epoch": 24.807692307692307,
"grad_norm": 1.8813892602920532,
"learning_rate": 2.4312680663684674e-05,
"loss": 0.0416,
"step": 6450
},
{
"epoch": 24.846153846153847,
"grad_norm": 1.9131197929382324,
"learning_rate": 2.429322479788277e-05,
"loss": 0.0423,
"step": 6460
},
{
"epoch": 24.884615384615383,
"grad_norm": 1.9985467195510864,
"learning_rate": 2.4273743524759132e-05,
"loss": 0.0496,
"step": 6470
},
{
"epoch": 24.923076923076923,
"grad_norm": 2.1156630516052246,
"learning_rate": 2.4254236897574818e-05,
"loss": 0.0425,
"step": 6480
},
{
"epoch": 24.96153846153846,
"grad_norm": 2.0164928436279297,
"learning_rate": 2.4234704969660192e-05,
"loss": 0.0421,
"step": 6490
},
{
"epoch": 25.0,
"grad_norm": 1.9131778478622437,
"learning_rate": 2.4215147794414806e-05,
"loss": 0.0431,
"step": 6500
},
{
"epoch": 25.03846153846154,
"grad_norm": 2.0394511222839355,
"learning_rate": 2.419556542530723e-05,
"loss": 0.0462,
"step": 6510
},
{
"epoch": 25.076923076923077,
"grad_norm": 1.879845142364502,
"learning_rate": 2.4175957915874916e-05,
"loss": 0.0489,
"step": 6520
},
{
"epoch": 25.115384615384617,
"grad_norm": 1.9313360452651978,
"learning_rate": 2.4156325319724037e-05,
"loss": 0.0473,
"step": 6530
},
{
"epoch": 25.153846153846153,
"grad_norm": 1.9558639526367188,
"learning_rate": 2.4136667690529372e-05,
"loss": 0.0432,
"step": 6540
},
{
"epoch": 25.192307692307693,
"grad_norm": 1.932648777961731,
"learning_rate": 2.4116985082034126e-05,
"loss": 0.0441,
"step": 6550
},
{
"epoch": 25.23076923076923,
"grad_norm": 1.8341952562332153,
"learning_rate": 2.409727754804979e-05,
"loss": 0.0484,
"step": 6560
},
{
"epoch": 25.26923076923077,
"grad_norm": 2.142963171005249,
"learning_rate": 2.4077545142456025e-05,
"loss": 0.0491,
"step": 6570
},
{
"epoch": 25.307692307692307,
"grad_norm": 2.0177972316741943,
"learning_rate": 2.405778791920046e-05,
"loss": 0.0581,
"step": 6580
},
{
"epoch": 25.346153846153847,
"grad_norm": 2.0351908206939697,
"learning_rate": 2.4038005932298594e-05,
"loss": 0.0448,
"step": 6590
},
{
"epoch": 25.384615384615383,
"grad_norm": 1.8626478910446167,
"learning_rate": 2.4018199235833624e-05,
"loss": 0.0428,
"step": 6600
},
{
"epoch": 25.423076923076923,
"grad_norm": 1.7484854459762573,
"learning_rate": 2.3998367883956306e-05,
"loss": 0.0462,
"step": 6610
},
{
"epoch": 25.46153846153846,
"grad_norm": 1.729616403579712,
"learning_rate": 2.3978511930884795e-05,
"loss": 0.0421,
"step": 6620
},
{
"epoch": 25.5,
"grad_norm": 1.5235844850540161,
"learning_rate": 2.3958631430904504e-05,
"loss": 0.0489,
"step": 6630
},
{
"epoch": 25.53846153846154,
"grad_norm": 1.9413248300552368,
"learning_rate": 2.393872643836797e-05,
"loss": 0.0464,
"step": 6640
},
{
"epoch": 25.576923076923077,
"grad_norm": 1.6310465335845947,
"learning_rate": 2.3918797007694675e-05,
"loss": 0.0412,
"step": 6650
},
{
"epoch": 25.615384615384617,
"grad_norm": 2.049593925476074,
"learning_rate": 2.3898843193370923e-05,
"loss": 0.0451,
"step": 6660
},
{
"epoch": 25.653846153846153,
"grad_norm": 1.554084062576294,
"learning_rate": 2.387886504994969e-05,
"loss": 0.0434,
"step": 6670
},
{
"epoch": 25.692307692307693,
"grad_norm": 2.2045257091522217,
"learning_rate": 2.385886263205044e-05,
"loss": 0.0458,
"step": 6680
},
{
"epoch": 25.73076923076923,
"grad_norm": 2.1516737937927246,
"learning_rate": 2.3838835994359036e-05,
"loss": 0.0445,
"step": 6690
},
{
"epoch": 25.76923076923077,
"grad_norm": 1.9684396982192993,
"learning_rate": 2.3818785191627525e-05,
"loss": 0.0442,
"step": 6700
},
{
"epoch": 25.807692307692307,
"grad_norm": 1.7377302646636963,
"learning_rate": 2.379871027867405e-05,
"loss": 0.0443,
"step": 6710
},
{
"epoch": 25.846153846153847,
"grad_norm": 1.860579490661621,
"learning_rate": 2.3778611310382653e-05,
"loss": 0.043,
"step": 6720
},
{
"epoch": 25.884615384615383,
"grad_norm": 2.0663163661956787,
"learning_rate": 2.3758488341703137e-05,
"loss": 0.0444,
"step": 6730
},
{
"epoch": 25.923076923076923,
"grad_norm": 1.8501702547073364,
"learning_rate": 2.3738341427650945e-05,
"loss": 0.0497,
"step": 6740
},
{
"epoch": 25.96153846153846,
"grad_norm": 1.7028001546859741,
"learning_rate": 2.3718170623306955e-05,
"loss": 0.0418,
"step": 6750
},
{
"epoch": 26.0,
"grad_norm": 1.7601794004440308,
"learning_rate": 2.369797598381739e-05,
"loss": 0.0413,
"step": 6760
},
{
"epoch": 26.03846153846154,
"grad_norm": 1.8938238620758057,
"learning_rate": 2.3677757564393612e-05,
"loss": 0.0397,
"step": 6770
},
{
"epoch": 26.076923076923077,
"grad_norm": 2.0491912364959717,
"learning_rate": 2.3657515420312015e-05,
"loss": 0.0414,
"step": 6780
},
{
"epoch": 26.115384615384617,
"grad_norm": 2.1569526195526123,
"learning_rate": 2.3637249606913847e-05,
"loss": 0.045,
"step": 6790
},
{
"epoch": 26.153846153846153,
"grad_norm": 1.8800678253173828,
"learning_rate": 2.3616960179605064e-05,
"loss": 0.0532,
"step": 6800
},
{
"epoch": 26.192307692307693,
"grad_norm": 1.7340847253799438,
"learning_rate": 2.3596647193856188e-05,
"loss": 0.0409,
"step": 6810
},
{
"epoch": 26.23076923076923,
"grad_norm": 2.0529165267944336,
"learning_rate": 2.3576310705202143e-05,
"loss": 0.0409,
"step": 6820
},
{
"epoch": 26.26923076923077,
"grad_norm": 1.351607322692871,
"learning_rate": 2.3555950769242122e-05,
"loss": 0.0378,
"step": 6830
},
{
"epoch": 26.307692307692307,
"grad_norm": 1.7300976514816284,
"learning_rate": 2.3535567441639396e-05,
"loss": 0.0416,
"step": 6840
},
{
"epoch": 26.346153846153847,
"grad_norm": 1.606675148010254,
"learning_rate": 2.351516077812122e-05,
"loss": 0.0385,
"step": 6850
},
{
"epoch": 26.384615384615383,
"grad_norm": 2.1588430404663086,
"learning_rate": 2.349473083447863e-05,
"loss": 0.0479,
"step": 6860
},
{
"epoch": 26.423076923076923,
"grad_norm": 1.7339332103729248,
"learning_rate": 2.3474277666566307e-05,
"loss": 0.0499,
"step": 6870
},
{
"epoch": 26.46153846153846,
"grad_norm": 1.6982123851776123,
"learning_rate": 2.345380133030243e-05,
"loss": 0.0507,
"step": 6880
},
{
"epoch": 26.5,
"grad_norm": 1.991782307624817,
"learning_rate": 2.343330188166853e-05,
"loss": 0.0455,
"step": 6890
},
{
"epoch": 26.53846153846154,
"grad_norm": 1.6237735748291016,
"learning_rate": 2.3412779376709304e-05,
"loss": 0.043,
"step": 6900
},
{
"epoch": 26.576923076923077,
"grad_norm": 1.8890820741653442,
"learning_rate": 2.3392233871532504e-05,
"loss": 0.0465,
"step": 6910
},
{
"epoch": 26.615384615384617,
"grad_norm": 1.9966788291931152,
"learning_rate": 2.337166542230876e-05,
"loss": 0.0509,
"step": 6920
},
{
"epoch": 26.653846153846153,
"grad_norm": 2.028984546661377,
"learning_rate": 2.335107408527142e-05,
"loss": 0.0391,
"step": 6930
},
{
"epoch": 26.692307692307693,
"grad_norm": 1.8344041109085083,
"learning_rate": 2.3330459916716417e-05,
"loss": 0.0387,
"step": 6940
},
{
"epoch": 26.73076923076923,
"grad_norm": 1.8030184507369995,
"learning_rate": 2.3309822973002097e-05,
"loss": 0.0476,
"step": 6950
},
{
"epoch": 26.76923076923077,
"grad_norm": 2.0031795501708984,
"learning_rate": 2.328916331054908e-05,
"loss": 0.0437,
"step": 6960
},
{
"epoch": 26.807692307692307,
"grad_norm": 1.6738231182098389,
"learning_rate": 2.3268480985840093e-05,
"loss": 0.0423,
"step": 6970
},
{
"epoch": 26.846153846153847,
"grad_norm": 1.5337883234024048,
"learning_rate": 2.3247776055419826e-05,
"loss": 0.0417,
"step": 6980
},
{
"epoch": 26.884615384615383,
"grad_norm": 1.6017706394195557,
"learning_rate": 2.3227048575894758e-05,
"loss": 0.0451,
"step": 6990
},
{
"epoch": 26.923076923076923,
"grad_norm": 1.8597776889801025,
"learning_rate": 2.3206298603933037e-05,
"loss": 0.04,
"step": 7000
},
{
"epoch": 26.96153846153846,
"grad_norm": 1.6399952173233032,
"learning_rate": 2.3185526196264288e-05,
"loss": 0.0434,
"step": 7010
},
{
"epoch": 27.0,
"grad_norm": 1.3050568103790283,
"learning_rate": 2.3164731409679476e-05,
"loss": 0.0364,
"step": 7020
},
{
"epoch": 27.03846153846154,
"grad_norm": 1.4773322343826294,
"learning_rate": 2.3143914301030765e-05,
"loss": 0.0443,
"step": 7030
},
{
"epoch": 27.076923076923077,
"grad_norm": 1.4294204711914062,
"learning_rate": 2.3123074927231332e-05,
"loss": 0.0451,
"step": 7040
},
{
"epoch": 27.115384615384617,
"grad_norm": 1.8464568853378296,
"learning_rate": 2.310221334525522e-05,
"loss": 0.0509,
"step": 7050
},
{
"epoch": 27.153846153846153,
"grad_norm": 1.5100023746490479,
"learning_rate": 2.3081329612137207e-05,
"loss": 0.041,
"step": 7060
},
{
"epoch": 27.192307692307693,
"grad_norm": 1.688292145729065,
"learning_rate": 2.3060423784972625e-05,
"loss": 0.045,
"step": 7070
},
{
"epoch": 27.23076923076923,
"grad_norm": 1.5028406381607056,
"learning_rate": 2.3039495920917193e-05,
"loss": 0.0398,
"step": 7080
},
{
"epoch": 27.26923076923077,
"grad_norm": 1.6817067861557007,
"learning_rate": 2.301854607718691e-05,
"loss": 0.0399,
"step": 7090
},
{
"epoch": 27.307692307692307,
"grad_norm": 1.7924225330352783,
"learning_rate": 2.299757431105783e-05,
"loss": 0.0435,
"step": 7100
},
{
"epoch": 27.346153846153847,
"grad_norm": 1.8683700561523438,
"learning_rate": 2.2976580679865972e-05,
"loss": 0.0438,
"step": 7110
},
{
"epoch": 27.384615384615383,
"grad_norm": 1.6464298963546753,
"learning_rate": 2.2955565241007123e-05,
"loss": 0.0462,
"step": 7120
},
{
"epoch": 27.423076923076923,
"grad_norm": 1.7487648725509644,
"learning_rate": 2.293452805193669e-05,
"loss": 0.0407,
"step": 7130
},
{
"epoch": 27.46153846153846,
"grad_norm": 1.8740178346633911,
"learning_rate": 2.291346917016954e-05,
"loss": 0.046,
"step": 7140
},
{
"epoch": 27.5,
"grad_norm": 1.5780128240585327,
"learning_rate": 2.289238865327985e-05,
"loss": 0.0451,
"step": 7150
},
{
"epoch": 27.53846153846154,
"grad_norm": 1.7798570394515991,
"learning_rate": 2.2871286558900956e-05,
"loss": 0.0461,
"step": 7160
},
{
"epoch": 27.576923076923077,
"grad_norm": 2.028824806213379,
"learning_rate": 2.285016294472517e-05,
"loss": 0.0448,
"step": 7170
},
{
"epoch": 27.615384615384617,
"grad_norm": 1.669351577758789,
"learning_rate": 2.2829017868503658e-05,
"loss": 0.042,
"step": 7180
},
{
"epoch": 27.653846153846153,
"grad_norm": 2.1708717346191406,
"learning_rate": 2.280785138804624e-05,
"loss": 0.0513,
"step": 7190
},
{
"epoch": 27.692307692307693,
"grad_norm": 1.5223692655563354,
"learning_rate": 2.2786663561221265e-05,
"loss": 0.0407,
"step": 7200
},
{
"epoch": 27.73076923076923,
"grad_norm": 1.670336365699768,
"learning_rate": 2.2765454445955452e-05,
"loss": 0.0369,
"step": 7210
},
{
"epoch": 27.76923076923077,
"grad_norm": 1.4955790042877197,
"learning_rate": 2.2744224100233705e-05,
"loss": 0.0479,
"step": 7220
},
{
"epoch": 27.807692307692307,
"grad_norm": 1.7686519622802734,
"learning_rate": 2.2722972582098984e-05,
"loss": 0.0408,
"step": 7230
},
{
"epoch": 27.846153846153847,
"grad_norm": 2.064610004425049,
"learning_rate": 2.2701699949652118e-05,
"loss": 0.0415,
"step": 7240
},
{
"epoch": 27.884615384615383,
"grad_norm": 2.065558433532715,
"learning_rate": 2.2680406261051685e-05,
"loss": 0.0386,
"step": 7250
},
{
"epoch": 27.923076923076923,
"grad_norm": 1.6095936298370361,
"learning_rate": 2.2659091574513805e-05,
"loss": 0.0433,
"step": 7260
},
{
"epoch": 27.96153846153846,
"grad_norm": 1.7477508783340454,
"learning_rate": 2.263775594831202e-05,
"loss": 0.0431,
"step": 7270
},
{
"epoch": 28.0,
"grad_norm": 1.827488899230957,
"learning_rate": 2.2616399440777128e-05,
"loss": 0.0389,
"step": 7280
},
{
"epoch": 28.03846153846154,
"grad_norm": 1.7892285585403442,
"learning_rate": 2.2595022110296988e-05,
"loss": 0.042,
"step": 7290
},
{
"epoch": 28.076923076923077,
"grad_norm": 2.0402352809906006,
"learning_rate": 2.2573624015316418e-05,
"loss": 0.0459,
"step": 7300
},
{
"epoch": 28.115384615384617,
"grad_norm": 1.7068613767623901,
"learning_rate": 2.2552205214336986e-05,
"loss": 0.0445,
"step": 7310
},
{
"epoch": 28.153846153846153,
"grad_norm": 1.5043061971664429,
"learning_rate": 2.253076576591688e-05,
"loss": 0.041,
"step": 7320
},
{
"epoch": 28.192307692307693,
"grad_norm": 1.7505626678466797,
"learning_rate": 2.2509305728670733e-05,
"loss": 0.0458,
"step": 7330
},
{
"epoch": 28.23076923076923,
"grad_norm": 1.7237753868103027,
"learning_rate": 2.2487825161269463e-05,
"loss": 0.0417,
"step": 7340
},
{
"epoch": 28.26923076923077,
"grad_norm": 1.7082198858261108,
"learning_rate": 2.2466324122440125e-05,
"loss": 0.0397,
"step": 7350
},
{
"epoch": 28.307692307692307,
"grad_norm": 1.798220157623291,
"learning_rate": 2.2444802670965732e-05,
"loss": 0.0403,
"step": 7360
},
{
"epoch": 28.346153846153847,
"grad_norm": 2.100522994995117,
"learning_rate": 2.2423260865685124e-05,
"loss": 0.0401,
"step": 7370
},
{
"epoch": 28.384615384615383,
"grad_norm": 2.088228702545166,
"learning_rate": 2.2401698765492762e-05,
"loss": 0.0438,
"step": 7380
},
{
"epoch": 28.423076923076923,
"grad_norm": 1.6894526481628418,
"learning_rate": 2.2380116429338612e-05,
"loss": 0.0445,
"step": 7390
},
{
"epoch": 28.46153846153846,
"grad_norm": 1.8538814783096313,
"learning_rate": 2.2358513916227945e-05,
"loss": 0.0418,
"step": 7400
},
{
"epoch": 28.5,
"grad_norm": 1.6178901195526123,
"learning_rate": 2.233689128522122e-05,
"loss": 0.0372,
"step": 7410
},
{
"epoch": 28.53846153846154,
"grad_norm": 1.6388591527938843,
"learning_rate": 2.2315248595433883e-05,
"loss": 0.0455,
"step": 7420
},
{
"epoch": 28.576923076923077,
"grad_norm": 1.7331197261810303,
"learning_rate": 2.2293585906036214e-05,
"loss": 0.0421,
"step": 7430
},
{
"epoch": 28.615384615384617,
"grad_norm": 1.5612764358520508,
"learning_rate": 2.2271903276253183e-05,
"loss": 0.0426,
"step": 7440
},
{
"epoch": 28.653846153846153,
"grad_norm": 1.5970007181167603,
"learning_rate": 2.2250200765364273e-05,
"loss": 0.0393,
"step": 7450
},
{
"epoch": 28.692307692307693,
"grad_norm": 1.7833430767059326,
"learning_rate": 2.2228478432703317e-05,
"loss": 0.043,
"step": 7460
},
{
"epoch": 28.73076923076923,
"grad_norm": 1.7582666873931885,
"learning_rate": 2.2206736337658348e-05,
"loss": 0.0426,
"step": 7470
},
{
"epoch": 28.76923076923077,
"grad_norm": 1.8394339084625244,
"learning_rate": 2.2184974539671417e-05,
"loss": 0.0421,
"step": 7480
},
{
"epoch": 28.807692307692307,
"grad_norm": 2.0509567260742188,
"learning_rate": 2.2163193098238453e-05,
"loss": 0.0468,
"step": 7490
},
{
"epoch": 28.846153846153847,
"grad_norm": 1.9507832527160645,
"learning_rate": 2.2141392072909082e-05,
"loss": 0.0427,
"step": 7500
},
{
"epoch": 28.884615384615383,
"grad_norm": 2.1092071533203125,
"learning_rate": 2.2119571523286484e-05,
"loss": 0.037,
"step": 7510
},
{
"epoch": 28.923076923076923,
"grad_norm": 1.6048542261123657,
"learning_rate": 2.2097731509027196e-05,
"loss": 0.045,
"step": 7520
},
{
"epoch": 28.96153846153846,
"grad_norm": 1.5821419954299927,
"learning_rate": 2.207587208984099e-05,
"loss": 0.0443,
"step": 7530
},
{
"epoch": 29.0,
"grad_norm": 1.5150803327560425,
"learning_rate": 2.205399332549068e-05,
"loss": 0.0385,
"step": 7540
},
{
"epoch": 29.03846153846154,
"grad_norm": 1.6221213340759277,
"learning_rate": 2.2032095275791974e-05,
"loss": 0.0407,
"step": 7550
},
{
"epoch": 29.076923076923077,
"grad_norm": 1.737960934638977,
"learning_rate": 2.2010178000613307e-05,
"loss": 0.0407,
"step": 7560
},
{
"epoch": 29.115384615384617,
"grad_norm": 1.7430760860443115,
"learning_rate": 2.1988241559875666e-05,
"loss": 0.0417,
"step": 7570
},
{
"epoch": 29.153846153846153,
"grad_norm": 1.7095967531204224,
"learning_rate": 2.1966286013552448e-05,
"loss": 0.0466,
"step": 7580
},
{
"epoch": 29.192307692307693,
"grad_norm": 1.701668381690979,
"learning_rate": 2.1944311421669274e-05,
"loss": 0.0432,
"step": 7590
},
{
"epoch": 29.23076923076923,
"grad_norm": 1.6135324239730835,
"learning_rate": 2.1922317844303846e-05,
"loss": 0.045,
"step": 7600
},
{
"epoch": 29.26923076923077,
"grad_norm": 1.5123496055603027,
"learning_rate": 2.1900305341585756e-05,
"loss": 0.037,
"step": 7610
},
{
"epoch": 29.307692307692307,
"grad_norm": 1.4518818855285645,
"learning_rate": 2.187827397369635e-05,
"loss": 0.0387,
"step": 7620
},
{
"epoch": 29.346153846153847,
"grad_norm": 1.8243811130523682,
"learning_rate": 2.1856223800868542e-05,
"loss": 0.0408,
"step": 7630
},
{
"epoch": 29.384615384615383,
"grad_norm": 1.6765333414077759,
"learning_rate": 2.183415488338667e-05,
"loss": 0.0421,
"step": 7640
},
{
"epoch": 29.423076923076923,
"grad_norm": 1.7374074459075928,
"learning_rate": 2.1812067281586312e-05,
"loss": 0.0449,
"step": 7650
},
{
"epoch": 29.46153846153846,
"grad_norm": 1.3828516006469727,
"learning_rate": 2.178996105585412e-05,
"loss": 0.0416,
"step": 7660
},
{
"epoch": 29.5,
"grad_norm": 1.41438627243042,
"learning_rate": 2.1767836266627676e-05,
"loss": 0.0416,
"step": 7670
},
{
"epoch": 29.53846153846154,
"grad_norm": 1.4291772842407227,
"learning_rate": 2.174569297439531e-05,
"loss": 0.0381,
"step": 7680
},
{
"epoch": 29.576923076923077,
"grad_norm": 1.4206595420837402,
"learning_rate": 2.1723531239695932e-05,
"loss": 0.0411,
"step": 7690
},
{
"epoch": 29.615384615384617,
"grad_norm": 1.4999078512191772,
"learning_rate": 2.1701351123118886e-05,
"loss": 0.0406,
"step": 7700
},
{
"epoch": 29.653846153846153,
"grad_norm": 1.9684511423110962,
"learning_rate": 2.167915268530376e-05,
"loss": 0.0438,
"step": 7710
},
{
"epoch": 29.692307692307693,
"grad_norm": 1.7212599515914917,
"learning_rate": 2.165693598694023e-05,
"loss": 0.0397,
"step": 7720
},
{
"epoch": 29.73076923076923,
"grad_norm": 1.5021146535873413,
"learning_rate": 2.163470108876791e-05,
"loss": 0.0478,
"step": 7730
},
{
"epoch": 29.76923076923077,
"grad_norm": 1.5587773323059082,
"learning_rate": 2.161244805157616e-05,
"loss": 0.0357,
"step": 7740
},
{
"epoch": 29.807692307692307,
"grad_norm": 1.5270205736160278,
"learning_rate": 2.159017693620393e-05,
"loss": 0.0415,
"step": 7750
},
{
"epoch": 29.846153846153847,
"grad_norm": 1.5055110454559326,
"learning_rate": 2.15678878035396e-05,
"loss": 0.04,
"step": 7760
},
{
"epoch": 29.884615384615383,
"grad_norm": 1.9528456926345825,
"learning_rate": 2.1545580714520817e-05,
"loss": 0.0351,
"step": 7770
},
{
"epoch": 29.923076923076923,
"grad_norm": 1.9646743535995483,
"learning_rate": 2.1523255730134294e-05,
"loss": 0.0434,
"step": 7780
},
{
"epoch": 29.96153846153846,
"grad_norm": 1.6051669120788574,
"learning_rate": 2.15009129114157e-05,
"loss": 0.0373,
"step": 7790
},
{
"epoch": 30.0,
"grad_norm": 1.7047405242919922,
"learning_rate": 2.1478552319449443e-05,
"loss": 0.0408,
"step": 7800
},
{
"epoch": 30.03846153846154,
"grad_norm": 1.537502646446228,
"learning_rate": 2.1456174015368527e-05,
"loss": 0.0426,
"step": 7810
},
{
"epoch": 30.076923076923077,
"grad_norm": 1.4524402618408203,
"learning_rate": 2.1433778060354375e-05,
"loss": 0.0464,
"step": 7820
},
{
"epoch": 30.115384615384617,
"grad_norm": 1.7234159708023071,
"learning_rate": 2.1411364515636685e-05,
"loss": 0.0512,
"step": 7830
},
{
"epoch": 30.153846153846153,
"grad_norm": 1.512885570526123,
"learning_rate": 2.1388933442493232e-05,
"loss": 0.0438,
"step": 7840
},
{
"epoch": 30.192307692307693,
"grad_norm": 1.5967854261398315,
"learning_rate": 2.13664849022497e-05,
"loss": 0.039,
"step": 7850
},
{
"epoch": 30.23076923076923,
"grad_norm": 2.0385942459106445,
"learning_rate": 2.1344018956279547e-05,
"loss": 0.0369,
"step": 7860
},
{
"epoch": 30.26923076923077,
"grad_norm": 1.9069523811340332,
"learning_rate": 2.1321535666003817e-05,
"loss": 0.0392,
"step": 7870
},
{
"epoch": 30.307692307692307,
"grad_norm": 1.7070564031600952,
"learning_rate": 2.1299035092890966e-05,
"loss": 0.0447,
"step": 7880
},
{
"epoch": 30.346153846153847,
"grad_norm": 1.4670286178588867,
"learning_rate": 2.12765172984567e-05,
"loss": 0.0416,
"step": 7890
},
{
"epoch": 30.384615384615383,
"grad_norm": 1.7784276008605957,
"learning_rate": 2.1253982344263803e-05,
"loss": 0.0378,
"step": 7900
},
{
"epoch": 30.423076923076923,
"grad_norm": 1.4911285638809204,
"learning_rate": 2.1231430291921987e-05,
"loss": 0.0459,
"step": 7910
},
{
"epoch": 30.46153846153846,
"grad_norm": 1.4901173114776611,
"learning_rate": 2.1208861203087695e-05,
"loss": 0.0404,
"step": 7920
},
{
"epoch": 30.5,
"grad_norm": 1.8902019262313843,
"learning_rate": 2.1186275139463967e-05,
"loss": 0.0421,
"step": 7930
},
{
"epoch": 30.53846153846154,
"grad_norm": 1.5284932851791382,
"learning_rate": 2.1163672162800222e-05,
"loss": 0.0405,
"step": 7940
},
{
"epoch": 30.576923076923077,
"grad_norm": 1.6077547073364258,
"learning_rate": 2.114105233489215e-05,
"loss": 0.0505,
"step": 7950
},
{
"epoch": 30.615384615384617,
"grad_norm": 1.7635397911071777,
"learning_rate": 2.1118415717581487e-05,
"loss": 0.0509,
"step": 7960
},
{
"epoch": 30.653846153846153,
"grad_norm": 1.7803516387939453,
"learning_rate": 2.1095762372755885e-05,
"loss": 0.0428,
"step": 7970
},
{
"epoch": 30.692307692307693,
"grad_norm": 1.7161511182785034,
"learning_rate": 2.1073092362348716e-05,
"loss": 0.0496,
"step": 7980
},
{
"epoch": 30.73076923076923,
"grad_norm": 1.8986924886703491,
"learning_rate": 2.1050405748338933e-05,
"loss": 0.0425,
"step": 7990
},
{
"epoch": 30.76923076923077,
"grad_norm": 1.8261290788650513,
"learning_rate": 2.102770259275087e-05,
"loss": 0.0395,
"step": 8000
},
{
"epoch": 30.807692307692307,
"grad_norm": 1.6952645778656006,
"learning_rate": 2.100498295765408e-05,
"loss": 0.0454,
"step": 8010
},
{
"epoch": 30.846153846153847,
"grad_norm": 1.8140101432800293,
"learning_rate": 2.098224690516319e-05,
"loss": 0.0411,
"step": 8020
},
{
"epoch": 30.884615384615383,
"grad_norm": 1.8083970546722412,
"learning_rate": 2.0959494497437688e-05,
"loss": 0.0427,
"step": 8030
},
{
"epoch": 30.923076923076923,
"grad_norm": 1.270798921585083,
"learning_rate": 2.0936725796681796e-05,
"loss": 0.0379,
"step": 8040
},
{
"epoch": 30.96153846153846,
"grad_norm": 1.6783995628356934,
"learning_rate": 2.0913940865144266e-05,
"loss": 0.0372,
"step": 8050
},
{
"epoch": 31.0,
"grad_norm": 1.6331335306167603,
"learning_rate": 2.0891139765118235e-05,
"loss": 0.0415,
"step": 8060
},
{
"epoch": 31.03846153846154,
"grad_norm": 1.3328851461410522,
"learning_rate": 2.086832255894104e-05,
"loss": 0.0386,
"step": 8070
},
{
"epoch": 31.076923076923077,
"grad_norm": 1.5463865995407104,
"learning_rate": 2.084548930899405e-05,
"loss": 0.0392,
"step": 8080
},
{
"epoch": 31.115384615384617,
"grad_norm": 1.4242238998413086,
"learning_rate": 2.08226400777025e-05,
"loss": 0.0371,
"step": 8090
},
{
"epoch": 31.153846153846153,
"grad_norm": 1.5513811111450195,
"learning_rate": 2.0799774927535313e-05,
"loss": 0.0368,
"step": 8100
},
{
"epoch": 31.192307692307693,
"grad_norm": 1.7626824378967285,
"learning_rate": 2.0776893921004936e-05,
"loss": 0.0394,
"step": 8110
},
{
"epoch": 31.23076923076923,
"grad_norm": 1.5672152042388916,
"learning_rate": 2.0753997120667172e-05,
"loss": 0.0431,
"step": 8120
},
{
"epoch": 31.26923076923077,
"grad_norm": 1.6233484745025635,
"learning_rate": 2.0731084589120995e-05,
"loss": 0.0358,
"step": 8130
},
{
"epoch": 31.307692307692307,
"grad_norm": 1.7482249736785889,
"learning_rate": 2.070815638900839e-05,
"loss": 0.0446,
"step": 8140
},
{
"epoch": 31.346153846153847,
"grad_norm": 1.4374932050704956,
"learning_rate": 2.0685212583014186e-05,
"loss": 0.0407,
"step": 8150
},
{
"epoch": 31.384615384615383,
"grad_norm": 1.583405613899231,
"learning_rate": 2.0662253233865866e-05,
"loss": 0.0395,
"step": 8160
},
{
"epoch": 31.423076923076923,
"grad_norm": 1.5504549741744995,
"learning_rate": 2.063927840433342e-05,
"loss": 0.0396,
"step": 8170
},
{
"epoch": 31.46153846153846,
"grad_norm": 1.6861332654953003,
"learning_rate": 2.0616288157229154e-05,
"loss": 0.0427,
"step": 8180
},
{
"epoch": 31.5,
"grad_norm": 1.3032209873199463,
"learning_rate": 2.0593282555407522e-05,
"loss": 0.0409,
"step": 8190
},
{
"epoch": 31.53846153846154,
"grad_norm": 1.734984040260315,
"learning_rate": 2.057026166176496e-05,
"loss": 0.0405,
"step": 8200
},
{
"epoch": 31.576923076923077,
"grad_norm": 1.5121088027954102,
"learning_rate": 2.0547225539239715e-05,
"loss": 0.04,
"step": 8210
},
{
"epoch": 31.615384615384617,
"grad_norm": 1.6952282190322876,
"learning_rate": 2.0524174250811665e-05,
"loss": 0.0442,
"step": 8220
},
{
"epoch": 31.653846153846153,
"grad_norm": 1.8165638446807861,
"learning_rate": 2.050110785950216e-05,
"loss": 0.0415,
"step": 8230
},
{
"epoch": 31.692307692307693,
"grad_norm": 1.6110750436782837,
"learning_rate": 2.047802642837382e-05,
"loss": 0.0377,
"step": 8240
},
{
"epoch": 31.73076923076923,
"grad_norm": 1.6655876636505127,
"learning_rate": 2.0454930020530403e-05,
"loss": 0.0385,
"step": 8250
},
{
"epoch": 31.76923076923077,
"grad_norm": 1.491156816482544,
"learning_rate": 2.0431818699116606e-05,
"loss": 0.0375,
"step": 8260
},
{
"epoch": 31.807692307692307,
"grad_norm": 1.4836500883102417,
"learning_rate": 2.04086925273179e-05,
"loss": 0.0521,
"step": 8270
},
{
"epoch": 31.846153846153847,
"grad_norm": 1.317879319190979,
"learning_rate": 2.0385551568360357e-05,
"loss": 0.0393,
"step": 8280
},
{
"epoch": 31.884615384615383,
"grad_norm": 1.4740430116653442,
"learning_rate": 2.036239588551047e-05,
"loss": 0.0401,
"step": 8290
},
{
"epoch": 31.923076923076923,
"grad_norm": 1.7209266424179077,
"learning_rate": 2.0339225542074996e-05,
"loss": 0.0352,
"step": 8300
},
{
"epoch": 31.96153846153846,
"grad_norm": 1.6838961839675903,
"learning_rate": 2.0316040601400765e-05,
"loss": 0.0433,
"step": 8310
},
{
"epoch": 32.0,
"grad_norm": 1.6667869091033936,
"learning_rate": 2.029284112687453e-05,
"loss": 0.0417,
"step": 8320
},
{
"epoch": 32.03846153846154,
"grad_norm": 1.8445954322814941,
"learning_rate": 2.0269627181922752e-05,
"loss": 0.0414,
"step": 8330
},
{
"epoch": 32.07692307692308,
"grad_norm": 1.4822769165039062,
"learning_rate": 2.0246398830011482e-05,
"loss": 0.0382,
"step": 8340
},
{
"epoch": 32.11538461538461,
"grad_norm": 1.6866445541381836,
"learning_rate": 2.0223156134646142e-05,
"loss": 0.042,
"step": 8350
},
{
"epoch": 32.15384615384615,
"grad_norm": 1.7119044065475464,
"learning_rate": 2.019989915937138e-05,
"loss": 0.0403,
"step": 8360
},
{
"epoch": 32.19230769230769,
"grad_norm": 2.0892767906188965,
"learning_rate": 2.0176627967770873e-05,
"loss": 0.035,
"step": 8370
},
{
"epoch": 32.23076923076923,
"grad_norm": 1.5972812175750732,
"learning_rate": 2.015334262346717e-05,
"loss": 0.0409,
"step": 8380
},
{
"epoch": 32.26923076923077,
"grad_norm": 1.5595967769622803,
"learning_rate": 2.0130043190121515e-05,
"loss": 0.04,
"step": 8390
},
{
"epoch": 32.30769230769231,
"grad_norm": 1.349557876586914,
"learning_rate": 2.0106729731433663e-05,
"loss": 0.0337,
"step": 8400
},
{
"epoch": 32.34615384615385,
"grad_norm": 1.450696587562561,
"learning_rate": 2.008340231114173e-05,
"loss": 0.0388,
"step": 8410
},
{
"epoch": 32.38461538461539,
"grad_norm": 1.7912949323654175,
"learning_rate": 2.006006099302199e-05,
"loss": 0.0378,
"step": 8420
},
{
"epoch": 32.42307692307692,
"grad_norm": 1.6425917148590088,
"learning_rate": 2.003670584088871e-05,
"loss": 0.0384,
"step": 8430
},
{
"epoch": 32.46153846153846,
"grad_norm": 1.5332478284835815,
"learning_rate": 2.001333691859399e-05,
"loss": 0.0348,
"step": 8440
},
{
"epoch": 32.5,
"grad_norm": 1.5672633647918701,
"learning_rate": 1.9989954290027565e-05,
"loss": 0.0379,
"step": 8450
},
{
"epoch": 32.53846153846154,
"grad_norm": 1.715303897857666,
"learning_rate": 1.9966558019116654e-05,
"loss": 0.0344,
"step": 8460
},
{
"epoch": 32.57692307692308,
"grad_norm": 1.3653415441513062,
"learning_rate": 1.9943148169825766e-05,
"loss": 0.0391,
"step": 8470
},
{
"epoch": 32.61538461538461,
"grad_norm": 1.5256359577178955,
"learning_rate": 1.991972480615653e-05,
"loss": 0.046,
"step": 8480
},
{
"epoch": 32.65384615384615,
"grad_norm": 1.432858943939209,
"learning_rate": 1.989628799214754e-05,
"loss": 0.0408,
"step": 8490
},
{
"epoch": 32.69230769230769,
"grad_norm": 1.5813000202178955,
"learning_rate": 1.987283779187414e-05,
"loss": 0.0359,
"step": 8500
},
{
"epoch": 32.73076923076923,
"grad_norm": 1.5625938177108765,
"learning_rate": 1.9849374269448288e-05,
"loss": 0.0407,
"step": 8510
},
{
"epoch": 32.76923076923077,
"grad_norm": 1.6737438440322876,
"learning_rate": 1.982589748901836e-05,
"loss": 0.0387,
"step": 8520
},
{
"epoch": 32.80769230769231,
"grad_norm": 1.765540361404419,
"learning_rate": 1.9802407514768964e-05,
"loss": 0.0369,
"step": 8530
},
{
"epoch": 32.84615384615385,
"grad_norm": 1.6643508672714233,
"learning_rate": 1.9778904410920808e-05,
"loss": 0.036,
"step": 8540
},
{
"epoch": 32.88461538461539,
"grad_norm": 1.470862865447998,
"learning_rate": 1.9755388241730475e-05,
"loss": 0.0414,
"step": 8550
},
{
"epoch": 32.92307692307692,
"grad_norm": 1.748119831085205,
"learning_rate": 1.973185907149027e-05,
"loss": 0.0362,
"step": 8560
},
{
"epoch": 32.96153846153846,
"grad_norm": 1.780103325843811,
"learning_rate": 1.970831696452805e-05,
"loss": 0.0421,
"step": 8570
},
{
"epoch": 33.0,
"grad_norm": 1.5416651964187622,
"learning_rate": 1.9684761985207038e-05,
"loss": 0.0369,
"step": 8580
},
{
"epoch": 33.03846153846154,
"grad_norm": 1.6990591287612915,
"learning_rate": 1.9661194197925644e-05,
"loss": 0.0356,
"step": 8590
},
{
"epoch": 33.07692307692308,
"grad_norm": 1.5676331520080566,
"learning_rate": 1.9637613667117303e-05,
"loss": 0.0437,
"step": 8600
},
{
"epoch": 33.11538461538461,
"grad_norm": 1.9157164096832275,
"learning_rate": 1.961402045725028e-05,
"loss": 0.0408,
"step": 8610
},
{
"epoch": 33.15384615384615,
"grad_norm": 1.409117341041565,
"learning_rate": 1.9590414632827513e-05,
"loss": 0.0414,
"step": 8620
},
{
"epoch": 33.19230769230769,
"grad_norm": 1.700792670249939,
"learning_rate": 1.9566796258386424e-05,
"loss": 0.037,
"step": 8630
},
{
"epoch": 33.23076923076923,
"grad_norm": 1.9453880786895752,
"learning_rate": 1.9543165398498743e-05,
"loss": 0.0385,
"step": 8640
},
{
"epoch": 33.26923076923077,
"grad_norm": 1.4440393447875977,
"learning_rate": 1.9519522117770355e-05,
"loss": 0.0409,
"step": 8650
},
{
"epoch": 33.30769230769231,
"grad_norm": 1.8256205320358276,
"learning_rate": 1.9495866480841063e-05,
"loss": 0.038,
"step": 8660
},
{
"epoch": 33.34615384615385,
"grad_norm": 2.067312479019165,
"learning_rate": 1.9472198552384494e-05,
"loss": 0.0378,
"step": 8670
},
{
"epoch": 33.38461538461539,
"grad_norm": 1.8080998659133911,
"learning_rate": 1.9448518397107848e-05,
"loss": 0.0446,
"step": 8680
},
{
"epoch": 33.42307692307692,
"grad_norm": 1.553263545036316,
"learning_rate": 1.942482607975177e-05,
"loss": 0.0385,
"step": 8690
},
{
"epoch": 33.46153846153846,
"grad_norm": 1.4473556280136108,
"learning_rate": 1.940112166509016e-05,
"loss": 0.0382,
"step": 8700
},
{
"epoch": 33.5,
"grad_norm": 1.6522272825241089,
"learning_rate": 1.937740521792996e-05,
"loss": 0.0339,
"step": 8710
},
{
"epoch": 33.53846153846154,
"grad_norm": 1.5314304828643799,
"learning_rate": 1.935367680311106e-05,
"loss": 0.0381,
"step": 8720
},
{
"epoch": 33.57692307692308,
"grad_norm": 1.5628886222839355,
"learning_rate": 1.9329936485506012e-05,
"loss": 0.0403,
"step": 8730
},
{
"epoch": 33.61538461538461,
"grad_norm": 1.291195034980774,
"learning_rate": 1.930618433001996e-05,
"loss": 0.0386,
"step": 8740
},
{
"epoch": 33.65384615384615,
"grad_norm": 1.1823079586029053,
"learning_rate": 1.9282420401590377e-05,
"loss": 0.0402,
"step": 8750
},
{
"epoch": 33.69230769230769,
"grad_norm": 1.515400767326355,
"learning_rate": 1.925864476518694e-05,
"loss": 0.0431,
"step": 8760
},
{
"epoch": 33.73076923076923,
"grad_norm": 1.39093017578125,
"learning_rate": 1.9234857485811336e-05,
"loss": 0.0366,
"step": 8770
},
{
"epoch": 33.76923076923077,
"grad_norm": 1.5924241542816162,
"learning_rate": 1.9211058628497066e-05,
"loss": 0.0412,
"step": 8780
},
{
"epoch": 33.80769230769231,
"grad_norm": 1.3143709897994995,
"learning_rate": 1.918724825830931e-05,
"loss": 0.0447,
"step": 8790
},
{
"epoch": 33.84615384615385,
"grad_norm": 1.6090748310089111,
"learning_rate": 1.9163426440344702e-05,
"loss": 0.0398,
"step": 8800
},
{
"epoch": 33.88461538461539,
"grad_norm": 1.4839627742767334,
"learning_rate": 1.913959323973119e-05,
"loss": 0.0405,
"step": 8810
},
{
"epoch": 33.92307692307692,
"grad_norm": 1.691476583480835,
"learning_rate": 1.9115748721627827e-05,
"loss": 0.041,
"step": 8820
},
{
"epoch": 33.96153846153846,
"grad_norm": 1.4554405212402344,
"learning_rate": 1.9091892951224614e-05,
"loss": 0.0394,
"step": 8830
},
{
"epoch": 34.0,
"grad_norm": 1.4189049005508423,
"learning_rate": 1.906802599374233e-05,
"loss": 0.0357,
"step": 8840
},
{
"epoch": 34.03846153846154,
"grad_norm": 1.9339512586593628,
"learning_rate": 1.904414791443231e-05,
"loss": 0.0432,
"step": 8850
},
{
"epoch": 34.07692307692308,
"grad_norm": 1.4830939769744873,
"learning_rate": 1.9020258778576324e-05,
"loss": 0.042,
"step": 8860
},
{
"epoch": 34.11538461538461,
"grad_norm": 1.5076172351837158,
"learning_rate": 1.8996358651486347e-05,
"loss": 0.0365,
"step": 8870
},
{
"epoch": 34.15384615384615,
"grad_norm": 1.5338919162750244,
"learning_rate": 1.8972447598504417e-05,
"loss": 0.0406,
"step": 8880
},
{
"epoch": 34.19230769230769,
"grad_norm": 1.9823853969573975,
"learning_rate": 1.8948525685002438e-05,
"loss": 0.0349,
"step": 8890
},
{
"epoch": 34.23076923076923,
"grad_norm": 1.429854393005371,
"learning_rate": 1.892459297638201e-05,
"loss": 0.0427,
"step": 8900
},
{
"epoch": 34.26923076923077,
"grad_norm": 1.8108021020889282,
"learning_rate": 1.890064953807425e-05,
"loss": 0.0341,
"step": 8910
},
{
"epoch": 34.30769230769231,
"grad_norm": 1.07712721824646,
"learning_rate": 1.8876695435539596e-05,
"loss": 0.0371,
"step": 8920
},
{
"epoch": 34.34615384615385,
"grad_norm": 1.3517080545425415,
"learning_rate": 1.8852730734267653e-05,
"loss": 0.0348,
"step": 8930
},
{
"epoch": 34.38461538461539,
"grad_norm": 1.369992733001709,
"learning_rate": 1.8828755499776997e-05,
"loss": 0.0379,
"step": 8940
},
{
"epoch": 34.42307692307692,
"grad_norm": 1.4494092464447021,
"learning_rate": 1.8804769797615007e-05,
"loss": 0.0386,
"step": 8950
},
{
"epoch": 34.46153846153846,
"grad_norm": 1.806056022644043,
"learning_rate": 1.8780773693357675e-05,
"loss": 0.0416,
"step": 8960
},
{
"epoch": 34.5,
"grad_norm": 1.4004135131835938,
"learning_rate": 1.8756767252609433e-05,
"loss": 0.034,
"step": 8970
},
{
"epoch": 34.53846153846154,
"grad_norm": 1.3696749210357666,
"learning_rate": 1.8732750541002974e-05,
"loss": 0.0393,
"step": 8980
},
{
"epoch": 34.57692307692308,
"grad_norm": 1.2756446599960327,
"learning_rate": 1.870872362419907e-05,
"loss": 0.0398,
"step": 8990
},
{
"epoch": 34.61538461538461,
"grad_norm": 1.3034799098968506,
"learning_rate": 1.8684686567886398e-05,
"loss": 0.0435,
"step": 9000
},
{
"epoch": 34.65384615384615,
"grad_norm": 1.5392398834228516,
"learning_rate": 1.8660639437781344e-05,
"loss": 0.0349,
"step": 9010
},
{
"epoch": 34.69230769230769,
"grad_norm": 1.5113469362258911,
"learning_rate": 1.8636582299627854e-05,
"loss": 0.0359,
"step": 9020
},
{
"epoch": 34.73076923076923,
"grad_norm": 1.1920346021652222,
"learning_rate": 1.8612515219197215e-05,
"loss": 0.0361,
"step": 9030
},
{
"epoch": 34.76923076923077,
"grad_norm": 1.304843783378601,
"learning_rate": 1.858843826228791e-05,
"loss": 0.0357,
"step": 9040
},
{
"epoch": 34.80769230769231,
"grad_norm": 1.7710132598876953,
"learning_rate": 1.8564351494725423e-05,
"loss": 0.0373,
"step": 9050
},
{
"epoch": 34.84615384615385,
"grad_norm": 1.4856317043304443,
"learning_rate": 1.8540254982362053e-05,
"loss": 0.0371,
"step": 9060
},
{
"epoch": 34.88461538461539,
"grad_norm": 1.8330937623977661,
"learning_rate": 1.8516148791076743e-05,
"loss": 0.0389,
"step": 9070
},
{
"epoch": 34.92307692307692,
"grad_norm": 1.504800796508789,
"learning_rate": 1.8492032986774904e-05,
"loss": 0.0359,
"step": 9080
},
{
"epoch": 34.96153846153846,
"grad_norm": 1.2242687940597534,
"learning_rate": 1.8467907635388225e-05,
"loss": 0.0371,
"step": 9090
},
{
"epoch": 35.0,
"grad_norm": 1.3216480016708374,
"learning_rate": 1.844377280287449e-05,
"loss": 0.0386,
"step": 9100
},
{
"epoch": 35.03846153846154,
"grad_norm": 1.0509378910064697,
"learning_rate": 1.8419628555217407e-05,
"loss": 0.0405,
"step": 9110
},
{
"epoch": 35.07692307692308,
"grad_norm": 1.1122572422027588,
"learning_rate": 1.839547495842644e-05,
"loss": 0.039,
"step": 9120
},
{
"epoch": 35.11538461538461,
"grad_norm": 1.334999680519104,
"learning_rate": 1.8371312078536587e-05,
"loss": 0.0316,
"step": 9130
},
{
"epoch": 35.15384615384615,
"grad_norm": 1.2729421854019165,
"learning_rate": 1.834713998160825e-05,
"loss": 0.039,
"step": 9140
},
{
"epoch": 35.19230769230769,
"grad_norm": 1.4823824167251587,
"learning_rate": 1.832295873372701e-05,
"loss": 0.0428,
"step": 9150
},
{
"epoch": 35.23076923076923,
"grad_norm": 1.5749338865280151,
"learning_rate": 1.8298768401003477e-05,
"loss": 0.0386,
"step": 9160
},
{
"epoch": 35.26923076923077,
"grad_norm": 1.4943759441375732,
"learning_rate": 1.8274569049573103e-05,
"loss": 0.0409,
"step": 9170
},
{
"epoch": 35.30769230769231,
"grad_norm": 1.4229377508163452,
"learning_rate": 1.8250360745595983e-05,
"loss": 0.0398,
"step": 9180
},
{
"epoch": 35.34615384615385,
"grad_norm": 1.192239761352539,
"learning_rate": 1.8226143555256703e-05,
"loss": 0.0348,
"step": 9190
},
{
"epoch": 35.38461538461539,
"grad_norm": 1.7158582210540771,
"learning_rate": 1.820191754476413e-05,
"loss": 0.04,
"step": 9200
},
{
"epoch": 35.42307692307692,
"grad_norm": 1.2287282943725586,
"learning_rate": 1.8177682780351256e-05,
"loss": 0.0381,
"step": 9210
},
{
"epoch": 35.46153846153846,
"grad_norm": 1.578681468963623,
"learning_rate": 1.8153439328275e-05,
"loss": 0.0434,
"step": 9220
},
{
"epoch": 35.5,
"grad_norm": 1.7834547758102417,
"learning_rate": 1.8129187254816035e-05,
"loss": 0.0393,
"step": 9230
},
{
"epoch": 35.53846153846154,
"grad_norm": 1.6021229028701782,
"learning_rate": 1.81049266262786e-05,
"loss": 0.0372,
"step": 9240
},
{
"epoch": 35.57692307692308,
"grad_norm": 1.3831838369369507,
"learning_rate": 1.808065750899033e-05,
"loss": 0.0409,
"step": 9250
},
{
"epoch": 35.61538461538461,
"grad_norm": 1.727046012878418,
"learning_rate": 1.8056379969302066e-05,
"loss": 0.0377,
"step": 9260
},
{
"epoch": 35.65384615384615,
"grad_norm": 1.710699439048767,
"learning_rate": 1.8032094073587675e-05,
"loss": 0.0376,
"step": 9270
},
{
"epoch": 35.69230769230769,
"grad_norm": 1.6480785608291626,
"learning_rate": 1.800779988824387e-05,
"loss": 0.0343,
"step": 9280
},
{
"epoch": 35.73076923076923,
"grad_norm": 1.4451653957366943,
"learning_rate": 1.7983497479690018e-05,
"loss": 0.0347,
"step": 9290
},
{
"epoch": 35.76923076923077,
"grad_norm": 1.520155668258667,
"learning_rate": 1.795918691436798e-05,
"loss": 0.039,
"step": 9300
},
{
"epoch": 35.80769230769231,
"grad_norm": 1.372151494026184,
"learning_rate": 1.7934868258741917e-05,
"loss": 0.0366,
"step": 9310
},
{
"epoch": 35.84615384615385,
"grad_norm": 1.39862859249115,
"learning_rate": 1.79105415792981e-05,
"loss": 0.0377,
"step": 9320
},
{
"epoch": 35.88461538461539,
"grad_norm": 1.4081544876098633,
"learning_rate": 1.788620694254475e-05,
"loss": 0.0412,
"step": 9330
},
{
"epoch": 35.92307692307692,
"grad_norm": 1.39034903049469,
"learning_rate": 1.7861864415011827e-05,
"loss": 0.0411,
"step": 9340
},
{
"epoch": 35.96153846153846,
"grad_norm": 1.650275468826294,
"learning_rate": 1.783751406325087e-05,
"loss": 0.0336,
"step": 9350
},
{
"epoch": 36.0,
"grad_norm": 1.45979905128479,
"learning_rate": 1.7813155953834814e-05,
"loss": 0.0402,
"step": 9360
},
{
"epoch": 36.03846153846154,
"grad_norm": 1.596880555152893,
"learning_rate": 1.7788790153357803e-05,
"loss": 0.0439,
"step": 9370
},
{
"epoch": 36.07692307692308,
"grad_norm": 1.4605010747909546,
"learning_rate": 1.7764416728435e-05,
"loss": 0.0429,
"step": 9380
},
{
"epoch": 36.11538461538461,
"grad_norm": 1.4628403186798096,
"learning_rate": 1.774003574570242e-05,
"loss": 0.0416,
"step": 9390
},
{
"epoch": 36.15384615384615,
"grad_norm": 1.8021140098571777,
"learning_rate": 1.7715647271816744e-05,
"loss": 0.0346,
"step": 9400
},
{
"epoch": 36.19230769230769,
"grad_norm": 1.4157359600067139,
"learning_rate": 1.769125137345512e-05,
"loss": 0.0357,
"step": 9410
},
{
"epoch": 36.23076923076923,
"grad_norm": 1.421094536781311,
"learning_rate": 1.7666848117315008e-05,
"loss": 0.0392,
"step": 9420
},
{
"epoch": 36.26923076923077,
"grad_norm": 1.7203149795532227,
"learning_rate": 1.7642437570113974e-05,
"loss": 0.0358,
"step": 9430
},
{
"epoch": 36.30769230769231,
"grad_norm": 1.436547040939331,
"learning_rate": 1.7618019798589525e-05,
"loss": 0.043,
"step": 9440
},
{
"epoch": 36.34615384615385,
"grad_norm": 1.5459340810775757,
"learning_rate": 1.7593594869498915e-05,
"loss": 0.035,
"step": 9450
},
{
"epoch": 36.38461538461539,
"grad_norm": 1.5073521137237549,
"learning_rate": 1.7569162849618966e-05,
"loss": 0.0393,
"step": 9460
},
{
"epoch": 36.42307692307692,
"grad_norm": 1.3095002174377441,
"learning_rate": 1.75447238057459e-05,
"loss": 0.0383,
"step": 9470
},
{
"epoch": 36.46153846153846,
"grad_norm": 1.308945655822754,
"learning_rate": 1.752027780469511e-05,
"loss": 0.0364,
"step": 9480
},
{
"epoch": 36.5,
"grad_norm": 1.4354068040847778,
"learning_rate": 1.7495824913301043e-05,
"loss": 0.0417,
"step": 9490
},
{
"epoch": 36.53846153846154,
"grad_norm": 1.756820559501648,
"learning_rate": 1.7471365198416957e-05,
"loss": 0.0393,
"step": 9500
},
{
"epoch": 36.57692307692308,
"grad_norm": 1.3326730728149414,
"learning_rate": 1.7446898726914797e-05,
"loss": 0.0369,
"step": 9510
},
{
"epoch": 36.61538461538461,
"grad_norm": 1.3120198249816895,
"learning_rate": 1.742242556568495e-05,
"loss": 0.0426,
"step": 9520
},
{
"epoch": 36.65384615384615,
"grad_norm": 1.121603012084961,
"learning_rate": 1.73979457816361e-05,
"loss": 0.0387,
"step": 9530
},
{
"epoch": 36.69230769230769,
"grad_norm": 1.3100221157073975,
"learning_rate": 1.7373459441695058e-05,
"loss": 0.0423,
"step": 9540
},
{
"epoch": 36.73076923076923,
"grad_norm": 1.3542190790176392,
"learning_rate": 1.7348966612806524e-05,
"loss": 0.0421,
"step": 9550
},
{
"epoch": 36.76923076923077,
"grad_norm": 1.59927499294281,
"learning_rate": 1.7324467361932973e-05,
"loss": 0.0397,
"step": 9560
},
{
"epoch": 36.80769230769231,
"grad_norm": 1.6307555437088013,
"learning_rate": 1.729996175605441e-05,
"loss": 0.0411,
"step": 9570
},
{
"epoch": 36.84615384615385,
"grad_norm": 1.3856720924377441,
"learning_rate": 1.7275449862168235e-05,
"loss": 0.0401,
"step": 9580
},
{
"epoch": 36.88461538461539,
"grad_norm": 1.2768700122833252,
"learning_rate": 1.725093174728902e-05,
"loss": 0.0324,
"step": 9590
},
{
"epoch": 36.92307692307692,
"grad_norm": 1.9689397811889648,
"learning_rate": 1.7226407478448357e-05,
"loss": 0.038,
"step": 9600
},
{
"epoch": 36.96153846153846,
"grad_norm": 1.4385662078857422,
"learning_rate": 1.7201877122694666e-05,
"loss": 0.0423,
"step": 9610
},
{
"epoch": 37.0,
"grad_norm": 1.3675845861434937,
"learning_rate": 1.7177340747093e-05,
"loss": 0.0397,
"step": 9620
},
{
"epoch": 37.03846153846154,
"grad_norm": 1.733004093170166,
"learning_rate": 1.7152798418724873e-05,
"loss": 0.0417,
"step": 9630
},
{
"epoch": 37.07692307692308,
"grad_norm": 1.5616117715835571,
"learning_rate": 1.712825020468807e-05,
"loss": 0.0359,
"step": 9640
},
{
"epoch": 37.11538461538461,
"grad_norm": 1.513261079788208,
"learning_rate": 1.710369617209648e-05,
"loss": 0.0414,
"step": 9650
},
{
"epoch": 37.15384615384615,
"grad_norm": 1.6468145847320557,
"learning_rate": 1.7079136388079884e-05,
"loss": 0.038,
"step": 9660
},
{
"epoch": 37.19230769230769,
"grad_norm": 1.6188808679580688,
"learning_rate": 1.7054570919783796e-05,
"loss": 0.0382,
"step": 9670
},
{
"epoch": 37.23076923076923,
"grad_norm": 1.556585669517517,
"learning_rate": 1.7029999834369264e-05,
"loss": 0.0387,
"step": 9680
},
{
"epoch": 37.26923076923077,
"grad_norm": 1.5919781923294067,
"learning_rate": 1.7005423199012696e-05,
"loss": 0.034,
"step": 9690
},
{
"epoch": 37.30769230769231,
"grad_norm": 1.565858244895935,
"learning_rate": 1.6980841080905687e-05,
"loss": 0.0344,
"step": 9700
},
{
"epoch": 37.34615384615385,
"grad_norm": 1.190491795539856,
"learning_rate": 1.6956253547254798e-05,
"loss": 0.0405,
"step": 9710
},
{
"epoch": 37.38461538461539,
"grad_norm": 1.2053945064544678,
"learning_rate": 1.693166066528141e-05,
"loss": 0.0366,
"step": 9720
},
{
"epoch": 37.42307692307692,
"grad_norm": 1.7797147035598755,
"learning_rate": 1.690706250222152e-05,
"loss": 0.0404,
"step": 9730
},
{
"epoch": 37.46153846153846,
"grad_norm": 1.3981484174728394,
"learning_rate": 1.6882459125325573e-05,
"loss": 0.0372,
"step": 9740
},
{
"epoch": 37.5,
"grad_norm": 1.3930727243423462,
"learning_rate": 1.685785060185826e-05,
"loss": 0.036,
"step": 9750
},
{
"epoch": 37.53846153846154,
"grad_norm": 1.2622334957122803,
"learning_rate": 1.683323699909834e-05,
"loss": 0.036,
"step": 9760
},
{
"epoch": 37.57692307692308,
"grad_norm": 1.3782292604446411,
"learning_rate": 1.6808618384338472e-05,
"loss": 0.0375,
"step": 9770
},
{
"epoch": 37.61538461538461,
"grad_norm": 1.3075623512268066,
"learning_rate": 1.6783994824885e-05,
"loss": 0.0357,
"step": 9780
},
{
"epoch": 37.65384615384615,
"grad_norm": 1.449067234992981,
"learning_rate": 1.6759366388057795e-05,
"loss": 0.04,
"step": 9790
},
{
"epoch": 37.69230769230769,
"grad_norm": 1.53331458568573,
"learning_rate": 1.6734733141190073e-05,
"loss": 0.0322,
"step": 9800
},
{
"epoch": 37.73076923076923,
"grad_norm": 1.3725186586380005,
"learning_rate": 1.6710095151628182e-05,
"loss": 0.0394,
"step": 9810
},
{
"epoch": 37.76923076923077,
"grad_norm": 1.6133525371551514,
"learning_rate": 1.668545248673144e-05,
"loss": 0.0329,
"step": 9820
},
{
"epoch": 37.80769230769231,
"grad_norm": 1.2607682943344116,
"learning_rate": 1.6660805213871962e-05,
"loss": 0.0432,
"step": 9830
},
{
"epoch": 37.84615384615385,
"grad_norm": 1.4924263954162598,
"learning_rate": 1.663615340043445e-05,
"loss": 0.0392,
"step": 9840
},
{
"epoch": 37.88461538461539,
"grad_norm": 1.3537297248840332,
"learning_rate": 1.6611497113816014e-05,
"loss": 0.0426,
"step": 9850
},
{
"epoch": 37.92307692307692,
"grad_norm": 1.1831531524658203,
"learning_rate": 1.6586836421426007e-05,
"loss": 0.0383,
"step": 9860
},
{
"epoch": 37.96153846153846,
"grad_norm": 1.317609429359436,
"learning_rate": 1.6562171390685815e-05,
"loss": 0.033,
"step": 9870
},
{
"epoch": 38.0,
"grad_norm": 1.4860436916351318,
"learning_rate": 1.653750208902869e-05,
"loss": 0.0377,
"step": 9880
},
{
"epoch": 38.03846153846154,
"grad_norm": 1.7705858945846558,
"learning_rate": 1.6512828583899562e-05,
"loss": 0.0428,
"step": 9890
},
{
"epoch": 38.07692307692308,
"grad_norm": 1.4256800413131714,
"learning_rate": 1.648815094275486e-05,
"loss": 0.0374,
"step": 9900
},
{
"epoch": 38.11538461538461,
"grad_norm": 1.430487871170044,
"learning_rate": 1.6463469233062302e-05,
"loss": 0.0382,
"step": 9910
},
{
"epoch": 38.15384615384615,
"grad_norm": 1.273598074913025,
"learning_rate": 1.6438783522300742e-05,
"loss": 0.0403,
"step": 9920
},
{
"epoch": 38.19230769230769,
"grad_norm": 1.6827222108840942,
"learning_rate": 1.641409387795997e-05,
"loss": 0.0318,
"step": 9930
},
{
"epoch": 38.23076923076923,
"grad_norm": 1.5564827919006348,
"learning_rate": 1.6389400367540534e-05,
"loss": 0.0443,
"step": 9940
},
{
"epoch": 38.26923076923077,
"grad_norm": 1.5248053073883057,
"learning_rate": 1.6364703058553552e-05,
"loss": 0.0408,
"step": 9950
},
{
"epoch": 38.30769230769231,
"grad_norm": 1.2614673376083374,
"learning_rate": 1.6340002018520512e-05,
"loss": 0.0372,
"step": 9960
},
{
"epoch": 38.34615384615385,
"grad_norm": 1.43790602684021,
"learning_rate": 1.6315297314973126e-05,
"loss": 0.0354,
"step": 9970
},
{
"epoch": 38.38461538461539,
"grad_norm": 1.3772172927856445,
"learning_rate": 1.6290589015453102e-05,
"loss": 0.0362,
"step": 9980
},
{
"epoch": 38.42307692307692,
"grad_norm": 1.5918662548065186,
"learning_rate": 1.6265877187511993e-05,
"loss": 0.0378,
"step": 9990
},
{
"epoch": 38.46153846153846,
"grad_norm": 1.3942394256591797,
"learning_rate": 1.6241161898710993e-05,
"loss": 0.0408,
"step": 10000
}
],
"logging_steps": 10,
"max_steps": 20000,
"num_input_tokens_seen": 0,
"num_train_epochs": 77,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}