Qwen2.5-7B-rm-text-simple / trainer_state.json
hyungjoochae's picture
Upload folder using huggingface_hub
59ed1c9 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5001442723862652,
"eval_steps": 1300,
"global_step": 1300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0003847263633740502,
"grad_norm": 108.82730102539062,
"learning_rate": 1.0000000000000001e-07,
"loss": 3.3651,
"step": 1
},
{
"epoch": 0.0003847263633740502,
"eval_loss": 3.230529308319092,
"eval_runtime": 238.8764,
"eval_samples_per_second": 0.804,
"eval_steps_per_second": 0.402,
"step": 1
},
{
"epoch": 0.0007694527267481004,
"grad_norm": 93.2563705444336,
"learning_rate": 2.0000000000000002e-07,
"loss": 3.0404,
"step": 2
},
{
"epoch": 0.0011541790901221506,
"grad_norm": 92.04035186767578,
"learning_rate": 3.0000000000000004e-07,
"loss": 3.1693,
"step": 3
},
{
"epoch": 0.0015389054534962008,
"grad_norm": 92.20787048339844,
"learning_rate": 4.0000000000000003e-07,
"loss": 3.111,
"step": 4
},
{
"epoch": 0.001923631816870251,
"grad_norm": 95.61177825927734,
"learning_rate": 5.000000000000001e-07,
"loss": 3.1856,
"step": 5
},
{
"epoch": 0.002308358180244301,
"grad_norm": 99.54902648925781,
"learning_rate": 6.000000000000001e-07,
"loss": 3.1906,
"step": 6
},
{
"epoch": 0.0026930845436183514,
"grad_norm": 91.08406829833984,
"learning_rate": 7.000000000000001e-07,
"loss": 2.8621,
"step": 7
},
{
"epoch": 0.0030778109069924016,
"grad_norm": 76.93069458007812,
"learning_rate": 8.000000000000001e-07,
"loss": 2.868,
"step": 8
},
{
"epoch": 0.003462537270366452,
"grad_norm": 73.68675994873047,
"learning_rate": 9.000000000000001e-07,
"loss": 2.7618,
"step": 9
},
{
"epoch": 0.003847263633740502,
"grad_norm": 65.52332305908203,
"learning_rate": 1.0000000000000002e-06,
"loss": 2.467,
"step": 10
},
{
"epoch": 0.004231989997114553,
"grad_norm": 51.694793701171875,
"learning_rate": 1.1e-06,
"loss": 2.1402,
"step": 11
},
{
"epoch": 0.004616716360488602,
"grad_norm": 56.9593391418457,
"learning_rate": 1.2000000000000002e-06,
"loss": 2.2299,
"step": 12
},
{
"epoch": 0.005001442723862653,
"grad_norm": 52.62051773071289,
"learning_rate": 1.3e-06,
"loss": 1.9982,
"step": 13
},
{
"epoch": 0.005386169087236703,
"grad_norm": 41.68424606323242,
"learning_rate": 1.4000000000000001e-06,
"loss": 1.5395,
"step": 14
},
{
"epoch": 0.005770895450610753,
"grad_norm": 36.1357307434082,
"learning_rate": 1.5e-06,
"loss": 1.2412,
"step": 15
},
{
"epoch": 0.006155621813984803,
"grad_norm": 33.65456008911133,
"learning_rate": 1.6000000000000001e-06,
"loss": 1.14,
"step": 16
},
{
"epoch": 0.006540348177358854,
"grad_norm": 33.47177505493164,
"learning_rate": 1.7000000000000002e-06,
"loss": 0.9993,
"step": 17
},
{
"epoch": 0.006925074540732904,
"grad_norm": 31.6041316986084,
"learning_rate": 1.8000000000000001e-06,
"loss": 0.9344,
"step": 18
},
{
"epoch": 0.007309800904106954,
"grad_norm": 26.2973690032959,
"learning_rate": 1.9000000000000002e-06,
"loss": 0.6089,
"step": 19
},
{
"epoch": 0.007694527267481004,
"grad_norm": 29.93291473388672,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.4644,
"step": 20
},
{
"epoch": 0.008079253630855054,
"grad_norm": 25.220720291137695,
"learning_rate": 2.1000000000000002e-06,
"loss": 0.3234,
"step": 21
},
{
"epoch": 0.008463979994229105,
"grad_norm": 23.863779067993164,
"learning_rate": 2.2e-06,
"loss": 0.258,
"step": 22
},
{
"epoch": 0.008848706357603155,
"grad_norm": 13.900153160095215,
"learning_rate": 2.3000000000000004e-06,
"loss": 0.1722,
"step": 23
},
{
"epoch": 0.009233432720977205,
"grad_norm": 8.517366409301758,
"learning_rate": 2.4000000000000003e-06,
"loss": 0.1005,
"step": 24
},
{
"epoch": 0.009618159084351255,
"grad_norm": 11.185029983520508,
"learning_rate": 2.5e-06,
"loss": 0.1114,
"step": 25
},
{
"epoch": 0.010002885447725306,
"grad_norm": 10.078015327453613,
"learning_rate": 2.6e-06,
"loss": 0.0737,
"step": 26
},
{
"epoch": 0.010387611811099356,
"grad_norm": 4.785120964050293,
"learning_rate": 2.7000000000000004e-06,
"loss": 0.0508,
"step": 27
},
{
"epoch": 0.010772338174473406,
"grad_norm": 10.350395202636719,
"learning_rate": 2.8000000000000003e-06,
"loss": 0.0708,
"step": 28
},
{
"epoch": 0.011157064537847455,
"grad_norm": 7.701849937438965,
"learning_rate": 2.9e-06,
"loss": 0.0475,
"step": 29
},
{
"epoch": 0.011541790901221507,
"grad_norm": 3.197638750076294,
"learning_rate": 3e-06,
"loss": 0.0392,
"step": 30
},
{
"epoch": 0.011926517264595557,
"grad_norm": 3.299790382385254,
"learning_rate": 3.1000000000000004e-06,
"loss": 0.0397,
"step": 31
},
{
"epoch": 0.012311243627969606,
"grad_norm": 4.075237274169922,
"learning_rate": 3.2000000000000003e-06,
"loss": 0.0337,
"step": 32
},
{
"epoch": 0.012695969991343656,
"grad_norm": 3.679044008255005,
"learning_rate": 3.3000000000000006e-06,
"loss": 0.0353,
"step": 33
},
{
"epoch": 0.013080696354717708,
"grad_norm": 4.004962921142578,
"learning_rate": 3.4000000000000005e-06,
"loss": 0.0251,
"step": 34
},
{
"epoch": 0.013465422718091757,
"grad_norm": 4.201568126678467,
"learning_rate": 3.5e-06,
"loss": 0.0381,
"step": 35
},
{
"epoch": 0.013850149081465807,
"grad_norm": 8.480195045471191,
"learning_rate": 3.6000000000000003e-06,
"loss": 0.0568,
"step": 36
},
{
"epoch": 0.014234875444839857,
"grad_norm": 3.5017919540405273,
"learning_rate": 3.7e-06,
"loss": 0.0731,
"step": 37
},
{
"epoch": 0.014619601808213908,
"grad_norm": 12.31709098815918,
"learning_rate": 3.8000000000000005e-06,
"loss": 0.0727,
"step": 38
},
{
"epoch": 0.015004328171587958,
"grad_norm": 3.9239490032196045,
"learning_rate": 3.900000000000001e-06,
"loss": 0.0406,
"step": 39
},
{
"epoch": 0.015389054534962008,
"grad_norm": 2.3493189811706543,
"learning_rate": 4.000000000000001e-06,
"loss": 0.043,
"step": 40
},
{
"epoch": 0.01577378089833606,
"grad_norm": 4.353029727935791,
"learning_rate": 4.1e-06,
"loss": 0.0439,
"step": 41
},
{
"epoch": 0.016158507261710107,
"grad_norm": 2.912537097930908,
"learning_rate": 4.2000000000000004e-06,
"loss": 0.0445,
"step": 42
},
{
"epoch": 0.01654323362508416,
"grad_norm": 3.65114688873291,
"learning_rate": 4.3e-06,
"loss": 0.0426,
"step": 43
},
{
"epoch": 0.01692795998845821,
"grad_norm": 11.928956031799316,
"learning_rate": 4.4e-06,
"loss": 0.0634,
"step": 44
},
{
"epoch": 0.01731268635183226,
"grad_norm": 9.41391372680664,
"learning_rate": 4.5e-06,
"loss": 0.0824,
"step": 45
},
{
"epoch": 0.01769741271520631,
"grad_norm": 3.015249013900757,
"learning_rate": 4.600000000000001e-06,
"loss": 0.0382,
"step": 46
},
{
"epoch": 0.018082139078580358,
"grad_norm": 6.8663554191589355,
"learning_rate": 4.7e-06,
"loss": 0.0411,
"step": 47
},
{
"epoch": 0.01846686544195441,
"grad_norm": 6.370840549468994,
"learning_rate": 4.800000000000001e-06,
"loss": 0.0527,
"step": 48
},
{
"epoch": 0.01885159180532846,
"grad_norm": 2.8191823959350586,
"learning_rate": 4.9000000000000005e-06,
"loss": 0.0384,
"step": 49
},
{
"epoch": 0.01923631816870251,
"grad_norm": 9.229619026184082,
"learning_rate": 5e-06,
"loss": 0.0463,
"step": 50
},
{
"epoch": 0.01962104453207656,
"grad_norm": 7.707767486572266,
"learning_rate": 5.1e-06,
"loss": 0.0566,
"step": 51
},
{
"epoch": 0.020005770895450612,
"grad_norm": 3.9060797691345215,
"learning_rate": 5.2e-06,
"loss": 0.0289,
"step": 52
},
{
"epoch": 0.02039049725882466,
"grad_norm": 1.166146993637085,
"learning_rate": 5.300000000000001e-06,
"loss": 0.0196,
"step": 53
},
{
"epoch": 0.02077522362219871,
"grad_norm": 5.692835330963135,
"learning_rate": 5.400000000000001e-06,
"loss": 0.0443,
"step": 54
},
{
"epoch": 0.021159949985572763,
"grad_norm": 7.362571716308594,
"learning_rate": 5.500000000000001e-06,
"loss": 0.0387,
"step": 55
},
{
"epoch": 0.02154467634894681,
"grad_norm": 4.404002666473389,
"learning_rate": 5.600000000000001e-06,
"loss": 0.0325,
"step": 56
},
{
"epoch": 0.021929402712320863,
"grad_norm": 7.550673007965088,
"learning_rate": 5.7e-06,
"loss": 0.0473,
"step": 57
},
{
"epoch": 0.02231412907569491,
"grad_norm": 5.290981769561768,
"learning_rate": 5.8e-06,
"loss": 0.0322,
"step": 58
},
{
"epoch": 0.022698855439068962,
"grad_norm": 3.3678693771362305,
"learning_rate": 5.9e-06,
"loss": 0.0259,
"step": 59
},
{
"epoch": 0.023083581802443014,
"grad_norm": 5.019497871398926,
"learning_rate": 6e-06,
"loss": 0.0349,
"step": 60
},
{
"epoch": 0.02346830816581706,
"grad_norm": 5.121387958526611,
"learning_rate": 6.1e-06,
"loss": 0.0323,
"step": 61
},
{
"epoch": 0.023853034529191113,
"grad_norm": 3.188506841659546,
"learning_rate": 6.200000000000001e-06,
"loss": 0.0501,
"step": 62
},
{
"epoch": 0.024237760892565165,
"grad_norm": 1.362047791481018,
"learning_rate": 6.300000000000001e-06,
"loss": 0.0266,
"step": 63
},
{
"epoch": 0.024622487255939213,
"grad_norm": 4.539747714996338,
"learning_rate": 6.4000000000000006e-06,
"loss": 0.0303,
"step": 64
},
{
"epoch": 0.025007213619313264,
"grad_norm": 8.616043090820312,
"learning_rate": 6.5000000000000004e-06,
"loss": 0.0486,
"step": 65
},
{
"epoch": 0.025391939982687312,
"grad_norm": 5.378427028656006,
"learning_rate": 6.600000000000001e-06,
"loss": 0.033,
"step": 66
},
{
"epoch": 0.025776666346061364,
"grad_norm": 2.1194162368774414,
"learning_rate": 6.700000000000001e-06,
"loss": 0.0288,
"step": 67
},
{
"epoch": 0.026161392709435415,
"grad_norm": 2.0167043209075928,
"learning_rate": 6.800000000000001e-06,
"loss": 0.0326,
"step": 68
},
{
"epoch": 0.026546119072809463,
"grad_norm": 1.795593023300171,
"learning_rate": 6.9e-06,
"loss": 0.0295,
"step": 69
},
{
"epoch": 0.026930845436183515,
"grad_norm": 1.237252116203308,
"learning_rate": 7e-06,
"loss": 0.0177,
"step": 70
},
{
"epoch": 0.027315571799557566,
"grad_norm": 1.0062570571899414,
"learning_rate": 7.100000000000001e-06,
"loss": 0.0233,
"step": 71
},
{
"epoch": 0.027700298162931614,
"grad_norm": 1.7850754261016846,
"learning_rate": 7.2000000000000005e-06,
"loss": 0.0211,
"step": 72
},
{
"epoch": 0.028085024526305666,
"grad_norm": 1.5070022344589233,
"learning_rate": 7.3e-06,
"loss": 0.0156,
"step": 73
},
{
"epoch": 0.028469750889679714,
"grad_norm": 2.268380641937256,
"learning_rate": 7.4e-06,
"loss": 0.0353,
"step": 74
},
{
"epoch": 0.028854477253053765,
"grad_norm": 3.3155412673950195,
"learning_rate": 7.500000000000001e-06,
"loss": 0.0163,
"step": 75
},
{
"epoch": 0.029239203616427817,
"grad_norm": 3.727926731109619,
"learning_rate": 7.600000000000001e-06,
"loss": 0.0281,
"step": 76
},
{
"epoch": 0.029623929979801865,
"grad_norm": 8.840143203735352,
"learning_rate": 7.7e-06,
"loss": 0.0443,
"step": 77
},
{
"epoch": 0.030008656343175916,
"grad_norm": 5.514863014221191,
"learning_rate": 7.800000000000002e-06,
"loss": 0.075,
"step": 78
},
{
"epoch": 0.030393382706549968,
"grad_norm": 5.712233543395996,
"learning_rate": 7.9e-06,
"loss": 0.0474,
"step": 79
},
{
"epoch": 0.030778109069924016,
"grad_norm": 12.506179809570312,
"learning_rate": 8.000000000000001e-06,
"loss": 0.0539,
"step": 80
},
{
"epoch": 0.031162835433298067,
"grad_norm": 2.7478084564208984,
"learning_rate": 8.1e-06,
"loss": 0.0225,
"step": 81
},
{
"epoch": 0.03154756179667212,
"grad_norm": 7.17296838760376,
"learning_rate": 8.2e-06,
"loss": 0.0577,
"step": 82
},
{
"epoch": 0.03193228816004617,
"grad_norm": 9.32388687133789,
"learning_rate": 8.3e-06,
"loss": 0.0613,
"step": 83
},
{
"epoch": 0.032317014523420215,
"grad_norm": 9.240764617919922,
"learning_rate": 8.400000000000001e-06,
"loss": 0.0513,
"step": 84
},
{
"epoch": 0.03270174088679427,
"grad_norm": 3.8488717079162598,
"learning_rate": 8.5e-06,
"loss": 0.0313,
"step": 85
},
{
"epoch": 0.03308646725016832,
"grad_norm": 4.666772365570068,
"learning_rate": 8.6e-06,
"loss": 0.0379,
"step": 86
},
{
"epoch": 0.033471193613542366,
"grad_norm": 16.0006160736084,
"learning_rate": 8.700000000000001e-06,
"loss": 0.0857,
"step": 87
},
{
"epoch": 0.03385591997691642,
"grad_norm": 7.749240875244141,
"learning_rate": 8.8e-06,
"loss": 0.0615,
"step": 88
},
{
"epoch": 0.03424064634029047,
"grad_norm": 3.0161995887756348,
"learning_rate": 8.900000000000001e-06,
"loss": 0.0233,
"step": 89
},
{
"epoch": 0.03462537270366452,
"grad_norm": 1.6129286289215088,
"learning_rate": 9e-06,
"loss": 0.022,
"step": 90
},
{
"epoch": 0.03501009906703857,
"grad_norm": 3.569190740585327,
"learning_rate": 9.100000000000001e-06,
"loss": 0.0249,
"step": 91
},
{
"epoch": 0.03539482543041262,
"grad_norm": 5.3800740242004395,
"learning_rate": 9.200000000000002e-06,
"loss": 0.0523,
"step": 92
},
{
"epoch": 0.03577955179378667,
"grad_norm": 4.84494686126709,
"learning_rate": 9.3e-06,
"loss": 0.0434,
"step": 93
},
{
"epoch": 0.036164278157160716,
"grad_norm": 2.5774073600769043,
"learning_rate": 9.4e-06,
"loss": 0.0598,
"step": 94
},
{
"epoch": 0.03654900452053477,
"grad_norm": 3.4920051097869873,
"learning_rate": 9.5e-06,
"loss": 0.0211,
"step": 95
},
{
"epoch": 0.03693373088390882,
"grad_norm": 2.574754238128662,
"learning_rate": 9.600000000000001e-06,
"loss": 0.0226,
"step": 96
},
{
"epoch": 0.03731845724728287,
"grad_norm": 6.462972164154053,
"learning_rate": 9.7e-06,
"loss": 0.0375,
"step": 97
},
{
"epoch": 0.03770318361065692,
"grad_norm": 6.1067986488342285,
"learning_rate": 9.800000000000001e-06,
"loss": 0.0412,
"step": 98
},
{
"epoch": 0.03808790997403097,
"grad_norm": 2.107085704803467,
"learning_rate": 9.9e-06,
"loss": 0.0249,
"step": 99
},
{
"epoch": 0.03847263633740502,
"grad_norm": 1.3796989917755127,
"learning_rate": 1e-05,
"loss": 0.022,
"step": 100
},
{
"epoch": 0.03885736270077907,
"grad_norm": 1.5101048946380615,
"learning_rate": 9.99999985161259e-06,
"loss": 0.043,
"step": 101
},
{
"epoch": 0.03924208906415312,
"grad_norm": 4.264603614807129,
"learning_rate": 9.999999406450364e-06,
"loss": 0.0591,
"step": 102
},
{
"epoch": 0.03962681542752717,
"grad_norm": 3.4575819969177246,
"learning_rate": 9.999998664513351e-06,
"loss": 0.0313,
"step": 103
},
{
"epoch": 0.040011541790901224,
"grad_norm": 1.9395074844360352,
"learning_rate": 9.999997625801593e-06,
"loss": 0.0141,
"step": 104
},
{
"epoch": 0.04039626815427527,
"grad_norm": 2.3632755279541016,
"learning_rate": 9.999996290315154e-06,
"loss": 0.0163,
"step": 105
},
{
"epoch": 0.04078099451764932,
"grad_norm": 3.7122342586517334,
"learning_rate": 9.999994658054113e-06,
"loss": 0.0144,
"step": 106
},
{
"epoch": 0.041165720881023375,
"grad_norm": 8.778124809265137,
"learning_rate": 9.999992729018565e-06,
"loss": 0.0989,
"step": 107
},
{
"epoch": 0.04155044724439742,
"grad_norm": 3.6246390342712402,
"learning_rate": 9.999990503208625e-06,
"loss": 0.053,
"step": 108
},
{
"epoch": 0.04193517360777147,
"grad_norm": 2.6841275691986084,
"learning_rate": 9.999987980624426e-06,
"loss": 0.0221,
"step": 109
},
{
"epoch": 0.042319899971145526,
"grad_norm": 1.4657002687454224,
"learning_rate": 9.999985161266116e-06,
"loss": 0.0178,
"step": 110
},
{
"epoch": 0.042704626334519574,
"grad_norm": 4.295589447021484,
"learning_rate": 9.999982045133868e-06,
"loss": 0.0394,
"step": 111
},
{
"epoch": 0.04308935269789362,
"grad_norm": 5.708546161651611,
"learning_rate": 9.999978632227859e-06,
"loss": 0.0308,
"step": 112
},
{
"epoch": 0.04347407906126767,
"grad_norm": 1.5994555950164795,
"learning_rate": 9.999974922548297e-06,
"loss": 0.0147,
"step": 113
},
{
"epoch": 0.043858805424641725,
"grad_norm": 5.406169414520264,
"learning_rate": 9.9999709160954e-06,
"loss": 0.0643,
"step": 114
},
{
"epoch": 0.04424353178801577,
"grad_norm": 1.2986438274383545,
"learning_rate": 9.999966612869404e-06,
"loss": 0.0129,
"step": 115
},
{
"epoch": 0.04462825815138982,
"grad_norm": 3.2887203693389893,
"learning_rate": 9.999962012870571e-06,
"loss": 0.0334,
"step": 116
},
{
"epoch": 0.045012984514763876,
"grad_norm": 4.520976543426514,
"learning_rate": 9.999957116099169e-06,
"loss": 0.0259,
"step": 117
},
{
"epoch": 0.045397710878137924,
"grad_norm": 2.6903226375579834,
"learning_rate": 9.999951922555486e-06,
"loss": 0.0118,
"step": 118
},
{
"epoch": 0.04578243724151197,
"grad_norm": 9.5642671585083,
"learning_rate": 9.999946432239835e-06,
"loss": 0.0835,
"step": 119
},
{
"epoch": 0.04616716360488603,
"grad_norm": 3.200307607650757,
"learning_rate": 9.999940645152541e-06,
"loss": 0.0305,
"step": 120
},
{
"epoch": 0.046551889968260075,
"grad_norm": 2.1902754306793213,
"learning_rate": 9.999934561293948e-06,
"loss": 0.0264,
"step": 121
},
{
"epoch": 0.04693661633163412,
"grad_norm": 2.826099395751953,
"learning_rate": 9.999928180664415e-06,
"loss": 0.0257,
"step": 122
},
{
"epoch": 0.04732134269500818,
"grad_norm": 1.8656409978866577,
"learning_rate": 9.999921503264322e-06,
"loss": 0.0223,
"step": 123
},
{
"epoch": 0.047706069058382226,
"grad_norm": 0.8881422281265259,
"learning_rate": 9.999914529094066e-06,
"loss": 0.0207,
"step": 124
},
{
"epoch": 0.048090795421756274,
"grad_norm": 4.812993049621582,
"learning_rate": 9.99990725815406e-06,
"loss": 0.0141,
"step": 125
},
{
"epoch": 0.04847552178513033,
"grad_norm": 1.3018031120300293,
"learning_rate": 9.999899690444736e-06,
"loss": 0.0295,
"step": 126
},
{
"epoch": 0.04886024814850438,
"grad_norm": 4.202988624572754,
"learning_rate": 9.999891825966541e-06,
"loss": 0.0377,
"step": 127
},
{
"epoch": 0.049244974511878425,
"grad_norm": 12.903936386108398,
"learning_rate": 9.999883664719945e-06,
"loss": 0.0459,
"step": 128
},
{
"epoch": 0.049629700875252473,
"grad_norm": 1.8323734998703003,
"learning_rate": 9.999875206705432e-06,
"loss": 0.0308,
"step": 129
},
{
"epoch": 0.05001442723862653,
"grad_norm": 2.0135998725891113,
"learning_rate": 9.999866451923502e-06,
"loss": 0.0382,
"step": 130
},
{
"epoch": 0.050399153602000576,
"grad_norm": 3.817152261734009,
"learning_rate": 9.999857400374676e-06,
"loss": 0.0285,
"step": 131
},
{
"epoch": 0.050783879965374625,
"grad_norm": 4.566162586212158,
"learning_rate": 9.999848052059489e-06,
"loss": 0.0352,
"step": 132
},
{
"epoch": 0.05116860632874868,
"grad_norm": 8.352324485778809,
"learning_rate": 9.999838406978499e-06,
"loss": 0.0515,
"step": 133
},
{
"epoch": 0.05155333269212273,
"grad_norm": 3.5633044242858887,
"learning_rate": 9.999828465132278e-06,
"loss": 0.0281,
"step": 134
},
{
"epoch": 0.051938059055496776,
"grad_norm": 2.960449457168579,
"learning_rate": 9.999818226521416e-06,
"loss": 0.0448,
"step": 135
},
{
"epoch": 0.05232278541887083,
"grad_norm": 2.096433639526367,
"learning_rate": 9.99980769114652e-06,
"loss": 0.0312,
"step": 136
},
{
"epoch": 0.05270751178224488,
"grad_norm": 1.4195860624313354,
"learning_rate": 9.999796859008215e-06,
"loss": 0.0193,
"step": 137
},
{
"epoch": 0.05309223814561893,
"grad_norm": 1.6590723991394043,
"learning_rate": 9.999785730107145e-06,
"loss": 0.0331,
"step": 138
},
{
"epoch": 0.05347696450899298,
"grad_norm": 1.200443148612976,
"learning_rate": 9.99977430444397e-06,
"loss": 0.0276,
"step": 139
},
{
"epoch": 0.05386169087236703,
"grad_norm": 1.6274645328521729,
"learning_rate": 9.999762582019365e-06,
"loss": 0.0205,
"step": 140
},
{
"epoch": 0.05424641723574108,
"grad_norm": 1.1515076160430908,
"learning_rate": 9.999750562834032e-06,
"loss": 0.0199,
"step": 141
},
{
"epoch": 0.05463114359911513,
"grad_norm": 2.4453465938568115,
"learning_rate": 9.999738246888682e-06,
"loss": 0.0303,
"step": 142
},
{
"epoch": 0.05501586996248918,
"grad_norm": 2.5557849407196045,
"learning_rate": 9.999725634184044e-06,
"loss": 0.026,
"step": 143
},
{
"epoch": 0.05540059632586323,
"grad_norm": 1.422838568687439,
"learning_rate": 9.999712724720868e-06,
"loss": 0.0153,
"step": 144
},
{
"epoch": 0.05578532268923728,
"grad_norm": 1.7006760835647583,
"learning_rate": 9.999699518499922e-06,
"loss": 0.0423,
"step": 145
},
{
"epoch": 0.05617004905261133,
"grad_norm": 2.2471675872802734,
"learning_rate": 9.999686015521986e-06,
"loss": 0.0134,
"step": 146
},
{
"epoch": 0.05655477541598538,
"grad_norm": 4.883077621459961,
"learning_rate": 9.999672215787864e-06,
"loss": 0.0356,
"step": 147
},
{
"epoch": 0.05693950177935943,
"grad_norm": 3.122020721435547,
"learning_rate": 9.999658119298374e-06,
"loss": 0.0452,
"step": 148
},
{
"epoch": 0.05732422814273348,
"grad_norm": 2.787843942642212,
"learning_rate": 9.999643726054354e-06,
"loss": 0.066,
"step": 149
},
{
"epoch": 0.05770895450610753,
"grad_norm": 2.476574659347534,
"learning_rate": 9.999629036056657e-06,
"loss": 0.0231,
"step": 150
},
{
"epoch": 0.05809368086948158,
"grad_norm": 1.8053635358810425,
"learning_rate": 9.999614049306157e-06,
"loss": 0.021,
"step": 151
},
{
"epoch": 0.058478407232855634,
"grad_norm": 4.4272894859313965,
"learning_rate": 9.999598765803742e-06,
"loss": 0.0529,
"step": 152
},
{
"epoch": 0.05886313359622968,
"grad_norm": 5.861669063568115,
"learning_rate": 9.999583185550318e-06,
"loss": 0.0415,
"step": 153
},
{
"epoch": 0.05924785995960373,
"grad_norm": 1.1911544799804688,
"learning_rate": 9.999567308546811e-06,
"loss": 0.0213,
"step": 154
},
{
"epoch": 0.059632586322977785,
"grad_norm": 1.1022003889083862,
"learning_rate": 9.999551134794164e-06,
"loss": 0.0175,
"step": 155
},
{
"epoch": 0.06001731268635183,
"grad_norm": 3.5450422763824463,
"learning_rate": 9.999534664293337e-06,
"loss": 0.0349,
"step": 156
},
{
"epoch": 0.06040203904972588,
"grad_norm": 2.6744449138641357,
"learning_rate": 9.999517897045306e-06,
"loss": 0.0128,
"step": 157
},
{
"epoch": 0.060786765413099936,
"grad_norm": 4.565217018127441,
"learning_rate": 9.999500833051067e-06,
"loss": 0.049,
"step": 158
},
{
"epoch": 0.061171491776473984,
"grad_norm": 1.150743007659912,
"learning_rate": 9.999483472311636e-06,
"loss": 0.0187,
"step": 159
},
{
"epoch": 0.06155621813984803,
"grad_norm": 2.818730115890503,
"learning_rate": 9.999465814828037e-06,
"loss": 0.0223,
"step": 160
},
{
"epoch": 0.06194094450322208,
"grad_norm": 2.2844345569610596,
"learning_rate": 9.999447860601322e-06,
"loss": 0.0211,
"step": 161
},
{
"epoch": 0.062325670866596135,
"grad_norm": 1.219356656074524,
"learning_rate": 9.999429609632557e-06,
"loss": 0.0223,
"step": 162
},
{
"epoch": 0.06271039722997018,
"grad_norm": 2.913196086883545,
"learning_rate": 9.999411061922824e-06,
"loss": 0.0179,
"step": 163
},
{
"epoch": 0.06309512359334424,
"grad_norm": 1.3902113437652588,
"learning_rate": 9.999392217473225e-06,
"loss": 0.027,
"step": 164
},
{
"epoch": 0.06347984995671828,
"grad_norm": 2.247837781906128,
"learning_rate": 9.999373076284877e-06,
"loss": 0.0154,
"step": 165
},
{
"epoch": 0.06386457632009233,
"grad_norm": 2.8625357151031494,
"learning_rate": 9.99935363835892e-06,
"loss": 0.021,
"step": 166
},
{
"epoch": 0.06424930268346639,
"grad_norm": 4.006026744842529,
"learning_rate": 9.999333903696502e-06,
"loss": 0.029,
"step": 167
},
{
"epoch": 0.06463402904684043,
"grad_norm": 2.9252421855926514,
"learning_rate": 9.999313872298796e-06,
"loss": 0.022,
"step": 168
},
{
"epoch": 0.06501875541021448,
"grad_norm": 6.485391616821289,
"learning_rate": 9.999293544166995e-06,
"loss": 0.08,
"step": 169
},
{
"epoch": 0.06540348177358854,
"grad_norm": 4.041164398193359,
"learning_rate": 9.9992729193023e-06,
"loss": 0.0433,
"step": 170
},
{
"epoch": 0.06578820813696258,
"grad_norm": 3.4366846084594727,
"learning_rate": 9.999251997705941e-06,
"loss": 0.0458,
"step": 171
},
{
"epoch": 0.06617293450033664,
"grad_norm": 0.8214027285575867,
"learning_rate": 9.999230779379155e-06,
"loss": 0.0186,
"step": 172
},
{
"epoch": 0.06655766086371069,
"grad_norm": 6.1616597175598145,
"learning_rate": 9.999209264323201e-06,
"loss": 0.0627,
"step": 173
},
{
"epoch": 0.06694238722708473,
"grad_norm": 10.068628311157227,
"learning_rate": 9.999187452539361e-06,
"loss": 0.0602,
"step": 174
},
{
"epoch": 0.06732711359045879,
"grad_norm": 5.898168087005615,
"learning_rate": 9.999165344028927e-06,
"loss": 0.0418,
"step": 175
},
{
"epoch": 0.06771183995383284,
"grad_norm": 2.946211099624634,
"learning_rate": 9.99914293879321e-06,
"loss": 0.0157,
"step": 176
},
{
"epoch": 0.06809656631720688,
"grad_norm": 3.2756588459014893,
"learning_rate": 9.99912023683354e-06,
"loss": 0.0306,
"step": 177
},
{
"epoch": 0.06848129268058094,
"grad_norm": 23.782278060913086,
"learning_rate": 9.999097238151266e-06,
"loss": 0.0323,
"step": 178
},
{
"epoch": 0.06886601904395499,
"grad_norm": 3.2240705490112305,
"learning_rate": 9.999073942747752e-06,
"loss": 0.0457,
"step": 179
},
{
"epoch": 0.06925074540732903,
"grad_norm": 1.9313907623291016,
"learning_rate": 9.999050350624381e-06,
"loss": 0.0086,
"step": 180
},
{
"epoch": 0.06963547177070309,
"grad_norm": 1.5393195152282715,
"learning_rate": 9.999026461782556e-06,
"loss": 0.01,
"step": 181
},
{
"epoch": 0.07002019813407714,
"grad_norm": 0.6853839159011841,
"learning_rate": 9.999002276223688e-06,
"loss": 0.0111,
"step": 182
},
{
"epoch": 0.07040492449745119,
"grad_norm": 2.393374443054199,
"learning_rate": 9.99897779394922e-06,
"loss": 0.0218,
"step": 183
},
{
"epoch": 0.07078965086082524,
"grad_norm": 2.253481388092041,
"learning_rate": 9.998953014960603e-06,
"loss": 0.0197,
"step": 184
},
{
"epoch": 0.0711743772241993,
"grad_norm": 1.3223674297332764,
"learning_rate": 9.998927939259303e-06,
"loss": 0.009,
"step": 185
},
{
"epoch": 0.07155910358757334,
"grad_norm": 1.9413704872131348,
"learning_rate": 9.998902566846814e-06,
"loss": 0.0098,
"step": 186
},
{
"epoch": 0.07194382995094739,
"grad_norm": 1.4487104415893555,
"learning_rate": 9.998876897724641e-06,
"loss": 0.0191,
"step": 187
},
{
"epoch": 0.07232855631432143,
"grad_norm": 1.8149521350860596,
"learning_rate": 9.998850931894305e-06,
"loss": 0.0348,
"step": 188
},
{
"epoch": 0.07271328267769549,
"grad_norm": 2.054905891418457,
"learning_rate": 9.99882466935735e-06,
"loss": 0.0153,
"step": 189
},
{
"epoch": 0.07309800904106954,
"grad_norm": 2.0988175868988037,
"learning_rate": 9.998798110115333e-06,
"loss": 0.0188,
"step": 190
},
{
"epoch": 0.07348273540444358,
"grad_norm": 1.3500646352767944,
"learning_rate": 9.998771254169833e-06,
"loss": 0.0195,
"step": 191
},
{
"epoch": 0.07386746176781764,
"grad_norm": 1.4596278667449951,
"learning_rate": 9.99874410152244e-06,
"loss": 0.0345,
"step": 192
},
{
"epoch": 0.0742521881311917,
"grad_norm": 1.385651707649231,
"learning_rate": 9.99871665217477e-06,
"loss": 0.0181,
"step": 193
},
{
"epoch": 0.07463691449456573,
"grad_norm": 1.3517183065414429,
"learning_rate": 9.998688906128446e-06,
"loss": 0.0255,
"step": 194
},
{
"epoch": 0.07502164085793979,
"grad_norm": 1.5442254543304443,
"learning_rate": 9.998660863385124e-06,
"loss": 0.0249,
"step": 195
},
{
"epoch": 0.07540636722131384,
"grad_norm": 0.9525883793830872,
"learning_rate": 9.99863252394646e-06,
"loss": 0.0181,
"step": 196
},
{
"epoch": 0.07579109358468789,
"grad_norm": 2.393604040145874,
"learning_rate": 9.99860388781414e-06,
"loss": 0.0287,
"step": 197
},
{
"epoch": 0.07617581994806194,
"grad_norm": 3.1305532455444336,
"learning_rate": 9.998574954989863e-06,
"loss": 0.0363,
"step": 198
},
{
"epoch": 0.076560546311436,
"grad_norm": 1.5628687143325806,
"learning_rate": 9.998545725475348e-06,
"loss": 0.0471,
"step": 199
},
{
"epoch": 0.07694527267481004,
"grad_norm": 1.5641419887542725,
"learning_rate": 9.998516199272327e-06,
"loss": 0.0232,
"step": 200
},
{
"epoch": 0.07732999903818409,
"grad_norm": 1.4028891324996948,
"learning_rate": 9.998486376382555e-06,
"loss": 0.0352,
"step": 201
},
{
"epoch": 0.07771472540155815,
"grad_norm": 1.6165571212768555,
"learning_rate": 9.9984562568078e-06,
"loss": 0.0195,
"step": 202
},
{
"epoch": 0.07809945176493219,
"grad_norm": 2.3587136268615723,
"learning_rate": 9.998425840549853e-06,
"loss": 0.0203,
"step": 203
},
{
"epoch": 0.07848417812830624,
"grad_norm": 3.3537991046905518,
"learning_rate": 9.998395127610515e-06,
"loss": 0.0552,
"step": 204
},
{
"epoch": 0.0788689044916803,
"grad_norm": 1.2197805643081665,
"learning_rate": 9.998364117991612e-06,
"loss": 0.0345,
"step": 205
},
{
"epoch": 0.07925363085505434,
"grad_norm": 1.2044868469238281,
"learning_rate": 9.998332811694985e-06,
"loss": 0.0186,
"step": 206
},
{
"epoch": 0.0796383572184284,
"grad_norm": 0.8047626614570618,
"learning_rate": 9.998301208722488e-06,
"loss": 0.0245,
"step": 207
},
{
"epoch": 0.08002308358180245,
"grad_norm": 1.0816692113876343,
"learning_rate": 9.998269309076001e-06,
"loss": 0.0279,
"step": 208
},
{
"epoch": 0.08040780994517649,
"grad_norm": 1.4782438278198242,
"learning_rate": 9.998237112757417e-06,
"loss": 0.0126,
"step": 209
},
{
"epoch": 0.08079253630855054,
"grad_norm": 2.366748332977295,
"learning_rate": 9.998204619768645e-06,
"loss": 0.0281,
"step": 210
},
{
"epoch": 0.0811772626719246,
"grad_norm": 1.8751951456069946,
"learning_rate": 9.998171830111615e-06,
"loss": 0.0237,
"step": 211
},
{
"epoch": 0.08156198903529864,
"grad_norm": 1.4843281507492065,
"learning_rate": 9.998138743788273e-06,
"loss": 0.0185,
"step": 212
},
{
"epoch": 0.0819467153986727,
"grad_norm": 10.534754753112793,
"learning_rate": 9.998105360800583e-06,
"loss": 0.0545,
"step": 213
},
{
"epoch": 0.08233144176204675,
"grad_norm": 3.7792885303497314,
"learning_rate": 9.998071681150525e-06,
"loss": 0.0325,
"step": 214
},
{
"epoch": 0.08271616812542079,
"grad_norm": 3.8291914463043213,
"learning_rate": 9.998037704840103e-06,
"loss": 0.0472,
"step": 215
},
{
"epoch": 0.08310089448879485,
"grad_norm": 1.368371605873108,
"learning_rate": 9.998003431871325e-06,
"loss": 0.0288,
"step": 216
},
{
"epoch": 0.0834856208521689,
"grad_norm": 3.24894118309021,
"learning_rate": 9.997968862246234e-06,
"loss": 0.0326,
"step": 217
},
{
"epoch": 0.08387034721554294,
"grad_norm": 1.941206455230713,
"learning_rate": 9.997933995966877e-06,
"loss": 0.0348,
"step": 218
},
{
"epoch": 0.084255073578917,
"grad_norm": 2.0731537342071533,
"learning_rate": 9.997898833035324e-06,
"loss": 0.0249,
"step": 219
},
{
"epoch": 0.08463979994229105,
"grad_norm": 0.5582459568977356,
"learning_rate": 9.997863373453664e-06,
"loss": 0.0206,
"step": 220
},
{
"epoch": 0.0850245263056651,
"grad_norm": 1.5135104656219482,
"learning_rate": 9.997827617223998e-06,
"loss": 0.0246,
"step": 221
},
{
"epoch": 0.08540925266903915,
"grad_norm": 0.9743750691413879,
"learning_rate": 9.997791564348454e-06,
"loss": 0.0235,
"step": 222
},
{
"epoch": 0.08579397903241319,
"grad_norm": 1.0159460306167603,
"learning_rate": 9.997755214829166e-06,
"loss": 0.0146,
"step": 223
},
{
"epoch": 0.08617870539578724,
"grad_norm": 1.4491690397262573,
"learning_rate": 9.997718568668295e-06,
"loss": 0.0237,
"step": 224
},
{
"epoch": 0.0865634317591613,
"grad_norm": 0.8503080010414124,
"learning_rate": 9.997681625868014e-06,
"loss": 0.0269,
"step": 225
},
{
"epoch": 0.08694815812253534,
"grad_norm": 1.2467992305755615,
"learning_rate": 9.99764438643052e-06,
"loss": 0.0102,
"step": 226
},
{
"epoch": 0.0873328844859094,
"grad_norm": 2.121068239212036,
"learning_rate": 9.997606850358018e-06,
"loss": 0.0267,
"step": 227
},
{
"epoch": 0.08771761084928345,
"grad_norm": 4.5842390060424805,
"learning_rate": 9.99756901765274e-06,
"loss": 0.0385,
"step": 228
},
{
"epoch": 0.08810233721265749,
"grad_norm": 0.8507031798362732,
"learning_rate": 9.997530888316927e-06,
"loss": 0.0204,
"step": 229
},
{
"epoch": 0.08848706357603155,
"grad_norm": 3.37808895111084,
"learning_rate": 9.997492462352846e-06,
"loss": 0.0232,
"step": 230
},
{
"epoch": 0.0888717899394056,
"grad_norm": 1.019683837890625,
"learning_rate": 9.997453739762779e-06,
"loss": 0.0244,
"step": 231
},
{
"epoch": 0.08925651630277964,
"grad_norm": 2.1636667251586914,
"learning_rate": 9.99741472054902e-06,
"loss": 0.029,
"step": 232
},
{
"epoch": 0.0896412426661537,
"grad_norm": 1.8102247714996338,
"learning_rate": 9.997375404713889e-06,
"loss": 0.0151,
"step": 233
},
{
"epoch": 0.09002596902952775,
"grad_norm": 1.4596302509307861,
"learning_rate": 9.997335792259717e-06,
"loss": 0.0386,
"step": 234
},
{
"epoch": 0.0904106953929018,
"grad_norm": 4.365789413452148,
"learning_rate": 9.997295883188855e-06,
"loss": 0.0429,
"step": 235
},
{
"epoch": 0.09079542175627585,
"grad_norm": 1.7287161350250244,
"learning_rate": 9.997255677503674e-06,
"loss": 0.0242,
"step": 236
},
{
"epoch": 0.0911801481196499,
"grad_norm": 2.087010383605957,
"learning_rate": 9.997215175206559e-06,
"loss": 0.0241,
"step": 237
},
{
"epoch": 0.09156487448302394,
"grad_norm": 5.224289417266846,
"learning_rate": 9.997174376299915e-06,
"loss": 0.0354,
"step": 238
},
{
"epoch": 0.091949600846398,
"grad_norm": 3.8172953128814697,
"learning_rate": 9.997133280786162e-06,
"loss": 0.0394,
"step": 239
},
{
"epoch": 0.09233432720977205,
"grad_norm": 4.284353256225586,
"learning_rate": 9.997091888667739e-06,
"loss": 0.061,
"step": 240
},
{
"epoch": 0.0927190535731461,
"grad_norm": 1.4266258478164673,
"learning_rate": 9.997050199947105e-06,
"loss": 0.0234,
"step": 241
},
{
"epoch": 0.09310377993652015,
"grad_norm": 0.6778956651687622,
"learning_rate": 9.997008214626732e-06,
"loss": 0.0194,
"step": 242
},
{
"epoch": 0.0934885062998942,
"grad_norm": 2.573453426361084,
"learning_rate": 9.996965932709115e-06,
"loss": 0.0279,
"step": 243
},
{
"epoch": 0.09387323266326825,
"grad_norm": 3.498032808303833,
"learning_rate": 9.996923354196761e-06,
"loss": 0.0397,
"step": 244
},
{
"epoch": 0.0942579590266423,
"grad_norm": 2.4384148120880127,
"learning_rate": 9.996880479092199e-06,
"loss": 0.0168,
"step": 245
},
{
"epoch": 0.09464268539001636,
"grad_norm": 0.9454012513160706,
"learning_rate": 9.996837307397972e-06,
"loss": 0.037,
"step": 246
},
{
"epoch": 0.0950274117533904,
"grad_norm": 2.0980632305145264,
"learning_rate": 9.996793839116643e-06,
"loss": 0.0332,
"step": 247
},
{
"epoch": 0.09541213811676445,
"grad_norm": 0.8643459677696228,
"learning_rate": 9.996750074250793e-06,
"loss": 0.0176,
"step": 248
},
{
"epoch": 0.09579686448013851,
"grad_norm": 2.557846784591675,
"learning_rate": 9.996706012803022e-06,
"loss": 0.0226,
"step": 249
},
{
"epoch": 0.09618159084351255,
"grad_norm": 1.4250762462615967,
"learning_rate": 9.996661654775938e-06,
"loss": 0.0141,
"step": 250
},
{
"epoch": 0.0965663172068866,
"grad_norm": 3.252248764038086,
"learning_rate": 9.996617000172181e-06,
"loss": 0.0514,
"step": 251
},
{
"epoch": 0.09695104357026066,
"grad_norm": 1.6876319646835327,
"learning_rate": 9.9965720489944e-06,
"loss": 0.0391,
"step": 252
},
{
"epoch": 0.0973357699336347,
"grad_norm": 0.6499518156051636,
"learning_rate": 9.99652680124526e-06,
"loss": 0.0118,
"step": 253
},
{
"epoch": 0.09772049629700875,
"grad_norm": 2.352055311203003,
"learning_rate": 9.996481256927449e-06,
"loss": 0.0369,
"step": 254
},
{
"epoch": 0.0981052226603828,
"grad_norm": 3.2287955284118652,
"learning_rate": 9.99643541604367e-06,
"loss": 0.0368,
"step": 255
},
{
"epoch": 0.09848994902375685,
"grad_norm": 4.091669082641602,
"learning_rate": 9.996389278596642e-06,
"loss": 0.0501,
"step": 256
},
{
"epoch": 0.0988746753871309,
"grad_norm": 2.6284351348876953,
"learning_rate": 9.99634284458911e-06,
"loss": 0.0431,
"step": 257
},
{
"epoch": 0.09925940175050495,
"grad_norm": 3.2887837886810303,
"learning_rate": 9.99629611402382e-06,
"loss": 0.037,
"step": 258
},
{
"epoch": 0.099644128113879,
"grad_norm": 0.5075850486755371,
"learning_rate": 9.996249086903553e-06,
"loss": 0.0136,
"step": 259
},
{
"epoch": 0.10002885447725306,
"grad_norm": 1.5473183393478394,
"learning_rate": 9.9962017632311e-06,
"loss": 0.0178,
"step": 260
},
{
"epoch": 0.1004135808406271,
"grad_norm": 1.7333952188491821,
"learning_rate": 9.996154143009267e-06,
"loss": 0.0237,
"step": 261
},
{
"epoch": 0.10079830720400115,
"grad_norm": 1.8454315662384033,
"learning_rate": 9.996106226240881e-06,
"loss": 0.0105,
"step": 262
},
{
"epoch": 0.10118303356737521,
"grad_norm": 1.2308003902435303,
"learning_rate": 9.996058012928786e-06,
"loss": 0.0149,
"step": 263
},
{
"epoch": 0.10156775993074925,
"grad_norm": 0.9899033308029175,
"learning_rate": 9.996009503075848e-06,
"loss": 0.0094,
"step": 264
},
{
"epoch": 0.1019524862941233,
"grad_norm": 1.0548290014266968,
"learning_rate": 9.995960696684939e-06,
"loss": 0.0094,
"step": 265
},
{
"epoch": 0.10233721265749736,
"grad_norm": 0.6202391982078552,
"learning_rate": 9.995911593758963e-06,
"loss": 0.0063,
"step": 266
},
{
"epoch": 0.1027219390208714,
"grad_norm": 1.7102643251419067,
"learning_rate": 9.99586219430083e-06,
"loss": 0.021,
"step": 267
},
{
"epoch": 0.10310666538424546,
"grad_norm": 0.6108173727989197,
"learning_rate": 9.995812498313472e-06,
"loss": 0.0033,
"step": 268
},
{
"epoch": 0.10349139174761951,
"grad_norm": 3.20573353767395,
"learning_rate": 9.99576250579984e-06,
"loss": 0.0275,
"step": 269
},
{
"epoch": 0.10387611811099355,
"grad_norm": 2.09236741065979,
"learning_rate": 9.995712216762903e-06,
"loss": 0.0119,
"step": 270
},
{
"epoch": 0.1042608444743676,
"grad_norm": 2.949190855026245,
"learning_rate": 9.995661631205644e-06,
"loss": 0.0274,
"step": 271
},
{
"epoch": 0.10464557083774166,
"grad_norm": 10.193642616271973,
"learning_rate": 9.995610749131064e-06,
"loss": 0.0464,
"step": 272
},
{
"epoch": 0.1050302972011157,
"grad_norm": 3.7807576656341553,
"learning_rate": 9.995559570542187e-06,
"loss": 0.0677,
"step": 273
},
{
"epoch": 0.10541502356448976,
"grad_norm": 4.352870464324951,
"learning_rate": 9.995508095442048e-06,
"loss": 0.0508,
"step": 274
},
{
"epoch": 0.10579974992786381,
"grad_norm": 5.404750347137451,
"learning_rate": 9.995456323833702e-06,
"loss": 0.0462,
"step": 275
},
{
"epoch": 0.10618447629123785,
"grad_norm": 2.6962974071502686,
"learning_rate": 9.995404255720223e-06,
"loss": 0.0266,
"step": 276
},
{
"epoch": 0.10656920265461191,
"grad_norm": 2.8063340187072754,
"learning_rate": 9.9953518911047e-06,
"loss": 0.0649,
"step": 277
},
{
"epoch": 0.10695392901798596,
"grad_norm": 1.472765326499939,
"learning_rate": 9.995299229990245e-06,
"loss": 0.0266,
"step": 278
},
{
"epoch": 0.10733865538136,
"grad_norm": 1.273878574371338,
"learning_rate": 9.99524627237998e-06,
"loss": 0.0116,
"step": 279
},
{
"epoch": 0.10772338174473406,
"grad_norm": 0.9871545433998108,
"learning_rate": 9.99519301827705e-06,
"loss": 0.0165,
"step": 280
},
{
"epoch": 0.10810810810810811,
"grad_norm": 1.1046977043151855,
"learning_rate": 9.995139467684614e-06,
"loss": 0.0115,
"step": 281
},
{
"epoch": 0.10849283447148216,
"grad_norm": 2.761289358139038,
"learning_rate": 9.99508562060585e-06,
"loss": 0.0231,
"step": 282
},
{
"epoch": 0.10887756083485621,
"grad_norm": 1.1875776052474976,
"learning_rate": 9.99503147704396e-06,
"loss": 0.0133,
"step": 283
},
{
"epoch": 0.10926228719823027,
"grad_norm": 0.36607322096824646,
"learning_rate": 9.994977037002152e-06,
"loss": 0.0082,
"step": 284
},
{
"epoch": 0.1096470135616043,
"grad_norm": 0.9858017563819885,
"learning_rate": 9.994922300483657e-06,
"loss": 0.0457,
"step": 285
},
{
"epoch": 0.11003173992497836,
"grad_norm": 1.2574812173843384,
"learning_rate": 9.994867267491729e-06,
"loss": 0.0175,
"step": 286
},
{
"epoch": 0.11041646628835242,
"grad_norm": 1.4595177173614502,
"learning_rate": 9.994811938029627e-06,
"loss": 0.0153,
"step": 287
},
{
"epoch": 0.11080119265172646,
"grad_norm": 1.0112245082855225,
"learning_rate": 9.994756312100642e-06,
"loss": 0.0132,
"step": 288
},
{
"epoch": 0.11118591901510051,
"grad_norm": 1.3767755031585693,
"learning_rate": 9.994700389708071e-06,
"loss": 0.0141,
"step": 289
},
{
"epoch": 0.11157064537847455,
"grad_norm": 0.3537527024745941,
"learning_rate": 9.994644170855237e-06,
"loss": 0.0079,
"step": 290
},
{
"epoch": 0.11195537174184861,
"grad_norm": 1.6381282806396484,
"learning_rate": 9.994587655545476e-06,
"loss": 0.0206,
"step": 291
},
{
"epoch": 0.11234009810522266,
"grad_norm": 3.2190628051757812,
"learning_rate": 9.99453084378214e-06,
"loss": 0.0301,
"step": 292
},
{
"epoch": 0.1127248244685967,
"grad_norm": 4.9315032958984375,
"learning_rate": 9.994473735568602e-06,
"loss": 0.0098,
"step": 293
},
{
"epoch": 0.11310955083197076,
"grad_norm": 0.7560256719589233,
"learning_rate": 9.994416330908252e-06,
"loss": 0.0138,
"step": 294
},
{
"epoch": 0.11349427719534481,
"grad_norm": 1.0898628234863281,
"learning_rate": 9.9943586298045e-06,
"loss": 0.0184,
"step": 295
},
{
"epoch": 0.11387900355871886,
"grad_norm": 2.7064716815948486,
"learning_rate": 9.994300632260766e-06,
"loss": 0.0218,
"step": 296
},
{
"epoch": 0.11426372992209291,
"grad_norm": 1.0446937084197998,
"learning_rate": 9.994242338280495e-06,
"loss": 0.0108,
"step": 297
},
{
"epoch": 0.11464845628546697,
"grad_norm": 3.4280526638031006,
"learning_rate": 9.994183747867148e-06,
"loss": 0.0355,
"step": 298
},
{
"epoch": 0.115033182648841,
"grad_norm": 1.0543782711029053,
"learning_rate": 9.994124861024199e-06,
"loss": 0.024,
"step": 299
},
{
"epoch": 0.11541790901221506,
"grad_norm": 1.5086958408355713,
"learning_rate": 9.994065677755148e-06,
"loss": 0.0214,
"step": 300
},
{
"epoch": 0.11580263537558912,
"grad_norm": 3.212959051132202,
"learning_rate": 9.994006198063506e-06,
"loss": 0.0285,
"step": 301
},
{
"epoch": 0.11618736173896316,
"grad_norm": 0.6459766030311584,
"learning_rate": 9.9939464219528e-06,
"loss": 0.004,
"step": 302
},
{
"epoch": 0.11657208810233721,
"grad_norm": 2.0472171306610107,
"learning_rate": 9.993886349426584e-06,
"loss": 0.0217,
"step": 303
},
{
"epoch": 0.11695681446571127,
"grad_norm": 0.8928171992301941,
"learning_rate": 9.993825980488418e-06,
"loss": 0.006,
"step": 304
},
{
"epoch": 0.11734154082908531,
"grad_norm": 1.5749402046203613,
"learning_rate": 9.99376531514189e-06,
"loss": 0.0186,
"step": 305
},
{
"epoch": 0.11772626719245936,
"grad_norm": 1.007002592086792,
"learning_rate": 9.993704353390597e-06,
"loss": 0.0519,
"step": 306
},
{
"epoch": 0.11811099355583342,
"grad_norm": 2.073533535003662,
"learning_rate": 9.99364309523816e-06,
"loss": 0.0258,
"step": 307
},
{
"epoch": 0.11849571991920746,
"grad_norm": 1.1913518905639648,
"learning_rate": 9.993581540688213e-06,
"loss": 0.0203,
"step": 308
},
{
"epoch": 0.11888044628258151,
"grad_norm": 0.6313387751579285,
"learning_rate": 9.993519689744411e-06,
"loss": 0.0061,
"step": 309
},
{
"epoch": 0.11926517264595557,
"grad_norm": 0.5906733870506287,
"learning_rate": 9.993457542410424e-06,
"loss": 0.0129,
"step": 310
},
{
"epoch": 0.11964989900932961,
"grad_norm": 0.5875241756439209,
"learning_rate": 9.993395098689943e-06,
"loss": 0.0147,
"step": 311
},
{
"epoch": 0.12003462537270367,
"grad_norm": 0.4100046753883362,
"learning_rate": 9.993332358586669e-06,
"loss": 0.0095,
"step": 312
},
{
"epoch": 0.12041935173607772,
"grad_norm": 2.3552939891815186,
"learning_rate": 9.993269322104332e-06,
"loss": 0.0563,
"step": 313
},
{
"epoch": 0.12080407809945176,
"grad_norm": 2.6373159885406494,
"learning_rate": 9.993205989246672e-06,
"loss": 0.0474,
"step": 314
},
{
"epoch": 0.12118880446282582,
"grad_norm": 2.1648263931274414,
"learning_rate": 9.993142360017447e-06,
"loss": 0.0182,
"step": 315
},
{
"epoch": 0.12157353082619987,
"grad_norm": 2.0293798446655273,
"learning_rate": 9.993078434420433e-06,
"loss": 0.0161,
"step": 316
},
{
"epoch": 0.12195825718957391,
"grad_norm": 2.0390355587005615,
"learning_rate": 9.993014212459425e-06,
"loss": 0.0276,
"step": 317
},
{
"epoch": 0.12234298355294797,
"grad_norm": 1.5906345844268799,
"learning_rate": 9.992949694138236e-06,
"loss": 0.0221,
"step": 318
},
{
"epoch": 0.12272770991632202,
"grad_norm": 1.5202964544296265,
"learning_rate": 9.992884879460694e-06,
"loss": 0.0252,
"step": 319
},
{
"epoch": 0.12311243627969606,
"grad_norm": 1.605220079421997,
"learning_rate": 9.992819768430648e-06,
"loss": 0.02,
"step": 320
},
{
"epoch": 0.12349716264307012,
"grad_norm": 1.02753746509552,
"learning_rate": 9.992754361051959e-06,
"loss": 0.0176,
"step": 321
},
{
"epoch": 0.12388188900644416,
"grad_norm": 1.7800108194351196,
"learning_rate": 9.992688657328515e-06,
"loss": 0.019,
"step": 322
},
{
"epoch": 0.12426661536981821,
"grad_norm": 5.6064839363098145,
"learning_rate": 9.992622657264211e-06,
"loss": 0.1288,
"step": 323
},
{
"epoch": 0.12465134173319227,
"grad_norm": 4.702009201049805,
"learning_rate": 9.992556360862966e-06,
"loss": 0.0451,
"step": 324
},
{
"epoch": 0.1250360680965663,
"grad_norm": 7.251805782318115,
"learning_rate": 9.992489768128714e-06,
"loss": 0.0272,
"step": 325
},
{
"epoch": 0.12542079445994037,
"grad_norm": 2.643155336380005,
"learning_rate": 9.992422879065409e-06,
"loss": 0.025,
"step": 326
},
{
"epoch": 0.12580552082331442,
"grad_norm": 0.8993635773658752,
"learning_rate": 9.99235569367702e-06,
"loss": 0.0092,
"step": 327
},
{
"epoch": 0.12619024718668848,
"grad_norm": 4.789489269256592,
"learning_rate": 9.992288211967537e-06,
"loss": 0.0462,
"step": 328
},
{
"epoch": 0.12657497355006253,
"grad_norm": 0.475421667098999,
"learning_rate": 9.992220433940963e-06,
"loss": 0.0037,
"step": 329
},
{
"epoch": 0.12695969991343656,
"grad_norm": 2.3975934982299805,
"learning_rate": 9.992152359601323e-06,
"loss": 0.0332,
"step": 330
},
{
"epoch": 0.1273444262768106,
"grad_norm": 1.1064420938491821,
"learning_rate": 9.992083988952654e-06,
"loss": 0.0217,
"step": 331
},
{
"epoch": 0.12772915264018467,
"grad_norm": 1.686231255531311,
"learning_rate": 9.99201532199902e-06,
"loss": 0.0159,
"step": 332
},
{
"epoch": 0.12811387900355872,
"grad_norm": 0.7885820269584656,
"learning_rate": 9.99194635874449e-06,
"loss": 0.0065,
"step": 333
},
{
"epoch": 0.12849860536693278,
"grad_norm": 2.0389482975006104,
"learning_rate": 9.991877099193164e-06,
"loss": 0.0182,
"step": 334
},
{
"epoch": 0.12888333173030683,
"grad_norm": 1.462136149406433,
"learning_rate": 9.991807543349148e-06,
"loss": 0.0271,
"step": 335
},
{
"epoch": 0.12926805809368086,
"grad_norm": 1.666369080543518,
"learning_rate": 9.99173769121657e-06,
"loss": 0.0145,
"step": 336
},
{
"epoch": 0.12965278445705491,
"grad_norm": 1.0154768228530884,
"learning_rate": 9.99166754279958e-06,
"loss": 0.0084,
"step": 337
},
{
"epoch": 0.13003751082042897,
"grad_norm": 6.3970112800598145,
"learning_rate": 9.991597098102339e-06,
"loss": 0.0658,
"step": 338
},
{
"epoch": 0.13042223718380302,
"grad_norm": 2.765911817550659,
"learning_rate": 9.991526357129028e-06,
"loss": 0.0249,
"step": 339
},
{
"epoch": 0.13080696354717708,
"grad_norm": 3.0030531883239746,
"learning_rate": 9.991455319883849e-06,
"loss": 0.0266,
"step": 340
},
{
"epoch": 0.1311916899105511,
"grad_norm": 4.774355888366699,
"learning_rate": 9.991383986371016e-06,
"loss": 0.0494,
"step": 341
},
{
"epoch": 0.13157641627392516,
"grad_norm": 1.2228891849517822,
"learning_rate": 9.991312356594762e-06,
"loss": 0.0182,
"step": 342
},
{
"epoch": 0.13196114263729922,
"grad_norm": 0.6857689023017883,
"learning_rate": 9.991240430559342e-06,
"loss": 0.0098,
"step": 343
},
{
"epoch": 0.13234586900067327,
"grad_norm": 1.3903642892837524,
"learning_rate": 9.99116820826902e-06,
"loss": 0.0201,
"step": 344
},
{
"epoch": 0.13273059536404733,
"grad_norm": 1.7887595891952515,
"learning_rate": 9.991095689728088e-06,
"loss": 0.0098,
"step": 345
},
{
"epoch": 0.13311532172742138,
"grad_norm": 3.239908456802368,
"learning_rate": 9.991022874940845e-06,
"loss": 0.0596,
"step": 346
},
{
"epoch": 0.1335000480907954,
"grad_norm": 2.807297706604004,
"learning_rate": 9.990949763911619e-06,
"loss": 0.0384,
"step": 347
},
{
"epoch": 0.13388477445416946,
"grad_norm": 1.4440059661865234,
"learning_rate": 9.990876356644746e-06,
"loss": 0.0163,
"step": 348
},
{
"epoch": 0.13426950081754352,
"grad_norm": 1.5381439924240112,
"learning_rate": 9.990802653144583e-06,
"loss": 0.0309,
"step": 349
},
{
"epoch": 0.13465422718091757,
"grad_norm": 12.466649055480957,
"learning_rate": 9.990728653415504e-06,
"loss": 0.0578,
"step": 350
},
{
"epoch": 0.13503895354429163,
"grad_norm": 1.6615933179855347,
"learning_rate": 9.990654357461903e-06,
"loss": 0.0384,
"step": 351
},
{
"epoch": 0.13542367990766568,
"grad_norm": 0.5724029541015625,
"learning_rate": 9.990579765288191e-06,
"loss": 0.0046,
"step": 352
},
{
"epoch": 0.1358084062710397,
"grad_norm": 0.7477350234985352,
"learning_rate": 9.990504876898792e-06,
"loss": 0.0122,
"step": 353
},
{
"epoch": 0.13619313263441377,
"grad_norm": 1.5245766639709473,
"learning_rate": 9.990429692298154e-06,
"loss": 0.008,
"step": 354
},
{
"epoch": 0.13657785899778782,
"grad_norm": 2.3614602088928223,
"learning_rate": 9.990354211490736e-06,
"loss": 0.0338,
"step": 355
},
{
"epoch": 0.13696258536116188,
"grad_norm": 2.2919530868530273,
"learning_rate": 9.990278434481022e-06,
"loss": 0.027,
"step": 356
},
{
"epoch": 0.13734731172453593,
"grad_norm": 0.5782844424247742,
"learning_rate": 9.99020236127351e-06,
"loss": 0.0089,
"step": 357
},
{
"epoch": 0.13773203808790999,
"grad_norm": 2.5682413578033447,
"learning_rate": 9.99012599187271e-06,
"loss": 0.0232,
"step": 358
},
{
"epoch": 0.138116764451284,
"grad_norm": 6.079646587371826,
"learning_rate": 9.99004932628316e-06,
"loss": 0.0329,
"step": 359
},
{
"epoch": 0.13850149081465807,
"grad_norm": 4.070837497711182,
"learning_rate": 9.989972364509408e-06,
"loss": 0.0342,
"step": 360
},
{
"epoch": 0.13888621717803212,
"grad_norm": 0.6112903952598572,
"learning_rate": 9.989895106556025e-06,
"loss": 0.0164,
"step": 361
},
{
"epoch": 0.13927094354140618,
"grad_norm": 1.106340765953064,
"learning_rate": 9.989817552427594e-06,
"loss": 0.0145,
"step": 362
},
{
"epoch": 0.13965566990478023,
"grad_norm": 0.9722648859024048,
"learning_rate": 9.989739702128717e-06,
"loss": 0.0139,
"step": 363
},
{
"epoch": 0.1400403962681543,
"grad_norm": 0.8131417036056519,
"learning_rate": 9.989661555664019e-06,
"loss": 0.0083,
"step": 364
},
{
"epoch": 0.14042512263152832,
"grad_norm": 1.2104814052581787,
"learning_rate": 9.989583113038134e-06,
"loss": 0.0229,
"step": 365
},
{
"epoch": 0.14080984899490237,
"grad_norm": 0.8418599367141724,
"learning_rate": 9.98950437425572e-06,
"loss": 0.0164,
"step": 366
},
{
"epoch": 0.14119457535827643,
"grad_norm": 2.4839096069335938,
"learning_rate": 9.989425339321453e-06,
"loss": 0.0298,
"step": 367
},
{
"epoch": 0.14157930172165048,
"grad_norm": 2.681351661682129,
"learning_rate": 9.98934600824002e-06,
"loss": 0.0482,
"step": 368
},
{
"epoch": 0.14196402808502454,
"grad_norm": 1.8847802877426147,
"learning_rate": 9.989266381016131e-06,
"loss": 0.0208,
"step": 369
},
{
"epoch": 0.1423487544483986,
"grad_norm": 11.408592224121094,
"learning_rate": 9.989186457654515e-06,
"loss": 0.014,
"step": 370
},
{
"epoch": 0.14273348081177262,
"grad_norm": 2.6127874851226807,
"learning_rate": 9.989106238159909e-06,
"loss": 0.0315,
"step": 371
},
{
"epoch": 0.14311820717514667,
"grad_norm": 6.132943153381348,
"learning_rate": 9.989025722537082e-06,
"loss": 0.0233,
"step": 372
},
{
"epoch": 0.14350293353852073,
"grad_norm": 0.842147946357727,
"learning_rate": 9.988944910790808e-06,
"loss": 0.0173,
"step": 373
},
{
"epoch": 0.14388765990189478,
"grad_norm": 20.986244201660156,
"learning_rate": 9.988863802925887e-06,
"loss": 0.0222,
"step": 374
},
{
"epoch": 0.14427238626526884,
"grad_norm": 1.4051076173782349,
"learning_rate": 9.988782398947132e-06,
"loss": 0.0286,
"step": 375
},
{
"epoch": 0.14465711262864286,
"grad_norm": 1.1612248420715332,
"learning_rate": 9.988700698859373e-06,
"loss": 0.0246,
"step": 376
},
{
"epoch": 0.14504183899201692,
"grad_norm": 0.6646268367767334,
"learning_rate": 9.988618702667461e-06,
"loss": 0.0132,
"step": 377
},
{
"epoch": 0.14542656535539097,
"grad_norm": 1.8865305185317993,
"learning_rate": 9.988536410376261e-06,
"loss": 0.0254,
"step": 378
},
{
"epoch": 0.14581129171876503,
"grad_norm": 0.6670300960540771,
"learning_rate": 9.988453821990663e-06,
"loss": 0.0094,
"step": 379
},
{
"epoch": 0.14619601808213908,
"grad_norm": 0.9475501775741577,
"learning_rate": 9.988370937515562e-06,
"loss": 0.0103,
"step": 380
},
{
"epoch": 0.14658074444551314,
"grad_norm": 21.623615264892578,
"learning_rate": 9.98828775695588e-06,
"loss": 0.0772,
"step": 381
},
{
"epoch": 0.14696547080888717,
"grad_norm": 1.0714162588119507,
"learning_rate": 9.988204280316556e-06,
"loss": 0.0141,
"step": 382
},
{
"epoch": 0.14735019717226122,
"grad_norm": 0.8990387320518494,
"learning_rate": 9.988120507602544e-06,
"loss": 0.0282,
"step": 383
},
{
"epoch": 0.14773492353563528,
"grad_norm": 0.7467316389083862,
"learning_rate": 9.988036438818815e-06,
"loss": 0.0068,
"step": 384
},
{
"epoch": 0.14811964989900933,
"grad_norm": 0.75644451379776,
"learning_rate": 9.98795207397036e-06,
"loss": 0.0104,
"step": 385
},
{
"epoch": 0.1485043762623834,
"grad_norm": 1.473006248474121,
"learning_rate": 9.987867413062187e-06,
"loss": 0.0257,
"step": 386
},
{
"epoch": 0.14888910262575744,
"grad_norm": 1.1377049684524536,
"learning_rate": 9.987782456099319e-06,
"loss": 0.0231,
"step": 387
},
{
"epoch": 0.14927382898913147,
"grad_norm": 0.35376957058906555,
"learning_rate": 9.9876972030868e-06,
"loss": 0.011,
"step": 388
},
{
"epoch": 0.14965855535250552,
"grad_norm": 1.170577883720398,
"learning_rate": 9.987611654029691e-06,
"loss": 0.0232,
"step": 389
},
{
"epoch": 0.15004328171587958,
"grad_norm": 2.140615701675415,
"learning_rate": 9.987525808933069e-06,
"loss": 0.0421,
"step": 390
},
{
"epoch": 0.15042800807925363,
"grad_norm": 1.2613564729690552,
"learning_rate": 9.987439667802028e-06,
"loss": 0.0138,
"step": 391
},
{
"epoch": 0.1508127344426277,
"grad_norm": 1.3194247484207153,
"learning_rate": 9.987353230641683e-06,
"loss": 0.0127,
"step": 392
},
{
"epoch": 0.15119746080600174,
"grad_norm": 1.2532031536102295,
"learning_rate": 9.987266497457161e-06,
"loss": 0.02,
"step": 393
},
{
"epoch": 0.15158218716937577,
"grad_norm": 1.568895697593689,
"learning_rate": 9.987179468253616e-06,
"loss": 0.016,
"step": 394
},
{
"epoch": 0.15196691353274983,
"grad_norm": 2.125100612640381,
"learning_rate": 9.98709214303621e-06,
"loss": 0.0197,
"step": 395
},
{
"epoch": 0.15235163989612388,
"grad_norm": 2.59309983253479,
"learning_rate": 9.987004521810124e-06,
"loss": 0.0368,
"step": 396
},
{
"epoch": 0.15273636625949794,
"grad_norm": 0.5584582090377808,
"learning_rate": 9.986916604580564e-06,
"loss": 0.0097,
"step": 397
},
{
"epoch": 0.153121092622872,
"grad_norm": 1.2047398090362549,
"learning_rate": 9.986828391352743e-06,
"loss": 0.0182,
"step": 398
},
{
"epoch": 0.15350581898624605,
"grad_norm": 1.0749913454055786,
"learning_rate": 9.986739882131901e-06,
"loss": 0.0175,
"step": 399
},
{
"epoch": 0.15389054534962007,
"grad_norm": 0.9072372913360596,
"learning_rate": 9.986651076923288e-06,
"loss": 0.0158,
"step": 400
},
{
"epoch": 0.15427527171299413,
"grad_norm": 0.31551575660705566,
"learning_rate": 9.986561975732179e-06,
"loss": 0.0048,
"step": 401
},
{
"epoch": 0.15465999807636818,
"grad_norm": 0.3892665505409241,
"learning_rate": 9.986472578563859e-06,
"loss": 0.0035,
"step": 402
},
{
"epoch": 0.15504472443974224,
"grad_norm": 1.0639790296554565,
"learning_rate": 9.986382885423637e-06,
"loss": 0.004,
"step": 403
},
{
"epoch": 0.1554294508031163,
"grad_norm": 0.8102086186408997,
"learning_rate": 9.986292896316834e-06,
"loss": 0.0108,
"step": 404
},
{
"epoch": 0.15581417716649035,
"grad_norm": 1.0145649909973145,
"learning_rate": 9.986202611248794e-06,
"loss": 0.0183,
"step": 405
},
{
"epoch": 0.15619890352986437,
"grad_norm": 1.0815285444259644,
"learning_rate": 9.986112030224872e-06,
"loss": 0.0074,
"step": 406
},
{
"epoch": 0.15658362989323843,
"grad_norm": 0.902445375919342,
"learning_rate": 9.986021153250449e-06,
"loss": 0.007,
"step": 407
},
{
"epoch": 0.15696835625661248,
"grad_norm": 2.0999677181243896,
"learning_rate": 9.985929980330917e-06,
"loss": 0.0144,
"step": 408
},
{
"epoch": 0.15735308261998654,
"grad_norm": 0.6674321293830872,
"learning_rate": 9.985838511471688e-06,
"loss": 0.011,
"step": 409
},
{
"epoch": 0.1577378089833606,
"grad_norm": 2.6808223724365234,
"learning_rate": 9.98574674667819e-06,
"loss": 0.0108,
"step": 410
},
{
"epoch": 0.15812253534673462,
"grad_norm": 0.9309936165809631,
"learning_rate": 9.98565468595587e-06,
"loss": 0.0034,
"step": 411
},
{
"epoch": 0.15850726171010868,
"grad_norm": 3.7667415142059326,
"learning_rate": 9.985562329310192e-06,
"loss": 0.0218,
"step": 412
},
{
"epoch": 0.15889198807348273,
"grad_norm": 1.3515875339508057,
"learning_rate": 9.98546967674664e-06,
"loss": 0.0261,
"step": 413
},
{
"epoch": 0.1592767144368568,
"grad_norm": 1.2153210639953613,
"learning_rate": 9.98537672827071e-06,
"loss": 0.0276,
"step": 414
},
{
"epoch": 0.15966144080023084,
"grad_norm": 2.072326183319092,
"learning_rate": 9.985283483887923e-06,
"loss": 0.0354,
"step": 415
},
{
"epoch": 0.1600461671636049,
"grad_norm": 1.6364926099777222,
"learning_rate": 9.985189943603811e-06,
"loss": 0.0586,
"step": 416
},
{
"epoch": 0.16043089352697892,
"grad_norm": 1.4485971927642822,
"learning_rate": 9.985096107423925e-06,
"loss": 0.0128,
"step": 417
},
{
"epoch": 0.16081561989035298,
"grad_norm": 0.7800815105438232,
"learning_rate": 9.985001975353835e-06,
"loss": 0.0117,
"step": 418
},
{
"epoch": 0.16120034625372703,
"grad_norm": 1.6513843536376953,
"learning_rate": 9.984907547399132e-06,
"loss": 0.0232,
"step": 419
},
{
"epoch": 0.1615850726171011,
"grad_norm": 2.340419292449951,
"learning_rate": 9.984812823565417e-06,
"loss": 0.0282,
"step": 420
},
{
"epoch": 0.16196979898047514,
"grad_norm": 0.8637623190879822,
"learning_rate": 9.984717803858312e-06,
"loss": 0.0224,
"step": 421
},
{
"epoch": 0.1623545253438492,
"grad_norm": 1.3352808952331543,
"learning_rate": 9.98462248828346e-06,
"loss": 0.0161,
"step": 422
},
{
"epoch": 0.16273925170722323,
"grad_norm": 1.0792741775512695,
"learning_rate": 9.984526876846517e-06,
"loss": 0.0353,
"step": 423
},
{
"epoch": 0.16312397807059728,
"grad_norm": 1.0091253519058228,
"learning_rate": 9.984430969553155e-06,
"loss": 0.0114,
"step": 424
},
{
"epoch": 0.16350870443397134,
"grad_norm": 1.8374234437942505,
"learning_rate": 9.984334766409072e-06,
"loss": 0.0186,
"step": 425
},
{
"epoch": 0.1638934307973454,
"grad_norm": 0.6512305736541748,
"learning_rate": 9.984238267419974e-06,
"loss": 0.014,
"step": 426
},
{
"epoch": 0.16427815716071945,
"grad_norm": 1.6066893339157104,
"learning_rate": 9.984141472591591e-06,
"loss": 0.0154,
"step": 427
},
{
"epoch": 0.1646628835240935,
"grad_norm": 2.240696430206299,
"learning_rate": 9.984044381929667e-06,
"loss": 0.0127,
"step": 428
},
{
"epoch": 0.16504760988746753,
"grad_norm": 1.4211338758468628,
"learning_rate": 9.983946995439964e-06,
"loss": 0.0148,
"step": 429
},
{
"epoch": 0.16543233625084158,
"grad_norm": 0.8117442727088928,
"learning_rate": 9.983849313128265e-06,
"loss": 0.0161,
"step": 430
},
{
"epoch": 0.16581706261421564,
"grad_norm": 1.8538448810577393,
"learning_rate": 9.983751335000365e-06,
"loss": 0.0168,
"step": 431
},
{
"epoch": 0.1662017889775897,
"grad_norm": 0.8695465326309204,
"learning_rate": 9.983653061062084e-06,
"loss": 0.0075,
"step": 432
},
{
"epoch": 0.16658651534096375,
"grad_norm": 1.9306254386901855,
"learning_rate": 9.983554491319248e-06,
"loss": 0.0211,
"step": 433
},
{
"epoch": 0.1669712417043378,
"grad_norm": 0.9681547284126282,
"learning_rate": 9.983455625777713e-06,
"loss": 0.0219,
"step": 434
},
{
"epoch": 0.16735596806771183,
"grad_norm": 1.8259018659591675,
"learning_rate": 9.983356464443349e-06,
"loss": 0.0388,
"step": 435
},
{
"epoch": 0.16774069443108588,
"grad_norm": 0.9157015681266785,
"learning_rate": 9.983257007322033e-06,
"loss": 0.0082,
"step": 436
},
{
"epoch": 0.16812542079445994,
"grad_norm": 2.4303884506225586,
"learning_rate": 9.983157254419678e-06,
"loss": 0.0507,
"step": 437
},
{
"epoch": 0.168510147157834,
"grad_norm": 1.1336638927459717,
"learning_rate": 9.983057205742199e-06,
"loss": 0.012,
"step": 438
},
{
"epoch": 0.16889487352120805,
"grad_norm": 1.2588491439819336,
"learning_rate": 9.982956861295536e-06,
"loss": 0.0236,
"step": 439
},
{
"epoch": 0.1692795998845821,
"grad_norm": 0.6215165853500366,
"learning_rate": 9.982856221085644e-06,
"loss": 0.0135,
"step": 440
},
{
"epoch": 0.16966432624795613,
"grad_norm": 0.6819311380386353,
"learning_rate": 9.982755285118499e-06,
"loss": 0.0089,
"step": 441
},
{
"epoch": 0.1700490526113302,
"grad_norm": 0.6803324818611145,
"learning_rate": 9.982654053400089e-06,
"loss": 0.0073,
"step": 442
},
{
"epoch": 0.17043377897470424,
"grad_norm": 1.0501588582992554,
"learning_rate": 9.982552525936425e-06,
"loss": 0.0118,
"step": 443
},
{
"epoch": 0.1708185053380783,
"grad_norm": 0.6693577766418457,
"learning_rate": 9.982450702733532e-06,
"loss": 0.0085,
"step": 444
},
{
"epoch": 0.17120323170145235,
"grad_norm": 1.0826107263565063,
"learning_rate": 9.982348583797454e-06,
"loss": 0.0165,
"step": 445
},
{
"epoch": 0.17158795806482638,
"grad_norm": 0.35750290751457214,
"learning_rate": 9.982246169134251e-06,
"loss": 0.0037,
"step": 446
},
{
"epoch": 0.17197268442820043,
"grad_norm": 0.6661262512207031,
"learning_rate": 9.982143458750005e-06,
"loss": 0.0073,
"step": 447
},
{
"epoch": 0.1723574107915745,
"grad_norm": 0.6900920867919922,
"learning_rate": 9.98204045265081e-06,
"loss": 0.0157,
"step": 448
},
{
"epoch": 0.17274213715494854,
"grad_norm": 2.380176067352295,
"learning_rate": 9.98193715084278e-06,
"loss": 0.0271,
"step": 449
},
{
"epoch": 0.1731268635183226,
"grad_norm": 0.618948221206665,
"learning_rate": 9.981833553332045e-06,
"loss": 0.0089,
"step": 450
},
{
"epoch": 0.17351158988169665,
"grad_norm": 0.5611693859100342,
"learning_rate": 9.981729660124759e-06,
"loss": 0.0062,
"step": 451
},
{
"epoch": 0.17389631624507068,
"grad_norm": 1.119604229927063,
"learning_rate": 9.981625471227083e-06,
"loss": 0.0155,
"step": 452
},
{
"epoch": 0.17428104260844474,
"grad_norm": 1.747314453125,
"learning_rate": 9.981520986645204e-06,
"loss": 0.0235,
"step": 453
},
{
"epoch": 0.1746657689718188,
"grad_norm": 2.7685446739196777,
"learning_rate": 9.981416206385323e-06,
"loss": 0.0571,
"step": 454
},
{
"epoch": 0.17505049533519285,
"grad_norm": 0.5638480186462402,
"learning_rate": 9.98131113045366e-06,
"loss": 0.0047,
"step": 455
},
{
"epoch": 0.1754352216985669,
"grad_norm": 1.8126139640808105,
"learning_rate": 9.981205758856452e-06,
"loss": 0.0121,
"step": 456
},
{
"epoch": 0.17581994806194096,
"grad_norm": 2.3808107376098633,
"learning_rate": 9.98110009159995e-06,
"loss": 0.0255,
"step": 457
},
{
"epoch": 0.17620467442531498,
"grad_norm": 1.3327685594558716,
"learning_rate": 9.98099412869043e-06,
"loss": 0.0135,
"step": 458
},
{
"epoch": 0.17658940078868904,
"grad_norm": 1.5651696920394897,
"learning_rate": 9.980887870134181e-06,
"loss": 0.0261,
"step": 459
},
{
"epoch": 0.1769741271520631,
"grad_norm": 0.8582300543785095,
"learning_rate": 9.980781315937507e-06,
"loss": 0.0088,
"step": 460
},
{
"epoch": 0.17735885351543715,
"grad_norm": 2.216838836669922,
"learning_rate": 9.980674466106735e-06,
"loss": 0.022,
"step": 461
},
{
"epoch": 0.1777435798788112,
"grad_norm": 1.1542237997055054,
"learning_rate": 9.980567320648207e-06,
"loss": 0.0095,
"step": 462
},
{
"epoch": 0.17812830624218526,
"grad_norm": 0.5303864479064941,
"learning_rate": 9.98045987956828e-06,
"loss": 0.0145,
"step": 463
},
{
"epoch": 0.17851303260555929,
"grad_norm": 0.49043771624565125,
"learning_rate": 9.980352142873335e-06,
"loss": 0.0057,
"step": 464
},
{
"epoch": 0.17889775896893334,
"grad_norm": 0.9760215878486633,
"learning_rate": 9.980244110569765e-06,
"loss": 0.0102,
"step": 465
},
{
"epoch": 0.1792824853323074,
"grad_norm": 0.7870310544967651,
"learning_rate": 9.980135782663981e-06,
"loss": 0.0113,
"step": 466
},
{
"epoch": 0.17966721169568145,
"grad_norm": 0.9848241806030273,
"learning_rate": 9.980027159162415e-06,
"loss": 0.0224,
"step": 467
},
{
"epoch": 0.1800519380590555,
"grad_norm": 0.3691968321800232,
"learning_rate": 9.979918240071512e-06,
"loss": 0.0029,
"step": 468
},
{
"epoch": 0.18043666442242956,
"grad_norm": 1.2080693244934082,
"learning_rate": 9.97980902539774e-06,
"loss": 0.0133,
"step": 469
},
{
"epoch": 0.1808213907858036,
"grad_norm": 1.9378516674041748,
"learning_rate": 9.979699515147579e-06,
"loss": 0.0285,
"step": 470
},
{
"epoch": 0.18120611714917764,
"grad_norm": 0.5868281126022339,
"learning_rate": 9.979589709327528e-06,
"loss": 0.0104,
"step": 471
},
{
"epoch": 0.1815908435125517,
"grad_norm": 1.4102672338485718,
"learning_rate": 9.979479607944107e-06,
"loss": 0.0129,
"step": 472
},
{
"epoch": 0.18197556987592575,
"grad_norm": 1.0837740898132324,
"learning_rate": 9.97936921100385e-06,
"loss": 0.0078,
"step": 473
},
{
"epoch": 0.1823602962392998,
"grad_norm": 0.7247804403305054,
"learning_rate": 9.97925851851331e-06,
"loss": 0.0138,
"step": 474
},
{
"epoch": 0.18274502260267383,
"grad_norm": 1.0309706926345825,
"learning_rate": 9.979147530479057e-06,
"loss": 0.0214,
"step": 475
},
{
"epoch": 0.1831297489660479,
"grad_norm": 1.6426202058792114,
"learning_rate": 9.979036246907679e-06,
"loss": 0.0194,
"step": 476
},
{
"epoch": 0.18351447532942194,
"grad_norm": 1.030686855316162,
"learning_rate": 9.97892466780578e-06,
"loss": 0.0083,
"step": 477
},
{
"epoch": 0.183899201692796,
"grad_norm": 1.575244665145874,
"learning_rate": 9.978812793179985e-06,
"loss": 0.0125,
"step": 478
},
{
"epoch": 0.18428392805617005,
"grad_norm": 0.7389955520629883,
"learning_rate": 9.97870062303693e-06,
"loss": 0.0123,
"step": 479
},
{
"epoch": 0.1846686544195441,
"grad_norm": 1.0610774755477905,
"learning_rate": 9.978588157383277e-06,
"loss": 0.0101,
"step": 480
},
{
"epoch": 0.18505338078291814,
"grad_norm": 0.9599776268005371,
"learning_rate": 9.978475396225702e-06,
"loss": 0.0161,
"step": 481
},
{
"epoch": 0.1854381071462922,
"grad_norm": 1.2432290315628052,
"learning_rate": 9.978362339570896e-06,
"loss": 0.0251,
"step": 482
},
{
"epoch": 0.18582283350966625,
"grad_norm": 1.463511347770691,
"learning_rate": 9.978248987425567e-06,
"loss": 0.0152,
"step": 483
},
{
"epoch": 0.1862075598730403,
"grad_norm": 1.1043009757995605,
"learning_rate": 9.978135339796448e-06,
"loss": 0.0188,
"step": 484
},
{
"epoch": 0.18659228623641436,
"grad_norm": 0.21234546601772308,
"learning_rate": 9.97802139669028e-06,
"loss": 0.0031,
"step": 485
},
{
"epoch": 0.1869770125997884,
"grad_norm": 1.1203808784484863,
"learning_rate": 9.977907158113832e-06,
"loss": 0.0165,
"step": 486
},
{
"epoch": 0.18736173896316244,
"grad_norm": 3.042773962020874,
"learning_rate": 9.977792624073876e-06,
"loss": 0.03,
"step": 487
},
{
"epoch": 0.1877464653265365,
"grad_norm": 1.725350260734558,
"learning_rate": 9.977677794577218e-06,
"loss": 0.023,
"step": 488
},
{
"epoch": 0.18813119168991055,
"grad_norm": 1.014220118522644,
"learning_rate": 9.977562669630669e-06,
"loss": 0.0071,
"step": 489
},
{
"epoch": 0.1885159180532846,
"grad_norm": 1.6001348495483398,
"learning_rate": 9.977447249241065e-06,
"loss": 0.0195,
"step": 490
},
{
"epoch": 0.18890064441665866,
"grad_norm": 0.877295732498169,
"learning_rate": 9.977331533415256e-06,
"loss": 0.0085,
"step": 491
},
{
"epoch": 0.1892853707800327,
"grad_norm": 0.8194282650947571,
"learning_rate": 9.97721552216011e-06,
"loss": 0.0067,
"step": 492
},
{
"epoch": 0.18967009714340674,
"grad_norm": 3.308549642562866,
"learning_rate": 9.977099215482512e-06,
"loss": 0.0431,
"step": 493
},
{
"epoch": 0.1900548235067808,
"grad_norm": 0.5826371908187866,
"learning_rate": 9.976982613389368e-06,
"loss": 0.0135,
"step": 494
},
{
"epoch": 0.19043954987015485,
"grad_norm": 1.85649836063385,
"learning_rate": 9.976865715887595e-06,
"loss": 0.0217,
"step": 495
},
{
"epoch": 0.1908242762335289,
"grad_norm": 1.886124849319458,
"learning_rate": 9.976748522984137e-06,
"loss": 0.0193,
"step": 496
},
{
"epoch": 0.19120900259690296,
"grad_norm": 1.2473396062850952,
"learning_rate": 9.976631034685943e-06,
"loss": 0.0107,
"step": 497
},
{
"epoch": 0.19159372896027702,
"grad_norm": 0.38580647110939026,
"learning_rate": 9.97651325099999e-06,
"loss": 0.009,
"step": 498
},
{
"epoch": 0.19197845532365104,
"grad_norm": 1.0286208391189575,
"learning_rate": 9.976395171933271e-06,
"loss": 0.0104,
"step": 499
},
{
"epoch": 0.1923631816870251,
"grad_norm": 9.139874458312988,
"learning_rate": 9.976276797492793e-06,
"loss": 0.0105,
"step": 500
},
{
"epoch": 0.19274790805039915,
"grad_norm": 1.5034836530685425,
"learning_rate": 9.976158127685583e-06,
"loss": 0.0255,
"step": 501
},
{
"epoch": 0.1931326344137732,
"grad_norm": 3.2329092025756836,
"learning_rate": 9.976039162518681e-06,
"loss": 0.0406,
"step": 502
},
{
"epoch": 0.19351736077714726,
"grad_norm": 3.273045301437378,
"learning_rate": 9.97591990199915e-06,
"loss": 0.0409,
"step": 503
},
{
"epoch": 0.19390208714052132,
"grad_norm": 1.2689261436462402,
"learning_rate": 9.975800346134071e-06,
"loss": 0.0186,
"step": 504
},
{
"epoch": 0.19428681350389534,
"grad_norm": 2.2513368129730225,
"learning_rate": 9.975680494930538e-06,
"loss": 0.0263,
"step": 505
},
{
"epoch": 0.1946715398672694,
"grad_norm": 0.4429526925086975,
"learning_rate": 9.975560348395666e-06,
"loss": 0.0078,
"step": 506
},
{
"epoch": 0.19505626623064345,
"grad_norm": 11.008428573608398,
"learning_rate": 9.975439906536586e-06,
"loss": 0.0458,
"step": 507
},
{
"epoch": 0.1954409925940175,
"grad_norm": 1.4643393754959106,
"learning_rate": 9.975319169360446e-06,
"loss": 0.0394,
"step": 508
},
{
"epoch": 0.19582571895739156,
"grad_norm": 2.285447120666504,
"learning_rate": 9.975198136874412e-06,
"loss": 0.0195,
"step": 509
},
{
"epoch": 0.1962104453207656,
"grad_norm": 2.2482571601867676,
"learning_rate": 9.97507680908567e-06,
"loss": 0.0158,
"step": 510
},
{
"epoch": 0.19659517168413965,
"grad_norm": 2.518343210220337,
"learning_rate": 9.974955186001419e-06,
"loss": 0.026,
"step": 511
},
{
"epoch": 0.1969798980475137,
"grad_norm": 2.389946222305298,
"learning_rate": 9.974833267628879e-06,
"loss": 0.0156,
"step": 512
},
{
"epoch": 0.19736462441088776,
"grad_norm": 1.022152066230774,
"learning_rate": 9.974711053975287e-06,
"loss": 0.0127,
"step": 513
},
{
"epoch": 0.1977493507742618,
"grad_norm": 0.8410608768463135,
"learning_rate": 9.974588545047897e-06,
"loss": 0.0125,
"step": 514
},
{
"epoch": 0.19813407713763587,
"grad_norm": 0.7129068374633789,
"learning_rate": 9.974465740853981e-06,
"loss": 0.0156,
"step": 515
},
{
"epoch": 0.1985188035010099,
"grad_norm": 0.5806570649147034,
"learning_rate": 9.974342641400826e-06,
"loss": 0.0086,
"step": 516
},
{
"epoch": 0.19890352986438395,
"grad_norm": 1.541094183921814,
"learning_rate": 9.974219246695737e-06,
"loss": 0.0315,
"step": 517
},
{
"epoch": 0.199288256227758,
"grad_norm": 1.3908828496932983,
"learning_rate": 9.974095556746043e-06,
"loss": 0.0148,
"step": 518
},
{
"epoch": 0.19967298259113206,
"grad_norm": 1.4197914600372314,
"learning_rate": 9.973971571559085e-06,
"loss": 0.0184,
"step": 519
},
{
"epoch": 0.2000577089545061,
"grad_norm": 0.2611946761608124,
"learning_rate": 9.973847291142218e-06,
"loss": 0.0033,
"step": 520
},
{
"epoch": 0.20044243531788017,
"grad_norm": 0.8428516983985901,
"learning_rate": 9.973722715502821e-06,
"loss": 0.0103,
"step": 521
},
{
"epoch": 0.2008271616812542,
"grad_norm": 0.4746531844139099,
"learning_rate": 9.973597844648291e-06,
"loss": 0.0061,
"step": 522
},
{
"epoch": 0.20121188804462825,
"grad_norm": 1.7854464054107666,
"learning_rate": 9.973472678586033e-06,
"loss": 0.0143,
"step": 523
},
{
"epoch": 0.2015966144080023,
"grad_norm": 1.9014302492141724,
"learning_rate": 9.973347217323484e-06,
"loss": 0.0221,
"step": 524
},
{
"epoch": 0.20198134077137636,
"grad_norm": 1.26873779296875,
"learning_rate": 9.973221460868086e-06,
"loss": 0.0129,
"step": 525
},
{
"epoch": 0.20236606713475042,
"grad_norm": 2.484241485595703,
"learning_rate": 9.973095409227303e-06,
"loss": 0.0089,
"step": 526
},
{
"epoch": 0.20275079349812447,
"grad_norm": 1.4181262254714966,
"learning_rate": 9.972969062408618e-06,
"loss": 0.0322,
"step": 527
},
{
"epoch": 0.2031355198614985,
"grad_norm": 1.1394925117492676,
"learning_rate": 9.97284242041953e-06,
"loss": 0.0171,
"step": 528
},
{
"epoch": 0.20352024622487255,
"grad_norm": 0.7076565623283386,
"learning_rate": 9.972715483267558e-06,
"loss": 0.0132,
"step": 529
},
{
"epoch": 0.2039049725882466,
"grad_norm": 0.8092791438102722,
"learning_rate": 9.972588250960235e-06,
"loss": 0.0069,
"step": 530
},
{
"epoch": 0.20428969895162066,
"grad_norm": 1.4082260131835938,
"learning_rate": 9.97246072350511e-06,
"loss": 0.0193,
"step": 531
},
{
"epoch": 0.20467442531499472,
"grad_norm": 1.6323784589767456,
"learning_rate": 9.972332900909755e-06,
"loss": 0.0198,
"step": 532
},
{
"epoch": 0.20505915167836877,
"grad_norm": 1.7675199508666992,
"learning_rate": 9.972204783181759e-06,
"loss": 0.0139,
"step": 533
},
{
"epoch": 0.2054438780417428,
"grad_norm": 1.2901581525802612,
"learning_rate": 9.972076370328722e-06,
"loss": 0.0125,
"step": 534
},
{
"epoch": 0.20582860440511686,
"grad_norm": 0.7784814834594727,
"learning_rate": 9.971947662358271e-06,
"loss": 0.0062,
"step": 535
},
{
"epoch": 0.2062133307684909,
"grad_norm": 0.5156156420707703,
"learning_rate": 9.97181865927804e-06,
"loss": 0.0091,
"step": 536
},
{
"epoch": 0.20659805713186497,
"grad_norm": 1.0497115850448608,
"learning_rate": 9.971689361095688e-06,
"loss": 0.0074,
"step": 537
},
{
"epoch": 0.20698278349523902,
"grad_norm": 1.1292210817337036,
"learning_rate": 9.971559767818891e-06,
"loss": 0.0176,
"step": 538
},
{
"epoch": 0.20736750985861307,
"grad_norm": 0.7185404300689697,
"learning_rate": 9.971429879455339e-06,
"loss": 0.0081,
"step": 539
},
{
"epoch": 0.2077522362219871,
"grad_norm": 1.4201236963272095,
"learning_rate": 9.971299696012744e-06,
"loss": 0.0239,
"step": 540
},
{
"epoch": 0.20813696258536116,
"grad_norm": 1.1339606046676636,
"learning_rate": 9.97116921749883e-06,
"loss": 0.0212,
"step": 541
},
{
"epoch": 0.2085216889487352,
"grad_norm": 0.13454781472682953,
"learning_rate": 9.971038443921344e-06,
"loss": 0.0014,
"step": 542
},
{
"epoch": 0.20890641531210927,
"grad_norm": 1.0006049871444702,
"learning_rate": 9.970907375288048e-06,
"loss": 0.0161,
"step": 543
},
{
"epoch": 0.20929114167548332,
"grad_norm": 1.2062994241714478,
"learning_rate": 9.97077601160672e-06,
"loss": 0.023,
"step": 544
},
{
"epoch": 0.20967586803885735,
"grad_norm": 1.2538203001022339,
"learning_rate": 9.970644352885157e-06,
"loss": 0.0412,
"step": 545
},
{
"epoch": 0.2100605944022314,
"grad_norm": 2.4046273231506348,
"learning_rate": 9.970512399131174e-06,
"loss": 0.0337,
"step": 546
},
{
"epoch": 0.21044532076560546,
"grad_norm": 0.40798303484916687,
"learning_rate": 9.970380150352606e-06,
"loss": 0.0109,
"step": 547
},
{
"epoch": 0.21083004712897951,
"grad_norm": 0.9853060841560364,
"learning_rate": 9.970247606557297e-06,
"loss": 0.0344,
"step": 548
},
{
"epoch": 0.21121477349235357,
"grad_norm": 0.3079826533794403,
"learning_rate": 9.97011476775312e-06,
"loss": 0.0116,
"step": 549
},
{
"epoch": 0.21159949985572762,
"grad_norm": 0.7892830967903137,
"learning_rate": 9.969981633947956e-06,
"loss": 0.0163,
"step": 550
},
{
"epoch": 0.21198422621910165,
"grad_norm": 1.4530398845672607,
"learning_rate": 9.969848205149706e-06,
"loss": 0.0211,
"step": 551
},
{
"epoch": 0.2123689525824757,
"grad_norm": 0.7890651226043701,
"learning_rate": 9.969714481366295e-06,
"loss": 0.0172,
"step": 552
},
{
"epoch": 0.21275367894584976,
"grad_norm": 1.151655912399292,
"learning_rate": 9.969580462605656e-06,
"loss": 0.0228,
"step": 553
},
{
"epoch": 0.21313840530922382,
"grad_norm": 0.722616970539093,
"learning_rate": 9.969446148875743e-06,
"loss": 0.0157,
"step": 554
},
{
"epoch": 0.21352313167259787,
"grad_norm": 0.3728174567222595,
"learning_rate": 9.969311540184532e-06,
"loss": 0.0133,
"step": 555
},
{
"epoch": 0.21390785803597193,
"grad_norm": 1.246963620185852,
"learning_rate": 9.969176636540007e-06,
"loss": 0.0166,
"step": 556
},
{
"epoch": 0.21429258439934595,
"grad_norm": 0.4768514037132263,
"learning_rate": 9.969041437950182e-06,
"loss": 0.0077,
"step": 557
},
{
"epoch": 0.21467731076272,
"grad_norm": 0.5575240254402161,
"learning_rate": 9.968905944423077e-06,
"loss": 0.0065,
"step": 558
},
{
"epoch": 0.21506203712609406,
"grad_norm": 0.7773118615150452,
"learning_rate": 9.968770155966736e-06,
"loss": 0.0159,
"step": 559
},
{
"epoch": 0.21544676348946812,
"grad_norm": 0.2891775369644165,
"learning_rate": 9.968634072589219e-06,
"loss": 0.0025,
"step": 560
},
{
"epoch": 0.21583148985284217,
"grad_norm": 0.7732624411582947,
"learning_rate": 9.968497694298602e-06,
"loss": 0.0112,
"step": 561
},
{
"epoch": 0.21621621621621623,
"grad_norm": 1.4917421340942383,
"learning_rate": 9.96836102110298e-06,
"loss": 0.0087,
"step": 562
},
{
"epoch": 0.21660094257959026,
"grad_norm": 1.883601188659668,
"learning_rate": 9.968224053010464e-06,
"loss": 0.0375,
"step": 563
},
{
"epoch": 0.2169856689429643,
"grad_norm": 1.3303236961364746,
"learning_rate": 9.968086790029187e-06,
"loss": 0.0315,
"step": 564
},
{
"epoch": 0.21737039530633837,
"grad_norm": 2.322463035583496,
"learning_rate": 9.967949232167295e-06,
"loss": 0.0343,
"step": 565
},
{
"epoch": 0.21775512166971242,
"grad_norm": 3.1290431022644043,
"learning_rate": 9.96781137943295e-06,
"loss": 0.0359,
"step": 566
},
{
"epoch": 0.21813984803308648,
"grad_norm": 2.3872196674346924,
"learning_rate": 9.967673231834338e-06,
"loss": 0.036,
"step": 567
},
{
"epoch": 0.21852457439646053,
"grad_norm": 2.2352898120880127,
"learning_rate": 9.967534789379657e-06,
"loss": 0.0206,
"step": 568
},
{
"epoch": 0.21890930075983456,
"grad_norm": 1.576250672340393,
"learning_rate": 9.967396052077125e-06,
"loss": 0.0196,
"step": 569
},
{
"epoch": 0.2192940271232086,
"grad_norm": 1.1456630229949951,
"learning_rate": 9.967257019934976e-06,
"loss": 0.0155,
"step": 570
},
{
"epoch": 0.21967875348658267,
"grad_norm": 0.7318146824836731,
"learning_rate": 9.96711769296146e-06,
"loss": 0.0108,
"step": 571
},
{
"epoch": 0.22006347984995672,
"grad_norm": 0.5068544745445251,
"learning_rate": 9.966978071164851e-06,
"loss": 0.0064,
"step": 572
},
{
"epoch": 0.22044820621333078,
"grad_norm": 1.3462088108062744,
"learning_rate": 9.966838154553436e-06,
"loss": 0.0208,
"step": 573
},
{
"epoch": 0.22083293257670483,
"grad_norm": 1.4697437286376953,
"learning_rate": 9.966697943135516e-06,
"loss": 0.0245,
"step": 574
},
{
"epoch": 0.22121765894007886,
"grad_norm": 0.5085150003433228,
"learning_rate": 9.966557436919416e-06,
"loss": 0.0104,
"step": 575
},
{
"epoch": 0.22160238530345291,
"grad_norm": 0.722129225730896,
"learning_rate": 9.966416635913475e-06,
"loss": 0.0219,
"step": 576
},
{
"epoch": 0.22198711166682697,
"grad_norm": 1.4348595142364502,
"learning_rate": 9.96627554012605e-06,
"loss": 0.0209,
"step": 577
},
{
"epoch": 0.22237183803020102,
"grad_norm": 0.4099920988082886,
"learning_rate": 9.966134149565518e-06,
"loss": 0.0075,
"step": 578
},
{
"epoch": 0.22275656439357508,
"grad_norm": 0.4294058680534363,
"learning_rate": 9.965992464240268e-06,
"loss": 0.0114,
"step": 579
},
{
"epoch": 0.2231412907569491,
"grad_norm": 0.6493693590164185,
"learning_rate": 9.96585048415871e-06,
"loss": 0.0092,
"step": 580
},
{
"epoch": 0.22352601712032316,
"grad_norm": 0.8157480955123901,
"learning_rate": 9.965708209329275e-06,
"loss": 0.013,
"step": 581
},
{
"epoch": 0.22391074348369722,
"grad_norm": 1.7782557010650635,
"learning_rate": 9.965565639760405e-06,
"loss": 0.0147,
"step": 582
},
{
"epoch": 0.22429546984707127,
"grad_norm": 1.609440803527832,
"learning_rate": 9.965422775460559e-06,
"loss": 0.0287,
"step": 583
},
{
"epoch": 0.22468019621044533,
"grad_norm": 1.303137183189392,
"learning_rate": 9.96527961643822e-06,
"loss": 0.0135,
"step": 584
},
{
"epoch": 0.22506492257381938,
"grad_norm": 0.9957038760185242,
"learning_rate": 9.965136162701889e-06,
"loss": 0.0088,
"step": 585
},
{
"epoch": 0.2254496489371934,
"grad_norm": 1.0103319883346558,
"learning_rate": 9.964992414260076e-06,
"loss": 0.0215,
"step": 586
},
{
"epoch": 0.22583437530056746,
"grad_norm": 2.134277582168579,
"learning_rate": 9.964848371121312e-06,
"loss": 0.0378,
"step": 587
},
{
"epoch": 0.22621910166394152,
"grad_norm": 1.829238772392273,
"learning_rate": 9.964704033294148e-06,
"loss": 0.0334,
"step": 588
},
{
"epoch": 0.22660382802731557,
"grad_norm": 0.39848384261131287,
"learning_rate": 9.964559400787155e-06,
"loss": 0.0055,
"step": 589
},
{
"epoch": 0.22698855439068963,
"grad_norm": 0.5261602997779846,
"learning_rate": 9.964414473608912e-06,
"loss": 0.008,
"step": 590
},
{
"epoch": 0.22737328075406368,
"grad_norm": 0.9430747032165527,
"learning_rate": 9.964269251768025e-06,
"loss": 0.0125,
"step": 591
},
{
"epoch": 0.2277580071174377,
"grad_norm": 0.9117157459259033,
"learning_rate": 9.964123735273112e-06,
"loss": 0.0205,
"step": 592
},
{
"epoch": 0.22814273348081177,
"grad_norm": 1.4581772089004517,
"learning_rate": 9.96397792413281e-06,
"loss": 0.008,
"step": 593
},
{
"epoch": 0.22852745984418582,
"grad_norm": 0.778039276599884,
"learning_rate": 9.963831818355774e-06,
"loss": 0.0113,
"step": 594
},
{
"epoch": 0.22891218620755988,
"grad_norm": 0.2719285190105438,
"learning_rate": 9.963685417950678e-06,
"loss": 0.0081,
"step": 595
},
{
"epoch": 0.22929691257093393,
"grad_norm": 0.4310326874256134,
"learning_rate": 9.963538722926208e-06,
"loss": 0.0079,
"step": 596
},
{
"epoch": 0.22968163893430799,
"grad_norm": 1.2294691801071167,
"learning_rate": 9.963391733291072e-06,
"loss": 0.0131,
"step": 597
},
{
"epoch": 0.230066365297682,
"grad_norm": 0.7488526701927185,
"learning_rate": 9.963244449053997e-06,
"loss": 0.0067,
"step": 598
},
{
"epoch": 0.23045109166105607,
"grad_norm": 0.5428204536437988,
"learning_rate": 9.963096870223722e-06,
"loss": 0.0177,
"step": 599
},
{
"epoch": 0.23083581802443012,
"grad_norm": 0.5847753882408142,
"learning_rate": 9.962948996809008e-06,
"loss": 0.0073,
"step": 600
},
{
"epoch": 0.23122054438780418,
"grad_norm": 0.7159871459007263,
"learning_rate": 9.962800828818633e-06,
"loss": 0.0084,
"step": 601
},
{
"epoch": 0.23160527075117823,
"grad_norm": 0.6076216101646423,
"learning_rate": 9.962652366261392e-06,
"loss": 0.0042,
"step": 602
},
{
"epoch": 0.2319899971145523,
"grad_norm": 0.7141630053520203,
"learning_rate": 9.962503609146092e-06,
"loss": 0.0111,
"step": 603
},
{
"epoch": 0.23237472347792631,
"grad_norm": 0.9941124320030212,
"learning_rate": 9.962354557481569e-06,
"loss": 0.0178,
"step": 604
},
{
"epoch": 0.23275944984130037,
"grad_norm": 1.804352879524231,
"learning_rate": 9.962205211276666e-06,
"loss": 0.0134,
"step": 605
},
{
"epoch": 0.23314417620467442,
"grad_norm": 2.7440221309661865,
"learning_rate": 9.962055570540247e-06,
"loss": 0.0163,
"step": 606
},
{
"epoch": 0.23352890256804848,
"grad_norm": 1.269745945930481,
"learning_rate": 9.961905635281196e-06,
"loss": 0.0036,
"step": 607
},
{
"epoch": 0.23391362893142253,
"grad_norm": 1.0230756998062134,
"learning_rate": 9.961755405508413e-06,
"loss": 0.014,
"step": 608
},
{
"epoch": 0.2342983552947966,
"grad_norm": 1.520411729812622,
"learning_rate": 9.961604881230812e-06,
"loss": 0.0092,
"step": 609
},
{
"epoch": 0.23468308165817062,
"grad_norm": 0.7422340512275696,
"learning_rate": 9.96145406245733e-06,
"loss": 0.0048,
"step": 610
},
{
"epoch": 0.23506780802154467,
"grad_norm": 1.6586474180221558,
"learning_rate": 9.961302949196916e-06,
"loss": 0.0196,
"step": 611
},
{
"epoch": 0.23545253438491873,
"grad_norm": 0.3466293215751648,
"learning_rate": 9.961151541458542e-06,
"loss": 0.0023,
"step": 612
},
{
"epoch": 0.23583726074829278,
"grad_norm": 1.8197641372680664,
"learning_rate": 9.960999839251195e-06,
"loss": 0.0392,
"step": 613
},
{
"epoch": 0.23622198711166684,
"grad_norm": 1.4489648342132568,
"learning_rate": 9.960847842583878e-06,
"loss": 0.0147,
"step": 614
},
{
"epoch": 0.23660671347504086,
"grad_norm": 1.529948353767395,
"learning_rate": 9.96069555146561e-06,
"loss": 0.0199,
"step": 615
},
{
"epoch": 0.23699143983841492,
"grad_norm": 1.676714539527893,
"learning_rate": 9.960542965905438e-06,
"loss": 0.0135,
"step": 616
},
{
"epoch": 0.23737616620178897,
"grad_norm": 1.0007433891296387,
"learning_rate": 9.96039008591241e-06,
"loss": 0.0235,
"step": 617
},
{
"epoch": 0.23776089256516303,
"grad_norm": 0.8955550193786621,
"learning_rate": 9.960236911495605e-06,
"loss": 0.016,
"step": 618
},
{
"epoch": 0.23814561892853708,
"grad_norm": 0.43766599893569946,
"learning_rate": 9.960083442664114e-06,
"loss": 0.0078,
"step": 619
},
{
"epoch": 0.23853034529191114,
"grad_norm": 6.031861782073975,
"learning_rate": 9.959929679427047e-06,
"loss": 0.0163,
"step": 620
},
{
"epoch": 0.23891507165528517,
"grad_norm": 0.5251021981239319,
"learning_rate": 9.959775621793528e-06,
"loss": 0.0101,
"step": 621
},
{
"epoch": 0.23929979801865922,
"grad_norm": 1.166603922843933,
"learning_rate": 9.959621269772704e-06,
"loss": 0.016,
"step": 622
},
{
"epoch": 0.23968452438203328,
"grad_norm": 1.8840091228485107,
"learning_rate": 9.959466623373732e-06,
"loss": 0.0244,
"step": 623
},
{
"epoch": 0.24006925074540733,
"grad_norm": 1.1539957523345947,
"learning_rate": 9.959311682605797e-06,
"loss": 0.0163,
"step": 624
},
{
"epoch": 0.24045397710878139,
"grad_norm": 0.8867762088775635,
"learning_rate": 9.959156447478091e-06,
"loss": 0.006,
"step": 625
},
{
"epoch": 0.24083870347215544,
"grad_norm": 1.2318607568740845,
"learning_rate": 9.959000917999831e-06,
"loss": 0.0096,
"step": 626
},
{
"epoch": 0.24122342983552947,
"grad_norm": 2.0298075675964355,
"learning_rate": 9.958845094180247e-06,
"loss": 0.023,
"step": 627
},
{
"epoch": 0.24160815619890352,
"grad_norm": 0.8165722489356995,
"learning_rate": 9.958688976028588e-06,
"loss": 0.0216,
"step": 628
},
{
"epoch": 0.24199288256227758,
"grad_norm": 0.2466529905796051,
"learning_rate": 9.958532563554119e-06,
"loss": 0.0041,
"step": 629
},
{
"epoch": 0.24237760892565163,
"grad_norm": 0.6395401358604431,
"learning_rate": 9.958375856766128e-06,
"loss": 0.0192,
"step": 630
},
{
"epoch": 0.2427623352890257,
"grad_norm": 1.0328831672668457,
"learning_rate": 9.95821885567391e-06,
"loss": 0.0236,
"step": 631
},
{
"epoch": 0.24314706165239974,
"grad_norm": 0.6815221905708313,
"learning_rate": 9.95806156028679e-06,
"loss": 0.0158,
"step": 632
},
{
"epoch": 0.24353178801577377,
"grad_norm": 0.6181775331497192,
"learning_rate": 9.9579039706141e-06,
"loss": 0.0087,
"step": 633
},
{
"epoch": 0.24391651437914783,
"grad_norm": 1.2611945867538452,
"learning_rate": 9.957746086665196e-06,
"loss": 0.0137,
"step": 634
},
{
"epoch": 0.24430124074252188,
"grad_norm": 0.1974414438009262,
"learning_rate": 9.957587908449448e-06,
"loss": 0.0035,
"step": 635
},
{
"epoch": 0.24468596710589594,
"grad_norm": 0.9455735683441162,
"learning_rate": 9.957429435976245e-06,
"loss": 0.0063,
"step": 636
},
{
"epoch": 0.24507069346927,
"grad_norm": 0.764569103717804,
"learning_rate": 9.957270669254994e-06,
"loss": 0.0133,
"step": 637
},
{
"epoch": 0.24545541983264405,
"grad_norm": 0.3373180031776428,
"learning_rate": 9.957111608295119e-06,
"loss": 0.0055,
"step": 638
},
{
"epoch": 0.24584014619601807,
"grad_norm": 0.24801532924175262,
"learning_rate": 9.956952253106059e-06,
"loss": 0.0056,
"step": 639
},
{
"epoch": 0.24622487255939213,
"grad_norm": 0.1746530830860138,
"learning_rate": 9.956792603697274e-06,
"loss": 0.003,
"step": 640
},
{
"epoch": 0.24660959892276618,
"grad_norm": 0.575825035572052,
"learning_rate": 9.956632660078239e-06,
"loss": 0.0099,
"step": 641
},
{
"epoch": 0.24699432528614024,
"grad_norm": 1.7585387229919434,
"learning_rate": 9.956472422258447e-06,
"loss": 0.0477,
"step": 642
},
{
"epoch": 0.2473790516495143,
"grad_norm": 1.1238574981689453,
"learning_rate": 9.956311890247411e-06,
"loss": 0.0328,
"step": 643
},
{
"epoch": 0.24776377801288832,
"grad_norm": 1.6307287216186523,
"learning_rate": 9.956151064054658e-06,
"loss": 0.0134,
"step": 644
},
{
"epoch": 0.24814850437626237,
"grad_norm": 0.32868483662605286,
"learning_rate": 9.955989943689734e-06,
"loss": 0.0017,
"step": 645
},
{
"epoch": 0.24853323073963643,
"grad_norm": 0.8028136491775513,
"learning_rate": 9.955828529162201e-06,
"loss": 0.0056,
"step": 646
},
{
"epoch": 0.24891795710301048,
"grad_norm": 1.284749984741211,
"learning_rate": 9.955666820481645e-06,
"loss": 0.0103,
"step": 647
},
{
"epoch": 0.24930268346638454,
"grad_norm": 1.407700777053833,
"learning_rate": 9.955504817657656e-06,
"loss": 0.0311,
"step": 648
},
{
"epoch": 0.2496874098297586,
"grad_norm": 0.9083035588264465,
"learning_rate": 9.955342520699856e-06,
"loss": 0.0156,
"step": 649
},
{
"epoch": 0.2500721361931326,
"grad_norm": 1.101758599281311,
"learning_rate": 9.955179929617875e-06,
"loss": 0.018,
"step": 650
},
{
"epoch": 0.2504568625565067,
"grad_norm": 2.050755023956299,
"learning_rate": 9.955017044421368e-06,
"loss": 0.0167,
"step": 651
},
{
"epoch": 0.25084158891988073,
"grad_norm": 0.8650450706481934,
"learning_rate": 9.954853865119996e-06,
"loss": 0.0108,
"step": 652
},
{
"epoch": 0.2512263152832548,
"grad_norm": 1.0544840097427368,
"learning_rate": 9.95469039172345e-06,
"loss": 0.0154,
"step": 653
},
{
"epoch": 0.25161104164662884,
"grad_norm": 5.561423301696777,
"learning_rate": 9.954526624241429e-06,
"loss": 0.0283,
"step": 654
},
{
"epoch": 0.2519957680100029,
"grad_norm": 1.3561756610870361,
"learning_rate": 9.954362562683658e-06,
"loss": 0.023,
"step": 655
},
{
"epoch": 0.25238049437337695,
"grad_norm": 1.6020667552947998,
"learning_rate": 9.954198207059872e-06,
"loss": 0.0175,
"step": 656
},
{
"epoch": 0.252765220736751,
"grad_norm": 0.3961346745491028,
"learning_rate": 9.954033557379826e-06,
"loss": 0.0073,
"step": 657
},
{
"epoch": 0.25314994710012506,
"grad_norm": 1.5307796001434326,
"learning_rate": 9.953868613653295e-06,
"loss": 0.0092,
"step": 658
},
{
"epoch": 0.25353467346349906,
"grad_norm": 1.0183442831039429,
"learning_rate": 9.953703375890067e-06,
"loss": 0.0445,
"step": 659
},
{
"epoch": 0.2539193998268731,
"grad_norm": 0.954535722732544,
"learning_rate": 9.95353784409995e-06,
"loss": 0.0095,
"step": 660
},
{
"epoch": 0.25430412619024717,
"grad_norm": 0.8993096351623535,
"learning_rate": 9.953372018292771e-06,
"loss": 0.0151,
"step": 661
},
{
"epoch": 0.2546888525536212,
"grad_norm": 0.7639009952545166,
"learning_rate": 9.95320589847837e-06,
"loss": 0.0082,
"step": 662
},
{
"epoch": 0.2550735789169953,
"grad_norm": 0.5023514032363892,
"learning_rate": 9.953039484666607e-06,
"loss": 0.0065,
"step": 663
},
{
"epoch": 0.25545830528036934,
"grad_norm": 0.5645049214363098,
"learning_rate": 9.952872776867365e-06,
"loss": 0.0074,
"step": 664
},
{
"epoch": 0.2558430316437434,
"grad_norm": 0.6301894783973694,
"learning_rate": 9.95270577509053e-06,
"loss": 0.0108,
"step": 665
},
{
"epoch": 0.25622775800711745,
"grad_norm": 0.7627872824668884,
"learning_rate": 9.952538479346022e-06,
"loss": 0.0235,
"step": 666
},
{
"epoch": 0.2566124843704915,
"grad_norm": 1.0123822689056396,
"learning_rate": 9.952370889643766e-06,
"loss": 0.0149,
"step": 667
},
{
"epoch": 0.25699721073386556,
"grad_norm": 0.6568852663040161,
"learning_rate": 9.952203005993713e-06,
"loss": 0.0112,
"step": 668
},
{
"epoch": 0.2573819370972396,
"grad_norm": 0.11951254308223724,
"learning_rate": 9.952034828405824e-06,
"loss": 0.0024,
"step": 669
},
{
"epoch": 0.25776666346061367,
"grad_norm": 1.049953579902649,
"learning_rate": 9.951866356890084e-06,
"loss": 0.0118,
"step": 670
},
{
"epoch": 0.25815138982398766,
"grad_norm": 0.5883866548538208,
"learning_rate": 9.951697591456493e-06,
"loss": 0.0048,
"step": 671
},
{
"epoch": 0.2585361161873617,
"grad_norm": 0.340031236410141,
"learning_rate": 9.951528532115065e-06,
"loss": 0.0086,
"step": 672
},
{
"epoch": 0.2589208425507358,
"grad_norm": 1.1816928386688232,
"learning_rate": 9.951359178875837e-06,
"loss": 0.0075,
"step": 673
},
{
"epoch": 0.25930556891410983,
"grad_norm": 1.2891496419906616,
"learning_rate": 9.95118953174886e-06,
"loss": 0.008,
"step": 674
},
{
"epoch": 0.2596902952774839,
"grad_norm": 0.6510321497917175,
"learning_rate": 9.951019590744203e-06,
"loss": 0.0109,
"step": 675
},
{
"epoch": 0.26007502164085794,
"grad_norm": 1.740363597869873,
"learning_rate": 9.950849355871954e-06,
"loss": 0.0136,
"step": 676
},
{
"epoch": 0.260459748004232,
"grad_norm": 0.9862236976623535,
"learning_rate": 9.950678827142218e-06,
"loss": 0.0126,
"step": 677
},
{
"epoch": 0.26084447436760605,
"grad_norm": 1.5269832611083984,
"learning_rate": 9.950508004565114e-06,
"loss": 0.0197,
"step": 678
},
{
"epoch": 0.2612292007309801,
"grad_norm": 0.9901081919670105,
"learning_rate": 9.950336888150781e-06,
"loss": 0.0091,
"step": 679
},
{
"epoch": 0.26161392709435416,
"grad_norm": 0.7796134352684021,
"learning_rate": 9.95016547790938e-06,
"loss": 0.008,
"step": 680
},
{
"epoch": 0.2619986534577282,
"grad_norm": 1.631371259689331,
"learning_rate": 9.949993773851082e-06,
"loss": 0.0254,
"step": 681
},
{
"epoch": 0.2623833798211022,
"grad_norm": 0.6020075678825378,
"learning_rate": 9.949821775986078e-06,
"loss": 0.0136,
"step": 682
},
{
"epoch": 0.26276810618447627,
"grad_norm": 0.6335283517837524,
"learning_rate": 9.949649484324579e-06,
"loss": 0.0145,
"step": 683
},
{
"epoch": 0.2631528325478503,
"grad_norm": 1.1868330240249634,
"learning_rate": 9.949476898876808e-06,
"loss": 0.014,
"step": 684
},
{
"epoch": 0.2635375589112244,
"grad_norm": 0.5912204384803772,
"learning_rate": 9.949304019653012e-06,
"loss": 0.0053,
"step": 685
},
{
"epoch": 0.26392228527459843,
"grad_norm": 0.32141926884651184,
"learning_rate": 9.949130846663451e-06,
"loss": 0.0041,
"step": 686
},
{
"epoch": 0.2643070116379725,
"grad_norm": 0.2612176835536957,
"learning_rate": 9.948957379918405e-06,
"loss": 0.0041,
"step": 687
},
{
"epoch": 0.26469173800134654,
"grad_norm": 1.4371662139892578,
"learning_rate": 9.948783619428168e-06,
"loss": 0.0122,
"step": 688
},
{
"epoch": 0.2650764643647206,
"grad_norm": 0.8866683840751648,
"learning_rate": 9.948609565203054e-06,
"loss": 0.0147,
"step": 689
},
{
"epoch": 0.26546119072809465,
"grad_norm": 0.2633998692035675,
"learning_rate": 9.948435217253394e-06,
"loss": 0.0022,
"step": 690
},
{
"epoch": 0.2658459170914687,
"grad_norm": 0.6015472412109375,
"learning_rate": 9.948260575589538e-06,
"loss": 0.0134,
"step": 691
},
{
"epoch": 0.26623064345484276,
"grad_norm": 0.7728354930877686,
"learning_rate": 9.94808564022185e-06,
"loss": 0.0286,
"step": 692
},
{
"epoch": 0.2666153698182168,
"grad_norm": 1.7141587734222412,
"learning_rate": 9.947910411160715e-06,
"loss": 0.0105,
"step": 693
},
{
"epoch": 0.2670000961815908,
"grad_norm": 0.2638167142868042,
"learning_rate": 9.947734888416532e-06,
"loss": 0.0143,
"step": 694
},
{
"epoch": 0.2673848225449649,
"grad_norm": 3.141570806503296,
"learning_rate": 9.947559071999719e-06,
"loss": 0.0379,
"step": 695
},
{
"epoch": 0.26776954890833893,
"grad_norm": 2.544487953186035,
"learning_rate": 9.947382961920713e-06,
"loss": 0.028,
"step": 696
},
{
"epoch": 0.268154275271713,
"grad_norm": 0.16364744305610657,
"learning_rate": 9.947206558189967e-06,
"loss": 0.0024,
"step": 697
},
{
"epoch": 0.26853900163508704,
"grad_norm": 0.9908127188682556,
"learning_rate": 9.94702986081795e-06,
"loss": 0.0092,
"step": 698
},
{
"epoch": 0.2689237279984611,
"grad_norm": 1.1755777597427368,
"learning_rate": 9.946852869815152e-06,
"loss": 0.0178,
"step": 699
},
{
"epoch": 0.26930845436183515,
"grad_norm": 1.6439647674560547,
"learning_rate": 9.946675585192076e-06,
"loss": 0.0174,
"step": 700
},
{
"epoch": 0.2696931807252092,
"grad_norm": 1.5522323846817017,
"learning_rate": 9.946498006959246e-06,
"loss": 0.0265,
"step": 701
},
{
"epoch": 0.27007790708858326,
"grad_norm": 0.6794743537902832,
"learning_rate": 9.946320135127203e-06,
"loss": 0.022,
"step": 702
},
{
"epoch": 0.2704626334519573,
"grad_norm": 0.8453839421272278,
"learning_rate": 9.946141969706501e-06,
"loss": 0.0339,
"step": 703
},
{
"epoch": 0.27084735981533137,
"grad_norm": 2.207174301147461,
"learning_rate": 9.94596351070772e-06,
"loss": 0.0112,
"step": 704
},
{
"epoch": 0.2712320861787054,
"grad_norm": 0.6483219265937805,
"learning_rate": 9.945784758141449e-06,
"loss": 0.0156,
"step": 705
},
{
"epoch": 0.2716168125420794,
"grad_norm": 1.7637494802474976,
"learning_rate": 9.9456057120183e-06,
"loss": 0.0187,
"step": 706
},
{
"epoch": 0.2720015389054535,
"grad_norm": 0.9741355776786804,
"learning_rate": 9.945426372348896e-06,
"loss": 0.014,
"step": 707
},
{
"epoch": 0.27238626526882753,
"grad_norm": 0.42849379777908325,
"learning_rate": 9.945246739143888e-06,
"loss": 0.0147,
"step": 708
},
{
"epoch": 0.2727709916322016,
"grad_norm": 0.6075026392936707,
"learning_rate": 9.945066812413932e-06,
"loss": 0.0105,
"step": 709
},
{
"epoch": 0.27315571799557564,
"grad_norm": 0.5715228319168091,
"learning_rate": 9.944886592169712e-06,
"loss": 0.0049,
"step": 710
},
{
"epoch": 0.2735404443589497,
"grad_norm": 0.5022349953651428,
"learning_rate": 9.944706078421923e-06,
"loss": 0.0154,
"step": 711
},
{
"epoch": 0.27392517072232375,
"grad_norm": 3.6961655616760254,
"learning_rate": 9.94452527118128e-06,
"loss": 0.1265,
"step": 712
},
{
"epoch": 0.2743098970856978,
"grad_norm": 0.6201106905937195,
"learning_rate": 9.944344170458516e-06,
"loss": 0.0051,
"step": 713
},
{
"epoch": 0.27469462344907186,
"grad_norm": 0.5065118670463562,
"learning_rate": 9.944162776264376e-06,
"loss": 0.0041,
"step": 714
},
{
"epoch": 0.2750793498124459,
"grad_norm": 0.09867949038743973,
"learning_rate": 9.94398108860963e-06,
"loss": 0.001,
"step": 715
},
{
"epoch": 0.27546407617581997,
"grad_norm": 0.9551201462745667,
"learning_rate": 9.943799107505063e-06,
"loss": 0.0142,
"step": 716
},
{
"epoch": 0.27584880253919397,
"grad_norm": 1.8438011407852173,
"learning_rate": 9.943616832961475e-06,
"loss": 0.0185,
"step": 717
},
{
"epoch": 0.276233528902568,
"grad_norm": 1.110871434211731,
"learning_rate": 9.943434264989684e-06,
"loss": 0.0099,
"step": 718
},
{
"epoch": 0.2766182552659421,
"grad_norm": 0.585672914981842,
"learning_rate": 9.943251403600526e-06,
"loss": 0.0171,
"step": 719
},
{
"epoch": 0.27700298162931614,
"grad_norm": 0.41809943318367004,
"learning_rate": 9.94306824880486e-06,
"loss": 0.0034,
"step": 720
},
{
"epoch": 0.2773877079926902,
"grad_norm": 2.4627175331115723,
"learning_rate": 9.94288480061355e-06,
"loss": 0.0175,
"step": 721
},
{
"epoch": 0.27777243435606425,
"grad_norm": 0.7623758316040039,
"learning_rate": 9.942701059037487e-06,
"loss": 0.0114,
"step": 722
},
{
"epoch": 0.2781571607194383,
"grad_norm": 0.40503352880477905,
"learning_rate": 9.942517024087579e-06,
"loss": 0.0042,
"step": 723
},
{
"epoch": 0.27854188708281236,
"grad_norm": 1.7551190853118896,
"learning_rate": 9.942332695774747e-06,
"loss": 0.0115,
"step": 724
},
{
"epoch": 0.2789266134461864,
"grad_norm": 0.7830015420913696,
"learning_rate": 9.942148074109934e-06,
"loss": 0.0067,
"step": 725
},
{
"epoch": 0.27931133980956047,
"grad_norm": 2.3621063232421875,
"learning_rate": 9.941963159104095e-06,
"loss": 0.0261,
"step": 726
},
{
"epoch": 0.2796960661729345,
"grad_norm": 1.2633928060531616,
"learning_rate": 9.94177795076821e-06,
"loss": 0.0084,
"step": 727
},
{
"epoch": 0.2800807925363086,
"grad_norm": 0.08101649582386017,
"learning_rate": 9.941592449113268e-06,
"loss": 0.0008,
"step": 728
},
{
"epoch": 0.2804655188996826,
"grad_norm": 2.2081878185272217,
"learning_rate": 9.941406654150283e-06,
"loss": 0.0216,
"step": 729
},
{
"epoch": 0.28085024526305663,
"grad_norm": 2.1128742694854736,
"learning_rate": 9.94122056589028e-06,
"loss": 0.0111,
"step": 730
},
{
"epoch": 0.2812349716264307,
"grad_norm": 0.06335221230983734,
"learning_rate": 9.941034184344305e-06,
"loss": 0.0011,
"step": 731
},
{
"epoch": 0.28161969798980474,
"grad_norm": 0.8481754064559937,
"learning_rate": 9.940847509523422e-06,
"loss": 0.0065,
"step": 732
},
{
"epoch": 0.2820044243531788,
"grad_norm": 1.116052269935608,
"learning_rate": 9.940660541438708e-06,
"loss": 0.0156,
"step": 733
},
{
"epoch": 0.28238915071655285,
"grad_norm": 1.1566013097763062,
"learning_rate": 9.940473280101263e-06,
"loss": 0.0182,
"step": 734
},
{
"epoch": 0.2827738770799269,
"grad_norm": 2.7523629665374756,
"learning_rate": 9.940285725522203e-06,
"loss": 0.0417,
"step": 735
},
{
"epoch": 0.28315860344330096,
"grad_norm": 2.2533061504364014,
"learning_rate": 9.940097877712659e-06,
"loss": 0.027,
"step": 736
},
{
"epoch": 0.283543329806675,
"grad_norm": 1.0717406272888184,
"learning_rate": 9.939909736683778e-06,
"loss": 0.0092,
"step": 737
},
{
"epoch": 0.28392805617004907,
"grad_norm": 0.7087541818618774,
"learning_rate": 9.93972130244673e-06,
"loss": 0.0081,
"step": 738
},
{
"epoch": 0.2843127825334231,
"grad_norm": 1.6095730066299438,
"learning_rate": 9.939532575012698e-06,
"loss": 0.0172,
"step": 739
},
{
"epoch": 0.2846975088967972,
"grad_norm": 1.3097370862960815,
"learning_rate": 9.939343554392887e-06,
"loss": 0.0129,
"step": 740
},
{
"epoch": 0.2850822352601712,
"grad_norm": 0.4617692828178406,
"learning_rate": 9.939154240598513e-06,
"loss": 0.0056,
"step": 741
},
{
"epoch": 0.28546696162354523,
"grad_norm": 0.5378155708312988,
"learning_rate": 9.938964633640815e-06,
"loss": 0.0061,
"step": 742
},
{
"epoch": 0.2858516879869193,
"grad_norm": 0.8007967472076416,
"learning_rate": 9.938774733531045e-06,
"loss": 0.0115,
"step": 743
},
{
"epoch": 0.28623641435029334,
"grad_norm": 1.1707913875579834,
"learning_rate": 9.938584540280477e-06,
"loss": 0.0214,
"step": 744
},
{
"epoch": 0.2866211407136674,
"grad_norm": 0.751804769039154,
"learning_rate": 9.938394053900396e-06,
"loss": 0.006,
"step": 745
},
{
"epoch": 0.28700586707704145,
"grad_norm": 0.6133852005004883,
"learning_rate": 9.938203274402113e-06,
"loss": 0.0175,
"step": 746
},
{
"epoch": 0.2873905934404155,
"grad_norm": 1.4026976823806763,
"learning_rate": 9.938012201796948e-06,
"loss": 0.0157,
"step": 747
},
{
"epoch": 0.28777531980378956,
"grad_norm": 0.9868363738059998,
"learning_rate": 9.937820836096244e-06,
"loss": 0.0076,
"step": 748
},
{
"epoch": 0.2881600461671636,
"grad_norm": 0.4354103207588196,
"learning_rate": 9.937629177311359e-06,
"loss": 0.0065,
"step": 749
},
{
"epoch": 0.2885447725305377,
"grad_norm": 0.6645623445510864,
"learning_rate": 9.937437225453669e-06,
"loss": 0.0067,
"step": 750
},
{
"epoch": 0.28892949889391173,
"grad_norm": 1.250994086265564,
"learning_rate": 9.937244980534568e-06,
"loss": 0.0182,
"step": 751
},
{
"epoch": 0.28931422525728573,
"grad_norm": 0.5992980003356934,
"learning_rate": 9.937052442565464e-06,
"loss": 0.0033,
"step": 752
},
{
"epoch": 0.2896989516206598,
"grad_norm": 0.38558635115623474,
"learning_rate": 9.93685961155779e-06,
"loss": 0.0051,
"step": 753
},
{
"epoch": 0.29008367798403384,
"grad_norm": 7.663323879241943,
"learning_rate": 9.936666487522985e-06,
"loss": 0.0408,
"step": 754
},
{
"epoch": 0.2904684043474079,
"grad_norm": 0.9667056202888489,
"learning_rate": 9.93647307047252e-06,
"loss": 0.0104,
"step": 755
},
{
"epoch": 0.29085313071078195,
"grad_norm": 1.2200483083724976,
"learning_rate": 9.936279360417866e-06,
"loss": 0.0277,
"step": 756
},
{
"epoch": 0.291237857074156,
"grad_norm": 0.49670878052711487,
"learning_rate": 9.93608535737053e-06,
"loss": 0.0094,
"step": 757
},
{
"epoch": 0.29162258343753006,
"grad_norm": 1.3144538402557373,
"learning_rate": 9.935891061342017e-06,
"loss": 0.0274,
"step": 758
},
{
"epoch": 0.2920073098009041,
"grad_norm": 0.2239675670862198,
"learning_rate": 9.935696472343867e-06,
"loss": 0.0035,
"step": 759
},
{
"epoch": 0.29239203616427817,
"grad_norm": 3.239274740219116,
"learning_rate": 9.935501590387629e-06,
"loss": 0.0383,
"step": 760
},
{
"epoch": 0.2927767625276522,
"grad_norm": 0.9120214581489563,
"learning_rate": 9.935306415484868e-06,
"loss": 0.0074,
"step": 761
},
{
"epoch": 0.2931614888910263,
"grad_norm": 0.6452634334564209,
"learning_rate": 9.935110947647168e-06,
"loss": 0.0117,
"step": 762
},
{
"epoch": 0.29354621525440033,
"grad_norm": 0.6486735343933105,
"learning_rate": 9.934915186886136e-06,
"loss": 0.0061,
"step": 763
},
{
"epoch": 0.29393094161777433,
"grad_norm": 1.0978666543960571,
"learning_rate": 9.934719133213383e-06,
"loss": 0.0099,
"step": 764
},
{
"epoch": 0.2943156679811484,
"grad_norm": 0.5332254767417908,
"learning_rate": 9.934522786640555e-06,
"loss": 0.0091,
"step": 765
},
{
"epoch": 0.29470039434452244,
"grad_norm": 1.3437743186950684,
"learning_rate": 9.9343261471793e-06,
"loss": 0.0092,
"step": 766
},
{
"epoch": 0.2950851207078965,
"grad_norm": 1.3608458042144775,
"learning_rate": 9.93412921484129e-06,
"loss": 0.0118,
"step": 767
},
{
"epoch": 0.29546984707127055,
"grad_norm": 3.0785293579101562,
"learning_rate": 9.933931989638216e-06,
"loss": 0.0292,
"step": 768
},
{
"epoch": 0.2958545734346446,
"grad_norm": 2.402841806411743,
"learning_rate": 9.933734471581784e-06,
"loss": 0.0345,
"step": 769
},
{
"epoch": 0.29623929979801866,
"grad_norm": 3.3671507835388184,
"learning_rate": 9.933536660683718e-06,
"loss": 0.0718,
"step": 770
},
{
"epoch": 0.2966240261613927,
"grad_norm": 0.6962792277336121,
"learning_rate": 9.933338556955756e-06,
"loss": 0.0066,
"step": 771
},
{
"epoch": 0.2970087525247668,
"grad_norm": 1.287801742553711,
"learning_rate": 9.933140160409659e-06,
"loss": 0.0117,
"step": 772
},
{
"epoch": 0.2973934788881408,
"grad_norm": 0.6097240447998047,
"learning_rate": 9.932941471057202e-06,
"loss": 0.0112,
"step": 773
},
{
"epoch": 0.2977782052515149,
"grad_norm": 0.5029175281524658,
"learning_rate": 9.93274248891018e-06,
"loss": 0.0096,
"step": 774
},
{
"epoch": 0.29816293161488894,
"grad_norm": 0.8087600469589233,
"learning_rate": 9.932543213980402e-06,
"loss": 0.0059,
"step": 775
},
{
"epoch": 0.29854765797826294,
"grad_norm": 1.2778784036636353,
"learning_rate": 9.932343646279697e-06,
"loss": 0.0087,
"step": 776
},
{
"epoch": 0.298932384341637,
"grad_norm": 2.181772470474243,
"learning_rate": 9.932143785819908e-06,
"loss": 0.0397,
"step": 777
},
{
"epoch": 0.29931711070501105,
"grad_norm": 2.003788709640503,
"learning_rate": 9.931943632612897e-06,
"loss": 0.0455,
"step": 778
},
{
"epoch": 0.2997018370683851,
"grad_norm": 2.2337660789489746,
"learning_rate": 9.93174318667055e-06,
"loss": 0.0294,
"step": 779
},
{
"epoch": 0.30008656343175916,
"grad_norm": 1.9997296333312988,
"learning_rate": 9.93154244800476e-06,
"loss": 0.0365,
"step": 780
},
{
"epoch": 0.3004712897951332,
"grad_norm": 1.1483068466186523,
"learning_rate": 9.931341416627443e-06,
"loss": 0.0236,
"step": 781
},
{
"epoch": 0.30085601615850727,
"grad_norm": 0.22705046832561493,
"learning_rate": 9.931140092550528e-06,
"loss": 0.0022,
"step": 782
},
{
"epoch": 0.3012407425218813,
"grad_norm": 1.8457714319229126,
"learning_rate": 9.93093847578597e-06,
"loss": 0.0226,
"step": 783
},
{
"epoch": 0.3016254688852554,
"grad_norm": 1.0760217905044556,
"learning_rate": 9.930736566345732e-06,
"loss": 0.0051,
"step": 784
},
{
"epoch": 0.30201019524862943,
"grad_norm": 0.5447891354560852,
"learning_rate": 9.930534364241801e-06,
"loss": 0.0029,
"step": 785
},
{
"epoch": 0.3023949216120035,
"grad_norm": 1.1627178192138672,
"learning_rate": 9.930331869486176e-06,
"loss": 0.0058,
"step": 786
},
{
"epoch": 0.3027796479753775,
"grad_norm": 0.5428863167762756,
"learning_rate": 9.930129082090878e-06,
"loss": 0.0084,
"step": 787
},
{
"epoch": 0.30316437433875154,
"grad_norm": 0.6576860547065735,
"learning_rate": 9.929926002067944e-06,
"loss": 0.0056,
"step": 788
},
{
"epoch": 0.3035491007021256,
"grad_norm": 0.17010360956192017,
"learning_rate": 9.929722629429425e-06,
"loss": 0.0032,
"step": 789
},
{
"epoch": 0.30393382706549965,
"grad_norm": 0.7153604626655579,
"learning_rate": 9.929518964187395e-06,
"loss": 0.0183,
"step": 790
},
{
"epoch": 0.3043185534288737,
"grad_norm": 0.2032863050699234,
"learning_rate": 9.92931500635394e-06,
"loss": 0.0014,
"step": 791
},
{
"epoch": 0.30470327979224776,
"grad_norm": 0.4419695734977722,
"learning_rate": 9.929110755941168e-06,
"loss": 0.0032,
"step": 792
},
{
"epoch": 0.3050880061556218,
"grad_norm": 1.048749327659607,
"learning_rate": 9.928906212961202e-06,
"loss": 0.0072,
"step": 793
},
{
"epoch": 0.30547273251899587,
"grad_norm": 0.09899991005659103,
"learning_rate": 9.928701377426182e-06,
"loss": 0.0012,
"step": 794
},
{
"epoch": 0.3058574588823699,
"grad_norm": 0.7091645002365112,
"learning_rate": 9.928496249348265e-06,
"loss": 0.0268,
"step": 795
},
{
"epoch": 0.306242185245744,
"grad_norm": 0.4991755485534668,
"learning_rate": 9.928290828739631e-06,
"loss": 0.0064,
"step": 796
},
{
"epoch": 0.30662691160911804,
"grad_norm": 0.48444482684135437,
"learning_rate": 9.928085115612465e-06,
"loss": 0.0037,
"step": 797
},
{
"epoch": 0.3070116379724921,
"grad_norm": 0.4473024606704712,
"learning_rate": 9.927879109978984e-06,
"loss": 0.0049,
"step": 798
},
{
"epoch": 0.3073963643358661,
"grad_norm": 0.7630549669265747,
"learning_rate": 9.927672811851412e-06,
"loss": 0.015,
"step": 799
},
{
"epoch": 0.30778109069924015,
"grad_norm": 0.3428303301334381,
"learning_rate": 9.927466221241995e-06,
"loss": 0.0037,
"step": 800
},
{
"epoch": 0.3081658170626142,
"grad_norm": 1.1008957624435425,
"learning_rate": 9.927259338162995e-06,
"loss": 0.0114,
"step": 801
},
{
"epoch": 0.30855054342598826,
"grad_norm": 0.953392505645752,
"learning_rate": 9.927052162626693e-06,
"loss": 0.0061,
"step": 802
},
{
"epoch": 0.3089352697893623,
"grad_norm": 0.8205415606498718,
"learning_rate": 9.926844694645382e-06,
"loss": 0.0109,
"step": 803
},
{
"epoch": 0.30931999615273637,
"grad_norm": 0.1952117681503296,
"learning_rate": 9.92663693423138e-06,
"loss": 0.0016,
"step": 804
},
{
"epoch": 0.3097047225161104,
"grad_norm": 0.8701672554016113,
"learning_rate": 9.926428881397015e-06,
"loss": 0.0074,
"step": 805
},
{
"epoch": 0.3100894488794845,
"grad_norm": 0.9610002040863037,
"learning_rate": 9.92622053615464e-06,
"loss": 0.0093,
"step": 806
},
{
"epoch": 0.31047417524285853,
"grad_norm": 0.9083012938499451,
"learning_rate": 9.926011898516619e-06,
"loss": 0.0211,
"step": 807
},
{
"epoch": 0.3108589016062326,
"grad_norm": 0.12275370955467224,
"learning_rate": 9.925802968495337e-06,
"loss": 0.0022,
"step": 808
},
{
"epoch": 0.31124362796960664,
"grad_norm": 0.48980191349983215,
"learning_rate": 9.925593746103193e-06,
"loss": 0.0033,
"step": 809
},
{
"epoch": 0.3116283543329807,
"grad_norm": 1.5572385787963867,
"learning_rate": 9.925384231352607e-06,
"loss": 0.0246,
"step": 810
},
{
"epoch": 0.3120130806963547,
"grad_norm": 2.358025312423706,
"learning_rate": 9.925174424256015e-06,
"loss": 0.034,
"step": 811
},
{
"epoch": 0.31239780705972875,
"grad_norm": 4.787092685699463,
"learning_rate": 9.924964324825867e-06,
"loss": 0.0058,
"step": 812
},
{
"epoch": 0.3127825334231028,
"grad_norm": 0.6324902176856995,
"learning_rate": 9.924753933074637e-06,
"loss": 0.0055,
"step": 813
},
{
"epoch": 0.31316725978647686,
"grad_norm": 1.22360360622406,
"learning_rate": 9.924543249014814e-06,
"loss": 0.0376,
"step": 814
},
{
"epoch": 0.3135519861498509,
"grad_norm": 0.9349619746208191,
"learning_rate": 9.9243322726589e-06,
"loss": 0.0217,
"step": 815
},
{
"epoch": 0.31393671251322497,
"grad_norm": 1.0631873607635498,
"learning_rate": 9.924121004019416e-06,
"loss": 0.0106,
"step": 816
},
{
"epoch": 0.314321438876599,
"grad_norm": 0.6873127222061157,
"learning_rate": 9.923909443108906e-06,
"loss": 0.015,
"step": 817
},
{
"epoch": 0.3147061652399731,
"grad_norm": 1.320493459701538,
"learning_rate": 9.923697589939925e-06,
"loss": 0.0195,
"step": 818
},
{
"epoch": 0.31509089160334713,
"grad_norm": 1.1849138736724854,
"learning_rate": 9.923485444525047e-06,
"loss": 0.017,
"step": 819
},
{
"epoch": 0.3154756179667212,
"grad_norm": 0.3958847224712372,
"learning_rate": 9.923273006876865e-06,
"loss": 0.0108,
"step": 820
},
{
"epoch": 0.31586034433009524,
"grad_norm": 1.8317941427230835,
"learning_rate": 9.923060277007987e-06,
"loss": 0.0222,
"step": 821
},
{
"epoch": 0.31624507069346924,
"grad_norm": 0.6959751844406128,
"learning_rate": 9.922847254931043e-06,
"loss": 0.0132,
"step": 822
},
{
"epoch": 0.3166297970568433,
"grad_norm": 0.4210129976272583,
"learning_rate": 9.922633940658674e-06,
"loss": 0.0056,
"step": 823
},
{
"epoch": 0.31701452342021735,
"grad_norm": 0.5277615785598755,
"learning_rate": 9.922420334203539e-06,
"loss": 0.0107,
"step": 824
},
{
"epoch": 0.3173992497835914,
"grad_norm": 1.4433057308197021,
"learning_rate": 9.922206435578324e-06,
"loss": 0.0203,
"step": 825
},
{
"epoch": 0.31778397614696546,
"grad_norm": 0.3783744275569916,
"learning_rate": 9.921992244795716e-06,
"loss": 0.003,
"step": 826
},
{
"epoch": 0.3181687025103395,
"grad_norm": 0.7307786345481873,
"learning_rate": 9.921777761868434e-06,
"loss": 0.0119,
"step": 827
},
{
"epoch": 0.3185534288737136,
"grad_norm": 0.5341587066650391,
"learning_rate": 9.921562986809207e-06,
"loss": 0.0075,
"step": 828
},
{
"epoch": 0.31893815523708763,
"grad_norm": 0.9260216355323792,
"learning_rate": 9.921347919630784e-06,
"loss": 0.0315,
"step": 829
},
{
"epoch": 0.3193228816004617,
"grad_norm": 1.7915147542953491,
"learning_rate": 9.92113256034593e-06,
"loss": 0.0187,
"step": 830
},
{
"epoch": 0.31970760796383574,
"grad_norm": 0.3053416609764099,
"learning_rate": 9.920916908967424e-06,
"loss": 0.0114,
"step": 831
},
{
"epoch": 0.3200923343272098,
"grad_norm": 1.0637885332107544,
"learning_rate": 9.920700965508072e-06,
"loss": 0.0129,
"step": 832
},
{
"epoch": 0.32047706069058385,
"grad_norm": 0.7841233015060425,
"learning_rate": 9.920484729980689e-06,
"loss": 0.0066,
"step": 833
},
{
"epoch": 0.32086178705395785,
"grad_norm": 0.5703051090240479,
"learning_rate": 9.920268202398107e-06,
"loss": 0.0138,
"step": 834
},
{
"epoch": 0.3212465134173319,
"grad_norm": 0.6354265213012695,
"learning_rate": 9.920051382773179e-06,
"loss": 0.0127,
"step": 835
},
{
"epoch": 0.32163123978070596,
"grad_norm": 0.2680123448371887,
"learning_rate": 9.919834271118778e-06,
"loss": 0.0023,
"step": 836
},
{
"epoch": 0.32201596614408,
"grad_norm": 0.36873406171798706,
"learning_rate": 9.919616867447786e-06,
"loss": 0.0082,
"step": 837
},
{
"epoch": 0.32240069250745407,
"grad_norm": 0.748493492603302,
"learning_rate": 9.91939917177311e-06,
"loss": 0.0138,
"step": 838
},
{
"epoch": 0.3227854188708281,
"grad_norm": 1.3479591608047485,
"learning_rate": 9.91918118410767e-06,
"loss": 0.0135,
"step": 839
},
{
"epoch": 0.3231701452342022,
"grad_norm": 0.6877148747444153,
"learning_rate": 9.918962904464406e-06,
"loss": 0.025,
"step": 840
},
{
"epoch": 0.32355487159757623,
"grad_norm": 0.6824389696121216,
"learning_rate": 9.918744332856273e-06,
"loss": 0.0074,
"step": 841
},
{
"epoch": 0.3239395979609503,
"grad_norm": 0.2313074767589569,
"learning_rate": 9.918525469296243e-06,
"loss": 0.0046,
"step": 842
},
{
"epoch": 0.32432432432432434,
"grad_norm": 0.27155447006225586,
"learning_rate": 9.918306313797309e-06,
"loss": 0.003,
"step": 843
},
{
"epoch": 0.3247090506876984,
"grad_norm": 1.2494877576828003,
"learning_rate": 9.918086866372475e-06,
"loss": 0.015,
"step": 844
},
{
"epoch": 0.32509377705107245,
"grad_norm": 0.7449502944946289,
"learning_rate": 9.917867127034773e-06,
"loss": 0.0179,
"step": 845
},
{
"epoch": 0.32547850341444645,
"grad_norm": 0.2732839584350586,
"learning_rate": 9.917647095797241e-06,
"loss": 0.0022,
"step": 846
},
{
"epoch": 0.3258632297778205,
"grad_norm": 1.2635875940322876,
"learning_rate": 9.917426772672938e-06,
"loss": 0.0155,
"step": 847
},
{
"epoch": 0.32624795614119456,
"grad_norm": 0.8429836630821228,
"learning_rate": 9.917206157674943e-06,
"loss": 0.0092,
"step": 848
},
{
"epoch": 0.3266326825045686,
"grad_norm": 1.1740093231201172,
"learning_rate": 9.916985250816351e-06,
"loss": 0.0163,
"step": 849
},
{
"epoch": 0.32701740886794267,
"grad_norm": 1.1592316627502441,
"learning_rate": 9.916764052110274e-06,
"loss": 0.015,
"step": 850
},
{
"epoch": 0.3274021352313167,
"grad_norm": 0.21816864609718323,
"learning_rate": 9.916542561569843e-06,
"loss": 0.0027,
"step": 851
},
{
"epoch": 0.3277868615946908,
"grad_norm": 0.8437023758888245,
"learning_rate": 9.916320779208199e-06,
"loss": 0.0039,
"step": 852
},
{
"epoch": 0.32817158795806484,
"grad_norm": 0.7825914621353149,
"learning_rate": 9.91609870503851e-06,
"loss": 0.0086,
"step": 853
},
{
"epoch": 0.3285563143214389,
"grad_norm": 0.21168985962867737,
"learning_rate": 9.915876339073955e-06,
"loss": 0.0045,
"step": 854
},
{
"epoch": 0.32894104068481295,
"grad_norm": 0.4206426441669464,
"learning_rate": 9.915653681327736e-06,
"loss": 0.0077,
"step": 855
},
{
"epoch": 0.329325767048187,
"grad_norm": 0.9891207814216614,
"learning_rate": 9.915430731813067e-06,
"loss": 0.0086,
"step": 856
},
{
"epoch": 0.329710493411561,
"grad_norm": 0.5929882526397705,
"learning_rate": 9.915207490543179e-06,
"loss": 0.0101,
"step": 857
},
{
"epoch": 0.33009521977493506,
"grad_norm": 0.8196332454681396,
"learning_rate": 9.914983957531327e-06,
"loss": 0.027,
"step": 858
},
{
"epoch": 0.3304799461383091,
"grad_norm": 0.8502101302146912,
"learning_rate": 9.914760132790776e-06,
"loss": 0.0055,
"step": 859
},
{
"epoch": 0.33086467250168317,
"grad_norm": 0.08041344583034515,
"learning_rate": 9.914536016334808e-06,
"loss": 0.0005,
"step": 860
},
{
"epoch": 0.3312493988650572,
"grad_norm": 0.30731120705604553,
"learning_rate": 9.914311608176732e-06,
"loss": 0.0019,
"step": 861
},
{
"epoch": 0.3316341252284313,
"grad_norm": 1.0212372541427612,
"learning_rate": 9.914086908329863e-06,
"loss": 0.0069,
"step": 862
},
{
"epoch": 0.33201885159180533,
"grad_norm": 0.44622012972831726,
"learning_rate": 9.913861916807539e-06,
"loss": 0.0017,
"step": 863
},
{
"epoch": 0.3324035779551794,
"grad_norm": 0.5307901501655579,
"learning_rate": 9.913636633623116e-06,
"loss": 0.003,
"step": 864
},
{
"epoch": 0.33278830431855344,
"grad_norm": 0.32324591279029846,
"learning_rate": 9.913411058789964e-06,
"loss": 0.0009,
"step": 865
},
{
"epoch": 0.3331730306819275,
"grad_norm": 5.367404937744141,
"learning_rate": 9.913185192321473e-06,
"loss": 0.0318,
"step": 866
},
{
"epoch": 0.33355775704530155,
"grad_norm": 1.1971156597137451,
"learning_rate": 9.912959034231049e-06,
"loss": 0.0189,
"step": 867
},
{
"epoch": 0.3339424834086756,
"grad_norm": 1.5217525959014893,
"learning_rate": 9.912732584532114e-06,
"loss": 0.0178,
"step": 868
},
{
"epoch": 0.3343272097720496,
"grad_norm": 0.824066698551178,
"learning_rate": 9.912505843238112e-06,
"loss": 0.0118,
"step": 869
},
{
"epoch": 0.33471193613542366,
"grad_norm": 1.85015070438385,
"learning_rate": 9.912278810362499e-06,
"loss": 0.0351,
"step": 870
},
{
"epoch": 0.3350966624987977,
"grad_norm": 1.2828856706619263,
"learning_rate": 9.912051485918752e-06,
"loss": 0.0118,
"step": 871
},
{
"epoch": 0.33548138886217177,
"grad_norm": 1.711953043937683,
"learning_rate": 9.911823869920362e-06,
"loss": 0.0453,
"step": 872
},
{
"epoch": 0.3358661152255458,
"grad_norm": 0.9143729209899902,
"learning_rate": 9.91159596238084e-06,
"loss": 0.0303,
"step": 873
},
{
"epoch": 0.3362508415889199,
"grad_norm": 1.732001543045044,
"learning_rate": 9.911367763313713e-06,
"loss": 0.043,
"step": 874
},
{
"epoch": 0.33663556795229393,
"grad_norm": 0.790520429611206,
"learning_rate": 9.911139272732528e-06,
"loss": 0.0114,
"step": 875
},
{
"epoch": 0.337020294315668,
"grad_norm": 1.3358290195465088,
"learning_rate": 9.910910490650844e-06,
"loss": 0.0199,
"step": 876
},
{
"epoch": 0.33740502067904204,
"grad_norm": 0.4768337309360504,
"learning_rate": 9.910681417082241e-06,
"loss": 0.0097,
"step": 877
},
{
"epoch": 0.3377897470424161,
"grad_norm": 1.01479971408844,
"learning_rate": 9.910452052040318e-06,
"loss": 0.0079,
"step": 878
},
{
"epoch": 0.33817447340579015,
"grad_norm": 0.8050641417503357,
"learning_rate": 9.910222395538686e-06,
"loss": 0.0216,
"step": 879
},
{
"epoch": 0.3385591997691642,
"grad_norm": 1.4165840148925781,
"learning_rate": 9.90999244759098e-06,
"loss": 0.0175,
"step": 880
},
{
"epoch": 0.3389439261325382,
"grad_norm": 0.4710328280925751,
"learning_rate": 9.909762208210843e-06,
"loss": 0.0062,
"step": 881
},
{
"epoch": 0.33932865249591226,
"grad_norm": 0.6688194870948792,
"learning_rate": 9.909531677411945e-06,
"loss": 0.0177,
"step": 882
},
{
"epoch": 0.3397133788592863,
"grad_norm": 0.6211721301078796,
"learning_rate": 9.909300855207969e-06,
"loss": 0.012,
"step": 883
},
{
"epoch": 0.3400981052226604,
"grad_norm": 0.8269939422607422,
"learning_rate": 9.909069741612614e-06,
"loss": 0.0116,
"step": 884
},
{
"epoch": 0.34048283158603443,
"grad_norm": 1.0314085483551025,
"learning_rate": 9.908838336639597e-06,
"loss": 0.0155,
"step": 885
},
{
"epoch": 0.3408675579494085,
"grad_norm": 1.0547674894332886,
"learning_rate": 9.908606640302656e-06,
"loss": 0.0167,
"step": 886
},
{
"epoch": 0.34125228431278254,
"grad_norm": 1.4624102115631104,
"learning_rate": 9.90837465261554e-06,
"loss": 0.0138,
"step": 887
},
{
"epoch": 0.3416370106761566,
"grad_norm": 0.5952054858207703,
"learning_rate": 9.908142373592022e-06,
"loss": 0.0148,
"step": 888
},
{
"epoch": 0.34202173703953065,
"grad_norm": 1.6350924968719482,
"learning_rate": 9.907909803245887e-06,
"loss": 0.0161,
"step": 889
},
{
"epoch": 0.3424064634029047,
"grad_norm": 1.3565099239349365,
"learning_rate": 9.90767694159094e-06,
"loss": 0.0231,
"step": 890
},
{
"epoch": 0.34279118976627876,
"grad_norm": 0.9824076294898987,
"learning_rate": 9.907443788641e-06,
"loss": 0.0099,
"step": 891
},
{
"epoch": 0.34317591612965276,
"grad_norm": 0.9306924343109131,
"learning_rate": 9.907210344409908e-06,
"loss": 0.0063,
"step": 892
},
{
"epoch": 0.3435606424930268,
"grad_norm": 0.978356122970581,
"learning_rate": 9.906976608911521e-06,
"loss": 0.0139,
"step": 893
},
{
"epoch": 0.34394536885640087,
"grad_norm": 0.7147358655929565,
"learning_rate": 9.90674258215971e-06,
"loss": 0.0054,
"step": 894
},
{
"epoch": 0.3443300952197749,
"grad_norm": 1.1813998222351074,
"learning_rate": 9.906508264168366e-06,
"loss": 0.0207,
"step": 895
},
{
"epoch": 0.344714821583149,
"grad_norm": 1.3888568878173828,
"learning_rate": 9.906273654951399e-06,
"loss": 0.0131,
"step": 896
},
{
"epoch": 0.34509954794652303,
"grad_norm": 0.6323987245559692,
"learning_rate": 9.906038754522733e-06,
"loss": 0.0094,
"step": 897
},
{
"epoch": 0.3454842743098971,
"grad_norm": 0.9774952530860901,
"learning_rate": 9.90580356289631e-06,
"loss": 0.0204,
"step": 898
},
{
"epoch": 0.34586900067327114,
"grad_norm": 1.0801823139190674,
"learning_rate": 9.90556808008609e-06,
"loss": 0.0129,
"step": 899
},
{
"epoch": 0.3462537270366452,
"grad_norm": 0.7844304442405701,
"learning_rate": 9.905332306106051e-06,
"loss": 0.0065,
"step": 900
},
{
"epoch": 0.34663845340001925,
"grad_norm": 0.9649366736412048,
"learning_rate": 9.905096240970184e-06,
"loss": 0.0129,
"step": 901
},
{
"epoch": 0.3470231797633933,
"grad_norm": 0.8109127283096313,
"learning_rate": 9.904859884692507e-06,
"loss": 0.0184,
"step": 902
},
{
"epoch": 0.34740790612676736,
"grad_norm": 0.47789689898490906,
"learning_rate": 9.904623237287044e-06,
"loss": 0.0042,
"step": 903
},
{
"epoch": 0.34779263249014136,
"grad_norm": 0.3855844736099243,
"learning_rate": 9.904386298767841e-06,
"loss": 0.0041,
"step": 904
},
{
"epoch": 0.3481773588535154,
"grad_norm": 0.8192360401153564,
"learning_rate": 9.904149069148962e-06,
"loss": 0.0057,
"step": 905
},
{
"epoch": 0.3485620852168895,
"grad_norm": 0.7672753930091858,
"learning_rate": 9.90391154844449e-06,
"loss": 0.0146,
"step": 906
},
{
"epoch": 0.3489468115802635,
"grad_norm": 0.6804184913635254,
"learning_rate": 9.903673736668524e-06,
"loss": 0.0074,
"step": 907
},
{
"epoch": 0.3493315379436376,
"grad_norm": 0.7533726096153259,
"learning_rate": 9.903435633835174e-06,
"loss": 0.0107,
"step": 908
},
{
"epoch": 0.34971626430701164,
"grad_norm": 1.295393466949463,
"learning_rate": 9.903197239958578e-06,
"loss": 0.013,
"step": 909
},
{
"epoch": 0.3501009906703857,
"grad_norm": 0.5216909646987915,
"learning_rate": 9.902958555052882e-06,
"loss": 0.0029,
"step": 910
},
{
"epoch": 0.35048571703375975,
"grad_norm": 0.47120431065559387,
"learning_rate": 9.902719579132253e-06,
"loss": 0.0046,
"step": 911
},
{
"epoch": 0.3508704433971338,
"grad_norm": 1.1378737688064575,
"learning_rate": 9.90248031221088e-06,
"loss": 0.0151,
"step": 912
},
{
"epoch": 0.35125516976050786,
"grad_norm": 0.936591386795044,
"learning_rate": 9.90224075430296e-06,
"loss": 0.0106,
"step": 913
},
{
"epoch": 0.3516398961238819,
"grad_norm": 0.3474515676498413,
"learning_rate": 9.902000905422712e-06,
"loss": 0.0049,
"step": 914
},
{
"epoch": 0.3520246224872559,
"grad_norm": 0.588219404220581,
"learning_rate": 9.901760765584376e-06,
"loss": 0.0116,
"step": 915
},
{
"epoch": 0.35240934885062997,
"grad_norm": 1.3973325490951538,
"learning_rate": 9.901520334802203e-06,
"loss": 0.0184,
"step": 916
},
{
"epoch": 0.352794075214004,
"grad_norm": 0.41826605796813965,
"learning_rate": 9.901279613090464e-06,
"loss": 0.0095,
"step": 917
},
{
"epoch": 0.3531788015773781,
"grad_norm": 1.216929316520691,
"learning_rate": 9.901038600463446e-06,
"loss": 0.0138,
"step": 918
},
{
"epoch": 0.35356352794075213,
"grad_norm": 0.243361234664917,
"learning_rate": 9.900797296935455e-06,
"loss": 0.0043,
"step": 919
},
{
"epoch": 0.3539482543041262,
"grad_norm": 0.4657389521598816,
"learning_rate": 9.900555702520817e-06,
"loss": 0.008,
"step": 920
},
{
"epoch": 0.35433298066750024,
"grad_norm": 0.6215384006500244,
"learning_rate": 9.900313817233867e-06,
"loss": 0.0218,
"step": 921
},
{
"epoch": 0.3547177070308743,
"grad_norm": 0.2648528516292572,
"learning_rate": 9.900071641088962e-06,
"loss": 0.0031,
"step": 922
},
{
"epoch": 0.35510243339424835,
"grad_norm": 1.0400984287261963,
"learning_rate": 9.89982917410048e-06,
"loss": 0.0153,
"step": 923
},
{
"epoch": 0.3554871597576224,
"grad_norm": 0.3454299867153168,
"learning_rate": 9.899586416282811e-06,
"loss": 0.0039,
"step": 924
},
{
"epoch": 0.35587188612099646,
"grad_norm": 1.5531036853790283,
"learning_rate": 9.899343367650364e-06,
"loss": 0.0069,
"step": 925
},
{
"epoch": 0.3562566124843705,
"grad_norm": 0.21192412078380585,
"learning_rate": 9.899100028217566e-06,
"loss": 0.0037,
"step": 926
},
{
"epoch": 0.3566413388477445,
"grad_norm": 0.9417625665664673,
"learning_rate": 9.898856397998856e-06,
"loss": 0.0311,
"step": 927
},
{
"epoch": 0.35702606521111857,
"grad_norm": 0.4254940450191498,
"learning_rate": 9.8986124770087e-06,
"loss": 0.0091,
"step": 928
},
{
"epoch": 0.3574107915744926,
"grad_norm": 0.2836921811103821,
"learning_rate": 9.898368265261573e-06,
"loss": 0.0042,
"step": 929
},
{
"epoch": 0.3577955179378667,
"grad_norm": 1.2444170713424683,
"learning_rate": 9.898123762771972e-06,
"loss": 0.0122,
"step": 930
},
{
"epoch": 0.35818024430124074,
"grad_norm": 0.540425181388855,
"learning_rate": 9.897878969554407e-06,
"loss": 0.0045,
"step": 931
},
{
"epoch": 0.3585649706646148,
"grad_norm": 0.6315858960151672,
"learning_rate": 9.89763388562341e-06,
"loss": 0.0142,
"step": 932
},
{
"epoch": 0.35894969702798885,
"grad_norm": 0.6477653980255127,
"learning_rate": 9.897388510993527e-06,
"loss": 0.0071,
"step": 933
},
{
"epoch": 0.3593344233913629,
"grad_norm": 0.7630776166915894,
"learning_rate": 9.897142845679325e-06,
"loss": 0.0163,
"step": 934
},
{
"epoch": 0.35971914975473696,
"grad_norm": 1.7266794443130493,
"learning_rate": 9.896896889695377e-06,
"loss": 0.0233,
"step": 935
},
{
"epoch": 0.360103876118111,
"grad_norm": 0.28014814853668213,
"learning_rate": 9.896650643056292e-06,
"loss": 0.0025,
"step": 936
},
{
"epoch": 0.36048860248148507,
"grad_norm": 0.8066553473472595,
"learning_rate": 9.89640410577668e-06,
"loss": 0.0067,
"step": 937
},
{
"epoch": 0.3608733288448591,
"grad_norm": 0.7131628394126892,
"learning_rate": 9.896157277871175e-06,
"loss": 0.0092,
"step": 938
},
{
"epoch": 0.3612580552082331,
"grad_norm": 0.8047205209732056,
"learning_rate": 9.89591015935443e-06,
"loss": 0.0099,
"step": 939
},
{
"epoch": 0.3616427815716072,
"grad_norm": 0.2616739273071289,
"learning_rate": 9.895662750241109e-06,
"loss": 0.0035,
"step": 940
},
{
"epoch": 0.36202750793498123,
"grad_norm": 1.0631747245788574,
"learning_rate": 9.8954150505459e-06,
"loss": 0.0202,
"step": 941
},
{
"epoch": 0.3624122342983553,
"grad_norm": 1.4130353927612305,
"learning_rate": 9.895167060283504e-06,
"loss": 0.0166,
"step": 942
},
{
"epoch": 0.36279696066172934,
"grad_norm": 0.5478530526161194,
"learning_rate": 9.894918779468639e-06,
"loss": 0.0054,
"step": 943
},
{
"epoch": 0.3631816870251034,
"grad_norm": 0.7579345107078552,
"learning_rate": 9.894670208116044e-06,
"loss": 0.017,
"step": 944
},
{
"epoch": 0.36356641338847745,
"grad_norm": 1.7270785570144653,
"learning_rate": 9.894421346240472e-06,
"loss": 0.0252,
"step": 945
},
{
"epoch": 0.3639511397518515,
"grad_norm": 0.6318998336791992,
"learning_rate": 9.894172193856695e-06,
"loss": 0.0111,
"step": 946
},
{
"epoch": 0.36433586611522556,
"grad_norm": 0.5709639191627502,
"learning_rate": 9.8939227509795e-06,
"loss": 0.005,
"step": 947
},
{
"epoch": 0.3647205924785996,
"grad_norm": 0.7520560622215271,
"learning_rate": 9.893673017623692e-06,
"loss": 0.014,
"step": 948
},
{
"epoch": 0.36510531884197367,
"grad_norm": 1.3842604160308838,
"learning_rate": 9.893422993804097e-06,
"loss": 0.0186,
"step": 949
},
{
"epoch": 0.36549004520534767,
"grad_norm": 0.60210782289505,
"learning_rate": 9.893172679535554e-06,
"loss": 0.0109,
"step": 950
},
{
"epoch": 0.3658747715687217,
"grad_norm": 0.819170355796814,
"learning_rate": 9.892922074832918e-06,
"loss": 0.0109,
"step": 951
},
{
"epoch": 0.3662594979320958,
"grad_norm": 1.0711147785186768,
"learning_rate": 9.892671179711067e-06,
"loss": 0.0079,
"step": 952
},
{
"epoch": 0.36664422429546983,
"grad_norm": 0.4525466859340668,
"learning_rate": 9.89241999418489e-06,
"loss": 0.0065,
"step": 953
},
{
"epoch": 0.3670289506588439,
"grad_norm": 1.9187066555023193,
"learning_rate": 9.892168518269298e-06,
"loss": 0.0113,
"step": 954
},
{
"epoch": 0.36741367702221794,
"grad_norm": 0.6548601984977722,
"learning_rate": 9.891916751979218e-06,
"loss": 0.0086,
"step": 955
},
{
"epoch": 0.367798403385592,
"grad_norm": 0.6366754174232483,
"learning_rate": 9.89166469532959e-06,
"loss": 0.0088,
"step": 956
},
{
"epoch": 0.36818312974896605,
"grad_norm": 0.8541706800460815,
"learning_rate": 9.891412348335379e-06,
"loss": 0.0152,
"step": 957
},
{
"epoch": 0.3685678561123401,
"grad_norm": 0.5631262063980103,
"learning_rate": 9.89115971101156e-06,
"loss": 0.0128,
"step": 958
},
{
"epoch": 0.36895258247571416,
"grad_norm": 71.88910675048828,
"learning_rate": 9.890906783373131e-06,
"loss": 0.1686,
"step": 959
},
{
"epoch": 0.3693373088390882,
"grad_norm": 1.38483464717865,
"learning_rate": 9.890653565435102e-06,
"loss": 0.019,
"step": 960
},
{
"epoch": 0.3697220352024623,
"grad_norm": 0.4331851899623871,
"learning_rate": 9.890400057212504e-06,
"loss": 0.0058,
"step": 961
},
{
"epoch": 0.3701067615658363,
"grad_norm": 1.5132745504379272,
"learning_rate": 9.890146258720384e-06,
"loss": 0.0167,
"step": 962
},
{
"epoch": 0.37049148792921033,
"grad_norm": 1.5691161155700684,
"learning_rate": 9.889892169973806e-06,
"loss": 0.0106,
"step": 963
},
{
"epoch": 0.3708762142925844,
"grad_norm": 3.956345319747925,
"learning_rate": 9.889637790987852e-06,
"loss": 0.0238,
"step": 964
},
{
"epoch": 0.37126094065595844,
"grad_norm": 1.152163028717041,
"learning_rate": 9.889383121777618e-06,
"loss": 0.0066,
"step": 965
},
{
"epoch": 0.3716456670193325,
"grad_norm": 0.507671058177948,
"learning_rate": 9.889128162358223e-06,
"loss": 0.0088,
"step": 966
},
{
"epoch": 0.37203039338270655,
"grad_norm": 0.4464673101902008,
"learning_rate": 9.888872912744799e-06,
"loss": 0.0055,
"step": 967
},
{
"epoch": 0.3724151197460806,
"grad_norm": 0.769550621509552,
"learning_rate": 9.888617372952497e-06,
"loss": 0.0073,
"step": 968
},
{
"epoch": 0.37279984610945466,
"grad_norm": 2.157733917236328,
"learning_rate": 9.888361542996483e-06,
"loss": 0.0428,
"step": 969
},
{
"epoch": 0.3731845724728287,
"grad_norm": 1.0443061590194702,
"learning_rate": 9.888105422891942e-06,
"loss": 0.0151,
"step": 970
},
{
"epoch": 0.37356929883620277,
"grad_norm": 1.335445761680603,
"learning_rate": 9.887849012654079e-06,
"loss": 0.0193,
"step": 971
},
{
"epoch": 0.3739540251995768,
"grad_norm": 0.2949625849723816,
"learning_rate": 9.887592312298108e-06,
"loss": 0.0024,
"step": 972
},
{
"epoch": 0.3743387515629509,
"grad_norm": 1.2898743152618408,
"learning_rate": 9.88733532183927e-06,
"loss": 0.0109,
"step": 973
},
{
"epoch": 0.3747234779263249,
"grad_norm": 0.6975098252296448,
"learning_rate": 9.887078041292818e-06,
"loss": 0.0108,
"step": 974
},
{
"epoch": 0.37510820428969893,
"grad_norm": 0.5480782389640808,
"learning_rate": 9.88682047067402e-06,
"loss": 0.0033,
"step": 975
},
{
"epoch": 0.375492930653073,
"grad_norm": 1.3251780271530151,
"learning_rate": 9.886562609998165e-06,
"loss": 0.0145,
"step": 976
},
{
"epoch": 0.37587765701644704,
"grad_norm": 1.0030848979949951,
"learning_rate": 9.886304459280563e-06,
"loss": 0.0126,
"step": 977
},
{
"epoch": 0.3762623833798211,
"grad_norm": 0.46078020334243774,
"learning_rate": 9.886046018536529e-06,
"loss": 0.0033,
"step": 978
},
{
"epoch": 0.37664710974319515,
"grad_norm": 0.5685471296310425,
"learning_rate": 9.88578728778141e-06,
"loss": 0.0039,
"step": 979
},
{
"epoch": 0.3770318361065692,
"grad_norm": 0.578025758266449,
"learning_rate": 9.885528267030556e-06,
"loss": 0.016,
"step": 980
},
{
"epoch": 0.37741656246994326,
"grad_norm": 0.30404070019721985,
"learning_rate": 9.885268956299348e-06,
"loss": 0.0026,
"step": 981
},
{
"epoch": 0.3778012888333173,
"grad_norm": 0.7305883169174194,
"learning_rate": 9.885009355603172e-06,
"loss": 0.0052,
"step": 982
},
{
"epoch": 0.37818601519669137,
"grad_norm": 0.5870364904403687,
"learning_rate": 9.884749464957438e-06,
"loss": 0.0063,
"step": 983
},
{
"epoch": 0.3785707415600654,
"grad_norm": 1.9455591440200806,
"learning_rate": 9.884489284377575e-06,
"loss": 0.0235,
"step": 984
},
{
"epoch": 0.3789554679234394,
"grad_norm": 0.6894332766532898,
"learning_rate": 9.88422881387902e-06,
"loss": 0.0213,
"step": 985
},
{
"epoch": 0.3793401942868135,
"grad_norm": 0.99710613489151,
"learning_rate": 9.88396805347724e-06,
"loss": 0.0058,
"step": 986
},
{
"epoch": 0.37972492065018754,
"grad_norm": 1.2745070457458496,
"learning_rate": 9.883707003187708e-06,
"loss": 0.0103,
"step": 987
},
{
"epoch": 0.3801096470135616,
"grad_norm": 0.676323652267456,
"learning_rate": 9.88344566302592e-06,
"loss": 0.0053,
"step": 988
},
{
"epoch": 0.38049437337693565,
"grad_norm": 1.0734655857086182,
"learning_rate": 9.883184033007385e-06,
"loss": 0.0062,
"step": 989
},
{
"epoch": 0.3808790997403097,
"grad_norm": 2.1206910610198975,
"learning_rate": 9.882922113147637e-06,
"loss": 0.0335,
"step": 990
},
{
"epoch": 0.38126382610368376,
"grad_norm": 0.948715090751648,
"learning_rate": 9.88265990346222e-06,
"loss": 0.0198,
"step": 991
},
{
"epoch": 0.3816485524670578,
"grad_norm": 0.38453373312950134,
"learning_rate": 9.882397403966696e-06,
"loss": 0.0024,
"step": 992
},
{
"epoch": 0.38203327883043187,
"grad_norm": 2.032207727432251,
"learning_rate": 9.882134614676647e-06,
"loss": 0.0197,
"step": 993
},
{
"epoch": 0.3824180051938059,
"grad_norm": 0.744788408279419,
"learning_rate": 9.881871535607672e-06,
"loss": 0.0091,
"step": 994
},
{
"epoch": 0.38280273155718,
"grad_norm": 0.6455265879631042,
"learning_rate": 9.881608166775384e-06,
"loss": 0.0097,
"step": 995
},
{
"epoch": 0.38318745792055403,
"grad_norm": 1.2942465543746948,
"learning_rate": 9.881344508195416e-06,
"loss": 0.0043,
"step": 996
},
{
"epoch": 0.38357218428392803,
"grad_norm": 0.12089524418115616,
"learning_rate": 9.881080559883418e-06,
"loss": 0.0013,
"step": 997
},
{
"epoch": 0.3839569106473021,
"grad_norm": 1.5285307168960571,
"learning_rate": 9.880816321855055e-06,
"loss": 0.028,
"step": 998
},
{
"epoch": 0.38434163701067614,
"grad_norm": 0.778884768486023,
"learning_rate": 9.880551794126015e-06,
"loss": 0.0036,
"step": 999
},
{
"epoch": 0.3847263633740502,
"grad_norm": 1.3341624736785889,
"learning_rate": 9.880286976711992e-06,
"loss": 0.0091,
"step": 1000
},
{
"epoch": 0.38511108973742425,
"grad_norm": 0.40356457233428955,
"learning_rate": 9.880021869628711e-06,
"loss": 0.0035,
"step": 1001
},
{
"epoch": 0.3854958161007983,
"grad_norm": 0.27904146909713745,
"learning_rate": 9.879756472891904e-06,
"loss": 0.0013,
"step": 1002
},
{
"epoch": 0.38588054246417236,
"grad_norm": 1.152552843093872,
"learning_rate": 9.879490786517326e-06,
"loss": 0.0176,
"step": 1003
},
{
"epoch": 0.3862652688275464,
"grad_norm": 0.23684851825237274,
"learning_rate": 9.879224810520743e-06,
"loss": 0.0013,
"step": 1004
},
{
"epoch": 0.38664999519092047,
"grad_norm": 0.44542622566223145,
"learning_rate": 9.878958544917943e-06,
"loss": 0.0025,
"step": 1005
},
{
"epoch": 0.3870347215542945,
"grad_norm": 0.21203474700450897,
"learning_rate": 9.878691989724734e-06,
"loss": 0.0022,
"step": 1006
},
{
"epoch": 0.3874194479176686,
"grad_norm": 0.6239928007125854,
"learning_rate": 9.878425144956933e-06,
"loss": 0.0058,
"step": 1007
},
{
"epoch": 0.38780417428104264,
"grad_norm": 1.1376228332519531,
"learning_rate": 9.87815801063038e-06,
"loss": 0.0116,
"step": 1008
},
{
"epoch": 0.38818890064441663,
"grad_norm": 1.0717668533325195,
"learning_rate": 9.877890586760932e-06,
"loss": 0.021,
"step": 1009
},
{
"epoch": 0.3885736270077907,
"grad_norm": 1.4471285343170166,
"learning_rate": 9.877622873364461e-06,
"loss": 0.0195,
"step": 1010
},
{
"epoch": 0.38895835337116474,
"grad_norm": 1.0057027339935303,
"learning_rate": 9.877354870456856e-06,
"loss": 0.007,
"step": 1011
},
{
"epoch": 0.3893430797345388,
"grad_norm": 0.5728893280029297,
"learning_rate": 9.877086578054026e-06,
"loss": 0.0068,
"step": 1012
},
{
"epoch": 0.38972780609791285,
"grad_norm": 0.2538914382457733,
"learning_rate": 9.876817996171895e-06,
"loss": 0.0028,
"step": 1013
},
{
"epoch": 0.3901125324612869,
"grad_norm": 1.154489517211914,
"learning_rate": 9.876549124826405e-06,
"loss": 0.0227,
"step": 1014
},
{
"epoch": 0.39049725882466096,
"grad_norm": 0.3190124034881592,
"learning_rate": 9.876279964033513e-06,
"loss": 0.0025,
"step": 1015
},
{
"epoch": 0.390881985188035,
"grad_norm": 1.2935168743133545,
"learning_rate": 9.876010513809195e-06,
"loss": 0.0095,
"step": 1016
},
{
"epoch": 0.3912667115514091,
"grad_norm": 0.6997138261795044,
"learning_rate": 9.875740774169449e-06,
"loss": 0.0052,
"step": 1017
},
{
"epoch": 0.39165143791478313,
"grad_norm": 1.2706716060638428,
"learning_rate": 9.87547074513028e-06,
"loss": 0.0135,
"step": 1018
},
{
"epoch": 0.3920361642781572,
"grad_norm": 0.1183147132396698,
"learning_rate": 9.875200426707718e-06,
"loss": 0.0018,
"step": 1019
},
{
"epoch": 0.3924208906415312,
"grad_norm": 0.41291019320487976,
"learning_rate": 9.874929818917806e-06,
"loss": 0.008,
"step": 1020
},
{
"epoch": 0.39280561700490524,
"grad_norm": 0.35821545124053955,
"learning_rate": 9.874658921776609e-06,
"loss": 0.0033,
"step": 1021
},
{
"epoch": 0.3931903433682793,
"grad_norm": 0.6069442629814148,
"learning_rate": 9.874387735300204e-06,
"loss": 0.0067,
"step": 1022
},
{
"epoch": 0.39357506973165335,
"grad_norm": 1.0225082635879517,
"learning_rate": 9.874116259504687e-06,
"loss": 0.0093,
"step": 1023
},
{
"epoch": 0.3939597960950274,
"grad_norm": 0.4544816315174103,
"learning_rate": 9.873844494406173e-06,
"loss": 0.0047,
"step": 1024
},
{
"epoch": 0.39434452245840146,
"grad_norm": 0.37393704056739807,
"learning_rate": 9.873572440020792e-06,
"loss": 0.0028,
"step": 1025
},
{
"epoch": 0.3947292488217755,
"grad_norm": 0.9215484261512756,
"learning_rate": 9.873300096364688e-06,
"loss": 0.0143,
"step": 1026
},
{
"epoch": 0.39511397518514957,
"grad_norm": 0.4308564066886902,
"learning_rate": 9.873027463454032e-06,
"loss": 0.0123,
"step": 1027
},
{
"epoch": 0.3954987015485236,
"grad_norm": 0.06539972871541977,
"learning_rate": 9.872754541305003e-06,
"loss": 0.0007,
"step": 1028
},
{
"epoch": 0.3958834279118977,
"grad_norm": 1.0676727294921875,
"learning_rate": 9.8724813299338e-06,
"loss": 0.0177,
"step": 1029
},
{
"epoch": 0.39626815427527173,
"grad_norm": 1.1856839656829834,
"learning_rate": 9.872207829356641e-06,
"loss": 0.0129,
"step": 1030
},
{
"epoch": 0.3966528806386458,
"grad_norm": 0.5124572515487671,
"learning_rate": 9.871934039589758e-06,
"loss": 0.0047,
"step": 1031
},
{
"epoch": 0.3970376070020198,
"grad_norm": 0.4818362295627594,
"learning_rate": 9.871659960649402e-06,
"loss": 0.0085,
"step": 1032
},
{
"epoch": 0.39742233336539384,
"grad_norm": 0.4717956483364105,
"learning_rate": 9.871385592551843e-06,
"loss": 0.005,
"step": 1033
},
{
"epoch": 0.3978070597287679,
"grad_norm": 0.8839301466941833,
"learning_rate": 9.871110935313364e-06,
"loss": 0.011,
"step": 1034
},
{
"epoch": 0.39819178609214195,
"grad_norm": 0.579458475112915,
"learning_rate": 9.87083598895027e-06,
"loss": 0.0048,
"step": 1035
},
{
"epoch": 0.398576512455516,
"grad_norm": 0.11299765110015869,
"learning_rate": 9.870560753478875e-06,
"loss": 0.0016,
"step": 1036
},
{
"epoch": 0.39896123881889006,
"grad_norm": 0.35690218210220337,
"learning_rate": 9.87028522891552e-06,
"loss": 0.0067,
"step": 1037
},
{
"epoch": 0.3993459651822641,
"grad_norm": 1.820987582206726,
"learning_rate": 9.870009415276557e-06,
"loss": 0.0232,
"step": 1038
},
{
"epoch": 0.3997306915456382,
"grad_norm": 0.9093754291534424,
"learning_rate": 9.86973331257836e-06,
"loss": 0.0087,
"step": 1039
},
{
"epoch": 0.4001154179090122,
"grad_norm": 0.2319750189781189,
"learning_rate": 9.869456920837312e-06,
"loss": 0.0021,
"step": 1040
},
{
"epoch": 0.4005001442723863,
"grad_norm": 0.22607071697711945,
"learning_rate": 9.869180240069822e-06,
"loss": 0.0085,
"step": 1041
},
{
"epoch": 0.40088487063576034,
"grad_norm": 1.4031366109848022,
"learning_rate": 9.868903270292311e-06,
"loss": 0.0204,
"step": 1042
},
{
"epoch": 0.4012695969991344,
"grad_norm": 0.7285005450248718,
"learning_rate": 9.868626011521219e-06,
"loss": 0.0076,
"step": 1043
},
{
"epoch": 0.4016543233625084,
"grad_norm": 0.4827679395675659,
"learning_rate": 9.868348463773003e-06,
"loss": 0.0069,
"step": 1044
},
{
"epoch": 0.40203904972588245,
"grad_norm": 0.32623255252838135,
"learning_rate": 9.868070627064135e-06,
"loss": 0.002,
"step": 1045
},
{
"epoch": 0.4024237760892565,
"grad_norm": 0.22089290618896484,
"learning_rate": 9.867792501411108e-06,
"loss": 0.0017,
"step": 1046
},
{
"epoch": 0.40280850245263056,
"grad_norm": 0.9565873146057129,
"learning_rate": 9.86751408683043e-06,
"loss": 0.0077,
"step": 1047
},
{
"epoch": 0.4031932288160046,
"grad_norm": 0.15053868293762207,
"learning_rate": 9.867235383338625e-06,
"loss": 0.0017,
"step": 1048
},
{
"epoch": 0.40357795517937867,
"grad_norm": 1.419770359992981,
"learning_rate": 9.866956390952236e-06,
"loss": 0.0205,
"step": 1049
},
{
"epoch": 0.4039626815427527,
"grad_norm": 0.6458820700645447,
"learning_rate": 9.866677109687823e-06,
"loss": 0.0047,
"step": 1050
},
{
"epoch": 0.4043474079061268,
"grad_norm": 0.46830523014068604,
"learning_rate": 9.866397539561962e-06,
"loss": 0.004,
"step": 1051
},
{
"epoch": 0.40473213426950083,
"grad_norm": 0.7895929217338562,
"learning_rate": 9.866117680591248e-06,
"loss": 0.0274,
"step": 1052
},
{
"epoch": 0.4051168606328749,
"grad_norm": 0.4475204646587372,
"learning_rate": 9.86583753279229e-06,
"loss": 0.0033,
"step": 1053
},
{
"epoch": 0.40550158699624894,
"grad_norm": 0.9515151977539062,
"learning_rate": 9.865557096181718e-06,
"loss": 0.0072,
"step": 1054
},
{
"epoch": 0.40588631335962294,
"grad_norm": 1.6716892719268799,
"learning_rate": 9.865276370776178e-06,
"loss": 0.0156,
"step": 1055
},
{
"epoch": 0.406271039722997,
"grad_norm": 0.8594703674316406,
"learning_rate": 9.86499535659233e-06,
"loss": 0.0177,
"step": 1056
},
{
"epoch": 0.40665576608637105,
"grad_norm": 0.5373953580856323,
"learning_rate": 9.864714053646856e-06,
"loss": 0.0108,
"step": 1057
},
{
"epoch": 0.4070404924497451,
"grad_norm": 1.1601587533950806,
"learning_rate": 9.86443246195645e-06,
"loss": 0.0148,
"step": 1058
},
{
"epoch": 0.40742521881311916,
"grad_norm": 0.06577706336975098,
"learning_rate": 9.864150581537828e-06,
"loss": 0.001,
"step": 1059
},
{
"epoch": 0.4078099451764932,
"grad_norm": 0.7323742508888245,
"learning_rate": 9.863868412407721e-06,
"loss": 0.0077,
"step": 1060
},
{
"epoch": 0.40819467153986727,
"grad_norm": 0.14481770992279053,
"learning_rate": 9.863585954582876e-06,
"loss": 0.0022,
"step": 1061
},
{
"epoch": 0.4085793979032413,
"grad_norm": 0.33891233801841736,
"learning_rate": 9.86330320808006e-06,
"loss": 0.0077,
"step": 1062
},
{
"epoch": 0.4089641242666154,
"grad_norm": 0.4427297115325928,
"learning_rate": 9.863020172916054e-06,
"loss": 0.0064,
"step": 1063
},
{
"epoch": 0.40934885062998944,
"grad_norm": 0.6941469311714172,
"learning_rate": 9.862736849107656e-06,
"loss": 0.006,
"step": 1064
},
{
"epoch": 0.4097335769933635,
"grad_norm": 0.2557992935180664,
"learning_rate": 9.862453236671685e-06,
"loss": 0.0171,
"step": 1065
},
{
"epoch": 0.41011830335673755,
"grad_norm": 0.44381460547447205,
"learning_rate": 9.862169335624976e-06,
"loss": 0.007,
"step": 1066
},
{
"epoch": 0.41050302972011155,
"grad_norm": 1.5972293615341187,
"learning_rate": 9.861885145984377e-06,
"loss": 0.0077,
"step": 1067
},
{
"epoch": 0.4108877560834856,
"grad_norm": 2.6916608810424805,
"learning_rate": 9.861600667766758e-06,
"loss": 0.0161,
"step": 1068
},
{
"epoch": 0.41127248244685966,
"grad_norm": 0.8969532251358032,
"learning_rate": 9.861315900989001e-06,
"loss": 0.0019,
"step": 1069
},
{
"epoch": 0.4116572088102337,
"grad_norm": 0.25799357891082764,
"learning_rate": 9.861030845668014e-06,
"loss": 0.0029,
"step": 1070
},
{
"epoch": 0.41204193517360777,
"grad_norm": 0.9042412638664246,
"learning_rate": 9.860745501820712e-06,
"loss": 0.0098,
"step": 1071
},
{
"epoch": 0.4124266615369818,
"grad_norm": 0.4798334538936615,
"learning_rate": 9.860459869464032e-06,
"loss": 0.0032,
"step": 1072
},
{
"epoch": 0.4128113879003559,
"grad_norm": 0.728665828704834,
"learning_rate": 9.860173948614929e-06,
"loss": 0.005,
"step": 1073
},
{
"epoch": 0.41319611426372993,
"grad_norm": 0.6425368189811707,
"learning_rate": 9.859887739290375e-06,
"loss": 0.0149,
"step": 1074
},
{
"epoch": 0.413580840627104,
"grad_norm": 0.6536305546760559,
"learning_rate": 9.859601241507354e-06,
"loss": 0.0055,
"step": 1075
},
{
"epoch": 0.41396556699047804,
"grad_norm": 1.2144076824188232,
"learning_rate": 9.859314455282873e-06,
"loss": 0.0102,
"step": 1076
},
{
"epoch": 0.4143502933538521,
"grad_norm": 0.04166639596223831,
"learning_rate": 9.859027380633956e-06,
"loss": 0.0005,
"step": 1077
},
{
"epoch": 0.41473501971722615,
"grad_norm": 0.9971767067909241,
"learning_rate": 9.858740017577642e-06,
"loss": 0.0052,
"step": 1078
},
{
"epoch": 0.41511974608060015,
"grad_norm": 0.9212055206298828,
"learning_rate": 9.858452366130983e-06,
"loss": 0.0186,
"step": 1079
},
{
"epoch": 0.4155044724439742,
"grad_norm": 0.07661807537078857,
"learning_rate": 9.858164426311059e-06,
"loss": 0.0007,
"step": 1080
},
{
"epoch": 0.41588919880734826,
"grad_norm": 1.0191240310668945,
"learning_rate": 9.857876198134957e-06,
"loss": 0.0183,
"step": 1081
},
{
"epoch": 0.4162739251707223,
"grad_norm": 0.6929414868354797,
"learning_rate": 9.857587681619784e-06,
"loss": 0.0115,
"step": 1082
},
{
"epoch": 0.41665865153409637,
"grad_norm": 0.6691508293151855,
"learning_rate": 9.857298876782666e-06,
"loss": 0.0113,
"step": 1083
},
{
"epoch": 0.4170433778974704,
"grad_norm": 1.7734836339950562,
"learning_rate": 9.857009783640746e-06,
"loss": 0.0107,
"step": 1084
},
{
"epoch": 0.4174281042608445,
"grad_norm": 0.8787226676940918,
"learning_rate": 9.856720402211182e-06,
"loss": 0.0037,
"step": 1085
},
{
"epoch": 0.41781283062421853,
"grad_norm": 0.6904507279396057,
"learning_rate": 9.85643073251115e-06,
"loss": 0.0177,
"step": 1086
},
{
"epoch": 0.4181975569875926,
"grad_norm": 0.550852358341217,
"learning_rate": 9.856140774557843e-06,
"loss": 0.0062,
"step": 1087
},
{
"epoch": 0.41858228335096664,
"grad_norm": 0.6680750250816345,
"learning_rate": 9.855850528368473e-06,
"loss": 0.0093,
"step": 1088
},
{
"epoch": 0.4189670097143407,
"grad_norm": 0.508996307849884,
"learning_rate": 9.855559993960269e-06,
"loss": 0.0017,
"step": 1089
},
{
"epoch": 0.4193517360777147,
"grad_norm": 0.9837608933448792,
"learning_rate": 9.855269171350471e-06,
"loss": 0.0054,
"step": 1090
},
{
"epoch": 0.41973646244108875,
"grad_norm": 0.5428667068481445,
"learning_rate": 9.854978060556343e-06,
"loss": 0.0048,
"step": 1091
},
{
"epoch": 0.4201211888044628,
"grad_norm": 1.060753345489502,
"learning_rate": 9.854686661595166e-06,
"loss": 0.0093,
"step": 1092
},
{
"epoch": 0.42050591516783686,
"grad_norm": 0.2443462312221527,
"learning_rate": 9.854394974484233e-06,
"loss": 0.0015,
"step": 1093
},
{
"epoch": 0.4208906415312109,
"grad_norm": 0.8732754588127136,
"learning_rate": 9.854102999240858e-06,
"loss": 0.0077,
"step": 1094
},
{
"epoch": 0.421275367894585,
"grad_norm": 1.7188763618469238,
"learning_rate": 9.853810735882371e-06,
"loss": 0.0202,
"step": 1095
},
{
"epoch": 0.42166009425795903,
"grad_norm": 1.274489164352417,
"learning_rate": 9.85351818442612e-06,
"loss": 0.0182,
"step": 1096
},
{
"epoch": 0.4220448206213331,
"grad_norm": 1.6367267370224,
"learning_rate": 9.85322534488947e-06,
"loss": 0.011,
"step": 1097
},
{
"epoch": 0.42242954698470714,
"grad_norm": 1.1013633012771606,
"learning_rate": 9.852932217289798e-06,
"loss": 0.0096,
"step": 1098
},
{
"epoch": 0.4228142733480812,
"grad_norm": 0.5000425577163696,
"learning_rate": 9.852638801644509e-06,
"loss": 0.0049,
"step": 1099
},
{
"epoch": 0.42319899971145525,
"grad_norm": 1.0980358123779297,
"learning_rate": 9.852345097971017e-06,
"loss": 0.0102,
"step": 1100
},
{
"epoch": 0.4235837260748293,
"grad_norm": 1.013664960861206,
"learning_rate": 9.85205110628675e-06,
"loss": 0.0202,
"step": 1101
},
{
"epoch": 0.4239684524382033,
"grad_norm": 0.8884641528129578,
"learning_rate": 9.851756826609164e-06,
"loss": 0.0173,
"step": 1102
},
{
"epoch": 0.42435317880157736,
"grad_norm": 0.13429048657417297,
"learning_rate": 9.851462258955722e-06,
"loss": 0.001,
"step": 1103
},
{
"epoch": 0.4247379051649514,
"grad_norm": 0.36847949028015137,
"learning_rate": 9.851167403343911e-06,
"loss": 0.0101,
"step": 1104
},
{
"epoch": 0.42512263152832547,
"grad_norm": 1.019138216972351,
"learning_rate": 9.850872259791228e-06,
"loss": 0.009,
"step": 1105
},
{
"epoch": 0.4255073578916995,
"grad_norm": 0.6060003042221069,
"learning_rate": 9.850576828315196e-06,
"loss": 0.0054,
"step": 1106
},
{
"epoch": 0.4258920842550736,
"grad_norm": 1.8947292566299438,
"learning_rate": 9.850281108933346e-06,
"loss": 0.0138,
"step": 1107
},
{
"epoch": 0.42627681061844763,
"grad_norm": 1.506212592124939,
"learning_rate": 9.849985101663235e-06,
"loss": 0.021,
"step": 1108
},
{
"epoch": 0.4266615369818217,
"grad_norm": 1.6545723676681519,
"learning_rate": 9.849688806522428e-06,
"loss": 0.0157,
"step": 1109
},
{
"epoch": 0.42704626334519574,
"grad_norm": 0.5730804800987244,
"learning_rate": 9.849392223528514e-06,
"loss": 0.0055,
"step": 1110
},
{
"epoch": 0.4274309897085698,
"grad_norm": 0.5096026659011841,
"learning_rate": 9.849095352699096e-06,
"loss": 0.0033,
"step": 1111
},
{
"epoch": 0.42781571607194385,
"grad_norm": 2.3961026668548584,
"learning_rate": 9.848798194051797e-06,
"loss": 0.0082,
"step": 1112
},
{
"epoch": 0.4282004424353179,
"grad_norm": 0.8801290988922119,
"learning_rate": 9.84850074760425e-06,
"loss": 0.0119,
"step": 1113
},
{
"epoch": 0.4285851687986919,
"grad_norm": 3.352750778198242,
"learning_rate": 9.848203013374113e-06,
"loss": 0.0191,
"step": 1114
},
{
"epoch": 0.42896989516206596,
"grad_norm": 1.3553608655929565,
"learning_rate": 9.847904991379061e-06,
"loss": 0.0211,
"step": 1115
},
{
"epoch": 0.42935462152544,
"grad_norm": 2.213486671447754,
"learning_rate": 9.847606681636776e-06,
"loss": 0.0346,
"step": 1116
},
{
"epoch": 0.42973934788881407,
"grad_norm": 0.18214163184165955,
"learning_rate": 9.84730808416497e-06,
"loss": 0.0013,
"step": 1117
},
{
"epoch": 0.4301240742521881,
"grad_norm": 0.08196653425693512,
"learning_rate": 9.847009198981364e-06,
"loss": 0.0009,
"step": 1118
},
{
"epoch": 0.4305088006155622,
"grad_norm": 0.6414639353752136,
"learning_rate": 9.846710026103698e-06,
"loss": 0.0187,
"step": 1119
},
{
"epoch": 0.43089352697893624,
"grad_norm": 0.3984313905239105,
"learning_rate": 9.846410565549732e-06,
"loss": 0.0026,
"step": 1120
},
{
"epoch": 0.4312782533423103,
"grad_norm": 0.411411851644516,
"learning_rate": 9.846110817337237e-06,
"loss": 0.0023,
"step": 1121
},
{
"epoch": 0.43166297970568435,
"grad_norm": 0.3851987421512604,
"learning_rate": 9.845810781484005e-06,
"loss": 0.0032,
"step": 1122
},
{
"epoch": 0.4320477060690584,
"grad_norm": 2.438159942626953,
"learning_rate": 9.845510458007848e-06,
"loss": 0.0126,
"step": 1123
},
{
"epoch": 0.43243243243243246,
"grad_norm": 0.19791530072689056,
"learning_rate": 9.845209846926587e-06,
"loss": 0.001,
"step": 1124
},
{
"epoch": 0.43281715879580646,
"grad_norm": 0.5351514220237732,
"learning_rate": 9.844908948258067e-06,
"loss": 0.0074,
"step": 1125
},
{
"epoch": 0.4332018851591805,
"grad_norm": 0.5280462503433228,
"learning_rate": 9.84460776202015e-06,
"loss": 0.0113,
"step": 1126
},
{
"epoch": 0.43358661152255457,
"grad_norm": 1.4508618116378784,
"learning_rate": 9.844306288230709e-06,
"loss": 0.0255,
"step": 1127
},
{
"epoch": 0.4339713378859286,
"grad_norm": 1.2629048824310303,
"learning_rate": 9.84400452690764e-06,
"loss": 0.0135,
"step": 1128
},
{
"epoch": 0.4343560642493027,
"grad_norm": 1.1086742877960205,
"learning_rate": 9.843702478068855e-06,
"loss": 0.0168,
"step": 1129
},
{
"epoch": 0.43474079061267673,
"grad_norm": 0.42978718876838684,
"learning_rate": 9.84340014173228e-06,
"loss": 0.0031,
"step": 1130
},
{
"epoch": 0.4351255169760508,
"grad_norm": 0.5256184339523315,
"learning_rate": 9.84309751791586e-06,
"loss": 0.003,
"step": 1131
},
{
"epoch": 0.43551024333942484,
"grad_norm": 0.3392198085784912,
"learning_rate": 9.84279460663756e-06,
"loss": 0.0025,
"step": 1132
},
{
"epoch": 0.4358949697027989,
"grad_norm": 0.4952617287635803,
"learning_rate": 9.842491407915358e-06,
"loss": 0.0038,
"step": 1133
},
{
"epoch": 0.43627969606617295,
"grad_norm": 1.2189668416976929,
"learning_rate": 9.842187921767248e-06,
"loss": 0.0062,
"step": 1134
},
{
"epoch": 0.436664422429547,
"grad_norm": 0.4146406650543213,
"learning_rate": 9.841884148211248e-06,
"loss": 0.0205,
"step": 1135
},
{
"epoch": 0.43704914879292106,
"grad_norm": 0.19388249516487122,
"learning_rate": 9.841580087265384e-06,
"loss": 0.0013,
"step": 1136
},
{
"epoch": 0.43743387515629506,
"grad_norm": 0.3461119532585144,
"learning_rate": 9.841275738947704e-06,
"loss": 0.004,
"step": 1137
},
{
"epoch": 0.4378186015196691,
"grad_norm": 0.6838563084602356,
"learning_rate": 9.840971103276276e-06,
"loss": 0.0099,
"step": 1138
},
{
"epoch": 0.43820332788304317,
"grad_norm": 0.8505207896232605,
"learning_rate": 9.840666180269178e-06,
"loss": 0.0071,
"step": 1139
},
{
"epoch": 0.4385880542464172,
"grad_norm": 0.705162525177002,
"learning_rate": 9.840360969944511e-06,
"loss": 0.0037,
"step": 1140
},
{
"epoch": 0.4389727806097913,
"grad_norm": 0.9783769845962524,
"learning_rate": 9.84005547232039e-06,
"loss": 0.006,
"step": 1141
},
{
"epoch": 0.43935750697316533,
"grad_norm": 1.1325137615203857,
"learning_rate": 9.839749687414947e-06,
"loss": 0.0202,
"step": 1142
},
{
"epoch": 0.4397422333365394,
"grad_norm": 1.34829580783844,
"learning_rate": 9.839443615246334e-06,
"loss": 0.0029,
"step": 1143
},
{
"epoch": 0.44012695969991344,
"grad_norm": 0.5483993291854858,
"learning_rate": 9.839137255832715e-06,
"loss": 0.0047,
"step": 1144
},
{
"epoch": 0.4405116860632875,
"grad_norm": 0.7500666975975037,
"learning_rate": 9.838830609192277e-06,
"loss": 0.0193,
"step": 1145
},
{
"epoch": 0.44089641242666155,
"grad_norm": 0.3582031726837158,
"learning_rate": 9.83852367534322e-06,
"loss": 0.0035,
"step": 1146
},
{
"epoch": 0.4412811387900356,
"grad_norm": 1.813923954963684,
"learning_rate": 9.83821645430376e-06,
"loss": 0.0267,
"step": 1147
},
{
"epoch": 0.44166586515340966,
"grad_norm": 1.7281469106674194,
"learning_rate": 9.837908946092134e-06,
"loss": 0.0104,
"step": 1148
},
{
"epoch": 0.44205059151678366,
"grad_norm": 4.751184463500977,
"learning_rate": 9.837601150726594e-06,
"loss": 0.0282,
"step": 1149
},
{
"epoch": 0.4424353178801577,
"grad_norm": 0.6789389252662659,
"learning_rate": 9.837293068225408e-06,
"loss": 0.0082,
"step": 1150
},
{
"epoch": 0.4428200442435318,
"grad_norm": 0.36475417017936707,
"learning_rate": 9.836984698606865e-06,
"loss": 0.0105,
"step": 1151
},
{
"epoch": 0.44320477060690583,
"grad_norm": 2.3141651153564453,
"learning_rate": 9.836676041889265e-06,
"loss": 0.0109,
"step": 1152
},
{
"epoch": 0.4435894969702799,
"grad_norm": 0.6193933486938477,
"learning_rate": 9.836367098090931e-06,
"loss": 0.0074,
"step": 1153
},
{
"epoch": 0.44397422333365394,
"grad_norm": 0.4467495083808899,
"learning_rate": 9.836057867230198e-06,
"loss": 0.0107,
"step": 1154
},
{
"epoch": 0.444358949697028,
"grad_norm": 0.3161061108112335,
"learning_rate": 9.835748349325423e-06,
"loss": 0.009,
"step": 1155
},
{
"epoch": 0.44474367606040205,
"grad_norm": 0.33670666813850403,
"learning_rate": 9.835438544394973e-06,
"loss": 0.0061,
"step": 1156
},
{
"epoch": 0.4451284024237761,
"grad_norm": 1.2605667114257812,
"learning_rate": 9.835128452457241e-06,
"loss": 0.0127,
"step": 1157
},
{
"epoch": 0.44551312878715016,
"grad_norm": 0.40594059228897095,
"learning_rate": 9.834818073530632e-06,
"loss": 0.0066,
"step": 1158
},
{
"epoch": 0.4458978551505242,
"grad_norm": 1.4011011123657227,
"learning_rate": 9.834507407633567e-06,
"loss": 0.0164,
"step": 1159
},
{
"epoch": 0.4462825815138982,
"grad_norm": 0.4081282615661621,
"learning_rate": 9.834196454784485e-06,
"loss": 0.0077,
"step": 1160
},
{
"epoch": 0.44666730787727227,
"grad_norm": 0.34367305040359497,
"learning_rate": 9.833885215001844e-06,
"loss": 0.0023,
"step": 1161
},
{
"epoch": 0.4470520342406463,
"grad_norm": 0.45217686891555786,
"learning_rate": 9.833573688304117e-06,
"loss": 0.0052,
"step": 1162
},
{
"epoch": 0.4474367606040204,
"grad_norm": 1.1429780721664429,
"learning_rate": 9.833261874709794e-06,
"loss": 0.0444,
"step": 1163
},
{
"epoch": 0.44782148696739443,
"grad_norm": 0.1226806491613388,
"learning_rate": 9.832949774237385e-06,
"loss": 0.0014,
"step": 1164
},
{
"epoch": 0.4482062133307685,
"grad_norm": 1.016471266746521,
"learning_rate": 9.832637386905413e-06,
"loss": 0.0053,
"step": 1165
},
{
"epoch": 0.44859093969414254,
"grad_norm": 0.697564959526062,
"learning_rate": 9.832324712732419e-06,
"loss": 0.0151,
"step": 1166
},
{
"epoch": 0.4489756660575166,
"grad_norm": 0.4925800561904907,
"learning_rate": 9.832011751736965e-06,
"loss": 0.0059,
"step": 1167
},
{
"epoch": 0.44936039242089065,
"grad_norm": 0.31300026178359985,
"learning_rate": 9.831698503937623e-06,
"loss": 0.003,
"step": 1168
},
{
"epoch": 0.4497451187842647,
"grad_norm": 1.6890839338302612,
"learning_rate": 9.831384969352985e-06,
"loss": 0.0255,
"step": 1169
},
{
"epoch": 0.45012984514763876,
"grad_norm": 0.5058892965316772,
"learning_rate": 9.831071148001668e-06,
"loss": 0.0036,
"step": 1170
},
{
"epoch": 0.4505145715110128,
"grad_norm": 0.48593196272850037,
"learning_rate": 9.83075703990229e-06,
"loss": 0.0039,
"step": 1171
},
{
"epoch": 0.4508992978743868,
"grad_norm": 0.8874308466911316,
"learning_rate": 9.8304426450735e-06,
"loss": 0.015,
"step": 1172
},
{
"epoch": 0.4512840242377609,
"grad_norm": 0.7723665237426758,
"learning_rate": 9.83012796353396e-06,
"loss": 0.006,
"step": 1173
},
{
"epoch": 0.4516687506011349,
"grad_norm": 2.9279704093933105,
"learning_rate": 9.829812995302344e-06,
"loss": 0.0163,
"step": 1174
},
{
"epoch": 0.452053476964509,
"grad_norm": 0.4482285976409912,
"learning_rate": 9.829497740397349e-06,
"loss": 0.0053,
"step": 1175
},
{
"epoch": 0.45243820332788304,
"grad_norm": 0.8719068765640259,
"learning_rate": 9.829182198837686e-06,
"loss": 0.0078,
"step": 1176
},
{
"epoch": 0.4528229296912571,
"grad_norm": 0.6514776349067688,
"learning_rate": 9.828866370642086e-06,
"loss": 0.0116,
"step": 1177
},
{
"epoch": 0.45320765605463115,
"grad_norm": 1.1655035018920898,
"learning_rate": 9.828550255829291e-06,
"loss": 0.013,
"step": 1178
},
{
"epoch": 0.4535923824180052,
"grad_norm": 0.6634146571159363,
"learning_rate": 9.828233854418067e-06,
"loss": 0.0074,
"step": 1179
},
{
"epoch": 0.45397710878137926,
"grad_norm": 1.4584119319915771,
"learning_rate": 9.827917166427196e-06,
"loss": 0.0141,
"step": 1180
},
{
"epoch": 0.4543618351447533,
"grad_norm": 2.0052123069763184,
"learning_rate": 9.82760019187547e-06,
"loss": 0.0076,
"step": 1181
},
{
"epoch": 0.45474656150812737,
"grad_norm": 0.17516352236270905,
"learning_rate": 9.827282930781706e-06,
"loss": 0.0012,
"step": 1182
},
{
"epoch": 0.4551312878715014,
"grad_norm": 0.6905176043510437,
"learning_rate": 9.826965383164736e-06,
"loss": 0.0053,
"step": 1183
},
{
"epoch": 0.4555160142348754,
"grad_norm": 1.4227741956710815,
"learning_rate": 9.826647549043404e-06,
"loss": 0.0105,
"step": 1184
},
{
"epoch": 0.4559007405982495,
"grad_norm": 0.27875468134880066,
"learning_rate": 9.82632942843658e-06,
"loss": 0.002,
"step": 1185
},
{
"epoch": 0.45628546696162353,
"grad_norm": 0.11191385239362717,
"learning_rate": 9.826011021363142e-06,
"loss": 0.001,
"step": 1186
},
{
"epoch": 0.4566701933249976,
"grad_norm": 1.0875827074050903,
"learning_rate": 9.825692327841991e-06,
"loss": 0.0186,
"step": 1187
},
{
"epoch": 0.45705491968837164,
"grad_norm": 1.3509730100631714,
"learning_rate": 9.825373347892044e-06,
"loss": 0.0168,
"step": 1188
},
{
"epoch": 0.4574396460517457,
"grad_norm": 0.4952923357486725,
"learning_rate": 9.825054081532233e-06,
"loss": 0.0044,
"step": 1189
},
{
"epoch": 0.45782437241511975,
"grad_norm": 0.09577671438455582,
"learning_rate": 9.824734528781506e-06,
"loss": 0.0014,
"step": 1190
},
{
"epoch": 0.4582090987784938,
"grad_norm": 1.6081347465515137,
"learning_rate": 9.824414689658832e-06,
"loss": 0.0132,
"step": 1191
},
{
"epoch": 0.45859382514186786,
"grad_norm": 1.3940715789794922,
"learning_rate": 9.824094564183194e-06,
"loss": 0.0075,
"step": 1192
},
{
"epoch": 0.4589785515052419,
"grad_norm": 0.5964284539222717,
"learning_rate": 9.823774152373597e-06,
"loss": 0.0093,
"step": 1193
},
{
"epoch": 0.45936327786861597,
"grad_norm": 1.5253177881240845,
"learning_rate": 9.823453454249055e-06,
"loss": 0.0132,
"step": 1194
},
{
"epoch": 0.45974800423198997,
"grad_norm": 1.2830257415771484,
"learning_rate": 9.823132469828603e-06,
"loss": 0.0131,
"step": 1195
},
{
"epoch": 0.460132730595364,
"grad_norm": 0.6056346893310547,
"learning_rate": 9.822811199131293e-06,
"loss": 0.0123,
"step": 1196
},
{
"epoch": 0.4605174569587381,
"grad_norm": 0.48189786076545715,
"learning_rate": 9.822489642176195e-06,
"loss": 0.0078,
"step": 1197
},
{
"epoch": 0.46090218332211214,
"grad_norm": 0.28781089186668396,
"learning_rate": 9.822167798982398e-06,
"loss": 0.0061,
"step": 1198
},
{
"epoch": 0.4612869096854862,
"grad_norm": 0.8235536217689514,
"learning_rate": 9.821845669569e-06,
"loss": 0.0105,
"step": 1199
},
{
"epoch": 0.46167163604886025,
"grad_norm": 0.42384400963783264,
"learning_rate": 9.821523253955123e-06,
"loss": 0.0117,
"step": 1200
},
{
"epoch": 0.4620563624122343,
"grad_norm": 0.3422315716743469,
"learning_rate": 9.821200552159906e-06,
"loss": 0.0085,
"step": 1201
},
{
"epoch": 0.46244108877560836,
"grad_norm": 0.5151508450508118,
"learning_rate": 9.820877564202498e-06,
"loss": 0.0067,
"step": 1202
},
{
"epoch": 0.4628258151389824,
"grad_norm": 0.6236201524734497,
"learning_rate": 9.820554290102074e-06,
"loss": 0.0079,
"step": 1203
},
{
"epoch": 0.46321054150235647,
"grad_norm": 0.8919386267662048,
"learning_rate": 9.82023072987782e-06,
"loss": 0.0066,
"step": 1204
},
{
"epoch": 0.4635952678657305,
"grad_norm": 1.0158860683441162,
"learning_rate": 9.819906883548943e-06,
"loss": 0.0149,
"step": 1205
},
{
"epoch": 0.4639799942291046,
"grad_norm": 0.2793009579181671,
"learning_rate": 9.819582751134663e-06,
"loss": 0.0022,
"step": 1206
},
{
"epoch": 0.4643647205924786,
"grad_norm": 0.1821567565202713,
"learning_rate": 9.81925833265422e-06,
"loss": 0.0026,
"step": 1207
},
{
"epoch": 0.46474944695585263,
"grad_norm": 0.7648979425430298,
"learning_rate": 9.818933628126867e-06,
"loss": 0.0064,
"step": 1208
},
{
"epoch": 0.4651341733192267,
"grad_norm": 0.1448449194431305,
"learning_rate": 9.818608637571882e-06,
"loss": 0.0022,
"step": 1209
},
{
"epoch": 0.46551889968260074,
"grad_norm": 5.033487796783447,
"learning_rate": 9.81828336100855e-06,
"loss": 0.0216,
"step": 1210
},
{
"epoch": 0.4659036260459748,
"grad_norm": 1.1414070129394531,
"learning_rate": 9.817957798456181e-06,
"loss": 0.011,
"step": 1211
},
{
"epoch": 0.46628835240934885,
"grad_norm": 3.526010513305664,
"learning_rate": 9.817631949934096e-06,
"loss": 0.0065,
"step": 1212
},
{
"epoch": 0.4666730787727229,
"grad_norm": 0.4803553819656372,
"learning_rate": 9.81730581546164e-06,
"loss": 0.0084,
"step": 1213
},
{
"epoch": 0.46705780513609696,
"grad_norm": 0.8895028233528137,
"learning_rate": 9.816979395058164e-06,
"loss": 0.0048,
"step": 1214
},
{
"epoch": 0.467442531499471,
"grad_norm": 0.428845077753067,
"learning_rate": 9.81665268874305e-06,
"loss": 0.0015,
"step": 1215
},
{
"epoch": 0.46782725786284507,
"grad_norm": 13.160736083984375,
"learning_rate": 9.816325696535684e-06,
"loss": 0.0737,
"step": 1216
},
{
"epoch": 0.4682119842262191,
"grad_norm": 1.7152873277664185,
"learning_rate": 9.815998418455477e-06,
"loss": 0.0029,
"step": 1217
},
{
"epoch": 0.4685967105895932,
"grad_norm": 0.5329907536506653,
"learning_rate": 9.815670854521855e-06,
"loss": 0.0038,
"step": 1218
},
{
"epoch": 0.4689814369529672,
"grad_norm": 1.1879760026931763,
"learning_rate": 9.815343004754259e-06,
"loss": 0.0045,
"step": 1219
},
{
"epoch": 0.46936616331634123,
"grad_norm": 0.43010175228118896,
"learning_rate": 9.81501486917215e-06,
"loss": 0.0023,
"step": 1220
},
{
"epoch": 0.4697508896797153,
"grad_norm": 1.874674916267395,
"learning_rate": 9.814686447795004e-06,
"loss": 0.0243,
"step": 1221
},
{
"epoch": 0.47013561604308934,
"grad_norm": 1.4021985530853271,
"learning_rate": 9.814357740642314e-06,
"loss": 0.0359,
"step": 1222
},
{
"epoch": 0.4705203424064634,
"grad_norm": 0.5732538104057312,
"learning_rate": 9.81402874773359e-06,
"loss": 0.0143,
"step": 1223
},
{
"epoch": 0.47090506876983745,
"grad_norm": 0.4352494776248932,
"learning_rate": 9.813699469088362e-06,
"loss": 0.0037,
"step": 1224
},
{
"epoch": 0.4712897951332115,
"grad_norm": 0.9380084276199341,
"learning_rate": 9.81336990472617e-06,
"loss": 0.0078,
"step": 1225
},
{
"epoch": 0.47167452149658556,
"grad_norm": 1.202060580253601,
"learning_rate": 9.81304005466658e-06,
"loss": 0.0107,
"step": 1226
},
{
"epoch": 0.4720592478599596,
"grad_norm": 1.4367294311523438,
"learning_rate": 9.812709918929168e-06,
"loss": 0.0051,
"step": 1227
},
{
"epoch": 0.4724439742233337,
"grad_norm": 1.4543755054473877,
"learning_rate": 9.812379497533528e-06,
"loss": 0.0135,
"step": 1228
},
{
"epoch": 0.47282870058670773,
"grad_norm": 2.6267175674438477,
"learning_rate": 9.812048790499273e-06,
"loss": 0.0133,
"step": 1229
},
{
"epoch": 0.47321342695008173,
"grad_norm": 2.746267795562744,
"learning_rate": 9.811717797846035e-06,
"loss": 0.0191,
"step": 1230
},
{
"epoch": 0.4735981533134558,
"grad_norm": 3.3681936264038086,
"learning_rate": 9.811386519593455e-06,
"loss": 0.0129,
"step": 1231
},
{
"epoch": 0.47398287967682984,
"grad_norm": 1.2645982503890991,
"learning_rate": 9.811054955761199e-06,
"loss": 0.0094,
"step": 1232
},
{
"epoch": 0.4743676060402039,
"grad_norm": 1.5508323907852173,
"learning_rate": 9.810723106368946e-06,
"loss": 0.0171,
"step": 1233
},
{
"epoch": 0.47475233240357795,
"grad_norm": 0.6971696615219116,
"learning_rate": 9.810390971436393e-06,
"loss": 0.0053,
"step": 1234
},
{
"epoch": 0.475137058766952,
"grad_norm": 0.4388929009437561,
"learning_rate": 9.810058550983255e-06,
"loss": 0.005,
"step": 1235
},
{
"epoch": 0.47552178513032606,
"grad_norm": 1.5642904043197632,
"learning_rate": 9.809725845029262e-06,
"loss": 0.0123,
"step": 1236
},
{
"epoch": 0.4759065114937001,
"grad_norm": 0.6748471260070801,
"learning_rate": 9.809392853594162e-06,
"loss": 0.0049,
"step": 1237
},
{
"epoch": 0.47629123785707417,
"grad_norm": 1.3276309967041016,
"learning_rate": 9.809059576697719e-06,
"loss": 0.0311,
"step": 1238
},
{
"epoch": 0.4766759642204482,
"grad_norm": 4.248703479766846,
"learning_rate": 9.808726014359715e-06,
"loss": 0.0061,
"step": 1239
},
{
"epoch": 0.4770606905838223,
"grad_norm": 1.4735746383666992,
"learning_rate": 9.808392166599948e-06,
"loss": 0.0173,
"step": 1240
},
{
"epoch": 0.47744541694719633,
"grad_norm": 1.083099603652954,
"learning_rate": 9.808058033438235e-06,
"loss": 0.0171,
"step": 1241
},
{
"epoch": 0.47783014331057033,
"grad_norm": 0.7577223777770996,
"learning_rate": 9.807723614894407e-06,
"loss": 0.0066,
"step": 1242
},
{
"epoch": 0.4782148696739444,
"grad_norm": 0.8862943053245544,
"learning_rate": 9.807388910988316e-06,
"loss": 0.0111,
"step": 1243
},
{
"epoch": 0.47859959603731844,
"grad_norm": 0.418156236410141,
"learning_rate": 9.807053921739825e-06,
"loss": 0.0041,
"step": 1244
},
{
"epoch": 0.4789843224006925,
"grad_norm": 0.23023249208927155,
"learning_rate": 9.806718647168818e-06,
"loss": 0.0035,
"step": 1245
},
{
"epoch": 0.47936904876406655,
"grad_norm": 0.7867752909660339,
"learning_rate": 9.806383087295197e-06,
"loss": 0.0177,
"step": 1246
},
{
"epoch": 0.4797537751274406,
"grad_norm": 0.9785721302032471,
"learning_rate": 9.806047242138877e-06,
"loss": 0.0087,
"step": 1247
},
{
"epoch": 0.48013850149081466,
"grad_norm": 0.673989474773407,
"learning_rate": 9.805711111719794e-06,
"loss": 0.0146,
"step": 1248
},
{
"epoch": 0.4805232278541887,
"grad_norm": 0.9172596335411072,
"learning_rate": 9.805374696057896e-06,
"loss": 0.0097,
"step": 1249
},
{
"epoch": 0.48090795421756277,
"grad_norm": 0.4231598377227783,
"learning_rate": 9.805037995173156e-06,
"loss": 0.0043,
"step": 1250
},
{
"epoch": 0.4812926805809368,
"grad_norm": 1.4501237869262695,
"learning_rate": 9.804701009085554e-06,
"loss": 0.0126,
"step": 1251
},
{
"epoch": 0.4816774069443109,
"grad_norm": 0.2911331355571747,
"learning_rate": 9.804363737815095e-06,
"loss": 0.0038,
"step": 1252
},
{
"epoch": 0.48206213330768494,
"grad_norm": 1.0575670003890991,
"learning_rate": 9.804026181381796e-06,
"loss": 0.0202,
"step": 1253
},
{
"epoch": 0.48244685967105894,
"grad_norm": 0.6805642247200012,
"learning_rate": 9.803688339805693e-06,
"loss": 0.0041,
"step": 1254
},
{
"epoch": 0.482831586034433,
"grad_norm": 0.6432686448097229,
"learning_rate": 9.803350213106837e-06,
"loss": 0.0074,
"step": 1255
},
{
"epoch": 0.48321631239780705,
"grad_norm": 0.19403788447380066,
"learning_rate": 9.8030118013053e-06,
"loss": 0.0015,
"step": 1256
},
{
"epoch": 0.4836010387611811,
"grad_norm": 0.8219681978225708,
"learning_rate": 9.80267310442117e-06,
"loss": 0.0182,
"step": 1257
},
{
"epoch": 0.48398576512455516,
"grad_norm": 0.2699289321899414,
"learning_rate": 9.802334122474544e-06,
"loss": 0.0024,
"step": 1258
},
{
"epoch": 0.4843704914879292,
"grad_norm": 0.44587215781211853,
"learning_rate": 9.801994855485549e-06,
"loss": 0.0026,
"step": 1259
},
{
"epoch": 0.48475521785130327,
"grad_norm": 4.586838245391846,
"learning_rate": 9.801655303474319e-06,
"loss": 0.0091,
"step": 1260
},
{
"epoch": 0.4851399442146773,
"grad_norm": 1.5810905694961548,
"learning_rate": 9.801315466461008e-06,
"loss": 0.013,
"step": 1261
},
{
"epoch": 0.4855246705780514,
"grad_norm": 1.4810349941253662,
"learning_rate": 9.800975344465787e-06,
"loss": 0.0247,
"step": 1262
},
{
"epoch": 0.48590939694142543,
"grad_norm": 0.39250168204307556,
"learning_rate": 9.800634937508846e-06,
"loss": 0.0024,
"step": 1263
},
{
"epoch": 0.4862941233047995,
"grad_norm": 0.8914512395858765,
"learning_rate": 9.800294245610387e-06,
"loss": 0.0248,
"step": 1264
},
{
"epoch": 0.4866788496681735,
"grad_norm": 0.3135848641395569,
"learning_rate": 9.799953268790633e-06,
"loss": 0.0024,
"step": 1265
},
{
"epoch": 0.48706357603154754,
"grad_norm": 2.137585163116455,
"learning_rate": 9.799612007069823e-06,
"loss": 0.012,
"step": 1266
},
{
"epoch": 0.4874483023949216,
"grad_norm": 1.12441086769104,
"learning_rate": 9.79927046046821e-06,
"loss": 0.0097,
"step": 1267
},
{
"epoch": 0.48783302875829565,
"grad_norm": 0.8531357645988464,
"learning_rate": 9.798928629006072e-06,
"loss": 0.0139,
"step": 1268
},
{
"epoch": 0.4882177551216697,
"grad_norm": 0.7545097470283508,
"learning_rate": 9.798586512703695e-06,
"loss": 0.0027,
"step": 1269
},
{
"epoch": 0.48860248148504376,
"grad_norm": 2.2054431438446045,
"learning_rate": 9.798244111581382e-06,
"loss": 0.0124,
"step": 1270
},
{
"epoch": 0.4889872078484178,
"grad_norm": 1.043981671333313,
"learning_rate": 9.797901425659463e-06,
"loss": 0.0118,
"step": 1271
},
{
"epoch": 0.48937193421179187,
"grad_norm": 0.8491412401199341,
"learning_rate": 9.797558454958273e-06,
"loss": 0.0054,
"step": 1272
},
{
"epoch": 0.4897566605751659,
"grad_norm": 0.9706763625144958,
"learning_rate": 9.797215199498171e-06,
"loss": 0.0053,
"step": 1273
},
{
"epoch": 0.49014138693854,
"grad_norm": 3.596212148666382,
"learning_rate": 9.796871659299531e-06,
"loss": 0.0406,
"step": 1274
},
{
"epoch": 0.49052611330191404,
"grad_norm": 1.1573026180267334,
"learning_rate": 9.796527834382745e-06,
"loss": 0.0057,
"step": 1275
},
{
"epoch": 0.4909108396652881,
"grad_norm": 5.570591449737549,
"learning_rate": 9.796183724768218e-06,
"loss": 0.015,
"step": 1276
},
{
"epoch": 0.4912955660286621,
"grad_norm": 3.9478743076324463,
"learning_rate": 9.795839330476376e-06,
"loss": 0.0391,
"step": 1277
},
{
"epoch": 0.49168029239203614,
"grad_norm": 2.5740485191345215,
"learning_rate": 9.795494651527658e-06,
"loss": 0.0173,
"step": 1278
},
{
"epoch": 0.4920650187554102,
"grad_norm": 2.615067958831787,
"learning_rate": 9.795149687942527e-06,
"loss": 0.0221,
"step": 1279
},
{
"epoch": 0.49244974511878425,
"grad_norm": 0.9380441308021545,
"learning_rate": 9.794804439741455e-06,
"loss": 0.0072,
"step": 1280
},
{
"epoch": 0.4928344714821583,
"grad_norm": 1.8455308675765991,
"learning_rate": 9.794458906944937e-06,
"loss": 0.0049,
"step": 1281
},
{
"epoch": 0.49321919784553236,
"grad_norm": 0.48025840520858765,
"learning_rate": 9.794113089573479e-06,
"loss": 0.003,
"step": 1282
},
{
"epoch": 0.4936039242089064,
"grad_norm": 0.4569655656814575,
"learning_rate": 9.793766987647607e-06,
"loss": 0.0064,
"step": 1283
},
{
"epoch": 0.4939886505722805,
"grad_norm": 0.36443641781806946,
"learning_rate": 9.793420601187867e-06,
"loss": 0.0026,
"step": 1284
},
{
"epoch": 0.49437337693565453,
"grad_norm": 0.8915921449661255,
"learning_rate": 9.793073930214817e-06,
"loss": 0.0093,
"step": 1285
},
{
"epoch": 0.4947581032990286,
"grad_norm": 1.146666169166565,
"learning_rate": 9.792726974749032e-06,
"loss": 0.0083,
"step": 1286
},
{
"epoch": 0.49514282966240264,
"grad_norm": 0.8065195679664612,
"learning_rate": 9.792379734811108e-06,
"loss": 0.0099,
"step": 1287
},
{
"epoch": 0.49552755602577664,
"grad_norm": 0.5037975907325745,
"learning_rate": 9.792032210421656e-06,
"loss": 0.0117,
"step": 1288
},
{
"epoch": 0.4959122823891507,
"grad_norm": 1.7066353559494019,
"learning_rate": 9.7916844016013e-06,
"loss": 0.0153,
"step": 1289
},
{
"epoch": 0.49629700875252475,
"grad_norm": 0.36341947317123413,
"learning_rate": 9.791336308370687e-06,
"loss": 0.0019,
"step": 1290
},
{
"epoch": 0.4966817351158988,
"grad_norm": 2.803046464920044,
"learning_rate": 9.790987930750475e-06,
"loss": 0.009,
"step": 1291
},
{
"epoch": 0.49706646147927286,
"grad_norm": 0.07424652576446533,
"learning_rate": 9.790639268761346e-06,
"loss": 0.0011,
"step": 1292
},
{
"epoch": 0.4974511878426469,
"grad_norm": 0.5972120761871338,
"learning_rate": 9.790290322423992e-06,
"loss": 0.0054,
"step": 1293
},
{
"epoch": 0.49783591420602097,
"grad_norm": 1.204017996788025,
"learning_rate": 9.789941091759125e-06,
"loss": 0.0055,
"step": 1294
},
{
"epoch": 0.498220640569395,
"grad_norm": 0.9379375576972961,
"learning_rate": 9.789591576787476e-06,
"loss": 0.0036,
"step": 1295
},
{
"epoch": 0.4986053669327691,
"grad_norm": 0.4606468975543976,
"learning_rate": 9.789241777529787e-06,
"loss": 0.0057,
"step": 1296
},
{
"epoch": 0.49899009329614313,
"grad_norm": 1.5546983480453491,
"learning_rate": 9.78889169400682e-06,
"loss": 0.0143,
"step": 1297
},
{
"epoch": 0.4993748196595172,
"grad_norm": 4.400365352630615,
"learning_rate": 9.788541326239361e-06,
"loss": 0.0086,
"step": 1298
},
{
"epoch": 0.49975954602289124,
"grad_norm": 0.3501478135585785,
"learning_rate": 9.788190674248197e-06,
"loss": 0.0006,
"step": 1299
},
{
"epoch": 0.5001442723862652,
"grad_norm": 0.5171528458595276,
"learning_rate": 9.787839738054147e-06,
"loss": 0.0045,
"step": 1300
},
{
"epoch": 0.5001442723862652,
"eval_loss": 0.015632135793566704,
"eval_runtime": 232.3635,
"eval_samples_per_second": 0.826,
"eval_steps_per_second": 0.413,
"step": 1300
}
],
"logging_steps": 1,
"max_steps": 12995,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 1300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.9457475174465536e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}