{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 200,
"global_step": 950,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002105263157894737,
"grad_norm": 6.217544016659503,
"learning_rate": 9.999972660400536e-06,
"loss": 0.3989,
"step": 1
},
{
"epoch": 0.004210526315789474,
"grad_norm": 4.110718040113299,
"learning_rate": 9.999890641901124e-06,
"loss": 0.3669,
"step": 2
},
{
"epoch": 0.00631578947368421,
"grad_norm": 4.338497302088012,
"learning_rate": 9.999753945398704e-06,
"loss": 0.312,
"step": 3
},
{
"epoch": 0.008421052631578947,
"grad_norm": 5.282542189044485,
"learning_rate": 9.99956257238817e-06,
"loss": 0.3437,
"step": 4
},
{
"epoch": 0.010526315789473684,
"grad_norm": 3.834449844048163,
"learning_rate": 9.999316524962347e-06,
"loss": 0.2323,
"step": 5
},
{
"epoch": 0.01263157894736842,
"grad_norm": 3.8748984755674143,
"learning_rate": 9.999015805811965e-06,
"loss": 0.3381,
"step": 6
},
{
"epoch": 0.014736842105263158,
"grad_norm": 3.0258910678432107,
"learning_rate": 9.998660418225645e-06,
"loss": 0.2191,
"step": 7
},
{
"epoch": 0.016842105263157894,
"grad_norm": 3.569909197687586,
"learning_rate": 9.998250366089848e-06,
"loss": 0.2458,
"step": 8
},
{
"epoch": 0.018947368421052633,
"grad_norm": 5.072654329244835,
"learning_rate": 9.997785653888835e-06,
"loss": 0.308,
"step": 9
},
{
"epoch": 0.021052631578947368,
"grad_norm": 3.23335292769489,
"learning_rate": 9.99726628670463e-06,
"loss": 0.2266,
"step": 10
},
{
"epoch": 0.023157894736842106,
"grad_norm": 3.287012798491171,
"learning_rate": 9.996692270216946e-06,
"loss": 0.2592,
"step": 11
},
{
"epoch": 0.02526315789473684,
"grad_norm": 4.0856720065160745,
"learning_rate": 9.996063610703138e-06,
"loss": 0.2674,
"step": 12
},
{
"epoch": 0.02736842105263158,
"grad_norm": 3.454246688370746,
"learning_rate": 9.995380315038119e-06,
"loss": 0.2565,
"step": 13
},
{
"epoch": 0.029473684210526315,
"grad_norm": 3.2274320728380066,
"learning_rate": 9.994642390694308e-06,
"loss": 0.1976,
"step": 14
},
{
"epoch": 0.031578947368421054,
"grad_norm": 4.260565705891167,
"learning_rate": 9.993849845741525e-06,
"loss": 0.245,
"step": 15
},
{
"epoch": 0.03368421052631579,
"grad_norm": 3.4341257739977102,
"learning_rate": 9.993002688846913e-06,
"loss": 0.2833,
"step": 16
},
{
"epoch": 0.035789473684210524,
"grad_norm": 3.62754868119338,
"learning_rate": 9.992100929274848e-06,
"loss": 0.2459,
"step": 17
},
{
"epoch": 0.037894736842105266,
"grad_norm": 3.3599215508623788,
"learning_rate": 9.991144576886824e-06,
"loss": 0.2597,
"step": 18
},
{
"epoch": 0.04,
"grad_norm": 2.759190741252958,
"learning_rate": 9.990133642141359e-06,
"loss": 0.2007,
"step": 19
},
{
"epoch": 0.042105263157894736,
"grad_norm": 3.1433858959494656,
"learning_rate": 9.989068136093873e-06,
"loss": 0.2126,
"step": 20
},
{
"epoch": 0.04421052631578947,
"grad_norm": 4.780930240687513,
"learning_rate": 9.987948070396572e-06,
"loss": 0.2567,
"step": 21
},
{
"epoch": 0.04631578947368421,
"grad_norm": 3.1361116248862704,
"learning_rate": 9.986773457298311e-06,
"loss": 0.218,
"step": 22
},
{
"epoch": 0.04842105263157895,
"grad_norm": 4.660863559826474,
"learning_rate": 9.985544309644474e-06,
"loss": 0.3171,
"step": 23
},
{
"epoch": 0.05052631578947368,
"grad_norm": 3.83833154370056,
"learning_rate": 9.984260640876821e-06,
"loss": 0.2453,
"step": 24
},
{
"epoch": 0.05263157894736842,
"grad_norm": 4.394031981209525,
"learning_rate": 9.98292246503335e-06,
"loss": 0.3022,
"step": 25
},
{
"epoch": 0.05473684210526316,
"grad_norm": 3.815312734391198,
"learning_rate": 9.981529796748135e-06,
"loss": 0.2566,
"step": 26
},
{
"epoch": 0.056842105263157895,
"grad_norm": 3.751567223965873,
"learning_rate": 9.980082651251175e-06,
"loss": 0.2057,
"step": 27
},
{
"epoch": 0.05894736842105263,
"grad_norm": 3.5703892124307886,
"learning_rate": 9.97858104436822e-06,
"loss": 0.2611,
"step": 28
},
{
"epoch": 0.061052631578947365,
"grad_norm": 4.882927650965578,
"learning_rate": 9.977024992520604e-06,
"loss": 0.2957,
"step": 29
},
{
"epoch": 0.06315789473684211,
"grad_norm": 3.5773603919322614,
"learning_rate": 9.975414512725058e-06,
"loss": 0.2483,
"step": 30
},
{
"epoch": 0.06526315789473684,
"grad_norm": 3.19691324527535,
"learning_rate": 9.973749622593534e-06,
"loss": 0.199,
"step": 31
},
{
"epoch": 0.06736842105263158,
"grad_norm": 3.285608727213878,
"learning_rate": 9.972030340333e-06,
"loss": 0.2476,
"step": 32
},
{
"epoch": 0.06947368421052631,
"grad_norm": 3.9448084183142202,
"learning_rate": 9.970256684745258e-06,
"loss": 0.2584,
"step": 33
},
{
"epoch": 0.07157894736842105,
"grad_norm": 3.093609102136492,
"learning_rate": 9.968428675226714e-06,
"loss": 0.1965,
"step": 34
},
{
"epoch": 0.07368421052631578,
"grad_norm": 2.866766273599304,
"learning_rate": 9.966546331768192e-06,
"loss": 0.235,
"step": 35
},
{
"epoch": 0.07578947368421053,
"grad_norm": 3.1680983150404862,
"learning_rate": 9.964609674954696e-06,
"loss": 0.2179,
"step": 36
},
{
"epoch": 0.07789473684210527,
"grad_norm": 2.531925390508716,
"learning_rate": 9.962618725965196e-06,
"loss": 0.1638,
"step": 37
},
{
"epoch": 0.08,
"grad_norm": 3.479913242409686,
"learning_rate": 9.960573506572391e-06,
"loss": 0.2607,
"step": 38
},
{
"epoch": 0.08210526315789474,
"grad_norm": 3.453241583441747,
"learning_rate": 9.95847403914247e-06,
"loss": 0.2846,
"step": 39
},
{
"epoch": 0.08421052631578947,
"grad_norm": 3.7499588076121415,
"learning_rate": 9.956320346634877e-06,
"loss": 0.2474,
"step": 40
},
{
"epoch": 0.0863157894736842,
"grad_norm": 3.7513628477476626,
"learning_rate": 9.954112452602045e-06,
"loss": 0.257,
"step": 41
},
{
"epoch": 0.08842105263157894,
"grad_norm": 3.249974357345021,
"learning_rate": 9.951850381189152e-06,
"loss": 0.2342,
"step": 42
},
{
"epoch": 0.09052631578947369,
"grad_norm": 3.5256183668310053,
"learning_rate": 9.949534157133844e-06,
"loss": 0.254,
"step": 43
},
{
"epoch": 0.09263157894736843,
"grad_norm": 3.0367239176760554,
"learning_rate": 9.94716380576598e-06,
"loss": 0.2181,
"step": 44
},
{
"epoch": 0.09473684210526316,
"grad_norm": 2.680983187953939,
"learning_rate": 9.944739353007344e-06,
"loss": 0.2092,
"step": 45
},
{
"epoch": 0.0968421052631579,
"grad_norm": 3.172342567980907,
"learning_rate": 9.942260825371359e-06,
"loss": 0.1665,
"step": 46
},
{
"epoch": 0.09894736842105263,
"grad_norm": 3.683978184159089,
"learning_rate": 9.939728249962808e-06,
"loss": 0.2671,
"step": 47
},
{
"epoch": 0.10105263157894737,
"grad_norm": 3.6089008339524664,
"learning_rate": 9.937141654477529e-06,
"loss": 0.2557,
"step": 48
},
{
"epoch": 0.1031578947368421,
"grad_norm": 3.1128433475033224,
"learning_rate": 9.934501067202117e-06,
"loss": 0.2298,
"step": 49
},
{
"epoch": 0.10526315789473684,
"grad_norm": 3.837339505212247,
"learning_rate": 9.931806517013612e-06,
"loss": 0.2121,
"step": 50
},
{
"epoch": 0.10736842105263159,
"grad_norm": 3.1188404843719986,
"learning_rate": 9.929058033379181e-06,
"loss": 0.2449,
"step": 51
},
{
"epoch": 0.10947368421052632,
"grad_norm": 3.499970578608811,
"learning_rate": 9.926255646355804e-06,
"loss": 0.2362,
"step": 52
},
{
"epoch": 0.11157894736842106,
"grad_norm": 3.8814562389117127,
"learning_rate": 9.923399386589933e-06,
"loss": 0.2403,
"step": 53
},
{
"epoch": 0.11368421052631579,
"grad_norm": 3.887530063847657,
"learning_rate": 9.920489285317169e-06,
"loss": 0.2276,
"step": 54
},
{
"epoch": 0.11578947368421053,
"grad_norm": 3.6293508472876455,
"learning_rate": 9.917525374361913e-06,
"loss": 0.2577,
"step": 55
},
{
"epoch": 0.11789473684210526,
"grad_norm": 3.865568740211283,
"learning_rate": 9.91450768613702e-06,
"loss": 0.2416,
"step": 56
},
{
"epoch": 0.12,
"grad_norm": 3.7137281964397095,
"learning_rate": 9.911436253643445e-06,
"loss": 0.2411,
"step": 57
},
{
"epoch": 0.12210526315789473,
"grad_norm": 3.6900754543193153,
"learning_rate": 9.908311110469881e-06,
"loss": 0.267,
"step": 58
},
{
"epoch": 0.12421052631578948,
"grad_norm": 3.4314998812484165,
"learning_rate": 9.905132290792395e-06,
"loss": 0.2415,
"step": 59
},
{
"epoch": 0.12631578947368421,
"grad_norm": 3.5441260133809154,
"learning_rate": 9.901899829374048e-06,
"loss": 0.2649,
"step": 60
},
{
"epoch": 0.12842105263157894,
"grad_norm": 2.969279579151304,
"learning_rate": 9.89861376156452e-06,
"loss": 0.2342,
"step": 61
},
{
"epoch": 0.13052631578947368,
"grad_norm": 3.0270242630571644,
"learning_rate": 9.895274123299724e-06,
"loss": 0.215,
"step": 62
},
{
"epoch": 0.13263157894736843,
"grad_norm": 3.2468251690158945,
"learning_rate": 9.891880951101407e-06,
"loss": 0.2645,
"step": 63
},
{
"epoch": 0.13473684210526315,
"grad_norm": 3.7405952840185255,
"learning_rate": 9.888434282076759e-06,
"loss": 0.224,
"step": 64
},
{
"epoch": 0.1368421052631579,
"grad_norm": 3.7386890942637736,
"learning_rate": 9.884934153917998e-06,
"loss": 0.2338,
"step": 65
},
{
"epoch": 0.13894736842105262,
"grad_norm": 3.6559224501234655,
"learning_rate": 9.881380604901964e-06,
"loss": 0.2674,
"step": 66
},
{
"epoch": 0.14105263157894737,
"grad_norm": 3.2126708403650723,
"learning_rate": 9.877773673889702e-06,
"loss": 0.257,
"step": 67
},
{
"epoch": 0.1431578947368421,
"grad_norm": 3.3544140835288387,
"learning_rate": 9.874113400326031e-06,
"loss": 0.2644,
"step": 68
},
{
"epoch": 0.14526315789473684,
"grad_norm": 3.2197549480894305,
"learning_rate": 9.870399824239116e-06,
"loss": 0.2337,
"step": 69
},
{
"epoch": 0.14736842105263157,
"grad_norm": 3.0956092032854787,
"learning_rate": 9.86663298624003e-06,
"loss": 0.2018,
"step": 70
},
{
"epoch": 0.14947368421052631,
"grad_norm": 2.728914245630825,
"learning_rate": 9.86281292752231e-06,
"loss": 0.1902,
"step": 71
},
{
"epoch": 0.15157894736842106,
"grad_norm": 2.728563361219932,
"learning_rate": 9.858939689861506e-06,
"loss": 0.1998,
"step": 72
},
{
"epoch": 0.15368421052631578,
"grad_norm": 3.6065260596056428,
"learning_rate": 9.855013315614725e-06,
"loss": 0.2589,
"step": 73
},
{
"epoch": 0.15578947368421053,
"grad_norm": 3.9437466543223016,
"learning_rate": 9.851033847720167e-06,
"loss": 0.2839,
"step": 74
},
{
"epoch": 0.15789473684210525,
"grad_norm": 2.6492053943266973,
"learning_rate": 9.847001329696653e-06,
"loss": 0.1926,
"step": 75
},
{
"epoch": 0.16,
"grad_norm": 3.4928074094432975,
"learning_rate": 9.842915805643156e-06,
"loss": 0.2567,
"step": 76
},
{
"epoch": 0.16210526315789472,
"grad_norm": 3.745761767248173,
"learning_rate": 9.838777320238312e-06,
"loss": 0.2473,
"step": 77
},
{
"epoch": 0.16421052631578947,
"grad_norm": 4.662473350343442,
"learning_rate": 9.834585918739936e-06,
"loss": 0.2534,
"step": 78
},
{
"epoch": 0.16631578947368422,
"grad_norm": 3.5985866535045092,
"learning_rate": 9.830341646984521e-06,
"loss": 0.2375,
"step": 79
},
{
"epoch": 0.16842105263157894,
"grad_norm": 3.3095318865144323,
"learning_rate": 9.826044551386743e-06,
"loss": 0.2179,
"step": 80
},
{
"epoch": 0.1705263157894737,
"grad_norm": 3.218832777420868,
"learning_rate": 9.821694678938954e-06,
"loss": 0.2245,
"step": 81
},
{
"epoch": 0.1726315789473684,
"grad_norm": 3.4749041260361326,
"learning_rate": 9.817292077210658e-06,
"loss": 0.2451,
"step": 82
},
{
"epoch": 0.17473684210526316,
"grad_norm": 3.6052413099966376,
"learning_rate": 9.812836794348005e-06,
"loss": 0.2132,
"step": 83
},
{
"epoch": 0.17684210526315788,
"grad_norm": 3.819893458132905,
"learning_rate": 9.808328879073251e-06,
"loss": 0.2518,
"step": 84
},
{
"epoch": 0.17894736842105263,
"grad_norm": 3.447449141237711,
"learning_rate": 9.803768380684242e-06,
"loss": 0.2832,
"step": 85
},
{
"epoch": 0.18105263157894738,
"grad_norm": 3.357478738557209,
"learning_rate": 9.79915534905385e-06,
"loss": 0.2568,
"step": 86
},
{
"epoch": 0.1831578947368421,
"grad_norm": 3.7920160087811476,
"learning_rate": 9.794489834629457e-06,
"loss": 0.263,
"step": 87
},
{
"epoch": 0.18526315789473685,
"grad_norm": 3.182104834724342,
"learning_rate": 9.789771888432375e-06,
"loss": 0.2245,
"step": 88
},
{
"epoch": 0.18736842105263157,
"grad_norm": 3.4674212312793813,
"learning_rate": 9.785001562057311e-06,
"loss": 0.2417,
"step": 89
},
{
"epoch": 0.18947368421052632,
"grad_norm": 4.117467872656145,
"learning_rate": 9.780178907671788e-06,
"loss": 0.2768,
"step": 90
},
{
"epoch": 0.19157894736842104,
"grad_norm": 3.631475929836605,
"learning_rate": 9.775303978015585e-06,
"loss": 0.2437,
"step": 91
},
{
"epoch": 0.1936842105263158,
"grad_norm": 3.3413603143822335,
"learning_rate": 9.77037682640015e-06,
"loss": 0.2642,
"step": 92
},
{
"epoch": 0.1957894736842105,
"grad_norm": 3.9842190799219876,
"learning_rate": 9.765397506708023e-06,
"loss": 0.3169,
"step": 93
},
{
"epoch": 0.19789473684210526,
"grad_norm": 3.7032684341350173,
"learning_rate": 9.760366073392246e-06,
"loss": 0.1791,
"step": 94
},
{
"epoch": 0.2,
"grad_norm": 3.987149618467848,
"learning_rate": 9.755282581475769e-06,
"loss": 0.3039,
"step": 95
},
{
"epoch": 0.20210526315789473,
"grad_norm": 3.2139873581817486,
"learning_rate": 9.750147086550843e-06,
"loss": 0.2504,
"step": 96
},
{
"epoch": 0.20421052631578948,
"grad_norm": 3.566561813353208,
"learning_rate": 9.744959644778422e-06,
"loss": 0.2863,
"step": 97
},
{
"epoch": 0.2063157894736842,
"grad_norm": 3.7268559206075946,
"learning_rate": 9.739720312887536e-06,
"loss": 0.2167,
"step": 98
},
{
"epoch": 0.20842105263157895,
"grad_norm": 2.9695827733722595,
"learning_rate": 9.734429148174676e-06,
"loss": 0.2393,
"step": 99
},
{
"epoch": 0.21052631578947367,
"grad_norm": 3.6108040436142823,
"learning_rate": 9.729086208503174e-06,
"loss": 0.295,
"step": 100
},
{
"epoch": 0.21263157894736842,
"grad_norm": 3.925500913610178,
"learning_rate": 9.723691552302563e-06,
"loss": 0.2467,
"step": 101
},
{
"epoch": 0.21473684210526317,
"grad_norm": 3.5695343047388666,
"learning_rate": 9.718245238567939e-06,
"loss": 0.2242,
"step": 102
},
{
"epoch": 0.2168421052631579,
"grad_norm": 3.3235918183280866,
"learning_rate": 9.712747326859316e-06,
"loss": 0.2278,
"step": 103
},
{
"epoch": 0.21894736842105264,
"grad_norm": 3.764042590013744,
"learning_rate": 9.707197877300974e-06,
"loss": 0.2921,
"step": 104
},
{
"epoch": 0.22105263157894736,
"grad_norm": 2.903454636343328,
"learning_rate": 9.701596950580807e-06,
"loss": 0.2165,
"step": 105
},
{
"epoch": 0.2231578947368421,
"grad_norm": 2.5462228191934124,
"learning_rate": 9.69594460794965e-06,
"loss": 0.1913,
"step": 106
},
{
"epoch": 0.22526315789473683,
"grad_norm": 3.048612041076824,
"learning_rate": 9.690240911220618e-06,
"loss": 0.1913,
"step": 107
},
{
"epoch": 0.22736842105263158,
"grad_norm": 2.7190276333100885,
"learning_rate": 9.684485922768422e-06,
"loss": 0.1846,
"step": 108
},
{
"epoch": 0.2294736842105263,
"grad_norm": 3.3279879332903164,
"learning_rate": 9.678679705528699e-06,
"loss": 0.2444,
"step": 109
},
{
"epoch": 0.23157894736842105,
"grad_norm": 3.086182493388614,
"learning_rate": 9.672822322997305e-06,
"loss": 0.1827,
"step": 110
},
{
"epoch": 0.2336842105263158,
"grad_norm": 3.0198656142842433,
"learning_rate": 9.666913839229639e-06,
"loss": 0.2064,
"step": 111
},
{
"epoch": 0.23578947368421052,
"grad_norm": 3.871643739742935,
"learning_rate": 9.660954318839934e-06,
"loss": 0.2537,
"step": 112
},
{
"epoch": 0.23789473684210527,
"grad_norm": 4.034332856853841,
"learning_rate": 9.654943827000548e-06,
"loss": 0.2499,
"step": 113
},
{
"epoch": 0.24,
"grad_norm": 3.7628273980242515,
"learning_rate": 9.648882429441258e-06,
"loss": 0.2557,
"step": 114
},
{
"epoch": 0.24210526315789474,
"grad_norm": 3.5786660920291493,
"learning_rate": 9.642770192448537e-06,
"loss": 0.2677,
"step": 115
},
{
"epoch": 0.24421052631578946,
"grad_norm": 4.532586938593248,
"learning_rate": 9.636607182864828e-06,
"loss": 0.2685,
"step": 116
},
{
"epoch": 0.2463157894736842,
"grad_norm": 3.0674072329356856,
"learning_rate": 9.630393468087818e-06,
"loss": 0.1846,
"step": 117
},
{
"epoch": 0.24842105263157896,
"grad_norm": 4.2865579808258945,
"learning_rate": 9.624129116069695e-06,
"loss": 0.342,
"step": 118
},
{
"epoch": 0.2505263157894737,
"grad_norm": 3.8921150967122156,
"learning_rate": 9.61781419531641e-06,
"loss": 0.2634,
"step": 119
},
{
"epoch": 0.25263157894736843,
"grad_norm": 3.3675053578108978,
"learning_rate": 9.611448774886925e-06,
"loss": 0.2273,
"step": 120
},
{
"epoch": 0.25473684210526315,
"grad_norm": 4.105187040991947,
"learning_rate": 9.605032924392457e-06,
"loss": 0.242,
"step": 121
},
{
"epoch": 0.25684210526315787,
"grad_norm": 3.2229116365485924,
"learning_rate": 9.598566713995718e-06,
"loss": 0.2471,
"step": 122
},
{
"epoch": 0.25894736842105265,
"grad_norm": 2.8700645053873126,
"learning_rate": 9.592050214410152e-06,
"loss": 0.2465,
"step": 123
},
{
"epoch": 0.26105263157894737,
"grad_norm": 3.6312759588775783,
"learning_rate": 9.585483496899151e-06,
"loss": 0.24,
"step": 124
},
{
"epoch": 0.2631578947368421,
"grad_norm": 2.9630698799183226,
"learning_rate": 9.578866633275289e-06,
"loss": 0.2054,
"step": 125
},
{
"epoch": 0.26526315789473687,
"grad_norm": 3.544581712241485,
"learning_rate": 9.572199695899522e-06,
"loss": 0.2314,
"step": 126
},
{
"epoch": 0.2673684210526316,
"grad_norm": 3.255776996164575,
"learning_rate": 9.565482757680415e-06,
"loss": 0.2785,
"step": 127
},
{
"epoch": 0.2694736842105263,
"grad_norm": 2.8952918607035363,
"learning_rate": 9.558715892073324e-06,
"loss": 0.2218,
"step": 128
},
{
"epoch": 0.27157894736842103,
"grad_norm": 3.4853221427011065,
"learning_rate": 9.551899173079607e-06,
"loss": 0.2862,
"step": 129
},
{
"epoch": 0.2736842105263158,
"grad_norm": 4.128929611734161,
"learning_rate": 9.545032675245814e-06,
"loss": 0.3055,
"step": 130
},
{
"epoch": 0.27578947368421053,
"grad_norm": 2.974600343932656,
"learning_rate": 9.538116473662862e-06,
"loss": 0.215,
"step": 131
},
{
"epoch": 0.27789473684210525,
"grad_norm": 2.7669257916596823,
"learning_rate": 9.531150643965224e-06,
"loss": 0.2182,
"step": 132
},
{
"epoch": 0.28,
"grad_norm": 3.97273403473512,
"learning_rate": 9.524135262330098e-06,
"loss": 0.2658,
"step": 133
},
{
"epoch": 0.28210526315789475,
"grad_norm": 3.6696654833766895,
"learning_rate": 9.517070405476575e-06,
"loss": 0.2305,
"step": 134
},
{
"epoch": 0.28421052631578947,
"grad_norm": 2.5127237232679667,
"learning_rate": 9.509956150664796e-06,
"loss": 0.1511,
"step": 135
},
{
"epoch": 0.2863157894736842,
"grad_norm": 3.371290504994853,
"learning_rate": 9.502792575695112e-06,
"loss": 0.26,
"step": 136
},
{
"epoch": 0.28842105263157897,
"grad_norm": 3.0689625597079684,
"learning_rate": 9.495579758907231e-06,
"loss": 0.2524,
"step": 137
},
{
"epoch": 0.2905263157894737,
"grad_norm": 3.183088939033141,
"learning_rate": 9.48831777917936e-06,
"loss": 0.2122,
"step": 138
},
{
"epoch": 0.2926315789473684,
"grad_norm": 3.8034642187376035,
"learning_rate": 9.481006715927352e-06,
"loss": 0.2593,
"step": 139
},
{
"epoch": 0.29473684210526313,
"grad_norm": 3.8705009381157343,
"learning_rate": 9.473646649103819e-06,
"loss": 0.2594,
"step": 140
},
{
"epoch": 0.2968421052631579,
"grad_norm": 3.0318361821750286,
"learning_rate": 9.466237659197271e-06,
"loss": 0.2254,
"step": 141
},
{
"epoch": 0.29894736842105263,
"grad_norm": 3.615169294903516,
"learning_rate": 9.458779827231237e-06,
"loss": 0.2096,
"step": 142
},
{
"epoch": 0.30105263157894735,
"grad_norm": 3.2733885578132313,
"learning_rate": 9.451273234763372e-06,
"loss": 0.2431,
"step": 143
},
{
"epoch": 0.3031578947368421,
"grad_norm": 3.2273667507387533,
"learning_rate": 9.443717963884568e-06,
"loss": 0.2228,
"step": 144
},
{
"epoch": 0.30526315789473685,
"grad_norm": 3.632106743266242,
"learning_rate": 9.43611409721806e-06,
"loss": 0.248,
"step": 145
},
{
"epoch": 0.30736842105263157,
"grad_norm": 3.320763426450409,
"learning_rate": 9.428461717918512e-06,
"loss": 0.2878,
"step": 146
},
{
"epoch": 0.3094736842105263,
"grad_norm": 3.549945610201063,
"learning_rate": 9.420760909671119e-06,
"loss": 0.231,
"step": 147
},
{
"epoch": 0.31157894736842107,
"grad_norm": 3.282653593524781,
"learning_rate": 9.413011756690686e-06,
"loss": 0.2659,
"step": 148
},
{
"epoch": 0.3136842105263158,
"grad_norm": 3.261626438744862,
"learning_rate": 9.405214343720708e-06,
"loss": 0.2586,
"step": 149
},
{
"epoch": 0.3157894736842105,
"grad_norm": 2.924567282994091,
"learning_rate": 9.397368756032445e-06,
"loss": 0.1778,
"step": 150
},
{
"epoch": 0.3178947368421053,
"grad_norm": 3.0558746792231464,
"learning_rate": 9.389475079423988e-06,
"loss": 0.2471,
"step": 151
},
{
"epoch": 0.32,
"grad_norm": 3.7586909960856207,
"learning_rate": 9.381533400219319e-06,
"loss": 0.258,
"step": 152
},
{
"epoch": 0.32210526315789473,
"grad_norm": 3.392179309145632,
"learning_rate": 9.373543805267367e-06,
"loss": 0.249,
"step": 153
},
{
"epoch": 0.32421052631578945,
"grad_norm": 4.083035200586394,
"learning_rate": 9.365506381941066e-06,
"loss": 0.2789,
"step": 154
},
{
"epoch": 0.3263157894736842,
"grad_norm": 3.19777370952952,
"learning_rate": 9.357421218136387e-06,
"loss": 0.2062,
"step": 155
},
{
"epoch": 0.32842105263157895,
"grad_norm": 3.4456582477689928,
"learning_rate": 9.349288402271387e-06,
"loss": 0.2382,
"step": 156
},
{
"epoch": 0.33052631578947367,
"grad_norm": 2.765215943542346,
"learning_rate": 9.341108023285239e-06,
"loss": 0.1827,
"step": 157
},
{
"epoch": 0.33263157894736844,
"grad_norm": 2.883818519531558,
"learning_rate": 9.332880170637252e-06,
"loss": 0.1995,
"step": 158
},
{
"epoch": 0.33473684210526317,
"grad_norm": 3.6487183139434234,
"learning_rate": 9.324604934305911e-06,
"loss": 0.2598,
"step": 159
},
{
"epoch": 0.3368421052631579,
"grad_norm": 3.9308083666697344,
"learning_rate": 9.31628240478787e-06,
"loss": 0.2412,
"step": 160
},
{
"epoch": 0.3389473684210526,
"grad_norm": 3.5970617830856773,
"learning_rate": 9.30791267309698e-06,
"loss": 0.2851,
"step": 161
},
{
"epoch": 0.3410526315789474,
"grad_norm": 3.467839501820664,
"learning_rate": 9.299495830763285e-06,
"loss": 0.2853,
"step": 162
},
{
"epoch": 0.3431578947368421,
"grad_norm": 3.602755193669457,
"learning_rate": 9.291031969832026e-06,
"loss": 0.2225,
"step": 163
},
{
"epoch": 0.3452631578947368,
"grad_norm": 3.0886925699452985,
"learning_rate": 9.28252118286263e-06,
"loss": 0.1903,
"step": 164
},
{
"epoch": 0.3473684210526316,
"grad_norm": 3.439033801554011,
"learning_rate": 9.273963562927695e-06,
"loss": 0.2287,
"step": 165
},
{
"epoch": 0.3494736842105263,
"grad_norm": 3.3882150165690783,
"learning_rate": 9.265359203611988e-06,
"loss": 0.2904,
"step": 166
},
{
"epoch": 0.35157894736842105,
"grad_norm": 3.3452062359089507,
"learning_rate": 9.256708199011402e-06,
"loss": 0.2339,
"step": 167
},
{
"epoch": 0.35368421052631577,
"grad_norm": 3.700729392048823,
"learning_rate": 9.248010643731936e-06,
"loss": 0.2796,
"step": 168
},
{
"epoch": 0.35578947368421054,
"grad_norm": 3.1210284485776874,
"learning_rate": 9.23926663288866e-06,
"loss": 0.2126,
"step": 169
},
{
"epoch": 0.35789473684210527,
"grad_norm": 3.6574312237344992,
"learning_rate": 9.230476262104678e-06,
"loss": 0.2493,
"step": 170
},
{
"epoch": 0.36,
"grad_norm": 3.679409049213219,
"learning_rate": 9.221639627510076e-06,
"loss": 0.2551,
"step": 171
},
{
"epoch": 0.36210526315789476,
"grad_norm": 3.3167727556758693,
"learning_rate": 9.212756825740874e-06,
"loss": 0.2482,
"step": 172
},
{
"epoch": 0.3642105263157895,
"grad_norm": 2.401115724431016,
"learning_rate": 9.203827953937969e-06,
"loss": 0.1881,
"step": 173
},
{
"epoch": 0.3663157894736842,
"grad_norm": 3.4427049239845533,
"learning_rate": 9.194853109746073e-06,
"loss": 0.2088,
"step": 174
},
{
"epoch": 0.3684210526315789,
"grad_norm": 3.2573463355993826,
"learning_rate": 9.185832391312644e-06,
"loss": 0.2495,
"step": 175
},
{
"epoch": 0.3705263157894737,
"grad_norm": 3.2797367783671234,
"learning_rate": 9.176765897286812e-06,
"loss": 0.2435,
"step": 176
},
{
"epoch": 0.3726315789473684,
"grad_norm": 3.6588831382550806,
"learning_rate": 9.167653726818305e-06,
"loss": 0.2293,
"step": 177
},
{
"epoch": 0.37473684210526315,
"grad_norm": 2.7755595814174363,
"learning_rate": 9.15849597955636e-06,
"loss": 0.2044,
"step": 178
},
{
"epoch": 0.37684210526315787,
"grad_norm": 3.5962245807858255,
"learning_rate": 9.149292755648631e-06,
"loss": 0.2214,
"step": 179
},
{
"epoch": 0.37894736842105264,
"grad_norm": 3.4357076470231305,
"learning_rate": 9.140044155740102e-06,
"loss": 0.2402,
"step": 180
},
{
"epoch": 0.38105263157894737,
"grad_norm": 3.4123688743753853,
"learning_rate": 9.130750280971978e-06,
"loss": 0.2553,
"step": 181
},
{
"epoch": 0.3831578947368421,
"grad_norm": 2.741940954433975,
"learning_rate": 9.121411232980589e-06,
"loss": 0.1907,
"step": 182
},
{
"epoch": 0.38526315789473686,
"grad_norm": 3.6400973883721384,
"learning_rate": 9.112027113896262e-06,
"loss": 0.2616,
"step": 183
},
{
"epoch": 0.3873684210526316,
"grad_norm": 3.5161597871058277,
"learning_rate": 9.102598026342223e-06,
"loss": 0.2114,
"step": 184
},
{
"epoch": 0.3894736842105263,
"grad_norm": 3.5917090120879904,
"learning_rate": 9.093124073433464e-06,
"loss": 0.2304,
"step": 185
},
{
"epoch": 0.391578947368421,
"grad_norm": 3.1502787194480897,
"learning_rate": 9.083605358775612e-06,
"loss": 0.2032,
"step": 186
},
{
"epoch": 0.3936842105263158,
"grad_norm": 3.9729218899091063,
"learning_rate": 9.074041986463808e-06,
"loss": 0.2325,
"step": 187
},
{
"epoch": 0.3957894736842105,
"grad_norm": 3.0185052960999523,
"learning_rate": 9.064434061081562e-06,
"loss": 0.1981,
"step": 188
},
{
"epoch": 0.39789473684210525,
"grad_norm": 4.209209496600263,
"learning_rate": 9.0547816876996e-06,
"loss": 0.2586,
"step": 189
},
{
"epoch": 0.4,
"grad_norm": 3.967209050356877,
"learning_rate": 9.045084971874738e-06,
"loss": 0.2946,
"step": 190
},
{
"epoch": 0.40210526315789474,
"grad_norm": 3.017106050384066,
"learning_rate": 9.035344019648701e-06,
"loss": 0.2465,
"step": 191
},
{
"epoch": 0.40421052631578946,
"grad_norm": 3.917356737129771,
"learning_rate": 9.025558937546987e-06,
"loss": 0.3207,
"step": 192
},
{
"epoch": 0.4063157894736842,
"grad_norm": 3.2403291834347767,
"learning_rate": 9.015729832577681e-06,
"loss": 0.233,
"step": 193
},
{
"epoch": 0.40842105263157896,
"grad_norm": 3.322798306669591,
"learning_rate": 9.005856812230304e-06,
"loss": 0.1899,
"step": 194
},
{
"epoch": 0.4105263157894737,
"grad_norm": 3.430365209049047,
"learning_rate": 8.995939984474624e-06,
"loss": 0.2304,
"step": 195
},
{
"epoch": 0.4126315789473684,
"grad_norm": 3.39458107073051,
"learning_rate": 8.98597945775948e-06,
"loss": 0.2357,
"step": 196
},
{
"epoch": 0.4147368421052632,
"grad_norm": 3.3413958584715475,
"learning_rate": 8.975975341011595e-06,
"loss": 0.2855,
"step": 197
},
{
"epoch": 0.4168421052631579,
"grad_norm": 2.5726994940315415,
"learning_rate": 8.96592774363439e-06,
"loss": 0.1901,
"step": 198
},
{
"epoch": 0.4189473684210526,
"grad_norm": 4.335519110486464,
"learning_rate": 8.955836775506776e-06,
"loss": 0.2933,
"step": 199
},
{
"epoch": 0.42105263157894735,
"grad_norm": 4.19815390177116,
"learning_rate": 8.94570254698197e-06,
"loss": 0.2688,
"step": 200
},
{
"epoch": 0.42105263157894735,
"eval_loss": 0.2162427455186844,
"eval_runtime": 0.9508,
"eval_samples_per_second": 41.017,
"eval_steps_per_second": 10.517,
"step": 200
},
{
"epoch": 0.4231578947368421,
"grad_norm": 3.269776118271859,
"learning_rate": 8.935525168886263e-06,
"loss": 0.2096,
"step": 201
},
{
"epoch": 0.42526315789473684,
"grad_norm": 4.04123612262294,
"learning_rate": 8.92530475251784e-06,
"loss": 0.2568,
"step": 202
},
{
"epoch": 0.42736842105263156,
"grad_norm": 3.8368271309479933,
"learning_rate": 8.91504140964553e-06,
"loss": 0.2657,
"step": 203
},
{
"epoch": 0.42947368421052634,
"grad_norm": 3.1272371621856037,
"learning_rate": 8.90473525250761e-06,
"loss": 0.2268,
"step": 204
},
{
"epoch": 0.43157894736842106,
"grad_norm": 3.093955290257307,
"learning_rate": 8.894386393810563e-06,
"loss": 0.2042,
"step": 205
},
{
"epoch": 0.4336842105263158,
"grad_norm": 2.6581659343898543,
"learning_rate": 8.883994946727848e-06,
"loss": 0.1746,
"step": 206
},
{
"epoch": 0.4357894736842105,
"grad_norm": 3.955613588917937,
"learning_rate": 8.873561024898668e-06,
"loss": 0.1996,
"step": 207
},
{
"epoch": 0.4378947368421053,
"grad_norm": 2.7835005086015903,
"learning_rate": 8.863084742426719e-06,
"loss": 0.192,
"step": 208
},
{
"epoch": 0.44,
"grad_norm": 3.347640148688381,
"learning_rate": 8.852566213878947e-06,
"loss": 0.1955,
"step": 209
},
{
"epoch": 0.4421052631578947,
"grad_norm": 3.6781001625254643,
"learning_rate": 8.842005554284296e-06,
"loss": 0.2583,
"step": 210
},
{
"epoch": 0.4442105263157895,
"grad_norm": 3.3060488425103416,
"learning_rate": 8.831402879132447e-06,
"loss": 0.2273,
"step": 211
},
{
"epoch": 0.4463157894736842,
"grad_norm": 3.924014440413263,
"learning_rate": 8.820758304372557e-06,
"loss": 0.2294,
"step": 212
},
{
"epoch": 0.44842105263157894,
"grad_norm": 3.7994401024720066,
"learning_rate": 8.810071946411989e-06,
"loss": 0.2199,
"step": 213
},
{
"epoch": 0.45052631578947366,
"grad_norm": 3.376294637610717,
"learning_rate": 8.799343922115045e-06,
"loss": 0.2433,
"step": 214
},
{
"epoch": 0.45263157894736844,
"grad_norm": 3.3193795798150165,
"learning_rate": 8.788574348801676e-06,
"loss": 0.209,
"step": 215
},
{
"epoch": 0.45473684210526316,
"grad_norm": 3.0915010534262795,
"learning_rate": 8.777763344246209e-06,
"loss": 0.179,
"step": 216
},
{
"epoch": 0.4568421052631579,
"grad_norm": 2.8659181552677375,
"learning_rate": 8.766911026676063e-06,
"loss": 0.1811,
"step": 217
},
{
"epoch": 0.4589473684210526,
"grad_norm": 3.45215463473198,
"learning_rate": 8.756017514770444e-06,
"loss": 0.2281,
"step": 218
},
{
"epoch": 0.4610526315789474,
"grad_norm": 3.1257499399451394,
"learning_rate": 8.745082927659048e-06,
"loss": 0.2184,
"step": 219
},
{
"epoch": 0.4631578947368421,
"grad_norm": 3.8271139734522945,
"learning_rate": 8.734107384920771e-06,
"loss": 0.2623,
"step": 220
},
{
"epoch": 0.4652631578947368,
"grad_norm": 2.835561102259285,
"learning_rate": 8.72309100658239e-06,
"loss": 0.1964,
"step": 221
},
{
"epoch": 0.4673684210526316,
"grad_norm": 3.3688712713428766,
"learning_rate": 8.71203391311725e-06,
"loss": 0.2168,
"step": 222
},
{
"epoch": 0.4694736842105263,
"grad_norm": 3.7240976383868736,
"learning_rate": 8.700936225443958e-06,
"loss": 0.2518,
"step": 223
},
{
"epoch": 0.47157894736842104,
"grad_norm": 2.96476521824005,
"learning_rate": 8.689798064925049e-06,
"loss": 0.2378,
"step": 224
},
{
"epoch": 0.47368421052631576,
"grad_norm": 2.7984591391533953,
"learning_rate": 8.67861955336566e-06,
"loss": 0.2252,
"step": 225
},
{
"epoch": 0.47578947368421054,
"grad_norm": 2.7976795282629254,
"learning_rate": 8.6674008130122e-06,
"loss": 0.1755,
"step": 226
},
{
"epoch": 0.47789473684210526,
"grad_norm": 3.33023467809358,
"learning_rate": 8.65614196655102e-06,
"loss": 0.2361,
"step": 227
},
{
"epoch": 0.48,
"grad_norm": 2.966759381413828,
"learning_rate": 8.644843137107058e-06,
"loss": 0.2027,
"step": 228
},
{
"epoch": 0.48210526315789476,
"grad_norm": 3.1104223364393535,
"learning_rate": 8.633504448242504e-06,
"loss": 0.1961,
"step": 229
},
{
"epoch": 0.4842105263157895,
"grad_norm": 2.787274197616676,
"learning_rate": 8.622126023955446e-06,
"loss": 0.2031,
"step": 230
},
{
"epoch": 0.4863157894736842,
"grad_norm": 3.3738049865267925,
"learning_rate": 8.610707988678504e-06,
"loss": 0.2533,
"step": 231
},
{
"epoch": 0.4884210526315789,
"grad_norm": 3.407815533241093,
"learning_rate": 8.599250467277483e-06,
"loss": 0.2524,
"step": 232
},
{
"epoch": 0.4905263157894737,
"grad_norm": 3.296831884586839,
"learning_rate": 8.587753585050004e-06,
"loss": 0.2396,
"step": 233
},
{
"epoch": 0.4926315789473684,
"grad_norm": 2.8560599820160073,
"learning_rate": 8.576217467724129e-06,
"loss": 0.2416,
"step": 234
},
{
"epoch": 0.49473684210526314,
"grad_norm": 2.9054696528766524,
"learning_rate": 8.564642241456986e-06,
"loss": 0.1973,
"step": 235
},
{
"epoch": 0.4968421052631579,
"grad_norm": 2.8181421804733358,
"learning_rate": 8.553028032833397e-06,
"loss": 0.179,
"step": 236
},
{
"epoch": 0.49894736842105264,
"grad_norm": 2.7050097156036284,
"learning_rate": 8.541374968864486e-06,
"loss": 0.2037,
"step": 237
},
{
"epoch": 0.5010526315789474,
"grad_norm": 2.585908271011497,
"learning_rate": 8.529683176986295e-06,
"loss": 0.1633,
"step": 238
},
{
"epoch": 0.5031578947368421,
"grad_norm": 3.6063087447245414,
"learning_rate": 8.517952785058385e-06,
"loss": 0.2354,
"step": 239
},
{
"epoch": 0.5052631578947369,
"grad_norm": 2.8004827647319073,
"learning_rate": 8.506183921362443e-06,
"loss": 0.1783,
"step": 240
},
{
"epoch": 0.5073684210526316,
"grad_norm": 3.0924391138448777,
"learning_rate": 8.494376714600878e-06,
"loss": 0.2086,
"step": 241
},
{
"epoch": 0.5094736842105263,
"grad_norm": 3.28651564075383,
"learning_rate": 8.482531293895412e-06,
"loss": 0.2345,
"step": 242
},
{
"epoch": 0.511578947368421,
"grad_norm": 3.2830296016413056,
"learning_rate": 8.470647788785665e-06,
"loss": 0.2149,
"step": 243
},
{
"epoch": 0.5136842105263157,
"grad_norm": 3.546287405553885,
"learning_rate": 8.458726329227748e-06,
"loss": 0.2261,
"step": 244
},
{
"epoch": 0.5157894736842106,
"grad_norm": 3.394923024937159,
"learning_rate": 8.446767045592829e-06,
"loss": 0.2468,
"step": 245
},
{
"epoch": 0.5178947368421053,
"grad_norm": 3.864701196963864,
"learning_rate": 8.434770068665723e-06,
"loss": 0.2638,
"step": 246
},
{
"epoch": 0.52,
"grad_norm": 3.4189011314403976,
"learning_rate": 8.422735529643445e-06,
"loss": 0.2219,
"step": 247
},
{
"epoch": 0.5221052631578947,
"grad_norm": 3.4940583139796497,
"learning_rate": 8.410663560133784e-06,
"loss": 0.2055,
"step": 248
},
{
"epoch": 0.5242105263157895,
"grad_norm": 2.9563885540382717,
"learning_rate": 8.398554292153866e-06,
"loss": 0.2063,
"step": 249
},
{
"epoch": 0.5263157894736842,
"grad_norm": 3.856575711945962,
"learning_rate": 8.386407858128707e-06,
"loss": 0.2493,
"step": 250
},
{
"epoch": 0.5284210526315789,
"grad_norm": 2.963714344149301,
"learning_rate": 8.37422439088976e-06,
"loss": 0.2173,
"step": 251
},
{
"epoch": 0.5305263157894737,
"grad_norm": 3.5084770315497953,
"learning_rate": 8.362004023673473e-06,
"loss": 0.2637,
"step": 252
},
{
"epoch": 0.5326315789473685,
"grad_norm": 3.2627548109310545,
"learning_rate": 8.349746890119826e-06,
"loss": 0.2059,
"step": 253
},
{
"epoch": 0.5347368421052632,
"grad_norm": 3.537857944594144,
"learning_rate": 8.337453124270864e-06,
"loss": 0.2064,
"step": 254
},
{
"epoch": 0.5368421052631579,
"grad_norm": 3.203619307633033,
"learning_rate": 8.325122860569241e-06,
"loss": 0.1859,
"step": 255
},
{
"epoch": 0.5389473684210526,
"grad_norm": 2.8427156228829946,
"learning_rate": 8.31275623385675e-06,
"loss": 0.1781,
"step": 256
},
{
"epoch": 0.5410526315789473,
"grad_norm": 3.4548444256099495,
"learning_rate": 8.300353379372834e-06,
"loss": 0.2253,
"step": 257
},
{
"epoch": 0.5431578947368421,
"grad_norm": 3.316389585769609,
"learning_rate": 8.287914432753123e-06,
"loss": 0.2496,
"step": 258
},
{
"epoch": 0.5452631578947369,
"grad_norm": 3.925056071030507,
"learning_rate": 8.275439530027948e-06,
"loss": 0.2259,
"step": 259
},
{
"epoch": 0.5473684210526316,
"grad_norm": 3.992456726752316,
"learning_rate": 8.262928807620843e-06,
"loss": 0.2566,
"step": 260
},
{
"epoch": 0.5494736842105263,
"grad_norm": 3.432001698331824,
"learning_rate": 8.250382402347066e-06,
"loss": 0.2084,
"step": 261
},
{
"epoch": 0.5515789473684211,
"grad_norm": 3.4259679677663843,
"learning_rate": 8.237800451412095e-06,
"loss": 0.2381,
"step": 262
},
{
"epoch": 0.5536842105263158,
"grad_norm": 3.1299226563183193,
"learning_rate": 8.225183092410128e-06,
"loss": 0.2374,
"step": 263
},
{
"epoch": 0.5557894736842105,
"grad_norm": 3.2234103937622924,
"learning_rate": 8.212530463322584e-06,
"loss": 0.2192,
"step": 264
},
{
"epoch": 0.5578947368421052,
"grad_norm": 3.840611086800957,
"learning_rate": 8.199842702516584e-06,
"loss": 0.2349,
"step": 265
},
{
"epoch": 0.56,
"grad_norm": 3.090365309566825,
"learning_rate": 8.18711994874345e-06,
"loss": 0.2441,
"step": 266
},
{
"epoch": 0.5621052631578948,
"grad_norm": 3.5041886865116783,
"learning_rate": 8.174362341137177e-06,
"loss": 0.2659,
"step": 267
},
{
"epoch": 0.5642105263157895,
"grad_norm": 3.0931593729585516,
"learning_rate": 8.161570019212921e-06,
"loss": 0.2308,
"step": 268
},
{
"epoch": 0.5663157894736842,
"grad_norm": 3.6356498976901332,
"learning_rate": 8.148743122865463e-06,
"loss": 0.2534,
"step": 269
},
{
"epoch": 0.5684210526315789,
"grad_norm": 3.408126383096958,
"learning_rate": 8.135881792367686e-06,
"loss": 0.2321,
"step": 270
},
{
"epoch": 0.5705263157894737,
"grad_norm": 2.6458628263496284,
"learning_rate": 8.12298616836904e-06,
"loss": 0.1978,
"step": 271
},
{
"epoch": 0.5726315789473684,
"grad_norm": 3.1483733395983595,
"learning_rate": 8.110056391894005e-06,
"loss": 0.2172,
"step": 272
},
{
"epoch": 0.5747368421052632,
"grad_norm": 3.467397710095167,
"learning_rate": 8.097092604340543e-06,
"loss": 0.2394,
"step": 273
},
{
"epoch": 0.5768421052631579,
"grad_norm": 3.8996216518849454,
"learning_rate": 8.084094947478556e-06,
"loss": 0.2731,
"step": 274
},
{
"epoch": 0.5789473684210527,
"grad_norm": 3.0037248186783936,
"learning_rate": 8.071063563448341e-06,
"loss": 0.1767,
"step": 275
},
{
"epoch": 0.5810526315789474,
"grad_norm": 2.5277085823211864,
"learning_rate": 8.057998594759022e-06,
"loss": 0.1814,
"step": 276
},
{
"epoch": 0.5831578947368421,
"grad_norm": 3.3543130599108255,
"learning_rate": 8.044900184287007e-06,
"loss": 0.2266,
"step": 277
},
{
"epoch": 0.5852631578947368,
"grad_norm": 3.1857375439158266,
"learning_rate": 8.031768475274412e-06,
"loss": 0.2343,
"step": 278
},
{
"epoch": 0.5873684210526315,
"grad_norm": 3.055157108563214,
"learning_rate": 8.018603611327505e-06,
"loss": 0.227,
"step": 279
},
{
"epoch": 0.5894736842105263,
"grad_norm": 3.2243095637150927,
"learning_rate": 8.005405736415127e-06,
"loss": 0.1937,
"step": 280
},
{
"epoch": 0.5915789473684211,
"grad_norm": 3.250488849370332,
"learning_rate": 7.992174994867124e-06,
"loss": 0.2374,
"step": 281
},
{
"epoch": 0.5936842105263158,
"grad_norm": 3.0167916103746912,
"learning_rate": 7.978911531372764e-06,
"loss": 0.225,
"step": 282
},
{
"epoch": 0.5957894736842105,
"grad_norm": 3.2651532548799374,
"learning_rate": 7.965615490979165e-06,
"loss": 0.2337,
"step": 283
},
{
"epoch": 0.5978947368421053,
"grad_norm": 3.896346456849055,
"learning_rate": 7.952287019089686e-06,
"loss": 0.2748,
"step": 284
},
{
"epoch": 0.6,
"grad_norm": 3.5822792888425803,
"learning_rate": 7.938926261462366e-06,
"loss": 0.211,
"step": 285
},
{
"epoch": 0.6021052631578947,
"grad_norm": 3.444306149909226,
"learning_rate": 7.925533364208308e-06,
"loss": 0.1983,
"step": 286
},
{
"epoch": 0.6042105263157894,
"grad_norm": 4.1948069859545445,
"learning_rate": 7.912108473790092e-06,
"loss": 0.2328,
"step": 287
},
{
"epoch": 0.6063157894736843,
"grad_norm": 3.4747320472234517,
"learning_rate": 7.898651737020166e-06,
"loss": 0.265,
"step": 288
},
{
"epoch": 0.608421052631579,
"grad_norm": 3.240236939628344,
"learning_rate": 7.885163301059251e-06,
"loss": 0.2105,
"step": 289
},
{
"epoch": 0.6105263157894737,
"grad_norm": 3.721836217869373,
"learning_rate": 7.871643313414718e-06,
"loss": 0.2183,
"step": 290
},
{
"epoch": 0.6126315789473684,
"grad_norm": 3.326881302452429,
"learning_rate": 7.858091921938989e-06,
"loss": 0.2394,
"step": 291
},
{
"epoch": 0.6147368421052631,
"grad_norm": 4.006855011965986,
"learning_rate": 7.844509274827907e-06,
"loss": 0.2294,
"step": 292
},
{
"epoch": 0.6168421052631579,
"grad_norm": 2.977288794276405,
"learning_rate": 7.830895520619129e-06,
"loss": 0.1943,
"step": 293
},
{
"epoch": 0.6189473684210526,
"grad_norm": 3.503869431295621,
"learning_rate": 7.817250808190483e-06,
"loss": 0.2271,
"step": 294
},
{
"epoch": 0.6210526315789474,
"grad_norm": 2.397881273267794,
"learning_rate": 7.803575286758365e-06,
"loss": 0.1522,
"step": 295
},
{
"epoch": 0.6231578947368421,
"grad_norm": 3.1498677204648855,
"learning_rate": 7.789869105876083e-06,
"loss": 0.2223,
"step": 296
},
{
"epoch": 0.6252631578947369,
"grad_norm": 3.532048573053879,
"learning_rate": 7.776132415432234e-06,
"loss": 0.2548,
"step": 297
},
{
"epoch": 0.6273684210526316,
"grad_norm": 2.9494325963626777,
"learning_rate": 7.762365365649068e-06,
"loss": 0.2047,
"step": 298
},
{
"epoch": 0.6294736842105263,
"grad_norm": 3.1322331545957707,
"learning_rate": 7.748568107080831e-06,
"loss": 0.2239,
"step": 299
},
{
"epoch": 0.631578947368421,
"grad_norm": 2.996031382032748,
"learning_rate": 7.734740790612137e-06,
"loss": 0.177,
"step": 300
},
{
"epoch": 0.6336842105263157,
"grad_norm": 3.6318074014394135,
"learning_rate": 7.720883567456299e-06,
"loss": 0.2797,
"step": 301
},
{
"epoch": 0.6357894736842106,
"grad_norm": 3.5126271433689817,
"learning_rate": 7.70699658915369e-06,
"loss": 0.2965,
"step": 302
},
{
"epoch": 0.6378947368421053,
"grad_norm": 3.067374146183351,
"learning_rate": 7.693080007570084e-06,
"loss": 0.2311,
"step": 303
},
{
"epoch": 0.64,
"grad_norm": 2.8467013786071735,
"learning_rate": 7.679133974894984e-06,
"loss": 0.1952,
"step": 304
},
{
"epoch": 0.6421052631578947,
"grad_norm": 3.298916474796445,
"learning_rate": 7.66515864363997e-06,
"loss": 0.2233,
"step": 305
},
{
"epoch": 0.6442105263157895,
"grad_norm": 4.447954496664178,
"learning_rate": 7.651154166637025e-06,
"loss": 0.3085,
"step": 306
},
{
"epoch": 0.6463157894736842,
"grad_norm": 3.0739296320424736,
"learning_rate": 7.637120697036866e-06,
"loss": 0.1874,
"step": 307
},
{
"epoch": 0.6484210526315789,
"grad_norm": 2.672772402397274,
"learning_rate": 7.62305838830727e-06,
"loss": 0.2168,
"step": 308
},
{
"epoch": 0.6505263157894737,
"grad_norm": 3.5823577010326844,
"learning_rate": 7.608967394231387e-06,
"loss": 0.2523,
"step": 309
},
{
"epoch": 0.6526315789473685,
"grad_norm": 3.363408010518267,
"learning_rate": 7.594847868906076e-06,
"loss": 0.213,
"step": 310
},
{
"epoch": 0.6547368421052632,
"grad_norm": 3.0932376636426238,
"learning_rate": 7.580699966740201e-06,
"loss": 0.2267,
"step": 311
},
{
"epoch": 0.6568421052631579,
"grad_norm": 3.483318561632507,
"learning_rate": 7.566523842452958e-06,
"loss": 0.256,
"step": 312
},
{
"epoch": 0.6589473684210526,
"grad_norm": 2.7912893670301484,
"learning_rate": 7.552319651072164e-06,
"loss": 0.2106,
"step": 313
},
{
"epoch": 0.6610526315789473,
"grad_norm": 3.4981450541010704,
"learning_rate": 7.5380875479325855e-06,
"loss": 0.2547,
"step": 314
},
{
"epoch": 0.6631578947368421,
"grad_norm": 3.124883447115098,
"learning_rate": 7.52382768867422e-06,
"loss": 0.1939,
"step": 315
},
{
"epoch": 0.6652631578947369,
"grad_norm": 4.620680045339017,
"learning_rate": 7.509540229240601e-06,
"loss": 0.2953,
"step": 316
},
{
"epoch": 0.6673684210526316,
"grad_norm": 3.2282886161755786,
"learning_rate": 7.4952253258771036e-06,
"loss": 0.2112,
"step": 317
},
{
"epoch": 0.6694736842105263,
"grad_norm": 3.047727830370946,
"learning_rate": 7.480883135129211e-06,
"loss": 0.2086,
"step": 318
},
{
"epoch": 0.671578947368421,
"grad_norm": 2.584859580905444,
"learning_rate": 7.4665138138408255e-06,
"loss": 0.2119,
"step": 319
},
{
"epoch": 0.6736842105263158,
"grad_norm": 3.316066265356493,
"learning_rate": 7.452117519152542e-06,
"loss": 0.2489,
"step": 320
},
{
"epoch": 0.6757894736842105,
"grad_norm": 3.2406113992536136,
"learning_rate": 7.437694408499932e-06,
"loss": 0.1915,
"step": 321
},
{
"epoch": 0.6778947368421052,
"grad_norm": 2.956072384698419,
"learning_rate": 7.4232446396118265e-06,
"loss": 0.2141,
"step": 322
},
{
"epoch": 0.68,
"grad_norm": 2.911407487056924,
"learning_rate": 7.408768370508577e-06,
"loss": 0.2149,
"step": 323
},
{
"epoch": 0.6821052631578948,
"grad_norm": 2.8116902443594016,
"learning_rate": 7.394265759500348e-06,
"loss": 0.1691,
"step": 324
},
{
"epoch": 0.6842105263157895,
"grad_norm": 3.276193445347204,
"learning_rate": 7.379736965185369e-06,
"loss": 0.2003,
"step": 325
},
{
"epoch": 0.6863157894736842,
"grad_norm": 2.980429816982403,
"learning_rate": 7.365182146448205e-06,
"loss": 0.2071,
"step": 326
},
{
"epoch": 0.6884210526315789,
"grad_norm": 3.168944857843924,
"learning_rate": 7.350601462458025e-06,
"loss": 0.2249,
"step": 327
},
{
"epoch": 0.6905263157894737,
"grad_norm": 3.2312005808906608,
"learning_rate": 7.335995072666848e-06,
"loss": 0.1985,
"step": 328
},
{
"epoch": 0.6926315789473684,
"grad_norm": 3.0522979756884236,
"learning_rate": 7.3213631368078196e-06,
"loss": 0.2025,
"step": 329
},
{
"epoch": 0.6947368421052632,
"grad_norm": 2.787658703366056,
"learning_rate": 7.30670581489344e-06,
"loss": 0.1983,
"step": 330
},
{
"epoch": 0.6968421052631579,
"grad_norm": 4.3667882707177625,
"learning_rate": 7.292023267213836e-06,
"loss": 0.2243,
"step": 331
},
{
"epoch": 0.6989473684210527,
"grad_norm": 5.1674527899722085,
"learning_rate": 7.2773156543349965e-06,
"loss": 0.2317,
"step": 332
},
{
"epoch": 0.7010526315789474,
"grad_norm": 2.7521986848960216,
"learning_rate": 7.262583137097019e-06,
"loss": 0.1964,
"step": 333
},
{
"epoch": 0.7031578947368421,
"grad_norm": 2.8301069192286445,
"learning_rate": 7.247825876612353e-06,
"loss": 0.2043,
"step": 334
},
{
"epoch": 0.7052631578947368,
"grad_norm": 3.770631339460926,
"learning_rate": 7.233044034264034e-06,
"loss": 0.1965,
"step": 335
},
{
"epoch": 0.7073684210526315,
"grad_norm": 2.8548456329448872,
"learning_rate": 7.218237771703921e-06,
"loss": 0.1819,
"step": 336
},
{
"epoch": 0.7094736842105264,
"grad_norm": 3.6843919985708173,
"learning_rate": 7.203407250850929e-06,
"loss": 0.2101,
"step": 337
},
{
"epoch": 0.7115789473684211,
"grad_norm": 2.481860597568968,
"learning_rate": 7.18855263388926e-06,
"loss": 0.1619,
"step": 338
},
{
"epoch": 0.7136842105263158,
"grad_norm": 2.8454463712055653,
"learning_rate": 7.173674083266624e-06,
"loss": 0.1548,
"step": 339
},
{
"epoch": 0.7157894736842105,
"grad_norm": 3.1220177562190297,
"learning_rate": 7.158771761692464e-06,
"loss": 0.1873,
"step": 340
},
{
"epoch": 0.7178947368421053,
"grad_norm": 3.1026746108893204,
"learning_rate": 7.143845832136188e-06,
"loss": 0.1708,
"step": 341
},
{
"epoch": 0.72,
"grad_norm": 3.613177488828585,
"learning_rate": 7.128896457825364e-06,
"loss": 0.2051,
"step": 342
},
{
"epoch": 0.7221052631578947,
"grad_norm": 4.023734813281506,
"learning_rate": 7.113923802243957e-06,
"loss": 0.2371,
"step": 343
},
{
"epoch": 0.7242105263157895,
"grad_norm": 2.4891706091722283,
"learning_rate": 7.098928029130529e-06,
"loss": 0.1585,
"step": 344
},
{
"epoch": 0.7263157894736842,
"grad_norm": 3.625956257810872,
"learning_rate": 7.083909302476453e-06,
"loss": 0.2379,
"step": 345
},
{
"epoch": 0.728421052631579,
"grad_norm": 3.409493884604401,
"learning_rate": 7.068867786524116e-06,
"loss": 0.1783,
"step": 346
},
{
"epoch": 0.7305263157894737,
"grad_norm": 3.0090022256319866,
"learning_rate": 7.053803645765128e-06,
"loss": 0.1831,
"step": 347
},
{
"epoch": 0.7326315789473684,
"grad_norm": 3.5360587589584127,
"learning_rate": 7.038717044938519e-06,
"loss": 0.2413,
"step": 348
},
{
"epoch": 0.7347368421052631,
"grad_norm": 3.4382217950294236,
"learning_rate": 7.023608149028936e-06,
"loss": 0.2155,
"step": 349
},
{
"epoch": 0.7368421052631579,
"grad_norm": 4.004045022458863,
"learning_rate": 7.008477123264849e-06,
"loss": 0.2836,
"step": 350
},
{
"epoch": 0.7389473684210527,
"grad_norm": 3.3203295306272196,
"learning_rate": 6.993324133116726e-06,
"loss": 0.2658,
"step": 351
},
{
"epoch": 0.7410526315789474,
"grad_norm": 2.548964384681694,
"learning_rate": 6.978149344295242e-06,
"loss": 0.1785,
"step": 352
},
{
"epoch": 0.7431578947368421,
"grad_norm": 3.4483832833571912,
"learning_rate": 6.9629529227494575e-06,
"loss": 0.2214,
"step": 353
},
{
"epoch": 0.7452631578947368,
"grad_norm": 3.4383584987113274,
"learning_rate": 6.9477350346650016e-06,
"loss": 0.192,
"step": 354
},
{
"epoch": 0.7473684210526316,
"grad_norm": 3.691018189312247,
"learning_rate": 6.932495846462262e-06,
"loss": 0.2435,
"step": 355
},
{
"epoch": 0.7494736842105263,
"grad_norm": 3.385770493095089,
"learning_rate": 6.9172355247945586e-06,
"loss": 0.205,
"step": 356
},
{
"epoch": 0.751578947368421,
"grad_norm": 2.662810311197674,
"learning_rate": 6.901954236546324e-06,
"loss": 0.1659,
"step": 357
},
{
"epoch": 0.7536842105263157,
"grad_norm": 4.168405645399794,
"learning_rate": 6.88665214883128e-06,
"loss": 0.2934,
"step": 358
},
{
"epoch": 0.7557894736842106,
"grad_norm": 3.5383114057012843,
"learning_rate": 6.871329428990602e-06,
"loss": 0.2157,
"step": 359
},
{
"epoch": 0.7578947368421053,
"grad_norm": 2.8894956368254103,
"learning_rate": 6.855986244591104e-06,
"loss": 0.1912,
"step": 360
},
{
"epoch": 0.76,
"grad_norm": 2.787711973501566,
"learning_rate": 6.840622763423391e-06,
"loss": 0.1706,
"step": 361
},
{
"epoch": 0.7621052631578947,
"grad_norm": 2.72901738571353,
"learning_rate": 6.825239153500029e-06,
"loss": 0.164,
"step": 362
},
{
"epoch": 0.7642105263157895,
"grad_norm": 3.189665352469265,
"learning_rate": 6.809835583053716e-06,
"loss": 0.1764,
"step": 363
},
{
"epoch": 0.7663157894736842,
"grad_norm": 3.1275848607099133,
"learning_rate": 6.794412220535426e-06,
"loss": 0.2197,
"step": 364
},
{
"epoch": 0.7684210526315789,
"grad_norm": 3.5188488634301263,
"learning_rate": 6.778969234612583e-06,
"loss": 0.2439,
"step": 365
},
{
"epoch": 0.7705263157894737,
"grad_norm": 2.62111339980637,
"learning_rate": 6.763506794167207e-06,
"loss": 0.1879,
"step": 366
},
{
"epoch": 0.7726315789473684,
"grad_norm": 2.8407752570746005,
"learning_rate": 6.748025068294067e-06,
"loss": 0.179,
"step": 367
},
{
"epoch": 0.7747368421052632,
"grad_norm": 3.230423148951695,
"learning_rate": 6.732524226298841e-06,
"loss": 0.1906,
"step": 368
},
{
"epoch": 0.7768421052631579,
"grad_norm": 3.9240082867236974,
"learning_rate": 6.717004437696249e-06,
"loss": 0.2593,
"step": 369
},
{
"epoch": 0.7789473684210526,
"grad_norm": 2.949281736906227,
"learning_rate": 6.701465872208216e-06,
"loss": 0.1767,
"step": 370
},
{
"epoch": 0.7810526315789473,
"grad_norm": 3.4699155102688293,
"learning_rate": 6.685908699762003e-06,
"loss": 0.2495,
"step": 371
},
{
"epoch": 0.783157894736842,
"grad_norm": 3.441878628446404,
"learning_rate": 6.670333090488357e-06,
"loss": 0.2499,
"step": 372
},
{
"epoch": 0.7852631578947369,
"grad_norm": 3.1405985518052772,
"learning_rate": 6.654739214719642e-06,
"loss": 0.2127,
"step": 373
},
{
"epoch": 0.7873684210526316,
"grad_norm": 2.593987567673624,
"learning_rate": 6.6391272429879886e-06,
"loss": 0.1835,
"step": 374
},
{
"epoch": 0.7894736842105263,
"grad_norm": 3.276693821618827,
"learning_rate": 6.6234973460234184e-06,
"loss": 0.2027,
"step": 375
},
{
"epoch": 0.791578947368421,
"grad_norm": 2.995174038901829,
"learning_rate": 6.607849694751978e-06,
"loss": 0.2003,
"step": 376
},
{
"epoch": 0.7936842105263158,
"grad_norm": 2.6846031430529567,
"learning_rate": 6.592184460293878e-06,
"loss": 0.1421,
"step": 377
},
{
"epoch": 0.7957894736842105,
"grad_norm": 3.312415514283232,
"learning_rate": 6.576501813961609e-06,
"loss": 0.1863,
"step": 378
},
{
"epoch": 0.7978947368421052,
"grad_norm": 3.775675123728028,
"learning_rate": 6.560801927258081e-06,
"loss": 0.1958,
"step": 379
},
{
"epoch": 0.8,
"grad_norm": 2.5927726340982264,
"learning_rate": 6.545084971874738e-06,
"loss": 0.1625,
"step": 380
},
{
"epoch": 0.8021052631578948,
"grad_norm": 3.1587922841231917,
"learning_rate": 6.529351119689687e-06,
"loss": 0.1965,
"step": 381
},
{
"epoch": 0.8042105263157895,
"grad_norm": 3.1769362735899356,
"learning_rate": 6.513600542765816e-06,
"loss": 0.2057,
"step": 382
},
{
"epoch": 0.8063157894736842,
"grad_norm": 3.808162384466138,
"learning_rate": 6.49783341334891e-06,
"loss": 0.2042,
"step": 383
},
{
"epoch": 0.8084210526315789,
"grad_norm": 3.3063478217630107,
"learning_rate": 6.4820499038657695e-06,
"loss": 0.1916,
"step": 384
},
{
"epoch": 0.8105263157894737,
"grad_norm": 3.043905617430906,
"learning_rate": 6.466250186922325e-06,
"loss": 0.1944,
"step": 385
},
{
"epoch": 0.8126315789473684,
"grad_norm": 4.168593975170044,
"learning_rate": 6.450434435301751e-06,
"loss": 0.2748,
"step": 386
},
{
"epoch": 0.8147368421052632,
"grad_norm": 4.274013610158174,
"learning_rate": 6.434602821962571e-06,
"loss": 0.2494,
"step": 387
},
{
"epoch": 0.8168421052631579,
"grad_norm": 3.5963573539929463,
"learning_rate": 6.418755520036775e-06,
"loss": 0.2013,
"step": 388
},
{
"epoch": 0.8189473684210526,
"grad_norm": 2.9666962047426666,
"learning_rate": 6.402892702827916e-06,
"loss": 0.187,
"step": 389
},
{
"epoch": 0.8210526315789474,
"grad_norm": 2.9643994270594884,
"learning_rate": 6.387014543809224e-06,
"loss": 0.2049,
"step": 390
},
{
"epoch": 0.8231578947368421,
"grad_norm": 2.3657391759758397,
"learning_rate": 6.371121216621698e-06,
"loss": 0.1751,
"step": 391
},
{
"epoch": 0.8252631578947368,
"grad_norm": 3.3529253765458167,
"learning_rate": 6.355212895072223e-06,
"loss": 0.2193,
"step": 392
},
{
"epoch": 0.8273684210526315,
"grad_norm": 3.1720607901606206,
"learning_rate": 6.339289753131649e-06,
"loss": 0.2148,
"step": 393
},
{
"epoch": 0.8294736842105264,
"grad_norm": 3.3584897742031834,
"learning_rate": 6.323351964932909e-06,
"loss": 0.2302,
"step": 394
},
{
"epoch": 0.8315789473684211,
"grad_norm": 4.380475651099131,
"learning_rate": 6.3073997047691e-06,
"loss": 0.2887,
"step": 395
},
{
"epoch": 0.8336842105263158,
"grad_norm": 3.289882212635633,
"learning_rate": 6.291433147091583e-06,
"loss": 0.2106,
"step": 396
},
{
"epoch": 0.8357894736842105,
"grad_norm": 2.9345166529972952,
"learning_rate": 6.275452466508076e-06,
"loss": 0.2063,
"step": 397
},
{
"epoch": 0.8378947368421052,
"grad_norm": 3.6273888701243355,
"learning_rate": 6.259457837780741e-06,
"loss": 0.2245,
"step": 398
},
{
"epoch": 0.84,
"grad_norm": 3.6790816473406847,
"learning_rate": 6.243449435824276e-06,
"loss": 0.193,
"step": 399
},
{
"epoch": 0.8421052631578947,
"grad_norm": 3.1914433056812426,
"learning_rate": 6.227427435703997e-06,
"loss": 0.2164,
"step": 400
},
{
"epoch": 0.8421052631578947,
"eval_loss": 0.19365844130516052,
"eval_runtime": 0.9303,
"eval_samples_per_second": 41.923,
"eval_steps_per_second": 10.749,
"step": 400
},
{
"epoch": 0.8442105263157895,
"grad_norm": 3.0422393517644095,
"learning_rate": 6.211392012633932e-06,
"loss": 0.1945,
"step": 401
},
{
"epoch": 0.8463157894736842,
"grad_norm": 3.3896222895957204,
"learning_rate": 6.1953433419748995e-06,
"loss": 0.2183,
"step": 402
},
{
"epoch": 0.848421052631579,
"grad_norm": 2.8202481621645226,
"learning_rate": 6.179281599232592e-06,
"loss": 0.222,
"step": 403
},
{
"epoch": 0.8505263157894737,
"grad_norm": 2.7904065123537545,
"learning_rate": 6.163206960055652e-06,
"loss": 0.1965,
"step": 404
},
{
"epoch": 0.8526315789473684,
"grad_norm": 3.318994535797195,
"learning_rate": 6.147119600233758e-06,
"loss": 0.2116,
"step": 405
},
{
"epoch": 0.8547368421052631,
"grad_norm": 3.787907520422109,
"learning_rate": 6.131019695695702e-06,
"loss": 0.2441,
"step": 406
},
{
"epoch": 0.8568421052631578,
"grad_norm": 2.6044409986603947,
"learning_rate": 6.114907422507459e-06,
"loss": 0.1696,
"step": 407
},
{
"epoch": 0.8589473684210527,
"grad_norm": 3.1123186046200577,
"learning_rate": 6.098782956870266e-06,
"loss": 0.1714,
"step": 408
},
{
"epoch": 0.8610526315789474,
"grad_norm": 3.5641698976572886,
"learning_rate": 6.0826464751187e-06,
"loss": 0.2129,
"step": 409
},
{
"epoch": 0.8631578947368421,
"grad_norm": 3.4449729307238397,
"learning_rate": 6.066498153718735e-06,
"loss": 0.2059,
"step": 410
},
{
"epoch": 0.8652631578947368,
"grad_norm": 3.091646410008194,
"learning_rate": 6.0503381692658305e-06,
"loss": 0.2244,
"step": 411
},
{
"epoch": 0.8673684210526316,
"grad_norm": 3.426356919246921,
"learning_rate": 6.034166698482984e-06,
"loss": 0.2493,
"step": 412
},
{
"epoch": 0.8694736842105263,
"grad_norm": 3.009157338394937,
"learning_rate": 6.0179839182188125e-06,
"loss": 0.1769,
"step": 413
},
{
"epoch": 0.871578947368421,
"grad_norm": 2.68571377740786,
"learning_rate": 6.001790005445607e-06,
"loss": 0.1801,
"step": 414
},
{
"epoch": 0.8736842105263158,
"grad_norm": 3.13266305671967,
"learning_rate": 5.985585137257401e-06,
"loss": 0.2552,
"step": 415
},
{
"epoch": 0.8757894736842106,
"grad_norm": 3.118129327899299,
"learning_rate": 5.969369490868042e-06,
"loss": 0.2213,
"step": 416
},
{
"epoch": 0.8778947368421053,
"grad_norm": 3.1170850548476428,
"learning_rate": 5.953143243609235e-06,
"loss": 0.2228,
"step": 417
},
{
"epoch": 0.88,
"grad_norm": 3.4825948598222136,
"learning_rate": 5.936906572928625e-06,
"loss": 0.2319,
"step": 418
},
{
"epoch": 0.8821052631578947,
"grad_norm": 3.364021447031936,
"learning_rate": 5.920659656387836e-06,
"loss": 0.1935,
"step": 419
},
{
"epoch": 0.8842105263157894,
"grad_norm": 2.7683123292862497,
"learning_rate": 5.904402671660551e-06,
"loss": 0.1622,
"step": 420
},
{
"epoch": 0.8863157894736842,
"grad_norm": 3.089059939046834,
"learning_rate": 5.8881357965305444e-06,
"loss": 0.1677,
"step": 421
},
{
"epoch": 0.888421052631579,
"grad_norm": 3.1348448785512675,
"learning_rate": 5.871859208889759e-06,
"loss": 0.1814,
"step": 422
},
{
"epoch": 0.8905263157894737,
"grad_norm": 3.230597062554221,
"learning_rate": 5.855573086736351e-06,
"loss": 0.2091,
"step": 423
},
{
"epoch": 0.8926315789473684,
"grad_norm": 2.883110792133594,
"learning_rate": 5.839277608172739e-06,
"loss": 0.1836,
"step": 424
},
{
"epoch": 0.8947368421052632,
"grad_norm": 4.415508413152931,
"learning_rate": 5.82297295140367e-06,
"loss": 0.3021,
"step": 425
},
{
"epoch": 0.8968421052631579,
"grad_norm": 2.953180474766528,
"learning_rate": 5.806659294734256e-06,
"loss": 0.1912,
"step": 426
},
{
"epoch": 0.8989473684210526,
"grad_norm": 2.5058106964907814,
"learning_rate": 5.790336816568033e-06,
"loss": 0.1418,
"step": 427
},
{
"epoch": 0.9010526315789473,
"grad_norm": 2.784908569571114,
"learning_rate": 5.774005695405008e-06,
"loss": 0.1733,
"step": 428
},
{
"epoch": 0.9031578947368422,
"grad_norm": 3.2074914294258643,
"learning_rate": 5.7576661098397024e-06,
"loss": 0.217,
"step": 429
},
{
"epoch": 0.9052631578947369,
"grad_norm": 3.8184949532629955,
"learning_rate": 5.74131823855921e-06,
"loss": 0.1928,
"step": 430
},
{
"epoch": 0.9073684210526316,
"grad_norm": 2.884763048980032,
"learning_rate": 5.72496226034123e-06,
"loss": 0.179,
"step": 431
},
{
"epoch": 0.9094736842105263,
"grad_norm": 3.131007686373488,
"learning_rate": 5.708598354052122e-06,
"loss": 0.2092,
"step": 432
},
{
"epoch": 0.911578947368421,
"grad_norm": 3.600180991489015,
"learning_rate": 5.692226698644938e-06,
"loss": 0.1771,
"step": 433
},
{
"epoch": 0.9136842105263158,
"grad_norm": 2.6092430120715386,
"learning_rate": 5.675847473157485e-06,
"loss": 0.1505,
"step": 434
},
{
"epoch": 0.9157894736842105,
"grad_norm": 3.758561821727175,
"learning_rate": 5.659460856710346e-06,
"loss": 0.2449,
"step": 435
},
{
"epoch": 0.9178947368421052,
"grad_norm": 3.005737007201367,
"learning_rate": 5.643067028504931e-06,
"loss": 0.1706,
"step": 436
},
{
"epoch": 0.92,
"grad_norm": 2.9179364125259557,
"learning_rate": 5.626666167821522e-06,
"loss": 0.1812,
"step": 437
},
{
"epoch": 0.9221052631578948,
"grad_norm": 3.1976728733646738,
"learning_rate": 5.610258454017301e-06,
"loss": 0.2345,
"step": 438
},
{
"epoch": 0.9242105263157895,
"grad_norm": 3.475521355404778,
"learning_rate": 5.593844066524401e-06,
"loss": 0.254,
"step": 439
},
{
"epoch": 0.9263157894736842,
"grad_norm": 3.5995093334761963,
"learning_rate": 5.577423184847932e-06,
"loss": 0.2348,
"step": 440
},
{
"epoch": 0.9284210526315789,
"grad_norm": 2.835624142601258,
"learning_rate": 5.560995988564023e-06,
"loss": 0.1802,
"step": 441
},
{
"epoch": 0.9305263157894736,
"grad_norm": 3.8989119467413613,
"learning_rate": 5.544562657317863e-06,
"loss": 0.2229,
"step": 442
},
{
"epoch": 0.9326315789473684,
"grad_norm": 3.62544713638484,
"learning_rate": 5.52812337082173e-06,
"loss": 0.2153,
"step": 443
},
{
"epoch": 0.9347368421052632,
"grad_norm": 3.392283457067749,
"learning_rate": 5.5116783088530255e-06,
"loss": 0.1824,
"step": 444
},
{
"epoch": 0.9368421052631579,
"grad_norm": 4.303709047671292,
"learning_rate": 5.495227651252315e-06,
"loss": 0.298,
"step": 445
},
{
"epoch": 0.9389473684210526,
"grad_norm": 2.830025115217364,
"learning_rate": 5.478771577921351e-06,
"loss": 0.1657,
"step": 446
},
{
"epoch": 0.9410526315789474,
"grad_norm": 3.2810223083748826,
"learning_rate": 5.4623102688211186e-06,
"loss": 0.2494,
"step": 447
},
{
"epoch": 0.9431578947368421,
"grad_norm": 3.4438213790356444,
"learning_rate": 5.445843903969854e-06,
"loss": 0.2062,
"step": 448
},
{
"epoch": 0.9452631578947368,
"grad_norm": 2.879757240077144,
"learning_rate": 5.429372663441086e-06,
"loss": 0.2002,
"step": 449
},
{
"epoch": 0.9473684210526315,
"grad_norm": 2.8548701745465563,
"learning_rate": 5.412896727361663e-06,
"loss": 0.1942,
"step": 450
},
{
"epoch": 0.9494736842105264,
"grad_norm": 3.3673638986518872,
"learning_rate": 5.396416275909779e-06,
"loss": 0.2442,
"step": 451
},
{
"epoch": 0.9515789473684211,
"grad_norm": 3.151677859424395,
"learning_rate": 5.379931489313016e-06,
"loss": 0.1857,
"step": 452
},
{
"epoch": 0.9536842105263158,
"grad_norm": 2.3401970680752653,
"learning_rate": 5.363442547846356e-06,
"loss": 0.1574,
"step": 453
},
{
"epoch": 0.9557894736842105,
"grad_norm": 3.171440734498741,
"learning_rate": 5.346949631830221e-06,
"loss": 0.1858,
"step": 454
},
{
"epoch": 0.9578947368421052,
"grad_norm": 3.572091487862273,
"learning_rate": 5.3304529216284974e-06,
"loss": 0.233,
"step": 455
},
{
"epoch": 0.96,
"grad_norm": 3.3362097570655704,
"learning_rate": 5.3139525976465675e-06,
"loss": 0.1577,
"step": 456
},
{
"epoch": 0.9621052631578947,
"grad_norm": 3.521394981695169,
"learning_rate": 5.2974488403293285e-06,
"loss": 0.2165,
"step": 457
},
{
"epoch": 0.9642105263157895,
"grad_norm": 3.5537369585027876,
"learning_rate": 5.280941830159228e-06,
"loss": 0.2035,
"step": 458
},
{
"epoch": 0.9663157894736842,
"grad_norm": 2.6967873973758336,
"learning_rate": 5.264431747654284e-06,
"loss": 0.1903,
"step": 459
},
{
"epoch": 0.968421052631579,
"grad_norm": 3.451224252952003,
"learning_rate": 5.247918773366112e-06,
"loss": 0.2189,
"step": 460
},
{
"epoch": 0.9705263157894737,
"grad_norm": 3.3703738535372305,
"learning_rate": 5.231403087877955e-06,
"loss": 0.1658,
"step": 461
},
{
"epoch": 0.9726315789473684,
"grad_norm": 2.850165218926584,
"learning_rate": 5.214884871802703e-06,
"loss": 0.1932,
"step": 462
},
{
"epoch": 0.9747368421052631,
"grad_norm": 3.37619966686572,
"learning_rate": 5.198364305780922e-06,
"loss": 0.1988,
"step": 463
},
{
"epoch": 0.9768421052631578,
"grad_norm": 2.960765636480082,
"learning_rate": 5.1818415704788725e-06,
"loss": 0.1904,
"step": 464
},
{
"epoch": 0.9789473684210527,
"grad_norm": 2.7214682076892354,
"learning_rate": 5.165316846586541e-06,
"loss": 0.2017,
"step": 465
},
{
"epoch": 0.9810526315789474,
"grad_norm": 2.4388957400236624,
"learning_rate": 5.148790314815662e-06,
"loss": 0.1764,
"step": 466
},
{
"epoch": 0.9831578947368421,
"grad_norm": 2.8678613327792184,
"learning_rate": 5.132262155897739e-06,
"loss": 0.1778,
"step": 467
},
{
"epoch": 0.9852631578947368,
"grad_norm": 3.210155912400773,
"learning_rate": 5.11573255058207e-06,
"loss": 0.2211,
"step": 468
},
{
"epoch": 0.9873684210526316,
"grad_norm": 3.5663187219986074,
"learning_rate": 5.099201679633769e-06,
"loss": 0.2235,
"step": 469
},
{
"epoch": 0.9894736842105263,
"grad_norm": 2.956548038927285,
"learning_rate": 5.082669723831793e-06,
"loss": 0.1466,
"step": 470
},
{
"epoch": 0.991578947368421,
"grad_norm": 3.930944163022198,
"learning_rate": 5.066136863966963e-06,
"loss": 0.2018,
"step": 471
},
{
"epoch": 0.9936842105263158,
"grad_norm": 3.031263337005746,
"learning_rate": 5.049603280839982e-06,
"loss": 0.2197,
"step": 472
},
{
"epoch": 0.9957894736842106,
"grad_norm": 3.721525482445003,
"learning_rate": 5.033069155259471e-06,
"loss": 0.2175,
"step": 473
},
{
"epoch": 0.9978947368421053,
"grad_norm": 2.238401391190845,
"learning_rate": 5.016534668039976e-06,
"loss": 0.1057,
"step": 474
},
{
"epoch": 1.0,
"grad_norm": 2.700095513199168,
"learning_rate": 5e-06,
"loss": 0.1825,
"step": 475
},
{
"epoch": 1.0021052631578948,
"grad_norm": 2.102327993497151,
"learning_rate": 4.983465331960025e-06,
"loss": 0.0885,
"step": 476
},
{
"epoch": 1.0042105263157894,
"grad_norm": 2.2164179180128127,
"learning_rate": 4.96693084474053e-06,
"loss": 0.101,
"step": 477
},
{
"epoch": 1.0063157894736843,
"grad_norm": 2.591907392954552,
"learning_rate": 4.950396719160019e-06,
"loss": 0.1016,
"step": 478
},
{
"epoch": 1.0084210526315789,
"grad_norm": 2.3536825700723707,
"learning_rate": 4.93386313603304e-06,
"loss": 0.12,
"step": 479
},
{
"epoch": 1.0105263157894737,
"grad_norm": 2.7777745052862106,
"learning_rate": 4.917330276168208e-06,
"loss": 0.102,
"step": 480
},
{
"epoch": 1.0126315789473683,
"grad_norm": 2.4751195672854704,
"learning_rate": 4.900798320366233e-06,
"loss": 0.0989,
"step": 481
},
{
"epoch": 1.0147368421052632,
"grad_norm": 1.889820583657195,
"learning_rate": 4.884267449417932e-06,
"loss": 0.0762,
"step": 482
},
{
"epoch": 1.016842105263158,
"grad_norm": 2.2064010641296155,
"learning_rate": 4.867737844102261e-06,
"loss": 0.0932,
"step": 483
},
{
"epoch": 1.0189473684210526,
"grad_norm": 2.9805461448209556,
"learning_rate": 4.851209685184339e-06,
"loss": 0.092,
"step": 484
},
{
"epoch": 1.0210526315789474,
"grad_norm": 2.2601627421875032,
"learning_rate": 4.8346831534134595e-06,
"loss": 0.09,
"step": 485
},
{
"epoch": 1.023157894736842,
"grad_norm": 2.3939195754809055,
"learning_rate": 4.818158429521129e-06,
"loss": 0.1179,
"step": 486
},
{
"epoch": 1.0252631578947369,
"grad_norm": 2.3451597644966573,
"learning_rate": 4.801635694219079e-06,
"loss": 0.08,
"step": 487
},
{
"epoch": 1.0273684210526315,
"grad_norm": 2.6640782365642592,
"learning_rate": 4.785115128197298e-06,
"loss": 0.1017,
"step": 488
},
{
"epoch": 1.0294736842105263,
"grad_norm": 2.1177638500079365,
"learning_rate": 4.768596912122046e-06,
"loss": 0.0731,
"step": 489
},
{
"epoch": 1.0315789473684212,
"grad_norm": 3.3240436401313618,
"learning_rate": 4.752081226633888e-06,
"loss": 0.0919,
"step": 490
},
{
"epoch": 1.0336842105263158,
"grad_norm": 2.2384781946355794,
"learning_rate": 4.735568252345718e-06,
"loss": 0.0719,
"step": 491
},
{
"epoch": 1.0357894736842106,
"grad_norm": 2.983441854897483,
"learning_rate": 4.719058169840773e-06,
"loss": 0.0745,
"step": 492
},
{
"epoch": 1.0378947368421052,
"grad_norm": 2.6556422035702045,
"learning_rate": 4.702551159670672e-06,
"loss": 0.0734,
"step": 493
},
{
"epoch": 1.04,
"grad_norm": 3.219998720581149,
"learning_rate": 4.686047402353433e-06,
"loss": 0.0775,
"step": 494
},
{
"epoch": 1.0421052631578946,
"grad_norm": 2.6391239908163233,
"learning_rate": 4.669547078371503e-06,
"loss": 0.0787,
"step": 495
},
{
"epoch": 1.0442105263157895,
"grad_norm": 3.041237149660994,
"learning_rate": 4.65305036816978e-06,
"loss": 0.089,
"step": 496
},
{
"epoch": 1.0463157894736843,
"grad_norm": 3.687880198514741,
"learning_rate": 4.636557452153645e-06,
"loss": 0.0831,
"step": 497
},
{
"epoch": 1.048421052631579,
"grad_norm": 5.095705229375661,
"learning_rate": 4.620068510686985e-06,
"loss": 0.0804,
"step": 498
},
{
"epoch": 1.0505263157894738,
"grad_norm": 3.197432814925761,
"learning_rate": 4.60358372409022e-06,
"loss": 0.0574,
"step": 499
},
{
"epoch": 1.0526315789473684,
"grad_norm": 3.09106465983814,
"learning_rate": 4.587103272638339e-06,
"loss": 0.0823,
"step": 500
},
{
"epoch": 1.0547368421052632,
"grad_norm": 3.4094205016943193,
"learning_rate": 4.570627336558915e-06,
"loss": 0.077,
"step": 501
},
{
"epoch": 1.0568421052631578,
"grad_norm": 3.2700532893266723,
"learning_rate": 4.554156096030149e-06,
"loss": 0.0888,
"step": 502
},
{
"epoch": 1.0589473684210526,
"grad_norm": 3.8444997651481274,
"learning_rate": 4.537689731178883e-06,
"loss": 0.0995,
"step": 503
},
{
"epoch": 1.0610526315789475,
"grad_norm": 3.460457464328528,
"learning_rate": 4.5212284220786495e-06,
"loss": 0.0852,
"step": 504
},
{
"epoch": 1.063157894736842,
"grad_norm": 3.5197825821543844,
"learning_rate": 4.504772348747687e-06,
"loss": 0.089,
"step": 505
},
{
"epoch": 1.065263157894737,
"grad_norm": 2.9315058365098148,
"learning_rate": 4.488321691146975e-06,
"loss": 0.0917,
"step": 506
},
{
"epoch": 1.0673684210526315,
"grad_norm": 2.959097650131179,
"learning_rate": 4.471876629178273e-06,
"loss": 0.0927,
"step": 507
},
{
"epoch": 1.0694736842105264,
"grad_norm": 2.9752640084242543,
"learning_rate": 4.4554373426821375e-06,
"loss": 0.0754,
"step": 508
},
{
"epoch": 1.071578947368421,
"grad_norm": 3.6867032363293633,
"learning_rate": 4.439004011435979e-06,
"loss": 0.0931,
"step": 509
},
{
"epoch": 1.0736842105263158,
"grad_norm": 3.9162282663094437,
"learning_rate": 4.42257681515207e-06,
"loss": 0.0915,
"step": 510
},
{
"epoch": 1.0757894736842106,
"grad_norm": 3.245956552006904,
"learning_rate": 4.406155933475599e-06,
"loss": 0.0825,
"step": 511
},
{
"epoch": 1.0778947368421052,
"grad_norm": 3.3407176706701303,
"learning_rate": 4.3897415459827e-06,
"loss": 0.0833,
"step": 512
},
{
"epoch": 1.08,
"grad_norm": 3.2605562952641325,
"learning_rate": 4.373333832178478e-06,
"loss": 0.0836,
"step": 513
},
{
"epoch": 1.0821052631578947,
"grad_norm": 3.022186927091034,
"learning_rate": 4.356932971495071e-06,
"loss": 0.0893,
"step": 514
},
{
"epoch": 1.0842105263157895,
"grad_norm": 2.570982657234066,
"learning_rate": 4.340539143289655e-06,
"loss": 0.0691,
"step": 515
},
{
"epoch": 1.0863157894736841,
"grad_norm": 2.6495275768322966,
"learning_rate": 4.324152526842517e-06,
"loss": 0.0703,
"step": 516
},
{
"epoch": 1.088421052631579,
"grad_norm": 2.9529381342049357,
"learning_rate": 4.307773301355063e-06,
"loss": 0.0878,
"step": 517
},
{
"epoch": 1.0905263157894738,
"grad_norm": 3.2139910027789913,
"learning_rate": 4.291401645947879e-06,
"loss": 0.0858,
"step": 518
},
{
"epoch": 1.0926315789473684,
"grad_norm": 3.359687231677775,
"learning_rate": 4.275037739658771e-06,
"loss": 0.0991,
"step": 519
},
{
"epoch": 1.0947368421052632,
"grad_norm": 2.7257961811651867,
"learning_rate": 4.25868176144079e-06,
"loss": 0.0636,
"step": 520
},
{
"epoch": 1.0968421052631578,
"grad_norm": 3.0300299027782205,
"learning_rate": 4.242333890160299e-06,
"loss": 0.0744,
"step": 521
},
{
"epoch": 1.0989473684210527,
"grad_norm": 2.673076324741469,
"learning_rate": 4.225994304594994e-06,
"loss": 0.0733,
"step": 522
},
{
"epoch": 1.1010526315789473,
"grad_norm": 3.434548397420313,
"learning_rate": 4.209663183431969e-06,
"loss": 0.0894,
"step": 523
},
{
"epoch": 1.1031578947368421,
"grad_norm": 2.656738290688316,
"learning_rate": 4.193340705265746e-06,
"loss": 0.0816,
"step": 524
},
{
"epoch": 1.1052631578947367,
"grad_norm": 2.752642253228426,
"learning_rate": 4.17702704859633e-06,
"loss": 0.0737,
"step": 525
},
{
"epoch": 1.1073684210526316,
"grad_norm": 2.8240215409779204,
"learning_rate": 4.160722391827262e-06,
"loss": 0.0946,
"step": 526
},
{
"epoch": 1.1094736842105264,
"grad_norm": 2.6310984451059523,
"learning_rate": 4.14442691326365e-06,
"loss": 0.075,
"step": 527
},
{
"epoch": 1.111578947368421,
"grad_norm": 3.3487552916421097,
"learning_rate": 4.128140791110243e-06,
"loss": 0.0904,
"step": 528
},
{
"epoch": 1.1136842105263158,
"grad_norm": 3.071402311195297,
"learning_rate": 4.111864203469457e-06,
"loss": 0.079,
"step": 529
},
{
"epoch": 1.1157894736842104,
"grad_norm": 2.8739724428152993,
"learning_rate": 4.0955973283394525e-06,
"loss": 0.0844,
"step": 530
},
{
"epoch": 1.1178947368421053,
"grad_norm": 2.7822422708634442,
"learning_rate": 4.079340343612165e-06,
"loss": 0.0943,
"step": 531
},
{
"epoch": 1.12,
"grad_norm": 2.5902291543214364,
"learning_rate": 4.063093427071376e-06,
"loss": 0.0827,
"step": 532
},
{
"epoch": 1.1221052631578947,
"grad_norm": 3.084348236278604,
"learning_rate": 4.046856756390767e-06,
"loss": 0.0892,
"step": 533
},
{
"epoch": 1.1242105263157895,
"grad_norm": 2.95461042174687,
"learning_rate": 4.03063050913196e-06,
"loss": 0.0816,
"step": 534
},
{
"epoch": 1.1263157894736842,
"grad_norm": 2.7009483055282892,
"learning_rate": 4.0144148627426e-06,
"loss": 0.063,
"step": 535
},
{
"epoch": 1.128421052631579,
"grad_norm": 3.2167472489705062,
"learning_rate": 3.998209994554395e-06,
"loss": 0.0993,
"step": 536
},
{
"epoch": 1.1305263157894736,
"grad_norm": 3.164155379501995,
"learning_rate": 3.982016081781189e-06,
"loss": 0.0928,
"step": 537
},
{
"epoch": 1.1326315789473684,
"grad_norm": 2.6712684161255873,
"learning_rate": 3.965833301517017e-06,
"loss": 0.0792,
"step": 538
},
{
"epoch": 1.134736842105263,
"grad_norm": 3.590217130090868,
"learning_rate": 3.949661830734172e-06,
"loss": 0.1122,
"step": 539
},
{
"epoch": 1.1368421052631579,
"grad_norm": 2.757855187593266,
"learning_rate": 3.9335018462812664e-06,
"loss": 0.0732,
"step": 540
},
{
"epoch": 1.1389473684210527,
"grad_norm": 3.536487052728721,
"learning_rate": 3.9173535248813026e-06,
"loss": 0.0678,
"step": 541
},
{
"epoch": 1.1410526315789473,
"grad_norm": 2.4862366546978483,
"learning_rate": 3.901217043129735e-06,
"loss": 0.0728,
"step": 542
},
{
"epoch": 1.1431578947368422,
"grad_norm": 3.2902474007718907,
"learning_rate": 3.885092577492543e-06,
"loss": 0.1086,
"step": 543
},
{
"epoch": 1.1452631578947368,
"grad_norm": 3.451017932646852,
"learning_rate": 3.8689803043043e-06,
"loss": 0.0868,
"step": 544
},
{
"epoch": 1.1473684210526316,
"grad_norm": 2.8980165245573692,
"learning_rate": 3.852880399766243e-06,
"loss": 0.0829,
"step": 545
},
{
"epoch": 1.1494736842105264,
"grad_norm": 2.8916674632956134,
"learning_rate": 3.8367930399443495e-06,
"loss": 0.0782,
"step": 546
},
{
"epoch": 1.151578947368421,
"grad_norm": 2.505026430566736,
"learning_rate": 3.820718400767409e-06,
"loss": 0.0763,
"step": 547
},
{
"epoch": 1.1536842105263159,
"grad_norm": 3.7982142015035305,
"learning_rate": 3.8046566580251e-06,
"loss": 0.0895,
"step": 548
},
{
"epoch": 1.1557894736842105,
"grad_norm": 2.550264620010391,
"learning_rate": 3.7886079873660693e-06,
"loss": 0.085,
"step": 549
},
{
"epoch": 1.1578947368421053,
"grad_norm": 3.2823943298060483,
"learning_rate": 3.7725725642960047e-06,
"loss": 0.0838,
"step": 550
},
{
"epoch": 1.16,
"grad_norm": 3.091694066417572,
"learning_rate": 3.756550564175727e-06,
"loss": 0.0945,
"step": 551
},
{
"epoch": 1.1621052631578948,
"grad_norm": 2.6667880955040855,
"learning_rate": 3.7405421622192607e-06,
"loss": 0.067,
"step": 552
},
{
"epoch": 1.1642105263157894,
"grad_norm": 3.2831274480460055,
"learning_rate": 3.7245475334919246e-06,
"loss": 0.0994,
"step": 553
},
{
"epoch": 1.1663157894736842,
"grad_norm": 2.2115339073168903,
"learning_rate": 3.7085668529084183e-06,
"loss": 0.0609,
"step": 554
},
{
"epoch": 1.168421052631579,
"grad_norm": 3.103922834249276,
"learning_rate": 3.6926002952309015e-06,
"loss": 0.0705,
"step": 555
},
{
"epoch": 1.1705263157894736,
"grad_norm": 3.1379466258374413,
"learning_rate": 3.676648035067093e-06,
"loss": 0.0755,
"step": 556
},
{
"epoch": 1.1726315789473685,
"grad_norm": 3.144558129851556,
"learning_rate": 3.6607102468683524e-06,
"loss": 0.0906,
"step": 557
},
{
"epoch": 1.174736842105263,
"grad_norm": 2.843620211669143,
"learning_rate": 3.64478710492778e-06,
"loss": 0.0752,
"step": 558
},
{
"epoch": 1.176842105263158,
"grad_norm": 2.737119577196797,
"learning_rate": 3.628878783378302e-06,
"loss": 0.0855,
"step": 559
},
{
"epoch": 1.1789473684210527,
"grad_norm": 3.38606655435301,
"learning_rate": 3.6129854561907786e-06,
"loss": 0.1073,
"step": 560
},
{
"epoch": 1.1810526315789474,
"grad_norm": 3.022666071905334,
"learning_rate": 3.5971072971720844e-06,
"loss": 0.096,
"step": 561
},
{
"epoch": 1.1831578947368422,
"grad_norm": 3.013844862309235,
"learning_rate": 3.581244479963225e-06,
"loss": 0.0699,
"step": 562
},
{
"epoch": 1.1852631578947368,
"grad_norm": 2.4616484667093532,
"learning_rate": 3.56539717803743e-06,
"loss": 0.0686,
"step": 563
},
{
"epoch": 1.1873684210526316,
"grad_norm": 3.092537315474559,
"learning_rate": 3.5495655646982506e-06,
"loss": 0.1022,
"step": 564
},
{
"epoch": 1.1894736842105262,
"grad_norm": 2.781358161394791,
"learning_rate": 3.533749813077677e-06,
"loss": 0.0804,
"step": 565
},
{
"epoch": 1.191578947368421,
"grad_norm": 2.453888657239943,
"learning_rate": 3.517950096134232e-06,
"loss": 0.0577,
"step": 566
},
{
"epoch": 1.1936842105263157,
"grad_norm": 2.935143310735812,
"learning_rate": 3.5021665866510924e-06,
"loss": 0.0905,
"step": 567
},
{
"epoch": 1.1957894736842105,
"grad_norm": 2.6509666167907726,
"learning_rate": 3.4863994572341845e-06,
"loss": 0.0854,
"step": 568
},
{
"epoch": 1.1978947368421053,
"grad_norm": 2.955009635915876,
"learning_rate": 3.470648880310313e-06,
"loss": 0.0883,
"step": 569
},
{
"epoch": 1.2,
"grad_norm": 3.2167621312639794,
"learning_rate": 3.4549150281252635e-06,
"loss": 0.072,
"step": 570
},
{
"epoch": 1.2021052631578948,
"grad_norm": 2.7638395907473225,
"learning_rate": 3.4391980727419206e-06,
"loss": 0.082,
"step": 571
},
{
"epoch": 1.2042105263157894,
"grad_norm": 3.4412996909683806,
"learning_rate": 3.423498186038393e-06,
"loss": 0.1093,
"step": 572
},
{
"epoch": 1.2063157894736842,
"grad_norm": 2.6520865262952515,
"learning_rate": 3.4078155397061243e-06,
"loss": 0.07,
"step": 573
},
{
"epoch": 1.208421052631579,
"grad_norm": 2.376283277388007,
"learning_rate": 3.3921503052480243e-06,
"loss": 0.0748,
"step": 574
},
{
"epoch": 1.2105263157894737,
"grad_norm": 2.242871891939484,
"learning_rate": 3.3765026539765832e-06,
"loss": 0.0681,
"step": 575
},
{
"epoch": 1.2126315789473685,
"grad_norm": 4.4744073948915535,
"learning_rate": 3.3608727570120114e-06,
"loss": 0.0928,
"step": 576
},
{
"epoch": 1.2147368421052631,
"grad_norm": 2.7537788196508903,
"learning_rate": 3.3452607852803585e-06,
"loss": 0.0966,
"step": 577
},
{
"epoch": 1.216842105263158,
"grad_norm": 3.1307439683337517,
"learning_rate": 3.3296669095116454e-06,
"loss": 0.0778,
"step": 578
},
{
"epoch": 1.2189473684210526,
"grad_norm": 2.7177020729876253,
"learning_rate": 3.3140913002379993e-06,
"loss": 0.0697,
"step": 579
},
{
"epoch": 1.2210526315789474,
"grad_norm": 2.4210212110003484,
"learning_rate": 3.298534127791785e-06,
"loss": 0.0898,
"step": 580
},
{
"epoch": 1.223157894736842,
"grad_norm": 2.486998916941089,
"learning_rate": 3.2829955623037536e-06,
"loss": 0.0906,
"step": 581
},
{
"epoch": 1.2252631578947368,
"grad_norm": 2.398945212379016,
"learning_rate": 3.267475773701161e-06,
"loss": 0.072,
"step": 582
},
{
"epoch": 1.2273684210526317,
"grad_norm": 2.98342220040315,
"learning_rate": 3.251974931705933e-06,
"loss": 0.0884,
"step": 583
},
{
"epoch": 1.2294736842105263,
"grad_norm": 2.471587722526877,
"learning_rate": 3.236493205832795e-06,
"loss": 0.0803,
"step": 584
},
{
"epoch": 1.231578947368421,
"grad_norm": 3.3871586088205277,
"learning_rate": 3.2210307653874175e-06,
"loss": 0.0905,
"step": 585
},
{
"epoch": 1.2336842105263157,
"grad_norm": 3.1548244792093727,
"learning_rate": 3.205587779464576e-06,
"loss": 0.0807,
"step": 586
},
{
"epoch": 1.2357894736842105,
"grad_norm": 2.751888064247698,
"learning_rate": 3.1901644169462854e-06,
"loss": 0.1001,
"step": 587
},
{
"epoch": 1.2378947368421054,
"grad_norm": 2.953100733477854,
"learning_rate": 3.1747608464999723e-06,
"loss": 0.0859,
"step": 588
},
{
"epoch": 1.24,
"grad_norm": 3.0915284845137383,
"learning_rate": 3.1593772365766107e-06,
"loss": 0.0916,
"step": 589
},
{
"epoch": 1.2421052631578948,
"grad_norm": 3.212692328218412,
"learning_rate": 3.1440137554088957e-06,
"loss": 0.087,
"step": 590
},
{
"epoch": 1.2442105263157894,
"grad_norm": 3.6741186905601673,
"learning_rate": 3.128670571009399e-06,
"loss": 0.0918,
"step": 591
},
{
"epoch": 1.2463157894736843,
"grad_norm": 3.052869956993313,
"learning_rate": 3.1133478511687217e-06,
"loss": 0.0882,
"step": 592
},
{
"epoch": 1.2484210526315789,
"grad_norm": 2.7038650862339026,
"learning_rate": 3.0980457634536775e-06,
"loss": 0.0694,
"step": 593
},
{
"epoch": 1.2505263157894737,
"grad_norm": 3.018334207884892,
"learning_rate": 3.082764475205442e-06,
"loss": 0.0858,
"step": 594
},
{
"epoch": 1.2526315789473683,
"grad_norm": 2.569495801078813,
"learning_rate": 3.06750415353774e-06,
"loss": 0.0782,
"step": 595
},
{
"epoch": 1.2547368421052632,
"grad_norm": 3.3984298834388835,
"learning_rate": 3.052264965335e-06,
"loss": 0.109,
"step": 596
},
{
"epoch": 1.256842105263158,
"grad_norm": 2.3590945955494416,
"learning_rate": 3.0370470772505433e-06,
"loss": 0.071,
"step": 597
},
{
"epoch": 1.2589473684210526,
"grad_norm": 2.234877038235419,
"learning_rate": 3.02185065570476e-06,
"loss": 0.0692,
"step": 598
},
{
"epoch": 1.2610526315789474,
"grad_norm": 4.052354805427412,
"learning_rate": 3.0066758668832752e-06,
"loss": 0.0948,
"step": 599
},
{
"epoch": 1.263157894736842,
"grad_norm": 2.742079337401893,
"learning_rate": 2.991522876735154e-06,
"loss": 0.0969,
"step": 600
},
{
"epoch": 1.263157894736842,
"eval_loss": 0.20044729113578796,
"eval_runtime": 0.929,
"eval_samples_per_second": 41.983,
"eval_steps_per_second": 10.765,
"step": 600
},
{
"epoch": 1.2652631578947369,
"grad_norm": 2.8741805199859205,
"learning_rate": 2.9763918509710647e-06,
"loss": 0.0963,
"step": 601
},
{
"epoch": 1.2673684210526317,
"grad_norm": 2.737388943410033,
"learning_rate": 2.9612829550614836e-06,
"loss": 0.0826,
"step": 602
},
{
"epoch": 1.2694736842105263,
"grad_norm": 2.717582073137317,
"learning_rate": 2.9461963542348737e-06,
"loss": 0.0681,
"step": 603
},
{
"epoch": 1.271578947368421,
"grad_norm": 3.3716699599065123,
"learning_rate": 2.931132213475884e-06,
"loss": 0.101,
"step": 604
},
{
"epoch": 1.2736842105263158,
"grad_norm": 2.439989476563021,
"learning_rate": 2.9160906975235493e-06,
"loss": 0.0732,
"step": 605
},
{
"epoch": 1.2757894736842106,
"grad_norm": 3.092634953355724,
"learning_rate": 2.9010719708694724e-06,
"loss": 0.0744,
"step": 606
},
{
"epoch": 1.2778947368421052,
"grad_norm": 2.638312682828106,
"learning_rate": 2.8860761977560435e-06,
"loss": 0.0757,
"step": 607
},
{
"epoch": 1.28,
"grad_norm": 2.3219077212278494,
"learning_rate": 2.871103542174637e-06,
"loss": 0.0941,
"step": 608
},
{
"epoch": 1.2821052631578946,
"grad_norm": 2.7468019529994607,
"learning_rate": 2.8561541678638145e-06,
"loss": 0.0679,
"step": 609
},
{
"epoch": 1.2842105263157895,
"grad_norm": 2.592555737944712,
"learning_rate": 2.8412282383075362e-06,
"loss": 0.072,
"step": 610
},
{
"epoch": 1.2863157894736843,
"grad_norm": 2.5748600678466493,
"learning_rate": 2.826325916733378e-06,
"loss": 0.078,
"step": 611
},
{
"epoch": 1.288421052631579,
"grad_norm": 3.272935966473756,
"learning_rate": 2.811447366110741e-06,
"loss": 0.0985,
"step": 612
},
{
"epoch": 1.2905263157894737,
"grad_norm": 2.718391171117533,
"learning_rate": 2.796592749149071e-06,
"loss": 0.0856,
"step": 613
},
{
"epoch": 1.2926315789473684,
"grad_norm": 2.895251659895727,
"learning_rate": 2.7817622282960816e-06,
"loss": 0.0814,
"step": 614
},
{
"epoch": 1.2947368421052632,
"grad_norm": 3.098118941203153,
"learning_rate": 2.766955965735968e-06,
"loss": 0.1,
"step": 615
},
{
"epoch": 1.296842105263158,
"grad_norm": 2.4925459738486078,
"learning_rate": 2.7521741233876496e-06,
"loss": 0.066,
"step": 616
},
{
"epoch": 1.2989473684210526,
"grad_norm": 3.2668047779458447,
"learning_rate": 2.7374168629029814e-06,
"loss": 0.0662,
"step": 617
},
{
"epoch": 1.3010526315789472,
"grad_norm": 2.365373318259492,
"learning_rate": 2.722684345665004e-06,
"loss": 0.0568,
"step": 618
},
{
"epoch": 1.303157894736842,
"grad_norm": 2.674596379228086,
"learning_rate": 2.707976732786166e-06,
"loss": 0.0716,
"step": 619
},
{
"epoch": 1.305263157894737,
"grad_norm": 2.629799474227148,
"learning_rate": 2.693294185106562e-06,
"loss": 0.0708,
"step": 620
},
{
"epoch": 1.3073684210526315,
"grad_norm": 2.7427790950830917,
"learning_rate": 2.678636863192184e-06,
"loss": 0.0819,
"step": 621
},
{
"epoch": 1.3094736842105263,
"grad_norm": 2.355303580669883,
"learning_rate": 2.6640049273331516e-06,
"loss": 0.0682,
"step": 622
},
{
"epoch": 1.311578947368421,
"grad_norm": 2.5202135513477595,
"learning_rate": 2.649398537541978e-06,
"loss": 0.0592,
"step": 623
},
{
"epoch": 1.3136842105263158,
"grad_norm": 3.1053392641729056,
"learning_rate": 2.6348178535517967e-06,
"loss": 0.0815,
"step": 624
},
{
"epoch": 1.3157894736842106,
"grad_norm": 2.287461098881695,
"learning_rate": 2.6202630348146323e-06,
"loss": 0.0809,
"step": 625
},
{
"epoch": 1.3178947368421052,
"grad_norm": 2.443760626345547,
"learning_rate": 2.605734240499652e-06,
"loss": 0.0604,
"step": 626
},
{
"epoch": 1.32,
"grad_norm": 2.845171834877243,
"learning_rate": 2.5912316294914232e-06,
"loss": 0.0744,
"step": 627
},
{
"epoch": 1.3221052631578947,
"grad_norm": 2.1955991785027127,
"learning_rate": 2.576755360388177e-06,
"loss": 0.0592,
"step": 628
},
{
"epoch": 1.3242105263157895,
"grad_norm": 3.8739275751093456,
"learning_rate": 2.562305591500069e-06,
"loss": 0.1056,
"step": 629
},
{
"epoch": 1.3263157894736843,
"grad_norm": 2.6932111092229234,
"learning_rate": 2.5478824808474613e-06,
"loss": 0.0762,
"step": 630
},
{
"epoch": 1.328421052631579,
"grad_norm": 2.3653261808302393,
"learning_rate": 2.5334861861591753e-06,
"loss": 0.072,
"step": 631
},
{
"epoch": 1.3305263157894736,
"grad_norm": 3.0167769884448057,
"learning_rate": 2.5191168648707888e-06,
"loss": 0.0896,
"step": 632
},
{
"epoch": 1.3326315789473684,
"grad_norm": 2.4444541248066796,
"learning_rate": 2.5047746741228977e-06,
"loss": 0.0679,
"step": 633
},
{
"epoch": 1.3347368421052632,
"grad_norm": 3.1319838501376056,
"learning_rate": 2.490459770759398e-06,
"loss": 0.0794,
"step": 634
},
{
"epoch": 1.3368421052631578,
"grad_norm": 2.4160632314580583,
"learning_rate": 2.476172311325783e-06,
"loss": 0.057,
"step": 635
},
{
"epoch": 1.3389473684210527,
"grad_norm": 2.8056259509770083,
"learning_rate": 2.461912452067415e-06,
"loss": 0.0788,
"step": 636
},
{
"epoch": 1.3410526315789473,
"grad_norm": 3.5606250812923994,
"learning_rate": 2.447680348927837e-06,
"loss": 0.0991,
"step": 637
},
{
"epoch": 1.343157894736842,
"grad_norm": 2.939424365838173,
"learning_rate": 2.433476157547044e-06,
"loss": 0.0791,
"step": 638
},
{
"epoch": 1.345263157894737,
"grad_norm": 2.3944747127012547,
"learning_rate": 2.4193000332597984e-06,
"loss": 0.0776,
"step": 639
},
{
"epoch": 1.3473684210526315,
"grad_norm": 2.4097140332528144,
"learning_rate": 2.4051521310939258e-06,
"loss": 0.0548,
"step": 640
},
{
"epoch": 1.3494736842105264,
"grad_norm": 2.484876677592921,
"learning_rate": 2.391032605768613e-06,
"loss": 0.0639,
"step": 641
},
{
"epoch": 1.351578947368421,
"grad_norm": 2.9569313033101023,
"learning_rate": 2.3769416116927335e-06,
"loss": 0.0702,
"step": 642
},
{
"epoch": 1.3536842105263158,
"grad_norm": 2.3154837496863268,
"learning_rate": 2.3628793029631353e-06,
"loss": 0.0696,
"step": 643
},
{
"epoch": 1.3557894736842107,
"grad_norm": 4.0950527559872345,
"learning_rate": 2.3488458333629777e-06,
"loss": 0.0988,
"step": 644
},
{
"epoch": 1.3578947368421053,
"grad_norm": 3.2898299985671953,
"learning_rate": 2.3348413563600324e-06,
"loss": 0.0998,
"step": 645
},
{
"epoch": 1.3599999999999999,
"grad_norm": 3.5565589465236402,
"learning_rate": 2.320866025105016e-06,
"loss": 0.0748,
"step": 646
},
{
"epoch": 1.3621052631578947,
"grad_norm": 2.9838579493260142,
"learning_rate": 2.3069199924299175e-06,
"loss": 0.0781,
"step": 647
},
{
"epoch": 1.3642105263157895,
"grad_norm": 3.109898543479839,
"learning_rate": 2.29300341084631e-06,
"loss": 0.0702,
"step": 648
},
{
"epoch": 1.3663157894736842,
"grad_norm": 2.736969851304859,
"learning_rate": 2.2791164325437047e-06,
"loss": 0.0792,
"step": 649
},
{
"epoch": 1.368421052631579,
"grad_norm": 3.3868564041377973,
"learning_rate": 2.265259209387867e-06,
"loss": 0.0899,
"step": 650
},
{
"epoch": 1.3705263157894736,
"grad_norm": 2.7082372890636375,
"learning_rate": 2.2514318929191707e-06,
"loss": 0.0752,
"step": 651
},
{
"epoch": 1.3726315789473684,
"grad_norm": 3.274685542658562,
"learning_rate": 2.2376346343509343e-06,
"loss": 0.0789,
"step": 652
},
{
"epoch": 1.3747368421052633,
"grad_norm": 3.3918195389906436,
"learning_rate": 2.2238675845677663e-06,
"loss": 0.0811,
"step": 653
},
{
"epoch": 1.3768421052631579,
"grad_norm": 2.436153684588233,
"learning_rate": 2.2101308941239204e-06,
"loss": 0.0694,
"step": 654
},
{
"epoch": 1.3789473684210527,
"grad_norm": 3.2956597758884816,
"learning_rate": 2.1964247132416373e-06,
"loss": 0.0845,
"step": 655
},
{
"epoch": 1.3810526315789473,
"grad_norm": 2.891107537325035,
"learning_rate": 2.182749191809518e-06,
"loss": 0.0806,
"step": 656
},
{
"epoch": 1.3831578947368421,
"grad_norm": 3.174882207717556,
"learning_rate": 2.1691044793808734e-06,
"loss": 0.0766,
"step": 657
},
{
"epoch": 1.385263157894737,
"grad_norm": 3.239310623984899,
"learning_rate": 2.1554907251720947e-06,
"loss": 0.1132,
"step": 658
},
{
"epoch": 1.3873684210526316,
"grad_norm": 3.0000793459288797,
"learning_rate": 2.1419080780610123e-06,
"loss": 0.0779,
"step": 659
},
{
"epoch": 1.3894736842105262,
"grad_norm": 3.0156978277959166,
"learning_rate": 2.1283566865852824e-06,
"loss": 0.074,
"step": 660
},
{
"epoch": 1.391578947368421,
"grad_norm": 2.6551132879995007,
"learning_rate": 2.11483669894075e-06,
"loss": 0.0746,
"step": 661
},
{
"epoch": 1.3936842105263159,
"grad_norm": 2.3673083937385875,
"learning_rate": 2.1013482629798334e-06,
"loss": 0.0714,
"step": 662
},
{
"epoch": 1.3957894736842105,
"grad_norm": 3.2553558351574288,
"learning_rate": 2.08789152620991e-06,
"loss": 0.1127,
"step": 663
},
{
"epoch": 1.3978947368421053,
"grad_norm": 3.1793358246388386,
"learning_rate": 2.0744666357916925e-06,
"loss": 0.1027,
"step": 664
},
{
"epoch": 1.4,
"grad_norm": 2.3044156367485766,
"learning_rate": 2.061073738537635e-06,
"loss": 0.079,
"step": 665
},
{
"epoch": 1.4021052631578947,
"grad_norm": 3.038506581721067,
"learning_rate": 2.0477129809103147e-06,
"loss": 0.078,
"step": 666
},
{
"epoch": 1.4042105263157896,
"grad_norm": 3.2134574970286254,
"learning_rate": 2.034384509020837e-06,
"loss": 0.0787,
"step": 667
},
{
"epoch": 1.4063157894736842,
"grad_norm": 2.8833109727229593,
"learning_rate": 2.021088468627237e-06,
"loss": 0.0945,
"step": 668
},
{
"epoch": 1.408421052631579,
"grad_norm": 2.6681060575893922,
"learning_rate": 2.0078250051328783e-06,
"loss": 0.0785,
"step": 669
},
{
"epoch": 1.4105263157894736,
"grad_norm": 3.309105305859723,
"learning_rate": 1.9945942635848745e-06,
"loss": 0.0932,
"step": 670
},
{
"epoch": 1.4126315789473685,
"grad_norm": 2.7557375649742992,
"learning_rate": 1.981396388672496e-06,
"loss": 0.0704,
"step": 671
},
{
"epoch": 1.4147368421052633,
"grad_norm": 2.780789111855544,
"learning_rate": 1.9682315247255897e-06,
"loss": 0.0681,
"step": 672
},
{
"epoch": 1.416842105263158,
"grad_norm": 2.891129671769762,
"learning_rate": 1.9550998157129946e-06,
"loss": 0.0689,
"step": 673
},
{
"epoch": 1.4189473684210525,
"grad_norm": 2.8438288324834136,
"learning_rate": 1.9420014052409793e-06,
"loss": 0.0948,
"step": 674
},
{
"epoch": 1.4210526315789473,
"grad_norm": 2.8648199763393363,
"learning_rate": 1.928936436551661e-06,
"loss": 0.0852,
"step": 675
},
{
"epoch": 1.4231578947368422,
"grad_norm": 2.4983276715177802,
"learning_rate": 1.915905052521445e-06,
"loss": 0.0691,
"step": 676
},
{
"epoch": 1.4252631578947368,
"grad_norm": 2.6685238310395167,
"learning_rate": 1.9029073956594607e-06,
"loss": 0.0902,
"step": 677
},
{
"epoch": 1.4273684210526316,
"grad_norm": 2.6981688261841623,
"learning_rate": 1.8899436081059974e-06,
"loss": 0.0626,
"step": 678
},
{
"epoch": 1.4294736842105262,
"grad_norm": 3.4512093051473287,
"learning_rate": 1.877013831630961e-06,
"loss": 0.0873,
"step": 679
},
{
"epoch": 1.431578947368421,
"grad_norm": 3.136937795473418,
"learning_rate": 1.864118207632315e-06,
"loss": 0.0817,
"step": 680
},
{
"epoch": 1.433684210526316,
"grad_norm": 2.845589565177414,
"learning_rate": 1.851256877134538e-06,
"loss": 0.084,
"step": 681
},
{
"epoch": 1.4357894736842105,
"grad_norm": 2.6247730269493634,
"learning_rate": 1.838429980787081e-06,
"loss": 0.0868,
"step": 682
},
{
"epoch": 1.4378947368421053,
"grad_norm": 2.212932444663422,
"learning_rate": 1.825637658862824e-06,
"loss": 0.056,
"step": 683
},
{
"epoch": 1.44,
"grad_norm": 3.0708180874395525,
"learning_rate": 1.8128800512565514e-06,
"loss": 0.0819,
"step": 684
},
{
"epoch": 1.4421052631578948,
"grad_norm": 3.191848306499893,
"learning_rate": 1.8001572974834169e-06,
"loss": 0.0874,
"step": 685
},
{
"epoch": 1.4442105263157896,
"grad_norm": 3.118644672611863,
"learning_rate": 1.7874695366774191e-06,
"loss": 0.0703,
"step": 686
},
{
"epoch": 1.4463157894736842,
"grad_norm": 3.3640943896050577,
"learning_rate": 1.774816907589873e-06,
"loss": 0.0792,
"step": 687
},
{
"epoch": 1.4484210526315788,
"grad_norm": 2.4258203150187994,
"learning_rate": 1.7621995485879062e-06,
"loss": 0.0759,
"step": 688
},
{
"epoch": 1.4505263157894737,
"grad_norm": 2.581622498733916,
"learning_rate": 1.749617597652934e-06,
"loss": 0.063,
"step": 689
},
{
"epoch": 1.4526315789473685,
"grad_norm": 2.946473481196987,
"learning_rate": 1.7370711923791567e-06,
"loss": 0.0822,
"step": 690
},
{
"epoch": 1.454736842105263,
"grad_norm": 2.4753384481559055,
"learning_rate": 1.7245604699720536e-06,
"loss": 0.0598,
"step": 691
},
{
"epoch": 1.456842105263158,
"grad_norm": 3.416152120993626,
"learning_rate": 1.7120855672468779e-06,
"loss": 0.0907,
"step": 692
},
{
"epoch": 1.4589473684210525,
"grad_norm": 2.695879145021625,
"learning_rate": 1.6996466206271679e-06,
"loss": 0.0612,
"step": 693
},
{
"epoch": 1.4610526315789474,
"grad_norm": 2.9873858087756635,
"learning_rate": 1.6872437661432518e-06,
"loss": 0.0811,
"step": 694
},
{
"epoch": 1.4631578947368422,
"grad_norm": 3.4960013543762125,
"learning_rate": 1.6748771394307584e-06,
"loss": 0.0813,
"step": 695
},
{
"epoch": 1.4652631578947368,
"grad_norm": 2.277927581675807,
"learning_rate": 1.6625468757291379e-06,
"loss": 0.0561,
"step": 696
},
{
"epoch": 1.4673684210526317,
"grad_norm": 2.3801716966637105,
"learning_rate": 1.6502531098801756e-06,
"loss": 0.0672,
"step": 697
},
{
"epoch": 1.4694736842105263,
"grad_norm": 3.4687383026734837,
"learning_rate": 1.6379959763265268e-06,
"loss": 0.0876,
"step": 698
},
{
"epoch": 1.471578947368421,
"grad_norm": 3.2466329337423874,
"learning_rate": 1.62577560911024e-06,
"loss": 0.0778,
"step": 699
},
{
"epoch": 1.4736842105263157,
"grad_norm": 2.904526223024363,
"learning_rate": 1.6135921418712959e-06,
"loss": 0.0938,
"step": 700
},
{
"epoch": 1.4757894736842105,
"grad_norm": 2.686553988217244,
"learning_rate": 1.6014457078461354e-06,
"loss": 0.0643,
"step": 701
},
{
"epoch": 1.4778947368421052,
"grad_norm": 3.5299540645067444,
"learning_rate": 1.5893364398662175e-06,
"loss": 0.0936,
"step": 702
},
{
"epoch": 1.48,
"grad_norm": 2.9376299944007855,
"learning_rate": 1.5772644703565564e-06,
"loss": 0.0853,
"step": 703
},
{
"epoch": 1.4821052631578948,
"grad_norm": 2.688100953143273,
"learning_rate": 1.5652299313342772e-06,
"loss": 0.0792,
"step": 704
},
{
"epoch": 1.4842105263157894,
"grad_norm": 3.4048347453623857,
"learning_rate": 1.5532329544071712e-06,
"loss": 0.083,
"step": 705
},
{
"epoch": 1.4863157894736843,
"grad_norm": 2.0479188515220623,
"learning_rate": 1.5412736707722537e-06,
"loss": 0.0483,
"step": 706
},
{
"epoch": 1.4884210526315789,
"grad_norm": 2.2750434748340935,
"learning_rate": 1.5293522112143371e-06,
"loss": 0.0619,
"step": 707
},
{
"epoch": 1.4905263157894737,
"grad_norm": 2.7200793140054103,
"learning_rate": 1.517468706104589e-06,
"loss": 0.0727,
"step": 708
},
{
"epoch": 1.4926315789473685,
"grad_norm": 2.2546054623423335,
"learning_rate": 1.505623285399121e-06,
"loss": 0.0488,
"step": 709
},
{
"epoch": 1.4947368421052631,
"grad_norm": 2.7989024442399435,
"learning_rate": 1.4938160786375571e-06,
"loss": 0.0921,
"step": 710
},
{
"epoch": 1.496842105263158,
"grad_norm": 2.6090857807395023,
"learning_rate": 1.4820472149416153e-06,
"loss": 0.074,
"step": 711
},
{
"epoch": 1.4989473684210526,
"grad_norm": 2.5952217582934756,
"learning_rate": 1.4703168230137072e-06,
"loss": 0.0531,
"step": 712
},
{
"epoch": 1.5010526315789474,
"grad_norm": 2.854421948263307,
"learning_rate": 1.4586250311355132e-06,
"loss": 0.0706,
"step": 713
},
{
"epoch": 1.5031578947368422,
"grad_norm": 2.491393713483,
"learning_rate": 1.4469719671666043e-06,
"loss": 0.0712,
"step": 714
},
{
"epoch": 1.5052631578947369,
"grad_norm": 3.0941426529266085,
"learning_rate": 1.4353577585430152e-06,
"loss": 0.1008,
"step": 715
},
{
"epoch": 1.5073684210526315,
"grad_norm": 2.1188906422201153,
"learning_rate": 1.4237825322758735e-06,
"loss": 0.053,
"step": 716
},
{
"epoch": 1.5094736842105263,
"grad_norm": 2.499562810650923,
"learning_rate": 1.412246414949997e-06,
"loss": 0.0773,
"step": 717
},
{
"epoch": 1.5115789473684211,
"grad_norm": 2.686352758516756,
"learning_rate": 1.4007495327225162e-06,
"loss": 0.0803,
"step": 718
},
{
"epoch": 1.5136842105263157,
"grad_norm": 3.003166198685424,
"learning_rate": 1.389292011321498e-06,
"loss": 0.0942,
"step": 719
},
{
"epoch": 1.5157894736842106,
"grad_norm": 3.276518434334761,
"learning_rate": 1.3778739760445552e-06,
"loss": 0.0822,
"step": 720
},
{
"epoch": 1.5178947368421052,
"grad_norm": 2.5581478643854147,
"learning_rate": 1.3664955517574967e-06,
"loss": 0.0656,
"step": 721
},
{
"epoch": 1.52,
"grad_norm": 2.9505141669864203,
"learning_rate": 1.3551568628929434e-06,
"loss": 0.0695,
"step": 722
},
{
"epoch": 1.5221052631578948,
"grad_norm": 2.8839627500523632,
"learning_rate": 1.343858033448982e-06,
"loss": 0.0731,
"step": 723
},
{
"epoch": 1.5242105263157895,
"grad_norm": 2.7109817535795693,
"learning_rate": 1.3325991869878013e-06,
"loss": 0.0648,
"step": 724
},
{
"epoch": 1.526315789473684,
"grad_norm": 2.9195695673715063,
"learning_rate": 1.321380446634342e-06,
"loss": 0.0825,
"step": 725
},
{
"epoch": 1.528421052631579,
"grad_norm": 2.857165833663471,
"learning_rate": 1.3102019350749528e-06,
"loss": 0.062,
"step": 726
},
{
"epoch": 1.5305263157894737,
"grad_norm": 3.120768459996078,
"learning_rate": 1.2990637745560418e-06,
"loss": 0.0638,
"step": 727
},
{
"epoch": 1.5326315789473686,
"grad_norm": 3.601845356286033,
"learning_rate": 1.2879660868827508e-06,
"loss": 0.0606,
"step": 728
},
{
"epoch": 1.5347368421052632,
"grad_norm": 2.754264101684756,
"learning_rate": 1.2769089934176126e-06,
"loss": 0.061,
"step": 729
},
{
"epoch": 1.5368421052631578,
"grad_norm": 3.763355031418207,
"learning_rate": 1.2658926150792321e-06,
"loss": 0.0853,
"step": 730
},
{
"epoch": 1.5389473684210526,
"grad_norm": 2.435184182108376,
"learning_rate": 1.2549170723409548e-06,
"loss": 0.068,
"step": 731
},
{
"epoch": 1.5410526315789475,
"grad_norm": 3.1384502245840205,
"learning_rate": 1.243982485229559e-06,
"loss": 0.0839,
"step": 732
},
{
"epoch": 1.543157894736842,
"grad_norm": 2.915181764624064,
"learning_rate": 1.233088973323937e-06,
"loss": 0.0932,
"step": 733
},
{
"epoch": 1.545263157894737,
"grad_norm": 3.4631955921569824,
"learning_rate": 1.2222366557537911e-06,
"loss": 0.0902,
"step": 734
},
{
"epoch": 1.5473684210526315,
"grad_norm": 3.012931176367388,
"learning_rate": 1.2114256511983274e-06,
"loss": 0.0887,
"step": 735
},
{
"epoch": 1.5494736842105263,
"grad_norm": 3.1207818685791144,
"learning_rate": 1.200656077884958e-06,
"loss": 0.1018,
"step": 736
},
{
"epoch": 1.5515789473684212,
"grad_norm": 2.4612609560279877,
"learning_rate": 1.189928053588012e-06,
"loss": 0.0822,
"step": 737
},
{
"epoch": 1.5536842105263158,
"grad_norm": 3.8256380681691797,
"learning_rate": 1.1792416956274443e-06,
"loss": 0.0885,
"step": 738
},
{
"epoch": 1.5557894736842104,
"grad_norm": 2.3636768015398557,
"learning_rate": 1.1685971208675539e-06,
"loss": 0.0459,
"step": 739
},
{
"epoch": 1.5578947368421052,
"grad_norm": 2.6401608062517825,
"learning_rate": 1.157994445715706e-06,
"loss": 0.0828,
"step": 740
},
{
"epoch": 1.56,
"grad_norm": 2.3934171850503727,
"learning_rate": 1.1474337861210543e-06,
"loss": 0.0678,
"step": 741
},
{
"epoch": 1.5621052631578949,
"grad_norm": 2.337161351456917,
"learning_rate": 1.1369152575732823e-06,
"loss": 0.0514,
"step": 742
},
{
"epoch": 1.5642105263157895,
"grad_norm": 2.7480915820160154,
"learning_rate": 1.1264389751013326e-06,
"loss": 0.0881,
"step": 743
},
{
"epoch": 1.566315789473684,
"grad_norm": 2.3256118886994943,
"learning_rate": 1.1160050532721527e-06,
"loss": 0.0733,
"step": 744
},
{
"epoch": 1.568421052631579,
"grad_norm": 2.798649624701208,
"learning_rate": 1.1056136061894386e-06,
"loss": 0.0833,
"step": 745
},
{
"epoch": 1.5705263157894738,
"grad_norm": 3.3074889617939847,
"learning_rate": 1.095264747492391e-06,
"loss": 0.0854,
"step": 746
},
{
"epoch": 1.5726315789473684,
"grad_norm": 3.246747529237332,
"learning_rate": 1.0849585903544707e-06,
"loss": 0.09,
"step": 747
},
{
"epoch": 1.5747368421052632,
"grad_norm": 2.268435499902229,
"learning_rate": 1.0746952474821615e-06,
"loss": 0.0635,
"step": 748
},
{
"epoch": 1.5768421052631578,
"grad_norm": 3.048564605805993,
"learning_rate": 1.0644748311137377e-06,
"loss": 0.0736,
"step": 749
},
{
"epoch": 1.5789473684210527,
"grad_norm": 2.729660101843144,
"learning_rate": 1.0542974530180327e-06,
"loss": 0.0815,
"step": 750
},
{
"epoch": 1.5810526315789475,
"grad_norm": 3.846053704689222,
"learning_rate": 1.0441632244932238e-06,
"loss": 0.0855,
"step": 751
},
{
"epoch": 1.583157894736842,
"grad_norm": 3.242518428961916,
"learning_rate": 1.0340722563656109e-06,
"loss": 0.0617,
"step": 752
},
{
"epoch": 1.5852631578947367,
"grad_norm": 3.288508939138202,
"learning_rate": 1.0240246589884046e-06,
"loss": 0.0627,
"step": 753
},
{
"epoch": 1.5873684210526315,
"grad_norm": 2.6583829662575753,
"learning_rate": 1.0140205422405213e-06,
"loss": 0.0668,
"step": 754
},
{
"epoch": 1.5894736842105264,
"grad_norm": 3.1324991437039547,
"learning_rate": 1.0040600155253766e-06,
"loss": 0.0875,
"step": 755
},
{
"epoch": 1.5915789473684212,
"grad_norm": 2.1752967431606436,
"learning_rate": 9.941431877696955e-07,
"loss": 0.0625,
"step": 756
},
{
"epoch": 1.5936842105263158,
"grad_norm": 3.153051450237148,
"learning_rate": 9.842701674223187e-07,
"loss": 0.0916,
"step": 757
},
{
"epoch": 1.5957894736842104,
"grad_norm": 3.0851678174272656,
"learning_rate": 9.744410624530148e-07,
"loss": 0.0682,
"step": 758
},
{
"epoch": 1.5978947368421053,
"grad_norm": 3.6017843313201627,
"learning_rate": 9.646559803512995e-07,
"loss": 0.0718,
"step": 759
},
{
"epoch": 1.6,
"grad_norm": 2.6419335189350717,
"learning_rate": 9.549150281252633e-07,
"loss": 0.0771,
"step": 760
},
{
"epoch": 1.6021052631578947,
"grad_norm": 2.672014578151709,
"learning_rate": 9.452183123003999e-07,
"loss": 0.0664,
"step": 761
},
{
"epoch": 1.6042105263157893,
"grad_norm": 2.7604515088486776,
"learning_rate": 9.355659389184396e-07,
"loss": 0.0802,
"step": 762
},
{
"epoch": 1.6063157894736841,
"grad_norm": 2.4989218985147827,
"learning_rate": 9.259580135361929e-07,
"loss": 0.0712,
"step": 763
},
{
"epoch": 1.608421052631579,
"grad_norm": 2.761122397313585,
"learning_rate": 9.163946412243896e-07,
"loss": 0.0776,
"step": 764
},
{
"epoch": 1.6105263157894738,
"grad_norm": 2.502582479061757,
"learning_rate": 9.068759265665384e-07,
"loss": 0.0579,
"step": 765
},
{
"epoch": 1.6126315789473684,
"grad_norm": 2.564496551486698,
"learning_rate": 8.974019736577777e-07,
"loss": 0.067,
"step": 766
},
{
"epoch": 1.614736842105263,
"grad_norm": 2.6370265688116774,
"learning_rate": 8.879728861037385e-07,
"loss": 0.0851,
"step": 767
},
{
"epoch": 1.6168421052631579,
"grad_norm": 2.711692043610178,
"learning_rate": 8.785887670194137e-07,
"loss": 0.0503,
"step": 768
},
{
"epoch": 1.6189473684210527,
"grad_norm": 2.9973543080867993,
"learning_rate": 8.692497190280225e-07,
"loss": 0.084,
"step": 769
},
{
"epoch": 1.6210526315789475,
"grad_norm": 2.797781874617319,
"learning_rate": 8.599558442598998e-07,
"loss": 0.0772,
"step": 770
},
{
"epoch": 1.6231578947368421,
"grad_norm": 3.008160474882518,
"learning_rate": 8.507072443513703e-07,
"loss": 0.0718,
"step": 771
},
{
"epoch": 1.6252631578947367,
"grad_norm": 2.569308456782316,
"learning_rate": 8.415040204436426e-07,
"loss": 0.0566,
"step": 772
},
{
"epoch": 1.6273684210526316,
"grad_norm": 2.605706053568482,
"learning_rate": 8.323462731816962e-07,
"loss": 0.0572,
"step": 773
},
{
"epoch": 1.6294736842105264,
"grad_norm": 2.326197374578758,
"learning_rate": 8.232341027131885e-07,
"loss": 0.0627,
"step": 774
},
{
"epoch": 1.631578947368421,
"grad_norm": 2.5642000583273283,
"learning_rate": 8.141676086873574e-07,
"loss": 0.0751,
"step": 775
},
{
"epoch": 1.6336842105263156,
"grad_norm": 2.191527946956071,
"learning_rate": 8.051468902539272e-07,
"loss": 0.0383,
"step": 776
},
{
"epoch": 1.6357894736842105,
"grad_norm": 2.4051907305716265,
"learning_rate": 7.961720460620321e-07,
"loss": 0.0627,
"step": 777
},
{
"epoch": 1.6378947368421053,
"grad_norm": 2.5165851730543114,
"learning_rate": 7.872431742591268e-07,
"loss": 0.0579,
"step": 778
},
{
"epoch": 1.6400000000000001,
"grad_norm": 3.389030001143065,
"learning_rate": 7.783603724899258e-07,
"loss": 0.0897,
"step": 779
},
{
"epoch": 1.6421052631578947,
"grad_norm": 3.150859694485845,
"learning_rate": 7.695237378953224e-07,
"loss": 0.0889,
"step": 780
},
{
"epoch": 1.6442105263157893,
"grad_norm": 2.5782746782491577,
"learning_rate": 7.607333671113409e-07,
"loss": 0.0691,
"step": 781
},
{
"epoch": 1.6463157894736842,
"grad_norm": 2.932437547604155,
"learning_rate": 7.519893562680663e-07,
"loss": 0.0747,
"step": 782
},
{
"epoch": 1.648421052631579,
"grad_norm": 2.7289448649430486,
"learning_rate": 7.432918009885997e-07,
"loss": 0.0894,
"step": 783
},
{
"epoch": 1.6505263157894738,
"grad_norm": 2.622139736136532,
"learning_rate": 7.346407963880137e-07,
"loss": 0.0657,
"step": 784
},
{
"epoch": 1.6526315789473685,
"grad_norm": 2.844401593647993,
"learning_rate": 7.260364370723044e-07,
"loss": 0.0678,
"step": 785
},
{
"epoch": 1.654736842105263,
"grad_norm": 2.46801387849074,
"learning_rate": 7.174788171373731e-07,
"loss": 0.0698,
"step": 786
},
{
"epoch": 1.656842105263158,
"grad_norm": 3.2459197537543103,
"learning_rate": 7.089680301679752e-07,
"loss": 0.0763,
"step": 787
},
{
"epoch": 1.6589473684210527,
"grad_norm": 2.1306283714857694,
"learning_rate": 7.005041692367154e-07,
"loss": 0.0516,
"step": 788
},
{
"epoch": 1.6610526315789473,
"grad_norm": 2.6775897413319028,
"learning_rate": 6.92087326903022e-07,
"loss": 0.0619,
"step": 789
},
{
"epoch": 1.663157894736842,
"grad_norm": 2.524367526847338,
"learning_rate": 6.837175952121305e-07,
"loss": 0.0688,
"step": 790
},
{
"epoch": 1.6652631578947368,
"grad_norm": 2.708587596728961,
"learning_rate": 6.753950656940905e-07,
"loss": 0.0703,
"step": 791
},
{
"epoch": 1.6673684210526316,
"grad_norm": 2.897104706239707,
"learning_rate": 6.671198293627479e-07,
"loss": 0.0621,
"step": 792
},
{
"epoch": 1.6694736842105264,
"grad_norm": 3.359155853905581,
"learning_rate": 6.58891976714764e-07,
"loss": 0.0843,
"step": 793
},
{
"epoch": 1.671578947368421,
"grad_norm": 3.1572683992293564,
"learning_rate": 6.507115977286144e-07,
"loss": 0.0631,
"step": 794
},
{
"epoch": 1.6736842105263157,
"grad_norm": 2.5427050293849613,
"learning_rate": 6.425787818636131e-07,
"loss": 0.0789,
"step": 795
},
{
"epoch": 1.6757894736842105,
"grad_norm": 3.066625121124378,
"learning_rate": 6.34493618058935e-07,
"loss": 0.0686,
"step": 796
},
{
"epoch": 1.6778947368421053,
"grad_norm": 2.822755036635395,
"learning_rate": 6.264561947326331e-07,
"loss": 0.0684,
"step": 797
},
{
"epoch": 1.6800000000000002,
"grad_norm": 2.8703507999231035,
"learning_rate": 6.184665997806832e-07,
"loss": 0.0747,
"step": 798
},
{
"epoch": 1.6821052631578948,
"grad_norm": 3.333171116783649,
"learning_rate": 6.105249205760128e-07,
"loss": 0.089,
"step": 799
},
{
"epoch": 1.6842105263157894,
"grad_norm": 2.2878271103214316,
"learning_rate": 6.026312439675553e-07,
"loss": 0.0622,
"step": 800
},
{
"epoch": 1.6842105263157894,
"eval_loss": 0.1972101926803589,
"eval_runtime": 0.9281,
"eval_samples_per_second": 42.023,
"eval_steps_per_second": 10.775,
"step": 800
},
{
"epoch": 1.6863157894736842,
"grad_norm": 2.812568484004764,
"learning_rate": 5.947856562792926e-07,
"loss": 0.0654,
"step": 801
},
{
"epoch": 1.688421052631579,
"grad_norm": 2.7128006161210108,
"learning_rate": 5.869882433093154e-07,
"loss": 0.0864,
"step": 802
},
{
"epoch": 1.6905263157894737,
"grad_norm": 3.0041687010640983,
"learning_rate": 5.79239090328883e-07,
"loss": 0.0747,
"step": 803
},
{
"epoch": 1.6926315789473683,
"grad_norm": 2.9138447859502095,
"learning_rate": 5.715382820814885e-07,
"loss": 0.0802,
"step": 804
},
{
"epoch": 1.694736842105263,
"grad_norm": 2.5166922519171506,
"learning_rate": 5.63885902781941e-07,
"loss": 0.0675,
"step": 805
},
{
"epoch": 1.696842105263158,
"grad_norm": 2.8497481916116896,
"learning_rate": 5.562820361154315e-07,
"loss": 0.0696,
"step": 806
},
{
"epoch": 1.6989473684210528,
"grad_norm": 3.4754839940102,
"learning_rate": 5.487267652366291e-07,
"loss": 0.0852,
"step": 807
},
{
"epoch": 1.7010526315789474,
"grad_norm": 3.2547037998817596,
"learning_rate": 5.412201727687644e-07,
"loss": 0.0862,
"step": 808
},
{
"epoch": 1.703157894736842,
"grad_norm": 2.414633712271825,
"learning_rate": 5.337623408027293e-07,
"loss": 0.061,
"step": 809
},
{
"epoch": 1.7052631578947368,
"grad_norm": 3.401374616115059,
"learning_rate": 5.263533508961827e-07,
"loss": 0.0952,
"step": 810
},
{
"epoch": 1.7073684210526316,
"grad_norm": 2.73608904289166,
"learning_rate": 5.189932840726486e-07,
"loss": 0.0679,
"step": 811
},
{
"epoch": 1.7094736842105265,
"grad_norm": 2.808779656242632,
"learning_rate": 5.116822208206396e-07,
"loss": 0.0636,
"step": 812
},
{
"epoch": 1.711578947368421,
"grad_norm": 2.8278948519331046,
"learning_rate": 5.044202410927707e-07,
"loss": 0.0757,
"step": 813
},
{
"epoch": 1.7136842105263157,
"grad_norm": 2.5103777723289054,
"learning_rate": 4.972074243048896e-07,
"loss": 0.0603,
"step": 814
},
{
"epoch": 1.7157894736842105,
"grad_norm": 3.1933744646672135,
"learning_rate": 4.900438493352056e-07,
"loss": 0.0682,
"step": 815
},
{
"epoch": 1.7178947368421054,
"grad_norm": 3.1503659721543706,
"learning_rate": 4.829295945234258e-07,
"loss": 0.072,
"step": 816
},
{
"epoch": 1.72,
"grad_norm": 2.747297586956878,
"learning_rate": 4.758647376699033e-07,
"loss": 0.0528,
"step": 817
},
{
"epoch": 1.7221052631578946,
"grad_norm": 2.2158924674953777,
"learning_rate": 4.6884935603477733e-07,
"loss": 0.0565,
"step": 818
},
{
"epoch": 1.7242105263157894,
"grad_norm": 2.8793571493416823,
"learning_rate": 4.6188352633713964e-07,
"loss": 0.072,
"step": 819
},
{
"epoch": 1.7263157894736842,
"grad_norm": 3.1524529277439055,
"learning_rate": 4.549673247541875e-07,
"loss": 0.0759,
"step": 820
},
{
"epoch": 1.728421052631579,
"grad_norm": 2.5386910299707854,
"learning_rate": 4.48100826920394e-07,
"loss": 0.0715,
"step": 821
},
{
"epoch": 1.7305263157894737,
"grad_norm": 3.012214075182323,
"learning_rate": 4.412841079266778e-07,
"loss": 0.0613,
"step": 822
},
{
"epoch": 1.7326315789473683,
"grad_norm": 2.708221561942324,
"learning_rate": 4.345172423195865e-07,
"loss": 0.0977,
"step": 823
},
{
"epoch": 1.7347368421052631,
"grad_norm": 3.115720504229963,
"learning_rate": 4.27800304100478e-07,
"loss": 0.0688,
"step": 824
},
{
"epoch": 1.736842105263158,
"grad_norm": 1.987345247381263,
"learning_rate": 4.211333667247125e-07,
"loss": 0.0569,
"step": 825
},
{
"epoch": 1.7389473684210528,
"grad_norm": 3.3300694952948593,
"learning_rate": 4.1451650310085076e-07,
"loss": 0.0718,
"step": 826
},
{
"epoch": 1.7410526315789474,
"grad_norm": 2.6073178578766365,
"learning_rate": 4.079497855898501e-07,
"loss": 0.0651,
"step": 827
},
{
"epoch": 1.743157894736842,
"grad_norm": 2.520132180808996,
"learning_rate": 4.01433286004283e-07,
"loss": 0.0746,
"step": 828
},
{
"epoch": 1.7452631578947368,
"grad_norm": 4.216004495536436,
"learning_rate": 3.949670756075447e-07,
"loss": 0.0986,
"step": 829
},
{
"epoch": 1.7473684210526317,
"grad_norm": 3.0641794864142313,
"learning_rate": 3.885512251130763e-07,
"loss": 0.0694,
"step": 830
},
{
"epoch": 1.7494736842105263,
"grad_norm": 2.3436697576482493,
"learning_rate": 3.8218580468359136e-07,
"loss": 0.0677,
"step": 831
},
{
"epoch": 1.751578947368421,
"grad_norm": 2.5525226853131406,
"learning_rate": 3.7587088393030604e-07,
"loss": 0.07,
"step": 832
},
{
"epoch": 1.7536842105263157,
"grad_norm": 2.557138885671911,
"learning_rate": 3.6960653191218333e-07,
"loss": 0.0715,
"step": 833
},
{
"epoch": 1.7557894736842106,
"grad_norm": 2.7336947284134907,
"learning_rate": 3.6339281713517304e-07,
"loss": 0.0766,
"step": 834
},
{
"epoch": 1.7578947368421054,
"grad_norm": 2.7665277428264687,
"learning_rate": 3.572298075514652e-07,
"loss": 0.0892,
"step": 835
},
{
"epoch": 1.76,
"grad_norm": 3.7811259313548935,
"learning_rate": 3.511175705587433e-07,
"loss": 0.0848,
"step": 836
},
{
"epoch": 1.7621052631578946,
"grad_norm": 3.144574327722061,
"learning_rate": 3.450561729994534e-07,
"loss": 0.0738,
"step": 837
},
{
"epoch": 1.7642105263157895,
"grad_norm": 2.857062951752684,
"learning_rate": 3.390456811600673e-07,
"loss": 0.0733,
"step": 838
},
{
"epoch": 1.7663157894736843,
"grad_norm": 3.655857210909399,
"learning_rate": 3.3308616077036113e-07,
"loss": 0.1221,
"step": 839
},
{
"epoch": 1.768421052631579,
"grad_norm": 2.1902799347040944,
"learning_rate": 3.271776770026963e-07,
"loss": 0.0592,
"step": 840
},
{
"epoch": 1.7705263157894737,
"grad_norm": 3.4061442220418305,
"learning_rate": 3.213202944713023e-07,
"loss": 0.0959,
"step": 841
},
{
"epoch": 1.7726315789473683,
"grad_norm": 2.1231955359369286,
"learning_rate": 3.1551407723157734e-07,
"loss": 0.065,
"step": 842
},
{
"epoch": 1.7747368421052632,
"grad_norm": 2.8471057173295273,
"learning_rate": 3.0975908877938277e-07,
"loss": 0.0744,
"step": 843
},
{
"epoch": 1.776842105263158,
"grad_norm": 3.2337983555019605,
"learning_rate": 3.040553920503503e-07,
"loss": 0.0905,
"step": 844
},
{
"epoch": 1.7789473684210526,
"grad_norm": 2.542206604869396,
"learning_rate": 2.984030494191942e-07,
"loss": 0.0689,
"step": 845
},
{
"epoch": 1.7810526315789472,
"grad_norm": 2.980410819168702,
"learning_rate": 2.928021226990263e-07,
"loss": 0.0698,
"step": 846
},
{
"epoch": 1.783157894736842,
"grad_norm": 2.117615427082996,
"learning_rate": 2.8725267314068496e-07,
"loss": 0.0611,
"step": 847
},
{
"epoch": 1.7852631578947369,
"grad_norm": 2.3322615491429475,
"learning_rate": 2.817547614320615e-07,
"loss": 0.0606,
"step": 848
},
{
"epoch": 1.7873684210526317,
"grad_norm": 2.1217132504251626,
"learning_rate": 2.763084476974376e-07,
"loss": 0.0677,
"step": 849
},
{
"epoch": 1.7894736842105263,
"grad_norm": 2.635485392016026,
"learning_rate": 2.7091379149682683e-07,
"loss": 0.0654,
"step": 850
},
{
"epoch": 1.791578947368421,
"grad_norm": 2.9790312851694876,
"learning_rate": 2.655708518253258e-07,
"loss": 0.0677,
"step": 851
},
{
"epoch": 1.7936842105263158,
"grad_norm": 2.3973095715711317,
"learning_rate": 2.602796871124663e-07,
"loss": 0.0504,
"step": 852
},
{
"epoch": 1.7957894736842106,
"grad_norm": 2.7082756582731973,
"learning_rate": 2.5504035522157853e-07,
"loss": 0.0647,
"step": 853
},
{
"epoch": 1.7978947368421052,
"grad_norm": 2.9121449991346062,
"learning_rate": 2.4985291344915675e-07,
"loss": 0.0903,
"step": 854
},
{
"epoch": 1.8,
"grad_norm": 2.9867534692660245,
"learning_rate": 2.447174185242324e-07,
"loss": 0.074,
"step": 855
},
{
"epoch": 1.8021052631578947,
"grad_norm": 2.8737059550127455,
"learning_rate": 2.3963392660775576e-07,
"loss": 0.084,
"step": 856
},
{
"epoch": 1.8042105263157895,
"grad_norm": 3.2375432000666216,
"learning_rate": 2.3460249329197825e-07,
"loss": 0.0939,
"step": 857
},
{
"epoch": 1.8063157894736843,
"grad_norm": 3.286934040592846,
"learning_rate": 2.296231735998511e-07,
"loss": 0.0756,
"step": 858
},
{
"epoch": 1.808421052631579,
"grad_norm": 3.06301786931153,
"learning_rate": 2.2469602198441575e-07,
"loss": 0.0777,
"step": 859
},
{
"epoch": 1.8105263157894735,
"grad_norm": 3.1645723085351123,
"learning_rate": 2.198210923282118e-07,
"loss": 0.073,
"step": 860
},
{
"epoch": 1.8126315789473684,
"grad_norm": 2.493058988989823,
"learning_rate": 2.149984379426906e-07,
"loss": 0.0764,
"step": 861
},
{
"epoch": 1.8147368421052632,
"grad_norm": 2.881863955432623,
"learning_rate": 2.102281115676258e-07,
"loss": 0.0809,
"step": 862
},
{
"epoch": 1.816842105263158,
"grad_norm": 2.7694248049652974,
"learning_rate": 2.0551016537054492e-07,
"loss": 0.0627,
"step": 863
},
{
"epoch": 1.8189473684210526,
"grad_norm": 2.879825150355328,
"learning_rate": 2.008446509461498e-07,
"loss": 0.0651,
"step": 864
},
{
"epoch": 1.8210526315789473,
"grad_norm": 2.6298059648937135,
"learning_rate": 1.962316193157593e-07,
"loss": 0.077,
"step": 865
},
{
"epoch": 1.823157894736842,
"grad_norm": 2.302690351699349,
"learning_rate": 1.91671120926748e-07,
"loss": 0.055,
"step": 866
},
{
"epoch": 1.825263157894737,
"grad_norm": 2.954098530395544,
"learning_rate": 1.871632056519962e-07,
"loss": 0.0876,
"step": 867
},
{
"epoch": 1.8273684210526315,
"grad_norm": 2.7636736978855594,
"learning_rate": 1.8270792278934302e-07,
"loss": 0.0915,
"step": 868
},
{
"epoch": 1.8294736842105264,
"grad_norm": 2.7545832309484988,
"learning_rate": 1.7830532106104747e-07,
"loss": 0.0667,
"step": 869
},
{
"epoch": 1.831578947368421,
"grad_norm": 2.543153382990055,
"learning_rate": 1.7395544861325718e-07,
"loss": 0.0584,
"step": 870
},
{
"epoch": 1.8336842105263158,
"grad_norm": 2.807508111947715,
"learning_rate": 1.696583530154794e-07,
"loss": 0.0797,
"step": 871
},
{
"epoch": 1.8357894736842106,
"grad_norm": 2.8192335775327795,
"learning_rate": 1.6541408126006464e-07,
"loss": 0.0872,
"step": 872
},
{
"epoch": 1.8378947368421052,
"grad_norm": 3.209795989098829,
"learning_rate": 1.6122267976168783e-07,
"loss": 0.0999,
"step": 873
},
{
"epoch": 1.8399999999999999,
"grad_norm": 3.4761511974704264,
"learning_rate": 1.5708419435684463e-07,
"loss": 0.0922,
"step": 874
},
{
"epoch": 1.8421052631578947,
"grad_norm": 3.1674697236400813,
"learning_rate": 1.5299867030334815e-07,
"loss": 0.0635,
"step": 875
},
{
"epoch": 1.8442105263157895,
"grad_norm": 3.6723508356857484,
"learning_rate": 1.4896615227983468e-07,
"loss": 0.0602,
"step": 876
},
{
"epoch": 1.8463157894736844,
"grad_norm": 2.911727155785081,
"learning_rate": 1.4498668438527597e-07,
"loss": 0.0529,
"step": 877
},
{
"epoch": 1.848421052631579,
"grad_norm": 2.3359023762817825,
"learning_rate": 1.4106031013849498e-07,
"loss": 0.0514,
"step": 878
},
{
"epoch": 1.8505263157894736,
"grad_norm": 2.409615553053479,
"learning_rate": 1.3718707247769137e-07,
"loss": 0.0555,
"step": 879
},
{
"epoch": 1.8526315789473684,
"grad_norm": 2.8117387336774855,
"learning_rate": 1.333670137599713e-07,
"loss": 0.0916,
"step": 880
},
{
"epoch": 1.8547368421052632,
"grad_norm": 2.0682295293481077,
"learning_rate": 1.2960017576088445e-07,
"loss": 0.0582,
"step": 881
},
{
"epoch": 1.8568421052631578,
"grad_norm": 2.551562601977345,
"learning_rate": 1.2588659967396998e-07,
"loss": 0.0732,
"step": 882
},
{
"epoch": 1.8589473684210527,
"grad_norm": 3.2003699780502597,
"learning_rate": 1.222263261102985e-07,
"loss": 0.0711,
"step": 883
},
{
"epoch": 1.8610526315789473,
"grad_norm": 3.486580924102614,
"learning_rate": 1.1861939509803688e-07,
"loss": 0.0748,
"step": 884
},
{
"epoch": 1.8631578947368421,
"grad_norm": 2.2455491108015613,
"learning_rate": 1.1506584608200366e-07,
"loss": 0.0727,
"step": 885
},
{
"epoch": 1.865263157894737,
"grad_norm": 2.4398853127583284,
"learning_rate": 1.1156571792324212e-07,
"loss": 0.0529,
"step": 886
},
{
"epoch": 1.8673684210526316,
"grad_norm": 2.624358820662373,
"learning_rate": 1.0811904889859337e-07,
"loss": 0.0796,
"step": 887
},
{
"epoch": 1.8694736842105262,
"grad_norm": 2.984398704743138,
"learning_rate": 1.0472587670027678e-07,
"loss": 0.0853,
"step": 888
},
{
"epoch": 1.871578947368421,
"grad_norm": 3.4342828404155736,
"learning_rate": 1.0138623843548078e-07,
"loss": 0.0807,
"step": 889
},
{
"epoch": 1.8736842105263158,
"grad_norm": 2.135864268441869,
"learning_rate": 9.810017062595322e-08,
"loss": 0.053,
"step": 890
},
{
"epoch": 1.8757894736842107,
"grad_norm": 2.915254739899194,
"learning_rate": 9.486770920760668e-08,
"loss": 0.0751,
"step": 891
},
{
"epoch": 1.8778947368421053,
"grad_norm": 2.867686868160426,
"learning_rate": 9.16888895301199e-08,
"loss": 0.0695,
"step": 892
},
{
"epoch": 1.88,
"grad_norm": 2.4403837817102008,
"learning_rate": 8.856374635655696e-08,
"loss": 0.0552,
"step": 893
},
{
"epoch": 1.8821052631578947,
"grad_norm": 2.96278001070486,
"learning_rate": 8.549231386298151e-08,
"loss": 0.0745,
"step": 894
},
{
"epoch": 1.8842105263157896,
"grad_norm": 2.2812297751359134,
"learning_rate": 8.247462563808816e-08,
"loss": 0.058,
"step": 895
},
{
"epoch": 1.8863157894736842,
"grad_norm": 2.8257873863079532,
"learning_rate": 7.951071468283166e-08,
"loss": 0.0799,
"step": 896
},
{
"epoch": 1.888421052631579,
"grad_norm": 3.826719667852147,
"learning_rate": 7.660061341006719e-08,
"loss": 0.0713,
"step": 897
},
{
"epoch": 1.8905263157894736,
"grad_norm": 3.7405575996641156,
"learning_rate": 7.374435364419675e-08,
"loss": 0.0948,
"step": 898
},
{
"epoch": 1.8926315789473684,
"grad_norm": 3.4882300697302058,
"learning_rate": 7.094196662081832e-08,
"loss": 0.0778,
"step": 899
},
{
"epoch": 1.8947368421052633,
"grad_norm": 2.7491143569308525,
"learning_rate": 6.819348298638839e-08,
"loss": 0.0549,
"step": 900
},
{
"epoch": 1.8968421052631579,
"grad_norm": 2.255741826050675,
"learning_rate": 6.549893279788278e-08,
"loss": 0.0581,
"step": 901
},
{
"epoch": 1.8989473684210525,
"grad_norm": 2.8944471113447774,
"learning_rate": 6.285834552247127e-08,
"loss": 0.101,
"step": 902
},
{
"epoch": 1.9010526315789473,
"grad_norm": 3.516368002515928,
"learning_rate": 6.027175003719354e-08,
"loss": 0.0843,
"step": 903
},
{
"epoch": 1.9031578947368422,
"grad_norm": 2.8055662943099717,
"learning_rate": 5.773917462864265e-08,
"loss": 0.0718,
"step": 904
},
{
"epoch": 1.905263157894737,
"grad_norm": 2.8528278587154317,
"learning_rate": 5.526064699265754e-08,
"loss": 0.0771,
"step": 905
},
{
"epoch": 1.9073684210526316,
"grad_norm": 2.4442597669717374,
"learning_rate": 5.2836194234019976e-08,
"loss": 0.0578,
"step": 906
},
{
"epoch": 1.9094736842105262,
"grad_norm": 3.328401624060881,
"learning_rate": 5.0465842866156965e-08,
"loss": 0.0834,
"step": 907
},
{
"epoch": 1.911578947368421,
"grad_norm": 3.1255146432363095,
"learning_rate": 4.8149618810850454e-08,
"loss": 0.0775,
"step": 908
},
{
"epoch": 1.9136842105263159,
"grad_norm": 2.6888604157650615,
"learning_rate": 4.588754739795587e-08,
"loss": 0.0697,
"step": 909
},
{
"epoch": 1.9157894736842105,
"grad_norm": 4.068321596359929,
"learning_rate": 4.367965336512403e-08,
"loss": 0.0959,
"step": 910
},
{
"epoch": 1.917894736842105,
"grad_norm": 2.432792122907068,
"learning_rate": 4.1525960857530244e-08,
"loss": 0.0583,
"step": 911
},
{
"epoch": 1.92,
"grad_norm": 2.77665437029982,
"learning_rate": 3.9426493427611177e-08,
"loss": 0.0672,
"step": 912
},
{
"epoch": 1.9221052631578948,
"grad_norm": 3.087230249032823,
"learning_rate": 3.738127403480507e-08,
"loss": 0.0699,
"step": 913
},
{
"epoch": 1.9242105263157896,
"grad_norm": 2.6241279675480054,
"learning_rate": 3.5390325045304704e-08,
"loss": 0.0649,
"step": 914
},
{
"epoch": 1.9263157894736842,
"grad_norm": 2.7148617633550507,
"learning_rate": 3.345366823180929e-08,
"loss": 0.0677,
"step": 915
},
{
"epoch": 1.9284210526315788,
"grad_norm": 2.5474852778498316,
"learning_rate": 3.1571324773286284e-08,
"loss": 0.0639,
"step": 916
},
{
"epoch": 1.9305263157894736,
"grad_norm": 2.1409033097729813,
"learning_rate": 2.9743315254743834e-08,
"loss": 0.041,
"step": 917
},
{
"epoch": 1.9326315789473685,
"grad_norm": 3.275485354161904,
"learning_rate": 2.7969659666999273e-08,
"loss": 0.0881,
"step": 918
},
{
"epoch": 1.9347368421052633,
"grad_norm": 2.6681527860380396,
"learning_rate": 2.625037740646763e-08,
"loss": 0.0772,
"step": 919
},
{
"epoch": 1.936842105263158,
"grad_norm": 2.3495006124196576,
"learning_rate": 2.4585487274942922e-08,
"loss": 0.0557,
"step": 920
},
{
"epoch": 1.9389473684210525,
"grad_norm": 2.5228819355573235,
"learning_rate": 2.2975007479397736e-08,
"loss": 0.0507,
"step": 921
},
{
"epoch": 1.9410526315789474,
"grad_norm": 3.756140115268572,
"learning_rate": 2.1418955631781203e-08,
"loss": 0.1053,
"step": 922
},
{
"epoch": 1.9431578947368422,
"grad_norm": 2.960441030514824,
"learning_rate": 1.9917348748826337e-08,
"loss": 0.0708,
"step": 923
},
{
"epoch": 1.9452631578947368,
"grad_norm": 2.8382521221794863,
"learning_rate": 1.847020325186577e-08,
"loss": 0.0531,
"step": 924
},
{
"epoch": 1.9473684210526314,
"grad_norm": 3.0227161901258914,
"learning_rate": 1.7077534966650767e-08,
"loss": 0.0752,
"step": 925
},
{
"epoch": 1.9494736842105262,
"grad_norm": 2.666050724013321,
"learning_rate": 1.5739359123178587e-08,
"loss": 0.0606,
"step": 926
},
{
"epoch": 1.951578947368421,
"grad_norm": 3.1791911217585054,
"learning_rate": 1.4455690355525964e-08,
"loss": 0.0657,
"step": 927
},
{
"epoch": 1.953684210526316,
"grad_norm": 2.9100920884957406,
"learning_rate": 1.3226542701689215e-08,
"loss": 0.0674,
"step": 928
},
{
"epoch": 1.9557894736842105,
"grad_norm": 2.903965005728698,
"learning_rate": 1.2051929603428824e-08,
"loss": 0.0801,
"step": 929
},
{
"epoch": 1.9578947368421051,
"grad_norm": 3.3785772789177706,
"learning_rate": 1.0931863906127327e-08,
"loss": 0.0813,
"step": 930
},
{
"epoch": 1.96,
"grad_norm": 2.8464427553618235,
"learning_rate": 9.866357858642206e-09,
"loss": 0.0773,
"step": 931
},
{
"epoch": 1.9621052631578948,
"grad_norm": 2.683834178385762,
"learning_rate": 8.855423113177664e-09,
"loss": 0.0878,
"step": 932
},
{
"epoch": 1.9642105263157896,
"grad_norm": 2.293423722958731,
"learning_rate": 7.899070725153612e-09,
"loss": 0.0574,
"step": 933
},
{
"epoch": 1.9663157894736842,
"grad_norm": 2.9921722390971985,
"learning_rate": 6.997311153086883e-09,
"loss": 0.0786,
"step": 934
},
{
"epoch": 1.9684210526315788,
"grad_norm": 3.0905052771040102,
"learning_rate": 6.150154258476315e-09,
"loss": 0.0795,
"step": 935
},
{
"epoch": 1.9705263157894737,
"grad_norm": 2.91652752925557,
"learning_rate": 5.357609305692291e-09,
"loss": 0.0869,
"step": 936
},
{
"epoch": 1.9726315789473685,
"grad_norm": 2.664322796790042,
"learning_rate": 4.619684961881255e-09,
"loss": 0.0603,
"step": 937
},
{
"epoch": 1.9747368421052631,
"grad_norm": 2.5227019769378507,
"learning_rate": 3.936389296864129e-09,
"loss": 0.0691,
"step": 938
},
{
"epoch": 1.9768421052631577,
"grad_norm": 3.070188327105861,
"learning_rate": 3.307729783054159e-09,
"loss": 0.0701,
"step": 939
},
{
"epoch": 1.9789473684210526,
"grad_norm": 2.527356227943579,
"learning_rate": 2.7337132953697555e-09,
"loss": 0.0543,
"step": 940
},
{
"epoch": 1.9810526315789474,
"grad_norm": 3.2854112794071066,
"learning_rate": 2.214346111164556e-09,
"loss": 0.0843,
"step": 941
},
{
"epoch": 1.9831578947368422,
"grad_norm": 2.4190595898058014,
"learning_rate": 1.749633910153592e-09,
"loss": 0.0584,
"step": 942
},
{
"epoch": 1.9852631578947368,
"grad_norm": 2.924977300546306,
"learning_rate": 1.3395817743561135e-09,
"loss": 0.0867,
"step": 943
},
{
"epoch": 1.9873684210526315,
"grad_norm": 2.892067107131255,
"learning_rate": 9.841941880361917e-10,
"loss": 0.0712,
"step": 944
},
{
"epoch": 1.9894736842105263,
"grad_norm": 2.507649725691115,
"learning_rate": 6.834750376549793e-10,
"loss": 0.0695,
"step": 945
},
{
"epoch": 1.9915789473684211,
"grad_norm": 3.029465353157538,
"learning_rate": 4.374276118301879e-10,
"loss": 0.0804,
"step": 946
},
{
"epoch": 1.993684210526316,
"grad_norm": 3.759725367800689,
"learning_rate": 2.4605460129556446e-10,
"loss": 0.0821,
"step": 947
},
{
"epoch": 1.9957894736842106,
"grad_norm": 2.500823320563419,
"learning_rate": 1.0935809887702154e-10,
"loss": 0.0678,
"step": 948
},
{
"epoch": 1.9978947368421052,
"grad_norm": 3.6772672072650834,
"learning_rate": 2.733959946432663e-11,
"loss": 0.0815,
"step": 949
},
{
"epoch": 2.0,
"grad_norm": 2.6395246965587384,
"learning_rate": 0.0,
"loss": 0.0596,
"step": 950
},
{
"epoch": 2.0,
"step": 950,
"total_flos": 1598198317056.0,
"train_loss": 0.15143464744875307,
"train_runtime": 436.4524,
"train_samples_per_second": 17.409,
"train_steps_per_second": 2.177
}
],
"logging_steps": 1,
"max_steps": 950,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1598198317056.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}