{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 200,
"global_step": 950,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002105263157894737,
"grad_norm": 5.903966976524179,
"learning_rate": 9.999972660400536e-06,
"loss": 0.5085,
"step": 1
},
{
"epoch": 0.004210526315789474,
"grad_norm": 4.938036117868723,
"learning_rate": 9.999890641901124e-06,
"loss": 0.4436,
"step": 2
},
{
"epoch": 0.00631578947368421,
"grad_norm": 4.388144856014597,
"learning_rate": 9.999753945398704e-06,
"loss": 0.3735,
"step": 3
},
{
"epoch": 0.008421052631578947,
"grad_norm": 3.677991014602486,
"learning_rate": 9.99956257238817e-06,
"loss": 0.3626,
"step": 4
},
{
"epoch": 0.010526315789473684,
"grad_norm": 4.1261678577077365,
"learning_rate": 9.999316524962347e-06,
"loss": 0.3439,
"step": 5
},
{
"epoch": 0.01263157894736842,
"grad_norm": 4.480913214762388,
"learning_rate": 9.999015805811965e-06,
"loss": 0.3425,
"step": 6
},
{
"epoch": 0.014736842105263158,
"grad_norm": 3.6087816386136216,
"learning_rate": 9.998660418225645e-06,
"loss": 0.3288,
"step": 7
},
{
"epoch": 0.016842105263157894,
"grad_norm": 3.8303469041456903,
"learning_rate": 9.998250366089848e-06,
"loss": 0.3631,
"step": 8
},
{
"epoch": 0.018947368421052633,
"grad_norm": 4.051441024522369,
"learning_rate": 9.997785653888835e-06,
"loss": 0.3475,
"step": 9
},
{
"epoch": 0.021052631578947368,
"grad_norm": 3.2811851205186113,
"learning_rate": 9.99726628670463e-06,
"loss": 0.2727,
"step": 10
},
{
"epoch": 0.023157894736842106,
"grad_norm": 3.076267871254497,
"learning_rate": 9.996692270216946e-06,
"loss": 0.2735,
"step": 11
},
{
"epoch": 0.02526315789473684,
"grad_norm": 4.218304225935037,
"learning_rate": 9.996063610703138e-06,
"loss": 0.3884,
"step": 12
},
{
"epoch": 0.02736842105263158,
"grad_norm": 3.0056525766185564,
"learning_rate": 9.995380315038119e-06,
"loss": 0.2533,
"step": 13
},
{
"epoch": 0.029473684210526315,
"grad_norm": 3.4173235428174,
"learning_rate": 9.994642390694308e-06,
"loss": 0.2708,
"step": 14
},
{
"epoch": 0.031578947368421054,
"grad_norm": 3.804816278379765,
"learning_rate": 9.993849845741525e-06,
"loss": 0.3844,
"step": 15
},
{
"epoch": 0.03368421052631579,
"grad_norm": 3.2085820806655585,
"learning_rate": 9.993002688846913e-06,
"loss": 0.34,
"step": 16
},
{
"epoch": 0.035789473684210524,
"grad_norm": 3.343915010488622,
"learning_rate": 9.992100929274848e-06,
"loss": 0.3061,
"step": 17
},
{
"epoch": 0.037894736842105266,
"grad_norm": 2.953981749061276,
"learning_rate": 9.991144576886824e-06,
"loss": 0.3124,
"step": 18
},
{
"epoch": 0.04,
"grad_norm": 3.065585554970114,
"learning_rate": 9.990133642141359e-06,
"loss": 0.2862,
"step": 19
},
{
"epoch": 0.042105263157894736,
"grad_norm": 3.0851420366070537,
"learning_rate": 9.989068136093873e-06,
"loss": 0.2916,
"step": 20
},
{
"epoch": 0.04421052631578947,
"grad_norm": 3.201737423886665,
"learning_rate": 9.987948070396572e-06,
"loss": 0.288,
"step": 21
},
{
"epoch": 0.04631578947368421,
"grad_norm": 2.859254132653785,
"learning_rate": 9.986773457298311e-06,
"loss": 0.2687,
"step": 22
},
{
"epoch": 0.04842105263157895,
"grad_norm": 3.7432890508375283,
"learning_rate": 9.985544309644474e-06,
"loss": 0.3261,
"step": 23
},
{
"epoch": 0.05052631578947368,
"grad_norm": 2.692301660805473,
"learning_rate": 9.984260640876821e-06,
"loss": 0.2282,
"step": 24
},
{
"epoch": 0.05263157894736842,
"grad_norm": 4.281628785614743,
"learning_rate": 9.98292246503335e-06,
"loss": 0.4261,
"step": 25
},
{
"epoch": 0.05473684210526316,
"grad_norm": 3.8562642975899055,
"learning_rate": 9.981529796748135e-06,
"loss": 0.3469,
"step": 26
},
{
"epoch": 0.056842105263157895,
"grad_norm": 3.4474081351012,
"learning_rate": 9.980082651251175e-06,
"loss": 0.2656,
"step": 27
},
{
"epoch": 0.05894736842105263,
"grad_norm": 3.3764979188026447,
"learning_rate": 9.97858104436822e-06,
"loss": 0.3002,
"step": 28
},
{
"epoch": 0.061052631578947365,
"grad_norm": 4.197028644917946,
"learning_rate": 9.977024992520604e-06,
"loss": 0.3497,
"step": 29
},
{
"epoch": 0.06315789473684211,
"grad_norm": 3.721020976898217,
"learning_rate": 9.975414512725058e-06,
"loss": 0.3571,
"step": 30
},
{
"epoch": 0.06526315789473684,
"grad_norm": 3.555769583954405,
"learning_rate": 9.973749622593534e-06,
"loss": 0.2659,
"step": 31
},
{
"epoch": 0.06736842105263158,
"grad_norm": 3.4068044809170934,
"learning_rate": 9.972030340333e-06,
"loss": 0.2353,
"step": 32
},
{
"epoch": 0.06947368421052631,
"grad_norm": 3.602249926130539,
"learning_rate": 9.970256684745258e-06,
"loss": 0.2838,
"step": 33
},
{
"epoch": 0.07157894736842105,
"grad_norm": 3.1569700401997474,
"learning_rate": 9.968428675226714e-06,
"loss": 0.2556,
"step": 34
},
{
"epoch": 0.07368421052631578,
"grad_norm": 3.2728677083109523,
"learning_rate": 9.966546331768192e-06,
"loss": 0.2962,
"step": 35
},
{
"epoch": 0.07578947368421053,
"grad_norm": 3.067636390716889,
"learning_rate": 9.964609674954696e-06,
"loss": 0.2882,
"step": 36
},
{
"epoch": 0.07789473684210527,
"grad_norm": 2.8096887544728935,
"learning_rate": 9.962618725965196e-06,
"loss": 0.233,
"step": 37
},
{
"epoch": 0.08,
"grad_norm": 3.3000170129014386,
"learning_rate": 9.960573506572391e-06,
"loss": 0.2534,
"step": 38
},
{
"epoch": 0.08210526315789474,
"grad_norm": 3.5389529592924576,
"learning_rate": 9.95847403914247e-06,
"loss": 0.2772,
"step": 39
},
{
"epoch": 0.08421052631578947,
"grad_norm": 3.453344441208766,
"learning_rate": 9.956320346634877e-06,
"loss": 0.272,
"step": 40
},
{
"epoch": 0.0863157894736842,
"grad_norm": 4.307183932380211,
"learning_rate": 9.954112452602045e-06,
"loss": 0.3301,
"step": 41
},
{
"epoch": 0.08842105263157894,
"grad_norm": 2.9656779667391193,
"learning_rate": 9.951850381189152e-06,
"loss": 0.2625,
"step": 42
},
{
"epoch": 0.09052631578947369,
"grad_norm": 3.5490499282339343,
"learning_rate": 9.949534157133844e-06,
"loss": 0.2999,
"step": 43
},
{
"epoch": 0.09263157894736843,
"grad_norm": 3.113609674175823,
"learning_rate": 9.94716380576598e-06,
"loss": 0.2757,
"step": 44
},
{
"epoch": 0.09473684210526316,
"grad_norm": 2.9818002459346573,
"learning_rate": 9.944739353007344e-06,
"loss": 0.2816,
"step": 45
},
{
"epoch": 0.0968421052631579,
"grad_norm": 3.434946218127203,
"learning_rate": 9.942260825371359e-06,
"loss": 0.212,
"step": 46
},
{
"epoch": 0.09894736842105263,
"grad_norm": 4.459352367699622,
"learning_rate": 9.939728249962808e-06,
"loss": 0.4215,
"step": 47
},
{
"epoch": 0.10105263157894737,
"grad_norm": 3.612646296219494,
"learning_rate": 9.937141654477529e-06,
"loss": 0.3106,
"step": 48
},
{
"epoch": 0.1031578947368421,
"grad_norm": 3.243176699962646,
"learning_rate": 9.934501067202117e-06,
"loss": 0.2759,
"step": 49
},
{
"epoch": 0.10526315789473684,
"grad_norm": 3.5224479463075893,
"learning_rate": 9.931806517013612e-06,
"loss": 0.3075,
"step": 50
},
{
"epoch": 0.10736842105263159,
"grad_norm": 3.092501849170216,
"learning_rate": 9.929058033379181e-06,
"loss": 0.3048,
"step": 51
},
{
"epoch": 0.10947368421052632,
"grad_norm": 4.090711703984495,
"learning_rate": 9.926255646355804e-06,
"loss": 0.2582,
"step": 52
},
{
"epoch": 0.11157894736842106,
"grad_norm": 3.5682754090683435,
"learning_rate": 9.923399386589933e-06,
"loss": 0.3061,
"step": 53
},
{
"epoch": 0.11368421052631579,
"grad_norm": 3.348524601466594,
"learning_rate": 9.920489285317169e-06,
"loss": 0.2372,
"step": 54
},
{
"epoch": 0.11578947368421053,
"grad_norm": 3.1821874258149823,
"learning_rate": 9.917525374361913e-06,
"loss": 0.2545,
"step": 55
},
{
"epoch": 0.11789473684210526,
"grad_norm": 3.539674414581924,
"learning_rate": 9.91450768613702e-06,
"loss": 0.3059,
"step": 56
},
{
"epoch": 0.12,
"grad_norm": 3.4913151862965317,
"learning_rate": 9.911436253643445e-06,
"loss": 0.3316,
"step": 57
},
{
"epoch": 0.12210526315789473,
"grad_norm": 4.020069826990793,
"learning_rate": 9.908311110469881e-06,
"loss": 0.3612,
"step": 58
},
{
"epoch": 0.12421052631578948,
"grad_norm": 2.8568567871983936,
"learning_rate": 9.905132290792395e-06,
"loss": 0.2476,
"step": 59
},
{
"epoch": 0.12631578947368421,
"grad_norm": 3.261450325703877,
"learning_rate": 9.901899829374048e-06,
"loss": 0.2954,
"step": 60
},
{
"epoch": 0.12842105263157894,
"grad_norm": 2.7362895761373807,
"learning_rate": 9.89861376156452e-06,
"loss": 0.2721,
"step": 61
},
{
"epoch": 0.13052631578947368,
"grad_norm": 3.11244127366191,
"learning_rate": 9.895274123299724e-06,
"loss": 0.298,
"step": 62
},
{
"epoch": 0.13263157894736843,
"grad_norm": 3.6632993774201568,
"learning_rate": 9.891880951101407e-06,
"loss": 0.2902,
"step": 63
},
{
"epoch": 0.13473684210526315,
"grad_norm": 3.320679243796475,
"learning_rate": 9.888434282076759e-06,
"loss": 0.254,
"step": 64
},
{
"epoch": 0.1368421052631579,
"grad_norm": 3.532964946280944,
"learning_rate": 9.884934153917998e-06,
"loss": 0.3431,
"step": 65
},
{
"epoch": 0.13894736842105262,
"grad_norm": 3.64231126319289,
"learning_rate": 9.881380604901964e-06,
"loss": 0.3407,
"step": 66
},
{
"epoch": 0.14105263157894737,
"grad_norm": 3.1390516154053008,
"learning_rate": 9.877773673889702e-06,
"loss": 0.2751,
"step": 67
},
{
"epoch": 0.1431578947368421,
"grad_norm": 3.4517465264493046,
"learning_rate": 9.874113400326031e-06,
"loss": 0.2667,
"step": 68
},
{
"epoch": 0.14526315789473684,
"grad_norm": 3.2206234405147036,
"learning_rate": 9.870399824239116e-06,
"loss": 0.283,
"step": 69
},
{
"epoch": 0.14736842105263157,
"grad_norm": 2.9995284377277542,
"learning_rate": 9.86663298624003e-06,
"loss": 0.2184,
"step": 70
},
{
"epoch": 0.14947368421052631,
"grad_norm": 2.999026879401691,
"learning_rate": 9.86281292752231e-06,
"loss": 0.2429,
"step": 71
},
{
"epoch": 0.15157894736842106,
"grad_norm": 2.766840628283391,
"learning_rate": 9.858939689861506e-06,
"loss": 0.2379,
"step": 72
},
{
"epoch": 0.15368421052631578,
"grad_norm": 3.8140293080779113,
"learning_rate": 9.855013315614725e-06,
"loss": 0.2823,
"step": 73
},
{
"epoch": 0.15578947368421053,
"grad_norm": 3.3982807201281147,
"learning_rate": 9.851033847720167e-06,
"loss": 0.2992,
"step": 74
},
{
"epoch": 0.15789473684210525,
"grad_norm": 2.931894314971863,
"learning_rate": 9.847001329696653e-06,
"loss": 0.1989,
"step": 75
},
{
"epoch": 0.16,
"grad_norm": 3.5469960575121307,
"learning_rate": 9.842915805643156e-06,
"loss": 0.3598,
"step": 76
},
{
"epoch": 0.16210526315789472,
"grad_norm": 3.213106605680639,
"learning_rate": 9.838777320238312e-06,
"loss": 0.3194,
"step": 77
},
{
"epoch": 0.16421052631578947,
"grad_norm": 4.433153434861342,
"learning_rate": 9.834585918739936e-06,
"loss": 0.3447,
"step": 78
},
{
"epoch": 0.16631578947368422,
"grad_norm": 3.0884694722442303,
"learning_rate": 9.830341646984521e-06,
"loss": 0.26,
"step": 79
},
{
"epoch": 0.16842105263157894,
"grad_norm": 3.3484965798832493,
"learning_rate": 9.826044551386743e-06,
"loss": 0.2775,
"step": 80
},
{
"epoch": 0.1705263157894737,
"grad_norm": 3.4971689879223833,
"learning_rate": 9.821694678938954e-06,
"loss": 0.3016,
"step": 81
},
{
"epoch": 0.1726315789473684,
"grad_norm": 3.282170006280911,
"learning_rate": 9.817292077210658e-06,
"loss": 0.3392,
"step": 82
},
{
"epoch": 0.17473684210526316,
"grad_norm": 3.839326039283396,
"learning_rate": 9.812836794348005e-06,
"loss": 0.3436,
"step": 83
},
{
"epoch": 0.17684210526315788,
"grad_norm": 3.942224451339831,
"learning_rate": 9.808328879073251e-06,
"loss": 0.3596,
"step": 84
},
{
"epoch": 0.17894736842105263,
"grad_norm": 3.3687590727357355,
"learning_rate": 9.803768380684242e-06,
"loss": 0.3408,
"step": 85
},
{
"epoch": 0.18105263157894738,
"grad_norm": 2.8534809428590497,
"learning_rate": 9.79915534905385e-06,
"loss": 0.2532,
"step": 86
},
{
"epoch": 0.1831578947368421,
"grad_norm": 3.020705734497323,
"learning_rate": 9.794489834629457e-06,
"loss": 0.2882,
"step": 87
},
{
"epoch": 0.18526315789473685,
"grad_norm": 2.7674834071058796,
"learning_rate": 9.789771888432375e-06,
"loss": 0.2338,
"step": 88
},
{
"epoch": 0.18736842105263157,
"grad_norm": 3.4431668867918694,
"learning_rate": 9.785001562057311e-06,
"loss": 0.2596,
"step": 89
},
{
"epoch": 0.18947368421052632,
"grad_norm": 3.4078628366971433,
"learning_rate": 9.780178907671788e-06,
"loss": 0.3006,
"step": 90
},
{
"epoch": 0.19157894736842104,
"grad_norm": 3.3261364660865707,
"learning_rate": 9.775303978015585e-06,
"loss": 0.2951,
"step": 91
},
{
"epoch": 0.1936842105263158,
"grad_norm": 3.4790214002153803,
"learning_rate": 9.77037682640015e-06,
"loss": 0.3475,
"step": 92
},
{
"epoch": 0.1957894736842105,
"grad_norm": 3.6562472061364484,
"learning_rate": 9.765397506708023e-06,
"loss": 0.3599,
"step": 93
},
{
"epoch": 0.19789473684210526,
"grad_norm": 3.3045836139909994,
"learning_rate": 9.760366073392246e-06,
"loss": 0.2597,
"step": 94
},
{
"epoch": 0.2,
"grad_norm": 3.3815788433479175,
"learning_rate": 9.755282581475769e-06,
"loss": 0.3071,
"step": 95
},
{
"epoch": 0.20210526315789473,
"grad_norm": 3.045252776887845,
"learning_rate": 9.750147086550843e-06,
"loss": 0.2225,
"step": 96
},
{
"epoch": 0.20421052631578948,
"grad_norm": 3.373471179536508,
"learning_rate": 9.744959644778422e-06,
"loss": 0.3346,
"step": 97
},
{
"epoch": 0.2063157894736842,
"grad_norm": 3.9726149132566326,
"learning_rate": 9.739720312887536e-06,
"loss": 0.3535,
"step": 98
},
{
"epoch": 0.20842105263157895,
"grad_norm": 3.1493223747136225,
"learning_rate": 9.734429148174676e-06,
"loss": 0.2348,
"step": 99
},
{
"epoch": 0.21052631578947367,
"grad_norm": 3.2666485311731583,
"learning_rate": 9.729086208503174e-06,
"loss": 0.3287,
"step": 100
},
{
"epoch": 0.21263157894736842,
"grad_norm": 3.560150543272795,
"learning_rate": 9.723691552302563e-06,
"loss": 0.2926,
"step": 101
},
{
"epoch": 0.21473684210526317,
"grad_norm": 3.7358995280744938,
"learning_rate": 9.718245238567939e-06,
"loss": 0.3386,
"step": 102
},
{
"epoch": 0.2168421052631579,
"grad_norm": 2.915691266754973,
"learning_rate": 9.712747326859316e-06,
"loss": 0.2469,
"step": 103
},
{
"epoch": 0.21894736842105264,
"grad_norm": 3.765457080419128,
"learning_rate": 9.707197877300974e-06,
"loss": 0.351,
"step": 104
},
{
"epoch": 0.22105263157894736,
"grad_norm": 3.0668748196886395,
"learning_rate": 9.701596950580807e-06,
"loss": 0.2802,
"step": 105
},
{
"epoch": 0.2231578947368421,
"grad_norm": 4.1412641761679465,
"learning_rate": 9.69594460794965e-06,
"loss": 0.2647,
"step": 106
},
{
"epoch": 0.22526315789473683,
"grad_norm": 2.8209337016614167,
"learning_rate": 9.690240911220618e-06,
"loss": 0.2186,
"step": 107
},
{
"epoch": 0.22736842105263158,
"grad_norm": 2.8786177165081424,
"learning_rate": 9.684485922768422e-06,
"loss": 0.231,
"step": 108
},
{
"epoch": 0.2294736842105263,
"grad_norm": 3.324372122776949,
"learning_rate": 9.678679705528699e-06,
"loss": 0.2543,
"step": 109
},
{
"epoch": 0.23157894736842105,
"grad_norm": 3.925045356831471,
"learning_rate": 9.672822322997305e-06,
"loss": 0.281,
"step": 110
},
{
"epoch": 0.2336842105263158,
"grad_norm": 3.168490279807808,
"learning_rate": 9.666913839229639e-06,
"loss": 0.298,
"step": 111
},
{
"epoch": 0.23578947368421052,
"grad_norm": 3.2577818549862867,
"learning_rate": 9.660954318839934e-06,
"loss": 0.2663,
"step": 112
},
{
"epoch": 0.23789473684210527,
"grad_norm": 3.8656752201735984,
"learning_rate": 9.654943827000548e-06,
"loss": 0.2712,
"step": 113
},
{
"epoch": 0.24,
"grad_norm": 3.5154758832724218,
"learning_rate": 9.648882429441258e-06,
"loss": 0.2731,
"step": 114
},
{
"epoch": 0.24210526315789474,
"grad_norm": 2.8687141447582376,
"learning_rate": 9.642770192448537e-06,
"loss": 0.2452,
"step": 115
},
{
"epoch": 0.24421052631578946,
"grad_norm": 3.6957846757319532,
"learning_rate": 9.636607182864828e-06,
"loss": 0.3313,
"step": 116
},
{
"epoch": 0.2463157894736842,
"grad_norm": 2.9551806554723603,
"learning_rate": 9.630393468087818e-06,
"loss": 0.2452,
"step": 117
},
{
"epoch": 0.24842105263157896,
"grad_norm": 3.2418146201338076,
"learning_rate": 9.624129116069695e-06,
"loss": 0.2902,
"step": 118
},
{
"epoch": 0.2505263157894737,
"grad_norm": 3.42312293812338,
"learning_rate": 9.61781419531641e-06,
"loss": 0.3117,
"step": 119
},
{
"epoch": 0.25263157894736843,
"grad_norm": 3.7071145440083355,
"learning_rate": 9.611448774886925e-06,
"loss": 0.3434,
"step": 120
},
{
"epoch": 0.25473684210526315,
"grad_norm": 3.2140221247594725,
"learning_rate": 9.605032924392457e-06,
"loss": 0.296,
"step": 121
},
{
"epoch": 0.25684210526315787,
"grad_norm": 3.5032254608727325,
"learning_rate": 9.598566713995718e-06,
"loss": 0.2626,
"step": 122
},
{
"epoch": 0.25894736842105265,
"grad_norm": 3.15141293077419,
"learning_rate": 9.592050214410152e-06,
"loss": 0.2757,
"step": 123
},
{
"epoch": 0.26105263157894737,
"grad_norm": 3.6472637355316255,
"learning_rate": 9.585483496899151e-06,
"loss": 0.2756,
"step": 124
},
{
"epoch": 0.2631578947368421,
"grad_norm": 2.951085484163403,
"learning_rate": 9.578866633275289e-06,
"loss": 0.2277,
"step": 125
},
{
"epoch": 0.26526315789473687,
"grad_norm": 3.6608344418079124,
"learning_rate": 9.572199695899522e-06,
"loss": 0.3656,
"step": 126
},
{
"epoch": 0.2673684210526316,
"grad_norm": 2.875867453563808,
"learning_rate": 9.565482757680415e-06,
"loss": 0.2981,
"step": 127
},
{
"epoch": 0.2694736842105263,
"grad_norm": 2.969989429575355,
"learning_rate": 9.558715892073324e-06,
"loss": 0.3036,
"step": 128
},
{
"epoch": 0.27157894736842103,
"grad_norm": 2.8534681274152036,
"learning_rate": 9.551899173079607e-06,
"loss": 0.2927,
"step": 129
},
{
"epoch": 0.2736842105263158,
"grad_norm": 3.082901025759934,
"learning_rate": 9.545032675245814e-06,
"loss": 0.3322,
"step": 130
},
{
"epoch": 0.27578947368421053,
"grad_norm": 3.5290876802965996,
"learning_rate": 9.538116473662862e-06,
"loss": 0.3379,
"step": 131
},
{
"epoch": 0.27789473684210525,
"grad_norm": 3.0428232899935406,
"learning_rate": 9.531150643965224e-06,
"loss": 0.2357,
"step": 132
},
{
"epoch": 0.28,
"grad_norm": 3.5333703853706457,
"learning_rate": 9.524135262330098e-06,
"loss": 0.2428,
"step": 133
},
{
"epoch": 0.28210526315789475,
"grad_norm": 3.0475010473798005,
"learning_rate": 9.517070405476575e-06,
"loss": 0.2935,
"step": 134
},
{
"epoch": 0.28421052631578947,
"grad_norm": 3.240537024042731,
"learning_rate": 9.509956150664796e-06,
"loss": 0.2613,
"step": 135
},
{
"epoch": 0.2863157894736842,
"grad_norm": 2.9128730104491973,
"learning_rate": 9.502792575695112e-06,
"loss": 0.2347,
"step": 136
},
{
"epoch": 0.28842105263157897,
"grad_norm": 3.330795542026535,
"learning_rate": 9.495579758907231e-06,
"loss": 0.3263,
"step": 137
},
{
"epoch": 0.2905263157894737,
"grad_norm": 3.2067221175533023,
"learning_rate": 9.48831777917936e-06,
"loss": 0.3092,
"step": 138
},
{
"epoch": 0.2926315789473684,
"grad_norm": 3.0353623237386813,
"learning_rate": 9.481006715927352e-06,
"loss": 0.2901,
"step": 139
},
{
"epoch": 0.29473684210526313,
"grad_norm": 3.477639320680317,
"learning_rate": 9.473646649103819e-06,
"loss": 0.2697,
"step": 140
},
{
"epoch": 0.2968421052631579,
"grad_norm": 2.721056544196183,
"learning_rate": 9.466237659197271e-06,
"loss": 0.2241,
"step": 141
},
{
"epoch": 0.29894736842105263,
"grad_norm": 3.163833843700446,
"learning_rate": 9.458779827231237e-06,
"loss": 0.2656,
"step": 142
},
{
"epoch": 0.30105263157894735,
"grad_norm": 3.110748967764754,
"learning_rate": 9.451273234763372e-06,
"loss": 0.2617,
"step": 143
},
{
"epoch": 0.3031578947368421,
"grad_norm": 3.0170106549603504,
"learning_rate": 9.443717963884568e-06,
"loss": 0.2447,
"step": 144
},
{
"epoch": 0.30526315789473685,
"grad_norm": 3.3315550272526844,
"learning_rate": 9.43611409721806e-06,
"loss": 0.3067,
"step": 145
},
{
"epoch": 0.30736842105263157,
"grad_norm": 3.1066342898280905,
"learning_rate": 9.428461717918512e-06,
"loss": 0.3262,
"step": 146
},
{
"epoch": 0.3094736842105263,
"grad_norm": 3.3496341428081315,
"learning_rate": 9.420760909671119e-06,
"loss": 0.2454,
"step": 147
},
{
"epoch": 0.31157894736842107,
"grad_norm": 3.421650153297572,
"learning_rate": 9.413011756690686e-06,
"loss": 0.3029,
"step": 148
},
{
"epoch": 0.3136842105263158,
"grad_norm": 3.044311191542287,
"learning_rate": 9.405214343720708e-06,
"loss": 0.2959,
"step": 149
},
{
"epoch": 0.3157894736842105,
"grad_norm": 2.7783609067575545,
"learning_rate": 9.397368756032445e-06,
"loss": 0.2575,
"step": 150
},
{
"epoch": 0.3178947368421053,
"grad_norm": 3.1794076058884895,
"learning_rate": 9.389475079423988e-06,
"loss": 0.2782,
"step": 151
},
{
"epoch": 0.32,
"grad_norm": 4.565412279405237,
"learning_rate": 9.381533400219319e-06,
"loss": 0.3381,
"step": 152
},
{
"epoch": 0.32210526315789473,
"grad_norm": 3.7246780777461637,
"learning_rate": 9.373543805267367e-06,
"loss": 0.2958,
"step": 153
},
{
"epoch": 0.32421052631578945,
"grad_norm": 3.056241661591908,
"learning_rate": 9.365506381941066e-06,
"loss": 0.281,
"step": 154
},
{
"epoch": 0.3263157894736842,
"grad_norm": 3.366326120576616,
"learning_rate": 9.357421218136387e-06,
"loss": 0.202,
"step": 155
},
{
"epoch": 0.32842105263157895,
"grad_norm": 3.457666911533099,
"learning_rate": 9.349288402271387e-06,
"loss": 0.3005,
"step": 156
},
{
"epoch": 0.33052631578947367,
"grad_norm": 2.3640574343694096,
"learning_rate": 9.341108023285239e-06,
"loss": 0.1791,
"step": 157
},
{
"epoch": 0.33263157894736844,
"grad_norm": 3.5072575679163203,
"learning_rate": 9.332880170637252e-06,
"loss": 0.2887,
"step": 158
},
{
"epoch": 0.33473684210526317,
"grad_norm": 4.034163555212422,
"learning_rate": 9.324604934305911e-06,
"loss": 0.2583,
"step": 159
},
{
"epoch": 0.3368421052631579,
"grad_norm": 3.4137170461677178,
"learning_rate": 9.31628240478787e-06,
"loss": 0.3196,
"step": 160
},
{
"epoch": 0.3389473684210526,
"grad_norm": 3.5366155768451297,
"learning_rate": 9.30791267309698e-06,
"loss": 0.2977,
"step": 161
},
{
"epoch": 0.3410526315789474,
"grad_norm": 3.2286495303508684,
"learning_rate": 9.299495830763285e-06,
"loss": 0.3006,
"step": 162
},
{
"epoch": 0.3431578947368421,
"grad_norm": 3.515490852518791,
"learning_rate": 9.291031969832026e-06,
"loss": 0.276,
"step": 163
},
{
"epoch": 0.3452631578947368,
"grad_norm": 3.8556810694559855,
"learning_rate": 9.28252118286263e-06,
"loss": 0.2854,
"step": 164
},
{
"epoch": 0.3473684210526316,
"grad_norm": 3.275312388330099,
"learning_rate": 9.273963562927695e-06,
"loss": 0.2563,
"step": 165
},
{
"epoch": 0.3494736842105263,
"grad_norm": 3.491765481220755,
"learning_rate": 9.265359203611988e-06,
"loss": 0.3133,
"step": 166
},
{
"epoch": 0.35157894736842105,
"grad_norm": 3.473334617513911,
"learning_rate": 9.256708199011402e-06,
"loss": 0.2385,
"step": 167
},
{
"epoch": 0.35368421052631577,
"grad_norm": 3.397043567474343,
"learning_rate": 9.248010643731936e-06,
"loss": 0.247,
"step": 168
},
{
"epoch": 0.35578947368421054,
"grad_norm": 3.649860343855273,
"learning_rate": 9.23926663288866e-06,
"loss": 0.2802,
"step": 169
},
{
"epoch": 0.35789473684210527,
"grad_norm": 3.813391941041034,
"learning_rate": 9.230476262104678e-06,
"loss": 0.3087,
"step": 170
},
{
"epoch": 0.36,
"grad_norm": 3.6905074018171344,
"learning_rate": 9.221639627510076e-06,
"loss": 0.3462,
"step": 171
},
{
"epoch": 0.36210526315789476,
"grad_norm": 3.6369332762547124,
"learning_rate": 9.212756825740874e-06,
"loss": 0.2893,
"step": 172
},
{
"epoch": 0.3642105263157895,
"grad_norm": 3.124067402795342,
"learning_rate": 9.203827953937969e-06,
"loss": 0.2621,
"step": 173
},
{
"epoch": 0.3663157894736842,
"grad_norm": 4.009622033887673,
"learning_rate": 9.194853109746073e-06,
"loss": 0.2958,
"step": 174
},
{
"epoch": 0.3684210526315789,
"grad_norm": 3.277654059786843,
"learning_rate": 9.185832391312644e-06,
"loss": 0.2765,
"step": 175
},
{
"epoch": 0.3705263157894737,
"grad_norm": 2.77349749923877,
"learning_rate": 9.176765897286812e-06,
"loss": 0.2127,
"step": 176
},
{
"epoch": 0.3726315789473684,
"grad_norm": 3.527606374595019,
"learning_rate": 9.167653726818305e-06,
"loss": 0.3067,
"step": 177
},
{
"epoch": 0.37473684210526315,
"grad_norm": 2.6924440680853947,
"learning_rate": 9.15849597955636e-06,
"loss": 0.2259,
"step": 178
},
{
"epoch": 0.37684210526315787,
"grad_norm": 3.886574108001305,
"learning_rate": 9.149292755648631e-06,
"loss": 0.2923,
"step": 179
},
{
"epoch": 0.37894736842105264,
"grad_norm": 2.818379903417439,
"learning_rate": 9.140044155740102e-06,
"loss": 0.2295,
"step": 180
},
{
"epoch": 0.38105263157894737,
"grad_norm": 3.2260018781876623,
"learning_rate": 9.130750280971978e-06,
"loss": 0.3383,
"step": 181
},
{
"epoch": 0.3831578947368421,
"grad_norm": 2.739913862532641,
"learning_rate": 9.121411232980589e-06,
"loss": 0.2404,
"step": 182
},
{
"epoch": 0.38526315789473686,
"grad_norm": 3.199557557458389,
"learning_rate": 9.112027113896262e-06,
"loss": 0.288,
"step": 183
},
{
"epoch": 0.3873684210526316,
"grad_norm": 3.757331027330083,
"learning_rate": 9.102598026342223e-06,
"loss": 0.2045,
"step": 184
},
{
"epoch": 0.3894736842105263,
"grad_norm": 3.799798504081319,
"learning_rate": 9.093124073433464e-06,
"loss": 0.3113,
"step": 185
},
{
"epoch": 0.391578947368421,
"grad_norm": 2.9479882498425405,
"learning_rate": 9.083605358775612e-06,
"loss": 0.2591,
"step": 186
},
{
"epoch": 0.3936842105263158,
"grad_norm": 2.8984352246005636,
"learning_rate": 9.074041986463808e-06,
"loss": 0.2094,
"step": 187
},
{
"epoch": 0.3957894736842105,
"grad_norm": 3.1258831680405943,
"learning_rate": 9.064434061081562e-06,
"loss": 0.2644,
"step": 188
},
{
"epoch": 0.39789473684210525,
"grad_norm": 3.8610806420066046,
"learning_rate": 9.0547816876996e-06,
"loss": 0.3445,
"step": 189
},
{
"epoch": 0.4,
"grad_norm": 3.0731247252689755,
"learning_rate": 9.045084971874738e-06,
"loss": 0.2819,
"step": 190
},
{
"epoch": 0.40210526315789474,
"grad_norm": 2.7506464613941524,
"learning_rate": 9.035344019648701e-06,
"loss": 0.2222,
"step": 191
},
{
"epoch": 0.40421052631578946,
"grad_norm": 3.551940976528996,
"learning_rate": 9.025558937546987e-06,
"loss": 0.4067,
"step": 192
},
{
"epoch": 0.4063157894736842,
"grad_norm": 3.0238265322464923,
"learning_rate": 9.015729832577681e-06,
"loss": 0.2453,
"step": 193
},
{
"epoch": 0.40842105263157896,
"grad_norm": 3.177883029033834,
"learning_rate": 9.005856812230304e-06,
"loss": 0.2448,
"step": 194
},
{
"epoch": 0.4105263157894737,
"grad_norm": 2.9637061343290427,
"learning_rate": 8.995939984474624e-06,
"loss": 0.2293,
"step": 195
},
{
"epoch": 0.4126315789473684,
"grad_norm": 2.946477365180333,
"learning_rate": 8.98597945775948e-06,
"loss": 0.2538,
"step": 196
},
{
"epoch": 0.4147368421052632,
"grad_norm": 3.165367892256443,
"learning_rate": 8.975975341011595e-06,
"loss": 0.3132,
"step": 197
},
{
"epoch": 0.4168421052631579,
"grad_norm": 2.786548810153527,
"learning_rate": 8.96592774363439e-06,
"loss": 0.2135,
"step": 198
},
{
"epoch": 0.4189473684210526,
"grad_norm": 3.285599254850826,
"learning_rate": 8.955836775506776e-06,
"loss": 0.2379,
"step": 199
},
{
"epoch": 0.42105263157894735,
"grad_norm": 4.108210407116029,
"learning_rate": 8.94570254698197e-06,
"loss": 0.2778,
"step": 200
},
{
"epoch": 0.42105263157894735,
"eval_loss": 0.26044589281082153,
"eval_runtime": 0.9326,
"eval_samples_per_second": 41.818,
"eval_steps_per_second": 10.722,
"step": 200
},
{
"epoch": 0.4231578947368421,
"grad_norm": 3.5939015363403843,
"learning_rate": 8.935525168886263e-06,
"loss": 0.2727,
"step": 201
},
{
"epoch": 0.42526315789473684,
"grad_norm": 3.266123999642131,
"learning_rate": 8.92530475251784e-06,
"loss": 0.2421,
"step": 202
},
{
"epoch": 0.42736842105263156,
"grad_norm": 4.2885736406975825,
"learning_rate": 8.91504140964553e-06,
"loss": 0.3279,
"step": 203
},
{
"epoch": 0.42947368421052634,
"grad_norm": 3.6581971350961715,
"learning_rate": 8.90473525250761e-06,
"loss": 0.2979,
"step": 204
},
{
"epoch": 0.43157894736842106,
"grad_norm": 3.9824032374811025,
"learning_rate": 8.894386393810563e-06,
"loss": 0.2764,
"step": 205
},
{
"epoch": 0.4336842105263158,
"grad_norm": 3.1254122691470183,
"learning_rate": 8.883994946727848e-06,
"loss": 0.2089,
"step": 206
},
{
"epoch": 0.4357894736842105,
"grad_norm": 4.247423358470422,
"learning_rate": 8.873561024898668e-06,
"loss": 0.2691,
"step": 207
},
{
"epoch": 0.4378947368421053,
"grad_norm": 3.0648535317590655,
"learning_rate": 8.863084742426719e-06,
"loss": 0.2043,
"step": 208
},
{
"epoch": 0.44,
"grad_norm": 3.068552327060875,
"learning_rate": 8.852566213878947e-06,
"loss": 0.2468,
"step": 209
},
{
"epoch": 0.4421052631578947,
"grad_norm": 3.764319293504002,
"learning_rate": 8.842005554284296e-06,
"loss": 0.3041,
"step": 210
},
{
"epoch": 0.4442105263157895,
"grad_norm": 3.28381437259028,
"learning_rate": 8.831402879132447e-06,
"loss": 0.2951,
"step": 211
},
{
"epoch": 0.4463157894736842,
"grad_norm": 3.0292296611478173,
"learning_rate": 8.820758304372557e-06,
"loss": 0.2497,
"step": 212
},
{
"epoch": 0.44842105263157894,
"grad_norm": 3.3884010965584945,
"learning_rate": 8.810071946411989e-06,
"loss": 0.2622,
"step": 213
},
{
"epoch": 0.45052631578947366,
"grad_norm": 3.393874059981341,
"learning_rate": 8.799343922115045e-06,
"loss": 0.2708,
"step": 214
},
{
"epoch": 0.45263157894736844,
"grad_norm": 2.909948972542914,
"learning_rate": 8.788574348801676e-06,
"loss": 0.2542,
"step": 215
},
{
"epoch": 0.45473684210526316,
"grad_norm": 3.3522048696292694,
"learning_rate": 8.777763344246209e-06,
"loss": 0.2503,
"step": 216
},
{
"epoch": 0.4568421052631579,
"grad_norm": 2.821449551025134,
"learning_rate": 8.766911026676063e-06,
"loss": 0.2493,
"step": 217
},
{
"epoch": 0.4589473684210526,
"grad_norm": 3.5684952127975613,
"learning_rate": 8.756017514770444e-06,
"loss": 0.2407,
"step": 218
},
{
"epoch": 0.4610526315789474,
"grad_norm": 3.7589924204118867,
"learning_rate": 8.745082927659048e-06,
"loss": 0.3244,
"step": 219
},
{
"epoch": 0.4631578947368421,
"grad_norm": 3.408146894460298,
"learning_rate": 8.734107384920771e-06,
"loss": 0.3119,
"step": 220
},
{
"epoch": 0.4652631578947368,
"grad_norm": 2.555651765279721,
"learning_rate": 8.72309100658239e-06,
"loss": 0.2244,
"step": 221
},
{
"epoch": 0.4673684210526316,
"grad_norm": 2.868395817759443,
"learning_rate": 8.71203391311725e-06,
"loss": 0.227,
"step": 222
},
{
"epoch": 0.4694736842105263,
"grad_norm": 3.2734154431001676,
"learning_rate": 8.700936225443958e-06,
"loss": 0.3075,
"step": 223
},
{
"epoch": 0.47157894736842104,
"grad_norm": 3.1578793105562317,
"learning_rate": 8.689798064925049e-06,
"loss": 0.3325,
"step": 224
},
{
"epoch": 0.47368421052631576,
"grad_norm": 2.6029524703601674,
"learning_rate": 8.67861955336566e-06,
"loss": 0.2634,
"step": 225
},
{
"epoch": 0.47578947368421054,
"grad_norm": 2.488560140997595,
"learning_rate": 8.6674008130122e-06,
"loss": 0.2149,
"step": 226
},
{
"epoch": 0.47789473684210526,
"grad_norm": 3.71862701964302,
"learning_rate": 8.65614196655102e-06,
"loss": 0.2992,
"step": 227
},
{
"epoch": 0.48,
"grad_norm": 3.2971708770584387,
"learning_rate": 8.644843137107058e-06,
"loss": 0.2897,
"step": 228
},
{
"epoch": 0.48210526315789476,
"grad_norm": 2.7980397594087467,
"learning_rate": 8.633504448242504e-06,
"loss": 0.2262,
"step": 229
},
{
"epoch": 0.4842105263157895,
"grad_norm": 2.7495976567100606,
"learning_rate": 8.622126023955446e-06,
"loss": 0.1996,
"step": 230
},
{
"epoch": 0.4863157894736842,
"grad_norm": 3.2796491708704,
"learning_rate": 8.610707988678504e-06,
"loss": 0.2389,
"step": 231
},
{
"epoch": 0.4884210526315789,
"grad_norm": 3.4418510848878943,
"learning_rate": 8.599250467277483e-06,
"loss": 0.3318,
"step": 232
},
{
"epoch": 0.4905263157894737,
"grad_norm": 3.3793449517646366,
"learning_rate": 8.587753585050004e-06,
"loss": 0.2316,
"step": 233
},
{
"epoch": 0.4926315789473684,
"grad_norm": 2.9552327613137646,
"learning_rate": 8.576217467724129e-06,
"loss": 0.2581,
"step": 234
},
{
"epoch": 0.49473684210526314,
"grad_norm": 2.976640505829934,
"learning_rate": 8.564642241456986e-06,
"loss": 0.2318,
"step": 235
},
{
"epoch": 0.4968421052631579,
"grad_norm": 3.34417602657529,
"learning_rate": 8.553028032833397e-06,
"loss": 0.1977,
"step": 236
},
{
"epoch": 0.49894736842105264,
"grad_norm": 3.2756869130672746,
"learning_rate": 8.541374968864486e-06,
"loss": 0.2541,
"step": 237
},
{
"epoch": 0.5010526315789474,
"grad_norm": 3.139891699786457,
"learning_rate": 8.529683176986295e-06,
"loss": 0.2076,
"step": 238
},
{
"epoch": 0.5031578947368421,
"grad_norm": 3.4708450417927312,
"learning_rate": 8.517952785058385e-06,
"loss": 0.3085,
"step": 239
},
{
"epoch": 0.5052631578947369,
"grad_norm": 3.172121164209103,
"learning_rate": 8.506183921362443e-06,
"loss": 0.2338,
"step": 240
},
{
"epoch": 0.5073684210526316,
"grad_norm": 3.3114937881451367,
"learning_rate": 8.494376714600878e-06,
"loss": 0.2751,
"step": 241
},
{
"epoch": 0.5094736842105263,
"grad_norm": 3.300701046185496,
"learning_rate": 8.482531293895412e-06,
"loss": 0.257,
"step": 242
},
{
"epoch": 0.511578947368421,
"grad_norm": 3.287351087582164,
"learning_rate": 8.470647788785665e-06,
"loss": 0.2511,
"step": 243
},
{
"epoch": 0.5136842105263157,
"grad_norm": 3.2948211696172995,
"learning_rate": 8.458726329227748e-06,
"loss": 0.2761,
"step": 244
},
{
"epoch": 0.5157894736842106,
"grad_norm": 3.042525996501928,
"learning_rate": 8.446767045592829e-06,
"loss": 0.2234,
"step": 245
},
{
"epoch": 0.5178947368421053,
"grad_norm": 3.437886226784909,
"learning_rate": 8.434770068665723e-06,
"loss": 0.2827,
"step": 246
},
{
"epoch": 0.52,
"grad_norm": 3.007789073559327,
"learning_rate": 8.422735529643445e-06,
"loss": 0.2458,
"step": 247
},
{
"epoch": 0.5221052631578947,
"grad_norm": 3.3888617933137035,
"learning_rate": 8.410663560133784e-06,
"loss": 0.2861,
"step": 248
},
{
"epoch": 0.5242105263157895,
"grad_norm": 3.5687647723690015,
"learning_rate": 8.398554292153866e-06,
"loss": 0.2458,
"step": 249
},
{
"epoch": 0.5263157894736842,
"grad_norm": 3.0507951054357476,
"learning_rate": 8.386407858128707e-06,
"loss": 0.3154,
"step": 250
},
{
"epoch": 0.5284210526315789,
"grad_norm": 2.863867309728824,
"learning_rate": 8.37422439088976e-06,
"loss": 0.2347,
"step": 251
},
{
"epoch": 0.5305263157894737,
"grad_norm": 3.3374440655963156,
"learning_rate": 8.362004023673473e-06,
"loss": 0.2541,
"step": 252
},
{
"epoch": 0.5326315789473685,
"grad_norm": 3.3399986517698754,
"learning_rate": 8.349746890119826e-06,
"loss": 0.268,
"step": 253
},
{
"epoch": 0.5347368421052632,
"grad_norm": 2.8070218152646103,
"learning_rate": 8.337453124270864e-06,
"loss": 0.2391,
"step": 254
},
{
"epoch": 0.5368421052631579,
"grad_norm": 3.0741729288875472,
"learning_rate": 8.325122860569241e-06,
"loss": 0.1825,
"step": 255
},
{
"epoch": 0.5389473684210526,
"grad_norm": 3.3102116063181914,
"learning_rate": 8.31275623385675e-06,
"loss": 0.2482,
"step": 256
},
{
"epoch": 0.5410526315789473,
"grad_norm": 3.403284847164163,
"learning_rate": 8.300353379372834e-06,
"loss": 0.2601,
"step": 257
},
{
"epoch": 0.5431578947368421,
"grad_norm": 3.3396369078731882,
"learning_rate": 8.287914432753123e-06,
"loss": 0.2411,
"step": 258
},
{
"epoch": 0.5452631578947369,
"grad_norm": 4.012947971042055,
"learning_rate": 8.275439530027948e-06,
"loss": 0.3046,
"step": 259
},
{
"epoch": 0.5473684210526316,
"grad_norm": 3.3973537835566465,
"learning_rate": 8.262928807620843e-06,
"loss": 0.2689,
"step": 260
},
{
"epoch": 0.5494736842105263,
"grad_norm": 3.421404804284609,
"learning_rate": 8.250382402347066e-06,
"loss": 0.2591,
"step": 261
},
{
"epoch": 0.5515789473684211,
"grad_norm": 2.8126406756186775,
"learning_rate": 8.237800451412095e-06,
"loss": 0.2222,
"step": 262
},
{
"epoch": 0.5536842105263158,
"grad_norm": 3.3510651542940693,
"learning_rate": 8.225183092410128e-06,
"loss": 0.2697,
"step": 263
},
{
"epoch": 0.5557894736842105,
"grad_norm": 2.986604644452873,
"learning_rate": 8.212530463322584e-06,
"loss": 0.2808,
"step": 264
},
{
"epoch": 0.5578947368421052,
"grad_norm": 3.5548559595995957,
"learning_rate": 8.199842702516584e-06,
"loss": 0.291,
"step": 265
},
{
"epoch": 0.56,
"grad_norm": 3.6456873792089257,
"learning_rate": 8.18711994874345e-06,
"loss": 0.2706,
"step": 266
},
{
"epoch": 0.5621052631578948,
"grad_norm": 3.733960403232091,
"learning_rate": 8.174362341137177e-06,
"loss": 0.266,
"step": 267
},
{
"epoch": 0.5642105263157895,
"grad_norm": 3.4930660301469643,
"learning_rate": 8.161570019212921e-06,
"loss": 0.2699,
"step": 268
},
{
"epoch": 0.5663157894736842,
"grad_norm": 3.0306590004360796,
"learning_rate": 8.148743122865463e-06,
"loss": 0.2661,
"step": 269
},
{
"epoch": 0.5684210526315789,
"grad_norm": 3.773204424271571,
"learning_rate": 8.135881792367686e-06,
"loss": 0.3432,
"step": 270
},
{
"epoch": 0.5705263157894737,
"grad_norm": 3.1394554778302526,
"learning_rate": 8.12298616836904e-06,
"loss": 0.2436,
"step": 271
},
{
"epoch": 0.5726315789473684,
"grad_norm": 2.8431644921557213,
"learning_rate": 8.110056391894005e-06,
"loss": 0.2228,
"step": 272
},
{
"epoch": 0.5747368421052632,
"grad_norm": 3.2898202937032823,
"learning_rate": 8.097092604340543e-06,
"loss": 0.2782,
"step": 273
},
{
"epoch": 0.5768421052631579,
"grad_norm": 3.7772688134474293,
"learning_rate": 8.084094947478556e-06,
"loss": 0.2909,
"step": 274
},
{
"epoch": 0.5789473684210527,
"grad_norm": 3.094367907000018,
"learning_rate": 8.071063563448341e-06,
"loss": 0.2325,
"step": 275
},
{
"epoch": 0.5810526315789474,
"grad_norm": 2.4776499929932534,
"learning_rate": 8.057998594759022e-06,
"loss": 0.1811,
"step": 276
},
{
"epoch": 0.5831578947368421,
"grad_norm": 3.2492815402735284,
"learning_rate": 8.044900184287007e-06,
"loss": 0.2943,
"step": 277
},
{
"epoch": 0.5852631578947368,
"grad_norm": 3.1427693790707383,
"learning_rate": 8.031768475274412e-06,
"loss": 0.2753,
"step": 278
},
{
"epoch": 0.5873684210526315,
"grad_norm": 3.177805841670074,
"learning_rate": 8.018603611327505e-06,
"loss": 0.2545,
"step": 279
},
{
"epoch": 0.5894736842105263,
"grad_norm": 3.776526328121304,
"learning_rate": 8.005405736415127e-06,
"loss": 0.3011,
"step": 280
},
{
"epoch": 0.5915789473684211,
"grad_norm": 3.3699727333670557,
"learning_rate": 7.992174994867124e-06,
"loss": 0.2121,
"step": 281
},
{
"epoch": 0.5936842105263158,
"grad_norm": 2.8041575327507795,
"learning_rate": 7.978911531372764e-06,
"loss": 0.2449,
"step": 282
},
{
"epoch": 0.5957894736842105,
"grad_norm": 3.331718845848788,
"learning_rate": 7.965615490979165e-06,
"loss": 0.277,
"step": 283
},
{
"epoch": 0.5978947368421053,
"grad_norm": 3.729754183258392,
"learning_rate": 7.952287019089686e-06,
"loss": 0.2904,
"step": 284
},
{
"epoch": 0.6,
"grad_norm": 3.5336258021533884,
"learning_rate": 7.938926261462366e-06,
"loss": 0.2918,
"step": 285
},
{
"epoch": 0.6021052631578947,
"grad_norm": 3.1709904925861125,
"learning_rate": 7.925533364208308e-06,
"loss": 0.2525,
"step": 286
},
{
"epoch": 0.6042105263157894,
"grad_norm": 3.866549016558195,
"learning_rate": 7.912108473790092e-06,
"loss": 0.2392,
"step": 287
},
{
"epoch": 0.6063157894736843,
"grad_norm": 3.516428202377018,
"learning_rate": 7.898651737020166e-06,
"loss": 0.3108,
"step": 288
},
{
"epoch": 0.608421052631579,
"grad_norm": 3.828883844428092,
"learning_rate": 7.885163301059251e-06,
"loss": 0.2484,
"step": 289
},
{
"epoch": 0.6105263157894737,
"grad_norm": 4.038586072604508,
"learning_rate": 7.871643313414718e-06,
"loss": 0.3028,
"step": 290
},
{
"epoch": 0.6126315789473684,
"grad_norm": 3.217055390670865,
"learning_rate": 7.858091921938989e-06,
"loss": 0.295,
"step": 291
},
{
"epoch": 0.6147368421052631,
"grad_norm": 3.667294038950859,
"learning_rate": 7.844509274827907e-06,
"loss": 0.278,
"step": 292
},
{
"epoch": 0.6168421052631579,
"grad_norm": 2.8686712875415314,
"learning_rate": 7.830895520619129e-06,
"loss": 0.2609,
"step": 293
},
{
"epoch": 0.6189473684210526,
"grad_norm": 2.9401482502186167,
"learning_rate": 7.817250808190483e-06,
"loss": 0.2616,
"step": 294
},
{
"epoch": 0.6210526315789474,
"grad_norm": 2.492677050804819,
"learning_rate": 7.803575286758365e-06,
"loss": 0.205,
"step": 295
},
{
"epoch": 0.6231578947368421,
"grad_norm": 3.2739091101128235,
"learning_rate": 7.789869105876083e-06,
"loss": 0.2597,
"step": 296
},
{
"epoch": 0.6252631578947369,
"grad_norm": 2.8382019555630333,
"learning_rate": 7.776132415432234e-06,
"loss": 0.2632,
"step": 297
},
{
"epoch": 0.6273684210526316,
"grad_norm": 3.2912951032196847,
"learning_rate": 7.762365365649068e-06,
"loss": 0.2792,
"step": 298
},
{
"epoch": 0.6294736842105263,
"grad_norm": 3.1441667052432853,
"learning_rate": 7.748568107080831e-06,
"loss": 0.3344,
"step": 299
},
{
"epoch": 0.631578947368421,
"grad_norm": 2.687162590514958,
"learning_rate": 7.734740790612137e-06,
"loss": 0.2099,
"step": 300
},
{
"epoch": 0.6336842105263157,
"grad_norm": 3.6579379448846834,
"learning_rate": 7.720883567456299e-06,
"loss": 0.3209,
"step": 301
},
{
"epoch": 0.6357894736842106,
"grad_norm": 3.4874070496305523,
"learning_rate": 7.70699658915369e-06,
"loss": 0.3369,
"step": 302
},
{
"epoch": 0.6378947368421053,
"grad_norm": 3.2296813376833504,
"learning_rate": 7.693080007570084e-06,
"loss": 0.3226,
"step": 303
},
{
"epoch": 0.64,
"grad_norm": 2.951561052105942,
"learning_rate": 7.679133974894984e-06,
"loss": 0.2387,
"step": 304
},
{
"epoch": 0.6421052631578947,
"grad_norm": 3.7736306036911005,
"learning_rate": 7.66515864363997e-06,
"loss": 0.2642,
"step": 305
},
{
"epoch": 0.6442105263157895,
"grad_norm": 3.9684522413257417,
"learning_rate": 7.651154166637025e-06,
"loss": 0.3537,
"step": 306
},
{
"epoch": 0.6463157894736842,
"grad_norm": 2.847959880083427,
"learning_rate": 7.637120697036866e-06,
"loss": 0.2129,
"step": 307
},
{
"epoch": 0.6484210526315789,
"grad_norm": 3.3228471321798874,
"learning_rate": 7.62305838830727e-06,
"loss": 0.2872,
"step": 308
},
{
"epoch": 0.6505263157894737,
"grad_norm": 3.226721545415034,
"learning_rate": 7.608967394231387e-06,
"loss": 0.3071,
"step": 309
},
{
"epoch": 0.6526315789473685,
"grad_norm": 3.0776063116907038,
"learning_rate": 7.594847868906076e-06,
"loss": 0.2046,
"step": 310
},
{
"epoch": 0.6547368421052632,
"grad_norm": 3.035402994986961,
"learning_rate": 7.580699966740201e-06,
"loss": 0.2609,
"step": 311
},
{
"epoch": 0.6568421052631579,
"grad_norm": 4.098144779390545,
"learning_rate": 7.566523842452958e-06,
"loss": 0.3306,
"step": 312
},
{
"epoch": 0.6589473684210526,
"grad_norm": 2.8711609459832084,
"learning_rate": 7.552319651072164e-06,
"loss": 0.2736,
"step": 313
},
{
"epoch": 0.6610526315789473,
"grad_norm": 3.534378429059425,
"learning_rate": 7.5380875479325855e-06,
"loss": 0.234,
"step": 314
},
{
"epoch": 0.6631578947368421,
"grad_norm": 3.1761510007883835,
"learning_rate": 7.52382768867422e-06,
"loss": 0.2159,
"step": 315
},
{
"epoch": 0.6652631578947369,
"grad_norm": 4.187380079118218,
"learning_rate": 7.509540229240601e-06,
"loss": 0.3721,
"step": 316
},
{
"epoch": 0.6673684210526316,
"grad_norm": 3.1492384845261157,
"learning_rate": 7.4952253258771036e-06,
"loss": 0.2671,
"step": 317
},
{
"epoch": 0.6694736842105263,
"grad_norm": 3.3725068433395866,
"learning_rate": 7.480883135129211e-06,
"loss": 0.2781,
"step": 318
},
{
"epoch": 0.671578947368421,
"grad_norm": 2.880810684845612,
"learning_rate": 7.4665138138408255e-06,
"loss": 0.2399,
"step": 319
},
{
"epoch": 0.6736842105263158,
"grad_norm": 3.675738307936544,
"learning_rate": 7.452117519152542e-06,
"loss": 0.2816,
"step": 320
},
{
"epoch": 0.6757894736842105,
"grad_norm": 3.556869112337663,
"learning_rate": 7.437694408499932e-06,
"loss": 0.2141,
"step": 321
},
{
"epoch": 0.6778947368421052,
"grad_norm": 3.0077086336701737,
"learning_rate": 7.4232446396118265e-06,
"loss": 0.258,
"step": 322
},
{
"epoch": 0.68,
"grad_norm": 3.8856684116292284,
"learning_rate": 7.408768370508577e-06,
"loss": 0.2716,
"step": 323
},
{
"epoch": 0.6821052631578948,
"grad_norm": 2.704555689211984,
"learning_rate": 7.394265759500348e-06,
"loss": 0.1959,
"step": 324
},
{
"epoch": 0.6842105263157895,
"grad_norm": 3.6831975327095794,
"learning_rate": 7.379736965185369e-06,
"loss": 0.2326,
"step": 325
},
{
"epoch": 0.6863157894736842,
"grad_norm": 2.892505230264137,
"learning_rate": 7.365182146448205e-06,
"loss": 0.2145,
"step": 326
},
{
"epoch": 0.6884210526315789,
"grad_norm": 2.913603216291662,
"learning_rate": 7.350601462458025e-06,
"loss": 0.2467,
"step": 327
},
{
"epoch": 0.6905263157894737,
"grad_norm": 3.1224065548434314,
"learning_rate": 7.335995072666848e-06,
"loss": 0.2332,
"step": 328
},
{
"epoch": 0.6926315789473684,
"grad_norm": 3.208885963759053,
"learning_rate": 7.3213631368078196e-06,
"loss": 0.2521,
"step": 329
},
{
"epoch": 0.6947368421052632,
"grad_norm": 2.738726314664612,
"learning_rate": 7.30670581489344e-06,
"loss": 0.2332,
"step": 330
},
{
"epoch": 0.6968421052631579,
"grad_norm": 3.697170501068047,
"learning_rate": 7.292023267213836e-06,
"loss": 0.2796,
"step": 331
},
{
"epoch": 0.6989473684210527,
"grad_norm": 3.4594202559170832,
"learning_rate": 7.2773156543349965e-06,
"loss": 0.2845,
"step": 332
},
{
"epoch": 0.7010526315789474,
"grad_norm": 2.819311848002194,
"learning_rate": 7.262583137097019e-06,
"loss": 0.2236,
"step": 333
},
{
"epoch": 0.7031578947368421,
"grad_norm": 3.7286636270733102,
"learning_rate": 7.247825876612353e-06,
"loss": 0.3536,
"step": 334
},
{
"epoch": 0.7052631578947368,
"grad_norm": 2.8883909376141936,
"learning_rate": 7.233044034264034e-06,
"loss": 0.2394,
"step": 335
},
{
"epoch": 0.7073684210526315,
"grad_norm": 2.8405834985207123,
"learning_rate": 7.218237771703921e-06,
"loss": 0.2204,
"step": 336
},
{
"epoch": 0.7094736842105264,
"grad_norm": 4.087265898865519,
"learning_rate": 7.203407250850929e-06,
"loss": 0.2904,
"step": 337
},
{
"epoch": 0.7115789473684211,
"grad_norm": 2.47484433615118,
"learning_rate": 7.18855263388926e-06,
"loss": 0.1984,
"step": 338
},
{
"epoch": 0.7136842105263158,
"grad_norm": 3.301930582984559,
"learning_rate": 7.173674083266624e-06,
"loss": 0.1802,
"step": 339
},
{
"epoch": 0.7157894736842105,
"grad_norm": 2.779259357803785,
"learning_rate": 7.158771761692464e-06,
"loss": 0.2229,
"step": 340
},
{
"epoch": 0.7178947368421053,
"grad_norm": 3.144584289615483,
"learning_rate": 7.143845832136188e-06,
"loss": 0.2245,
"step": 341
},
{
"epoch": 0.72,
"grad_norm": 3.4111423970994186,
"learning_rate": 7.128896457825364e-06,
"loss": 0.2389,
"step": 342
},
{
"epoch": 0.7221052631578947,
"grad_norm": 3.7528655875468884,
"learning_rate": 7.113923802243957e-06,
"loss": 0.2757,
"step": 343
},
{
"epoch": 0.7242105263157895,
"grad_norm": 2.695128263190076,
"learning_rate": 7.098928029130529e-06,
"loss": 0.2143,
"step": 344
},
{
"epoch": 0.7263157894736842,
"grad_norm": 3.6700100764481247,
"learning_rate": 7.083909302476453e-06,
"loss": 0.3314,
"step": 345
},
{
"epoch": 0.728421052631579,
"grad_norm": 3.079325282545559,
"learning_rate": 7.068867786524116e-06,
"loss": 0.2287,
"step": 346
},
{
"epoch": 0.7305263157894737,
"grad_norm": 3.5393540118526383,
"learning_rate": 7.053803645765128e-06,
"loss": 0.2296,
"step": 347
},
{
"epoch": 0.7326315789473684,
"grad_norm": 3.077685464826638,
"learning_rate": 7.038717044938519e-06,
"loss": 0.2433,
"step": 348
},
{
"epoch": 0.7347368421052631,
"grad_norm": 3.2908130199128194,
"learning_rate": 7.023608149028936e-06,
"loss": 0.2678,
"step": 349
},
{
"epoch": 0.7368421052631579,
"grad_norm": 3.466611772144543,
"learning_rate": 7.008477123264849e-06,
"loss": 0.2877,
"step": 350
},
{
"epoch": 0.7389473684210527,
"grad_norm": 2.6649586894139623,
"learning_rate": 6.993324133116726e-06,
"loss": 0.2341,
"step": 351
},
{
"epoch": 0.7410526315789474,
"grad_norm": 2.9965647115149525,
"learning_rate": 6.978149344295242e-06,
"loss": 0.2468,
"step": 352
},
{
"epoch": 0.7431578947368421,
"grad_norm": 3.3207502931977184,
"learning_rate": 6.9629529227494575e-06,
"loss": 0.2328,
"step": 353
},
{
"epoch": 0.7452631578947368,
"grad_norm": 2.9140064494968216,
"learning_rate": 6.9477350346650016e-06,
"loss": 0.2592,
"step": 354
},
{
"epoch": 0.7473684210526316,
"grad_norm": 3.29979844169393,
"learning_rate": 6.932495846462262e-06,
"loss": 0.2766,
"step": 355
},
{
"epoch": 0.7494736842105263,
"grad_norm": 3.1444817547605175,
"learning_rate": 6.9172355247945586e-06,
"loss": 0.2483,
"step": 356
},
{
"epoch": 0.751578947368421,
"grad_norm": 3.0389647906768222,
"learning_rate": 6.901954236546324e-06,
"loss": 0.244,
"step": 357
},
{
"epoch": 0.7536842105263157,
"grad_norm": 3.881978525237369,
"learning_rate": 6.88665214883128e-06,
"loss": 0.3913,
"step": 358
},
{
"epoch": 0.7557894736842106,
"grad_norm": 3.4375207455529604,
"learning_rate": 6.871329428990602e-06,
"loss": 0.2782,
"step": 359
},
{
"epoch": 0.7578947368421053,
"grad_norm": 3.070535976817441,
"learning_rate": 6.855986244591104e-06,
"loss": 0.233,
"step": 360
},
{
"epoch": 0.76,
"grad_norm": 2.7760813127556343,
"learning_rate": 6.840622763423391e-06,
"loss": 0.2264,
"step": 361
},
{
"epoch": 0.7621052631578947,
"grad_norm": 3.2637686834139297,
"learning_rate": 6.825239153500029e-06,
"loss": 0.2083,
"step": 362
},
{
"epoch": 0.7642105263157895,
"grad_norm": 2.9278070407238457,
"learning_rate": 6.809835583053716e-06,
"loss": 0.1856,
"step": 363
},
{
"epoch": 0.7663157894736842,
"grad_norm": 3.2629951336042695,
"learning_rate": 6.794412220535426e-06,
"loss": 0.2623,
"step": 364
},
{
"epoch": 0.7684210526315789,
"grad_norm": 3.6433993183893927,
"learning_rate": 6.778969234612583e-06,
"loss": 0.279,
"step": 365
},
{
"epoch": 0.7705263157894737,
"grad_norm": 2.6196051292482117,
"learning_rate": 6.763506794167207e-06,
"loss": 0.2303,
"step": 366
},
{
"epoch": 0.7726315789473684,
"grad_norm": 3.494474145198592,
"learning_rate": 6.748025068294067e-06,
"loss": 0.2685,
"step": 367
},
{
"epoch": 0.7747368421052632,
"grad_norm": 3.7734222764866043,
"learning_rate": 6.732524226298841e-06,
"loss": 0.2162,
"step": 368
},
{
"epoch": 0.7768421052631579,
"grad_norm": 4.056775355673952,
"learning_rate": 6.717004437696249e-06,
"loss": 0.3167,
"step": 369
},
{
"epoch": 0.7789473684210526,
"grad_norm": 3.1668706807914133,
"learning_rate": 6.701465872208216e-06,
"loss": 0.2249,
"step": 370
},
{
"epoch": 0.7810526315789473,
"grad_norm": 3.522250191145657,
"learning_rate": 6.685908699762003e-06,
"loss": 0.245,
"step": 371
},
{
"epoch": 0.783157894736842,
"grad_norm": 3.2561672462411044,
"learning_rate": 6.670333090488357e-06,
"loss": 0.2627,
"step": 372
},
{
"epoch": 0.7852631578947369,
"grad_norm": 3.3885462690447468,
"learning_rate": 6.654739214719642e-06,
"loss": 0.248,
"step": 373
},
{
"epoch": 0.7873684210526316,
"grad_norm": 2.48853753145028,
"learning_rate": 6.6391272429879886e-06,
"loss": 0.1883,
"step": 374
},
{
"epoch": 0.7894736842105263,
"grad_norm": 2.8009606275161802,
"learning_rate": 6.6234973460234184e-06,
"loss": 0.2771,
"step": 375
},
{
"epoch": 0.791578947368421,
"grad_norm": 2.9535272661326255,
"learning_rate": 6.607849694751978e-06,
"loss": 0.2588,
"step": 376
},
{
"epoch": 0.7936842105263158,
"grad_norm": 2.6990032013425505,
"learning_rate": 6.592184460293878e-06,
"loss": 0.188,
"step": 377
},
{
"epoch": 0.7957894736842105,
"grad_norm": 3.5549552239785998,
"learning_rate": 6.576501813961609e-06,
"loss": 0.2613,
"step": 378
},
{
"epoch": 0.7978947368421052,
"grad_norm": 3.1945303123549795,
"learning_rate": 6.560801927258081e-06,
"loss": 0.2114,
"step": 379
},
{
"epoch": 0.8,
"grad_norm": 3.0825376124104857,
"learning_rate": 6.545084971874738e-06,
"loss": 0.1909,
"step": 380
},
{
"epoch": 0.8021052631578948,
"grad_norm": 3.1479328070094956,
"learning_rate": 6.529351119689687e-06,
"loss": 0.3183,
"step": 381
},
{
"epoch": 0.8042105263157895,
"grad_norm": 2.6365167516964108,
"learning_rate": 6.513600542765816e-06,
"loss": 0.1805,
"step": 382
},
{
"epoch": 0.8063157894736842,
"grad_norm": 2.826135221160075,
"learning_rate": 6.49783341334891e-06,
"loss": 0.2439,
"step": 383
},
{
"epoch": 0.8084210526315789,
"grad_norm": 2.743253754387738,
"learning_rate": 6.4820499038657695e-06,
"loss": 0.168,
"step": 384
},
{
"epoch": 0.8105263157894737,
"grad_norm": 2.809461612204882,
"learning_rate": 6.466250186922325e-06,
"loss": 0.2294,
"step": 385
},
{
"epoch": 0.8126315789473684,
"grad_norm": 3.775735076866835,
"learning_rate": 6.450434435301751e-06,
"loss": 0.3232,
"step": 386
},
{
"epoch": 0.8147368421052632,
"grad_norm": 3.5899045267489527,
"learning_rate": 6.434602821962571e-06,
"loss": 0.2821,
"step": 387
},
{
"epoch": 0.8168421052631579,
"grad_norm": 3.078761874653056,
"learning_rate": 6.418755520036775e-06,
"loss": 0.2414,
"step": 388
},
{
"epoch": 0.8189473684210526,
"grad_norm": 3.2185090152119704,
"learning_rate": 6.402892702827916e-06,
"loss": 0.197,
"step": 389
},
{
"epoch": 0.8210526315789474,
"grad_norm": 2.778844684918169,
"learning_rate": 6.387014543809224e-06,
"loss": 0.2411,
"step": 390
},
{
"epoch": 0.8231578947368421,
"grad_norm": 2.9779712090916566,
"learning_rate": 6.371121216621698e-06,
"loss": 0.2405,
"step": 391
},
{
"epoch": 0.8252631578947368,
"grad_norm": 3.0589344259681575,
"learning_rate": 6.355212895072223e-06,
"loss": 0.2662,
"step": 392
},
{
"epoch": 0.8273684210526315,
"grad_norm": 3.044689736554792,
"learning_rate": 6.339289753131649e-06,
"loss": 0.2052,
"step": 393
},
{
"epoch": 0.8294736842105264,
"grad_norm": 2.909939485878581,
"learning_rate": 6.323351964932909e-06,
"loss": 0.2637,
"step": 394
},
{
"epoch": 0.8315789473684211,
"grad_norm": 3.7255183082841206,
"learning_rate": 6.3073997047691e-06,
"loss": 0.3135,
"step": 395
},
{
"epoch": 0.8336842105263158,
"grad_norm": 3.09956930432557,
"learning_rate": 6.291433147091583e-06,
"loss": 0.2073,
"step": 396
},
{
"epoch": 0.8357894736842105,
"grad_norm": 3.208752314181852,
"learning_rate": 6.275452466508076e-06,
"loss": 0.2369,
"step": 397
},
{
"epoch": 0.8378947368421052,
"grad_norm": 3.332585956508526,
"learning_rate": 6.259457837780741e-06,
"loss": 0.2653,
"step": 398
},
{
"epoch": 0.84,
"grad_norm": 3.712274323531537,
"learning_rate": 6.243449435824276e-06,
"loss": 0.299,
"step": 399
},
{
"epoch": 0.8421052631578947,
"grad_norm": 3.4787692263707193,
"learning_rate": 6.227427435703997e-06,
"loss": 0.2623,
"step": 400
},
{
"epoch": 0.8421052631578947,
"eval_loss": 0.22630169987678528,
"eval_runtime": 0.9241,
"eval_samples_per_second": 42.203,
"eval_steps_per_second": 10.821,
"step": 400
},
{
"epoch": 0.8442105263157895,
"grad_norm": 3.251478421854529,
"learning_rate": 6.211392012633932e-06,
"loss": 0.2174,
"step": 401
},
{
"epoch": 0.8463157894736842,
"grad_norm": 3.5725773836009496,
"learning_rate": 6.1953433419748995e-06,
"loss": 0.3207,
"step": 402
},
{
"epoch": 0.848421052631579,
"grad_norm": 2.678551532523693,
"learning_rate": 6.179281599232592e-06,
"loss": 0.2134,
"step": 403
},
{
"epoch": 0.8505263157894737,
"grad_norm": 2.930489162104918,
"learning_rate": 6.163206960055652e-06,
"loss": 0.234,
"step": 404
},
{
"epoch": 0.8526315789473684,
"grad_norm": 3.3941005641549373,
"learning_rate": 6.147119600233758e-06,
"loss": 0.2361,
"step": 405
},
{
"epoch": 0.8547368421052631,
"grad_norm": 3.4358663312804594,
"learning_rate": 6.131019695695702e-06,
"loss": 0.3,
"step": 406
},
{
"epoch": 0.8568421052631578,
"grad_norm": 3.3210473494075443,
"learning_rate": 6.114907422507459e-06,
"loss": 0.2277,
"step": 407
},
{
"epoch": 0.8589473684210527,
"grad_norm": 3.397617904327594,
"learning_rate": 6.098782956870266e-06,
"loss": 0.2644,
"step": 408
},
{
"epoch": 0.8610526315789474,
"grad_norm": 3.3393246989566787,
"learning_rate": 6.0826464751187e-06,
"loss": 0.2439,
"step": 409
},
{
"epoch": 0.8631578947368421,
"grad_norm": 3.8491410481959227,
"learning_rate": 6.066498153718735e-06,
"loss": 0.2582,
"step": 410
},
{
"epoch": 0.8652631578947368,
"grad_norm": 2.7791598243883193,
"learning_rate": 6.0503381692658305e-06,
"loss": 0.2287,
"step": 411
},
{
"epoch": 0.8673684210526316,
"grad_norm": 3.6422120621092957,
"learning_rate": 6.034166698482984e-06,
"loss": 0.3196,
"step": 412
},
{
"epoch": 0.8694736842105263,
"grad_norm": 3.1317609570704787,
"learning_rate": 6.0179839182188125e-06,
"loss": 0.1853,
"step": 413
},
{
"epoch": 0.871578947368421,
"grad_norm": 2.7141515622263257,
"learning_rate": 6.001790005445607e-06,
"loss": 0.1998,
"step": 414
},
{
"epoch": 0.8736842105263158,
"grad_norm": 3.1598775704943836,
"learning_rate": 5.985585137257401e-06,
"loss": 0.2632,
"step": 415
},
{
"epoch": 0.8757894736842106,
"grad_norm": 3.446573016898305,
"learning_rate": 5.969369490868042e-06,
"loss": 0.2501,
"step": 416
},
{
"epoch": 0.8778947368421053,
"grad_norm": 3.0594511196593444,
"learning_rate": 5.953143243609235e-06,
"loss": 0.2514,
"step": 417
},
{
"epoch": 0.88,
"grad_norm": 3.2145264513666905,
"learning_rate": 5.936906572928625e-06,
"loss": 0.2655,
"step": 418
},
{
"epoch": 0.8821052631578947,
"grad_norm": 3.3756141872799645,
"learning_rate": 5.920659656387836e-06,
"loss": 0.2525,
"step": 419
},
{
"epoch": 0.8842105263157894,
"grad_norm": 2.758856115220705,
"learning_rate": 5.904402671660551e-06,
"loss": 0.2103,
"step": 420
},
{
"epoch": 0.8863157894736842,
"grad_norm": 3.2338348283135,
"learning_rate": 5.8881357965305444e-06,
"loss": 0.2543,
"step": 421
},
{
"epoch": 0.888421052631579,
"grad_norm": 2.898358500545659,
"learning_rate": 5.871859208889759e-06,
"loss": 0.2117,
"step": 422
},
{
"epoch": 0.8905263157894737,
"grad_norm": 3.0693690724990064,
"learning_rate": 5.855573086736351e-06,
"loss": 0.228,
"step": 423
},
{
"epoch": 0.8926315789473684,
"grad_norm": 3.22975196387293,
"learning_rate": 5.839277608172739e-06,
"loss": 0.2577,
"step": 424
},
{
"epoch": 0.8947368421052632,
"grad_norm": 4.392345175921754,
"learning_rate": 5.82297295140367e-06,
"loss": 0.3319,
"step": 425
},
{
"epoch": 0.8968421052631579,
"grad_norm": 3.35468992972689,
"learning_rate": 5.806659294734256e-06,
"loss": 0.2141,
"step": 426
},
{
"epoch": 0.8989473684210526,
"grad_norm": 3.2440283643656262,
"learning_rate": 5.790336816568033e-06,
"loss": 0.2201,
"step": 427
},
{
"epoch": 0.9010526315789473,
"grad_norm": 2.935911272760269,
"learning_rate": 5.774005695405008e-06,
"loss": 0.2017,
"step": 428
},
{
"epoch": 0.9031578947368422,
"grad_norm": 3.1890795129615994,
"learning_rate": 5.7576661098397024e-06,
"loss": 0.2918,
"step": 429
},
{
"epoch": 0.9052631578947369,
"grad_norm": 3.500288661868837,
"learning_rate": 5.74131823855921e-06,
"loss": 0.2761,
"step": 430
},
{
"epoch": 0.9073684210526316,
"grad_norm": 3.414156857820881,
"learning_rate": 5.72496226034123e-06,
"loss": 0.2257,
"step": 431
},
{
"epoch": 0.9094736842105263,
"grad_norm": 3.225368384638819,
"learning_rate": 5.708598354052122e-06,
"loss": 0.2721,
"step": 432
},
{
"epoch": 0.911578947368421,
"grad_norm": 3.5718042208512375,
"learning_rate": 5.692226698644938e-06,
"loss": 0.2247,
"step": 433
},
{
"epoch": 0.9136842105263158,
"grad_norm": 2.5936514914444047,
"learning_rate": 5.675847473157485e-06,
"loss": 0.215,
"step": 434
},
{
"epoch": 0.9157894736842105,
"grad_norm": 3.8268265538198327,
"learning_rate": 5.659460856710346e-06,
"loss": 0.2675,
"step": 435
},
{
"epoch": 0.9178947368421052,
"grad_norm": 2.591405308225686,
"learning_rate": 5.643067028504931e-06,
"loss": 0.1786,
"step": 436
},
{
"epoch": 0.92,
"grad_norm": 3.098466939386034,
"learning_rate": 5.626666167821522e-06,
"loss": 0.205,
"step": 437
},
{
"epoch": 0.9221052631578948,
"grad_norm": 3.1761782292767564,
"learning_rate": 5.610258454017301e-06,
"loss": 0.2534,
"step": 438
},
{
"epoch": 0.9242105263157895,
"grad_norm": 2.9386193495829795,
"learning_rate": 5.593844066524401e-06,
"loss": 0.2702,
"step": 439
},
{
"epoch": 0.9263157894736842,
"grad_norm": 3.2355731219150394,
"learning_rate": 5.577423184847932e-06,
"loss": 0.2768,
"step": 440
},
{
"epoch": 0.9284210526315789,
"grad_norm": 3.253238208386654,
"learning_rate": 5.560995988564023e-06,
"loss": 0.2728,
"step": 441
},
{
"epoch": 0.9305263157894736,
"grad_norm": 4.1574139012640225,
"learning_rate": 5.544562657317863e-06,
"loss": 0.2815,
"step": 442
},
{
"epoch": 0.9326315789473684,
"grad_norm": 3.4530558929645663,
"learning_rate": 5.52812337082173e-06,
"loss": 0.2517,
"step": 443
},
{
"epoch": 0.9347368421052632,
"grad_norm": 3.034360151981781,
"learning_rate": 5.5116783088530255e-06,
"loss": 0.2124,
"step": 444
},
{
"epoch": 0.9368421052631579,
"grad_norm": 3.4312547440487333,
"learning_rate": 5.495227651252315e-06,
"loss": 0.2977,
"step": 445
},
{
"epoch": 0.9389473684210526,
"grad_norm": 2.855876800810897,
"learning_rate": 5.478771577921351e-06,
"loss": 0.2099,
"step": 446
},
{
"epoch": 0.9410526315789474,
"grad_norm": 3.332726274404472,
"learning_rate": 5.4623102688211186e-06,
"loss": 0.2924,
"step": 447
},
{
"epoch": 0.9431578947368421,
"grad_norm": 4.407070803673364,
"learning_rate": 5.445843903969854e-06,
"loss": 0.2892,
"step": 448
},
{
"epoch": 0.9452631578947368,
"grad_norm": 3.0253661541082875,
"learning_rate": 5.429372663441086e-06,
"loss": 0.2156,
"step": 449
},
{
"epoch": 0.9473684210526315,
"grad_norm": 2.8412635352832187,
"learning_rate": 5.412896727361663e-06,
"loss": 0.2284,
"step": 450
},
{
"epoch": 0.9494736842105264,
"grad_norm": 3.5868259684516075,
"learning_rate": 5.396416275909779e-06,
"loss": 0.2759,
"step": 451
},
{
"epoch": 0.9515789473684211,
"grad_norm": 2.996265256795479,
"learning_rate": 5.379931489313016e-06,
"loss": 0.2234,
"step": 452
},
{
"epoch": 0.9536842105263158,
"grad_norm": 2.6413020642647242,
"learning_rate": 5.363442547846356e-06,
"loss": 0.1867,
"step": 453
},
{
"epoch": 0.9557894736842105,
"grad_norm": 2.989468542441537,
"learning_rate": 5.346949631830221e-06,
"loss": 0.2212,
"step": 454
},
{
"epoch": 0.9578947368421052,
"grad_norm": 3.517740282273809,
"learning_rate": 5.3304529216284974e-06,
"loss": 0.322,
"step": 455
},
{
"epoch": 0.96,
"grad_norm": 2.6529272773403485,
"learning_rate": 5.3139525976465675e-06,
"loss": 0.1765,
"step": 456
},
{
"epoch": 0.9621052631578947,
"grad_norm": 3.682108270044108,
"learning_rate": 5.2974488403293285e-06,
"loss": 0.2842,
"step": 457
},
{
"epoch": 0.9642105263157895,
"grad_norm": 3.020601772073395,
"learning_rate": 5.280941830159228e-06,
"loss": 0.2485,
"step": 458
},
{
"epoch": 0.9663157894736842,
"grad_norm": 2.7942998887866453,
"learning_rate": 5.264431747654284e-06,
"loss": 0.191,
"step": 459
},
{
"epoch": 0.968421052631579,
"grad_norm": 3.24583853868691,
"learning_rate": 5.247918773366112e-06,
"loss": 0.2327,
"step": 460
},
{
"epoch": 0.9705263157894737,
"grad_norm": 2.8171630730122534,
"learning_rate": 5.231403087877955e-06,
"loss": 0.2234,
"step": 461
},
{
"epoch": 0.9726315789473684,
"grad_norm": 2.8270485667195544,
"learning_rate": 5.214884871802703e-06,
"loss": 0.2309,
"step": 462
},
{
"epoch": 0.9747368421052631,
"grad_norm": 3.2640422013428587,
"learning_rate": 5.198364305780922e-06,
"loss": 0.2152,
"step": 463
},
{
"epoch": 0.9768421052631578,
"grad_norm": 2.8640531088651375,
"learning_rate": 5.1818415704788725e-06,
"loss": 0.2366,
"step": 464
},
{
"epoch": 0.9789473684210527,
"grad_norm": 2.842698759687521,
"learning_rate": 5.165316846586541e-06,
"loss": 0.2241,
"step": 465
},
{
"epoch": 0.9810526315789474,
"grad_norm": 2.4466969685896465,
"learning_rate": 5.148790314815662e-06,
"loss": 0.1975,
"step": 466
},
{
"epoch": 0.9831578947368421,
"grad_norm": 3.043987452889092,
"learning_rate": 5.132262155897739e-06,
"loss": 0.2044,
"step": 467
},
{
"epoch": 0.9852631578947368,
"grad_norm": 3.93638285635615,
"learning_rate": 5.11573255058207e-06,
"loss": 0.2927,
"step": 468
},
{
"epoch": 0.9873684210526316,
"grad_norm": 3.1860838277038224,
"learning_rate": 5.099201679633769e-06,
"loss": 0.3051,
"step": 469
},
{
"epoch": 0.9894736842105263,
"grad_norm": 2.6486537512870725,
"learning_rate": 5.082669723831793e-06,
"loss": 0.1287,
"step": 470
},
{
"epoch": 0.991578947368421,
"grad_norm": 3.022725909103708,
"learning_rate": 5.066136863966963e-06,
"loss": 0.2423,
"step": 471
},
{
"epoch": 0.9936842105263158,
"grad_norm": 3.2304036278113095,
"learning_rate": 5.049603280839982e-06,
"loss": 0.2274,
"step": 472
},
{
"epoch": 0.9957894736842106,
"grad_norm": 2.5800916231325393,
"learning_rate": 5.033069155259471e-06,
"loss": 0.2118,
"step": 473
},
{
"epoch": 0.9978947368421053,
"grad_norm": 2.3680672194474335,
"learning_rate": 5.016534668039976e-06,
"loss": 0.1445,
"step": 474
},
{
"epoch": 1.0,
"grad_norm": 2.8673882894664535,
"learning_rate": 5e-06,
"loss": 0.2295,
"step": 475
},
{
"epoch": 1.0021052631578948,
"grad_norm": 2.295670347098231,
"learning_rate": 4.983465331960025e-06,
"loss": 0.0976,
"step": 476
},
{
"epoch": 1.0042105263157894,
"grad_norm": 2.2098086978202747,
"learning_rate": 4.96693084474053e-06,
"loss": 0.1486,
"step": 477
},
{
"epoch": 1.0063157894736843,
"grad_norm": 2.2844750468890185,
"learning_rate": 4.950396719160019e-06,
"loss": 0.1171,
"step": 478
},
{
"epoch": 1.0084210526315789,
"grad_norm": 2.6215587379391407,
"learning_rate": 4.93386313603304e-06,
"loss": 0.1056,
"step": 479
},
{
"epoch": 1.0105263157894737,
"grad_norm": 2.7267133338007685,
"learning_rate": 4.917330276168208e-06,
"loss": 0.1602,
"step": 480
},
{
"epoch": 1.0126315789473683,
"grad_norm": 2.3440015356753774,
"learning_rate": 4.900798320366233e-06,
"loss": 0.1126,
"step": 481
},
{
"epoch": 1.0147368421052632,
"grad_norm": 2.052176267404225,
"learning_rate": 4.884267449417932e-06,
"loss": 0.0856,
"step": 482
},
{
"epoch": 1.016842105263158,
"grad_norm": 2.773326388210988,
"learning_rate": 4.867737844102261e-06,
"loss": 0.1343,
"step": 483
},
{
"epoch": 1.0189473684210526,
"grad_norm": 2.3704172545807984,
"learning_rate": 4.851209685184339e-06,
"loss": 0.0955,
"step": 484
},
{
"epoch": 1.0210526315789474,
"grad_norm": 2.693276477530952,
"learning_rate": 4.8346831534134595e-06,
"loss": 0.1107,
"step": 485
},
{
"epoch": 1.023157894736842,
"grad_norm": 2.5177397539501007,
"learning_rate": 4.818158429521129e-06,
"loss": 0.1256,
"step": 486
},
{
"epoch": 1.0252631578947369,
"grad_norm": 2.76258073472734,
"learning_rate": 4.801635694219079e-06,
"loss": 0.0977,
"step": 487
},
{
"epoch": 1.0273684210526315,
"grad_norm": 2.745546482466762,
"learning_rate": 4.785115128197298e-06,
"loss": 0.121,
"step": 488
},
{
"epoch": 1.0294736842105263,
"grad_norm": 2.8689928800544413,
"learning_rate": 4.768596912122046e-06,
"loss": 0.1043,
"step": 489
},
{
"epoch": 1.0315789473684212,
"grad_norm": 3.2713014779449185,
"learning_rate": 4.752081226633888e-06,
"loss": 0.1329,
"step": 490
},
{
"epoch": 1.0336842105263158,
"grad_norm": 2.6940653620000843,
"learning_rate": 4.735568252345718e-06,
"loss": 0.0763,
"step": 491
},
{
"epoch": 1.0357894736842106,
"grad_norm": 2.7748381875190065,
"learning_rate": 4.719058169840773e-06,
"loss": 0.1041,
"step": 492
},
{
"epoch": 1.0378947368421052,
"grad_norm": 2.96405722060865,
"learning_rate": 4.702551159670672e-06,
"loss": 0.1081,
"step": 493
},
{
"epoch": 1.04,
"grad_norm": 3.2702432569124644,
"learning_rate": 4.686047402353433e-06,
"loss": 0.1033,
"step": 494
},
{
"epoch": 1.0421052631578946,
"grad_norm": 3.1378226854518765,
"learning_rate": 4.669547078371503e-06,
"loss": 0.1203,
"step": 495
},
{
"epoch": 1.0442105263157895,
"grad_norm": 3.0459911779334723,
"learning_rate": 4.65305036816978e-06,
"loss": 0.1177,
"step": 496
},
{
"epoch": 1.0463157894736843,
"grad_norm": 3.039349255011064,
"learning_rate": 4.636557452153645e-06,
"loss": 0.0716,
"step": 497
},
{
"epoch": 1.048421052631579,
"grad_norm": 3.1967200174187647,
"learning_rate": 4.620068510686985e-06,
"loss": 0.0955,
"step": 498
},
{
"epoch": 1.0505263157894738,
"grad_norm": 3.08653598155148,
"learning_rate": 4.60358372409022e-06,
"loss": 0.1073,
"step": 499
},
{
"epoch": 1.0526315789473684,
"grad_norm": 3.608351560740164,
"learning_rate": 4.587103272638339e-06,
"loss": 0.1192,
"step": 500
},
{
"epoch": 1.0547368421052632,
"grad_norm": 2.985044729505937,
"learning_rate": 4.570627336558915e-06,
"loss": 0.1077,
"step": 501
},
{
"epoch": 1.0568421052631578,
"grad_norm": 2.3865033858620714,
"learning_rate": 4.554156096030149e-06,
"loss": 0.069,
"step": 502
},
{
"epoch": 1.0589473684210526,
"grad_norm": 2.9290127417667318,
"learning_rate": 4.537689731178883e-06,
"loss": 0.0982,
"step": 503
},
{
"epoch": 1.0610526315789475,
"grad_norm": 3.8440200841289887,
"learning_rate": 4.5212284220786495e-06,
"loss": 0.1302,
"step": 504
},
{
"epoch": 1.063157894736842,
"grad_norm": 2.8853341217614283,
"learning_rate": 4.504772348747687e-06,
"loss": 0.1145,
"step": 505
},
{
"epoch": 1.065263157894737,
"grad_norm": 2.6679058498506545,
"learning_rate": 4.488321691146975e-06,
"loss": 0.0904,
"step": 506
},
{
"epoch": 1.0673684210526315,
"grad_norm": 2.8951003825916075,
"learning_rate": 4.471876629178273e-06,
"loss": 0.103,
"step": 507
},
{
"epoch": 1.0694736842105264,
"grad_norm": 3.007348462141689,
"learning_rate": 4.4554373426821375e-06,
"loss": 0.099,
"step": 508
},
{
"epoch": 1.071578947368421,
"grad_norm": 3.328459980789624,
"learning_rate": 4.439004011435979e-06,
"loss": 0.1205,
"step": 509
},
{
"epoch": 1.0736842105263158,
"grad_norm": 2.949855877718661,
"learning_rate": 4.42257681515207e-06,
"loss": 0.0874,
"step": 510
},
{
"epoch": 1.0757894736842106,
"grad_norm": 3.6391199686370443,
"learning_rate": 4.406155933475599e-06,
"loss": 0.1249,
"step": 511
},
{
"epoch": 1.0778947368421052,
"grad_norm": 3.2478829168809926,
"learning_rate": 4.3897415459827e-06,
"loss": 0.1167,
"step": 512
},
{
"epoch": 1.08,
"grad_norm": 2.841083467241702,
"learning_rate": 4.373333832178478e-06,
"loss": 0.1017,
"step": 513
},
{
"epoch": 1.0821052631578947,
"grad_norm": 3.3982868679317044,
"learning_rate": 4.356932971495071e-06,
"loss": 0.0945,
"step": 514
},
{
"epoch": 1.0842105263157895,
"grad_norm": 2.890261480563195,
"learning_rate": 4.340539143289655e-06,
"loss": 0.1087,
"step": 515
},
{
"epoch": 1.0863157894736841,
"grad_norm": 3.308334318534236,
"learning_rate": 4.324152526842517e-06,
"loss": 0.0905,
"step": 516
},
{
"epoch": 1.088421052631579,
"grad_norm": 2.8073590228296204,
"learning_rate": 4.307773301355063e-06,
"loss": 0.1182,
"step": 517
},
{
"epoch": 1.0905263157894738,
"grad_norm": 2.9727549576474557,
"learning_rate": 4.291401645947879e-06,
"loss": 0.1169,
"step": 518
},
{
"epoch": 1.0926315789473684,
"grad_norm": 2.7817050820561464,
"learning_rate": 4.275037739658771e-06,
"loss": 0.1098,
"step": 519
},
{
"epoch": 1.0947368421052632,
"grad_norm": 2.3497572335814225,
"learning_rate": 4.25868176144079e-06,
"loss": 0.0562,
"step": 520
},
{
"epoch": 1.0968421052631578,
"grad_norm": 3.3243494735102814,
"learning_rate": 4.242333890160299e-06,
"loss": 0.1095,
"step": 521
},
{
"epoch": 1.0989473684210527,
"grad_norm": 2.6231886257195915,
"learning_rate": 4.225994304594994e-06,
"loss": 0.0896,
"step": 522
},
{
"epoch": 1.1010526315789473,
"grad_norm": 3.0525576073188803,
"learning_rate": 4.209663183431969e-06,
"loss": 0.123,
"step": 523
},
{
"epoch": 1.1031578947368421,
"grad_norm": 2.300663876671176,
"learning_rate": 4.193340705265746e-06,
"loss": 0.0967,
"step": 524
},
{
"epoch": 1.1052631578947367,
"grad_norm": 2.6473318145668077,
"learning_rate": 4.17702704859633e-06,
"loss": 0.0812,
"step": 525
},
{
"epoch": 1.1073684210526316,
"grad_norm": 3.1969088912792007,
"learning_rate": 4.160722391827262e-06,
"loss": 0.1204,
"step": 526
},
{
"epoch": 1.1094736842105264,
"grad_norm": 3.001046456322388,
"learning_rate": 4.14442691326365e-06,
"loss": 0.0939,
"step": 527
},
{
"epoch": 1.111578947368421,
"grad_norm": 2.992113792126852,
"learning_rate": 4.128140791110243e-06,
"loss": 0.0933,
"step": 528
},
{
"epoch": 1.1136842105263158,
"grad_norm": 2.873261259224336,
"learning_rate": 4.111864203469457e-06,
"loss": 0.113,
"step": 529
},
{
"epoch": 1.1157894736842104,
"grad_norm": 2.227852704764954,
"learning_rate": 4.0955973283394525e-06,
"loss": 0.0822,
"step": 530
},
{
"epoch": 1.1178947368421053,
"grad_norm": 2.503374847865978,
"learning_rate": 4.079340343612165e-06,
"loss": 0.1093,
"step": 531
},
{
"epoch": 1.12,
"grad_norm": 2.867424738355881,
"learning_rate": 4.063093427071376e-06,
"loss": 0.0901,
"step": 532
},
{
"epoch": 1.1221052631578947,
"grad_norm": 2.9247925764549842,
"learning_rate": 4.046856756390767e-06,
"loss": 0.0914,
"step": 533
},
{
"epoch": 1.1242105263157895,
"grad_norm": 3.122299881486441,
"learning_rate": 4.03063050913196e-06,
"loss": 0.0874,
"step": 534
},
{
"epoch": 1.1263157894736842,
"grad_norm": 2.4666341115474673,
"learning_rate": 4.0144148627426e-06,
"loss": 0.092,
"step": 535
},
{
"epoch": 1.128421052631579,
"grad_norm": 2.601382923998426,
"learning_rate": 3.998209994554395e-06,
"loss": 0.0896,
"step": 536
},
{
"epoch": 1.1305263157894736,
"grad_norm": 3.3371781721212095,
"learning_rate": 3.982016081781189e-06,
"loss": 0.1336,
"step": 537
},
{
"epoch": 1.1326315789473684,
"grad_norm": 3.2001419892406666,
"learning_rate": 3.965833301517017e-06,
"loss": 0.0934,
"step": 538
},
{
"epoch": 1.134736842105263,
"grad_norm": 2.971320892945007,
"learning_rate": 3.949661830734172e-06,
"loss": 0.1143,
"step": 539
},
{
"epoch": 1.1368421052631579,
"grad_norm": 2.710169633405829,
"learning_rate": 3.9335018462812664e-06,
"loss": 0.0817,
"step": 540
},
{
"epoch": 1.1389473684210527,
"grad_norm": 3.102427528491057,
"learning_rate": 3.9173535248813026e-06,
"loss": 0.09,
"step": 541
},
{
"epoch": 1.1410526315789473,
"grad_norm": 2.3838774593905443,
"learning_rate": 3.901217043129735e-06,
"loss": 0.0843,
"step": 542
},
{
"epoch": 1.1431578947368422,
"grad_norm": 3.195089972641954,
"learning_rate": 3.885092577492543e-06,
"loss": 0.1437,
"step": 543
},
{
"epoch": 1.1452631578947368,
"grad_norm": 2.9164511963016713,
"learning_rate": 3.8689803043043e-06,
"loss": 0.1064,
"step": 544
},
{
"epoch": 1.1473684210526316,
"grad_norm": 2.7560183158584914,
"learning_rate": 3.852880399766243e-06,
"loss": 0.0975,
"step": 545
},
{
"epoch": 1.1494736842105264,
"grad_norm": 2.743001182985708,
"learning_rate": 3.8367930399443495e-06,
"loss": 0.1093,
"step": 546
},
{
"epoch": 1.151578947368421,
"grad_norm": 2.717089115263253,
"learning_rate": 3.820718400767409e-06,
"loss": 0.0735,
"step": 547
},
{
"epoch": 1.1536842105263159,
"grad_norm": 3.4175913700020626,
"learning_rate": 3.8046566580251e-06,
"loss": 0.1101,
"step": 548
},
{
"epoch": 1.1557894736842105,
"grad_norm": 3.787166948611024,
"learning_rate": 3.7886079873660693e-06,
"loss": 0.1461,
"step": 549
},
{
"epoch": 1.1578947368421053,
"grad_norm": 2.668580712780857,
"learning_rate": 3.7725725642960047e-06,
"loss": 0.0858,
"step": 550
},
{
"epoch": 1.16,
"grad_norm": 2.7065005668874607,
"learning_rate": 3.756550564175727e-06,
"loss": 0.0695,
"step": 551
},
{
"epoch": 1.1621052631578948,
"grad_norm": 2.6470645824370354,
"learning_rate": 3.7405421622192607e-06,
"loss": 0.108,
"step": 552
},
{
"epoch": 1.1642105263157894,
"grad_norm": 3.2629124702164582,
"learning_rate": 3.7245475334919246e-06,
"loss": 0.1309,
"step": 553
},
{
"epoch": 1.1663157894736842,
"grad_norm": 2.9270566638475564,
"learning_rate": 3.7085668529084183e-06,
"loss": 0.076,
"step": 554
},
{
"epoch": 1.168421052631579,
"grad_norm": 2.9549638887471854,
"learning_rate": 3.6926002952309015e-06,
"loss": 0.0912,
"step": 555
},
{
"epoch": 1.1705263157894736,
"grad_norm": 2.920345651586514,
"learning_rate": 3.676648035067093e-06,
"loss": 0.0958,
"step": 556
},
{
"epoch": 1.1726315789473685,
"grad_norm": 3.7647197204198126,
"learning_rate": 3.6607102468683524e-06,
"loss": 0.104,
"step": 557
},
{
"epoch": 1.174736842105263,
"grad_norm": 2.25055864611599,
"learning_rate": 3.64478710492778e-06,
"loss": 0.0596,
"step": 558
},
{
"epoch": 1.176842105263158,
"grad_norm": 2.3389330498436447,
"learning_rate": 3.628878783378302e-06,
"loss": 0.0952,
"step": 559
},
{
"epoch": 1.1789473684210527,
"grad_norm": 3.2077006313008947,
"learning_rate": 3.6129854561907786e-06,
"loss": 0.0962,
"step": 560
},
{
"epoch": 1.1810526315789474,
"grad_norm": 3.0715551570828863,
"learning_rate": 3.5971072971720844e-06,
"loss": 0.1206,
"step": 561
},
{
"epoch": 1.1831578947368422,
"grad_norm": 3.6249545529720097,
"learning_rate": 3.581244479963225e-06,
"loss": 0.1172,
"step": 562
},
{
"epoch": 1.1852631578947368,
"grad_norm": 3.196135915761518,
"learning_rate": 3.56539717803743e-06,
"loss": 0.1041,
"step": 563
},
{
"epoch": 1.1873684210526316,
"grad_norm": 3.151854573878974,
"learning_rate": 3.5495655646982506e-06,
"loss": 0.1152,
"step": 564
},
{
"epoch": 1.1894736842105262,
"grad_norm": 2.684765122673656,
"learning_rate": 3.533749813077677e-06,
"loss": 0.1038,
"step": 565
},
{
"epoch": 1.191578947368421,
"grad_norm": 2.4791366806308277,
"learning_rate": 3.517950096134232e-06,
"loss": 0.0895,
"step": 566
},
{
"epoch": 1.1936842105263157,
"grad_norm": 2.4823858121020574,
"learning_rate": 3.5021665866510924e-06,
"loss": 0.0833,
"step": 567
},
{
"epoch": 1.1957894736842105,
"grad_norm": 2.465078895228644,
"learning_rate": 3.4863994572341845e-06,
"loss": 0.0962,
"step": 568
},
{
"epoch": 1.1978947368421053,
"grad_norm": 2.6695037410176767,
"learning_rate": 3.470648880310313e-06,
"loss": 0.0765,
"step": 569
},
{
"epoch": 1.2,
"grad_norm": 3.230727181652887,
"learning_rate": 3.4549150281252635e-06,
"loss": 0.1003,
"step": 570
},
{
"epoch": 1.2021052631578948,
"grad_norm": 2.6749391293551943,
"learning_rate": 3.4391980727419206e-06,
"loss": 0.0825,
"step": 571
},
{
"epoch": 1.2042105263157894,
"grad_norm": 3.1995780646497654,
"learning_rate": 3.423498186038393e-06,
"loss": 0.095,
"step": 572
},
{
"epoch": 1.2063157894736842,
"grad_norm": 3.05943838476226,
"learning_rate": 3.4078155397061243e-06,
"loss": 0.0718,
"step": 573
},
{
"epoch": 1.208421052631579,
"grad_norm": 2.5246593838638285,
"learning_rate": 3.3921503052480243e-06,
"loss": 0.0689,
"step": 574
},
{
"epoch": 1.2105263157894737,
"grad_norm": 2.5885059878463297,
"learning_rate": 3.3765026539765832e-06,
"loss": 0.1036,
"step": 575
},
{
"epoch": 1.2126315789473685,
"grad_norm": 3.771547100866626,
"learning_rate": 3.3608727570120114e-06,
"loss": 0.1341,
"step": 576
},
{
"epoch": 1.2147368421052631,
"grad_norm": 2.88823941459496,
"learning_rate": 3.3452607852803585e-06,
"loss": 0.1221,
"step": 577
},
{
"epoch": 1.216842105263158,
"grad_norm": 2.5098432758853058,
"learning_rate": 3.3296669095116454e-06,
"loss": 0.0793,
"step": 578
},
{
"epoch": 1.2189473684210526,
"grad_norm": 3.173791392616761,
"learning_rate": 3.3140913002379993e-06,
"loss": 0.0924,
"step": 579
},
{
"epoch": 1.2210526315789474,
"grad_norm": 2.9056096452656655,
"learning_rate": 3.298534127791785e-06,
"loss": 0.1232,
"step": 580
},
{
"epoch": 1.223157894736842,
"grad_norm": 2.7600704251604133,
"learning_rate": 3.2829955623037536e-06,
"loss": 0.1011,
"step": 581
},
{
"epoch": 1.2252631578947368,
"grad_norm": 2.8902773903565824,
"learning_rate": 3.267475773701161e-06,
"loss": 0.0971,
"step": 582
},
{
"epoch": 1.2273684210526317,
"grad_norm": 2.7682434724339284,
"learning_rate": 3.251974931705933e-06,
"loss": 0.0863,
"step": 583
},
{
"epoch": 1.2294736842105263,
"grad_norm": 2.7630463201120534,
"learning_rate": 3.236493205832795e-06,
"loss": 0.0691,
"step": 584
},
{
"epoch": 1.231578947368421,
"grad_norm": 2.769102376716963,
"learning_rate": 3.2210307653874175e-06,
"loss": 0.069,
"step": 585
},
{
"epoch": 1.2336842105263157,
"grad_norm": 2.624470281681522,
"learning_rate": 3.205587779464576e-06,
"loss": 0.1175,
"step": 586
},
{
"epoch": 1.2357894736842105,
"grad_norm": 3.1237062690477515,
"learning_rate": 3.1901644169462854e-06,
"loss": 0.1124,
"step": 587
},
{
"epoch": 1.2378947368421054,
"grad_norm": 2.880688626388798,
"learning_rate": 3.1747608464999723e-06,
"loss": 0.0824,
"step": 588
},
{
"epoch": 1.24,
"grad_norm": 4.407201987822293,
"learning_rate": 3.1593772365766107e-06,
"loss": 0.1062,
"step": 589
},
{
"epoch": 1.2421052631578948,
"grad_norm": 2.87359540180944,
"learning_rate": 3.1440137554088957e-06,
"loss": 0.1029,
"step": 590
},
{
"epoch": 1.2442105263157894,
"grad_norm": 2.9419016713556503,
"learning_rate": 3.128670571009399e-06,
"loss": 0.0943,
"step": 591
},
{
"epoch": 1.2463157894736843,
"grad_norm": 2.9623583273834977,
"learning_rate": 3.1133478511687217e-06,
"loss": 0.083,
"step": 592
},
{
"epoch": 1.2484210526315789,
"grad_norm": 2.4484737136573056,
"learning_rate": 3.0980457634536775e-06,
"loss": 0.0697,
"step": 593
},
{
"epoch": 1.2505263157894737,
"grad_norm": 2.983258535236656,
"learning_rate": 3.082764475205442e-06,
"loss": 0.0895,
"step": 594
},
{
"epoch": 1.2526315789473683,
"grad_norm": 2.927679313524382,
"learning_rate": 3.06750415353774e-06,
"loss": 0.1269,
"step": 595
},
{
"epoch": 1.2547368421052632,
"grad_norm": 4.327826033733036,
"learning_rate": 3.052264965335e-06,
"loss": 0.1779,
"step": 596
},
{
"epoch": 1.256842105263158,
"grad_norm": 3.513787096535581,
"learning_rate": 3.0370470772505433e-06,
"loss": 0.1133,
"step": 597
},
{
"epoch": 1.2589473684210526,
"grad_norm": 2.1372319523914785,
"learning_rate": 3.02185065570476e-06,
"loss": 0.0641,
"step": 598
},
{
"epoch": 1.2610526315789474,
"grad_norm": 3.136784939810971,
"learning_rate": 3.0066758668832752e-06,
"loss": 0.1238,
"step": 599
},
{
"epoch": 1.263157894736842,
"grad_norm": 2.7814275077914346,
"learning_rate": 2.991522876735154e-06,
"loss": 0.106,
"step": 600
},
{
"epoch": 1.263157894736842,
"eval_loss": 0.23879918456077576,
"eval_runtime": 0.9256,
"eval_samples_per_second": 42.133,
"eval_steps_per_second": 10.803,
"step": 600
},
{
"epoch": 1.2652631578947369,
"grad_norm": 2.4532843992196898,
"learning_rate": 2.9763918509710647e-06,
"loss": 0.0901,
"step": 601
},
{
"epoch": 1.2673684210526317,
"grad_norm": 3.1129983606325835,
"learning_rate": 2.9612829550614836e-06,
"loss": 0.0836,
"step": 602
},
{
"epoch": 1.2694736842105263,
"grad_norm": 2.9949558761740893,
"learning_rate": 2.9461963542348737e-06,
"loss": 0.1009,
"step": 603
},
{
"epoch": 1.271578947368421,
"grad_norm": 3.5141135708573157,
"learning_rate": 2.931132213475884e-06,
"loss": 0.1061,
"step": 604
},
{
"epoch": 1.2736842105263158,
"grad_norm": 2.4903783090106293,
"learning_rate": 2.9160906975235493e-06,
"loss": 0.085,
"step": 605
},
{
"epoch": 1.2757894736842106,
"grad_norm": 2.665551851720217,
"learning_rate": 2.9010719708694724e-06,
"loss": 0.0795,
"step": 606
},
{
"epoch": 1.2778947368421052,
"grad_norm": 2.251800484141915,
"learning_rate": 2.8860761977560435e-06,
"loss": 0.0663,
"step": 607
},
{
"epoch": 1.28,
"grad_norm": 2.7348175822860044,
"learning_rate": 2.871103542174637e-06,
"loss": 0.0978,
"step": 608
},
{
"epoch": 1.2821052631578946,
"grad_norm": 3.8066305763587605,
"learning_rate": 2.8561541678638145e-06,
"loss": 0.1121,
"step": 609
},
{
"epoch": 1.2842105263157895,
"grad_norm": 2.896761981069954,
"learning_rate": 2.8412282383075362e-06,
"loss": 0.0972,
"step": 610
},
{
"epoch": 1.2863157894736843,
"grad_norm": 2.78332338625659,
"learning_rate": 2.826325916733378e-06,
"loss": 0.0944,
"step": 611
},
{
"epoch": 1.288421052631579,
"grad_norm": 3.3783431646548583,
"learning_rate": 2.811447366110741e-06,
"loss": 0.0838,
"step": 612
},
{
"epoch": 1.2905263157894737,
"grad_norm": 2.5129928876526195,
"learning_rate": 2.796592749149071e-06,
"loss": 0.1033,
"step": 613
},
{
"epoch": 1.2926315789473684,
"grad_norm": 2.7310324993825272,
"learning_rate": 2.7817622282960816e-06,
"loss": 0.0822,
"step": 614
},
{
"epoch": 1.2947368421052632,
"grad_norm": 2.7894468868847584,
"learning_rate": 2.766955965735968e-06,
"loss": 0.097,
"step": 615
},
{
"epoch": 1.296842105263158,
"grad_norm": 2.692351340111761,
"learning_rate": 2.7521741233876496e-06,
"loss": 0.1008,
"step": 616
},
{
"epoch": 1.2989473684210526,
"grad_norm": 2.7211489958669426,
"learning_rate": 2.7374168629029814e-06,
"loss": 0.0662,
"step": 617
},
{
"epoch": 1.3010526315789472,
"grad_norm": 3.0694815194789506,
"learning_rate": 2.722684345665004e-06,
"loss": 0.1068,
"step": 618
},
{
"epoch": 1.303157894736842,
"grad_norm": 2.311408410154088,
"learning_rate": 2.707976732786166e-06,
"loss": 0.0855,
"step": 619
},
{
"epoch": 1.305263157894737,
"grad_norm": 2.67419162103436,
"learning_rate": 2.693294185106562e-06,
"loss": 0.079,
"step": 620
},
{
"epoch": 1.3073684210526315,
"grad_norm": 3.2884510438078256,
"learning_rate": 2.678636863192184e-06,
"loss": 0.1076,
"step": 621
},
{
"epoch": 1.3094736842105263,
"grad_norm": 2.5974919371371192,
"learning_rate": 2.6640049273331516e-06,
"loss": 0.0738,
"step": 622
},
{
"epoch": 1.311578947368421,
"grad_norm": 2.9992265829306604,
"learning_rate": 2.649398537541978e-06,
"loss": 0.0948,
"step": 623
},
{
"epoch": 1.3136842105263158,
"grad_norm": 2.8279603530668953,
"learning_rate": 2.6348178535517967e-06,
"loss": 0.0952,
"step": 624
},
{
"epoch": 1.3157894736842106,
"grad_norm": 2.616721301345279,
"learning_rate": 2.6202630348146323e-06,
"loss": 0.0991,
"step": 625
},
{
"epoch": 1.3178947368421052,
"grad_norm": 3.1702653547071344,
"learning_rate": 2.605734240499652e-06,
"loss": 0.0863,
"step": 626
},
{
"epoch": 1.32,
"grad_norm": 2.675405760590981,
"learning_rate": 2.5912316294914232e-06,
"loss": 0.0731,
"step": 627
},
{
"epoch": 1.3221052631578947,
"grad_norm": 3.0539267481690784,
"learning_rate": 2.576755360388177e-06,
"loss": 0.0969,
"step": 628
},
{
"epoch": 1.3242105263157895,
"grad_norm": 2.905114269140358,
"learning_rate": 2.562305591500069e-06,
"loss": 0.0938,
"step": 629
},
{
"epoch": 1.3263157894736843,
"grad_norm": 2.731625609008078,
"learning_rate": 2.5478824808474613e-06,
"loss": 0.0794,
"step": 630
},
{
"epoch": 1.328421052631579,
"grad_norm": 2.613073402073297,
"learning_rate": 2.5334861861591753e-06,
"loss": 0.0854,
"step": 631
},
{
"epoch": 1.3305263157894736,
"grad_norm": 2.9544597662379553,
"learning_rate": 2.5191168648707888e-06,
"loss": 0.0999,
"step": 632
},
{
"epoch": 1.3326315789473684,
"grad_norm": 2.7561606747769094,
"learning_rate": 2.5047746741228977e-06,
"loss": 0.0714,
"step": 633
},
{
"epoch": 1.3347368421052632,
"grad_norm": 2.8887518490755153,
"learning_rate": 2.490459770759398e-06,
"loss": 0.0939,
"step": 634
},
{
"epoch": 1.3368421052631578,
"grad_norm": 3.317195982139847,
"learning_rate": 2.476172311325783e-06,
"loss": 0.0884,
"step": 635
},
{
"epoch": 1.3389473684210527,
"grad_norm": 3.1750724188681003,
"learning_rate": 2.461912452067415e-06,
"loss": 0.0936,
"step": 636
},
{
"epoch": 1.3410526315789473,
"grad_norm": 3.618501354891483,
"learning_rate": 2.447680348927837e-06,
"loss": 0.1202,
"step": 637
},
{
"epoch": 1.343157894736842,
"grad_norm": 3.3236366679875715,
"learning_rate": 2.433476157547044e-06,
"loss": 0.1117,
"step": 638
},
{
"epoch": 1.345263157894737,
"grad_norm": 2.752799678732494,
"learning_rate": 2.4193000332597984e-06,
"loss": 0.1264,
"step": 639
},
{
"epoch": 1.3473684210526315,
"grad_norm": 2.747193986144216,
"learning_rate": 2.4051521310939258e-06,
"loss": 0.106,
"step": 640
},
{
"epoch": 1.3494736842105264,
"grad_norm": 2.972619060264223,
"learning_rate": 2.391032605768613e-06,
"loss": 0.0923,
"step": 641
},
{
"epoch": 1.351578947368421,
"grad_norm": 3.053201871204932,
"learning_rate": 2.3769416116927335e-06,
"loss": 0.115,
"step": 642
},
{
"epoch": 1.3536842105263158,
"grad_norm": 2.7376151110688522,
"learning_rate": 2.3628793029631353e-06,
"loss": 0.116,
"step": 643
},
{
"epoch": 1.3557894736842107,
"grad_norm": 2.9174931758051836,
"learning_rate": 2.3488458333629777e-06,
"loss": 0.1051,
"step": 644
},
{
"epoch": 1.3578947368421053,
"grad_norm": 2.4716264469344704,
"learning_rate": 2.3348413563600324e-06,
"loss": 0.1107,
"step": 645
},
{
"epoch": 1.3599999999999999,
"grad_norm": 3.0530343837320117,
"learning_rate": 2.320866025105016e-06,
"loss": 0.1334,
"step": 646
},
{
"epoch": 1.3621052631578947,
"grad_norm": 3.6619589060484645,
"learning_rate": 2.3069199924299175e-06,
"loss": 0.1366,
"step": 647
},
{
"epoch": 1.3642105263157895,
"grad_norm": 2.922855147887494,
"learning_rate": 2.29300341084631e-06,
"loss": 0.094,
"step": 648
},
{
"epoch": 1.3663157894736842,
"grad_norm": 2.550799037652647,
"learning_rate": 2.2791164325437047e-06,
"loss": 0.0855,
"step": 649
},
{
"epoch": 1.368421052631579,
"grad_norm": 2.9412512573198053,
"learning_rate": 2.265259209387867e-06,
"loss": 0.1052,
"step": 650
},
{
"epoch": 1.3705263157894736,
"grad_norm": 2.511840236449145,
"learning_rate": 2.2514318929191707e-06,
"loss": 0.0708,
"step": 651
},
{
"epoch": 1.3726315789473684,
"grad_norm": 3.1808030308692596,
"learning_rate": 2.2376346343509343e-06,
"loss": 0.0748,
"step": 652
},
{
"epoch": 1.3747368421052633,
"grad_norm": 3.3523627968871987,
"learning_rate": 2.2238675845677663e-06,
"loss": 0.0983,
"step": 653
},
{
"epoch": 1.3768421052631579,
"grad_norm": 2.932190635989121,
"learning_rate": 2.2101308941239204e-06,
"loss": 0.0637,
"step": 654
},
{
"epoch": 1.3789473684210527,
"grad_norm": 3.5807536284623835,
"learning_rate": 2.1964247132416373e-06,
"loss": 0.1136,
"step": 655
},
{
"epoch": 1.3810526315789473,
"grad_norm": 2.9803291761278263,
"learning_rate": 2.182749191809518e-06,
"loss": 0.0949,
"step": 656
},
{
"epoch": 1.3831578947368421,
"grad_norm": 3.02234271668722,
"learning_rate": 2.1691044793808734e-06,
"loss": 0.0999,
"step": 657
},
{
"epoch": 1.385263157894737,
"grad_norm": 2.679900836385144,
"learning_rate": 2.1554907251720947e-06,
"loss": 0.0785,
"step": 658
},
{
"epoch": 1.3873684210526316,
"grad_norm": 2.2709498222109366,
"learning_rate": 2.1419080780610123e-06,
"loss": 0.066,
"step": 659
},
{
"epoch": 1.3894736842105262,
"grad_norm": 3.0917162559984215,
"learning_rate": 2.1283566865852824e-06,
"loss": 0.1,
"step": 660
},
{
"epoch": 1.391578947368421,
"grad_norm": 2.5872224056024313,
"learning_rate": 2.11483669894075e-06,
"loss": 0.09,
"step": 661
},
{
"epoch": 1.3936842105263159,
"grad_norm": 2.5962906490038,
"learning_rate": 2.1013482629798334e-06,
"loss": 0.0755,
"step": 662
},
{
"epoch": 1.3957894736842105,
"grad_norm": 2.9664694685613333,
"learning_rate": 2.08789152620991e-06,
"loss": 0.1177,
"step": 663
},
{
"epoch": 1.3978947368421053,
"grad_norm": 2.7105630014133517,
"learning_rate": 2.0744666357916925e-06,
"loss": 0.0926,
"step": 664
},
{
"epoch": 1.4,
"grad_norm": 2.762223630715002,
"learning_rate": 2.061073738537635e-06,
"loss": 0.0989,
"step": 665
},
{
"epoch": 1.4021052631578947,
"grad_norm": 2.895940938610082,
"learning_rate": 2.0477129809103147e-06,
"loss": 0.0845,
"step": 666
},
{
"epoch": 1.4042105263157896,
"grad_norm": 3.255484673069398,
"learning_rate": 2.034384509020837e-06,
"loss": 0.0987,
"step": 667
},
{
"epoch": 1.4063157894736842,
"grad_norm": 3.062882069284984,
"learning_rate": 2.021088468627237e-06,
"loss": 0.1285,
"step": 668
},
{
"epoch": 1.408421052631579,
"grad_norm": 2.450287955725665,
"learning_rate": 2.0078250051328783e-06,
"loss": 0.0728,
"step": 669
},
{
"epoch": 1.4105263157894736,
"grad_norm": 3.470552198050232,
"learning_rate": 1.9945942635848745e-06,
"loss": 0.1207,
"step": 670
},
{
"epoch": 1.4126315789473685,
"grad_norm": 2.8537747610834385,
"learning_rate": 1.981396388672496e-06,
"loss": 0.1049,
"step": 671
},
{
"epoch": 1.4147368421052633,
"grad_norm": 3.7024680682537885,
"learning_rate": 1.9682315247255897e-06,
"loss": 0.1043,
"step": 672
},
{
"epoch": 1.416842105263158,
"grad_norm": 2.8609102232586743,
"learning_rate": 1.9550998157129946e-06,
"loss": 0.0694,
"step": 673
},
{
"epoch": 1.4189473684210525,
"grad_norm": 3.262083913972591,
"learning_rate": 1.9420014052409793e-06,
"loss": 0.1082,
"step": 674
},
{
"epoch": 1.4210526315789473,
"grad_norm": 3.0240021646625235,
"learning_rate": 1.928936436551661e-06,
"loss": 0.0882,
"step": 675
},
{
"epoch": 1.4231578947368422,
"grad_norm": 3.4463090162258827,
"learning_rate": 1.915905052521445e-06,
"loss": 0.0924,
"step": 676
},
{
"epoch": 1.4252631578947368,
"grad_norm": 2.431106797497799,
"learning_rate": 1.9029073956594607e-06,
"loss": 0.0887,
"step": 677
},
{
"epoch": 1.4273684210526316,
"grad_norm": 2.915704184565381,
"learning_rate": 1.8899436081059974e-06,
"loss": 0.0847,
"step": 678
},
{
"epoch": 1.4294736842105262,
"grad_norm": 2.7352077174816993,
"learning_rate": 1.877013831630961e-06,
"loss": 0.0768,
"step": 679
},
{
"epoch": 1.431578947368421,
"grad_norm": 2.8189895917814205,
"learning_rate": 1.864118207632315e-06,
"loss": 0.0785,
"step": 680
},
{
"epoch": 1.433684210526316,
"grad_norm": 2.664425517756487,
"learning_rate": 1.851256877134538e-06,
"loss": 0.0836,
"step": 681
},
{
"epoch": 1.4357894736842105,
"grad_norm": 2.9705486470960083,
"learning_rate": 1.838429980787081e-06,
"loss": 0.1191,
"step": 682
},
{
"epoch": 1.4378947368421053,
"grad_norm": 2.540310721891877,
"learning_rate": 1.825637658862824e-06,
"loss": 0.0878,
"step": 683
},
{
"epoch": 1.44,
"grad_norm": 2.661992219604887,
"learning_rate": 1.8128800512565514e-06,
"loss": 0.0929,
"step": 684
},
{
"epoch": 1.4421052631578948,
"grad_norm": 3.04507185100223,
"learning_rate": 1.8001572974834169e-06,
"loss": 0.1103,
"step": 685
},
{
"epoch": 1.4442105263157896,
"grad_norm": 2.9020461461830642,
"learning_rate": 1.7874695366774191e-06,
"loss": 0.11,
"step": 686
},
{
"epoch": 1.4463157894736842,
"grad_norm": 3.11468571675733,
"learning_rate": 1.774816907589873e-06,
"loss": 0.0998,
"step": 687
},
{
"epoch": 1.4484210526315788,
"grad_norm": 2.359239660620598,
"learning_rate": 1.7621995485879062e-06,
"loss": 0.075,
"step": 688
},
{
"epoch": 1.4505263157894737,
"grad_norm": 3.2955548369625465,
"learning_rate": 1.749617597652934e-06,
"loss": 0.0777,
"step": 689
},
{
"epoch": 1.4526315789473685,
"grad_norm": 2.8006711851285577,
"learning_rate": 1.7370711923791567e-06,
"loss": 0.112,
"step": 690
},
{
"epoch": 1.454736842105263,
"grad_norm": 3.029355082755488,
"learning_rate": 1.7245604699720536e-06,
"loss": 0.0633,
"step": 691
},
{
"epoch": 1.456842105263158,
"grad_norm": 3.304711518434829,
"learning_rate": 1.7120855672468779e-06,
"loss": 0.0883,
"step": 692
},
{
"epoch": 1.4589473684210525,
"grad_norm": 2.765032151598941,
"learning_rate": 1.6996466206271679e-06,
"loss": 0.0793,
"step": 693
},
{
"epoch": 1.4610526315789474,
"grad_norm": 3.2989793435037122,
"learning_rate": 1.6872437661432518e-06,
"loss": 0.1019,
"step": 694
},
{
"epoch": 1.4631578947368422,
"grad_norm": 3.419570656017536,
"learning_rate": 1.6748771394307584e-06,
"loss": 0.1102,
"step": 695
},
{
"epoch": 1.4652631578947368,
"grad_norm": 2.6151861620453696,
"learning_rate": 1.6625468757291379e-06,
"loss": 0.0815,
"step": 696
},
{
"epoch": 1.4673684210526317,
"grad_norm": 2.8968743795344594,
"learning_rate": 1.6502531098801756e-06,
"loss": 0.117,
"step": 697
},
{
"epoch": 1.4694736842105263,
"grad_norm": 3.048208312310303,
"learning_rate": 1.6379959763265268e-06,
"loss": 0.1159,
"step": 698
},
{
"epoch": 1.471578947368421,
"grad_norm": 3.0348755335107644,
"learning_rate": 1.62577560911024e-06,
"loss": 0.0954,
"step": 699
},
{
"epoch": 1.4736842105263157,
"grad_norm": 2.731399950881053,
"learning_rate": 1.6135921418712959e-06,
"loss": 0.0796,
"step": 700
},
{
"epoch": 1.4757894736842105,
"grad_norm": 2.450146380331398,
"learning_rate": 1.6014457078461354e-06,
"loss": 0.0818,
"step": 701
},
{
"epoch": 1.4778947368421052,
"grad_norm": 3.1040839922685945,
"learning_rate": 1.5893364398662175e-06,
"loss": 0.1282,
"step": 702
},
{
"epoch": 1.48,
"grad_norm": 3.3128390186937953,
"learning_rate": 1.5772644703565564e-06,
"loss": 0.1131,
"step": 703
},
{
"epoch": 1.4821052631578948,
"grad_norm": 2.782315822851225,
"learning_rate": 1.5652299313342772e-06,
"loss": 0.1103,
"step": 704
},
{
"epoch": 1.4842105263157894,
"grad_norm": 3.348999608405835,
"learning_rate": 1.5532329544071712e-06,
"loss": 0.1013,
"step": 705
},
{
"epoch": 1.4863157894736843,
"grad_norm": 2.4420984824535874,
"learning_rate": 1.5412736707722537e-06,
"loss": 0.0961,
"step": 706
},
{
"epoch": 1.4884210526315789,
"grad_norm": 3.27158380123177,
"learning_rate": 1.5293522112143371e-06,
"loss": 0.1032,
"step": 707
},
{
"epoch": 1.4905263157894737,
"grad_norm": 2.5206467029781425,
"learning_rate": 1.517468706104589e-06,
"loss": 0.0673,
"step": 708
},
{
"epoch": 1.4926315789473685,
"grad_norm": 3.0061836822371304,
"learning_rate": 1.505623285399121e-06,
"loss": 0.0912,
"step": 709
},
{
"epoch": 1.4947368421052631,
"grad_norm": 3.6329090622373013,
"learning_rate": 1.4938160786375571e-06,
"loss": 0.1238,
"step": 710
},
{
"epoch": 1.496842105263158,
"grad_norm": 3.357709020463055,
"learning_rate": 1.4820472149416153e-06,
"loss": 0.1033,
"step": 711
},
{
"epoch": 1.4989473684210526,
"grad_norm": 2.982027831242349,
"learning_rate": 1.4703168230137072e-06,
"loss": 0.0745,
"step": 712
},
{
"epoch": 1.5010526315789474,
"grad_norm": 3.2153075823991433,
"learning_rate": 1.4586250311355132e-06,
"loss": 0.0748,
"step": 713
},
{
"epoch": 1.5031578947368422,
"grad_norm": 3.089733805213981,
"learning_rate": 1.4469719671666043e-06,
"loss": 0.1173,
"step": 714
},
{
"epoch": 1.5052631578947369,
"grad_norm": 2.5160669473252857,
"learning_rate": 1.4353577585430152e-06,
"loss": 0.065,
"step": 715
},
{
"epoch": 1.5073684210526315,
"grad_norm": 2.479299583306584,
"learning_rate": 1.4237825322758735e-06,
"loss": 0.074,
"step": 716
},
{
"epoch": 1.5094736842105263,
"grad_norm": 2.429769219845503,
"learning_rate": 1.412246414949997e-06,
"loss": 0.0876,
"step": 717
},
{
"epoch": 1.5115789473684211,
"grad_norm": 2.4449904248685743,
"learning_rate": 1.4007495327225162e-06,
"loss": 0.0925,
"step": 718
},
{
"epoch": 1.5136842105263157,
"grad_norm": 2.8176065325452755,
"learning_rate": 1.389292011321498e-06,
"loss": 0.1018,
"step": 719
},
{
"epoch": 1.5157894736842106,
"grad_norm": 3.1723117780798127,
"learning_rate": 1.3778739760445552e-06,
"loss": 0.1117,
"step": 720
},
{
"epoch": 1.5178947368421052,
"grad_norm": 2.7373105699595466,
"learning_rate": 1.3664955517574967e-06,
"loss": 0.0748,
"step": 721
},
{
"epoch": 1.52,
"grad_norm": 2.7280985272673495,
"learning_rate": 1.3551568628929434e-06,
"loss": 0.0755,
"step": 722
},
{
"epoch": 1.5221052631578948,
"grad_norm": 3.328587344616802,
"learning_rate": 1.343858033448982e-06,
"loss": 0.1077,
"step": 723
},
{
"epoch": 1.5242105263157895,
"grad_norm": 2.5855488551749906,
"learning_rate": 1.3325991869878013e-06,
"loss": 0.071,
"step": 724
},
{
"epoch": 1.526315789473684,
"grad_norm": 3.0254177227309165,
"learning_rate": 1.321380446634342e-06,
"loss": 0.1301,
"step": 725
},
{
"epoch": 1.528421052631579,
"grad_norm": 2.3929353303305274,
"learning_rate": 1.3102019350749528e-06,
"loss": 0.0688,
"step": 726
},
{
"epoch": 1.5305263157894737,
"grad_norm": 2.21131287248272,
"learning_rate": 1.2990637745560418e-06,
"loss": 0.0525,
"step": 727
},
{
"epoch": 1.5326315789473686,
"grad_norm": 2.738513806958703,
"learning_rate": 1.2879660868827508e-06,
"loss": 0.0767,
"step": 728
},
{
"epoch": 1.5347368421052632,
"grad_norm": 3.6188389817753053,
"learning_rate": 1.2769089934176126e-06,
"loss": 0.099,
"step": 729
},
{
"epoch": 1.5368421052631578,
"grad_norm": 3.045111314231803,
"learning_rate": 1.2658926150792321e-06,
"loss": 0.073,
"step": 730
},
{
"epoch": 1.5389473684210526,
"grad_norm": 3.184073980186101,
"learning_rate": 1.2549170723409548e-06,
"loss": 0.1014,
"step": 731
},
{
"epoch": 1.5410526315789475,
"grad_norm": 2.579054827239382,
"learning_rate": 1.243982485229559e-06,
"loss": 0.0795,
"step": 732
},
{
"epoch": 1.543157894736842,
"grad_norm": 3.416615737801065,
"learning_rate": 1.233088973323937e-06,
"loss": 0.0964,
"step": 733
},
{
"epoch": 1.545263157894737,
"grad_norm": 2.785734769889791,
"learning_rate": 1.2222366557537911e-06,
"loss": 0.0907,
"step": 734
},
{
"epoch": 1.5473684210526315,
"grad_norm": 2.970957261629437,
"learning_rate": 1.2114256511983274e-06,
"loss": 0.1131,
"step": 735
},
{
"epoch": 1.5494736842105263,
"grad_norm": 2.8363779652363417,
"learning_rate": 1.200656077884958e-06,
"loss": 0.0872,
"step": 736
},
{
"epoch": 1.5515789473684212,
"grad_norm": 2.7036119445964903,
"learning_rate": 1.189928053588012e-06,
"loss": 0.0964,
"step": 737
},
{
"epoch": 1.5536842105263158,
"grad_norm": 3.3335857753060516,
"learning_rate": 1.1792416956274443e-06,
"loss": 0.0786,
"step": 738
},
{
"epoch": 1.5557894736842104,
"grad_norm": 3.0926395174578056,
"learning_rate": 1.1685971208675539e-06,
"loss": 0.1136,
"step": 739
},
{
"epoch": 1.5578947368421052,
"grad_norm": 3.3419216028091627,
"learning_rate": 1.157994445715706e-06,
"loss": 0.1071,
"step": 740
},
{
"epoch": 1.56,
"grad_norm": 2.5742088370461005,
"learning_rate": 1.1474337861210543e-06,
"loss": 0.0929,
"step": 741
},
{
"epoch": 1.5621052631578949,
"grad_norm": 2.9130570321225955,
"learning_rate": 1.1369152575732823e-06,
"loss": 0.0717,
"step": 742
},
{
"epoch": 1.5642105263157895,
"grad_norm": 3.2401289778589417,
"learning_rate": 1.1264389751013326e-06,
"loss": 0.0987,
"step": 743
},
{
"epoch": 1.566315789473684,
"grad_norm": 2.4927969213579355,
"learning_rate": 1.1160050532721527e-06,
"loss": 0.0885,
"step": 744
},
{
"epoch": 1.568421052631579,
"grad_norm": 2.938530201077573,
"learning_rate": 1.1056136061894386e-06,
"loss": 0.127,
"step": 745
},
{
"epoch": 1.5705263157894738,
"grad_norm": 3.3187567343589444,
"learning_rate": 1.095264747492391e-06,
"loss": 0.1238,
"step": 746
},
{
"epoch": 1.5726315789473684,
"grad_norm": 3.8303977690990356,
"learning_rate": 1.0849585903544707e-06,
"loss": 0.1056,
"step": 747
},
{
"epoch": 1.5747368421052632,
"grad_norm": 3.14168947299502,
"learning_rate": 1.0746952474821615e-06,
"loss": 0.1123,
"step": 748
},
{
"epoch": 1.5768421052631578,
"grad_norm": 2.7173637195455473,
"learning_rate": 1.0644748311137377e-06,
"loss": 0.0663,
"step": 749
},
{
"epoch": 1.5789473684210527,
"grad_norm": 2.5766025285393144,
"learning_rate": 1.0542974530180327e-06,
"loss": 0.0977,
"step": 750
},
{
"epoch": 1.5810526315789475,
"grad_norm": 2.609993016293197,
"learning_rate": 1.0441632244932238e-06,
"loss": 0.0884,
"step": 751
},
{
"epoch": 1.583157894736842,
"grad_norm": 3.0209923604689077,
"learning_rate": 1.0340722563656109e-06,
"loss": 0.0964,
"step": 752
},
{
"epoch": 1.5852631578947367,
"grad_norm": 2.6635239464568286,
"learning_rate": 1.0240246589884046e-06,
"loss": 0.0657,
"step": 753
},
{
"epoch": 1.5873684210526315,
"grad_norm": 2.987774944183331,
"learning_rate": 1.0140205422405213e-06,
"loss": 0.0851,
"step": 754
},
{
"epoch": 1.5894736842105264,
"grad_norm": 3.3766295151075365,
"learning_rate": 1.0040600155253766e-06,
"loss": 0.1112,
"step": 755
},
{
"epoch": 1.5915789473684212,
"grad_norm": 3.0226247449523527,
"learning_rate": 9.941431877696955e-07,
"loss": 0.0976,
"step": 756
},
{
"epoch": 1.5936842105263158,
"grad_norm": 2.9054545208229405,
"learning_rate": 9.842701674223187e-07,
"loss": 0.0914,
"step": 757
},
{
"epoch": 1.5957894736842104,
"grad_norm": 2.610578072050956,
"learning_rate": 9.744410624530148e-07,
"loss": 0.0677,
"step": 758
},
{
"epoch": 1.5978947368421053,
"grad_norm": 3.6096944620452533,
"learning_rate": 9.646559803512995e-07,
"loss": 0.1039,
"step": 759
},
{
"epoch": 1.6,
"grad_norm": 2.608548813892452,
"learning_rate": 9.549150281252633e-07,
"loss": 0.0854,
"step": 760
},
{
"epoch": 1.6021052631578947,
"grad_norm": 2.294848325794609,
"learning_rate": 9.452183123003999e-07,
"loss": 0.0712,
"step": 761
},
{
"epoch": 1.6042105263157893,
"grad_norm": 2.812243878488753,
"learning_rate": 9.355659389184396e-07,
"loss": 0.1059,
"step": 762
},
{
"epoch": 1.6063157894736841,
"grad_norm": 3.5425866420134806,
"learning_rate": 9.259580135361929e-07,
"loss": 0.1032,
"step": 763
},
{
"epoch": 1.608421052631579,
"grad_norm": 2.8282691007632574,
"learning_rate": 9.163946412243896e-07,
"loss": 0.0948,
"step": 764
},
{
"epoch": 1.6105263157894738,
"grad_norm": 2.7856179217728023,
"learning_rate": 9.068759265665384e-07,
"loss": 0.0806,
"step": 765
},
{
"epoch": 1.6126315789473684,
"grad_norm": 2.7383044343143843,
"learning_rate": 8.974019736577777e-07,
"loss": 0.097,
"step": 766
},
{
"epoch": 1.614736842105263,
"grad_norm": 2.402534553214678,
"learning_rate": 8.879728861037385e-07,
"loss": 0.0946,
"step": 767
},
{
"epoch": 1.6168421052631579,
"grad_norm": 2.571133488597209,
"learning_rate": 8.785887670194137e-07,
"loss": 0.0743,
"step": 768
},
{
"epoch": 1.6189473684210527,
"grad_norm": 2.3849456172715144,
"learning_rate": 8.692497190280225e-07,
"loss": 0.122,
"step": 769
},
{
"epoch": 1.6210526315789475,
"grad_norm": 2.4256426604960684,
"learning_rate": 8.599558442598998e-07,
"loss": 0.0983,
"step": 770
},
{
"epoch": 1.6231578947368421,
"grad_norm": 3.1138269135730074,
"learning_rate": 8.507072443513703e-07,
"loss": 0.0681,
"step": 771
},
{
"epoch": 1.6252631578947367,
"grad_norm": 2.5456622584996205,
"learning_rate": 8.415040204436426e-07,
"loss": 0.0797,
"step": 772
},
{
"epoch": 1.6273684210526316,
"grad_norm": 2.4765639393665264,
"learning_rate": 8.323462731816962e-07,
"loss": 0.0808,
"step": 773
},
{
"epoch": 1.6294736842105264,
"grad_norm": 3.0808120883207053,
"learning_rate": 8.232341027131885e-07,
"loss": 0.1246,
"step": 774
},
{
"epoch": 1.631578947368421,
"grad_norm": 2.1969262047815743,
"learning_rate": 8.141676086873574e-07,
"loss": 0.0769,
"step": 775
},
{
"epoch": 1.6336842105263156,
"grad_norm": 2.620040982916962,
"learning_rate": 8.051468902539272e-07,
"loss": 0.0626,
"step": 776
},
{
"epoch": 1.6357894736842105,
"grad_norm": 2.3817308039237943,
"learning_rate": 7.961720460620321e-07,
"loss": 0.0606,
"step": 777
},
{
"epoch": 1.6378947368421053,
"grad_norm": 2.3280487070318077,
"learning_rate": 7.872431742591268e-07,
"loss": 0.0766,
"step": 778
},
{
"epoch": 1.6400000000000001,
"grad_norm": 3.073580547488845,
"learning_rate": 7.783603724899258e-07,
"loss": 0.0877,
"step": 779
},
{
"epoch": 1.6421052631578947,
"grad_norm": 2.9969577749713636,
"learning_rate": 7.695237378953224e-07,
"loss": 0.1094,
"step": 780
},
{
"epoch": 1.6442105263157893,
"grad_norm": 3.1923236806085726,
"learning_rate": 7.607333671113409e-07,
"loss": 0.1203,
"step": 781
},
{
"epoch": 1.6463157894736842,
"grad_norm": 3.394844382115183,
"learning_rate": 7.519893562680663e-07,
"loss": 0.0974,
"step": 782
},
{
"epoch": 1.648421052631579,
"grad_norm": 2.9206691570301317,
"learning_rate": 7.432918009885997e-07,
"loss": 0.1201,
"step": 783
},
{
"epoch": 1.6505263157894738,
"grad_norm": 2.5882116062754097,
"learning_rate": 7.346407963880137e-07,
"loss": 0.0782,
"step": 784
},
{
"epoch": 1.6526315789473685,
"grad_norm": 2.999647632411743,
"learning_rate": 7.260364370723044e-07,
"loss": 0.1032,
"step": 785
},
{
"epoch": 1.654736842105263,
"grad_norm": 2.7057840276614527,
"learning_rate": 7.174788171373731e-07,
"loss": 0.1068,
"step": 786
},
{
"epoch": 1.656842105263158,
"grad_norm": 3.1605976272604797,
"learning_rate": 7.089680301679752e-07,
"loss": 0.086,
"step": 787
},
{
"epoch": 1.6589473684210527,
"grad_norm": 2.0265271144542085,
"learning_rate": 7.005041692367154e-07,
"loss": 0.0654,
"step": 788
},
{
"epoch": 1.6610526315789473,
"grad_norm": 3.0449660028759076,
"learning_rate": 6.92087326903022e-07,
"loss": 0.1064,
"step": 789
},
{
"epoch": 1.663157894736842,
"grad_norm": 2.846318524701942,
"learning_rate": 6.837175952121305e-07,
"loss": 0.1143,
"step": 790
},
{
"epoch": 1.6652631578947368,
"grad_norm": 2.2451563492479387,
"learning_rate": 6.753950656940905e-07,
"loss": 0.0687,
"step": 791
},
{
"epoch": 1.6673684210526316,
"grad_norm": 2.6443002936675675,
"learning_rate": 6.671198293627479e-07,
"loss": 0.0663,
"step": 792
},
{
"epoch": 1.6694736842105264,
"grad_norm": 3.990455910934455,
"learning_rate": 6.58891976714764e-07,
"loss": 0.1302,
"step": 793
},
{
"epoch": 1.671578947368421,
"grad_norm": 2.7128378446081336,
"learning_rate": 6.507115977286144e-07,
"loss": 0.0681,
"step": 794
},
{
"epoch": 1.6736842105263157,
"grad_norm": 2.883050359735446,
"learning_rate": 6.425787818636131e-07,
"loss": 0.0864,
"step": 795
},
{
"epoch": 1.6757894736842105,
"grad_norm": 2.6456133797383674,
"learning_rate": 6.34493618058935e-07,
"loss": 0.0587,
"step": 796
},
{
"epoch": 1.6778947368421053,
"grad_norm": 3.139020707158627,
"learning_rate": 6.264561947326331e-07,
"loss": 0.0733,
"step": 797
},
{
"epoch": 1.6800000000000002,
"grad_norm": 3.4935886643877536,
"learning_rate": 6.184665997806832e-07,
"loss": 0.1061,
"step": 798
},
{
"epoch": 1.6821052631578948,
"grad_norm": 3.340770162416328,
"learning_rate": 6.105249205760128e-07,
"loss": 0.1071,
"step": 799
},
{
"epoch": 1.6842105263157894,
"grad_norm": 2.8315166821387794,
"learning_rate": 6.026312439675553e-07,
"loss": 0.0933,
"step": 800
},
{
"epoch": 1.6842105263157894,
"eval_loss": 0.23096558451652527,
"eval_runtime": 0.9217,
"eval_samples_per_second": 42.312,
"eval_steps_per_second": 10.849,
"step": 800
},
{
"epoch": 1.6863157894736842,
"grad_norm": 3.8068745353327795,
"learning_rate": 5.947856562792926e-07,
"loss": 0.0982,
"step": 801
},
{
"epoch": 1.688421052631579,
"grad_norm": 3.143938757336479,
"learning_rate": 5.869882433093154e-07,
"loss": 0.1176,
"step": 802
},
{
"epoch": 1.6905263157894737,
"grad_norm": 3.1475689679942365,
"learning_rate": 5.79239090328883e-07,
"loss": 0.084,
"step": 803
},
{
"epoch": 1.6926315789473683,
"grad_norm": 2.8459510622938557,
"learning_rate": 5.715382820814885e-07,
"loss": 0.0924,
"step": 804
},
{
"epoch": 1.694736842105263,
"grad_norm": 2.7457521722799774,
"learning_rate": 5.63885902781941e-07,
"loss": 0.1167,
"step": 805
},
{
"epoch": 1.696842105263158,
"grad_norm": 2.642505854996809,
"learning_rate": 5.562820361154315e-07,
"loss": 0.0883,
"step": 806
},
{
"epoch": 1.6989473684210528,
"grad_norm": 2.6453141131731805,
"learning_rate": 5.487267652366291e-07,
"loss": 0.1037,
"step": 807
},
{
"epoch": 1.7010526315789474,
"grad_norm": 2.9021664082276577,
"learning_rate": 5.412201727687644e-07,
"loss": 0.0928,
"step": 808
},
{
"epoch": 1.703157894736842,
"grad_norm": 2.845026463465637,
"learning_rate": 5.337623408027293e-07,
"loss": 0.0782,
"step": 809
},
{
"epoch": 1.7052631578947368,
"grad_norm": 2.7790818083094355,
"learning_rate": 5.263533508961827e-07,
"loss": 0.1048,
"step": 810
},
{
"epoch": 1.7073684210526316,
"grad_norm": 3.231780821125863,
"learning_rate": 5.189932840726486e-07,
"loss": 0.0993,
"step": 811
},
{
"epoch": 1.7094736842105265,
"grad_norm": 2.76959876603585,
"learning_rate": 5.116822208206396e-07,
"loss": 0.0762,
"step": 812
},
{
"epoch": 1.711578947368421,
"grad_norm": 2.931427414212537,
"learning_rate": 5.044202410927707e-07,
"loss": 0.1107,
"step": 813
},
{
"epoch": 1.7136842105263157,
"grad_norm": 3.464304450508926,
"learning_rate": 4.972074243048896e-07,
"loss": 0.1182,
"step": 814
},
{
"epoch": 1.7157894736842105,
"grad_norm": 3.0622245546273468,
"learning_rate": 4.900438493352056e-07,
"loss": 0.1385,
"step": 815
},
{
"epoch": 1.7178947368421054,
"grad_norm": 2.6699450462801257,
"learning_rate": 4.829295945234258e-07,
"loss": 0.0927,
"step": 816
},
{
"epoch": 1.72,
"grad_norm": 3.50687183202409,
"learning_rate": 4.758647376699033e-07,
"loss": 0.0874,
"step": 817
},
{
"epoch": 1.7221052631578946,
"grad_norm": 2.4422273374424885,
"learning_rate": 4.6884935603477733e-07,
"loss": 0.0761,
"step": 818
},
{
"epoch": 1.7242105263157894,
"grad_norm": 3.278144620455897,
"learning_rate": 4.6188352633713964e-07,
"loss": 0.0836,
"step": 819
},
{
"epoch": 1.7263157894736842,
"grad_norm": 3.3968120192443365,
"learning_rate": 4.549673247541875e-07,
"loss": 0.0841,
"step": 820
},
{
"epoch": 1.728421052631579,
"grad_norm": 2.8655594086772664,
"learning_rate": 4.48100826920394e-07,
"loss": 0.0942,
"step": 821
},
{
"epoch": 1.7305263157894737,
"grad_norm": 3.3736122986235455,
"learning_rate": 4.412841079266778e-07,
"loss": 0.079,
"step": 822
},
{
"epoch": 1.7326315789473683,
"grad_norm": 2.80681634104848,
"learning_rate": 4.345172423195865e-07,
"loss": 0.078,
"step": 823
},
{
"epoch": 1.7347368421052631,
"grad_norm": 3.017731325212494,
"learning_rate": 4.27800304100478e-07,
"loss": 0.1121,
"step": 824
},
{
"epoch": 1.736842105263158,
"grad_norm": 2.1538534908319016,
"learning_rate": 4.211333667247125e-07,
"loss": 0.0624,
"step": 825
},
{
"epoch": 1.7389473684210528,
"grad_norm": 3.051347701639027,
"learning_rate": 4.1451650310085076e-07,
"loss": 0.1013,
"step": 826
},
{
"epoch": 1.7410526315789474,
"grad_norm": 2.878022799209242,
"learning_rate": 4.079497855898501e-07,
"loss": 0.0813,
"step": 827
},
{
"epoch": 1.743157894736842,
"grad_norm": 2.7337628143067145,
"learning_rate": 4.01433286004283e-07,
"loss": 0.0888,
"step": 828
},
{
"epoch": 1.7452631578947368,
"grad_norm": 3.3634783248963607,
"learning_rate": 3.949670756075447e-07,
"loss": 0.1221,
"step": 829
},
{
"epoch": 1.7473684210526317,
"grad_norm": 2.4140802029160118,
"learning_rate": 3.885512251130763e-07,
"loss": 0.078,
"step": 830
},
{
"epoch": 1.7494736842105263,
"grad_norm": 2.4747610892496494,
"learning_rate": 3.8218580468359136e-07,
"loss": 0.0878,
"step": 831
},
{
"epoch": 1.751578947368421,
"grad_norm": 2.8846811138167348,
"learning_rate": 3.7587088393030604e-07,
"loss": 0.1013,
"step": 832
},
{
"epoch": 1.7536842105263157,
"grad_norm": 3.2969467838779787,
"learning_rate": 3.6960653191218333e-07,
"loss": 0.1004,
"step": 833
},
{
"epoch": 1.7557894736842106,
"grad_norm": 2.876697595239593,
"learning_rate": 3.6339281713517304e-07,
"loss": 0.0822,
"step": 834
},
{
"epoch": 1.7578947368421054,
"grad_norm": 2.600705895879165,
"learning_rate": 3.572298075514652e-07,
"loss": 0.0929,
"step": 835
},
{
"epoch": 1.76,
"grad_norm": 4.100244906164762,
"learning_rate": 3.511175705587433e-07,
"loss": 0.0931,
"step": 836
},
{
"epoch": 1.7621052631578946,
"grad_norm": 2.776561934122947,
"learning_rate": 3.450561729994534e-07,
"loss": 0.0749,
"step": 837
},
{
"epoch": 1.7642105263157895,
"grad_norm": 2.8067264258839106,
"learning_rate": 3.390456811600673e-07,
"loss": 0.1108,
"step": 838
},
{
"epoch": 1.7663157894736843,
"grad_norm": 2.568562849379649,
"learning_rate": 3.3308616077036113e-07,
"loss": 0.085,
"step": 839
},
{
"epoch": 1.768421052631579,
"grad_norm": 3.1802847449938105,
"learning_rate": 3.271776770026963e-07,
"loss": 0.1122,
"step": 840
},
{
"epoch": 1.7705263157894737,
"grad_norm": 3.4764158023876472,
"learning_rate": 3.213202944713023e-07,
"loss": 0.1063,
"step": 841
},
{
"epoch": 1.7726315789473683,
"grad_norm": 3.390798613362417,
"learning_rate": 3.1551407723157734e-07,
"loss": 0.1008,
"step": 842
},
{
"epoch": 1.7747368421052632,
"grad_norm": 3.038218122247456,
"learning_rate": 3.0975908877938277e-07,
"loss": 0.1125,
"step": 843
},
{
"epoch": 1.776842105263158,
"grad_norm": 3.3457689829793487,
"learning_rate": 3.040553920503503e-07,
"loss": 0.1072,
"step": 844
},
{
"epoch": 1.7789473684210526,
"grad_norm": 3.2377107894870925,
"learning_rate": 2.984030494191942e-07,
"loss": 0.0912,
"step": 845
},
{
"epoch": 1.7810526315789472,
"grad_norm": 2.750908478454545,
"learning_rate": 2.928021226990263e-07,
"loss": 0.0792,
"step": 846
},
{
"epoch": 1.783157894736842,
"grad_norm": 2.400795013700124,
"learning_rate": 2.8725267314068496e-07,
"loss": 0.0762,
"step": 847
},
{
"epoch": 1.7852631578947369,
"grad_norm": 3.110512202087771,
"learning_rate": 2.817547614320615e-07,
"loss": 0.0764,
"step": 848
},
{
"epoch": 1.7873684210526317,
"grad_norm": 2.5090668483357805,
"learning_rate": 2.763084476974376e-07,
"loss": 0.1128,
"step": 849
},
{
"epoch": 1.7894736842105263,
"grad_norm": 2.2736980741070543,
"learning_rate": 2.7091379149682683e-07,
"loss": 0.0636,
"step": 850
},
{
"epoch": 1.791578947368421,
"grad_norm": 2.8650745187192848,
"learning_rate": 2.655708518253258e-07,
"loss": 0.0841,
"step": 851
},
{
"epoch": 1.7936842105263158,
"grad_norm": 2.81864315109065,
"learning_rate": 2.602796871124663e-07,
"loss": 0.091,
"step": 852
},
{
"epoch": 1.7957894736842106,
"grad_norm": 2.749063211163363,
"learning_rate": 2.5504035522157853e-07,
"loss": 0.0943,
"step": 853
},
{
"epoch": 1.7978947368421052,
"grad_norm": 3.0229430850580945,
"learning_rate": 2.4985291344915675e-07,
"loss": 0.1269,
"step": 854
},
{
"epoch": 1.8,
"grad_norm": 2.9239017619537053,
"learning_rate": 2.447174185242324e-07,
"loss": 0.0911,
"step": 855
},
{
"epoch": 1.8021052631578947,
"grad_norm": 2.616988483077732,
"learning_rate": 2.3963392660775576e-07,
"loss": 0.083,
"step": 856
},
{
"epoch": 1.8042105263157895,
"grad_norm": 3.2018633874558917,
"learning_rate": 2.3460249329197825e-07,
"loss": 0.0969,
"step": 857
},
{
"epoch": 1.8063157894736843,
"grad_norm": 3.1492659688194813,
"learning_rate": 2.296231735998511e-07,
"loss": 0.0892,
"step": 858
},
{
"epoch": 1.808421052631579,
"grad_norm": 3.374868955965745,
"learning_rate": 2.2469602198441575e-07,
"loss": 0.1232,
"step": 859
},
{
"epoch": 1.8105263157894735,
"grad_norm": 2.910000563908748,
"learning_rate": 2.198210923282118e-07,
"loss": 0.1016,
"step": 860
},
{
"epoch": 1.8126315789473684,
"grad_norm": 2.7882770708484728,
"learning_rate": 2.149984379426906e-07,
"loss": 0.0752,
"step": 861
},
{
"epoch": 1.8147368421052632,
"grad_norm": 3.1470017852346266,
"learning_rate": 2.102281115676258e-07,
"loss": 0.102,
"step": 862
},
{
"epoch": 1.816842105263158,
"grad_norm": 3.2300809492562004,
"learning_rate": 2.0551016537054492e-07,
"loss": 0.1049,
"step": 863
},
{
"epoch": 1.8189473684210526,
"grad_norm": 2.8796612295432484,
"learning_rate": 2.008446509461498e-07,
"loss": 0.0808,
"step": 864
},
{
"epoch": 1.8210526315789473,
"grad_norm": 2.759597506021318,
"learning_rate": 1.962316193157593e-07,
"loss": 0.0695,
"step": 865
},
{
"epoch": 1.823157894736842,
"grad_norm": 3.442533845727977,
"learning_rate": 1.91671120926748e-07,
"loss": 0.0753,
"step": 866
},
{
"epoch": 1.825263157894737,
"grad_norm": 2.966239208218966,
"learning_rate": 1.871632056519962e-07,
"loss": 0.1021,
"step": 867
},
{
"epoch": 1.8273684210526315,
"grad_norm": 2.762544594000791,
"learning_rate": 1.8270792278934302e-07,
"loss": 0.0986,
"step": 868
},
{
"epoch": 1.8294736842105264,
"grad_norm": 3.4023336405774054,
"learning_rate": 1.7830532106104747e-07,
"loss": 0.0895,
"step": 869
},
{
"epoch": 1.831578947368421,
"grad_norm": 2.7309690429581677,
"learning_rate": 1.7395544861325718e-07,
"loss": 0.0782,
"step": 870
},
{
"epoch": 1.8336842105263158,
"grad_norm": 2.8560692270110053,
"learning_rate": 1.696583530154794e-07,
"loss": 0.0902,
"step": 871
},
{
"epoch": 1.8357894736842106,
"grad_norm": 3.0786659593547476,
"learning_rate": 1.6541408126006464e-07,
"loss": 0.1019,
"step": 872
},
{
"epoch": 1.8378947368421052,
"grad_norm": 2.714427085102136,
"learning_rate": 1.6122267976168783e-07,
"loss": 0.1038,
"step": 873
},
{
"epoch": 1.8399999999999999,
"grad_norm": 4.060810899220369,
"learning_rate": 1.5708419435684463e-07,
"loss": 0.0954,
"step": 874
},
{
"epoch": 1.8421052631578947,
"grad_norm": 2.9436675014639704,
"learning_rate": 1.5299867030334815e-07,
"loss": 0.0716,
"step": 875
},
{
"epoch": 1.8442105263157895,
"grad_norm": 2.5848552590925067,
"learning_rate": 1.4896615227983468e-07,
"loss": 0.0553,
"step": 876
},
{
"epoch": 1.8463157894736844,
"grad_norm": 2.7187088217026503,
"learning_rate": 1.4498668438527597e-07,
"loss": 0.0699,
"step": 877
},
{
"epoch": 1.848421052631579,
"grad_norm": 2.6374619830581283,
"learning_rate": 1.4106031013849498e-07,
"loss": 0.0715,
"step": 878
},
{
"epoch": 1.8505263157894736,
"grad_norm": 2.7359260474957736,
"learning_rate": 1.3718707247769137e-07,
"loss": 0.082,
"step": 879
},
{
"epoch": 1.8526315789473684,
"grad_norm": 2.8874455937698236,
"learning_rate": 1.333670137599713e-07,
"loss": 0.113,
"step": 880
},
{
"epoch": 1.8547368421052632,
"grad_norm": 2.3984320677740256,
"learning_rate": 1.2960017576088445e-07,
"loss": 0.0662,
"step": 881
},
{
"epoch": 1.8568421052631578,
"grad_norm": 2.631557349703051,
"learning_rate": 1.2588659967396998e-07,
"loss": 0.0591,
"step": 882
},
{
"epoch": 1.8589473684210527,
"grad_norm": 2.8554369642818824,
"learning_rate": 1.222263261102985e-07,
"loss": 0.0863,
"step": 883
},
{
"epoch": 1.8610526315789473,
"grad_norm": 3.824376982589371,
"learning_rate": 1.1861939509803688e-07,
"loss": 0.0965,
"step": 884
},
{
"epoch": 1.8631578947368421,
"grad_norm": 2.559431945156892,
"learning_rate": 1.1506584608200366e-07,
"loss": 0.0955,
"step": 885
},
{
"epoch": 1.865263157894737,
"grad_norm": 2.3957520209090055,
"learning_rate": 1.1156571792324212e-07,
"loss": 0.0595,
"step": 886
},
{
"epoch": 1.8673684210526316,
"grad_norm": 2.3363953763581704,
"learning_rate": 1.0811904889859337e-07,
"loss": 0.0745,
"step": 887
},
{
"epoch": 1.8694736842105262,
"grad_norm": 3.0084512102418355,
"learning_rate": 1.0472587670027678e-07,
"loss": 0.1011,
"step": 888
},
{
"epoch": 1.871578947368421,
"grad_norm": 2.7795110502364047,
"learning_rate": 1.0138623843548078e-07,
"loss": 0.0806,
"step": 889
},
{
"epoch": 1.8736842105263158,
"grad_norm": 2.1652550097362484,
"learning_rate": 9.810017062595322e-08,
"loss": 0.0608,
"step": 890
},
{
"epoch": 1.8757894736842107,
"grad_norm": 2.6793941600419444,
"learning_rate": 9.486770920760668e-08,
"loss": 0.0806,
"step": 891
},
{
"epoch": 1.8778947368421053,
"grad_norm": 3.2201122598929364,
"learning_rate": 9.16888895301199e-08,
"loss": 0.1135,
"step": 892
},
{
"epoch": 1.88,
"grad_norm": 2.566921068911272,
"learning_rate": 8.856374635655696e-08,
"loss": 0.0688,
"step": 893
},
{
"epoch": 1.8821052631578947,
"grad_norm": 3.1079888316462565,
"learning_rate": 8.549231386298151e-08,
"loss": 0.0997,
"step": 894
},
{
"epoch": 1.8842105263157896,
"grad_norm": 2.7999506483166305,
"learning_rate": 8.247462563808816e-08,
"loss": 0.0946,
"step": 895
},
{
"epoch": 1.8863157894736842,
"grad_norm": 3.367662443206227,
"learning_rate": 7.951071468283166e-08,
"loss": 0.0998,
"step": 896
},
{
"epoch": 1.888421052631579,
"grad_norm": 4.550576913048775,
"learning_rate": 7.660061341006719e-08,
"loss": 0.1035,
"step": 897
},
{
"epoch": 1.8905263157894736,
"grad_norm": 3.2490454117201626,
"learning_rate": 7.374435364419675e-08,
"loss": 0.1205,
"step": 898
},
{
"epoch": 1.8926315789473684,
"grad_norm": 3.011502455562598,
"learning_rate": 7.094196662081832e-08,
"loss": 0.1008,
"step": 899
},
{
"epoch": 1.8947368421052633,
"grad_norm": 2.974407865337836,
"learning_rate": 6.819348298638839e-08,
"loss": 0.0779,
"step": 900
},
{
"epoch": 1.8968421052631579,
"grad_norm": 2.895980879467302,
"learning_rate": 6.549893279788278e-08,
"loss": 0.0805,
"step": 901
},
{
"epoch": 1.8989473684210525,
"grad_norm": 2.8922313610110923,
"learning_rate": 6.285834552247127e-08,
"loss": 0.1067,
"step": 902
},
{
"epoch": 1.9010526315789473,
"grad_norm": 3.353278867338892,
"learning_rate": 6.027175003719354e-08,
"loss": 0.0958,
"step": 903
},
{
"epoch": 1.9031578947368422,
"grad_norm": 2.650567655819524,
"learning_rate": 5.773917462864265e-08,
"loss": 0.0702,
"step": 904
},
{
"epoch": 1.905263157894737,
"grad_norm": 2.6264185528158834,
"learning_rate": 5.526064699265754e-08,
"loss": 0.0912,
"step": 905
},
{
"epoch": 1.9073684210526316,
"grad_norm": 2.527286884707186,
"learning_rate": 5.2836194234019976e-08,
"loss": 0.0771,
"step": 906
},
{
"epoch": 1.9094736842105262,
"grad_norm": 2.612912180177236,
"learning_rate": 5.0465842866156965e-08,
"loss": 0.0666,
"step": 907
},
{
"epoch": 1.911578947368421,
"grad_norm": 2.8431243287193877,
"learning_rate": 4.8149618810850454e-08,
"loss": 0.0802,
"step": 908
},
{
"epoch": 1.9136842105263159,
"grad_norm": 2.5645766842154276,
"learning_rate": 4.588754739795587e-08,
"loss": 0.0793,
"step": 909
},
{
"epoch": 1.9157894736842105,
"grad_norm": 3.545293325505326,
"learning_rate": 4.367965336512403e-08,
"loss": 0.0987,
"step": 910
},
{
"epoch": 1.917894736842105,
"grad_norm": 3.0722983206898045,
"learning_rate": 4.1525960857530244e-08,
"loss": 0.0804,
"step": 911
},
{
"epoch": 1.92,
"grad_norm": 2.079813956741024,
"learning_rate": 3.9426493427611177e-08,
"loss": 0.0681,
"step": 912
},
{
"epoch": 1.9221052631578948,
"grad_norm": 3.574578189030846,
"learning_rate": 3.738127403480507e-08,
"loss": 0.1216,
"step": 913
},
{
"epoch": 1.9242105263157896,
"grad_norm": 2.575539056464793,
"learning_rate": 3.5390325045304704e-08,
"loss": 0.0517,
"step": 914
},
{
"epoch": 1.9263157894736842,
"grad_norm": 3.0048897811038255,
"learning_rate": 3.345366823180929e-08,
"loss": 0.0931,
"step": 915
},
{
"epoch": 1.9284210526315788,
"grad_norm": 3.046405391039599,
"learning_rate": 3.1571324773286284e-08,
"loss": 0.0969,
"step": 916
},
{
"epoch": 1.9305263157894736,
"grad_norm": 3.5520843745316872,
"learning_rate": 2.9743315254743834e-08,
"loss": 0.0782,
"step": 917
},
{
"epoch": 1.9326315789473685,
"grad_norm": 2.7132714135482208,
"learning_rate": 2.7969659666999273e-08,
"loss": 0.1037,
"step": 918
},
{
"epoch": 1.9347368421052633,
"grad_norm": 3.123212441352547,
"learning_rate": 2.625037740646763e-08,
"loss": 0.0882,
"step": 919
},
{
"epoch": 1.936842105263158,
"grad_norm": 3.646324477345249,
"learning_rate": 2.4585487274942922e-08,
"loss": 0.1263,
"step": 920
},
{
"epoch": 1.9389473684210525,
"grad_norm": 2.5318288248240397,
"learning_rate": 2.2975007479397736e-08,
"loss": 0.0716,
"step": 921
},
{
"epoch": 1.9410526315789474,
"grad_norm": 3.2685591223413817,
"learning_rate": 2.1418955631781203e-08,
"loss": 0.1168,
"step": 922
},
{
"epoch": 1.9431578947368422,
"grad_norm": 3.2425891311790136,
"learning_rate": 1.9917348748826337e-08,
"loss": 0.1161,
"step": 923
},
{
"epoch": 1.9452631578947368,
"grad_norm": 3.4814985891690884,
"learning_rate": 1.847020325186577e-08,
"loss": 0.0773,
"step": 924
},
{
"epoch": 1.9473684210526314,
"grad_norm": 2.9962557150283393,
"learning_rate": 1.7077534966650767e-08,
"loss": 0.1099,
"step": 925
},
{
"epoch": 1.9494736842105262,
"grad_norm": 3.1838150741664784,
"learning_rate": 1.5739359123178587e-08,
"loss": 0.1119,
"step": 926
},
{
"epoch": 1.951578947368421,
"grad_norm": 2.9531176654422855,
"learning_rate": 1.4455690355525964e-08,
"loss": 0.0849,
"step": 927
},
{
"epoch": 1.953684210526316,
"grad_norm": 3.237219758595749,
"learning_rate": 1.3226542701689215e-08,
"loss": 0.0956,
"step": 928
},
{
"epoch": 1.9557894736842105,
"grad_norm": 2.6636107881636657,
"learning_rate": 1.2051929603428824e-08,
"loss": 0.0833,
"step": 929
},
{
"epoch": 1.9578947368421051,
"grad_norm": 3.0753931013806675,
"learning_rate": 1.0931863906127327e-08,
"loss": 0.096,
"step": 930
},
{
"epoch": 1.96,
"grad_norm": 3.395827147776403,
"learning_rate": 9.866357858642206e-09,
"loss": 0.0886,
"step": 931
},
{
"epoch": 1.9621052631578948,
"grad_norm": 2.4050339156958307,
"learning_rate": 8.855423113177664e-09,
"loss": 0.0751,
"step": 932
},
{
"epoch": 1.9642105263157896,
"grad_norm": 2.6642559497093905,
"learning_rate": 7.899070725153612e-09,
"loss": 0.0684,
"step": 933
},
{
"epoch": 1.9663157894736842,
"grad_norm": 3.73805463863678,
"learning_rate": 6.997311153086883e-09,
"loss": 0.0979,
"step": 934
},
{
"epoch": 1.9684210526315788,
"grad_norm": 2.980994237992295,
"learning_rate": 6.150154258476315e-09,
"loss": 0.0797,
"step": 935
},
{
"epoch": 1.9705263157894737,
"grad_norm": 3.4371787351109333,
"learning_rate": 5.357609305692291e-09,
"loss": 0.103,
"step": 936
},
{
"epoch": 1.9726315789473685,
"grad_norm": 2.6478564332335193,
"learning_rate": 4.619684961881255e-09,
"loss": 0.0771,
"step": 937
},
{
"epoch": 1.9747368421052631,
"grad_norm": 2.4729221890510167,
"learning_rate": 3.936389296864129e-09,
"loss": 0.0667,
"step": 938
},
{
"epoch": 1.9768421052631577,
"grad_norm": 2.9102631943004686,
"learning_rate": 3.307729783054159e-09,
"loss": 0.1091,
"step": 939
},
{
"epoch": 1.9789473684210526,
"grad_norm": 2.655897353757651,
"learning_rate": 2.7337132953697555e-09,
"loss": 0.058,
"step": 940
},
{
"epoch": 1.9810526315789474,
"grad_norm": 2.971572950538632,
"learning_rate": 2.214346111164556e-09,
"loss": 0.1048,
"step": 941
},
{
"epoch": 1.9831578947368422,
"grad_norm": 2.889850391546724,
"learning_rate": 1.749633910153592e-09,
"loss": 0.087,
"step": 942
},
{
"epoch": 1.9852631578947368,
"grad_norm": 2.9975744327597327,
"learning_rate": 1.3395817743561135e-09,
"loss": 0.0898,
"step": 943
},
{
"epoch": 1.9873684210526315,
"grad_norm": 2.754815439262134,
"learning_rate": 9.841941880361917e-10,
"loss": 0.0916,
"step": 944
},
{
"epoch": 1.9894736842105263,
"grad_norm": 3.2077861868156314,
"learning_rate": 6.834750376549793e-10,
"loss": 0.1003,
"step": 945
},
{
"epoch": 1.9915789473684211,
"grad_norm": 2.908656770112915,
"learning_rate": 4.374276118301879e-10,
"loss": 0.0828,
"step": 946
},
{
"epoch": 1.993684210526316,
"grad_norm": 2.933260030488579,
"learning_rate": 2.4605460129556446e-10,
"loss": 0.09,
"step": 947
},
{
"epoch": 1.9957894736842106,
"grad_norm": 2.4387802348581027,
"learning_rate": 1.0935809887702154e-10,
"loss": 0.0727,
"step": 948
},
{
"epoch": 1.9978947368421052,
"grad_norm": 2.543159609703416,
"learning_rate": 2.733959946432663e-11,
"loss": 0.0934,
"step": 949
},
{
"epoch": 2.0,
"grad_norm": 2.6404824543206735,
"learning_rate": 0.0,
"loss": 0.0841,
"step": 950
},
{
"epoch": 2.0,
"step": 950,
"total_flos": 1782751297536.0,
"train_loss": 0.18172799837824546,
"train_runtime": 438.8142,
"train_samples_per_second": 17.315,
"train_steps_per_second": 2.165
}
],
"logging_steps": 1,
"max_steps": 950,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1782751297536.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}