{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9671179883945842,
"eval_steps": 500,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00048355899419729207,
"grad_norm": 1.1121258735656738,
"learning_rate": 0.0,
"loss": 0.8143,
"step": 1
},
{
"epoch": 0.0009671179883945841,
"grad_norm": 1.0534573793411255,
"learning_rate": 5.0000000000000004e-08,
"loss": 0.7758,
"step": 2
},
{
"epoch": 0.0014506769825918763,
"grad_norm": 1.1191091537475586,
"learning_rate": 1.0000000000000001e-07,
"loss": 0.7958,
"step": 3
},
{
"epoch": 0.0019342359767891683,
"grad_norm": 1.0339659452438354,
"learning_rate": 1.5000000000000002e-07,
"loss": 0.74,
"step": 4
},
{
"epoch": 0.0024177949709864605,
"grad_norm": 1.097814917564392,
"learning_rate": 2.0000000000000002e-07,
"loss": 0.7844,
"step": 5
},
{
"epoch": 0.0029013539651837525,
"grad_norm": 1.0650779008865356,
"learning_rate": 2.5000000000000004e-07,
"loss": 0.7331,
"step": 6
},
{
"epoch": 0.0033849129593810446,
"grad_norm": 1.609460473060608,
"learning_rate": 3.0000000000000004e-07,
"loss": 0.7787,
"step": 7
},
{
"epoch": 0.0038684719535783366,
"grad_norm": 1.2944858074188232,
"learning_rate": 3.5000000000000004e-07,
"loss": 0.7915,
"step": 8
},
{
"epoch": 0.004352030947775629,
"grad_norm": 1.1526292562484741,
"learning_rate": 4.0000000000000003e-07,
"loss": 0.803,
"step": 9
},
{
"epoch": 0.004835589941972921,
"grad_norm": 1.0605498552322388,
"learning_rate": 4.5000000000000003e-07,
"loss": 0.7525,
"step": 10
},
{
"epoch": 0.005319148936170213,
"grad_norm": 1.080625057220459,
"learning_rate": 5.000000000000001e-07,
"loss": 0.7566,
"step": 11
},
{
"epoch": 0.005802707930367505,
"grad_norm": 1.035576581954956,
"learning_rate": 5.5e-07,
"loss": 0.7419,
"step": 12
},
{
"epoch": 0.006286266924564797,
"grad_norm": 1.111180305480957,
"learning_rate": 6.000000000000001e-07,
"loss": 0.795,
"step": 13
},
{
"epoch": 0.006769825918762089,
"grad_norm": 1.0369406938552856,
"learning_rate": 6.5e-07,
"loss": 0.7713,
"step": 14
},
{
"epoch": 0.007253384912959381,
"grad_norm": 1.0771433115005493,
"learning_rate": 7.000000000000001e-07,
"loss": 0.7698,
"step": 15
},
{
"epoch": 0.007736943907156673,
"grad_norm": 1.0160727500915527,
"learning_rate": 7.5e-07,
"loss": 0.78,
"step": 16
},
{
"epoch": 0.008220502901353965,
"grad_norm": 0.992279052734375,
"learning_rate": 8.000000000000001e-07,
"loss": 0.7704,
"step": 17
},
{
"epoch": 0.008704061895551257,
"grad_norm": 1.0474507808685303,
"learning_rate": 8.500000000000001e-07,
"loss": 0.7584,
"step": 18
},
{
"epoch": 0.00918762088974855,
"grad_norm": 1.024119257926941,
"learning_rate": 9.000000000000001e-07,
"loss": 0.7809,
"step": 19
},
{
"epoch": 0.009671179883945842,
"grad_norm": 0.9913584589958191,
"learning_rate": 9.500000000000001e-07,
"loss": 0.7973,
"step": 20
},
{
"epoch": 0.010154738878143133,
"grad_norm": 0.9856535196304321,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.7498,
"step": 21
},
{
"epoch": 0.010638297872340425,
"grad_norm": 1.2484019994735718,
"learning_rate": 1.0500000000000001e-06,
"loss": 0.7672,
"step": 22
},
{
"epoch": 0.011121856866537718,
"grad_norm": 0.8720110058784485,
"learning_rate": 1.1e-06,
"loss": 0.7411,
"step": 23
},
{
"epoch": 0.01160541586073501,
"grad_norm": 1.005557894706726,
"learning_rate": 1.1500000000000002e-06,
"loss": 0.7659,
"step": 24
},
{
"epoch": 0.012088974854932301,
"grad_norm": 0.8640859723091125,
"learning_rate": 1.2000000000000002e-06,
"loss": 0.7328,
"step": 25
},
{
"epoch": 0.012572533849129593,
"grad_norm": 0.8791433572769165,
"learning_rate": 1.25e-06,
"loss": 0.7549,
"step": 26
},
{
"epoch": 0.013056092843326886,
"grad_norm": 0.8279618620872498,
"learning_rate": 1.3e-06,
"loss": 0.7252,
"step": 27
},
{
"epoch": 0.013539651837524178,
"grad_norm": 0.8029568195343018,
"learning_rate": 1.3500000000000002e-06,
"loss": 0.7296,
"step": 28
},
{
"epoch": 0.01402321083172147,
"grad_norm": 0.8325296640396118,
"learning_rate": 1.4000000000000001e-06,
"loss": 0.7522,
"step": 29
},
{
"epoch": 0.014506769825918761,
"grad_norm": 0.9094786643981934,
"learning_rate": 1.45e-06,
"loss": 0.7436,
"step": 30
},
{
"epoch": 0.014990328820116054,
"grad_norm": 0.7980680465698242,
"learning_rate": 1.5e-06,
"loss": 0.7018,
"step": 31
},
{
"epoch": 0.015473887814313346,
"grad_norm": 0.7666404247283936,
"learning_rate": 1.5500000000000002e-06,
"loss": 0.7499,
"step": 32
},
{
"epoch": 0.015957446808510637,
"grad_norm": 0.7892335653305054,
"learning_rate": 1.6000000000000001e-06,
"loss": 0.7287,
"step": 33
},
{
"epoch": 0.01644100580270793,
"grad_norm": 0.7180586457252502,
"learning_rate": 1.6500000000000003e-06,
"loss": 0.7548,
"step": 34
},
{
"epoch": 0.016924564796905222,
"grad_norm": 0.6945351958274841,
"learning_rate": 1.7000000000000002e-06,
"loss": 0.7032,
"step": 35
},
{
"epoch": 0.017408123791102514,
"grad_norm": 0.752153754234314,
"learning_rate": 1.75e-06,
"loss": 0.7249,
"step": 36
},
{
"epoch": 0.017891682785299807,
"grad_norm": 0.6585816144943237,
"learning_rate": 1.8000000000000001e-06,
"loss": 0.7217,
"step": 37
},
{
"epoch": 0.0183752417794971,
"grad_norm": 0.6001906991004944,
"learning_rate": 1.85e-06,
"loss": 0.7099,
"step": 38
},
{
"epoch": 0.018858800773694392,
"grad_norm": 0.5837531089782715,
"learning_rate": 1.9000000000000002e-06,
"loss": 0.678,
"step": 39
},
{
"epoch": 0.019342359767891684,
"grad_norm": 0.5784347057342529,
"learning_rate": 1.9500000000000004e-06,
"loss": 0.7126,
"step": 40
},
{
"epoch": 0.019825918762088973,
"grad_norm": 0.574742317199707,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.702,
"step": 41
},
{
"epoch": 0.020309477756286266,
"grad_norm": 0.6225464940071106,
"learning_rate": 2.05e-06,
"loss": 0.6716,
"step": 42
},
{
"epoch": 0.020793036750483558,
"grad_norm": 0.5501518845558167,
"learning_rate": 2.1000000000000002e-06,
"loss": 0.7047,
"step": 43
},
{
"epoch": 0.02127659574468085,
"grad_norm": 0.5282008051872253,
"learning_rate": 2.15e-06,
"loss": 0.6777,
"step": 44
},
{
"epoch": 0.021760154738878143,
"grad_norm": 0.5413048267364502,
"learning_rate": 2.2e-06,
"loss": 0.7094,
"step": 45
},
{
"epoch": 0.022243713733075435,
"grad_norm": 0.511887788772583,
"learning_rate": 2.25e-06,
"loss": 0.6785,
"step": 46
},
{
"epoch": 0.022727272727272728,
"grad_norm": 0.544187068939209,
"learning_rate": 2.3000000000000004e-06,
"loss": 0.6744,
"step": 47
},
{
"epoch": 0.02321083172147002,
"grad_norm": 0.5064325928688049,
"learning_rate": 2.35e-06,
"loss": 0.6593,
"step": 48
},
{
"epoch": 0.023694390715667313,
"grad_norm": 0.5299221873283386,
"learning_rate": 2.4000000000000003e-06,
"loss": 0.645,
"step": 49
},
{
"epoch": 0.024177949709864602,
"grad_norm": 0.5178957581520081,
"learning_rate": 2.4500000000000003e-06,
"loss": 0.6697,
"step": 50
},
{
"epoch": 0.024661508704061894,
"grad_norm": 0.5200456976890564,
"learning_rate": 2.5e-06,
"loss": 0.6692,
"step": 51
},
{
"epoch": 0.025145067698259187,
"grad_norm": 0.4843021333217621,
"learning_rate": 2.55e-06,
"loss": 0.6505,
"step": 52
},
{
"epoch": 0.02562862669245648,
"grad_norm": 0.4847952127456665,
"learning_rate": 2.6e-06,
"loss": 0.6427,
"step": 53
},
{
"epoch": 0.02611218568665377,
"grad_norm": 0.49867674708366394,
"learning_rate": 2.6500000000000005e-06,
"loss": 0.677,
"step": 54
},
{
"epoch": 0.026595744680851064,
"grad_norm": 0.5223937630653381,
"learning_rate": 2.7000000000000004e-06,
"loss": 0.6494,
"step": 55
},
{
"epoch": 0.027079303675048357,
"grad_norm": 0.4924434721469879,
"learning_rate": 2.7500000000000004e-06,
"loss": 0.642,
"step": 56
},
{
"epoch": 0.02756286266924565,
"grad_norm": 0.5522122383117676,
"learning_rate": 2.8000000000000003e-06,
"loss": 0.6709,
"step": 57
},
{
"epoch": 0.02804642166344294,
"grad_norm": 0.49998044967651367,
"learning_rate": 2.85e-06,
"loss": 0.6633,
"step": 58
},
{
"epoch": 0.02852998065764023,
"grad_norm": 0.6035799980163574,
"learning_rate": 2.9e-06,
"loss": 0.6305,
"step": 59
},
{
"epoch": 0.029013539651837523,
"grad_norm": 0.7769137620925903,
"learning_rate": 2.95e-06,
"loss": 0.6307,
"step": 60
},
{
"epoch": 0.029497098646034815,
"grad_norm": 0.4490588307380676,
"learning_rate": 3e-06,
"loss": 0.6334,
"step": 61
},
{
"epoch": 0.029980657640232108,
"grad_norm": 0.42361530661582947,
"learning_rate": 3.05e-06,
"loss": 0.6053,
"step": 62
},
{
"epoch": 0.0304642166344294,
"grad_norm": 0.4436582326889038,
"learning_rate": 3.1000000000000004e-06,
"loss": 0.6293,
"step": 63
},
{
"epoch": 0.030947775628626693,
"grad_norm": 0.4686850607395172,
"learning_rate": 3.1500000000000003e-06,
"loss": 0.6568,
"step": 64
},
{
"epoch": 0.031431334622823985,
"grad_norm": 0.44556960463523865,
"learning_rate": 3.2000000000000003e-06,
"loss": 0.6248,
"step": 65
},
{
"epoch": 0.031914893617021274,
"grad_norm": 0.4263205826282501,
"learning_rate": 3.2500000000000002e-06,
"loss": 0.6153,
"step": 66
},
{
"epoch": 0.03239845261121857,
"grad_norm": 0.4737732708454132,
"learning_rate": 3.3000000000000006e-06,
"loss": 0.6566,
"step": 67
},
{
"epoch": 0.03288201160541586,
"grad_norm": 0.4620726704597473,
"learning_rate": 3.3500000000000005e-06,
"loss": 0.6119,
"step": 68
},
{
"epoch": 0.033365570599613155,
"grad_norm": 0.46939659118652344,
"learning_rate": 3.4000000000000005e-06,
"loss": 0.6154,
"step": 69
},
{
"epoch": 0.033849129593810444,
"grad_norm": 0.4302070736885071,
"learning_rate": 3.45e-06,
"loss": 0.6399,
"step": 70
},
{
"epoch": 0.03433268858800774,
"grad_norm": 0.4119694232940674,
"learning_rate": 3.5e-06,
"loss": 0.6211,
"step": 71
},
{
"epoch": 0.03481624758220503,
"grad_norm": 0.4920046925544739,
"learning_rate": 3.5500000000000003e-06,
"loss": 0.6381,
"step": 72
},
{
"epoch": 0.03529980657640232,
"grad_norm": 0.51338130235672,
"learning_rate": 3.6000000000000003e-06,
"loss": 0.6203,
"step": 73
},
{
"epoch": 0.035783365570599614,
"grad_norm": 0.4448227882385254,
"learning_rate": 3.65e-06,
"loss": 0.6157,
"step": 74
},
{
"epoch": 0.0362669245647969,
"grad_norm": 0.6254576444625854,
"learning_rate": 3.7e-06,
"loss": 0.6427,
"step": 75
},
{
"epoch": 0.0367504835589942,
"grad_norm": 0.4658832848072052,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.6257,
"step": 76
},
{
"epoch": 0.03723404255319149,
"grad_norm": 0.41669055819511414,
"learning_rate": 3.8000000000000005e-06,
"loss": 0.6104,
"step": 77
},
{
"epoch": 0.037717601547388784,
"grad_norm": 0.4414327144622803,
"learning_rate": 3.85e-06,
"loss": 0.5986,
"step": 78
},
{
"epoch": 0.03820116054158607,
"grad_norm": 0.4977372884750366,
"learning_rate": 3.900000000000001e-06,
"loss": 0.6349,
"step": 79
},
{
"epoch": 0.03868471953578337,
"grad_norm": 0.4130990505218506,
"learning_rate": 3.95e-06,
"loss": 0.5956,
"step": 80
},
{
"epoch": 0.03916827852998066,
"grad_norm": 0.42152485251426697,
"learning_rate": 4.000000000000001e-06,
"loss": 0.6196,
"step": 81
},
{
"epoch": 0.039651837524177946,
"grad_norm": 0.4156739115715027,
"learning_rate": 4.05e-06,
"loss": 0.6064,
"step": 82
},
{
"epoch": 0.04013539651837524,
"grad_norm": 0.4026014804840088,
"learning_rate": 4.1e-06,
"loss": 0.6334,
"step": 83
},
{
"epoch": 0.04061895551257253,
"grad_norm": 1.8549836874008179,
"learning_rate": 4.15e-06,
"loss": 0.6143,
"step": 84
},
{
"epoch": 0.04110251450676983,
"grad_norm": 0.39867880940437317,
"learning_rate": 4.2000000000000004e-06,
"loss": 0.5951,
"step": 85
},
{
"epoch": 0.041586073500967116,
"grad_norm": 0.45038288831710815,
"learning_rate": 4.25e-06,
"loss": 0.5865,
"step": 86
},
{
"epoch": 0.04206963249516441,
"grad_norm": 0.43607354164123535,
"learning_rate": 4.3e-06,
"loss": 0.6164,
"step": 87
},
{
"epoch": 0.0425531914893617,
"grad_norm": 0.46121928095817566,
"learning_rate": 4.350000000000001e-06,
"loss": 0.5702,
"step": 88
},
{
"epoch": 0.043036750483559,
"grad_norm": 2.1593496799468994,
"learning_rate": 4.4e-06,
"loss": 0.6221,
"step": 89
},
{
"epoch": 0.043520309477756286,
"grad_norm": 0.4066154360771179,
"learning_rate": 4.450000000000001e-06,
"loss": 0.6203,
"step": 90
},
{
"epoch": 0.044003868471953575,
"grad_norm": 0.5225070118904114,
"learning_rate": 4.5e-06,
"loss": 0.5805,
"step": 91
},
{
"epoch": 0.04448742746615087,
"grad_norm": 0.5109372138977051,
"learning_rate": 4.5500000000000005e-06,
"loss": 0.5914,
"step": 92
},
{
"epoch": 0.04497098646034816,
"grad_norm": 0.42148903012275696,
"learning_rate": 4.600000000000001e-06,
"loss": 0.5955,
"step": 93
},
{
"epoch": 0.045454545454545456,
"grad_norm": 0.40933647751808167,
"learning_rate": 4.65e-06,
"loss": 0.6053,
"step": 94
},
{
"epoch": 0.045938104448742745,
"grad_norm": 0.6771563291549683,
"learning_rate": 4.7e-06,
"loss": 0.5938,
"step": 95
},
{
"epoch": 0.04642166344294004,
"grad_norm": 0.4065110683441162,
"learning_rate": 4.75e-06,
"loss": 0.6004,
"step": 96
},
{
"epoch": 0.04690522243713733,
"grad_norm": 0.4085659682750702,
"learning_rate": 4.800000000000001e-06,
"loss": 0.5791,
"step": 97
},
{
"epoch": 0.047388781431334626,
"grad_norm": 0.40749242901802063,
"learning_rate": 4.85e-06,
"loss": 0.5911,
"step": 98
},
{
"epoch": 0.047872340425531915,
"grad_norm": 0.402582049369812,
"learning_rate": 4.9000000000000005e-06,
"loss": 0.6085,
"step": 99
},
{
"epoch": 0.048355899419729204,
"grad_norm": 0.6223399043083191,
"learning_rate": 4.95e-06,
"loss": 0.5811,
"step": 100
},
{
"epoch": 0.0488394584139265,
"grad_norm": 0.7025216221809387,
"learning_rate": 5e-06,
"loss": 0.5763,
"step": 101
},
{
"epoch": 0.04932301740812379,
"grad_norm": 0.4087183475494385,
"learning_rate": 4.99999991856056e-06,
"loss": 0.5868,
"step": 102
},
{
"epoch": 0.049806576402321084,
"grad_norm": 0.408280611038208,
"learning_rate": 4.999999674242244e-06,
"loss": 0.5668,
"step": 103
},
{
"epoch": 0.05029013539651837,
"grad_norm": 0.4054011106491089,
"learning_rate": 4.9999992670450685e-06,
"loss": 0.5912,
"step": 104
},
{
"epoch": 0.05077369439071567,
"grad_norm": 0.3877287209033966,
"learning_rate": 4.99999869696906e-06,
"loss": 0.5739,
"step": 105
},
{
"epoch": 0.05125725338491296,
"grad_norm": 0.4031350314617157,
"learning_rate": 4.999997964014256e-06,
"loss": 0.5982,
"step": 106
},
{
"epoch": 0.051740812379110254,
"grad_norm": 0.43434491753578186,
"learning_rate": 4.999997068180702e-06,
"loss": 0.5815,
"step": 107
},
{
"epoch": 0.05222437137330754,
"grad_norm": 0.4342786371707916,
"learning_rate": 4.99999600946846e-06,
"loss": 0.5841,
"step": 108
},
{
"epoch": 0.05270793036750483,
"grad_norm": 0.43785515427589417,
"learning_rate": 4.999994787877597e-06,
"loss": 0.5584,
"step": 109
},
{
"epoch": 0.05319148936170213,
"grad_norm": 0.38336819410324097,
"learning_rate": 4.999993403408192e-06,
"loss": 0.5736,
"step": 110
},
{
"epoch": 0.05367504835589942,
"grad_norm": 0.43652138113975525,
"learning_rate": 4.999991856060336e-06,
"loss": 0.5764,
"step": 111
},
{
"epoch": 0.05415860735009671,
"grad_norm": 0.4337776303291321,
"learning_rate": 4.999990145834131e-06,
"loss": 0.5784,
"step": 112
},
{
"epoch": 0.054642166344294,
"grad_norm": 0.43474823236465454,
"learning_rate": 4.999988272729685e-06,
"loss": 0.5687,
"step": 113
},
{
"epoch": 0.0551257253384913,
"grad_norm": 0.4115881025791168,
"learning_rate": 4.999986236747124e-06,
"loss": 0.5504,
"step": 114
},
{
"epoch": 0.05560928433268859,
"grad_norm": 1.258635401725769,
"learning_rate": 4.999984037886578e-06,
"loss": 0.5809,
"step": 115
},
{
"epoch": 0.05609284332688588,
"grad_norm": 0.46500417590141296,
"learning_rate": 4.999981676148191e-06,
"loss": 0.5855,
"step": 116
},
{
"epoch": 0.05657640232108317,
"grad_norm": 0.479522705078125,
"learning_rate": 4.999979151532119e-06,
"loss": 0.6054,
"step": 117
},
{
"epoch": 0.05705996131528046,
"grad_norm": 0.39936313033103943,
"learning_rate": 4.999976464038522e-06,
"loss": 0.5517,
"step": 118
},
{
"epoch": 0.05754352030947776,
"grad_norm": 0.597406268119812,
"learning_rate": 4.999973613667578e-06,
"loss": 0.5743,
"step": 119
},
{
"epoch": 0.058027079303675046,
"grad_norm": 0.6071715354919434,
"learning_rate": 4.999970600419474e-06,
"loss": 0.5463,
"step": 120
},
{
"epoch": 0.05851063829787234,
"grad_norm": 0.4146825671195984,
"learning_rate": 4.999967424294403e-06,
"loss": 0.5753,
"step": 121
},
{
"epoch": 0.05899419729206963,
"grad_norm": 0.48899951577186584,
"learning_rate": 4.999964085292573e-06,
"loss": 0.5582,
"step": 122
},
{
"epoch": 0.059477756286266927,
"grad_norm": 0.4463071823120117,
"learning_rate": 4.999960583414204e-06,
"loss": 0.5761,
"step": 123
},
{
"epoch": 0.059961315280464215,
"grad_norm": 0.4876002073287964,
"learning_rate": 4.999956918659521e-06,
"loss": 0.5842,
"step": 124
},
{
"epoch": 0.06044487427466151,
"grad_norm": 0.6326982975006104,
"learning_rate": 4.999953091028764e-06,
"loss": 0.5683,
"step": 125
},
{
"epoch": 0.0609284332688588,
"grad_norm": 0.45847585797309875,
"learning_rate": 4.999949100522183e-06,
"loss": 0.5895,
"step": 126
},
{
"epoch": 0.06141199226305609,
"grad_norm": 0.45837166905403137,
"learning_rate": 4.9999449471400364e-06,
"loss": 0.5595,
"step": 127
},
{
"epoch": 0.061895551257253385,
"grad_norm": 0.404506653547287,
"learning_rate": 4.999940630882597e-06,
"loss": 0.5798,
"step": 128
},
{
"epoch": 0.062379110251450674,
"grad_norm": 0.38707059621810913,
"learning_rate": 4.999936151750143e-06,
"loss": 0.5825,
"step": 129
},
{
"epoch": 0.06286266924564797,
"grad_norm": 0.4148559868335724,
"learning_rate": 4.99993150974297e-06,
"loss": 0.5782,
"step": 130
},
{
"epoch": 0.06334622823984526,
"grad_norm": 0.44726136326789856,
"learning_rate": 4.999926704861377e-06,
"loss": 0.5842,
"step": 131
},
{
"epoch": 0.06382978723404255,
"grad_norm": 0.4477657973766327,
"learning_rate": 4.999921737105678e-06,
"loss": 0.5649,
"step": 132
},
{
"epoch": 0.06431334622823985,
"grad_norm": 0.45821887254714966,
"learning_rate": 4.999916606476199e-06,
"loss": 0.5944,
"step": 133
},
{
"epoch": 0.06479690522243714,
"grad_norm": 0.4118689298629761,
"learning_rate": 4.999911312973271e-06,
"loss": 0.5617,
"step": 134
},
{
"epoch": 0.06528046421663443,
"grad_norm": 0.8030620813369751,
"learning_rate": 4.999905856597241e-06,
"loss": 0.5754,
"step": 135
},
{
"epoch": 0.06576402321083172,
"grad_norm": 0.42012104392051697,
"learning_rate": 4.999900237348463e-06,
"loss": 0.5944,
"step": 136
},
{
"epoch": 0.06624758220502901,
"grad_norm": 0.3971441388130188,
"learning_rate": 4.999894455227304e-06,
"loss": 0.5429,
"step": 137
},
{
"epoch": 0.06673114119922631,
"grad_norm": 0.44027745723724365,
"learning_rate": 4.999888510234141e-06,
"loss": 0.5817,
"step": 138
},
{
"epoch": 0.0672147001934236,
"grad_norm": 0.4141297936439514,
"learning_rate": 4.999882402369361e-06,
"loss": 0.588,
"step": 139
},
{
"epoch": 0.06769825918762089,
"grad_norm": 0.4220448434352875,
"learning_rate": 4.999876131633361e-06,
"loss": 0.5588,
"step": 140
},
{
"epoch": 0.06818181818181818,
"grad_norm": 0.47313711047172546,
"learning_rate": 4.999869698026551e-06,
"loss": 0.5783,
"step": 141
},
{
"epoch": 0.06866537717601548,
"grad_norm": 0.8543782234191895,
"learning_rate": 4.99986310154935e-06,
"loss": 0.5635,
"step": 142
},
{
"epoch": 0.06914893617021277,
"grad_norm": 0.4882458448410034,
"learning_rate": 4.999856342202187e-06,
"loss": 0.5568,
"step": 143
},
{
"epoch": 0.06963249516441006,
"grad_norm": 0.395766943693161,
"learning_rate": 4.999849419985502e-06,
"loss": 0.5607,
"step": 144
},
{
"epoch": 0.07011605415860735,
"grad_norm": 0.4293517768383026,
"learning_rate": 4.999842334899748e-06,
"loss": 0.5755,
"step": 145
},
{
"epoch": 0.07059961315280464,
"grad_norm": 0.41062092781066895,
"learning_rate": 4.999835086945384e-06,
"loss": 0.5569,
"step": 146
},
{
"epoch": 0.07108317214700194,
"grad_norm": 0.40252384543418884,
"learning_rate": 4.999827676122884e-06,
"loss": 0.5454,
"step": 147
},
{
"epoch": 0.07156673114119923,
"grad_norm": 0.4043247699737549,
"learning_rate": 4.999820102432731e-06,
"loss": 0.5663,
"step": 148
},
{
"epoch": 0.07205029013539652,
"grad_norm": 0.45598798990249634,
"learning_rate": 4.999812365875417e-06,
"loss": 0.5611,
"step": 149
},
{
"epoch": 0.0725338491295938,
"grad_norm": 0.46978089213371277,
"learning_rate": 4.999804466451446e-06,
"loss": 0.5454,
"step": 150
},
{
"epoch": 0.07301740812379111,
"grad_norm": 0.40247493982315063,
"learning_rate": 4.999796404161335e-06,
"loss": 0.55,
"step": 151
},
{
"epoch": 0.0735009671179884,
"grad_norm": 0.3953741192817688,
"learning_rate": 4.999788179005608e-06,
"loss": 0.5682,
"step": 152
},
{
"epoch": 0.07398452611218569,
"grad_norm": 0.41548067331314087,
"learning_rate": 4.999779790984799e-06,
"loss": 0.5694,
"step": 153
},
{
"epoch": 0.07446808510638298,
"grad_norm": 0.4155537188053131,
"learning_rate": 4.999771240099457e-06,
"loss": 0.5759,
"step": 154
},
{
"epoch": 0.07495164410058026,
"grad_norm": 0.46990489959716797,
"learning_rate": 4.999762526350138e-06,
"loss": 0.5706,
"step": 155
},
{
"epoch": 0.07543520309477757,
"grad_norm": 0.4070248007774353,
"learning_rate": 4.999753649737411e-06,
"loss": 0.5639,
"step": 156
},
{
"epoch": 0.07591876208897486,
"grad_norm": 0.4163321554660797,
"learning_rate": 4.999744610261852e-06,
"loss": 0.543,
"step": 157
},
{
"epoch": 0.07640232108317214,
"grad_norm": 0.42817747592926025,
"learning_rate": 4.999735407924052e-06,
"loss": 0.546,
"step": 158
},
{
"epoch": 0.07688588007736943,
"grad_norm": 0.44839778542518616,
"learning_rate": 4.9997260427246096e-06,
"loss": 0.5531,
"step": 159
},
{
"epoch": 0.07736943907156674,
"grad_norm": 0.48207932710647583,
"learning_rate": 4.999716514664135e-06,
"loss": 0.545,
"step": 160
},
{
"epoch": 0.07785299806576403,
"grad_norm": 0.38995426893234253,
"learning_rate": 4.999706823743248e-06,
"loss": 0.5368,
"step": 161
},
{
"epoch": 0.07833655705996131,
"grad_norm": 0.42563074827194214,
"learning_rate": 4.999696969962583e-06,
"loss": 0.5669,
"step": 162
},
{
"epoch": 0.0788201160541586,
"grad_norm": 0.42152607440948486,
"learning_rate": 4.999686953322779e-06,
"loss": 0.5815,
"step": 163
},
{
"epoch": 0.07930367504835589,
"grad_norm": 0.4055645167827606,
"learning_rate": 4.999676773824489e-06,
"loss": 0.5711,
"step": 164
},
{
"epoch": 0.0797872340425532,
"grad_norm": 0.4749949872493744,
"learning_rate": 4.9996664314683775e-06,
"loss": 0.5445,
"step": 165
},
{
"epoch": 0.08027079303675048,
"grad_norm": 0.5488531589508057,
"learning_rate": 4.999655926255118e-06,
"loss": 0.5615,
"step": 166
},
{
"epoch": 0.08075435203094777,
"grad_norm": 0.42545777559280396,
"learning_rate": 4.999645258185394e-06,
"loss": 0.5536,
"step": 167
},
{
"epoch": 0.08123791102514506,
"grad_norm": 0.4722123444080353,
"learning_rate": 4.999634427259902e-06,
"loss": 0.5566,
"step": 168
},
{
"epoch": 0.08172147001934237,
"grad_norm": 0.5403354167938232,
"learning_rate": 4.999623433479346e-06,
"loss": 0.5179,
"step": 169
},
{
"epoch": 0.08220502901353965,
"grad_norm": 0.40221697092056274,
"learning_rate": 4.999612276844444e-06,
"loss": 0.5404,
"step": 170
},
{
"epoch": 0.08268858800773694,
"grad_norm": 0.4005393981933594,
"learning_rate": 4.999600957355921e-06,
"loss": 0.5391,
"step": 171
},
{
"epoch": 0.08317214700193423,
"grad_norm": 0.4574699103832245,
"learning_rate": 4.999589475014516e-06,
"loss": 0.5431,
"step": 172
},
{
"epoch": 0.08365570599613152,
"grad_norm": 0.4481971561908722,
"learning_rate": 4.999577829820976e-06,
"loss": 0.5574,
"step": 173
},
{
"epoch": 0.08413926499032882,
"grad_norm": 0.44044750928878784,
"learning_rate": 4.999566021776061e-06,
"loss": 0.531,
"step": 174
},
{
"epoch": 0.08462282398452611,
"grad_norm": 0.4010845124721527,
"learning_rate": 4.9995540508805385e-06,
"loss": 0.5431,
"step": 175
},
{
"epoch": 0.0851063829787234,
"grad_norm": 0.48778480291366577,
"learning_rate": 4.99954191713519e-06,
"loss": 0.5458,
"step": 176
},
{
"epoch": 0.08558994197292069,
"grad_norm": 0.4292055070400238,
"learning_rate": 4.999529620540805e-06,
"loss": 0.5227,
"step": 177
},
{
"epoch": 0.086073500967118,
"grad_norm": 0.488142728805542,
"learning_rate": 4.999517161098186e-06,
"loss": 0.5262,
"step": 178
},
{
"epoch": 0.08655705996131528,
"grad_norm": 0.41607528924942017,
"learning_rate": 4.9995045388081434e-06,
"loss": 0.5653,
"step": 179
},
{
"epoch": 0.08704061895551257,
"grad_norm": 0.41551366448402405,
"learning_rate": 4.999491753671501e-06,
"loss": 0.5367,
"step": 180
},
{
"epoch": 0.08752417794970986,
"grad_norm": 0.4160298705101013,
"learning_rate": 4.999478805689089e-06,
"loss": 0.5446,
"step": 181
},
{
"epoch": 0.08800773694390715,
"grad_norm": 0.38679638504981995,
"learning_rate": 4.999465694861754e-06,
"loss": 0.5566,
"step": 182
},
{
"epoch": 0.08849129593810445,
"grad_norm": 0.3910638093948364,
"learning_rate": 4.999452421190348e-06,
"loss": 0.5369,
"step": 183
},
{
"epoch": 0.08897485493230174,
"grad_norm": 0.40391838550567627,
"learning_rate": 4.999438984675737e-06,
"loss": 0.525,
"step": 184
},
{
"epoch": 0.08945841392649903,
"grad_norm": 0.40901172161102295,
"learning_rate": 4.999425385318797e-06,
"loss": 0.5518,
"step": 185
},
{
"epoch": 0.08994197292069632,
"grad_norm": 0.44393712282180786,
"learning_rate": 4.999411623120413e-06,
"loss": 0.5416,
"step": 186
},
{
"epoch": 0.09042553191489362,
"grad_norm": 0.4262126386165619,
"learning_rate": 4.999397698081482e-06,
"loss": 0.5384,
"step": 187
},
{
"epoch": 0.09090909090909091,
"grad_norm": 0.4574022591114044,
"learning_rate": 4.9993836102029105e-06,
"loss": 0.5625,
"step": 188
},
{
"epoch": 0.0913926499032882,
"grad_norm": 0.42913538217544556,
"learning_rate": 4.999369359485617e-06,
"loss": 0.5402,
"step": 189
},
{
"epoch": 0.09187620889748549,
"grad_norm": 0.39854881167411804,
"learning_rate": 4.99935494593053e-06,
"loss": 0.5571,
"step": 190
},
{
"epoch": 0.09235976789168278,
"grad_norm": 0.4160971939563751,
"learning_rate": 4.999340369538588e-06,
"loss": 0.555,
"step": 191
},
{
"epoch": 0.09284332688588008,
"grad_norm": 0.39643704891204834,
"learning_rate": 4.999325630310741e-06,
"loss": 0.5252,
"step": 192
},
{
"epoch": 0.09332688588007737,
"grad_norm": 0.4330281615257263,
"learning_rate": 4.999310728247952e-06,
"loss": 0.5372,
"step": 193
},
{
"epoch": 0.09381044487427466,
"grad_norm": 0.37870827317237854,
"learning_rate": 4.999295663351186e-06,
"loss": 0.5402,
"step": 194
},
{
"epoch": 0.09429400386847195,
"grad_norm": 0.4608426094055176,
"learning_rate": 4.99928043562143e-06,
"loss": 0.5278,
"step": 195
},
{
"epoch": 0.09477756286266925,
"grad_norm": 0.43381285667419434,
"learning_rate": 4.9992650450596725e-06,
"loss": 0.5624,
"step": 196
},
{
"epoch": 0.09526112185686654,
"grad_norm": 0.414285272359848,
"learning_rate": 4.999249491666918e-06,
"loss": 0.5531,
"step": 197
},
{
"epoch": 0.09574468085106383,
"grad_norm": 0.46436765789985657,
"learning_rate": 4.9992337754441796e-06,
"loss": 0.5305,
"step": 198
},
{
"epoch": 0.09622823984526112,
"grad_norm": 0.5019548535346985,
"learning_rate": 4.999217896392481e-06,
"loss": 0.5429,
"step": 199
},
{
"epoch": 0.09671179883945841,
"grad_norm": 1.2389442920684814,
"learning_rate": 4.999201854512857e-06,
"loss": 0.5556,
"step": 200
},
{
"epoch": 0.09719535783365571,
"grad_norm": 0.42270204424858093,
"learning_rate": 4.999185649806352e-06,
"loss": 0.5333,
"step": 201
},
{
"epoch": 0.097678916827853,
"grad_norm": 0.40609827637672424,
"learning_rate": 4.999169282274023e-06,
"loss": 0.5577,
"step": 202
},
{
"epoch": 0.09816247582205029,
"grad_norm": 0.44571995735168457,
"learning_rate": 4.999152751916936e-06,
"loss": 0.5368,
"step": 203
},
{
"epoch": 0.09864603481624758,
"grad_norm": 0.3832789361476898,
"learning_rate": 4.999136058736167e-06,
"loss": 0.5423,
"step": 204
},
{
"epoch": 0.09912959381044488,
"grad_norm": 0.4100008010864258,
"learning_rate": 4.999119202732805e-06,
"loss": 0.539,
"step": 205
},
{
"epoch": 0.09961315280464217,
"grad_norm": 0.41159990429878235,
"learning_rate": 4.999102183907947e-06,
"loss": 0.5448,
"step": 206
},
{
"epoch": 0.10009671179883946,
"grad_norm": 0.39501649141311646,
"learning_rate": 4.999085002262701e-06,
"loss": 0.5319,
"step": 207
},
{
"epoch": 0.10058027079303675,
"grad_norm": 0.4287145733833313,
"learning_rate": 4.99906765779819e-06,
"loss": 0.5261,
"step": 208
},
{
"epoch": 0.10106382978723404,
"grad_norm": 0.45831891894340515,
"learning_rate": 4.999050150515541e-06,
"loss": 0.5393,
"step": 209
},
{
"epoch": 0.10154738878143134,
"grad_norm": 0.9308096170425415,
"learning_rate": 4.999032480415894e-06,
"loss": 0.5128,
"step": 210
},
{
"epoch": 0.10203094777562863,
"grad_norm": 0.4070744514465332,
"learning_rate": 4.999014647500403e-06,
"loss": 0.5284,
"step": 211
},
{
"epoch": 0.10251450676982592,
"grad_norm": 0.5258062481880188,
"learning_rate": 4.998996651770228e-06,
"loss": 0.5461,
"step": 212
},
{
"epoch": 0.1029980657640232,
"grad_norm": 0.43539905548095703,
"learning_rate": 4.998978493226542e-06,
"loss": 0.5179,
"step": 213
},
{
"epoch": 0.10348162475822051,
"grad_norm": 0.4096452593803406,
"learning_rate": 4.9989601718705275e-06,
"loss": 0.5055,
"step": 214
},
{
"epoch": 0.1039651837524178,
"grad_norm": 0.40248751640319824,
"learning_rate": 4.998941687703379e-06,
"loss": 0.5424,
"step": 215
},
{
"epoch": 0.10444874274661509,
"grad_norm": 0.40936747193336487,
"learning_rate": 4.9989230407263e-06,
"loss": 0.5297,
"step": 216
},
{
"epoch": 0.10493230174081238,
"grad_norm": 0.402515709400177,
"learning_rate": 4.998904230940506e-06,
"loss": 0.5415,
"step": 217
},
{
"epoch": 0.10541586073500966,
"grad_norm": 0.3962743878364563,
"learning_rate": 4.998885258347223e-06,
"loss": 0.5175,
"step": 218
},
{
"epoch": 0.10589941972920697,
"grad_norm": 0.39665189385414124,
"learning_rate": 4.998866122947685e-06,
"loss": 0.5459,
"step": 219
},
{
"epoch": 0.10638297872340426,
"grad_norm": 0.3833273649215698,
"learning_rate": 4.99884682474314e-06,
"loss": 0.5417,
"step": 220
},
{
"epoch": 0.10686653771760155,
"grad_norm": 0.3995283842086792,
"learning_rate": 4.998827363734846e-06,
"loss": 0.5301,
"step": 221
},
{
"epoch": 0.10735009671179883,
"grad_norm": 0.438424676656723,
"learning_rate": 4.998807739924071e-06,
"loss": 0.5189,
"step": 222
},
{
"epoch": 0.10783365570599614,
"grad_norm": 0.38158780336380005,
"learning_rate": 4.998787953312091e-06,
"loss": 0.5216,
"step": 223
},
{
"epoch": 0.10831721470019343,
"grad_norm": 0.4054715633392334,
"learning_rate": 4.998768003900198e-06,
"loss": 0.5203,
"step": 224
},
{
"epoch": 0.10880077369439071,
"grad_norm": 0.41561102867126465,
"learning_rate": 4.99874789168969e-06,
"loss": 0.5418,
"step": 225
},
{
"epoch": 0.109284332688588,
"grad_norm": 0.42851102352142334,
"learning_rate": 4.998727616681879e-06,
"loss": 0.5279,
"step": 226
},
{
"epoch": 0.10976789168278529,
"grad_norm": 0.3681640923023224,
"learning_rate": 4.998707178878084e-06,
"loss": 0.5106,
"step": 227
},
{
"epoch": 0.1102514506769826,
"grad_norm": 0.5779820084571838,
"learning_rate": 4.998686578279638e-06,
"loss": 0.553,
"step": 228
},
{
"epoch": 0.11073500967117988,
"grad_norm": 0.41628775000572205,
"learning_rate": 4.998665814887883e-06,
"loss": 0.542,
"step": 229
},
{
"epoch": 0.11121856866537717,
"grad_norm": 0.47802796959877014,
"learning_rate": 4.998644888704171e-06,
"loss": 0.5051,
"step": 230
},
{
"epoch": 0.11170212765957446,
"grad_norm": 0.41832587122917175,
"learning_rate": 4.998623799729865e-06,
"loss": 0.5403,
"step": 231
},
{
"epoch": 0.11218568665377177,
"grad_norm": 0.4051823019981384,
"learning_rate": 4.99860254796634e-06,
"loss": 0.5311,
"step": 232
},
{
"epoch": 0.11266924564796905,
"grad_norm": 0.40647706389427185,
"learning_rate": 4.998581133414981e-06,
"loss": 0.5128,
"step": 233
},
{
"epoch": 0.11315280464216634,
"grad_norm": 0.4021758437156677,
"learning_rate": 4.998559556077182e-06,
"loss": 0.5264,
"step": 234
},
{
"epoch": 0.11363636363636363,
"grad_norm": 0.47185152769088745,
"learning_rate": 4.99853781595435e-06,
"loss": 0.5353,
"step": 235
},
{
"epoch": 0.11411992263056092,
"grad_norm": 0.4166802763938904,
"learning_rate": 4.9985159130479e-06,
"loss": 0.5138,
"step": 236
},
{
"epoch": 0.11460348162475822,
"grad_norm": 0.4907703399658203,
"learning_rate": 4.99849384735926e-06,
"loss": 0.543,
"step": 237
},
{
"epoch": 0.11508704061895551,
"grad_norm": 0.6601998805999756,
"learning_rate": 4.998471618889867e-06,
"loss": 0.5431,
"step": 238
},
{
"epoch": 0.1155705996131528,
"grad_norm": 0.412659615278244,
"learning_rate": 4.99844922764117e-06,
"loss": 0.5267,
"step": 239
},
{
"epoch": 0.11605415860735009,
"grad_norm": 0.42117926478385925,
"learning_rate": 4.998426673614627e-06,
"loss": 0.5254,
"step": 240
},
{
"epoch": 0.1165377176015474,
"grad_norm": 0.3980475068092346,
"learning_rate": 4.998403956811708e-06,
"loss": 0.5077,
"step": 241
},
{
"epoch": 0.11702127659574468,
"grad_norm": 0.4634224474430084,
"learning_rate": 4.998381077233894e-06,
"loss": 0.5169,
"step": 242
},
{
"epoch": 0.11750483558994197,
"grad_norm": 0.4475371837615967,
"learning_rate": 4.998358034882673e-06,
"loss": 0.5394,
"step": 243
},
{
"epoch": 0.11798839458413926,
"grad_norm": 0.4416722059249878,
"learning_rate": 4.998334829759548e-06,
"loss": 0.5314,
"step": 244
},
{
"epoch": 0.11847195357833655,
"grad_norm": 0.42282068729400635,
"learning_rate": 4.998311461866031e-06,
"loss": 0.5262,
"step": 245
},
{
"epoch": 0.11895551257253385,
"grad_norm": 0.41945409774780273,
"learning_rate": 4.998287931203643e-06,
"loss": 0.5116,
"step": 246
},
{
"epoch": 0.11943907156673114,
"grad_norm": 0.423809677362442,
"learning_rate": 4.99826423777392e-06,
"loss": 0.5103,
"step": 247
},
{
"epoch": 0.11992263056092843,
"grad_norm": 0.4222816228866577,
"learning_rate": 4.998240381578403e-06,
"loss": 0.5519,
"step": 248
},
{
"epoch": 0.12040618955512572,
"grad_norm": 0.46973058581352234,
"learning_rate": 4.998216362618646e-06,
"loss": 0.5448,
"step": 249
},
{
"epoch": 0.12088974854932302,
"grad_norm": 0.4195155203342438,
"learning_rate": 4.998192180896217e-06,
"loss": 0.5339,
"step": 250
},
{
"epoch": 0.12137330754352031,
"grad_norm": 0.47963947057724,
"learning_rate": 4.998167836412688e-06,
"loss": 0.5365,
"step": 251
},
{
"epoch": 0.1218568665377176,
"grad_norm": 0.4399716556072235,
"learning_rate": 4.998143329169646e-06,
"loss": 0.5273,
"step": 252
},
{
"epoch": 0.12234042553191489,
"grad_norm": 0.436796635389328,
"learning_rate": 4.998118659168689e-06,
"loss": 0.5317,
"step": 253
},
{
"epoch": 0.12282398452611218,
"grad_norm": 0.5795308351516724,
"learning_rate": 4.998093826411423e-06,
"loss": 0.5165,
"step": 254
},
{
"epoch": 0.12330754352030948,
"grad_norm": 1.2130300998687744,
"learning_rate": 4.998068830899466e-06,
"loss": 0.5611,
"step": 255
},
{
"epoch": 0.12379110251450677,
"grad_norm": 0.44857141375541687,
"learning_rate": 4.998043672634448e-06,
"loss": 0.5224,
"step": 256
},
{
"epoch": 0.12427466150870406,
"grad_norm": 0.4253701865673065,
"learning_rate": 4.998018351618007e-06,
"loss": 0.527,
"step": 257
},
{
"epoch": 0.12475822050290135,
"grad_norm": 0.41572824120521545,
"learning_rate": 4.9979928678517915e-06,
"loss": 0.5286,
"step": 258
},
{
"epoch": 0.12524177949709864,
"grad_norm": 0.41825610399246216,
"learning_rate": 4.997967221337463e-06,
"loss": 0.5312,
"step": 259
},
{
"epoch": 0.12572533849129594,
"grad_norm": 0.45931923389434814,
"learning_rate": 4.997941412076693e-06,
"loss": 0.5417,
"step": 260
},
{
"epoch": 0.12620889748549324,
"grad_norm": 0.4032868444919586,
"learning_rate": 4.997915440071162e-06,
"loss": 0.5221,
"step": 261
},
{
"epoch": 0.12669245647969052,
"grad_norm": 0.8206331133842468,
"learning_rate": 4.997889305322563e-06,
"loss": 0.4896,
"step": 262
},
{
"epoch": 0.12717601547388782,
"grad_norm": 0.4170459806919098,
"learning_rate": 4.997863007832597e-06,
"loss": 0.532,
"step": 263
},
{
"epoch": 0.1276595744680851,
"grad_norm": 0.41096487641334534,
"learning_rate": 4.99783654760298e-06,
"loss": 0.5476,
"step": 264
},
{
"epoch": 0.1281431334622824,
"grad_norm": 0.40561696887016296,
"learning_rate": 4.997809924635434e-06,
"loss": 0.5031,
"step": 265
},
{
"epoch": 0.1286266924564797,
"grad_norm": 0.4312552809715271,
"learning_rate": 4.997783138931693e-06,
"loss": 0.5423,
"step": 266
},
{
"epoch": 0.12911025145067698,
"grad_norm": 0.43831390142440796,
"learning_rate": 4.997756190493505e-06,
"loss": 0.5207,
"step": 267
},
{
"epoch": 0.12959381044487428,
"grad_norm": 0.4064142107963562,
"learning_rate": 4.997729079322622e-06,
"loss": 0.5351,
"step": 268
},
{
"epoch": 0.13007736943907156,
"grad_norm": 0.5566521286964417,
"learning_rate": 4.997701805420813e-06,
"loss": 0.5231,
"step": 269
},
{
"epoch": 0.13056092843326886,
"grad_norm": 0.5654227137565613,
"learning_rate": 4.997674368789854e-06,
"loss": 0.5102,
"step": 270
},
{
"epoch": 0.13104448742746616,
"grad_norm": 0.42063838243484497,
"learning_rate": 4.997646769431532e-06,
"loss": 0.5284,
"step": 271
},
{
"epoch": 0.13152804642166344,
"grad_norm": 0.6299861073493958,
"learning_rate": 4.997619007347647e-06,
"loss": 0.5365,
"step": 272
},
{
"epoch": 0.13201160541586074,
"grad_norm": 0.4093479514122009,
"learning_rate": 4.997591082540006e-06,
"loss": 0.5297,
"step": 273
},
{
"epoch": 0.13249516441005801,
"grad_norm": 0.3970275819301605,
"learning_rate": 4.997562995010429e-06,
"loss": 0.5091,
"step": 274
},
{
"epoch": 0.13297872340425532,
"grad_norm": 0.4937737286090851,
"learning_rate": 4.9975347447607455e-06,
"loss": 0.5269,
"step": 275
},
{
"epoch": 0.13346228239845262,
"grad_norm": 0.3909159004688263,
"learning_rate": 4.997506331792796e-06,
"loss": 0.5098,
"step": 276
},
{
"epoch": 0.1339458413926499,
"grad_norm": 0.4457818269729614,
"learning_rate": 4.997477756108433e-06,
"loss": 0.5359,
"step": 277
},
{
"epoch": 0.1344294003868472,
"grad_norm": 0.451669842004776,
"learning_rate": 4.997449017709517e-06,
"loss": 0.534,
"step": 278
},
{
"epoch": 0.1349129593810445,
"grad_norm": 0.6622608304023743,
"learning_rate": 4.997420116597921e-06,
"loss": 0.5034,
"step": 279
},
{
"epoch": 0.13539651837524178,
"grad_norm": 1.008179783821106,
"learning_rate": 4.997391052775526e-06,
"loss": 0.5117,
"step": 280
},
{
"epoch": 0.13588007736943908,
"grad_norm": 0.4382878541946411,
"learning_rate": 4.997361826244229e-06,
"loss": 0.5219,
"step": 281
},
{
"epoch": 0.13636363636363635,
"grad_norm": 0.45464539527893066,
"learning_rate": 4.997332437005932e-06,
"loss": 0.5447,
"step": 282
},
{
"epoch": 0.13684719535783366,
"grad_norm": 0.5995938181877136,
"learning_rate": 4.99730288506255e-06,
"loss": 0.5292,
"step": 283
},
{
"epoch": 0.13733075435203096,
"grad_norm": 0.4235019385814667,
"learning_rate": 4.997273170416007e-06,
"loss": 0.5098,
"step": 284
},
{
"epoch": 0.13781431334622823,
"grad_norm": 0.45099836587905884,
"learning_rate": 4.997243293068242e-06,
"loss": 0.5151,
"step": 285
},
{
"epoch": 0.13829787234042554,
"grad_norm": 0.8173574209213257,
"learning_rate": 4.997213253021198e-06,
"loss": 0.542,
"step": 286
},
{
"epoch": 0.1387814313346228,
"grad_norm": 0.42866086959838867,
"learning_rate": 4.997183050276836e-06,
"loss": 0.5198,
"step": 287
},
{
"epoch": 0.13926499032882012,
"grad_norm": 0.42806968092918396,
"learning_rate": 4.997152684837121e-06,
"loss": 0.5303,
"step": 288
},
{
"epoch": 0.13974854932301742,
"grad_norm": 0.48591941595077515,
"learning_rate": 4.997122156704032e-06,
"loss": 0.5193,
"step": 289
},
{
"epoch": 0.1402321083172147,
"grad_norm": 0.4372103214263916,
"learning_rate": 4.997091465879559e-06,
"loss": 0.5227,
"step": 290
},
{
"epoch": 0.140715667311412,
"grad_norm": 0.4667884409427643,
"learning_rate": 4.9970606123656995e-06,
"loss": 0.5288,
"step": 291
},
{
"epoch": 0.14119922630560927,
"grad_norm": 0.44642403721809387,
"learning_rate": 4.997029596164466e-06,
"loss": 0.5187,
"step": 292
},
{
"epoch": 0.14168278529980657,
"grad_norm": 0.3939391076564789,
"learning_rate": 4.996998417277877e-06,
"loss": 0.5142,
"step": 293
},
{
"epoch": 0.14216634429400388,
"grad_norm": 4.678069114685059,
"learning_rate": 4.996967075707965e-06,
"loss": 0.5002,
"step": 294
},
{
"epoch": 0.14264990328820115,
"grad_norm": 0.5047649145126343,
"learning_rate": 4.996935571456773e-06,
"loss": 0.5261,
"step": 295
},
{
"epoch": 0.14313346228239845,
"grad_norm": 0.42343783378601074,
"learning_rate": 4.9969039045263515e-06,
"loss": 0.5199,
"step": 296
},
{
"epoch": 0.14361702127659576,
"grad_norm": 0.4126646816730499,
"learning_rate": 4.996872074918765e-06,
"loss": 0.5279,
"step": 297
},
{
"epoch": 0.14410058027079303,
"grad_norm": 0.412338525056839,
"learning_rate": 4.996840082636087e-06,
"loss": 0.5256,
"step": 298
},
{
"epoch": 0.14458413926499034,
"grad_norm": 0.41773638129234314,
"learning_rate": 4.996807927680401e-06,
"loss": 0.5316,
"step": 299
},
{
"epoch": 0.1450676982591876,
"grad_norm": 0.4320884644985199,
"learning_rate": 4.996775610053803e-06,
"loss": 0.5026,
"step": 300
},
{
"epoch": 0.1455512572533849,
"grad_norm": 0.4417104721069336,
"learning_rate": 4.996743129758398e-06,
"loss": 0.5238,
"step": 301
},
{
"epoch": 0.14603481624758222,
"grad_norm": 0.427249938249588,
"learning_rate": 4.9967104867963025e-06,
"loss": 0.5088,
"step": 302
},
{
"epoch": 0.1465183752417795,
"grad_norm": 0.500415027141571,
"learning_rate": 4.9966776811696435e-06,
"loss": 0.5269,
"step": 303
},
{
"epoch": 0.1470019342359768,
"grad_norm": 0.4268072545528412,
"learning_rate": 4.996644712880557e-06,
"loss": 0.5211,
"step": 304
},
{
"epoch": 0.14748549323017407,
"grad_norm": 0.4673326909542084,
"learning_rate": 4.9966115819311926e-06,
"loss": 0.4972,
"step": 305
},
{
"epoch": 0.14796905222437137,
"grad_norm": 0.43386024236679077,
"learning_rate": 4.996578288323708e-06,
"loss": 0.5025,
"step": 306
},
{
"epoch": 0.14845261121856868,
"grad_norm": 0.42369264364242554,
"learning_rate": 4.996544832060272e-06,
"loss": 0.5318,
"step": 307
},
{
"epoch": 0.14893617021276595,
"grad_norm": 0.4846673607826233,
"learning_rate": 4.996511213143065e-06,
"loss": 0.533,
"step": 308
},
{
"epoch": 0.14941972920696325,
"grad_norm": 0.448446661233902,
"learning_rate": 4.996477431574277e-06,
"loss": 0.5445,
"step": 309
},
{
"epoch": 0.14990328820116053,
"grad_norm": 0.409821480512619,
"learning_rate": 4.996443487356109e-06,
"loss": 0.4791,
"step": 310
},
{
"epoch": 0.15038684719535783,
"grad_norm": 0.43006864190101624,
"learning_rate": 4.9964093804907724e-06,
"loss": 0.5241,
"step": 311
},
{
"epoch": 0.15087040618955513,
"grad_norm": 0.4141005873680115,
"learning_rate": 4.99637511098049e-06,
"loss": 0.5248,
"step": 312
},
{
"epoch": 0.1513539651837524,
"grad_norm": 0.4346350133419037,
"learning_rate": 4.996340678827493e-06,
"loss": 0.5144,
"step": 313
},
{
"epoch": 0.1518375241779497,
"grad_norm": 0.41779354214668274,
"learning_rate": 4.996306084034026e-06,
"loss": 0.5277,
"step": 314
},
{
"epoch": 0.15232108317214701,
"grad_norm": 0.4917933940887451,
"learning_rate": 4.996271326602342e-06,
"loss": 0.5368,
"step": 315
},
{
"epoch": 0.1528046421663443,
"grad_norm": 0.5210337042808533,
"learning_rate": 4.996236406534707e-06,
"loss": 0.5445,
"step": 316
},
{
"epoch": 0.1532882011605416,
"grad_norm": 0.43788838386535645,
"learning_rate": 4.996201323833394e-06,
"loss": 0.5169,
"step": 317
},
{
"epoch": 0.15377176015473887,
"grad_norm": 0.4166138470172882,
"learning_rate": 4.996166078500691e-06,
"loss": 0.4993,
"step": 318
},
{
"epoch": 0.15425531914893617,
"grad_norm": 0.4216192662715912,
"learning_rate": 4.9961306705388925e-06,
"loss": 0.5269,
"step": 319
},
{
"epoch": 0.15473887814313347,
"grad_norm": 0.40549808740615845,
"learning_rate": 4.996095099950307e-06,
"loss": 0.5338,
"step": 320
},
{
"epoch": 0.15522243713733075,
"grad_norm": 0.4312421381473541,
"learning_rate": 4.9960593667372495e-06,
"loss": 0.5076,
"step": 321
},
{
"epoch": 0.15570599613152805,
"grad_norm": 0.4673048257827759,
"learning_rate": 4.99602347090205e-06,
"loss": 0.5189,
"step": 322
},
{
"epoch": 0.15618955512572533,
"grad_norm": 0.41923579573631287,
"learning_rate": 4.995987412447047e-06,
"loss": 0.5354,
"step": 323
},
{
"epoch": 0.15667311411992263,
"grad_norm": 0.548323392868042,
"learning_rate": 4.995951191374589e-06,
"loss": 0.5126,
"step": 324
},
{
"epoch": 0.15715667311411993,
"grad_norm": 0.5140036940574646,
"learning_rate": 4.995914807687037e-06,
"loss": 0.5273,
"step": 325
},
{
"epoch": 0.1576402321083172,
"grad_norm": 0.4467466473579407,
"learning_rate": 4.99587826138676e-06,
"loss": 0.5321,
"step": 326
},
{
"epoch": 0.1581237911025145,
"grad_norm": 0.41909724473953247,
"learning_rate": 4.9958415524761406e-06,
"loss": 0.5176,
"step": 327
},
{
"epoch": 0.15860735009671179,
"grad_norm": 0.44327664375305176,
"learning_rate": 4.995804680957569e-06,
"loss": 0.5159,
"step": 328
},
{
"epoch": 0.1590909090909091,
"grad_norm": 0.39020073413848877,
"learning_rate": 4.9957676468334485e-06,
"loss": 0.523,
"step": 329
},
{
"epoch": 0.1595744680851064,
"grad_norm": 0.5223175287246704,
"learning_rate": 4.995730450106191e-06,
"loss": 0.4969,
"step": 330
},
{
"epoch": 0.16005802707930367,
"grad_norm": 0.48410263657569885,
"learning_rate": 4.995693090778222e-06,
"loss": 0.4925,
"step": 331
},
{
"epoch": 0.16054158607350097,
"grad_norm": 0.41017135977745056,
"learning_rate": 4.995655568851973e-06,
"loss": 0.4897,
"step": 332
},
{
"epoch": 0.16102514506769827,
"grad_norm": 0.4232168197631836,
"learning_rate": 4.995617884329889e-06,
"loss": 0.5311,
"step": 333
},
{
"epoch": 0.16150870406189555,
"grad_norm": 0.4976195693016052,
"learning_rate": 4.995580037214427e-06,
"loss": 0.5191,
"step": 334
},
{
"epoch": 0.16199226305609285,
"grad_norm": 0.5347289443016052,
"learning_rate": 4.99554202750805e-06,
"loss": 0.4973,
"step": 335
},
{
"epoch": 0.16247582205029013,
"grad_norm": 0.4547256827354431,
"learning_rate": 4.995503855213237e-06,
"loss": 0.5302,
"step": 336
},
{
"epoch": 0.16295938104448743,
"grad_norm": 0.4252434968948364,
"learning_rate": 4.995465520332474e-06,
"loss": 0.4983,
"step": 337
},
{
"epoch": 0.16344294003868473,
"grad_norm": 0.4182872474193573,
"learning_rate": 4.995427022868259e-06,
"loss": 0.5161,
"step": 338
},
{
"epoch": 0.163926499032882,
"grad_norm": 0.42835015058517456,
"learning_rate": 4.9953883628231e-06,
"loss": 0.5101,
"step": 339
},
{
"epoch": 0.1644100580270793,
"grad_norm": 0.43284621834754944,
"learning_rate": 4.995349540199514e-06,
"loss": 0.513,
"step": 340
},
{
"epoch": 0.16489361702127658,
"grad_norm": 0.4314822852611542,
"learning_rate": 4.995310555000033e-06,
"loss": 0.5203,
"step": 341
},
{
"epoch": 0.1653771760154739,
"grad_norm": 0.4304635524749756,
"learning_rate": 4.995271407227195e-06,
"loss": 0.5221,
"step": 342
},
{
"epoch": 0.1658607350096712,
"grad_norm": 0.4306887686252594,
"learning_rate": 4.995232096883552e-06,
"loss": 0.516,
"step": 343
},
{
"epoch": 0.16634429400386846,
"grad_norm": 0.4424187242984772,
"learning_rate": 4.995192623971664e-06,
"loss": 0.5118,
"step": 344
},
{
"epoch": 0.16682785299806577,
"grad_norm": 0.4274136424064636,
"learning_rate": 4.995152988494103e-06,
"loss": 0.5032,
"step": 345
},
{
"epoch": 0.16731141199226304,
"grad_norm": 0.49120795726776123,
"learning_rate": 4.995113190453452e-06,
"loss": 0.5176,
"step": 346
},
{
"epoch": 0.16779497098646035,
"grad_norm": 0.4164622724056244,
"learning_rate": 4.995073229852303e-06,
"loss": 0.5168,
"step": 347
},
{
"epoch": 0.16827852998065765,
"grad_norm": 0.44215700030326843,
"learning_rate": 4.995033106693261e-06,
"loss": 0.5171,
"step": 348
},
{
"epoch": 0.16876208897485492,
"grad_norm": 0.5706837773323059,
"learning_rate": 4.994992820978937e-06,
"loss": 0.5117,
"step": 349
},
{
"epoch": 0.16924564796905223,
"grad_norm": 0.9634613394737244,
"learning_rate": 4.99495237271196e-06,
"loss": 0.4944,
"step": 350
},
{
"epoch": 0.16972920696324953,
"grad_norm": 0.4841616153717041,
"learning_rate": 4.9949117618949615e-06,
"loss": 0.53,
"step": 351
},
{
"epoch": 0.1702127659574468,
"grad_norm": 0.43846601247787476,
"learning_rate": 4.994870988530589e-06,
"loss": 0.5102,
"step": 352
},
{
"epoch": 0.1706963249516441,
"grad_norm": 0.42763012647628784,
"learning_rate": 4.994830052621499e-06,
"loss": 0.5339,
"step": 353
},
{
"epoch": 0.17117988394584138,
"grad_norm": 0.44781923294067383,
"learning_rate": 4.994788954170357e-06,
"loss": 0.5201,
"step": 354
},
{
"epoch": 0.17166344294003869,
"grad_norm": 0.4358518421649933,
"learning_rate": 4.994747693179844e-06,
"loss": 0.5129,
"step": 355
},
{
"epoch": 0.172147001934236,
"grad_norm": 0.4187730848789215,
"learning_rate": 4.994706269652644e-06,
"loss": 0.5057,
"step": 356
},
{
"epoch": 0.17263056092843326,
"grad_norm": 0.42493316531181335,
"learning_rate": 4.994664683591459e-06,
"loss": 0.522,
"step": 357
},
{
"epoch": 0.17311411992263057,
"grad_norm": 0.4245206117630005,
"learning_rate": 4.994622934998997e-06,
"loss": 0.5195,
"step": 358
},
{
"epoch": 0.17359767891682784,
"grad_norm": 0.6871814131736755,
"learning_rate": 4.994581023877979e-06,
"loss": 0.495,
"step": 359
},
{
"epoch": 0.17408123791102514,
"grad_norm": 0.45684391260147095,
"learning_rate": 4.994538950231134e-06,
"loss": 0.4917,
"step": 360
},
{
"epoch": 0.17456479690522245,
"grad_norm": 0.4136664569377899,
"learning_rate": 4.994496714061205e-06,
"loss": 0.5171,
"step": 361
},
{
"epoch": 0.17504835589941972,
"grad_norm": 0.43099161982536316,
"learning_rate": 4.994454315370943e-06,
"loss": 0.5176,
"step": 362
},
{
"epoch": 0.17553191489361702,
"grad_norm": 0.40930303931236267,
"learning_rate": 4.994411754163109e-06,
"loss": 0.5285,
"step": 363
},
{
"epoch": 0.1760154738878143,
"grad_norm": 0.4457148611545563,
"learning_rate": 4.994369030440477e-06,
"loss": 0.4951,
"step": 364
},
{
"epoch": 0.1764990328820116,
"grad_norm": 0.429066926240921,
"learning_rate": 4.994326144205831e-06,
"loss": 0.5055,
"step": 365
},
{
"epoch": 0.1769825918762089,
"grad_norm": 0.4551694691181183,
"learning_rate": 4.994283095461965e-06,
"loss": 0.5133,
"step": 366
},
{
"epoch": 0.17746615087040618,
"grad_norm": 0.4065353274345398,
"learning_rate": 4.994239884211683e-06,
"loss": 0.5115,
"step": 367
},
{
"epoch": 0.17794970986460348,
"grad_norm": 0.4265615940093994,
"learning_rate": 4.994196510457801e-06,
"loss": 0.5101,
"step": 368
},
{
"epoch": 0.1784332688588008,
"grad_norm": 0.4900777339935303,
"learning_rate": 4.994152974203143e-06,
"loss": 0.5122,
"step": 369
},
{
"epoch": 0.17891682785299806,
"grad_norm": 0.5280573964118958,
"learning_rate": 4.994109275450549e-06,
"loss": 0.5116,
"step": 370
},
{
"epoch": 0.17940038684719536,
"grad_norm": 0.6261852979660034,
"learning_rate": 4.994065414202863e-06,
"loss": 0.5095,
"step": 371
},
{
"epoch": 0.17988394584139264,
"grad_norm": 0.43888241052627563,
"learning_rate": 4.994021390462944e-06,
"loss": 0.5071,
"step": 372
},
{
"epoch": 0.18036750483558994,
"grad_norm": 0.44395115971565247,
"learning_rate": 4.99397720423366e-06,
"loss": 0.5157,
"step": 373
},
{
"epoch": 0.18085106382978725,
"grad_norm": 0.40000030398368835,
"learning_rate": 4.993932855517889e-06,
"loss": 0.5119,
"step": 374
},
{
"epoch": 0.18133462282398452,
"grad_norm": 0.413655549287796,
"learning_rate": 4.9938883443185215e-06,
"loss": 0.5129,
"step": 375
},
{
"epoch": 0.18181818181818182,
"grad_norm": 0.5221056938171387,
"learning_rate": 4.993843670638458e-06,
"loss": 0.5144,
"step": 376
},
{
"epoch": 0.1823017408123791,
"grad_norm": 0.4086361825466156,
"learning_rate": 4.993798834480607e-06,
"loss": 0.4922,
"step": 377
},
{
"epoch": 0.1827852998065764,
"grad_norm": 0.5094679594039917,
"learning_rate": 4.99375383584789e-06,
"loss": 0.5176,
"step": 378
},
{
"epoch": 0.1832688588007737,
"grad_norm": 0.43078911304473877,
"learning_rate": 4.993708674743241e-06,
"loss": 0.5031,
"step": 379
},
{
"epoch": 0.18375241779497098,
"grad_norm": 0.4423384666442871,
"learning_rate": 4.9936633511696e-06,
"loss": 0.5118,
"step": 380
},
{
"epoch": 0.18423597678916828,
"grad_norm": 0.42990249395370483,
"learning_rate": 4.99361786512992e-06,
"loss": 0.5073,
"step": 381
},
{
"epoch": 0.18471953578336556,
"grad_norm": 0.7277259826660156,
"learning_rate": 4.993572216627166e-06,
"loss": 0.4965,
"step": 382
},
{
"epoch": 0.18520309477756286,
"grad_norm": 0.41394710540771484,
"learning_rate": 4.993526405664311e-06,
"loss": 0.5197,
"step": 383
},
{
"epoch": 0.18568665377176016,
"grad_norm": 0.41026055812835693,
"learning_rate": 4.99348043224434e-06,
"loss": 0.5102,
"step": 384
},
{
"epoch": 0.18617021276595744,
"grad_norm": 0.41871216893196106,
"learning_rate": 4.9934342963702485e-06,
"loss": 0.5106,
"step": 385
},
{
"epoch": 0.18665377176015474,
"grad_norm": 0.41254499554634094,
"learning_rate": 4.993387998045041e-06,
"loss": 0.493,
"step": 386
},
{
"epoch": 0.18713733075435204,
"grad_norm": 0.6578147411346436,
"learning_rate": 4.993341537271735e-06,
"loss": 0.5113,
"step": 387
},
{
"epoch": 0.18762088974854932,
"grad_norm": 0.39468589425086975,
"learning_rate": 4.993294914053358e-06,
"loss": 0.5202,
"step": 388
},
{
"epoch": 0.18810444874274662,
"grad_norm": 0.39228197932243347,
"learning_rate": 4.993248128392947e-06,
"loss": 0.5397,
"step": 389
},
{
"epoch": 0.1885880077369439,
"grad_norm": 0.4504663944244385,
"learning_rate": 4.99320118029355e-06,
"loss": 0.5017,
"step": 390
},
{
"epoch": 0.1890715667311412,
"grad_norm": 0.44478482007980347,
"learning_rate": 4.993154069758226e-06,
"loss": 0.5128,
"step": 391
},
{
"epoch": 0.1895551257253385,
"grad_norm": 0.506403386592865,
"learning_rate": 4.993106796790044e-06,
"loss": 0.5166,
"step": 392
},
{
"epoch": 0.19003868471953578,
"grad_norm": 0.4066163897514343,
"learning_rate": 4.993059361392083e-06,
"loss": 0.515,
"step": 393
},
{
"epoch": 0.19052224371373308,
"grad_norm": 1.3290510177612305,
"learning_rate": 4.993011763567436e-06,
"loss": 0.4895,
"step": 394
},
{
"epoch": 0.19100580270793036,
"grad_norm": 0.5393165349960327,
"learning_rate": 4.992964003319202e-06,
"loss": 0.5167,
"step": 395
},
{
"epoch": 0.19148936170212766,
"grad_norm": 0.44667989015579224,
"learning_rate": 4.9929160806504925e-06,
"loss": 0.5063,
"step": 396
},
{
"epoch": 0.19197292069632496,
"grad_norm": 0.4737187922000885,
"learning_rate": 4.992867995564432e-06,
"loss": 0.5098,
"step": 397
},
{
"epoch": 0.19245647969052224,
"grad_norm": 0.4211980998516083,
"learning_rate": 4.992819748064151e-06,
"loss": 0.4908,
"step": 398
},
{
"epoch": 0.19294003868471954,
"grad_norm": 0.41971683502197266,
"learning_rate": 4.9927713381527944e-06,
"loss": 0.5169,
"step": 399
},
{
"epoch": 0.19342359767891681,
"grad_norm": 0.48630502820014954,
"learning_rate": 4.992722765833514e-06,
"loss": 0.4927,
"step": 400
},
{
"epoch": 0.19390715667311412,
"grad_norm": 0.4188506305217743,
"learning_rate": 4.992674031109477e-06,
"loss": 0.4921,
"step": 401
},
{
"epoch": 0.19439071566731142,
"grad_norm": 0.6137151718139648,
"learning_rate": 4.9926251339838574e-06,
"loss": 0.4918,
"step": 402
},
{
"epoch": 0.1948742746615087,
"grad_norm": 0.41266149282455444,
"learning_rate": 4.992576074459841e-06,
"loss": 0.5081,
"step": 403
},
{
"epoch": 0.195357833655706,
"grad_norm": 0.42797741293907166,
"learning_rate": 4.992526852540624e-06,
"loss": 0.5101,
"step": 404
},
{
"epoch": 0.1958413926499033,
"grad_norm": 0.43434789776802063,
"learning_rate": 4.992477468229413e-06,
"loss": 0.4931,
"step": 405
},
{
"epoch": 0.19632495164410058,
"grad_norm": 0.45915883779525757,
"learning_rate": 4.992427921529426e-06,
"loss": 0.4757,
"step": 406
},
{
"epoch": 0.19680851063829788,
"grad_norm": 0.5629348754882812,
"learning_rate": 4.992378212443891e-06,
"loss": 0.5037,
"step": 407
},
{
"epoch": 0.19729206963249515,
"grad_norm": 0.44655969738960266,
"learning_rate": 4.992328340976046e-06,
"loss": 0.489,
"step": 408
},
{
"epoch": 0.19777562862669246,
"grad_norm": 0.4506199061870575,
"learning_rate": 4.992278307129141e-06,
"loss": 0.5208,
"step": 409
},
{
"epoch": 0.19825918762088976,
"grad_norm": 0.43655723333358765,
"learning_rate": 4.992228110906436e-06,
"loss": 0.5089,
"step": 410
},
{
"epoch": 0.19874274661508703,
"grad_norm": 1.3825441598892212,
"learning_rate": 4.9921777523112e-06,
"loss": 0.4879,
"step": 411
},
{
"epoch": 0.19922630560928434,
"grad_norm": 0.416355699300766,
"learning_rate": 4.992127231346715e-06,
"loss": 0.5049,
"step": 412
},
{
"epoch": 0.1997098646034816,
"grad_norm": 1.0814541578292847,
"learning_rate": 4.992076548016272e-06,
"loss": 0.4802,
"step": 413
},
{
"epoch": 0.20019342359767892,
"grad_norm": 0.4396391212940216,
"learning_rate": 4.992025702323174e-06,
"loss": 0.5203,
"step": 414
},
{
"epoch": 0.20067698259187622,
"grad_norm": 0.525071918964386,
"learning_rate": 4.991974694270733e-06,
"loss": 0.494,
"step": 415
},
{
"epoch": 0.2011605415860735,
"grad_norm": 0.4244464933872223,
"learning_rate": 4.991923523862271e-06,
"loss": 0.4994,
"step": 416
},
{
"epoch": 0.2016441005802708,
"grad_norm": 0.4466478228569031,
"learning_rate": 4.991872191101124e-06,
"loss": 0.4946,
"step": 417
},
{
"epoch": 0.20212765957446807,
"grad_norm": 0.4531770348548889,
"learning_rate": 4.991820695990636e-06,
"loss": 0.5111,
"step": 418
},
{
"epoch": 0.20261121856866537,
"grad_norm": 0.4186277687549591,
"learning_rate": 4.991769038534161e-06,
"loss": 0.4844,
"step": 419
},
{
"epoch": 0.20309477756286268,
"grad_norm": 0.42326635122299194,
"learning_rate": 4.991717218735065e-06,
"loss": 0.5063,
"step": 420
},
{
"epoch": 0.20357833655705995,
"grad_norm": 0.42900604009628296,
"learning_rate": 4.991665236596724e-06,
"loss": 0.5078,
"step": 421
},
{
"epoch": 0.20406189555125726,
"grad_norm": 0.42313331365585327,
"learning_rate": 4.991613092122526e-06,
"loss": 0.5122,
"step": 422
},
{
"epoch": 0.20454545454545456,
"grad_norm": 0.49496522545814514,
"learning_rate": 4.991560785315866e-06,
"loss": 0.4717,
"step": 423
},
{
"epoch": 0.20502901353965183,
"grad_norm": 0.43023598194122314,
"learning_rate": 4.991508316180154e-06,
"loss": 0.5095,
"step": 424
},
{
"epoch": 0.20551257253384914,
"grad_norm": 0.41138070821762085,
"learning_rate": 4.9914556847188076e-06,
"loss": 0.4941,
"step": 425
},
{
"epoch": 0.2059961315280464,
"grad_norm": 0.40780410170555115,
"learning_rate": 4.991402890935255e-06,
"loss": 0.4808,
"step": 426
},
{
"epoch": 0.20647969052224371,
"grad_norm": 0.5113906264305115,
"learning_rate": 4.9913499348329375e-06,
"loss": 0.4957,
"step": 427
},
{
"epoch": 0.20696324951644102,
"grad_norm": 0.4982716739177704,
"learning_rate": 4.991296816415304e-06,
"loss": 0.4996,
"step": 428
},
{
"epoch": 0.2074468085106383,
"grad_norm": 0.47301238775253296,
"learning_rate": 4.991243535685815e-06,
"loss": 0.4982,
"step": 429
},
{
"epoch": 0.2079303675048356,
"grad_norm": 0.48449206352233887,
"learning_rate": 4.991190092647943e-06,
"loss": 0.4958,
"step": 430
},
{
"epoch": 0.20841392649903287,
"grad_norm": 0.47245344519615173,
"learning_rate": 4.991136487305169e-06,
"loss": 0.506,
"step": 431
},
{
"epoch": 0.20889748549323017,
"grad_norm": 0.4477512836456299,
"learning_rate": 4.9910827196609864e-06,
"loss": 0.496,
"step": 432
},
{
"epoch": 0.20938104448742748,
"grad_norm": 0.4135742485523224,
"learning_rate": 4.991028789718897e-06,
"loss": 0.5174,
"step": 433
},
{
"epoch": 0.20986460348162475,
"grad_norm": 0.4840208888053894,
"learning_rate": 4.990974697482415e-06,
"loss": 0.5087,
"step": 434
},
{
"epoch": 0.21034816247582205,
"grad_norm": 0.5320113301277161,
"learning_rate": 4.990920442955065e-06,
"loss": 0.5231,
"step": 435
},
{
"epoch": 0.21083172147001933,
"grad_norm": 0.41861191391944885,
"learning_rate": 4.9908660261403815e-06,
"loss": 0.4935,
"step": 436
},
{
"epoch": 0.21131528046421663,
"grad_norm": 0.4138176441192627,
"learning_rate": 4.99081144704191e-06,
"loss": 0.4774,
"step": 437
},
{
"epoch": 0.21179883945841393,
"grad_norm": 0.4283509850502014,
"learning_rate": 4.990756705663205e-06,
"loss": 0.5045,
"step": 438
},
{
"epoch": 0.2122823984526112,
"grad_norm": 0.4712899923324585,
"learning_rate": 4.990701802007835e-06,
"loss": 0.4902,
"step": 439
},
{
"epoch": 0.2127659574468085,
"grad_norm": 0.44904825091362,
"learning_rate": 4.990646736079376e-06,
"loss": 0.52,
"step": 440
},
{
"epoch": 0.21324951644100582,
"grad_norm": 0.42403462529182434,
"learning_rate": 4.990591507881416e-06,
"loss": 0.4943,
"step": 441
},
{
"epoch": 0.2137330754352031,
"grad_norm": 0.4171410799026489,
"learning_rate": 4.990536117417553e-06,
"loss": 0.4821,
"step": 442
},
{
"epoch": 0.2142166344294004,
"grad_norm": 0.44567805528640747,
"learning_rate": 4.990480564691396e-06,
"loss": 0.5198,
"step": 443
},
{
"epoch": 0.21470019342359767,
"grad_norm": 0.40193620324134827,
"learning_rate": 4.990424849706563e-06,
"loss": 0.5104,
"step": 444
},
{
"epoch": 0.21518375241779497,
"grad_norm": 0.43421146273612976,
"learning_rate": 4.990368972466686e-06,
"loss": 0.4961,
"step": 445
},
{
"epoch": 0.21566731141199227,
"grad_norm": 0.41703906655311584,
"learning_rate": 4.990312932975404e-06,
"loss": 0.5104,
"step": 446
},
{
"epoch": 0.21615087040618955,
"grad_norm": 0.42970454692840576,
"learning_rate": 4.99025673123637e-06,
"loss": 0.4923,
"step": 447
},
{
"epoch": 0.21663442940038685,
"grad_norm": 0.551898717880249,
"learning_rate": 4.990200367253243e-06,
"loss": 0.4843,
"step": 448
},
{
"epoch": 0.21711798839458413,
"grad_norm": 0.46391811966896057,
"learning_rate": 4.990143841029697e-06,
"loss": 0.5189,
"step": 449
},
{
"epoch": 0.21760154738878143,
"grad_norm": 0.4471249282360077,
"learning_rate": 4.9900871525694135e-06,
"loss": 0.4818,
"step": 450
},
{
"epoch": 0.21808510638297873,
"grad_norm": 0.43526965379714966,
"learning_rate": 4.990030301876087e-06,
"loss": 0.5076,
"step": 451
},
{
"epoch": 0.218568665377176,
"grad_norm": 0.4630556106567383,
"learning_rate": 4.989973288953421e-06,
"loss": 0.5124,
"step": 452
},
{
"epoch": 0.2190522243713733,
"grad_norm": 0.4085645377635956,
"learning_rate": 4.989916113805131e-06,
"loss": 0.5021,
"step": 453
},
{
"epoch": 0.21953578336557059,
"grad_norm": 0.4135761260986328,
"learning_rate": 4.98985877643494e-06,
"loss": 0.4826,
"step": 454
},
{
"epoch": 0.2200193423597679,
"grad_norm": 0.4335331618785858,
"learning_rate": 4.989801276846584e-06,
"loss": 0.4997,
"step": 455
},
{
"epoch": 0.2205029013539652,
"grad_norm": 0.43082305788993835,
"learning_rate": 4.989743615043811e-06,
"loss": 0.4937,
"step": 456
},
{
"epoch": 0.22098646034816247,
"grad_norm": 0.4116332232952118,
"learning_rate": 4.989685791030377e-06,
"loss": 0.5036,
"step": 457
},
{
"epoch": 0.22147001934235977,
"grad_norm": 0.5059003829956055,
"learning_rate": 4.989627804810047e-06,
"loss": 0.5024,
"step": 458
},
{
"epoch": 0.22195357833655707,
"grad_norm": 0.7639954090118408,
"learning_rate": 4.989569656386602e-06,
"loss": 0.5046,
"step": 459
},
{
"epoch": 0.22243713733075435,
"grad_norm": 0.43759623169898987,
"learning_rate": 4.989511345763829e-06,
"loss": 0.5198,
"step": 460
},
{
"epoch": 0.22292069632495165,
"grad_norm": 0.40205639600753784,
"learning_rate": 4.989452872945527e-06,
"loss": 0.5016,
"step": 461
},
{
"epoch": 0.22340425531914893,
"grad_norm": 0.4914066791534424,
"learning_rate": 4.989394237935507e-06,
"loss": 0.5008,
"step": 462
},
{
"epoch": 0.22388781431334623,
"grad_norm": 0.479674369096756,
"learning_rate": 4.989335440737587e-06,
"loss": 0.4899,
"step": 463
},
{
"epoch": 0.22437137330754353,
"grad_norm": 0.4184580147266388,
"learning_rate": 4.989276481355598e-06,
"loss": 0.502,
"step": 464
},
{
"epoch": 0.2248549323017408,
"grad_norm": 0.4303237199783325,
"learning_rate": 4.989217359793383e-06,
"loss": 0.4905,
"step": 465
},
{
"epoch": 0.2253384912959381,
"grad_norm": 0.4432399570941925,
"learning_rate": 4.989158076054793e-06,
"loss": 0.5035,
"step": 466
},
{
"epoch": 0.22582205029013538,
"grad_norm": 0.4811629056930542,
"learning_rate": 4.98909863014369e-06,
"loss": 0.508,
"step": 467
},
{
"epoch": 0.2263056092843327,
"grad_norm": 0.5259628295898438,
"learning_rate": 4.989039022063949e-06,
"loss": 0.4932,
"step": 468
},
{
"epoch": 0.22678916827853,
"grad_norm": 0.6295924186706543,
"learning_rate": 4.98897925181945e-06,
"loss": 0.505,
"step": 469
},
{
"epoch": 0.22727272727272727,
"grad_norm": 0.5184575319290161,
"learning_rate": 4.988919319414089e-06,
"loss": 0.4964,
"step": 470
},
{
"epoch": 0.22775628626692457,
"grad_norm": 0.45514237880706787,
"learning_rate": 4.988859224851772e-06,
"loss": 0.4944,
"step": 471
},
{
"epoch": 0.22823984526112184,
"grad_norm": 0.4045606255531311,
"learning_rate": 4.988798968136412e-06,
"loss": 0.4897,
"step": 472
},
{
"epoch": 0.22872340425531915,
"grad_norm": 0.8098049759864807,
"learning_rate": 4.988738549271937e-06,
"loss": 0.4773,
"step": 473
},
{
"epoch": 0.22920696324951645,
"grad_norm": 0.41523805260658264,
"learning_rate": 4.98867796826228e-06,
"loss": 0.5237,
"step": 474
},
{
"epoch": 0.22969052224371372,
"grad_norm": 0.4325871169567108,
"learning_rate": 4.988617225111392e-06,
"loss": 0.4991,
"step": 475
},
{
"epoch": 0.23017408123791103,
"grad_norm": 0.43960368633270264,
"learning_rate": 4.9885563198232275e-06,
"loss": 0.5099,
"step": 476
},
{
"epoch": 0.23065764023210833,
"grad_norm": 0.4785735607147217,
"learning_rate": 4.988495252401756e-06,
"loss": 0.5096,
"step": 477
},
{
"epoch": 0.2311411992263056,
"grad_norm": 0.4099835455417633,
"learning_rate": 4.988434022850956e-06,
"loss": 0.4957,
"step": 478
},
{
"epoch": 0.2316247582205029,
"grad_norm": 0.43405771255493164,
"learning_rate": 4.9883726311748165e-06,
"loss": 0.4832,
"step": 479
},
{
"epoch": 0.23210831721470018,
"grad_norm": 0.4160226881504059,
"learning_rate": 4.988311077377337e-06,
"loss": 0.5046,
"step": 480
},
{
"epoch": 0.23259187620889749,
"grad_norm": 0.44942131638526917,
"learning_rate": 4.988249361462528e-06,
"loss": 0.4798,
"step": 481
},
{
"epoch": 0.2330754352030948,
"grad_norm": 0.4407511353492737,
"learning_rate": 4.988187483434411e-06,
"loss": 0.4664,
"step": 482
},
{
"epoch": 0.23355899419729206,
"grad_norm": 0.4470459222793579,
"learning_rate": 4.988125443297017e-06,
"loss": 0.4917,
"step": 483
},
{
"epoch": 0.23404255319148937,
"grad_norm": 0.44523417949676514,
"learning_rate": 4.9880632410543885e-06,
"loss": 0.5103,
"step": 484
},
{
"epoch": 0.23452611218568664,
"grad_norm": 0.7564685344696045,
"learning_rate": 4.988000876710577e-06,
"loss": 0.4999,
"step": 485
},
{
"epoch": 0.23500967117988394,
"grad_norm": 0.5257455706596375,
"learning_rate": 4.987938350269646e-06,
"loss": 0.508,
"step": 486
},
{
"epoch": 0.23549323017408125,
"grad_norm": 0.41185519099235535,
"learning_rate": 4.987875661735669e-06,
"loss": 0.5103,
"step": 487
},
{
"epoch": 0.23597678916827852,
"grad_norm": 0.45921629667282104,
"learning_rate": 4.987812811112731e-06,
"loss": 0.5041,
"step": 488
},
{
"epoch": 0.23646034816247583,
"grad_norm": 0.44184067845344543,
"learning_rate": 4.987749798404927e-06,
"loss": 0.4948,
"step": 489
},
{
"epoch": 0.2369439071566731,
"grad_norm": 0.5813512802124023,
"learning_rate": 4.987686623616361e-06,
"loss": 0.5122,
"step": 490
},
{
"epoch": 0.2374274661508704,
"grad_norm": 0.42440474033355713,
"learning_rate": 4.98762328675115e-06,
"loss": 0.5046,
"step": 491
},
{
"epoch": 0.2379110251450677,
"grad_norm": 0.4219503104686737,
"learning_rate": 4.98755978781342e-06,
"loss": 0.4887,
"step": 492
},
{
"epoch": 0.23839458413926498,
"grad_norm": 0.4938005805015564,
"learning_rate": 4.9874961268073095e-06,
"loss": 0.4896,
"step": 493
},
{
"epoch": 0.23887814313346228,
"grad_norm": 0.47373083233833313,
"learning_rate": 4.987432303736963e-06,
"loss": 0.4957,
"step": 494
},
{
"epoch": 0.2393617021276596,
"grad_norm": 0.41886255145072937,
"learning_rate": 4.987368318606543e-06,
"loss": 0.4895,
"step": 495
},
{
"epoch": 0.23984526112185686,
"grad_norm": 0.41814935207366943,
"learning_rate": 4.987304171420214e-06,
"loss": 0.4957,
"step": 496
},
{
"epoch": 0.24032882011605416,
"grad_norm": 0.49376171827316284,
"learning_rate": 4.987239862182157e-06,
"loss": 0.4962,
"step": 497
},
{
"epoch": 0.24081237911025144,
"grad_norm": 0.4292319715023041,
"learning_rate": 4.987175390896563e-06,
"loss": 0.4968,
"step": 498
},
{
"epoch": 0.24129593810444874,
"grad_norm": 0.42256441712379456,
"learning_rate": 4.987110757567631e-06,
"loss": 0.4762,
"step": 499
},
{
"epoch": 0.24177949709864605,
"grad_norm": 0.42222288250923157,
"learning_rate": 4.987045962199572e-06,
"loss": 0.4998,
"step": 500
},
{
"epoch": 0.24226305609284332,
"grad_norm": 0.6006141304969788,
"learning_rate": 4.986981004796608e-06,
"loss": 0.4924,
"step": 501
},
{
"epoch": 0.24274661508704062,
"grad_norm": 0.4347086548805237,
"learning_rate": 4.986915885362971e-06,
"loss": 0.4808,
"step": 502
},
{
"epoch": 0.2432301740812379,
"grad_norm": 0.4513223171234131,
"learning_rate": 4.986850603902904e-06,
"loss": 0.4821,
"step": 503
},
{
"epoch": 0.2437137330754352,
"grad_norm": 0.45349133014678955,
"learning_rate": 4.986785160420659e-06,
"loss": 0.4844,
"step": 504
},
{
"epoch": 0.2441972920696325,
"grad_norm": 0.43619874119758606,
"learning_rate": 4.986719554920501e-06,
"loss": 0.4996,
"step": 505
},
{
"epoch": 0.24468085106382978,
"grad_norm": 0.43510088324546814,
"learning_rate": 4.986653787406703e-06,
"loss": 0.4749,
"step": 506
},
{
"epoch": 0.24516441005802708,
"grad_norm": 0.420758992433548,
"learning_rate": 4.986587857883551e-06,
"loss": 0.4503,
"step": 507
},
{
"epoch": 0.24564796905222436,
"grad_norm": 0.5028153657913208,
"learning_rate": 4.9865217663553405e-06,
"loss": 0.5111,
"step": 508
},
{
"epoch": 0.24613152804642166,
"grad_norm": 0.5986948609352112,
"learning_rate": 4.986455512826377e-06,
"loss": 0.5032,
"step": 509
},
{
"epoch": 0.24661508704061896,
"grad_norm": 0.6458590030670166,
"learning_rate": 4.986389097300976e-06,
"loss": 0.5118,
"step": 510
},
{
"epoch": 0.24709864603481624,
"grad_norm": 0.4113481640815735,
"learning_rate": 4.9863225197834674e-06,
"loss": 0.4939,
"step": 511
},
{
"epoch": 0.24758220502901354,
"grad_norm": 0.4217630624771118,
"learning_rate": 4.986255780278186e-06,
"loss": 0.4785,
"step": 512
},
{
"epoch": 0.24806576402321084,
"grad_norm": 0.44430986046791077,
"learning_rate": 4.986188878789481e-06,
"loss": 0.4975,
"step": 513
},
{
"epoch": 0.24854932301740812,
"grad_norm": 0.5256595611572266,
"learning_rate": 4.98612181532171e-06,
"loss": 0.4699,
"step": 514
},
{
"epoch": 0.24903288201160542,
"grad_norm": 0.472182035446167,
"learning_rate": 4.9860545898792455e-06,
"loss": 0.5036,
"step": 515
},
{
"epoch": 0.2495164410058027,
"grad_norm": 0.41556984186172485,
"learning_rate": 4.985987202466465e-06,
"loss": 0.4868,
"step": 516
},
{
"epoch": 0.25,
"grad_norm": 0.4145744740962982,
"learning_rate": 4.9859196530877586e-06,
"loss": 0.4791,
"step": 517
},
{
"epoch": 0.2504835589941973,
"grad_norm": 0.4527714252471924,
"learning_rate": 4.985851941747527e-06,
"loss": 0.5082,
"step": 518
},
{
"epoch": 0.2509671179883946,
"grad_norm": 0.4882536828517914,
"learning_rate": 4.985784068450184e-06,
"loss": 0.4883,
"step": 519
},
{
"epoch": 0.2514506769825919,
"grad_norm": 0.4307822585105896,
"learning_rate": 4.985716033200149e-06,
"loss": 0.4953,
"step": 520
},
{
"epoch": 0.25193423597678916,
"grad_norm": 0.4246330261230469,
"learning_rate": 4.985647836001857e-06,
"loss": 0.4944,
"step": 521
},
{
"epoch": 0.2524177949709865,
"grad_norm": 0.4518243968486786,
"learning_rate": 4.9855794768597484e-06,
"loss": 0.4917,
"step": 522
},
{
"epoch": 0.25290135396518376,
"grad_norm": 0.4250892698764801,
"learning_rate": 4.98551095577828e-06,
"loss": 0.4915,
"step": 523
},
{
"epoch": 0.25338491295938104,
"grad_norm": 0.4896804690361023,
"learning_rate": 4.9854422727619135e-06,
"loss": 0.4968,
"step": 524
},
{
"epoch": 0.2538684719535783,
"grad_norm": 0.4480974078178406,
"learning_rate": 4.985373427815125e-06,
"loss": 0.4923,
"step": 525
},
{
"epoch": 0.25435203094777564,
"grad_norm": 0.4182819426059723,
"learning_rate": 4.985304420942399e-06,
"loss": 0.5064,
"step": 526
},
{
"epoch": 0.2548355899419729,
"grad_norm": 0.42555907368659973,
"learning_rate": 4.985235252148233e-06,
"loss": 0.4666,
"step": 527
},
{
"epoch": 0.2553191489361702,
"grad_norm": 0.41366714239120483,
"learning_rate": 4.985165921437131e-06,
"loss": 0.4951,
"step": 528
},
{
"epoch": 0.2558027079303675,
"grad_norm": 0.4588848054409027,
"learning_rate": 4.985096428813613e-06,
"loss": 0.5035,
"step": 529
},
{
"epoch": 0.2562862669245648,
"grad_norm": 0.7335658669471741,
"learning_rate": 4.985026774282205e-06,
"loss": 0.452,
"step": 530
},
{
"epoch": 0.2567698259187621,
"grad_norm": 0.4520464539527893,
"learning_rate": 4.984956957847445e-06,
"loss": 0.506,
"step": 531
},
{
"epoch": 0.2572533849129594,
"grad_norm": 0.45352864265441895,
"learning_rate": 4.98488697951388e-06,
"loss": 0.4864,
"step": 532
},
{
"epoch": 0.2577369439071567,
"grad_norm": 0.4914877414703369,
"learning_rate": 4.984816839286072e-06,
"loss": 0.4866,
"step": 533
},
{
"epoch": 0.25822050290135395,
"grad_norm": 0.47104302048683167,
"learning_rate": 4.98474653716859e-06,
"loss": 0.482,
"step": 534
},
{
"epoch": 0.2587040618955513,
"grad_norm": 0.5308674573898315,
"learning_rate": 4.984676073166014e-06,
"loss": 0.5137,
"step": 535
},
{
"epoch": 0.25918762088974856,
"grad_norm": 0.44214561581611633,
"learning_rate": 4.984605447282934e-06,
"loss": 0.4973,
"step": 536
},
{
"epoch": 0.25967117988394584,
"grad_norm": 1.4595553874969482,
"learning_rate": 4.9845346595239525e-06,
"loss": 0.4951,
"step": 537
},
{
"epoch": 0.2601547388781431,
"grad_norm": 0.44012385606765747,
"learning_rate": 4.984463709893681e-06,
"loss": 0.4766,
"step": 538
},
{
"epoch": 0.26063829787234044,
"grad_norm": 0.5264063477516174,
"learning_rate": 4.984392598396742e-06,
"loss": 0.4601,
"step": 539
},
{
"epoch": 0.2611218568665377,
"grad_norm": 0.4360770881175995,
"learning_rate": 4.984321325037769e-06,
"loss": 0.5059,
"step": 540
},
{
"epoch": 0.261605415860735,
"grad_norm": 0.4725351631641388,
"learning_rate": 4.984249889821406e-06,
"loss": 0.5041,
"step": 541
},
{
"epoch": 0.2620889748549323,
"grad_norm": 0.40023666620254517,
"learning_rate": 4.984178292752305e-06,
"loss": 0.492,
"step": 542
},
{
"epoch": 0.2625725338491296,
"grad_norm": 0.4368029534816742,
"learning_rate": 4.984106533835132e-06,
"loss": 0.493,
"step": 543
},
{
"epoch": 0.26305609284332687,
"grad_norm": 0.457823246717453,
"learning_rate": 4.984034613074563e-06,
"loss": 0.4915,
"step": 544
},
{
"epoch": 0.2635396518375242,
"grad_norm": 0.44186931848526,
"learning_rate": 4.983962530475282e-06,
"loss": 0.5059,
"step": 545
},
{
"epoch": 0.2640232108317215,
"grad_norm": 0.4273228347301483,
"learning_rate": 4.983890286041987e-06,
"loss": 0.5036,
"step": 546
},
{
"epoch": 0.26450676982591875,
"grad_norm": 0.4629921317100525,
"learning_rate": 4.983817879779384e-06,
"loss": 0.4859,
"step": 547
},
{
"epoch": 0.26499032882011603,
"grad_norm": 0.44890058040618896,
"learning_rate": 4.983745311692189e-06,
"loss": 0.4962,
"step": 548
},
{
"epoch": 0.26547388781431336,
"grad_norm": 0.6259036064147949,
"learning_rate": 4.983672581785132e-06,
"loss": 0.4942,
"step": 549
},
{
"epoch": 0.26595744680851063,
"grad_norm": 0.45010271668434143,
"learning_rate": 4.983599690062953e-06,
"loss": 0.5171,
"step": 550
},
{
"epoch": 0.2664410058027079,
"grad_norm": 0.422604501247406,
"learning_rate": 4.983526636530396e-06,
"loss": 0.5138,
"step": 551
},
{
"epoch": 0.26692456479690524,
"grad_norm": 0.4481145739555359,
"learning_rate": 4.983453421192225e-06,
"loss": 0.5012,
"step": 552
},
{
"epoch": 0.2674081237911025,
"grad_norm": 0.4294244349002838,
"learning_rate": 4.983380044053208e-06,
"loss": 0.4903,
"step": 553
},
{
"epoch": 0.2678916827852998,
"grad_norm": 0.43048569560050964,
"learning_rate": 4.983306505118125e-06,
"loss": 0.4893,
"step": 554
},
{
"epoch": 0.2683752417794971,
"grad_norm": 0.43324270844459534,
"learning_rate": 4.98323280439177e-06,
"loss": 0.4919,
"step": 555
},
{
"epoch": 0.2688588007736944,
"grad_norm": 0.4504631459712982,
"learning_rate": 4.9831589418789415e-06,
"loss": 0.4825,
"step": 556
},
{
"epoch": 0.26934235976789167,
"grad_norm": 0.46867018938064575,
"learning_rate": 4.9830849175844544e-06,
"loss": 0.5159,
"step": 557
},
{
"epoch": 0.269825918762089,
"grad_norm": 0.48262712359428406,
"learning_rate": 4.98301073151313e-06,
"loss": 0.5027,
"step": 558
},
{
"epoch": 0.2703094777562863,
"grad_norm": 0.4334803819656372,
"learning_rate": 4.982936383669802e-06,
"loss": 0.5126,
"step": 559
},
{
"epoch": 0.27079303675048355,
"grad_norm": 0.5654119253158569,
"learning_rate": 4.982861874059314e-06,
"loss": 0.4848,
"step": 560
},
{
"epoch": 0.2712765957446808,
"grad_norm": 0.6120291352272034,
"learning_rate": 4.982787202686521e-06,
"loss": 0.5171,
"step": 561
},
{
"epoch": 0.27176015473887816,
"grad_norm": 0.40804538130760193,
"learning_rate": 4.982712369556287e-06,
"loss": 0.5073,
"step": 562
},
{
"epoch": 0.27224371373307543,
"grad_norm": 0.4400192201137543,
"learning_rate": 4.982637374673489e-06,
"loss": 0.486,
"step": 563
},
{
"epoch": 0.2727272727272727,
"grad_norm": 0.41215842962265015,
"learning_rate": 4.982562218043012e-06,
"loss": 0.4954,
"step": 564
},
{
"epoch": 0.27321083172147004,
"grad_norm": 0.44765132665634155,
"learning_rate": 4.9824868996697525e-06,
"loss": 0.4842,
"step": 565
},
{
"epoch": 0.2736943907156673,
"grad_norm": 0.47125014662742615,
"learning_rate": 4.982411419558618e-06,
"loss": 0.5019,
"step": 566
},
{
"epoch": 0.2741779497098646,
"grad_norm": 0.43019676208496094,
"learning_rate": 4.982335777714525e-06,
"loss": 0.4925,
"step": 567
},
{
"epoch": 0.2746615087040619,
"grad_norm": 0.4377966523170471,
"learning_rate": 4.9822599741424044e-06,
"loss": 0.5125,
"step": 568
},
{
"epoch": 0.2751450676982592,
"grad_norm": 0.5210200548171997,
"learning_rate": 4.982184008847192e-06,
"loss": 0.4736,
"step": 569
},
{
"epoch": 0.27562862669245647,
"grad_norm": 0.43145325779914856,
"learning_rate": 4.982107881833839e-06,
"loss": 0.4958,
"step": 570
},
{
"epoch": 0.2761121856866538,
"grad_norm": 0.5334268808364868,
"learning_rate": 4.9820315931073035e-06,
"loss": 0.5099,
"step": 571
},
{
"epoch": 0.2765957446808511,
"grad_norm": 0.46400728821754456,
"learning_rate": 4.981955142672558e-06,
"loss": 0.4918,
"step": 572
},
{
"epoch": 0.27707930367504835,
"grad_norm": 0.47233352065086365,
"learning_rate": 4.981878530534581e-06,
"loss": 0.4936,
"step": 573
},
{
"epoch": 0.2775628626692456,
"grad_norm": 0.46564990282058716,
"learning_rate": 4.9818017566983654e-06,
"loss": 0.4713,
"step": 574
},
{
"epoch": 0.27804642166344296,
"grad_norm": 0.42474499344825745,
"learning_rate": 4.981724821168913e-06,
"loss": 0.4944,
"step": 575
},
{
"epoch": 0.27852998065764023,
"grad_norm": 0.44720014929771423,
"learning_rate": 4.981647723951236e-06,
"loss": 0.513,
"step": 576
},
{
"epoch": 0.2790135396518375,
"grad_norm": 0.4741462767124176,
"learning_rate": 4.981570465050357e-06,
"loss": 0.4652,
"step": 577
},
{
"epoch": 0.27949709864603484,
"grad_norm": 0.4069628417491913,
"learning_rate": 4.9814930444713106e-06,
"loss": 0.4664,
"step": 578
},
{
"epoch": 0.2799806576402321,
"grad_norm": 0.47512662410736084,
"learning_rate": 4.98141546221914e-06,
"loss": 0.4695,
"step": 579
},
{
"epoch": 0.2804642166344294,
"grad_norm": 0.4442213773727417,
"learning_rate": 4.9813377182989e-06,
"loss": 0.4977,
"step": 580
},
{
"epoch": 0.2809477756286267,
"grad_norm": 0.4360348880290985,
"learning_rate": 4.981259812715656e-06,
"loss": 0.4918,
"step": 581
},
{
"epoch": 0.281431334622824,
"grad_norm": 0.4233061373233795,
"learning_rate": 4.981181745474483e-06,
"loss": 0.4818,
"step": 582
},
{
"epoch": 0.28191489361702127,
"grad_norm": 0.4427405893802643,
"learning_rate": 4.9811035165804675e-06,
"loss": 0.4954,
"step": 583
},
{
"epoch": 0.28239845261121854,
"grad_norm": 0.486393541097641,
"learning_rate": 4.981025126038708e-06,
"loss": 0.4938,
"step": 584
},
{
"epoch": 0.2828820116054159,
"grad_norm": 0.4374271035194397,
"learning_rate": 4.9809465738543084e-06,
"loss": 0.492,
"step": 585
},
{
"epoch": 0.28336557059961315,
"grad_norm": 0.458296537399292,
"learning_rate": 4.980867860032389e-06,
"loss": 0.4816,
"step": 586
},
{
"epoch": 0.2838491295938104,
"grad_norm": 0.44356656074523926,
"learning_rate": 4.980788984578077e-06,
"loss": 0.4664,
"step": 587
},
{
"epoch": 0.28433268858800775,
"grad_norm": 0.4328411817550659,
"learning_rate": 4.980709947496512e-06,
"loss": 0.4996,
"step": 588
},
{
"epoch": 0.28481624758220503,
"grad_norm": 0.4631772041320801,
"learning_rate": 4.980630748792843e-06,
"loss": 0.4821,
"step": 589
},
{
"epoch": 0.2852998065764023,
"grad_norm": 0.47806110978126526,
"learning_rate": 4.98055138847223e-06,
"loss": 0.4814,
"step": 590
},
{
"epoch": 0.28578336557059963,
"grad_norm": 0.4524025321006775,
"learning_rate": 4.980471866539843e-06,
"loss": 0.4669,
"step": 591
},
{
"epoch": 0.2862669245647969,
"grad_norm": 0.549818217754364,
"learning_rate": 4.980392183000864e-06,
"loss": 0.4769,
"step": 592
},
{
"epoch": 0.2867504835589942,
"grad_norm": 0.41930127143859863,
"learning_rate": 4.9803123378604836e-06,
"loss": 0.492,
"step": 593
},
{
"epoch": 0.2872340425531915,
"grad_norm": 0.4337260127067566,
"learning_rate": 4.980232331123904e-06,
"loss": 0.4972,
"step": 594
},
{
"epoch": 0.2877176015473888,
"grad_norm": 0.4237891137599945,
"learning_rate": 4.980152162796338e-06,
"loss": 0.4984,
"step": 595
},
{
"epoch": 0.28820116054158607,
"grad_norm": 0.4510140120983124,
"learning_rate": 4.980071832883008e-06,
"loss": 0.49,
"step": 596
},
{
"epoch": 0.28868471953578334,
"grad_norm": 0.420654296875,
"learning_rate": 4.9799913413891485e-06,
"loss": 0.4902,
"step": 597
},
{
"epoch": 0.28916827852998067,
"grad_norm": 0.49798089265823364,
"learning_rate": 4.979910688320004e-06,
"loss": 0.4991,
"step": 598
},
{
"epoch": 0.28965183752417795,
"grad_norm": 0.4498788118362427,
"learning_rate": 4.9798298736808286e-06,
"loss": 0.4903,
"step": 599
},
{
"epoch": 0.2901353965183752,
"grad_norm": 0.5480190515518188,
"learning_rate": 4.979748897476886e-06,
"loss": 0.492,
"step": 600
},
{
"epoch": 0.29061895551257255,
"grad_norm": 0.439396470785141,
"learning_rate": 4.9796677597134546e-06,
"loss": 0.4637,
"step": 601
},
{
"epoch": 0.2911025145067698,
"grad_norm": 0.444767028093338,
"learning_rate": 4.979586460395819e-06,
"loss": 0.4967,
"step": 602
},
{
"epoch": 0.2915860735009671,
"grad_norm": 0.42325684428215027,
"learning_rate": 4.9795049995292765e-06,
"loss": 0.4804,
"step": 603
},
{
"epoch": 0.29206963249516443,
"grad_norm": 0.42194440960884094,
"learning_rate": 4.979423377119134e-06,
"loss": 0.5036,
"step": 604
},
{
"epoch": 0.2925531914893617,
"grad_norm": 0.42215487360954285,
"learning_rate": 4.97934159317071e-06,
"loss": 0.4653,
"step": 605
},
{
"epoch": 0.293036750483559,
"grad_norm": 0.4361760914325714,
"learning_rate": 4.979259647689332e-06,
"loss": 0.4792,
"step": 606
},
{
"epoch": 0.2935203094777563,
"grad_norm": 0.4426310658454895,
"learning_rate": 4.979177540680339e-06,
"loss": 0.478,
"step": 607
},
{
"epoch": 0.2940038684719536,
"grad_norm": 0.4340149462223053,
"learning_rate": 4.979095272149081e-06,
"loss": 0.4987,
"step": 608
},
{
"epoch": 0.29448742746615086,
"grad_norm": 0.4175974726676941,
"learning_rate": 4.979012842100919e-06,
"loss": 0.505,
"step": 609
},
{
"epoch": 0.29497098646034814,
"grad_norm": 0.46888601779937744,
"learning_rate": 4.97893025054122e-06,
"loss": 0.4989,
"step": 610
},
{
"epoch": 0.29545454545454547,
"grad_norm": 0.4319513738155365,
"learning_rate": 4.978847497475369e-06,
"loss": 0.4702,
"step": 611
},
{
"epoch": 0.29593810444874274,
"grad_norm": 0.428959459066391,
"learning_rate": 4.978764582908754e-06,
"loss": 0.4823,
"step": 612
},
{
"epoch": 0.29642166344294,
"grad_norm": 0.4449647068977356,
"learning_rate": 4.97868150684678e-06,
"loss": 0.4938,
"step": 613
},
{
"epoch": 0.29690522243713735,
"grad_norm": 0.45159056782722473,
"learning_rate": 4.978598269294857e-06,
"loss": 0.5026,
"step": 614
},
{
"epoch": 0.2973887814313346,
"grad_norm": 0.44324159622192383,
"learning_rate": 4.978514870258408e-06,
"loss": 0.4788,
"step": 615
},
{
"epoch": 0.2978723404255319,
"grad_norm": 0.425523579120636,
"learning_rate": 4.9784313097428695e-06,
"loss": 0.5136,
"step": 616
},
{
"epoch": 0.29835589941972923,
"grad_norm": 0.4262832701206207,
"learning_rate": 4.978347587753683e-06,
"loss": 0.4714,
"step": 617
},
{
"epoch": 0.2988394584139265,
"grad_norm": 1.2277506589889526,
"learning_rate": 4.978263704296305e-06,
"loss": 0.5021,
"step": 618
},
{
"epoch": 0.2993230174081238,
"grad_norm": 0.883792519569397,
"learning_rate": 4.978179659376199e-06,
"loss": 0.4832,
"step": 619
},
{
"epoch": 0.29980657640232106,
"grad_norm": 0.4646826982498169,
"learning_rate": 4.978095452998841e-06,
"loss": 0.4987,
"step": 620
},
{
"epoch": 0.3002901353965184,
"grad_norm": 0.42909321188926697,
"learning_rate": 4.978011085169717e-06,
"loss": 0.4967,
"step": 621
},
{
"epoch": 0.30077369439071566,
"grad_norm": 0.5661286115646362,
"learning_rate": 4.9779265558943254e-06,
"loss": 0.4777,
"step": 622
},
{
"epoch": 0.30125725338491294,
"grad_norm": 0.4307956397533417,
"learning_rate": 4.977841865178171e-06,
"loss": 0.4998,
"step": 623
},
{
"epoch": 0.30174081237911027,
"grad_norm": 0.44329774379730225,
"learning_rate": 4.977757013026773e-06,
"loss": 0.4828,
"step": 624
},
{
"epoch": 0.30222437137330754,
"grad_norm": 0.40910235047340393,
"learning_rate": 4.977671999445659e-06,
"loss": 0.5033,
"step": 625
},
{
"epoch": 0.3027079303675048,
"grad_norm": 0.41079941391944885,
"learning_rate": 4.977586824440369e-06,
"loss": 0.5052,
"step": 626
},
{
"epoch": 0.30319148936170215,
"grad_norm": 0.46034660935401917,
"learning_rate": 4.977501488016451e-06,
"loss": 0.4866,
"step": 627
},
{
"epoch": 0.3036750483558994,
"grad_norm": 0.5553982853889465,
"learning_rate": 4.977415990179464e-06,
"loss": 0.5019,
"step": 628
},
{
"epoch": 0.3041586073500967,
"grad_norm": 0.46083885431289673,
"learning_rate": 4.977330330934981e-06,
"loss": 0.4858,
"step": 629
},
{
"epoch": 0.30464216634429403,
"grad_norm": 0.4224453270435333,
"learning_rate": 4.97724451028858e-06,
"loss": 0.4655,
"step": 630
},
{
"epoch": 0.3051257253384913,
"grad_norm": 0.4268517792224884,
"learning_rate": 4.977158528245855e-06,
"loss": 0.5089,
"step": 631
},
{
"epoch": 0.3056092843326886,
"grad_norm": 0.585080623626709,
"learning_rate": 4.977072384812406e-06,
"loss": 0.4792,
"step": 632
},
{
"epoch": 0.30609284332688586,
"grad_norm": 0.4740874171257019,
"learning_rate": 4.976986079993845e-06,
"loss": 0.4511,
"step": 633
},
{
"epoch": 0.3065764023210832,
"grad_norm": 0.7237485647201538,
"learning_rate": 4.976899613795797e-06,
"loss": 0.4709,
"step": 634
},
{
"epoch": 0.30705996131528046,
"grad_norm": 0.7460974454879761,
"learning_rate": 4.9768129862238935e-06,
"loss": 0.4724,
"step": 635
},
{
"epoch": 0.30754352030947774,
"grad_norm": 0.5382199287414551,
"learning_rate": 4.976726197283779e-06,
"loss": 0.4808,
"step": 636
},
{
"epoch": 0.30802707930367507,
"grad_norm": 0.42365172505378723,
"learning_rate": 4.976639246981108e-06,
"loss": 0.5015,
"step": 637
},
{
"epoch": 0.30851063829787234,
"grad_norm": 0.4385802447795868,
"learning_rate": 4.976552135321546e-06,
"loss": 0.4921,
"step": 638
},
{
"epoch": 0.3089941972920696,
"grad_norm": 0.4448932111263275,
"learning_rate": 4.976464862310768e-06,
"loss": 0.4604,
"step": 639
},
{
"epoch": 0.30947775628626695,
"grad_norm": 0.4279050827026367,
"learning_rate": 4.9763774279544595e-06,
"loss": 0.4919,
"step": 640
},
{
"epoch": 0.3099613152804642,
"grad_norm": 0.43031272292137146,
"learning_rate": 4.9762898322583184e-06,
"loss": 0.4872,
"step": 641
},
{
"epoch": 0.3104448742746615,
"grad_norm": 0.5989789366722107,
"learning_rate": 4.976202075228049e-06,
"loss": 0.4636,
"step": 642
},
{
"epoch": 0.31092843326885883,
"grad_norm": 0.41690775752067566,
"learning_rate": 4.9761141568693715e-06,
"loss": 0.4862,
"step": 643
},
{
"epoch": 0.3114119922630561,
"grad_norm": 0.5114609599113464,
"learning_rate": 4.976026077188013e-06,
"loss": 0.4757,
"step": 644
},
{
"epoch": 0.3118955512572534,
"grad_norm": 0.4282832741737366,
"learning_rate": 4.975937836189712e-06,
"loss": 0.4971,
"step": 645
},
{
"epoch": 0.31237911025145065,
"grad_norm": 0.42823049426078796,
"learning_rate": 4.975849433880218e-06,
"loss": 0.4859,
"step": 646
},
{
"epoch": 0.312862669245648,
"grad_norm": 0.4191463589668274,
"learning_rate": 4.975760870265289e-06,
"loss": 0.4951,
"step": 647
},
{
"epoch": 0.31334622823984526,
"grad_norm": 0.5171394944190979,
"learning_rate": 4.975672145350696e-06,
"loss": 0.4909,
"step": 648
},
{
"epoch": 0.31382978723404253,
"grad_norm": 0.4636901617050171,
"learning_rate": 4.97558325914222e-06,
"loss": 0.4849,
"step": 649
},
{
"epoch": 0.31431334622823986,
"grad_norm": 0.4889143407344818,
"learning_rate": 4.975494211645652e-06,
"loss": 0.4787,
"step": 650
},
{
"epoch": 0.31479690522243714,
"grad_norm": 0.4157842695713043,
"learning_rate": 4.975405002866793e-06,
"loss": 0.4851,
"step": 651
},
{
"epoch": 0.3152804642166344,
"grad_norm": 0.42362505197525024,
"learning_rate": 4.975315632811456e-06,
"loss": 0.4707,
"step": 652
},
{
"epoch": 0.31576402321083175,
"grad_norm": 0.4421435296535492,
"learning_rate": 4.9752261014854625e-06,
"loss": 0.4927,
"step": 653
},
{
"epoch": 0.316247582205029,
"grad_norm": 0.4442092776298523,
"learning_rate": 4.975136408894646e-06,
"loss": 0.5045,
"step": 654
},
{
"epoch": 0.3167311411992263,
"grad_norm": 0.4613809883594513,
"learning_rate": 4.975046555044851e-06,
"loss": 0.4934,
"step": 655
},
{
"epoch": 0.31721470019342357,
"grad_norm": 0.5220198631286621,
"learning_rate": 4.97495653994193e-06,
"loss": 0.4765,
"step": 656
},
{
"epoch": 0.3176982591876209,
"grad_norm": 0.4627764821052551,
"learning_rate": 4.974866363591749e-06,
"loss": 0.4757,
"step": 657
},
{
"epoch": 0.3181818181818182,
"grad_norm": 0.4076231122016907,
"learning_rate": 4.974776026000182e-06,
"loss": 0.4884,
"step": 658
},
{
"epoch": 0.31866537717601545,
"grad_norm": 0.44560858607292175,
"learning_rate": 4.974685527173116e-06,
"loss": 0.4909,
"step": 659
},
{
"epoch": 0.3191489361702128,
"grad_norm": 0.4903375506401062,
"learning_rate": 4.974594867116446e-06,
"loss": 0.4843,
"step": 660
},
{
"epoch": 0.31963249516441006,
"grad_norm": 0.4852094054222107,
"learning_rate": 4.974504045836079e-06,
"loss": 0.5082,
"step": 661
},
{
"epoch": 0.32011605415860733,
"grad_norm": 0.5490374565124512,
"learning_rate": 4.974413063337932e-06,
"loss": 0.4813,
"step": 662
},
{
"epoch": 0.32059961315280466,
"grad_norm": 0.451930969953537,
"learning_rate": 4.974321919627932e-06,
"loss": 0.4976,
"step": 663
},
{
"epoch": 0.32108317214700194,
"grad_norm": 0.44542908668518066,
"learning_rate": 4.97423061471202e-06,
"loss": 0.5068,
"step": 664
},
{
"epoch": 0.3215667311411992,
"grad_norm": 0.4401024281978607,
"learning_rate": 4.974139148596141e-06,
"loss": 0.4887,
"step": 665
},
{
"epoch": 0.32205029013539654,
"grad_norm": 0.4341893196105957,
"learning_rate": 4.9740475212862565e-06,
"loss": 0.4587,
"step": 666
},
{
"epoch": 0.3225338491295938,
"grad_norm": 0.44072195887565613,
"learning_rate": 4.973955732788335e-06,
"loss": 0.4919,
"step": 667
},
{
"epoch": 0.3230174081237911,
"grad_norm": 0.4512384831905365,
"learning_rate": 4.973863783108358e-06,
"loss": 0.5007,
"step": 668
},
{
"epoch": 0.32350096711798837,
"grad_norm": 0.42298397421836853,
"learning_rate": 4.9737716722523145e-06,
"loss": 0.4855,
"step": 669
},
{
"epoch": 0.3239845261121857,
"grad_norm": 0.43051955103874207,
"learning_rate": 4.973679400226207e-06,
"loss": 0.4746,
"step": 670
},
{
"epoch": 0.324468085106383,
"grad_norm": 0.43127870559692383,
"learning_rate": 4.973586967036046e-06,
"loss": 0.4682,
"step": 671
},
{
"epoch": 0.32495164410058025,
"grad_norm": 0.4191921651363373,
"learning_rate": 4.9734943726878545e-06,
"loss": 0.4799,
"step": 672
},
{
"epoch": 0.3254352030947776,
"grad_norm": 0.48285409808158875,
"learning_rate": 4.973401617187664e-06,
"loss": 0.4793,
"step": 673
},
{
"epoch": 0.32591876208897486,
"grad_norm": 0.4482521712779999,
"learning_rate": 4.97330870054152e-06,
"loss": 0.4866,
"step": 674
},
{
"epoch": 0.32640232108317213,
"grad_norm": 0.4234021008014679,
"learning_rate": 4.973215622755474e-06,
"loss": 0.4829,
"step": 675
},
{
"epoch": 0.32688588007736946,
"grad_norm": 0.8298510313034058,
"learning_rate": 4.9731223838355915e-06,
"loss": 0.4926,
"step": 676
},
{
"epoch": 0.32736943907156674,
"grad_norm": 0.4476756453514099,
"learning_rate": 4.973028983787947e-06,
"loss": 0.4739,
"step": 677
},
{
"epoch": 0.327852998065764,
"grad_norm": 0.39939942955970764,
"learning_rate": 4.972935422618624e-06,
"loss": 0.4431,
"step": 678
},
{
"epoch": 0.32833655705996134,
"grad_norm": 0.47416552901268005,
"learning_rate": 4.97284170033372e-06,
"loss": 0.4987,
"step": 679
},
{
"epoch": 0.3288201160541586,
"grad_norm": 0.4256126284599304,
"learning_rate": 4.9727478169393406e-06,
"loss": 0.4667,
"step": 680
},
{
"epoch": 0.3293036750483559,
"grad_norm": 0.4688650965690613,
"learning_rate": 4.972653772441602e-06,
"loss": 0.4957,
"step": 681
},
{
"epoch": 0.32978723404255317,
"grad_norm": 0.4217233955860138,
"learning_rate": 4.972559566846632e-06,
"loss": 0.4688,
"step": 682
},
{
"epoch": 0.3302707930367505,
"grad_norm": 0.4340645968914032,
"learning_rate": 4.972465200160568e-06,
"loss": 0.4859,
"step": 683
},
{
"epoch": 0.3307543520309478,
"grad_norm": 0.41588684916496277,
"learning_rate": 4.9723706723895584e-06,
"loss": 0.5181,
"step": 684
},
{
"epoch": 0.33123791102514505,
"grad_norm": 0.4644707143306732,
"learning_rate": 4.972275983539761e-06,
"loss": 0.4681,
"step": 685
},
{
"epoch": 0.3317214700193424,
"grad_norm": 0.5474352836608887,
"learning_rate": 4.972181133617345e-06,
"loss": 0.4864,
"step": 686
},
{
"epoch": 0.33220502901353965,
"grad_norm": 0.4336998760700226,
"learning_rate": 4.972086122628492e-06,
"loss": 0.4926,
"step": 687
},
{
"epoch": 0.33268858800773693,
"grad_norm": 0.42557400465011597,
"learning_rate": 4.97199095057939e-06,
"loss": 0.4817,
"step": 688
},
{
"epoch": 0.33317214700193426,
"grad_norm": 0.4498555362224579,
"learning_rate": 4.97189561747624e-06,
"loss": 0.4735,
"step": 689
},
{
"epoch": 0.33365570599613154,
"grad_norm": 0.4151577651500702,
"learning_rate": 4.971800123325253e-06,
"loss": 0.4756,
"step": 690
},
{
"epoch": 0.3341392649903288,
"grad_norm": 0.4264872968196869,
"learning_rate": 4.971704468132651e-06,
"loss": 0.4897,
"step": 691
},
{
"epoch": 0.3346228239845261,
"grad_norm": 0.40942180156707764,
"learning_rate": 4.971608651904667e-06,
"loss": 0.4798,
"step": 692
},
{
"epoch": 0.3351063829787234,
"grad_norm": 0.42875564098358154,
"learning_rate": 4.971512674647542e-06,
"loss": 0.487,
"step": 693
},
{
"epoch": 0.3355899419729207,
"grad_norm": 0.505617082118988,
"learning_rate": 4.9714165363675295e-06,
"loss": 0.474,
"step": 694
},
{
"epoch": 0.33607350096711797,
"grad_norm": 0.41302382946014404,
"learning_rate": 4.971320237070893e-06,
"loss": 0.4811,
"step": 695
},
{
"epoch": 0.3365570599613153,
"grad_norm": 0.4467024505138397,
"learning_rate": 4.9712237767639075e-06,
"loss": 0.4835,
"step": 696
},
{
"epoch": 0.33704061895551257,
"grad_norm": 0.42488452792167664,
"learning_rate": 4.971127155452856e-06,
"loss": 0.4899,
"step": 697
},
{
"epoch": 0.33752417794970985,
"grad_norm": 0.936638355255127,
"learning_rate": 4.971030373144035e-06,
"loss": 0.4954,
"step": 698
},
{
"epoch": 0.3380077369439072,
"grad_norm": 0.43264010548591614,
"learning_rate": 4.970933429843748e-06,
"loss": 0.5001,
"step": 699
},
{
"epoch": 0.33849129593810445,
"grad_norm": 0.4461173415184021,
"learning_rate": 4.970836325558314e-06,
"loss": 0.4979,
"step": 700
},
{
"epoch": 0.33897485493230173,
"grad_norm": 0.4384688138961792,
"learning_rate": 4.970739060294056e-06,
"loss": 0.5053,
"step": 701
},
{
"epoch": 0.33945841392649906,
"grad_norm": 0.44660210609436035,
"learning_rate": 4.970641634057314e-06,
"loss": 0.4922,
"step": 702
},
{
"epoch": 0.33994197292069633,
"grad_norm": 0.6774649620056152,
"learning_rate": 4.970544046854434e-06,
"loss": 0.4641,
"step": 703
},
{
"epoch": 0.3404255319148936,
"grad_norm": 0.4222927987575531,
"learning_rate": 4.970446298691775e-06,
"loss": 0.482,
"step": 704
},
{
"epoch": 0.3409090909090909,
"grad_norm": 0.4312184751033783,
"learning_rate": 4.970348389575704e-06,
"loss": 0.4799,
"step": 705
},
{
"epoch": 0.3413926499032882,
"grad_norm": 0.44325292110443115,
"learning_rate": 4.970250319512601e-06,
"loss": 0.474,
"step": 706
},
{
"epoch": 0.3418762088974855,
"grad_norm": 0.43495866656303406,
"learning_rate": 4.970152088508854e-06,
"loss": 0.4713,
"step": 707
},
{
"epoch": 0.34235976789168276,
"grad_norm": 0.41411668062210083,
"learning_rate": 4.970053696570865e-06,
"loss": 0.4958,
"step": 708
},
{
"epoch": 0.3428433268858801,
"grad_norm": 0.4402662515640259,
"learning_rate": 4.969955143705043e-06,
"loss": 0.4682,
"step": 709
},
{
"epoch": 0.34332688588007737,
"grad_norm": 0.5128571391105652,
"learning_rate": 4.96985642991781e-06,
"loss": 0.4667,
"step": 710
},
{
"epoch": 0.34381044487427465,
"grad_norm": 0.45290181040763855,
"learning_rate": 4.969757555215595e-06,
"loss": 0.4809,
"step": 711
},
{
"epoch": 0.344294003868472,
"grad_norm": 0.4683387875556946,
"learning_rate": 4.9696585196048425e-06,
"loss": 0.4798,
"step": 712
},
{
"epoch": 0.34477756286266925,
"grad_norm": 0.4333076775074005,
"learning_rate": 4.969559323092004e-06,
"loss": 0.4693,
"step": 713
},
{
"epoch": 0.3452611218568665,
"grad_norm": 0.4301561117172241,
"learning_rate": 4.96945996568354e-06,
"loss": 0.4771,
"step": 714
},
{
"epoch": 0.34574468085106386,
"grad_norm": 0.41485023498535156,
"learning_rate": 4.969360447385928e-06,
"loss": 0.4696,
"step": 715
},
{
"epoch": 0.34622823984526113,
"grad_norm": 0.4785935580730438,
"learning_rate": 4.969260768205649e-06,
"loss": 0.4836,
"step": 716
},
{
"epoch": 0.3467117988394584,
"grad_norm": 0.4168457090854645,
"learning_rate": 4.969160928149197e-06,
"loss": 0.4871,
"step": 717
},
{
"epoch": 0.3471953578336557,
"grad_norm": 0.5240389704704285,
"learning_rate": 4.969060927223079e-06,
"loss": 0.4856,
"step": 718
},
{
"epoch": 0.347678916827853,
"grad_norm": 0.43695124983787537,
"learning_rate": 4.968960765433808e-06,
"loss": 0.4732,
"step": 719
},
{
"epoch": 0.3481624758220503,
"grad_norm": 0.4459145963191986,
"learning_rate": 4.96886044278791e-06,
"loss": 0.474,
"step": 720
},
{
"epoch": 0.34864603481624756,
"grad_norm": 0.6060423851013184,
"learning_rate": 4.968759959291922e-06,
"loss": 0.4582,
"step": 721
},
{
"epoch": 0.3491295938104449,
"grad_norm": 0.45408162474632263,
"learning_rate": 4.968659314952391e-06,
"loss": 0.4875,
"step": 722
},
{
"epoch": 0.34961315280464217,
"grad_norm": 0.4357450306415558,
"learning_rate": 4.968558509775872e-06,
"loss": 0.4809,
"step": 723
},
{
"epoch": 0.35009671179883944,
"grad_norm": 0.4248330891132355,
"learning_rate": 4.9684575437689354e-06,
"loss": 0.4638,
"step": 724
},
{
"epoch": 0.3505802707930368,
"grad_norm": 0.41638749837875366,
"learning_rate": 4.968356416938158e-06,
"loss": 0.4607,
"step": 725
},
{
"epoch": 0.35106382978723405,
"grad_norm": 0.42239511013031006,
"learning_rate": 4.968255129290127e-06,
"loss": 0.4682,
"step": 726
},
{
"epoch": 0.3515473887814313,
"grad_norm": 0.4688323438167572,
"learning_rate": 4.968153680831444e-06,
"loss": 0.4909,
"step": 727
},
{
"epoch": 0.3520309477756286,
"grad_norm": 0.4212821125984192,
"learning_rate": 4.968052071568717e-06,
"loss": 0.4799,
"step": 728
},
{
"epoch": 0.35251450676982593,
"grad_norm": 0.4717614948749542,
"learning_rate": 4.967950301508566e-06,
"loss": 0.488,
"step": 729
},
{
"epoch": 0.3529980657640232,
"grad_norm": 0.4233686625957489,
"learning_rate": 4.967848370657622e-06,
"loss": 0.5076,
"step": 730
},
{
"epoch": 0.3534816247582205,
"grad_norm": 0.437971830368042,
"learning_rate": 4.967746279022526e-06,
"loss": 0.4656,
"step": 731
},
{
"epoch": 0.3539651837524178,
"grad_norm": 0.557462215423584,
"learning_rate": 4.967644026609929e-06,
"loss": 0.4708,
"step": 732
},
{
"epoch": 0.3544487427466151,
"grad_norm": 0.4785339832305908,
"learning_rate": 4.967541613426493e-06,
"loss": 0.4702,
"step": 733
},
{
"epoch": 0.35493230174081236,
"grad_norm": 0.446857213973999,
"learning_rate": 4.96743903947889e-06,
"loss": 0.442,
"step": 734
},
{
"epoch": 0.3554158607350097,
"grad_norm": 0.4136951267719269,
"learning_rate": 4.967336304773805e-06,
"loss": 0.4767,
"step": 735
},
{
"epoch": 0.35589941972920697,
"grad_norm": 0.4243564009666443,
"learning_rate": 4.967233409317928e-06,
"loss": 0.4739,
"step": 736
},
{
"epoch": 0.35638297872340424,
"grad_norm": 0.43003565073013306,
"learning_rate": 4.9671303531179635e-06,
"loss": 0.4597,
"step": 737
},
{
"epoch": 0.3568665377176016,
"grad_norm": 0.48309555649757385,
"learning_rate": 4.967027136180629e-06,
"loss": 0.478,
"step": 738
},
{
"epoch": 0.35735009671179885,
"grad_norm": 0.510793149471283,
"learning_rate": 4.966923758512645e-06,
"loss": 0.5017,
"step": 739
},
{
"epoch": 0.3578336557059961,
"grad_norm": 0.5192124843597412,
"learning_rate": 4.96682022012075e-06,
"loss": 0.5028,
"step": 740
},
{
"epoch": 0.3583172147001934,
"grad_norm": 0.6904450058937073,
"learning_rate": 4.966716521011688e-06,
"loss": 0.4749,
"step": 741
},
{
"epoch": 0.35880077369439073,
"grad_norm": 0.4111625552177429,
"learning_rate": 4.966612661192215e-06,
"loss": 0.4671,
"step": 742
},
{
"epoch": 0.359284332688588,
"grad_norm": 0.4130711555480957,
"learning_rate": 4.966508640669099e-06,
"loss": 0.5011,
"step": 743
},
{
"epoch": 0.3597678916827853,
"grad_norm": 0.469009667634964,
"learning_rate": 4.966404459449115e-06,
"loss": 0.4961,
"step": 744
},
{
"epoch": 0.3602514506769826,
"grad_norm": 0.4455870985984802,
"learning_rate": 4.966300117539052e-06,
"loss": 0.485,
"step": 745
},
{
"epoch": 0.3607350096711799,
"grad_norm": 0.40770214796066284,
"learning_rate": 4.966195614945709e-06,
"loss": 0.4699,
"step": 746
},
{
"epoch": 0.36121856866537716,
"grad_norm": 0.42602604627609253,
"learning_rate": 4.966090951675893e-06,
"loss": 0.4738,
"step": 747
},
{
"epoch": 0.3617021276595745,
"grad_norm": 0.4325065612792969,
"learning_rate": 4.965986127736423e-06,
"loss": 0.475,
"step": 748
},
{
"epoch": 0.36218568665377177,
"grad_norm": 0.43184852600097656,
"learning_rate": 4.965881143134128e-06,
"loss": 0.4738,
"step": 749
},
{
"epoch": 0.36266924564796904,
"grad_norm": 0.4798991084098816,
"learning_rate": 4.96577599787585e-06,
"loss": 0.4841,
"step": 750
},
{
"epoch": 0.36315280464216637,
"grad_norm": 0.4550189971923828,
"learning_rate": 4.965670691968438e-06,
"loss": 0.4663,
"step": 751
},
{
"epoch": 0.36363636363636365,
"grad_norm": 0.42557281255722046,
"learning_rate": 4.965565225418752e-06,
"loss": 0.4691,
"step": 752
},
{
"epoch": 0.3641199226305609,
"grad_norm": 0.43087345361709595,
"learning_rate": 4.965459598233664e-06,
"loss": 0.5025,
"step": 753
},
{
"epoch": 0.3646034816247582,
"grad_norm": 0.4292849600315094,
"learning_rate": 4.965353810420056e-06,
"loss": 0.4882,
"step": 754
},
{
"epoch": 0.3650870406189555,
"grad_norm": 1.3346878290176392,
"learning_rate": 4.965247861984821e-06,
"loss": 0.4776,
"step": 755
},
{
"epoch": 0.3655705996131528,
"grad_norm": 0.4174513518810272,
"learning_rate": 4.96514175293486e-06,
"loss": 0.4475,
"step": 756
},
{
"epoch": 0.3660541586073501,
"grad_norm": 0.4157787263393402,
"learning_rate": 4.965035483277088e-06,
"loss": 0.4717,
"step": 757
},
{
"epoch": 0.3665377176015474,
"grad_norm": 0.4442564845085144,
"learning_rate": 4.964929053018427e-06,
"loss": 0.4771,
"step": 758
},
{
"epoch": 0.3670212765957447,
"grad_norm": 0.4338626265525818,
"learning_rate": 4.9648224621658125e-06,
"loss": 0.4714,
"step": 759
},
{
"epoch": 0.36750483558994196,
"grad_norm": 0.44122618436813354,
"learning_rate": 4.964715710726188e-06,
"loss": 0.4595,
"step": 760
},
{
"epoch": 0.3679883945841393,
"grad_norm": 0.42671939730644226,
"learning_rate": 4.964608798706508e-06,
"loss": 0.4661,
"step": 761
},
{
"epoch": 0.36847195357833656,
"grad_norm": 0.41113489866256714,
"learning_rate": 4.964501726113741e-06,
"loss": 0.4914,
"step": 762
},
{
"epoch": 0.36895551257253384,
"grad_norm": 0.5205186605453491,
"learning_rate": 4.96439449295486e-06,
"loss": 0.4791,
"step": 763
},
{
"epoch": 0.3694390715667311,
"grad_norm": 0.435266375541687,
"learning_rate": 4.964287099236851e-06,
"loss": 0.4691,
"step": 764
},
{
"epoch": 0.36992263056092844,
"grad_norm": 0.41085487604141235,
"learning_rate": 4.964179544966713e-06,
"loss": 0.4728,
"step": 765
},
{
"epoch": 0.3704061895551257,
"grad_norm": 0.4392455518245697,
"learning_rate": 4.964071830151452e-06,
"loss": 0.4933,
"step": 766
},
{
"epoch": 0.370889748549323,
"grad_norm": 0.4227438271045685,
"learning_rate": 4.963963954798087e-06,
"loss": 0.5065,
"step": 767
},
{
"epoch": 0.3713733075435203,
"grad_norm": 0.4845467805862427,
"learning_rate": 4.963855918913645e-06,
"loss": 0.4714,
"step": 768
},
{
"epoch": 0.3718568665377176,
"grad_norm": 0.45647627115249634,
"learning_rate": 4.963747722505164e-06,
"loss": 0.498,
"step": 769
},
{
"epoch": 0.3723404255319149,
"grad_norm": 0.46499380469322205,
"learning_rate": 4.963639365579696e-06,
"loss": 0.4687,
"step": 770
},
{
"epoch": 0.3728239845261122,
"grad_norm": 0.4408628046512604,
"learning_rate": 4.963530848144298e-06,
"loss": 0.4769,
"step": 771
},
{
"epoch": 0.3733075435203095,
"grad_norm": 0.4910852015018463,
"learning_rate": 4.963422170206042e-06,
"loss": 0.4826,
"step": 772
},
{
"epoch": 0.37379110251450676,
"grad_norm": 0.4329937696456909,
"learning_rate": 4.963313331772008e-06,
"loss": 0.4635,
"step": 773
},
{
"epoch": 0.3742746615087041,
"grad_norm": 0.49897655844688416,
"learning_rate": 4.963204332849285e-06,
"loss": 0.4716,
"step": 774
},
{
"epoch": 0.37475822050290136,
"grad_norm": 0.4166103005409241,
"learning_rate": 4.963095173444976e-06,
"loss": 0.4552,
"step": 775
},
{
"epoch": 0.37524177949709864,
"grad_norm": 0.4559236168861389,
"learning_rate": 4.962985853566193e-06,
"loss": 0.4768,
"step": 776
},
{
"epoch": 0.3757253384912959,
"grad_norm": 0.49545004963874817,
"learning_rate": 4.962876373220059e-06,
"loss": 0.4615,
"step": 777
},
{
"epoch": 0.37620889748549324,
"grad_norm": 0.4941776394844055,
"learning_rate": 4.962766732413706e-06,
"loss": 0.4799,
"step": 778
},
{
"epoch": 0.3766924564796905,
"grad_norm": 0.6698254942893982,
"learning_rate": 4.962656931154277e-06,
"loss": 0.4769,
"step": 779
},
{
"epoch": 0.3771760154738878,
"grad_norm": 0.48517316579818726,
"learning_rate": 4.9625469694489266e-06,
"loss": 0.4738,
"step": 780
},
{
"epoch": 0.3776595744680851,
"grad_norm": 0.44697442650794983,
"learning_rate": 4.962436847304818e-06,
"loss": 0.4798,
"step": 781
},
{
"epoch": 0.3781431334622824,
"grad_norm": 0.5347298979759216,
"learning_rate": 4.962326564729126e-06,
"loss": 0.4625,
"step": 782
},
{
"epoch": 0.3786266924564797,
"grad_norm": 0.42385363578796387,
"learning_rate": 4.962216121729036e-06,
"loss": 0.4896,
"step": 783
},
{
"epoch": 0.379110251450677,
"grad_norm": 0.4330436885356903,
"learning_rate": 4.962105518311745e-06,
"loss": 0.4584,
"step": 784
},
{
"epoch": 0.3795938104448743,
"grad_norm": 0.42593157291412354,
"learning_rate": 4.961994754484456e-06,
"loss": 0.4726,
"step": 785
},
{
"epoch": 0.38007736943907156,
"grad_norm": 0.49842873215675354,
"learning_rate": 4.961883830254387e-06,
"loss": 0.4738,
"step": 786
},
{
"epoch": 0.3805609284332689,
"grad_norm": 0.46715858578681946,
"learning_rate": 4.961772745628765e-06,
"loss": 0.4809,
"step": 787
},
{
"epoch": 0.38104448742746616,
"grad_norm": 0.4344216287136078,
"learning_rate": 4.961661500614827e-06,
"loss": 0.4725,
"step": 788
},
{
"epoch": 0.38152804642166344,
"grad_norm": 0.4901852607727051,
"learning_rate": 4.961550095219821e-06,
"loss": 0.4622,
"step": 789
},
{
"epoch": 0.3820116054158607,
"grad_norm": 0.42893216013908386,
"learning_rate": 4.961438529451005e-06,
"loss": 0.4947,
"step": 790
},
{
"epoch": 0.38249516441005804,
"grad_norm": 0.42825618386268616,
"learning_rate": 4.961326803315648e-06,
"loss": 0.4694,
"step": 791
},
{
"epoch": 0.3829787234042553,
"grad_norm": 0.42393574118614197,
"learning_rate": 4.961214916821029e-06,
"loss": 0.4777,
"step": 792
},
{
"epoch": 0.3834622823984526,
"grad_norm": 0.4666205942630768,
"learning_rate": 4.961102869974438e-06,
"loss": 0.4588,
"step": 793
},
{
"epoch": 0.3839458413926499,
"grad_norm": 0.43628597259521484,
"learning_rate": 4.960990662783174e-06,
"loss": 0.4456,
"step": 794
},
{
"epoch": 0.3844294003868472,
"grad_norm": 0.43910062313079834,
"learning_rate": 4.960878295254549e-06,
"loss": 0.4581,
"step": 795
},
{
"epoch": 0.3849129593810445,
"grad_norm": 0.4191967248916626,
"learning_rate": 4.960765767395881e-06,
"loss": 0.4753,
"step": 796
},
{
"epoch": 0.3853965183752418,
"grad_norm": 0.4429578483104706,
"learning_rate": 4.960653079214505e-06,
"loss": 0.4614,
"step": 797
},
{
"epoch": 0.3858800773694391,
"grad_norm": 0.5809339880943298,
"learning_rate": 4.960540230717761e-06,
"loss": 0.4701,
"step": 798
},
{
"epoch": 0.38636363636363635,
"grad_norm": 0.48615169525146484,
"learning_rate": 4.960427221913e-06,
"loss": 0.4688,
"step": 799
},
{
"epoch": 0.38684719535783363,
"grad_norm": 0.4294247329235077,
"learning_rate": 4.960314052807588e-06,
"loss": 0.4804,
"step": 800
},
{
"epoch": 0.38733075435203096,
"grad_norm": 0.46103084087371826,
"learning_rate": 4.960200723408895e-06,
"loss": 0.482,
"step": 801
},
{
"epoch": 0.38781431334622823,
"grad_norm": 0.45017433166503906,
"learning_rate": 4.960087233724306e-06,
"loss": 0.4706,
"step": 802
},
{
"epoch": 0.3882978723404255,
"grad_norm": 0.42244574427604675,
"learning_rate": 4.959973583761215e-06,
"loss": 0.478,
"step": 803
},
{
"epoch": 0.38878143133462284,
"grad_norm": 0.44186943769454956,
"learning_rate": 4.959859773527027e-06,
"loss": 0.4984,
"step": 804
},
{
"epoch": 0.3892649903288201,
"grad_norm": 0.4306003153324127,
"learning_rate": 4.959745803029155e-06,
"loss": 0.4918,
"step": 805
},
{
"epoch": 0.3897485493230174,
"grad_norm": 0.45696625113487244,
"learning_rate": 4.959631672275026e-06,
"loss": 0.4896,
"step": 806
},
{
"epoch": 0.3902321083172147,
"grad_norm": 0.4278884828090668,
"learning_rate": 4.959517381272075e-06,
"loss": 0.4728,
"step": 807
},
{
"epoch": 0.390715667311412,
"grad_norm": 0.44117751717567444,
"learning_rate": 4.95940293002775e-06,
"loss": 0.4902,
"step": 808
},
{
"epoch": 0.39119922630560927,
"grad_norm": 0.4210093319416046,
"learning_rate": 4.959288318549505e-06,
"loss": 0.4642,
"step": 809
},
{
"epoch": 0.3916827852998066,
"grad_norm": 0.4378984868526459,
"learning_rate": 4.959173546844809e-06,
"loss": 0.4886,
"step": 810
},
{
"epoch": 0.3921663442940039,
"grad_norm": 0.42724746465682983,
"learning_rate": 4.959058614921139e-06,
"loss": 0.4938,
"step": 811
},
{
"epoch": 0.39264990328820115,
"grad_norm": 0.4462886452674866,
"learning_rate": 4.958943522785982e-06,
"loss": 0.4676,
"step": 812
},
{
"epoch": 0.3931334622823984,
"grad_norm": 0.4206470251083374,
"learning_rate": 4.95882827044684e-06,
"loss": 0.4939,
"step": 813
},
{
"epoch": 0.39361702127659576,
"grad_norm": 0.5013949871063232,
"learning_rate": 4.958712857911217e-06,
"loss": 0.4937,
"step": 814
},
{
"epoch": 0.39410058027079303,
"grad_norm": 0.5348606109619141,
"learning_rate": 4.958597285186635e-06,
"loss": 0.4844,
"step": 815
},
{
"epoch": 0.3945841392649903,
"grad_norm": 0.41226282715797424,
"learning_rate": 4.958481552280623e-06,
"loss": 0.4726,
"step": 816
},
{
"epoch": 0.39506769825918764,
"grad_norm": 0.4058435261249542,
"learning_rate": 4.958365659200722e-06,
"loss": 0.4628,
"step": 817
},
{
"epoch": 0.3955512572533849,
"grad_norm": 0.4504850506782532,
"learning_rate": 4.958249605954481e-06,
"loss": 0.4756,
"step": 818
},
{
"epoch": 0.3960348162475822,
"grad_norm": 0.42663368582725525,
"learning_rate": 4.9581333925494635e-06,
"loss": 0.484,
"step": 819
},
{
"epoch": 0.3965183752417795,
"grad_norm": 0.43051379919052124,
"learning_rate": 4.95801701899324e-06,
"loss": 0.4838,
"step": 820
},
{
"epoch": 0.3970019342359768,
"grad_norm": 0.44090908765792847,
"learning_rate": 4.9579004852933906e-06,
"loss": 0.4673,
"step": 821
},
{
"epoch": 0.39748549323017407,
"grad_norm": 0.4541039764881134,
"learning_rate": 4.95778379145751e-06,
"loss": 0.4829,
"step": 822
},
{
"epoch": 0.3979690522243714,
"grad_norm": 0.46486979722976685,
"learning_rate": 4.9576669374932e-06,
"loss": 0.4913,
"step": 823
},
{
"epoch": 0.3984526112185687,
"grad_norm": 0.4304236173629761,
"learning_rate": 4.957549923408074e-06,
"loss": 0.4571,
"step": 824
},
{
"epoch": 0.39893617021276595,
"grad_norm": 0.44261449575424194,
"learning_rate": 4.957432749209755e-06,
"loss": 0.4771,
"step": 825
},
{
"epoch": 0.3994197292069632,
"grad_norm": 0.5330602526664734,
"learning_rate": 4.957315414905877e-06,
"loss": 0.4721,
"step": 826
},
{
"epoch": 0.39990328820116056,
"grad_norm": 0.4316442012786865,
"learning_rate": 4.957197920504087e-06,
"loss": 0.4949,
"step": 827
},
{
"epoch": 0.40038684719535783,
"grad_norm": 0.42876294255256653,
"learning_rate": 4.957080266012037e-06,
"loss": 0.4501,
"step": 828
},
{
"epoch": 0.4008704061895551,
"grad_norm": 0.41960573196411133,
"learning_rate": 4.956962451437394e-06,
"loss": 0.4719,
"step": 829
},
{
"epoch": 0.40135396518375244,
"grad_norm": 0.44260984659194946,
"learning_rate": 4.9568444767878335e-06,
"loss": 0.4618,
"step": 830
},
{
"epoch": 0.4018375241779497,
"grad_norm": 0.44216662645339966,
"learning_rate": 4.95672634207104e-06,
"loss": 0.485,
"step": 831
},
{
"epoch": 0.402321083172147,
"grad_norm": 0.46946024894714355,
"learning_rate": 4.9566080472947134e-06,
"loss": 0.4676,
"step": 832
},
{
"epoch": 0.4028046421663443,
"grad_norm": 0.45229020714759827,
"learning_rate": 4.956489592466558e-06,
"loss": 0.4743,
"step": 833
},
{
"epoch": 0.4032882011605416,
"grad_norm": 0.45127013325691223,
"learning_rate": 4.9563709775942925e-06,
"loss": 0.4749,
"step": 834
},
{
"epoch": 0.40377176015473887,
"grad_norm": 0.44455087184906006,
"learning_rate": 4.956252202685645e-06,
"loss": 0.4852,
"step": 835
},
{
"epoch": 0.40425531914893614,
"grad_norm": 0.5142653584480286,
"learning_rate": 4.956133267748353e-06,
"loss": 0.4766,
"step": 836
},
{
"epoch": 0.4047388781431335,
"grad_norm": 0.43196821212768555,
"learning_rate": 4.956014172790166e-06,
"loss": 0.4726,
"step": 837
},
{
"epoch": 0.40522243713733075,
"grad_norm": 0.42082950472831726,
"learning_rate": 4.955894917818844e-06,
"loss": 0.4927,
"step": 838
},
{
"epoch": 0.405705996131528,
"grad_norm": 0.4351148009300232,
"learning_rate": 4.955775502842155e-06,
"loss": 0.47,
"step": 839
},
{
"epoch": 0.40618955512572535,
"grad_norm": 0.45302635431289673,
"learning_rate": 4.95565592786788e-06,
"loss": 0.4632,
"step": 840
},
{
"epoch": 0.40667311411992263,
"grad_norm": 0.45961737632751465,
"learning_rate": 4.955536192903809e-06,
"loss": 0.4802,
"step": 841
},
{
"epoch": 0.4071566731141199,
"grad_norm": 0.4482019543647766,
"learning_rate": 4.955416297957744e-06,
"loss": 0.4748,
"step": 842
},
{
"epoch": 0.40764023210831724,
"grad_norm": 0.44860005378723145,
"learning_rate": 4.955296243037494e-06,
"loss": 0.4775,
"step": 843
},
{
"epoch": 0.4081237911025145,
"grad_norm": 0.44915902614593506,
"learning_rate": 4.955176028150884e-06,
"loss": 0.481,
"step": 844
},
{
"epoch": 0.4086073500967118,
"grad_norm": 0.4330085813999176,
"learning_rate": 4.9550556533057435e-06,
"loss": 0.4956,
"step": 845
},
{
"epoch": 0.4090909090909091,
"grad_norm": 0.48874109983444214,
"learning_rate": 4.954935118509917e-06,
"loss": 0.4811,
"step": 846
},
{
"epoch": 0.4095744680851064,
"grad_norm": 0.4293757379055023,
"learning_rate": 4.9548144237712556e-06,
"loss": 0.4752,
"step": 847
},
{
"epoch": 0.41005802707930367,
"grad_norm": 0.4687563180923462,
"learning_rate": 4.954693569097625e-06,
"loss": 0.4645,
"step": 848
},
{
"epoch": 0.41054158607350094,
"grad_norm": 0.45286861062049866,
"learning_rate": 4.954572554496897e-06,
"loss": 0.4757,
"step": 849
},
{
"epoch": 0.41102514506769827,
"grad_norm": 0.4203643202781677,
"learning_rate": 4.9544513799769564e-06,
"loss": 0.4696,
"step": 850
},
{
"epoch": 0.41150870406189555,
"grad_norm": 0.4808759093284607,
"learning_rate": 4.954330045545699e-06,
"loss": 0.4771,
"step": 851
},
{
"epoch": 0.4119922630560928,
"grad_norm": 0.42002764344215393,
"learning_rate": 4.954208551211029e-06,
"loss": 0.4862,
"step": 852
},
{
"epoch": 0.41247582205029015,
"grad_norm": 0.4864538013935089,
"learning_rate": 4.954086896980863e-06,
"loss": 0.4572,
"step": 853
},
{
"epoch": 0.41295938104448743,
"grad_norm": 0.42396342754364014,
"learning_rate": 4.9539650828631246e-06,
"loss": 0.4565,
"step": 854
},
{
"epoch": 0.4134429400386847,
"grad_norm": 0.41734591126441956,
"learning_rate": 4.953843108865752e-06,
"loss": 0.5006,
"step": 855
},
{
"epoch": 0.41392649903288203,
"grad_norm": 0.42748942971229553,
"learning_rate": 4.953720974996692e-06,
"loss": 0.4865,
"step": 856
},
{
"epoch": 0.4144100580270793,
"grad_norm": 0.4658924639225006,
"learning_rate": 4.953598681263902e-06,
"loss": 0.4724,
"step": 857
},
{
"epoch": 0.4148936170212766,
"grad_norm": 0.45556163787841797,
"learning_rate": 4.953476227675349e-06,
"loss": 0.4956,
"step": 858
},
{
"epoch": 0.4153771760154739,
"grad_norm": 0.45793822407722473,
"learning_rate": 4.95335361423901e-06,
"loss": 0.4886,
"step": 859
},
{
"epoch": 0.4158607350096712,
"grad_norm": 0.43772590160369873,
"learning_rate": 4.953230840962876e-06,
"loss": 0.4696,
"step": 860
},
{
"epoch": 0.41634429400386846,
"grad_norm": 0.4840233623981476,
"learning_rate": 4.9531079078549434e-06,
"loss": 0.4663,
"step": 861
},
{
"epoch": 0.41682785299806574,
"grad_norm": 0.4341041147708893,
"learning_rate": 4.9529848149232244e-06,
"loss": 0.4748,
"step": 862
},
{
"epoch": 0.41731141199226307,
"grad_norm": 0.4478056728839874,
"learning_rate": 4.9528615621757345e-06,
"loss": 0.4872,
"step": 863
},
{
"epoch": 0.41779497098646035,
"grad_norm": 0.44516491889953613,
"learning_rate": 4.952738149620508e-06,
"loss": 0.4908,
"step": 864
},
{
"epoch": 0.4182785299806576,
"grad_norm": 0.472673237323761,
"learning_rate": 4.952614577265582e-06,
"loss": 0.4679,
"step": 865
},
{
"epoch": 0.41876208897485495,
"grad_norm": 0.4395389258861542,
"learning_rate": 4.95249084511901e-06,
"loss": 0.4921,
"step": 866
},
{
"epoch": 0.4192456479690522,
"grad_norm": 0.5204181671142578,
"learning_rate": 4.952366953188852e-06,
"loss": 0.4487,
"step": 867
},
{
"epoch": 0.4197292069632495,
"grad_norm": 0.4538368880748749,
"learning_rate": 4.952242901483181e-06,
"loss": 0.4677,
"step": 868
},
{
"epoch": 0.42021276595744683,
"grad_norm": 0.4413670301437378,
"learning_rate": 4.952118690010077e-06,
"loss": 0.468,
"step": 869
},
{
"epoch": 0.4206963249516441,
"grad_norm": 0.4103778898715973,
"learning_rate": 4.951994318777634e-06,
"loss": 0.4761,
"step": 870
},
{
"epoch": 0.4211798839458414,
"grad_norm": 0.4950055778026581,
"learning_rate": 4.951869787793956e-06,
"loss": 0.4851,
"step": 871
},
{
"epoch": 0.42166344294003866,
"grad_norm": 0.42650583386421204,
"learning_rate": 4.9517450970671544e-06,
"loss": 0.4815,
"step": 872
},
{
"epoch": 0.422147001934236,
"grad_norm": 0.5697447061538696,
"learning_rate": 4.951620246605353e-06,
"loss": 0.4688,
"step": 873
},
{
"epoch": 0.42263056092843326,
"grad_norm": 0.4130346179008484,
"learning_rate": 4.9514952364166886e-06,
"loss": 0.4542,
"step": 874
},
{
"epoch": 0.42311411992263054,
"grad_norm": 0.4870031177997589,
"learning_rate": 4.9513700665093025e-06,
"loss": 0.4784,
"step": 875
},
{
"epoch": 0.42359767891682787,
"grad_norm": 0.4782927334308624,
"learning_rate": 4.951244736891352e-06,
"loss": 0.4574,
"step": 876
},
{
"epoch": 0.42408123791102514,
"grad_norm": 0.4310762584209442,
"learning_rate": 4.951119247571001e-06,
"loss": 0.4901,
"step": 877
},
{
"epoch": 0.4245647969052224,
"grad_norm": 0.4448501765727997,
"learning_rate": 4.950993598556427e-06,
"loss": 0.4661,
"step": 878
},
{
"epoch": 0.42504835589941975,
"grad_norm": 0.43321534991264343,
"learning_rate": 4.950867789855815e-06,
"loss": 0.4873,
"step": 879
},
{
"epoch": 0.425531914893617,
"grad_norm": 0.4259124994277954,
"learning_rate": 4.950741821477361e-06,
"loss": 0.4714,
"step": 880
},
{
"epoch": 0.4260154738878143,
"grad_norm": 0.4658030867576599,
"learning_rate": 4.950615693429275e-06,
"loss": 0.4611,
"step": 881
},
{
"epoch": 0.42649903288201163,
"grad_norm": 0.44376733899116516,
"learning_rate": 4.950489405719771e-06,
"loss": 0.4789,
"step": 882
},
{
"epoch": 0.4269825918762089,
"grad_norm": 0.42676928639411926,
"learning_rate": 4.950362958357078e-06,
"loss": 0.4604,
"step": 883
},
{
"epoch": 0.4274661508704062,
"grad_norm": 0.4533901512622833,
"learning_rate": 4.950236351349436e-06,
"loss": 0.4909,
"step": 884
},
{
"epoch": 0.42794970986460346,
"grad_norm": 0.4707791209220886,
"learning_rate": 4.950109584705091e-06,
"loss": 0.4622,
"step": 885
},
{
"epoch": 0.4284332688588008,
"grad_norm": 0.46238699555397034,
"learning_rate": 4.949982658432303e-06,
"loss": 0.4925,
"step": 886
},
{
"epoch": 0.42891682785299806,
"grad_norm": 0.8114577531814575,
"learning_rate": 4.9498555725393415e-06,
"loss": 0.4728,
"step": 887
},
{
"epoch": 0.42940038684719534,
"grad_norm": 0.5063269138336182,
"learning_rate": 4.949728327034487e-06,
"loss": 0.4976,
"step": 888
},
{
"epoch": 0.42988394584139267,
"grad_norm": 1.0141496658325195,
"learning_rate": 4.949600921926029e-06,
"loss": 0.4818,
"step": 889
},
{
"epoch": 0.43036750483558994,
"grad_norm": 0.42779985070228577,
"learning_rate": 4.949473357222269e-06,
"loss": 0.4734,
"step": 890
},
{
"epoch": 0.4308510638297872,
"grad_norm": 0.42237281799316406,
"learning_rate": 4.949345632931516e-06,
"loss": 0.4607,
"step": 891
},
{
"epoch": 0.43133462282398455,
"grad_norm": 0.44154638051986694,
"learning_rate": 4.949217749062093e-06,
"loss": 0.4971,
"step": 892
},
{
"epoch": 0.4318181818181818,
"grad_norm": 0.5624458193778992,
"learning_rate": 4.949089705622333e-06,
"loss": 0.4617,
"step": 893
},
{
"epoch": 0.4323017408123791,
"grad_norm": 0.4163053035736084,
"learning_rate": 4.948961502620576e-06,
"loss": 0.4651,
"step": 894
},
{
"epoch": 0.43278529980657643,
"grad_norm": 0.4546625316143036,
"learning_rate": 4.948833140065175e-06,
"loss": 0.4694,
"step": 895
},
{
"epoch": 0.4332688588007737,
"grad_norm": 0.43393319845199585,
"learning_rate": 4.948704617964495e-06,
"loss": 0.4898,
"step": 896
},
{
"epoch": 0.433752417794971,
"grad_norm": 0.6030253171920776,
"learning_rate": 4.948575936326907e-06,
"loss": 0.4709,
"step": 897
},
{
"epoch": 0.43423597678916825,
"grad_norm": 0.46790799498558044,
"learning_rate": 4.948447095160796e-06,
"loss": 0.482,
"step": 898
},
{
"epoch": 0.4347195357833656,
"grad_norm": 0.42384713888168335,
"learning_rate": 4.948318094474555e-06,
"loss": 0.4757,
"step": 899
},
{
"epoch": 0.43520309477756286,
"grad_norm": 0.4444863796234131,
"learning_rate": 4.94818893427659e-06,
"loss": 0.4734,
"step": 900
},
{
"epoch": 0.43568665377176014,
"grad_norm": 1.2005969285964966,
"learning_rate": 4.948059614575316e-06,
"loss": 0.4649,
"step": 901
},
{
"epoch": 0.43617021276595747,
"grad_norm": 0.4669468402862549,
"learning_rate": 4.947930135379158e-06,
"loss": 0.4847,
"step": 902
},
{
"epoch": 0.43665377176015474,
"grad_norm": 0.4478405714035034,
"learning_rate": 4.947800496696551e-06,
"loss": 0.4699,
"step": 903
},
{
"epoch": 0.437137330754352,
"grad_norm": 0.419539213180542,
"learning_rate": 4.947670698535943e-06,
"loss": 0.4632,
"step": 904
},
{
"epoch": 0.43762088974854935,
"grad_norm": 0.4189528822898865,
"learning_rate": 4.947540740905789e-06,
"loss": 0.4982,
"step": 905
},
{
"epoch": 0.4381044487427466,
"grad_norm": 0.4514327943325043,
"learning_rate": 4.9474106238145555e-06,
"loss": 0.452,
"step": 906
},
{
"epoch": 0.4385880077369439,
"grad_norm": 0.4805455207824707,
"learning_rate": 4.947280347270721e-06,
"loss": 0.4752,
"step": 907
},
{
"epoch": 0.43907156673114117,
"grad_norm": 0.4451103210449219,
"learning_rate": 4.9471499112827726e-06,
"loss": 0.4759,
"step": 908
},
{
"epoch": 0.4395551257253385,
"grad_norm": 0.45392805337905884,
"learning_rate": 4.947019315859209e-06,
"loss": 0.468,
"step": 909
},
{
"epoch": 0.4400386847195358,
"grad_norm": 0.41176101565361023,
"learning_rate": 4.946888561008539e-06,
"loss": 0.4553,
"step": 910
},
{
"epoch": 0.44052224371373305,
"grad_norm": 0.465889573097229,
"learning_rate": 4.94675764673928e-06,
"loss": 0.4712,
"step": 911
},
{
"epoch": 0.4410058027079304,
"grad_norm": 0.4795728623867035,
"learning_rate": 4.946626573059963e-06,
"loss": 0.4866,
"step": 912
},
{
"epoch": 0.44148936170212766,
"grad_norm": 0.4447157084941864,
"learning_rate": 4.946495339979126e-06,
"loss": 0.4409,
"step": 913
},
{
"epoch": 0.44197292069632493,
"grad_norm": 0.47352585196495056,
"learning_rate": 4.946363947505321e-06,
"loss": 0.4564,
"step": 914
},
{
"epoch": 0.44245647969052226,
"grad_norm": 0.41461408138275146,
"learning_rate": 4.946232395647106e-06,
"loss": 0.4687,
"step": 915
},
{
"epoch": 0.44294003868471954,
"grad_norm": 0.4392583966255188,
"learning_rate": 4.946100684413053e-06,
"loss": 0.4679,
"step": 916
},
{
"epoch": 0.4434235976789168,
"grad_norm": 0.4193490743637085,
"learning_rate": 4.945968813811743e-06,
"loss": 0.4774,
"step": 917
},
{
"epoch": 0.44390715667311414,
"grad_norm": 0.4322820007801056,
"learning_rate": 4.945836783851769e-06,
"loss": 0.4764,
"step": 918
},
{
"epoch": 0.4443907156673114,
"grad_norm": 0.6839078664779663,
"learning_rate": 4.945704594541731e-06,
"loss": 0.4749,
"step": 919
},
{
"epoch": 0.4448742746615087,
"grad_norm": 0.4112018644809723,
"learning_rate": 4.945572245890242e-06,
"loss": 0.4743,
"step": 920
},
{
"epoch": 0.44535783365570597,
"grad_norm": 0.4316619634628296,
"learning_rate": 4.945439737905926e-06,
"loss": 0.4702,
"step": 921
},
{
"epoch": 0.4458413926499033,
"grad_norm": 0.4236302077770233,
"learning_rate": 4.945307070597414e-06,
"loss": 0.4705,
"step": 922
},
{
"epoch": 0.4463249516441006,
"grad_norm": 0.4219607412815094,
"learning_rate": 4.9451742439733505e-06,
"loss": 0.4506,
"step": 923
},
{
"epoch": 0.44680851063829785,
"grad_norm": 0.4156830608844757,
"learning_rate": 4.94504125804239e-06,
"loss": 0.4826,
"step": 924
},
{
"epoch": 0.4472920696324952,
"grad_norm": 0.4480549395084381,
"learning_rate": 4.9449081128131945e-06,
"loss": 0.4713,
"step": 925
},
{
"epoch": 0.44777562862669246,
"grad_norm": 0.4185437560081482,
"learning_rate": 4.944774808294441e-06,
"loss": 0.4748,
"step": 926
},
{
"epoch": 0.44825918762088973,
"grad_norm": 0.4328237473964691,
"learning_rate": 4.944641344494815e-06,
"loss": 0.4784,
"step": 927
},
{
"epoch": 0.44874274661508706,
"grad_norm": 0.44591256976127625,
"learning_rate": 4.9445077214230085e-06,
"loss": 0.4838,
"step": 928
},
{
"epoch": 0.44922630560928434,
"grad_norm": 0.4475826323032379,
"learning_rate": 4.94437393908773e-06,
"loss": 0.4829,
"step": 929
},
{
"epoch": 0.4497098646034816,
"grad_norm": 0.4374013841152191,
"learning_rate": 4.944239997497695e-06,
"loss": 0.4636,
"step": 930
},
{
"epoch": 0.45019342359767894,
"grad_norm": 0.4626249670982361,
"learning_rate": 4.944105896661629e-06,
"loss": 0.4648,
"step": 931
},
{
"epoch": 0.4506769825918762,
"grad_norm": 0.5082606077194214,
"learning_rate": 4.943971636588271e-06,
"loss": 0.4776,
"step": 932
},
{
"epoch": 0.4511605415860735,
"grad_norm": 0.4721316695213318,
"learning_rate": 4.943837217286367e-06,
"loss": 0.4839,
"step": 933
},
{
"epoch": 0.45164410058027077,
"grad_norm": 0.4364699125289917,
"learning_rate": 4.943702638764674e-06,
"loss": 0.4681,
"step": 934
},
{
"epoch": 0.4521276595744681,
"grad_norm": 0.4581095278263092,
"learning_rate": 4.94356790103196e-06,
"loss": 0.4656,
"step": 935
},
{
"epoch": 0.4526112185686654,
"grad_norm": 0.4476464092731476,
"learning_rate": 4.9434330040970054e-06,
"loss": 0.468,
"step": 936
},
{
"epoch": 0.45309477756286265,
"grad_norm": 0.43913260102272034,
"learning_rate": 4.9432979479685975e-06,
"loss": 0.4868,
"step": 937
},
{
"epoch": 0.45357833655706,
"grad_norm": 0.4544861912727356,
"learning_rate": 4.943162732655534e-06,
"loss": 0.4799,
"step": 938
},
{
"epoch": 0.45406189555125726,
"grad_norm": 0.436844140291214,
"learning_rate": 4.943027358166628e-06,
"loss": 0.462,
"step": 939
},
{
"epoch": 0.45454545454545453,
"grad_norm": 0.4419472813606262,
"learning_rate": 4.942891824510695e-06,
"loss": 0.4626,
"step": 940
},
{
"epoch": 0.45502901353965186,
"grad_norm": 0.7483431696891785,
"learning_rate": 4.942756131696569e-06,
"loss": 0.4629,
"step": 941
},
{
"epoch": 0.45551257253384914,
"grad_norm": 0.416421115398407,
"learning_rate": 4.942620279733089e-06,
"loss": 0.4794,
"step": 942
},
{
"epoch": 0.4559961315280464,
"grad_norm": 0.4357644021511078,
"learning_rate": 4.9424842686291056e-06,
"loss": 0.449,
"step": 943
},
{
"epoch": 0.4564796905222437,
"grad_norm": 0.4860648810863495,
"learning_rate": 4.9423480983934806e-06,
"loss": 0.4701,
"step": 944
},
{
"epoch": 0.456963249516441,
"grad_norm": 0.45846328139305115,
"learning_rate": 4.9422117690350855e-06,
"loss": 0.4876,
"step": 945
},
{
"epoch": 0.4574468085106383,
"grad_norm": 0.438472718000412,
"learning_rate": 4.942075280562802e-06,
"loss": 0.4594,
"step": 946
},
{
"epoch": 0.45793036750483557,
"grad_norm": 0.4141273498535156,
"learning_rate": 4.941938632985524e-06,
"loss": 0.4691,
"step": 947
},
{
"epoch": 0.4584139264990329,
"grad_norm": 0.4864295423030853,
"learning_rate": 4.941801826312153e-06,
"loss": 0.4619,
"step": 948
},
{
"epoch": 0.4588974854932302,
"grad_norm": 0.4366392493247986,
"learning_rate": 4.941664860551603e-06,
"loss": 0.4571,
"step": 949
},
{
"epoch": 0.45938104448742745,
"grad_norm": 0.43606841564178467,
"learning_rate": 4.941527735712796e-06,
"loss": 0.4656,
"step": 950
},
{
"epoch": 0.4598646034816248,
"grad_norm": 0.43493443727493286,
"learning_rate": 4.941390451804668e-06,
"loss": 0.4786,
"step": 951
},
{
"epoch": 0.46034816247582205,
"grad_norm": 0.49057891964912415,
"learning_rate": 4.941253008836162e-06,
"loss": 0.4673,
"step": 952
},
{
"epoch": 0.46083172147001933,
"grad_norm": 0.428835391998291,
"learning_rate": 4.9411154068162325e-06,
"loss": 0.4514,
"step": 953
},
{
"epoch": 0.46131528046421666,
"grad_norm": 0.41059479117393494,
"learning_rate": 4.940977645753845e-06,
"loss": 0.4852,
"step": 954
},
{
"epoch": 0.46179883945841393,
"grad_norm": 0.4246974587440491,
"learning_rate": 4.9408397256579745e-06,
"loss": 0.4863,
"step": 955
},
{
"epoch": 0.4622823984526112,
"grad_norm": 0.4776301085948944,
"learning_rate": 4.940701646537607e-06,
"loss": 0.4596,
"step": 956
},
{
"epoch": 0.4627659574468085,
"grad_norm": 0.4455105662345886,
"learning_rate": 4.940563408401738e-06,
"loss": 0.4892,
"step": 957
},
{
"epoch": 0.4632495164410058,
"grad_norm": 0.4926896095275879,
"learning_rate": 4.940425011259375e-06,
"loss": 0.4669,
"step": 958
},
{
"epoch": 0.4637330754352031,
"grad_norm": 0.42672210931777954,
"learning_rate": 4.940286455119535e-06,
"loss": 0.4654,
"step": 959
},
{
"epoch": 0.46421663442940037,
"grad_norm": 0.41769957542419434,
"learning_rate": 4.940147739991243e-06,
"loss": 0.4683,
"step": 960
},
{
"epoch": 0.4647001934235977,
"grad_norm": 0.438005268573761,
"learning_rate": 4.940008865883538e-06,
"loss": 0.482,
"step": 961
},
{
"epoch": 0.46518375241779497,
"grad_norm": 0.42409947514533997,
"learning_rate": 4.939869832805468e-06,
"loss": 0.4653,
"step": 962
},
{
"epoch": 0.46566731141199225,
"grad_norm": 0.42889320850372314,
"learning_rate": 4.93973064076609e-06,
"loss": 0.4695,
"step": 963
},
{
"epoch": 0.4661508704061896,
"grad_norm": 0.425735741853714,
"learning_rate": 4.9395912897744746e-06,
"loss": 0.4709,
"step": 964
},
{
"epoch": 0.46663442940038685,
"grad_norm": 0.4460727870464325,
"learning_rate": 4.9394517798397e-06,
"loss": 0.4758,
"step": 965
},
{
"epoch": 0.4671179883945841,
"grad_norm": 0.42308881878852844,
"learning_rate": 4.939312110970854e-06,
"loss": 0.4824,
"step": 966
},
{
"epoch": 0.46760154738878146,
"grad_norm": 0.42842990159988403,
"learning_rate": 4.939172283177037e-06,
"loss": 0.4573,
"step": 967
},
{
"epoch": 0.46808510638297873,
"grad_norm": 0.5351126194000244,
"learning_rate": 4.939032296467361e-06,
"loss": 0.4621,
"step": 968
},
{
"epoch": 0.468568665377176,
"grad_norm": 0.42374545335769653,
"learning_rate": 4.938892150850944e-06,
"loss": 0.4715,
"step": 969
},
{
"epoch": 0.4690522243713733,
"grad_norm": 0.42353230714797974,
"learning_rate": 4.938751846336917e-06,
"loss": 0.48,
"step": 970
},
{
"epoch": 0.4695357833655706,
"grad_norm": 0.44193336367607117,
"learning_rate": 4.938611382934421e-06,
"loss": 0.4676,
"step": 971
},
{
"epoch": 0.4700193423597679,
"grad_norm": 0.4455101191997528,
"learning_rate": 4.9384707606526084e-06,
"loss": 0.4788,
"step": 972
},
{
"epoch": 0.47050290135396516,
"grad_norm": 0.4394087791442871,
"learning_rate": 4.938329979500641e-06,
"loss": 0.4496,
"step": 973
},
{
"epoch": 0.4709864603481625,
"grad_norm": 0.5428414940834045,
"learning_rate": 4.9381890394876895e-06,
"loss": 0.4859,
"step": 974
},
{
"epoch": 0.47147001934235977,
"grad_norm": 0.42253056168556213,
"learning_rate": 4.9380479406229375e-06,
"loss": 0.4572,
"step": 975
},
{
"epoch": 0.47195357833655704,
"grad_norm": 0.4509589672088623,
"learning_rate": 4.9379066829155775e-06,
"loss": 0.4839,
"step": 976
},
{
"epoch": 0.4724371373307544,
"grad_norm": 0.4330948293209076,
"learning_rate": 4.9377652663748125e-06,
"loss": 0.4597,
"step": 977
},
{
"epoch": 0.47292069632495165,
"grad_norm": 0.48132073879241943,
"learning_rate": 4.9376236910098565e-06,
"loss": 0.4874,
"step": 978
},
{
"epoch": 0.4734042553191489,
"grad_norm": 0.5274008512496948,
"learning_rate": 4.937481956829933e-06,
"loss": 0.4685,
"step": 979
},
{
"epoch": 0.4738878143133462,
"grad_norm": 0.4428861737251282,
"learning_rate": 4.937340063844276e-06,
"loss": 0.5127,
"step": 980
},
{
"epoch": 0.47437137330754353,
"grad_norm": 0.45901939272880554,
"learning_rate": 4.937198012062131e-06,
"loss": 0.4591,
"step": 981
},
{
"epoch": 0.4748549323017408,
"grad_norm": 0.4397450089454651,
"learning_rate": 4.937055801492752e-06,
"loss": 0.4587,
"step": 982
},
{
"epoch": 0.4753384912959381,
"grad_norm": 0.4530391991138458,
"learning_rate": 4.936913432145403e-06,
"loss": 0.459,
"step": 983
},
{
"epoch": 0.4758220502901354,
"grad_norm": 0.4467536509037018,
"learning_rate": 4.936770904029362e-06,
"loss": 0.4735,
"step": 984
},
{
"epoch": 0.4763056092843327,
"grad_norm": 0.45150211453437805,
"learning_rate": 4.936628217153914e-06,
"loss": 0.4736,
"step": 985
},
{
"epoch": 0.47678916827852996,
"grad_norm": 0.4347638785839081,
"learning_rate": 4.936485371528356e-06,
"loss": 0.4715,
"step": 986
},
{
"epoch": 0.4772727272727273,
"grad_norm": 0.45149534940719604,
"learning_rate": 4.936342367161992e-06,
"loss": 0.4834,
"step": 987
},
{
"epoch": 0.47775628626692457,
"grad_norm": 0.47228920459747314,
"learning_rate": 4.936199204064142e-06,
"loss": 0.4773,
"step": 988
},
{
"epoch": 0.47823984526112184,
"grad_norm": 0.4575802683830261,
"learning_rate": 4.936055882244132e-06,
"loss": 0.4739,
"step": 989
},
{
"epoch": 0.4787234042553192,
"grad_norm": 0.44767946004867554,
"learning_rate": 4.935912401711299e-06,
"loss": 0.473,
"step": 990
},
{
"epoch": 0.47920696324951645,
"grad_norm": 0.490839421749115,
"learning_rate": 4.935768762474993e-06,
"loss": 0.4979,
"step": 991
},
{
"epoch": 0.4796905222437137,
"grad_norm": 0.50435870885849,
"learning_rate": 4.9356249645445695e-06,
"loss": 0.4661,
"step": 992
},
{
"epoch": 0.480174081237911,
"grad_norm": 0.49018508195877075,
"learning_rate": 4.935481007929399e-06,
"loss": 0.4678,
"step": 993
},
{
"epoch": 0.48065764023210833,
"grad_norm": 0.43506816029548645,
"learning_rate": 4.9353368926388615e-06,
"loss": 0.485,
"step": 994
},
{
"epoch": 0.4811411992263056,
"grad_norm": 0.6681720614433289,
"learning_rate": 4.935192618682343e-06,
"loss": 0.4334,
"step": 995
},
{
"epoch": 0.4816247582205029,
"grad_norm": 0.8788197040557861,
"learning_rate": 4.935048186069247e-06,
"loss": 0.4749,
"step": 996
},
{
"epoch": 0.4821083172147002,
"grad_norm": 0.46034419536590576,
"learning_rate": 4.934903594808981e-06,
"loss": 0.4545,
"step": 997
},
{
"epoch": 0.4825918762088975,
"grad_norm": 0.4211420714855194,
"learning_rate": 4.934758844910965e-06,
"loss": 0.4821,
"step": 998
},
{
"epoch": 0.48307543520309476,
"grad_norm": 0.4607398509979248,
"learning_rate": 4.934613936384632e-06,
"loss": 0.4493,
"step": 999
},
{
"epoch": 0.4835589941972921,
"grad_norm": 0.43166399002075195,
"learning_rate": 4.934468869239421e-06,
"loss": 0.4575,
"step": 1000
},
{
"epoch": 0.48404255319148937,
"grad_norm": 0.4517035484313965,
"learning_rate": 4.934323643484784e-06,
"loss": 0.4808,
"step": 1001
},
{
"epoch": 0.48452611218568664,
"grad_norm": 0.457938551902771,
"learning_rate": 4.934178259130183e-06,
"loss": 0.4672,
"step": 1002
},
{
"epoch": 0.48500967117988397,
"grad_norm": 1.534679651260376,
"learning_rate": 4.93403271618509e-06,
"loss": 0.4605,
"step": 1003
},
{
"epoch": 0.48549323017408125,
"grad_norm": 0.4722941517829895,
"learning_rate": 4.9338870146589866e-06,
"loss": 0.4811,
"step": 1004
},
{
"epoch": 0.4859767891682785,
"grad_norm": 0.4111993908882141,
"learning_rate": 4.933741154561367e-06,
"loss": 0.4608,
"step": 1005
},
{
"epoch": 0.4864603481624758,
"grad_norm": 0.5045567750930786,
"learning_rate": 4.933595135901733e-06,
"loss": 0.4731,
"step": 1006
},
{
"epoch": 0.48694390715667313,
"grad_norm": 0.4574251174926758,
"learning_rate": 4.9334489586895975e-06,
"loss": 0.467,
"step": 1007
},
{
"epoch": 0.4874274661508704,
"grad_norm": 0.4980320930480957,
"learning_rate": 4.933302622934485e-06,
"loss": 0.4492,
"step": 1008
},
{
"epoch": 0.4879110251450677,
"grad_norm": 0.4482066035270691,
"learning_rate": 4.933156128645929e-06,
"loss": 0.487,
"step": 1009
},
{
"epoch": 0.488394584139265,
"grad_norm": 0.42327502369880676,
"learning_rate": 4.933009475833474e-06,
"loss": 0.4513,
"step": 1010
},
{
"epoch": 0.4888781431334623,
"grad_norm": 0.44477933645248413,
"learning_rate": 4.9328626645066755e-06,
"loss": 0.4835,
"step": 1011
},
{
"epoch": 0.48936170212765956,
"grad_norm": 0.47308075428009033,
"learning_rate": 4.932715694675098e-06,
"loss": 0.464,
"step": 1012
},
{
"epoch": 0.4898452611218569,
"grad_norm": 0.563520610332489,
"learning_rate": 4.932568566348316e-06,
"loss": 0.4828,
"step": 1013
},
{
"epoch": 0.49032882011605416,
"grad_norm": 0.4451202154159546,
"learning_rate": 4.932421279535916e-06,
"loss": 0.4705,
"step": 1014
},
{
"epoch": 0.49081237911025144,
"grad_norm": 0.4930382966995239,
"learning_rate": 4.932273834247494e-06,
"loss": 0.4507,
"step": 1015
},
{
"epoch": 0.4912959381044487,
"grad_norm": 0.4431816637516022,
"learning_rate": 4.932126230492656e-06,
"loss": 0.4553,
"step": 1016
},
{
"epoch": 0.49177949709864605,
"grad_norm": 0.4477396607398987,
"learning_rate": 4.931978468281018e-06,
"loss": 0.4754,
"step": 1017
},
{
"epoch": 0.4922630560928433,
"grad_norm": 0.5088204145431519,
"learning_rate": 4.9318305476222074e-06,
"loss": 0.4628,
"step": 1018
},
{
"epoch": 0.4927466150870406,
"grad_norm": 0.43095219135284424,
"learning_rate": 4.931682468525863e-06,
"loss": 0.4791,
"step": 1019
},
{
"epoch": 0.4932301740812379,
"grad_norm": 0.5227051377296448,
"learning_rate": 4.931534231001629e-06,
"loss": 0.4715,
"step": 1020
},
{
"epoch": 0.4937137330754352,
"grad_norm": 0.44510793685913086,
"learning_rate": 4.931385835059167e-06,
"loss": 0.4518,
"step": 1021
},
{
"epoch": 0.4941972920696325,
"grad_norm": 0.4213857054710388,
"learning_rate": 4.9312372807081424e-06,
"loss": 0.4612,
"step": 1022
},
{
"epoch": 0.4946808510638298,
"grad_norm": 0.42244675755500793,
"learning_rate": 4.9310885679582355e-06,
"loss": 0.4554,
"step": 1023
},
{
"epoch": 0.4951644100580271,
"grad_norm": 0.5022426843643188,
"learning_rate": 4.930939696819135e-06,
"loss": 0.4664,
"step": 1024
},
{
"epoch": 0.49564796905222436,
"grad_norm": 0.45399320125579834,
"learning_rate": 4.930790667300539e-06,
"loss": 0.4684,
"step": 1025
},
{
"epoch": 0.4961315280464217,
"grad_norm": 0.52347731590271,
"learning_rate": 4.930641479412157e-06,
"loss": 0.4799,
"step": 1026
},
{
"epoch": 0.49661508704061896,
"grad_norm": 0.4325565993785858,
"learning_rate": 4.93049213316371e-06,
"loss": 0.4864,
"step": 1027
},
{
"epoch": 0.49709864603481624,
"grad_norm": 0.4530152380466461,
"learning_rate": 4.930342628564928e-06,
"loss": 0.4495,
"step": 1028
},
{
"epoch": 0.4975822050290135,
"grad_norm": 0.4562103748321533,
"learning_rate": 4.930192965625551e-06,
"loss": 0.4778,
"step": 1029
},
{
"epoch": 0.49806576402321084,
"grad_norm": 0.4942949116230011,
"learning_rate": 4.9300431443553295e-06,
"loss": 0.4752,
"step": 1030
},
{
"epoch": 0.4985493230174081,
"grad_norm": 0.4768705368041992,
"learning_rate": 4.929893164764025e-06,
"loss": 0.4871,
"step": 1031
},
{
"epoch": 0.4990328820116054,
"grad_norm": 0.41821593046188354,
"learning_rate": 4.929743026861409e-06,
"loss": 0.4548,
"step": 1032
},
{
"epoch": 0.4995164410058027,
"grad_norm": 0.4341305196285248,
"learning_rate": 4.929592730657262e-06,
"loss": 0.4626,
"step": 1033
},
{
"epoch": 0.5,
"grad_norm": 0.41584035754203796,
"learning_rate": 4.929442276161378e-06,
"loss": 0.4544,
"step": 1034
},
{
"epoch": 0.5004835589941973,
"grad_norm": 0.42965272068977356,
"learning_rate": 4.929291663383559e-06,
"loss": 0.4483,
"step": 1035
},
{
"epoch": 0.5009671179883946,
"grad_norm": 0.40160828828811646,
"learning_rate": 4.929140892333616e-06,
"loss": 0.4482,
"step": 1036
},
{
"epoch": 0.5014506769825918,
"grad_norm": 0.4636249840259552,
"learning_rate": 4.928989963021373e-06,
"loss": 0.4652,
"step": 1037
},
{
"epoch": 0.5019342359767892,
"grad_norm": 0.4661104083061218,
"learning_rate": 4.928838875456664e-06,
"loss": 0.4787,
"step": 1038
},
{
"epoch": 0.5024177949709865,
"grad_norm": 0.47609642148017883,
"learning_rate": 4.928687629649331e-06,
"loss": 0.4709,
"step": 1039
},
{
"epoch": 0.5029013539651838,
"grad_norm": 0.45435553789138794,
"learning_rate": 4.92853622560923e-06,
"loss": 0.4614,
"step": 1040
},
{
"epoch": 0.503384912959381,
"grad_norm": 0.4226183295249939,
"learning_rate": 4.928384663346223e-06,
"loss": 0.4724,
"step": 1041
},
{
"epoch": 0.5038684719535783,
"grad_norm": 0.4644449055194855,
"learning_rate": 4.9282329428701865e-06,
"loss": 0.4677,
"step": 1042
},
{
"epoch": 0.5043520309477756,
"grad_norm": 0.46138256788253784,
"learning_rate": 4.928081064191004e-06,
"loss": 0.4755,
"step": 1043
},
{
"epoch": 0.504835589941973,
"grad_norm": 0.6464718580245972,
"learning_rate": 4.92792902731857e-06,
"loss": 0.4525,
"step": 1044
},
{
"epoch": 0.5053191489361702,
"grad_norm": 0.4627712666988373,
"learning_rate": 4.927776832262792e-06,
"loss": 0.4712,
"step": 1045
},
{
"epoch": 0.5058027079303675,
"grad_norm": 0.44957953691482544,
"learning_rate": 4.9276244790335844e-06,
"loss": 0.448,
"step": 1046
},
{
"epoch": 0.5062862669245648,
"grad_norm": 0.44223344326019287,
"learning_rate": 4.927471967640873e-06,
"loss": 0.4597,
"step": 1047
},
{
"epoch": 0.5067698259187621,
"grad_norm": 0.47617247700691223,
"learning_rate": 4.927319298094596e-06,
"loss": 0.4704,
"step": 1048
},
{
"epoch": 0.5072533849129593,
"grad_norm": 0.4532371461391449,
"learning_rate": 4.927166470404698e-06,
"loss": 0.4833,
"step": 1049
},
{
"epoch": 0.5077369439071566,
"grad_norm": 0.4266747832298279,
"learning_rate": 4.9270134845811355e-06,
"loss": 0.455,
"step": 1050
},
{
"epoch": 0.508220502901354,
"grad_norm": 0.4532632529735565,
"learning_rate": 4.926860340633879e-06,
"loss": 0.4707,
"step": 1051
},
{
"epoch": 0.5087040618955513,
"grad_norm": 0.45259150862693787,
"learning_rate": 4.926707038572903e-06,
"loss": 0.467,
"step": 1052
},
{
"epoch": 0.5091876208897486,
"grad_norm": 0.4061692953109741,
"learning_rate": 4.9265535784081965e-06,
"loss": 0.4461,
"step": 1053
},
{
"epoch": 0.5096711798839458,
"grad_norm": 0.4721916913986206,
"learning_rate": 4.926399960149757e-06,
"loss": 0.4555,
"step": 1054
},
{
"epoch": 0.5101547388781431,
"grad_norm": 0.4325840473175049,
"learning_rate": 4.926246183807593e-06,
"loss": 0.4696,
"step": 1055
},
{
"epoch": 0.5106382978723404,
"grad_norm": 0.4277395009994507,
"learning_rate": 4.926092249391725e-06,
"loss": 0.4773,
"step": 1056
},
{
"epoch": 0.5111218568665378,
"grad_norm": 0.4312560260295868,
"learning_rate": 4.925938156912181e-06,
"loss": 0.4863,
"step": 1057
},
{
"epoch": 0.511605415860735,
"grad_norm": 1.3537847995758057,
"learning_rate": 4.925783906379e-06,
"loss": 0.4593,
"step": 1058
},
{
"epoch": 0.5120889748549323,
"grad_norm": 0.4193797707557678,
"learning_rate": 4.925629497802232e-06,
"loss": 0.4708,
"step": 1059
},
{
"epoch": 0.5125725338491296,
"grad_norm": 0.4714594781398773,
"learning_rate": 4.9254749311919355e-06,
"loss": 0.4675,
"step": 1060
},
{
"epoch": 0.5130560928433269,
"grad_norm": 0.44514137506484985,
"learning_rate": 4.925320206558184e-06,
"loss": 0.4916,
"step": 1061
},
{
"epoch": 0.5135396518375241,
"grad_norm": 0.4270736575126648,
"learning_rate": 4.9251653239110555e-06,
"loss": 0.488,
"step": 1062
},
{
"epoch": 0.5140232108317214,
"grad_norm": 0.43738606572151184,
"learning_rate": 4.925010283260641e-06,
"loss": 0.4749,
"step": 1063
},
{
"epoch": 0.5145067698259188,
"grad_norm": 0.5914368629455566,
"learning_rate": 4.924855084617042e-06,
"loss": 0.4686,
"step": 1064
},
{
"epoch": 0.5149903288201161,
"grad_norm": 0.5277994275093079,
"learning_rate": 4.92469972799037e-06,
"loss": 0.4839,
"step": 1065
},
{
"epoch": 0.5154738878143134,
"grad_norm": 0.5200470685958862,
"learning_rate": 4.9245442133907475e-06,
"loss": 0.4735,
"step": 1066
},
{
"epoch": 0.5159574468085106,
"grad_norm": 0.4452257454395294,
"learning_rate": 4.924388540828305e-06,
"loss": 0.4609,
"step": 1067
},
{
"epoch": 0.5164410058027079,
"grad_norm": 0.5149017572402954,
"learning_rate": 4.924232710313187e-06,
"loss": 0.4712,
"step": 1068
},
{
"epoch": 0.5169245647969052,
"grad_norm": 0.41530728340148926,
"learning_rate": 4.924076721855544e-06,
"loss": 0.4748,
"step": 1069
},
{
"epoch": 0.5174081237911026,
"grad_norm": 0.45812344551086426,
"learning_rate": 4.923920575465539e-06,
"loss": 0.4664,
"step": 1070
},
{
"epoch": 0.5178916827852998,
"grad_norm": 0.4297925531864166,
"learning_rate": 4.923764271153346e-06,
"loss": 0.4812,
"step": 1071
},
{
"epoch": 0.5183752417794971,
"grad_norm": 0.43673601746559143,
"learning_rate": 4.923607808929149e-06,
"loss": 0.4585,
"step": 1072
},
{
"epoch": 0.5188588007736944,
"grad_norm": 0.40688005089759827,
"learning_rate": 4.92345118880314e-06,
"loss": 0.4898,
"step": 1073
},
{
"epoch": 0.5193423597678917,
"grad_norm": 0.4260459244251251,
"learning_rate": 4.923294410785525e-06,
"loss": 0.4628,
"step": 1074
},
{
"epoch": 0.519825918762089,
"grad_norm": 0.44062745571136475,
"learning_rate": 4.923137474886517e-06,
"loss": 0.4706,
"step": 1075
},
{
"epoch": 0.5203094777562862,
"grad_norm": 0.6602137088775635,
"learning_rate": 4.92298038111634e-06,
"loss": 0.4732,
"step": 1076
},
{
"epoch": 0.5207930367504836,
"grad_norm": 0.4522308111190796,
"learning_rate": 4.922823129485231e-06,
"loss": 0.491,
"step": 1077
},
{
"epoch": 0.5212765957446809,
"grad_norm": 0.4215511381626129,
"learning_rate": 4.9226657200034335e-06,
"loss": 0.4792,
"step": 1078
},
{
"epoch": 0.5217601547388782,
"grad_norm": 0.44352343678474426,
"learning_rate": 4.922508152681205e-06,
"loss": 0.4434,
"step": 1079
},
{
"epoch": 0.5222437137330754,
"grad_norm": 0.4431546926498413,
"learning_rate": 4.922350427528808e-06,
"loss": 0.4652,
"step": 1080
},
{
"epoch": 0.5227272727272727,
"grad_norm": 0.4811055064201355,
"learning_rate": 4.922192544556521e-06,
"loss": 0.479,
"step": 1081
},
{
"epoch": 0.52321083172147,
"grad_norm": 0.6013498902320862,
"learning_rate": 4.922034503774629e-06,
"loss": 0.4493,
"step": 1082
},
{
"epoch": 0.5236943907156673,
"grad_norm": 0.5949416756629944,
"learning_rate": 4.921876305193431e-06,
"loss": 0.4614,
"step": 1083
},
{
"epoch": 0.5241779497098646,
"grad_norm": 0.4291402995586395,
"learning_rate": 4.9217179488232315e-06,
"loss": 0.4946,
"step": 1084
},
{
"epoch": 0.5246615087040619,
"grad_norm": 0.42322617769241333,
"learning_rate": 4.921559434674348e-06,
"loss": 0.4616,
"step": 1085
},
{
"epoch": 0.5251450676982592,
"grad_norm": 0.4919283390045166,
"learning_rate": 4.921400762757108e-06,
"loss": 0.4372,
"step": 1086
},
{
"epoch": 0.5256286266924565,
"grad_norm": 0.5418685078620911,
"learning_rate": 4.92124193308185e-06,
"loss": 0.486,
"step": 1087
},
{
"epoch": 0.5261121856866537,
"grad_norm": 0.4854177236557007,
"learning_rate": 4.921082945658922e-06,
"loss": 0.4812,
"step": 1088
},
{
"epoch": 0.526595744680851,
"grad_norm": 0.46961551904678345,
"learning_rate": 4.92092380049868e-06,
"loss": 0.4834,
"step": 1089
},
{
"epoch": 0.5270793036750484,
"grad_norm": 0.5009759664535522,
"learning_rate": 4.920764497611496e-06,
"loss": 0.4734,
"step": 1090
},
{
"epoch": 0.5275628626692457,
"grad_norm": 0.48609328269958496,
"learning_rate": 4.9206050370077464e-06,
"loss": 0.4739,
"step": 1091
},
{
"epoch": 0.528046421663443,
"grad_norm": 0.4356881082057953,
"learning_rate": 4.920445418697821e-06,
"loss": 0.473,
"step": 1092
},
{
"epoch": 0.5285299806576402,
"grad_norm": 0.4519873559474945,
"learning_rate": 4.9202856426921195e-06,
"loss": 0.4583,
"step": 1093
},
{
"epoch": 0.5290135396518375,
"grad_norm": 0.42688310146331787,
"learning_rate": 4.920125709001051e-06,
"loss": 0.4435,
"step": 1094
},
{
"epoch": 0.5294970986460348,
"grad_norm": 0.4380148649215698,
"learning_rate": 4.9199656176350354e-06,
"loss": 0.4768,
"step": 1095
},
{
"epoch": 0.5299806576402321,
"grad_norm": 0.4532415568828583,
"learning_rate": 4.9198053686045044e-06,
"loss": 0.4712,
"step": 1096
},
{
"epoch": 0.5304642166344294,
"grad_norm": 0.44293212890625,
"learning_rate": 4.919644961919896e-06,
"loss": 0.4499,
"step": 1097
},
{
"epoch": 0.5309477756286267,
"grad_norm": 0.45117902755737305,
"learning_rate": 4.919484397591663e-06,
"loss": 0.4719,
"step": 1098
},
{
"epoch": 0.531431334622824,
"grad_norm": 0.43399882316589355,
"learning_rate": 4.9193236756302654e-06,
"loss": 0.4739,
"step": 1099
},
{
"epoch": 0.5319148936170213,
"grad_norm": 0.4192769527435303,
"learning_rate": 4.9191627960461756e-06,
"loss": 0.458,
"step": 1100
},
{
"epoch": 0.5323984526112185,
"grad_norm": 0.4767812192440033,
"learning_rate": 4.919001758849873e-06,
"loss": 0.4871,
"step": 1101
},
{
"epoch": 0.5328820116054158,
"grad_norm": 0.47003915905952454,
"learning_rate": 4.918840564051851e-06,
"loss": 0.477,
"step": 1102
},
{
"epoch": 0.5333655705996132,
"grad_norm": 0.42697975039482117,
"learning_rate": 4.918679211662613e-06,
"loss": 0.4792,
"step": 1103
},
{
"epoch": 0.5338491295938105,
"grad_norm": 0.44821104407310486,
"learning_rate": 4.918517701692668e-06,
"loss": 0.4609,
"step": 1104
},
{
"epoch": 0.5343326885880078,
"grad_norm": 0.47149497270584106,
"learning_rate": 4.91835603415254e-06,
"loss": 0.4549,
"step": 1105
},
{
"epoch": 0.534816247582205,
"grad_norm": 0.4447268843650818,
"learning_rate": 4.918194209052764e-06,
"loss": 0.4653,
"step": 1106
},
{
"epoch": 0.5352998065764023,
"grad_norm": 0.44459810853004456,
"learning_rate": 4.9180322264038805e-06,
"loss": 0.4749,
"step": 1107
},
{
"epoch": 0.5357833655705996,
"grad_norm": 0.4288979768753052,
"learning_rate": 4.917870086216443e-06,
"loss": 0.4628,
"step": 1108
},
{
"epoch": 0.5362669245647969,
"grad_norm": 0.4439257085323334,
"learning_rate": 4.917707788501017e-06,
"loss": 0.4751,
"step": 1109
},
{
"epoch": 0.5367504835589942,
"grad_norm": 0.43748125433921814,
"learning_rate": 4.917545333268176e-06,
"loss": 0.4565,
"step": 1110
},
{
"epoch": 0.5372340425531915,
"grad_norm": 0.5199353694915771,
"learning_rate": 4.917382720528503e-06,
"loss": 0.4612,
"step": 1111
},
{
"epoch": 0.5377176015473888,
"grad_norm": 0.5482754111289978,
"learning_rate": 4.917219950292593e-06,
"loss": 0.4731,
"step": 1112
},
{
"epoch": 0.5382011605415861,
"grad_norm": 0.49058425426483154,
"learning_rate": 4.917057022571052e-06,
"loss": 0.4473,
"step": 1113
},
{
"epoch": 0.5386847195357833,
"grad_norm": 0.41992440819740295,
"learning_rate": 4.9168939373744926e-06,
"loss": 0.487,
"step": 1114
},
{
"epoch": 0.5391682785299806,
"grad_norm": 0.4376087486743927,
"learning_rate": 4.916730694713542e-06,
"loss": 0.4837,
"step": 1115
},
{
"epoch": 0.539651837524178,
"grad_norm": 0.44848495721817017,
"learning_rate": 4.916567294598835e-06,
"loss": 0.4742,
"step": 1116
},
{
"epoch": 0.5401353965183753,
"grad_norm": 0.6971132159233093,
"learning_rate": 4.916403737041018e-06,
"loss": 0.4804,
"step": 1117
},
{
"epoch": 0.5406189555125726,
"grad_norm": 0.45096465945243835,
"learning_rate": 4.916240022050746e-06,
"loss": 0.4502,
"step": 1118
},
{
"epoch": 0.5411025145067698,
"grad_norm": 0.4433208703994751,
"learning_rate": 4.916076149638686e-06,
"loss": 0.4732,
"step": 1119
},
{
"epoch": 0.5415860735009671,
"grad_norm": 0.44899240136146545,
"learning_rate": 4.915912119815513e-06,
"loss": 0.4709,
"step": 1120
},
{
"epoch": 0.5420696324951644,
"grad_norm": 0.4663994610309601,
"learning_rate": 4.915747932591916e-06,
"loss": 0.4566,
"step": 1121
},
{
"epoch": 0.5425531914893617,
"grad_norm": 0.42710351943969727,
"learning_rate": 4.915583587978591e-06,
"loss": 0.4637,
"step": 1122
},
{
"epoch": 0.543036750483559,
"grad_norm": 0.4772963523864746,
"learning_rate": 4.915419085986246e-06,
"loss": 0.468,
"step": 1123
},
{
"epoch": 0.5435203094777563,
"grad_norm": 0.4386994540691376,
"learning_rate": 4.915254426625597e-06,
"loss": 0.4963,
"step": 1124
},
{
"epoch": 0.5440038684719536,
"grad_norm": 0.4515167772769928,
"learning_rate": 4.915089609907374e-06,
"loss": 0.4943,
"step": 1125
},
{
"epoch": 0.5444874274661509,
"grad_norm": 0.4320240020751953,
"learning_rate": 4.914924635842314e-06,
"loss": 0.4656,
"step": 1126
},
{
"epoch": 0.5449709864603481,
"grad_norm": 0.47680628299713135,
"learning_rate": 4.914759504441165e-06,
"loss": 0.471,
"step": 1127
},
{
"epoch": 0.5454545454545454,
"grad_norm": 0.46807920932769775,
"learning_rate": 4.914594215714685e-06,
"loss": 0.454,
"step": 1128
},
{
"epoch": 0.5459381044487428,
"grad_norm": 0.48494380712509155,
"learning_rate": 4.914428769673644e-06,
"loss": 0.473,
"step": 1129
},
{
"epoch": 0.5464216634429401,
"grad_norm": 0.4585503935813904,
"learning_rate": 4.91426316632882e-06,
"loss": 0.4757,
"step": 1130
},
{
"epoch": 0.5469052224371374,
"grad_norm": 0.4581963121891022,
"learning_rate": 4.914097405691004e-06,
"loss": 0.4895,
"step": 1131
},
{
"epoch": 0.5473887814313346,
"grad_norm": 0.4545809328556061,
"learning_rate": 4.913931487770994e-06,
"loss": 0.4562,
"step": 1132
},
{
"epoch": 0.5478723404255319,
"grad_norm": 0.4689292907714844,
"learning_rate": 4.913765412579601e-06,
"loss": 0.4663,
"step": 1133
},
{
"epoch": 0.5483558994197292,
"grad_norm": 0.4483727216720581,
"learning_rate": 4.9135991801276435e-06,
"loss": 0.4647,
"step": 1134
},
{
"epoch": 0.5488394584139265,
"grad_norm": 0.5269478559494019,
"learning_rate": 4.9134327904259525e-06,
"loss": 0.463,
"step": 1135
},
{
"epoch": 0.5493230174081238,
"grad_norm": 0.4482375979423523,
"learning_rate": 4.91326624348537e-06,
"loss": 0.4689,
"step": 1136
},
{
"epoch": 0.5498065764023211,
"grad_norm": 0.5002569556236267,
"learning_rate": 4.913099539316744e-06,
"loss": 0.4805,
"step": 1137
},
{
"epoch": 0.5502901353965184,
"grad_norm": 0.48099225759506226,
"learning_rate": 4.912932677930939e-06,
"loss": 0.4671,
"step": 1138
},
{
"epoch": 0.5507736943907157,
"grad_norm": 0.42564383149147034,
"learning_rate": 4.912765659338823e-06,
"loss": 0.4614,
"step": 1139
},
{
"epoch": 0.5512572533849129,
"grad_norm": 0.4481738209724426,
"learning_rate": 4.912598483551279e-06,
"loss": 0.4674,
"step": 1140
},
{
"epoch": 0.5517408123791102,
"grad_norm": 0.42052412033081055,
"learning_rate": 4.9124311505792e-06,
"loss": 0.457,
"step": 1141
},
{
"epoch": 0.5522243713733076,
"grad_norm": 0.4620019793510437,
"learning_rate": 4.912263660433485e-06,
"loss": 0.4756,
"step": 1142
},
{
"epoch": 0.5527079303675049,
"grad_norm": 0.46690240502357483,
"learning_rate": 4.912096013125048e-06,
"loss": 0.4696,
"step": 1143
},
{
"epoch": 0.5531914893617021,
"grad_norm": 0.4311160743236542,
"learning_rate": 4.911928208664813e-06,
"loss": 0.4662,
"step": 1144
},
{
"epoch": 0.5536750483558994,
"grad_norm": 0.4894680678844452,
"learning_rate": 4.911760247063709e-06,
"loss": 0.462,
"step": 1145
},
{
"epoch": 0.5541586073500967,
"grad_norm": 0.4718480706214905,
"learning_rate": 4.9115921283326814e-06,
"loss": 0.4586,
"step": 1146
},
{
"epoch": 0.554642166344294,
"grad_norm": 0.44869405031204224,
"learning_rate": 4.911423852482684e-06,
"loss": 0.453,
"step": 1147
},
{
"epoch": 0.5551257253384912,
"grad_norm": 0.436161607503891,
"learning_rate": 4.9112554195246785e-06,
"loss": 0.4875,
"step": 1148
},
{
"epoch": 0.5556092843326886,
"grad_norm": 0.44257354736328125,
"learning_rate": 4.91108682946964e-06,
"loss": 0.4666,
"step": 1149
},
{
"epoch": 0.5560928433268859,
"grad_norm": 0.4438575506210327,
"learning_rate": 4.910918082328552e-06,
"loss": 0.4633,
"step": 1150
},
{
"epoch": 0.5565764023210832,
"grad_norm": 0.4686394929885864,
"learning_rate": 4.910749178112407e-06,
"loss": 0.466,
"step": 1151
},
{
"epoch": 0.5570599613152805,
"grad_norm": 0.45705240964889526,
"learning_rate": 4.910580116832212e-06,
"loss": 0.4821,
"step": 1152
},
{
"epoch": 0.5575435203094777,
"grad_norm": 0.458423376083374,
"learning_rate": 4.91041089849898e-06,
"loss": 0.4537,
"step": 1153
},
{
"epoch": 0.558027079303675,
"grad_norm": 0.45928600430488586,
"learning_rate": 4.910241523123736e-06,
"loss": 0.4624,
"step": 1154
},
{
"epoch": 0.5585106382978723,
"grad_norm": 0.44112783670425415,
"learning_rate": 4.910071990717516e-06,
"loss": 0.4542,
"step": 1155
},
{
"epoch": 0.5589941972920697,
"grad_norm": 0.5385165214538574,
"learning_rate": 4.909902301291364e-06,
"loss": 0.4553,
"step": 1156
},
{
"epoch": 0.559477756286267,
"grad_norm": 0.45208409428596497,
"learning_rate": 4.909732454856336e-06,
"loss": 0.4694,
"step": 1157
},
{
"epoch": 0.5599613152804642,
"grad_norm": 0.43043410778045654,
"learning_rate": 4.909562451423498e-06,
"loss": 0.486,
"step": 1158
},
{
"epoch": 0.5604448742746615,
"grad_norm": 0.5170729756355286,
"learning_rate": 4.909392291003926e-06,
"loss": 0.4555,
"step": 1159
},
{
"epoch": 0.5609284332688588,
"grad_norm": 0.5027633309364319,
"learning_rate": 4.909221973608705e-06,
"loss": 0.446,
"step": 1160
},
{
"epoch": 0.561411992263056,
"grad_norm": 0.5824642777442932,
"learning_rate": 4.909051499248934e-06,
"loss": 0.4772,
"step": 1161
},
{
"epoch": 0.5618955512572534,
"grad_norm": 0.42544853687286377,
"learning_rate": 4.908880867935717e-06,
"loss": 0.47,
"step": 1162
},
{
"epoch": 0.5623791102514507,
"grad_norm": 0.42126137018203735,
"learning_rate": 4.908710079680173e-06,
"loss": 0.4701,
"step": 1163
},
{
"epoch": 0.562862669245648,
"grad_norm": 0.432250052690506,
"learning_rate": 4.908539134493428e-06,
"loss": 0.4603,
"step": 1164
},
{
"epoch": 0.5633462282398453,
"grad_norm": 0.43167534470558167,
"learning_rate": 4.908368032386619e-06,
"loss": 0.4692,
"step": 1165
},
{
"epoch": 0.5638297872340425,
"grad_norm": 0.4323950707912445,
"learning_rate": 4.9081967733708945e-06,
"loss": 0.4536,
"step": 1166
},
{
"epoch": 0.5643133462282398,
"grad_norm": 0.42598477005958557,
"learning_rate": 4.908025357457412e-06,
"loss": 0.4628,
"step": 1167
},
{
"epoch": 0.5647969052224371,
"grad_norm": 0.445354163646698,
"learning_rate": 4.907853784657339e-06,
"loss": 0.4731,
"step": 1168
},
{
"epoch": 0.5652804642166345,
"grad_norm": 0.48324230313301086,
"learning_rate": 4.907682054981855e-06,
"loss": 0.4568,
"step": 1169
},
{
"epoch": 0.5657640232108317,
"grad_norm": 0.4197344481945038,
"learning_rate": 4.9075101684421474e-06,
"loss": 0.4668,
"step": 1170
},
{
"epoch": 0.566247582205029,
"grad_norm": 0.5068573355674744,
"learning_rate": 4.907338125049415e-06,
"loss": 0.4607,
"step": 1171
},
{
"epoch": 0.5667311411992263,
"grad_norm": 0.57051682472229,
"learning_rate": 4.907165924814866e-06,
"loss": 0.46,
"step": 1172
},
{
"epoch": 0.5672147001934236,
"grad_norm": 0.48964497447013855,
"learning_rate": 4.9069935677497206e-06,
"loss": 0.4494,
"step": 1173
},
{
"epoch": 0.5676982591876208,
"grad_norm": 0.4429013431072235,
"learning_rate": 4.906821053865208e-06,
"loss": 0.4689,
"step": 1174
},
{
"epoch": 0.5681818181818182,
"grad_norm": 0.5011411905288696,
"learning_rate": 4.906648383172567e-06,
"loss": 0.478,
"step": 1175
},
{
"epoch": 0.5686653771760155,
"grad_norm": 0.4188624620437622,
"learning_rate": 4.906475555683049e-06,
"loss": 0.4657,
"step": 1176
},
{
"epoch": 0.5691489361702128,
"grad_norm": 0.4383828341960907,
"learning_rate": 4.9063025714079125e-06,
"loss": 0.4604,
"step": 1177
},
{
"epoch": 0.5696324951644101,
"grad_norm": 0.4345352351665497,
"learning_rate": 4.906129430358428e-06,
"loss": 0.4467,
"step": 1178
},
{
"epoch": 0.5701160541586073,
"grad_norm": 0.46759259700775146,
"learning_rate": 4.905956132545876e-06,
"loss": 0.4553,
"step": 1179
},
{
"epoch": 0.5705996131528046,
"grad_norm": 0.4374469518661499,
"learning_rate": 4.905782677981546e-06,
"loss": 0.4414,
"step": 1180
},
{
"epoch": 0.5710831721470019,
"grad_norm": 0.553343653678894,
"learning_rate": 4.905609066676742e-06,
"loss": 0.4467,
"step": 1181
},
{
"epoch": 0.5715667311411993,
"grad_norm": 0.49388110637664795,
"learning_rate": 4.905435298642771e-06,
"loss": 0.4395,
"step": 1182
},
{
"epoch": 0.5720502901353965,
"grad_norm": 0.4327850341796875,
"learning_rate": 4.905261373890958e-06,
"loss": 0.4563,
"step": 1183
},
{
"epoch": 0.5725338491295938,
"grad_norm": 0.47157958149909973,
"learning_rate": 4.905087292432632e-06,
"loss": 0.4556,
"step": 1184
},
{
"epoch": 0.5730174081237911,
"grad_norm": 0.4458956718444824,
"learning_rate": 4.904913054279136e-06,
"loss": 0.4573,
"step": 1185
},
{
"epoch": 0.5735009671179884,
"grad_norm": 0.4228770434856415,
"learning_rate": 4.90473865944182e-06,
"loss": 0.4671,
"step": 1186
},
{
"epoch": 0.5739845261121856,
"grad_norm": 0.7752367854118347,
"learning_rate": 4.904564107932048e-06,
"loss": 0.4719,
"step": 1187
},
{
"epoch": 0.574468085106383,
"grad_norm": 0.49437078833580017,
"learning_rate": 4.904389399761192e-06,
"loss": 0.4663,
"step": 1188
},
{
"epoch": 0.5749516441005803,
"grad_norm": 0.4362508952617645,
"learning_rate": 4.9042145349406335e-06,
"loss": 0.464,
"step": 1189
},
{
"epoch": 0.5754352030947776,
"grad_norm": 0.4986853003501892,
"learning_rate": 4.9040395134817666e-06,
"loss": 0.4408,
"step": 1190
},
{
"epoch": 0.5759187620889749,
"grad_norm": 0.5251513719558716,
"learning_rate": 4.9038643353959935e-06,
"loss": 0.4656,
"step": 1191
},
{
"epoch": 0.5764023210831721,
"grad_norm": 1.008799433708191,
"learning_rate": 4.903689000694727e-06,
"loss": 0.4876,
"step": 1192
},
{
"epoch": 0.5768858800773694,
"grad_norm": 0.47108638286590576,
"learning_rate": 4.903513509389391e-06,
"loss": 0.4801,
"step": 1193
},
{
"epoch": 0.5773694390715667,
"grad_norm": 0.43808120489120483,
"learning_rate": 4.903337861491418e-06,
"loss": 0.4584,
"step": 1194
},
{
"epoch": 0.5778529980657641,
"grad_norm": 0.5051426887512207,
"learning_rate": 4.903162057012254e-06,
"loss": 0.469,
"step": 1195
},
{
"epoch": 0.5783365570599613,
"grad_norm": 0.5038641691207886,
"learning_rate": 4.9029860959633504e-06,
"loss": 0.4615,
"step": 1196
},
{
"epoch": 0.5788201160541586,
"grad_norm": 0.49047771096229553,
"learning_rate": 4.902809978356173e-06,
"loss": 0.4678,
"step": 1197
},
{
"epoch": 0.5793036750483559,
"grad_norm": 0.4524144232273102,
"learning_rate": 4.902633704202196e-06,
"loss": 0.4773,
"step": 1198
},
{
"epoch": 0.5797872340425532,
"grad_norm": 0.4350757598876953,
"learning_rate": 4.9024572735129026e-06,
"loss": 0.4725,
"step": 1199
},
{
"epoch": 0.5802707930367504,
"grad_norm": 0.4484714865684509,
"learning_rate": 4.902280686299789e-06,
"loss": 0.454,
"step": 1200
},
{
"epoch": 0.5807543520309478,
"grad_norm": 0.429993212223053,
"learning_rate": 4.902103942574358e-06,
"loss": 0.4608,
"step": 1201
},
{
"epoch": 0.5812379110251451,
"grad_norm": 0.4781748652458191,
"learning_rate": 4.901927042348128e-06,
"loss": 0.4678,
"step": 1202
},
{
"epoch": 0.5817214700193424,
"grad_norm": 0.41221436858177185,
"learning_rate": 4.901749985632622e-06,
"loss": 0.4516,
"step": 1203
},
{
"epoch": 0.5822050290135397,
"grad_norm": 0.5129284262657166,
"learning_rate": 4.901572772439376e-06,
"loss": 0.4649,
"step": 1204
},
{
"epoch": 0.5826885880077369,
"grad_norm": 0.4145076870918274,
"learning_rate": 4.901395402779936e-06,
"loss": 0.4341,
"step": 1205
},
{
"epoch": 0.5831721470019342,
"grad_norm": 0.4602621793746948,
"learning_rate": 4.901217876665858e-06,
"loss": 0.4457,
"step": 1206
},
{
"epoch": 0.5836557059961315,
"grad_norm": 0.4283426105976105,
"learning_rate": 4.9010401941087074e-06,
"loss": 0.4755,
"step": 1207
},
{
"epoch": 0.5841392649903289,
"grad_norm": 0.4351952373981476,
"learning_rate": 4.900862355120061e-06,
"loss": 0.4692,
"step": 1208
},
{
"epoch": 0.5846228239845261,
"grad_norm": 0.4370359182357788,
"learning_rate": 4.900684359711505e-06,
"loss": 0.4749,
"step": 1209
},
{
"epoch": 0.5851063829787234,
"grad_norm": 0.4324999749660492,
"learning_rate": 4.900506207894637e-06,
"loss": 0.4791,
"step": 1210
},
{
"epoch": 0.5855899419729207,
"grad_norm": 0.4950762391090393,
"learning_rate": 4.900327899681064e-06,
"loss": 0.4744,
"step": 1211
},
{
"epoch": 0.586073500967118,
"grad_norm": 0.44710519909858704,
"learning_rate": 4.9001494350824e-06,
"loss": 0.4681,
"step": 1212
},
{
"epoch": 0.5865570599613152,
"grad_norm": 0.4250546395778656,
"learning_rate": 4.899970814110276e-06,
"loss": 0.4687,
"step": 1213
},
{
"epoch": 0.5870406189555126,
"grad_norm": 0.5042000412940979,
"learning_rate": 4.899792036776327e-06,
"loss": 0.4839,
"step": 1214
},
{
"epoch": 0.5875241779497099,
"grad_norm": 0.5127058625221252,
"learning_rate": 4.899613103092202e-06,
"loss": 0.4536,
"step": 1215
},
{
"epoch": 0.5880077369439072,
"grad_norm": 0.4671849310398102,
"learning_rate": 4.899434013069558e-06,
"loss": 0.4656,
"step": 1216
},
{
"epoch": 0.5884912959381045,
"grad_norm": 0.4258441925048828,
"learning_rate": 4.899254766720064e-06,
"loss": 0.4626,
"step": 1217
},
{
"epoch": 0.5889748549323017,
"grad_norm": 0.46578940749168396,
"learning_rate": 4.899075364055398e-06,
"loss": 0.4835,
"step": 1218
},
{
"epoch": 0.589458413926499,
"grad_norm": 0.461418479681015,
"learning_rate": 4.898895805087247e-06,
"loss": 0.4676,
"step": 1219
},
{
"epoch": 0.5899419729206963,
"grad_norm": 0.4375152885913849,
"learning_rate": 4.89871608982731e-06,
"loss": 0.4608,
"step": 1220
},
{
"epoch": 0.5904255319148937,
"grad_norm": 0.47547534108161926,
"learning_rate": 4.898536218287296e-06,
"loss": 0.4752,
"step": 1221
},
{
"epoch": 0.5909090909090909,
"grad_norm": 0.46386417746543884,
"learning_rate": 4.898356190478925e-06,
"loss": 0.4508,
"step": 1222
},
{
"epoch": 0.5913926499032882,
"grad_norm": 0.4304327070713043,
"learning_rate": 4.898176006413925e-06,
"loss": 0.4706,
"step": 1223
},
{
"epoch": 0.5918762088974855,
"grad_norm": 0.6741103529930115,
"learning_rate": 4.897995666104035e-06,
"loss": 0.45,
"step": 1224
},
{
"epoch": 0.5923597678916828,
"grad_norm": 0.4445508122444153,
"learning_rate": 4.897815169561005e-06,
"loss": 0.4373,
"step": 1225
},
{
"epoch": 0.59284332688588,
"grad_norm": 0.6164776682853699,
"learning_rate": 4.897634516796595e-06,
"loss": 0.4469,
"step": 1226
},
{
"epoch": 0.5933268858800773,
"grad_norm": 0.4640771746635437,
"learning_rate": 4.897453707822574e-06,
"loss": 0.4671,
"step": 1227
},
{
"epoch": 0.5938104448742747,
"grad_norm": 0.5377663969993591,
"learning_rate": 4.897272742650722e-06,
"loss": 0.461,
"step": 1228
},
{
"epoch": 0.594294003868472,
"grad_norm": 0.4565662443637848,
"learning_rate": 4.8970916212928295e-06,
"loss": 0.4685,
"step": 1229
},
{
"epoch": 0.5947775628626693,
"grad_norm": 0.5152607560157776,
"learning_rate": 4.896910343760697e-06,
"loss": 0.4663,
"step": 1230
},
{
"epoch": 0.5952611218568665,
"grad_norm": 0.4439282715320587,
"learning_rate": 4.896728910066136e-06,
"loss": 0.4623,
"step": 1231
},
{
"epoch": 0.5957446808510638,
"grad_norm": 0.44906753301620483,
"learning_rate": 4.896547320220964e-06,
"loss": 0.4712,
"step": 1232
},
{
"epoch": 0.5962282398452611,
"grad_norm": 0.43596217036247253,
"learning_rate": 4.896365574237014e-06,
"loss": 0.4727,
"step": 1233
},
{
"epoch": 0.5967117988394585,
"grad_norm": 0.6117439866065979,
"learning_rate": 4.896183672126128e-06,
"loss": 0.4652,
"step": 1234
},
{
"epoch": 0.5971953578336557,
"grad_norm": 0.5615205764770508,
"learning_rate": 4.8960016139001555e-06,
"loss": 0.4608,
"step": 1235
},
{
"epoch": 0.597678916827853,
"grad_norm": 0.4595431089401245,
"learning_rate": 4.895819399570958e-06,
"loss": 0.4447,
"step": 1236
},
{
"epoch": 0.5981624758220503,
"grad_norm": 0.5729189515113831,
"learning_rate": 4.895637029150408e-06,
"loss": 0.4697,
"step": 1237
},
{
"epoch": 0.5986460348162476,
"grad_norm": 0.4351283311843872,
"learning_rate": 4.895454502650388e-06,
"loss": 0.4817,
"step": 1238
},
{
"epoch": 0.5991295938104448,
"grad_norm": 0.42649757862091064,
"learning_rate": 4.895271820082787e-06,
"loss": 0.4753,
"step": 1239
},
{
"epoch": 0.5996131528046421,
"grad_norm": 0.5059411525726318,
"learning_rate": 4.895088981459509e-06,
"loss": 0.4824,
"step": 1240
},
{
"epoch": 0.6000967117988395,
"grad_norm": 0.46130621433258057,
"learning_rate": 4.894905986792465e-06,
"loss": 0.4661,
"step": 1241
},
{
"epoch": 0.6005802707930368,
"grad_norm": 0.4530123770236969,
"learning_rate": 4.8947228360935795e-06,
"loss": 0.4606,
"step": 1242
},
{
"epoch": 0.601063829787234,
"grad_norm": 0.4376404583454132,
"learning_rate": 4.894539529374784e-06,
"loss": 0.4622,
"step": 1243
},
{
"epoch": 0.6015473887814313,
"grad_norm": 0.6113477945327759,
"learning_rate": 4.894356066648021e-06,
"loss": 0.4983,
"step": 1244
},
{
"epoch": 0.6020309477756286,
"grad_norm": 0.44446712732315063,
"learning_rate": 4.894172447925242e-06,
"loss": 0.4529,
"step": 1245
},
{
"epoch": 0.6025145067698259,
"grad_norm": 0.43907102942466736,
"learning_rate": 4.8939886732184125e-06,
"loss": 0.4532,
"step": 1246
},
{
"epoch": 0.6029980657640233,
"grad_norm": 0.43806588649749756,
"learning_rate": 4.893804742539505e-06,
"loss": 0.4707,
"step": 1247
},
{
"epoch": 0.6034816247582205,
"grad_norm": 0.4340677857398987,
"learning_rate": 4.893620655900502e-06,
"loss": 0.4455,
"step": 1248
},
{
"epoch": 0.6039651837524178,
"grad_norm": 0.45514774322509766,
"learning_rate": 4.893436413313398e-06,
"loss": 0.4786,
"step": 1249
},
{
"epoch": 0.6044487427466151,
"grad_norm": 0.5131993889808655,
"learning_rate": 4.893252014790195e-06,
"loss": 0.458,
"step": 1250
},
{
"epoch": 0.6049323017408124,
"grad_norm": 0.4115256667137146,
"learning_rate": 4.893067460342909e-06,
"loss": 0.4427,
"step": 1251
},
{
"epoch": 0.6054158607350096,
"grad_norm": 0.4817465543746948,
"learning_rate": 4.892882749983564e-06,
"loss": 0.4613,
"step": 1252
},
{
"epoch": 0.6058994197292069,
"grad_norm": 0.45432931184768677,
"learning_rate": 4.892697883724193e-06,
"loss": 0.4818,
"step": 1253
},
{
"epoch": 0.6063829787234043,
"grad_norm": 0.47187644243240356,
"learning_rate": 4.892512861576841e-06,
"loss": 0.4666,
"step": 1254
},
{
"epoch": 0.6068665377176016,
"grad_norm": 0.4426283538341522,
"learning_rate": 4.89232768355356e-06,
"loss": 0.475,
"step": 1255
},
{
"epoch": 0.6073500967117988,
"grad_norm": 0.4366290867328644,
"learning_rate": 4.892142349666418e-06,
"loss": 0.4646,
"step": 1256
},
{
"epoch": 0.6078336557059961,
"grad_norm": 0.4257701635360718,
"learning_rate": 4.891956859927489e-06,
"loss": 0.4656,
"step": 1257
},
{
"epoch": 0.6083172147001934,
"grad_norm": 0.4350696802139282,
"learning_rate": 4.891771214348857e-06,
"loss": 0.4552,
"step": 1258
},
{
"epoch": 0.6088007736943907,
"grad_norm": 0.4178631603717804,
"learning_rate": 4.891585412942617e-06,
"loss": 0.4759,
"step": 1259
},
{
"epoch": 0.6092843326885881,
"grad_norm": 0.43329527974128723,
"learning_rate": 4.8913994557208756e-06,
"loss": 0.4675,
"step": 1260
},
{
"epoch": 0.6097678916827853,
"grad_norm": 0.46253421902656555,
"learning_rate": 4.891213342695747e-06,
"loss": 0.4872,
"step": 1261
},
{
"epoch": 0.6102514506769826,
"grad_norm": 0.4532800316810608,
"learning_rate": 4.891027073879357e-06,
"loss": 0.4829,
"step": 1262
},
{
"epoch": 0.6107350096711799,
"grad_norm": 0.4343501925468445,
"learning_rate": 4.890840649283843e-06,
"loss": 0.4603,
"step": 1263
},
{
"epoch": 0.6112185686653772,
"grad_norm": 0.6483604907989502,
"learning_rate": 4.890654068921347e-06,
"loss": 0.4418,
"step": 1264
},
{
"epoch": 0.6117021276595744,
"grad_norm": 0.45191410183906555,
"learning_rate": 4.890467332804029e-06,
"loss": 0.4588,
"step": 1265
},
{
"epoch": 0.6121856866537717,
"grad_norm": 0.4154477119445801,
"learning_rate": 4.890280440944053e-06,
"loss": 0.4415,
"step": 1266
},
{
"epoch": 0.6126692456479691,
"grad_norm": 0.46999967098236084,
"learning_rate": 4.890093393353596e-06,
"loss": 0.469,
"step": 1267
},
{
"epoch": 0.6131528046421664,
"grad_norm": 0.4785558879375458,
"learning_rate": 4.889906190044843e-06,
"loss": 0.4627,
"step": 1268
},
{
"epoch": 0.6136363636363636,
"grad_norm": 0.43093442916870117,
"learning_rate": 4.889718831029993e-06,
"loss": 0.4383,
"step": 1269
},
{
"epoch": 0.6141199226305609,
"grad_norm": 0.43612828850746155,
"learning_rate": 4.889531316321251e-06,
"loss": 0.4759,
"step": 1270
},
{
"epoch": 0.6146034816247582,
"grad_norm": 0.40855318307876587,
"learning_rate": 4.889343645930834e-06,
"loss": 0.4579,
"step": 1271
},
{
"epoch": 0.6150870406189555,
"grad_norm": 0.8085166215896606,
"learning_rate": 4.88915581987097e-06,
"loss": 0.4704,
"step": 1272
},
{
"epoch": 0.6155705996131529,
"grad_norm": 0.4581427276134491,
"learning_rate": 4.8889678381538954e-06,
"loss": 0.4727,
"step": 1273
},
{
"epoch": 0.6160541586073501,
"grad_norm": 0.44169020652770996,
"learning_rate": 4.888779700791858e-06,
"loss": 0.4643,
"step": 1274
},
{
"epoch": 0.6165377176015474,
"grad_norm": 0.42497655749320984,
"learning_rate": 4.8885914077971155e-06,
"loss": 0.4699,
"step": 1275
},
{
"epoch": 0.6170212765957447,
"grad_norm": 0.5462936162948608,
"learning_rate": 4.888402959181934e-06,
"loss": 0.4687,
"step": 1276
},
{
"epoch": 0.617504835589942,
"grad_norm": 0.4208332598209381,
"learning_rate": 4.888214354958592e-06,
"loss": 0.4694,
"step": 1277
},
{
"epoch": 0.6179883945841392,
"grad_norm": 0.41665810346603394,
"learning_rate": 4.888025595139377e-06,
"loss": 0.4729,
"step": 1278
},
{
"epoch": 0.6184719535783365,
"grad_norm": 0.45152920484542847,
"learning_rate": 4.887836679736588e-06,
"loss": 0.4486,
"step": 1279
},
{
"epoch": 0.6189555125725339,
"grad_norm": 0.42997369170188904,
"learning_rate": 4.887647608762533e-06,
"loss": 0.4452,
"step": 1280
},
{
"epoch": 0.6194390715667312,
"grad_norm": 0.49401333928108215,
"learning_rate": 4.88745838222953e-06,
"loss": 0.47,
"step": 1281
},
{
"epoch": 0.6199226305609284,
"grad_norm": 0.4673040211200714,
"learning_rate": 4.887269000149907e-06,
"loss": 0.4401,
"step": 1282
},
{
"epoch": 0.6204061895551257,
"grad_norm": 0.4323924779891968,
"learning_rate": 4.887079462536003e-06,
"loss": 0.472,
"step": 1283
},
{
"epoch": 0.620889748549323,
"grad_norm": 0.4398268759250641,
"learning_rate": 4.886889769400166e-06,
"loss": 0.4595,
"step": 1284
},
{
"epoch": 0.6213733075435203,
"grad_norm": 0.6603145599365234,
"learning_rate": 4.886699920754755e-06,
"loss": 0.46,
"step": 1285
},
{
"epoch": 0.6218568665377177,
"grad_norm": 0.4661974608898163,
"learning_rate": 4.88650991661214e-06,
"loss": 0.4592,
"step": 1286
},
{
"epoch": 0.6223404255319149,
"grad_norm": 0.45898857712745667,
"learning_rate": 4.886319756984699e-06,
"loss": 0.474,
"step": 1287
},
{
"epoch": 0.6228239845261122,
"grad_norm": 0.4523521959781647,
"learning_rate": 4.886129441884822e-06,
"loss": 0.4629,
"step": 1288
},
{
"epoch": 0.6233075435203095,
"grad_norm": 0.4217586815357208,
"learning_rate": 4.8859389713249076e-06,
"loss": 0.4734,
"step": 1289
},
{
"epoch": 0.6237911025145068,
"grad_norm": 0.47298648953437805,
"learning_rate": 4.885748345317365e-06,
"loss": 0.4841,
"step": 1290
},
{
"epoch": 0.624274661508704,
"grad_norm": 0.49093514680862427,
"learning_rate": 4.885557563874614e-06,
"loss": 0.4458,
"step": 1291
},
{
"epoch": 0.6247582205029013,
"grad_norm": 0.4375835359096527,
"learning_rate": 4.885366627009085e-06,
"loss": 0.452,
"step": 1292
},
{
"epoch": 0.6252417794970987,
"grad_norm": 0.4391387701034546,
"learning_rate": 4.885175534733217e-06,
"loss": 0.4428,
"step": 1293
},
{
"epoch": 0.625725338491296,
"grad_norm": 0.4366895854473114,
"learning_rate": 4.88498428705946e-06,
"loss": 0.4591,
"step": 1294
},
{
"epoch": 0.6262088974854932,
"grad_norm": 0.44298285245895386,
"learning_rate": 4.8847928840002755e-06,
"loss": 0.4694,
"step": 1295
},
{
"epoch": 0.6266924564796905,
"grad_norm": 0.43226316571235657,
"learning_rate": 4.884601325568132e-06,
"loss": 0.438,
"step": 1296
},
{
"epoch": 0.6271760154738878,
"grad_norm": 0.42019107937812805,
"learning_rate": 4.88440961177551e-06,
"loss": 0.4494,
"step": 1297
},
{
"epoch": 0.6276595744680851,
"grad_norm": 0.44551870226860046,
"learning_rate": 4.884217742634901e-06,
"loss": 0.4768,
"step": 1298
},
{
"epoch": 0.6281431334622823,
"grad_norm": 0.48216238617897034,
"learning_rate": 4.884025718158806e-06,
"loss": 0.4603,
"step": 1299
},
{
"epoch": 0.6286266924564797,
"grad_norm": 0.42420119047164917,
"learning_rate": 4.883833538359733e-06,
"loss": 0.4574,
"step": 1300
},
{
"epoch": 0.629110251450677,
"grad_norm": 0.43553632497787476,
"learning_rate": 4.883641203250205e-06,
"loss": 0.4514,
"step": 1301
},
{
"epoch": 0.6295938104448743,
"grad_norm": 0.4379255473613739,
"learning_rate": 4.883448712842752e-06,
"loss": 0.4462,
"step": 1302
},
{
"epoch": 0.6300773694390716,
"grad_norm": 0.41880786418914795,
"learning_rate": 4.883256067149917e-06,
"loss": 0.4563,
"step": 1303
},
{
"epoch": 0.6305609284332688,
"grad_norm": 0.4387166500091553,
"learning_rate": 4.883063266184248e-06,
"loss": 0.4721,
"step": 1304
},
{
"epoch": 0.6310444874274661,
"grad_norm": 0.42293262481689453,
"learning_rate": 4.8828703099583086e-06,
"loss": 0.4489,
"step": 1305
},
{
"epoch": 0.6315280464216635,
"grad_norm": 0.5480451583862305,
"learning_rate": 4.882677198484669e-06,
"loss": 0.4717,
"step": 1306
},
{
"epoch": 0.6320116054158608,
"grad_norm": 0.4439707398414612,
"learning_rate": 4.8824839317759115e-06,
"loss": 0.4536,
"step": 1307
},
{
"epoch": 0.632495164410058,
"grad_norm": 0.5086662173271179,
"learning_rate": 4.882290509844627e-06,
"loss": 0.4607,
"step": 1308
},
{
"epoch": 0.6329787234042553,
"grad_norm": 0.42350244522094727,
"learning_rate": 4.882096932703418e-06,
"loss": 0.4621,
"step": 1309
},
{
"epoch": 0.6334622823984526,
"grad_norm": 0.4228970408439636,
"learning_rate": 4.881903200364897e-06,
"loss": 0.4841,
"step": 1310
},
{
"epoch": 0.6339458413926499,
"grad_norm": 0.4056392312049866,
"learning_rate": 4.881709312841684e-06,
"loss": 0.4572,
"step": 1311
},
{
"epoch": 0.6344294003868471,
"grad_norm": 0.427141934633255,
"learning_rate": 4.881515270146412e-06,
"loss": 0.4505,
"step": 1312
},
{
"epoch": 0.6349129593810445,
"grad_norm": 0.4352380335330963,
"learning_rate": 4.881321072291724e-06,
"loss": 0.4686,
"step": 1313
},
{
"epoch": 0.6353965183752418,
"grad_norm": 0.445075124502182,
"learning_rate": 4.88112671929027e-06,
"loss": 0.4724,
"step": 1314
},
{
"epoch": 0.6358800773694391,
"grad_norm": 0.42873072624206543,
"learning_rate": 4.880932211154715e-06,
"loss": 0.4748,
"step": 1315
},
{
"epoch": 0.6363636363636364,
"grad_norm": 0.44988590478897095,
"learning_rate": 4.880737547897731e-06,
"loss": 0.4551,
"step": 1316
},
{
"epoch": 0.6368471953578336,
"grad_norm": 0.9747801423072815,
"learning_rate": 4.880542729532e-06,
"loss": 0.4517,
"step": 1317
},
{
"epoch": 0.6373307543520309,
"grad_norm": 0.6301749348640442,
"learning_rate": 4.880347756070214e-06,
"loss": 0.4702,
"step": 1318
},
{
"epoch": 0.6378143133462283,
"grad_norm": 0.6370202302932739,
"learning_rate": 4.880152627525076e-06,
"loss": 0.4593,
"step": 1319
},
{
"epoch": 0.6382978723404256,
"grad_norm": 0.4780006408691406,
"learning_rate": 4.879957343909301e-06,
"loss": 0.4849,
"step": 1320
},
{
"epoch": 0.6387814313346228,
"grad_norm": 0.4865046739578247,
"learning_rate": 4.87976190523561e-06,
"loss": 0.4814,
"step": 1321
},
{
"epoch": 0.6392649903288201,
"grad_norm": 0.41326484084129333,
"learning_rate": 4.879566311516737e-06,
"loss": 0.4544,
"step": 1322
},
{
"epoch": 0.6397485493230174,
"grad_norm": 0.4544488489627838,
"learning_rate": 4.879370562765424e-06,
"loss": 0.4717,
"step": 1323
},
{
"epoch": 0.6402321083172147,
"grad_norm": 0.42522361874580383,
"learning_rate": 4.879174658994425e-06,
"loss": 0.4625,
"step": 1324
},
{
"epoch": 0.6407156673114119,
"grad_norm": 0.42701852321624756,
"learning_rate": 4.8789786002165055e-06,
"loss": 0.4857,
"step": 1325
},
{
"epoch": 0.6411992263056093,
"grad_norm": 0.45991528034210205,
"learning_rate": 4.8787823864444365e-06,
"loss": 0.4528,
"step": 1326
},
{
"epoch": 0.6416827852998066,
"grad_norm": 0.4402574896812439,
"learning_rate": 4.878586017691002e-06,
"loss": 0.4682,
"step": 1327
},
{
"epoch": 0.6421663442940039,
"grad_norm": 0.4804152846336365,
"learning_rate": 4.878389493968996e-06,
"loss": 0.4557,
"step": 1328
},
{
"epoch": 0.6426499032882012,
"grad_norm": 0.4432368874549866,
"learning_rate": 4.878192815291223e-06,
"loss": 0.4439,
"step": 1329
},
{
"epoch": 0.6431334622823984,
"grad_norm": 0.4438154697418213,
"learning_rate": 4.8779959816704955e-06,
"loss": 0.4455,
"step": 1330
},
{
"epoch": 0.6436170212765957,
"grad_norm": 0.4515506327152252,
"learning_rate": 4.877798993119639e-06,
"loss": 0.4624,
"step": 1331
},
{
"epoch": 0.6441005802707931,
"grad_norm": 0.4431227743625641,
"learning_rate": 4.877601849651487e-06,
"loss": 0.4571,
"step": 1332
},
{
"epoch": 0.6445841392649904,
"grad_norm": 0.4530756175518036,
"learning_rate": 4.877404551278883e-06,
"loss": 0.4572,
"step": 1333
},
{
"epoch": 0.6450676982591876,
"grad_norm": 0.43097323179244995,
"learning_rate": 4.877207098014682e-06,
"loss": 0.4582,
"step": 1334
},
{
"epoch": 0.6455512572533849,
"grad_norm": 0.43735161423683167,
"learning_rate": 4.8770094898717494e-06,
"loss": 0.4356,
"step": 1335
},
{
"epoch": 0.6460348162475822,
"grad_norm": 0.43248477578163147,
"learning_rate": 4.876811726862958e-06,
"loss": 0.4386,
"step": 1336
},
{
"epoch": 0.6465183752417795,
"grad_norm": 0.5172285437583923,
"learning_rate": 4.876613809001193e-06,
"loss": 0.4326,
"step": 1337
},
{
"epoch": 0.6470019342359767,
"grad_norm": 0.4423380494117737,
"learning_rate": 4.876415736299349e-06,
"loss": 0.4597,
"step": 1338
},
{
"epoch": 0.6474854932301741,
"grad_norm": 0.4481392204761505,
"learning_rate": 4.87621750877033e-06,
"loss": 0.4322,
"step": 1339
},
{
"epoch": 0.6479690522243714,
"grad_norm": 0.4456692636013031,
"learning_rate": 4.8760191264270525e-06,
"loss": 0.4702,
"step": 1340
},
{
"epoch": 0.6484526112185687,
"grad_norm": 1.2792326211929321,
"learning_rate": 4.8758205892824415e-06,
"loss": 0.4617,
"step": 1341
},
{
"epoch": 0.648936170212766,
"grad_norm": 0.5256432294845581,
"learning_rate": 4.8756218973494296e-06,
"loss": 0.4695,
"step": 1342
},
{
"epoch": 0.6494197292069632,
"grad_norm": 0.47011733055114746,
"learning_rate": 4.875423050640964e-06,
"loss": 0.4551,
"step": 1343
},
{
"epoch": 0.6499032882011605,
"grad_norm": 0.7151491641998291,
"learning_rate": 4.8752240491699985e-06,
"loss": 0.4633,
"step": 1344
},
{
"epoch": 0.6503868471953579,
"grad_norm": 0.6129742860794067,
"learning_rate": 4.875024892949499e-06,
"loss": 0.4528,
"step": 1345
},
{
"epoch": 0.6508704061895552,
"grad_norm": 0.43616563081741333,
"learning_rate": 4.874825581992442e-06,
"loss": 0.4596,
"step": 1346
},
{
"epoch": 0.6513539651837524,
"grad_norm": 0.43546509742736816,
"learning_rate": 4.874626116311812e-06,
"loss": 0.4571,
"step": 1347
},
{
"epoch": 0.6518375241779497,
"grad_norm": 0.47251904010772705,
"learning_rate": 4.874426495920603e-06,
"loss": 0.449,
"step": 1348
},
{
"epoch": 0.652321083172147,
"grad_norm": 0.45273882150650024,
"learning_rate": 4.874226720831823e-06,
"loss": 0.4543,
"step": 1349
},
{
"epoch": 0.6528046421663443,
"grad_norm": 0.4764840006828308,
"learning_rate": 4.874026791058486e-06,
"loss": 0.4888,
"step": 1350
},
{
"epoch": 0.6532882011605415,
"grad_norm": 0.4244127869606018,
"learning_rate": 4.873826706613618e-06,
"loss": 0.4724,
"step": 1351
},
{
"epoch": 0.6537717601547389,
"grad_norm": 0.455299973487854,
"learning_rate": 4.873626467510255e-06,
"loss": 0.4601,
"step": 1352
},
{
"epoch": 0.6542553191489362,
"grad_norm": 0.4337126910686493,
"learning_rate": 4.8734260737614435e-06,
"loss": 0.4518,
"step": 1353
},
{
"epoch": 0.6547388781431335,
"grad_norm": 0.46671974658966064,
"learning_rate": 4.873225525380239e-06,
"loss": 0.4776,
"step": 1354
},
{
"epoch": 0.6552224371373307,
"grad_norm": 0.40789374709129333,
"learning_rate": 4.873024822379707e-06,
"loss": 0.4425,
"step": 1355
},
{
"epoch": 0.655705996131528,
"grad_norm": 0.45303189754486084,
"learning_rate": 4.872823964772925e-06,
"loss": 0.4533,
"step": 1356
},
{
"epoch": 0.6561895551257253,
"grad_norm": 0.4961181879043579,
"learning_rate": 4.872622952572977e-06,
"loss": 0.4571,
"step": 1357
},
{
"epoch": 0.6566731141199227,
"grad_norm": 0.4303774833679199,
"learning_rate": 4.872421785792962e-06,
"loss": 0.4806,
"step": 1358
},
{
"epoch": 0.65715667311412,
"grad_norm": 0.4572507441043854,
"learning_rate": 4.872220464445983e-06,
"loss": 0.446,
"step": 1359
},
{
"epoch": 0.6576402321083172,
"grad_norm": 0.4358389973640442,
"learning_rate": 4.8720189885451605e-06,
"loss": 0.48,
"step": 1360
},
{
"epoch": 0.6581237911025145,
"grad_norm": 0.4195079505443573,
"learning_rate": 4.871817358103617e-06,
"loss": 0.4288,
"step": 1361
},
{
"epoch": 0.6586073500967118,
"grad_norm": 0.4293639659881592,
"learning_rate": 4.871615573134492e-06,
"loss": 0.4592,
"step": 1362
},
{
"epoch": 0.6590909090909091,
"grad_norm": 0.4390144646167755,
"learning_rate": 4.87141363365093e-06,
"loss": 0.4645,
"step": 1363
},
{
"epoch": 0.6595744680851063,
"grad_norm": 0.44983401894569397,
"learning_rate": 4.871211539666089e-06,
"loss": 0.4734,
"step": 1364
},
{
"epoch": 0.6600580270793037,
"grad_norm": 0.4398081600666046,
"learning_rate": 4.871009291193135e-06,
"loss": 0.4654,
"step": 1365
},
{
"epoch": 0.660541586073501,
"grad_norm": 0.43590307235717773,
"learning_rate": 4.870806888245245e-06,
"loss": 0.4594,
"step": 1366
},
{
"epoch": 0.6610251450676983,
"grad_norm": 0.4521521329879761,
"learning_rate": 4.870604330835606e-06,
"loss": 0.4633,
"step": 1367
},
{
"epoch": 0.6615087040618955,
"grad_norm": 0.46335622668266296,
"learning_rate": 4.870401618977415e-06,
"loss": 0.4625,
"step": 1368
},
{
"epoch": 0.6619922630560928,
"grad_norm": 0.42654484510421753,
"learning_rate": 4.870198752683879e-06,
"loss": 0.4518,
"step": 1369
},
{
"epoch": 0.6624758220502901,
"grad_norm": 0.45265164971351624,
"learning_rate": 4.869995731968214e-06,
"loss": 0.4798,
"step": 1370
},
{
"epoch": 0.6629593810444874,
"grad_norm": 0.4795035123825073,
"learning_rate": 4.86979255684365e-06,
"loss": 0.4721,
"step": 1371
},
{
"epoch": 0.6634429400386848,
"grad_norm": 0.44764506816864014,
"learning_rate": 4.869589227323421e-06,
"loss": 0.4578,
"step": 1372
},
{
"epoch": 0.663926499032882,
"grad_norm": 0.43399766087532043,
"learning_rate": 4.869385743420775e-06,
"loss": 0.4698,
"step": 1373
},
{
"epoch": 0.6644100580270793,
"grad_norm": 0.4521014094352722,
"learning_rate": 4.869182105148971e-06,
"loss": 0.465,
"step": 1374
},
{
"epoch": 0.6648936170212766,
"grad_norm": 0.41134724020957947,
"learning_rate": 4.868978312521274e-06,
"loss": 0.4454,
"step": 1375
},
{
"epoch": 0.6653771760154739,
"grad_norm": 0.47302302718162537,
"learning_rate": 4.868774365550963e-06,
"loss": 0.4777,
"step": 1376
},
{
"epoch": 0.6658607350096711,
"grad_norm": 0.42681750655174255,
"learning_rate": 4.868570264251324e-06,
"loss": 0.4767,
"step": 1377
},
{
"epoch": 0.6663442940038685,
"grad_norm": 0.4561176896095276,
"learning_rate": 4.868366008635657e-06,
"loss": 0.4527,
"step": 1378
},
{
"epoch": 0.6668278529980658,
"grad_norm": 0.5029308199882507,
"learning_rate": 4.868161598717267e-06,
"loss": 0.4567,
"step": 1379
},
{
"epoch": 0.6673114119922631,
"grad_norm": 0.41503024101257324,
"learning_rate": 4.867957034509473e-06,
"loss": 0.4495,
"step": 1380
},
{
"epoch": 0.6677949709864603,
"grad_norm": 0.6687749624252319,
"learning_rate": 4.867752316025602e-06,
"loss": 0.4604,
"step": 1381
},
{
"epoch": 0.6682785299806576,
"grad_norm": 0.4590419828891754,
"learning_rate": 4.867547443278993e-06,
"loss": 0.462,
"step": 1382
},
{
"epoch": 0.6687620889748549,
"grad_norm": 0.4743969142436981,
"learning_rate": 4.867342416282992e-06,
"loss": 0.4702,
"step": 1383
},
{
"epoch": 0.6692456479690522,
"grad_norm": 0.5056655406951904,
"learning_rate": 4.867137235050958e-06,
"loss": 0.4616,
"step": 1384
},
{
"epoch": 0.6697292069632496,
"grad_norm": 0.5750028491020203,
"learning_rate": 4.866931899596259e-06,
"loss": 0.4651,
"step": 1385
},
{
"epoch": 0.6702127659574468,
"grad_norm": 0.538818895816803,
"learning_rate": 4.866726409932272e-06,
"loss": 0.4619,
"step": 1386
},
{
"epoch": 0.6706963249516441,
"grad_norm": 0.5920456051826477,
"learning_rate": 4.866520766072385e-06,
"loss": 0.4565,
"step": 1387
},
{
"epoch": 0.6711798839458414,
"grad_norm": 0.6138905882835388,
"learning_rate": 4.866314968029997e-06,
"loss": 0.4764,
"step": 1388
},
{
"epoch": 0.6716634429400387,
"grad_norm": 0.4083728492259979,
"learning_rate": 4.866109015818515e-06,
"loss": 0.4402,
"step": 1389
},
{
"epoch": 0.6721470019342359,
"grad_norm": 0.5708425045013428,
"learning_rate": 4.865902909451358e-06,
"loss": 0.4363,
"step": 1390
},
{
"epoch": 0.6726305609284333,
"grad_norm": 0.4998311698436737,
"learning_rate": 4.865696648941954e-06,
"loss": 0.4493,
"step": 1391
},
{
"epoch": 0.6731141199226306,
"grad_norm": 0.42642462253570557,
"learning_rate": 4.865490234303741e-06,
"loss": 0.4605,
"step": 1392
},
{
"epoch": 0.6735976789168279,
"grad_norm": 0.42192861437797546,
"learning_rate": 4.865283665550167e-06,
"loss": 0.4624,
"step": 1393
},
{
"epoch": 0.6740812379110251,
"grad_norm": 0.4162391424179077,
"learning_rate": 4.8650769426946905e-06,
"loss": 0.4666,
"step": 1394
},
{
"epoch": 0.6745647969052224,
"grad_norm": 0.41674554347991943,
"learning_rate": 4.8648700657507794e-06,
"loss": 0.4637,
"step": 1395
},
{
"epoch": 0.6750483558994197,
"grad_norm": 0.40383294224739075,
"learning_rate": 4.864663034731913e-06,
"loss": 0.4698,
"step": 1396
},
{
"epoch": 0.675531914893617,
"grad_norm": 0.44404447078704834,
"learning_rate": 4.864455849651579e-06,
"loss": 0.4517,
"step": 1397
},
{
"epoch": 0.6760154738878144,
"grad_norm": 0.4714096784591675,
"learning_rate": 4.8642485105232766e-06,
"loss": 0.4562,
"step": 1398
},
{
"epoch": 0.6764990328820116,
"grad_norm": 0.41663625836372375,
"learning_rate": 4.864041017360512e-06,
"loss": 0.4449,
"step": 1399
},
{
"epoch": 0.6769825918762089,
"grad_norm": 0.5686074495315552,
"learning_rate": 4.863833370176807e-06,
"loss": 0.4554,
"step": 1400
},
{
"epoch": 0.6774661508704062,
"grad_norm": 0.41894927620887756,
"learning_rate": 4.863625568985688e-06,
"loss": 0.4749,
"step": 1401
},
{
"epoch": 0.6779497098646035,
"grad_norm": 0.4760285019874573,
"learning_rate": 4.863417613800694e-06,
"loss": 0.4605,
"step": 1402
},
{
"epoch": 0.6784332688588007,
"grad_norm": 0.6658610701560974,
"learning_rate": 4.863209504635373e-06,
"loss": 0.4447,
"step": 1403
},
{
"epoch": 0.6789168278529981,
"grad_norm": 0.46141868829727173,
"learning_rate": 4.863001241503285e-06,
"loss": 0.4561,
"step": 1404
},
{
"epoch": 0.6794003868471954,
"grad_norm": 1.1174705028533936,
"learning_rate": 4.862792824417998e-06,
"loss": 0.4542,
"step": 1405
},
{
"epoch": 0.6798839458413927,
"grad_norm": 0.4388240873813629,
"learning_rate": 4.86258425339309e-06,
"loss": 0.4666,
"step": 1406
},
{
"epoch": 0.6803675048355899,
"grad_norm": 0.4310016930103302,
"learning_rate": 4.862375528442152e-06,
"loss": 0.4604,
"step": 1407
},
{
"epoch": 0.6808510638297872,
"grad_norm": 0.47581034898757935,
"learning_rate": 4.862166649578779e-06,
"loss": 0.4608,
"step": 1408
},
{
"epoch": 0.6813346228239845,
"grad_norm": 0.47496527433395386,
"learning_rate": 4.861957616816583e-06,
"loss": 0.4674,
"step": 1409
},
{
"epoch": 0.6818181818181818,
"grad_norm": 0.45721524953842163,
"learning_rate": 4.861748430169181e-06,
"loss": 0.4589,
"step": 1410
},
{
"epoch": 0.6823017408123792,
"grad_norm": 0.43196386098861694,
"learning_rate": 4.8615390896502034e-06,
"loss": 0.4612,
"step": 1411
},
{
"epoch": 0.6827852998065764,
"grad_norm": 0.44146570563316345,
"learning_rate": 4.861329595273288e-06,
"loss": 0.4651,
"step": 1412
},
{
"epoch": 0.6832688588007737,
"grad_norm": 0.4383741021156311,
"learning_rate": 4.8611199470520844e-06,
"loss": 0.444,
"step": 1413
},
{
"epoch": 0.683752417794971,
"grad_norm": 0.41839542984962463,
"learning_rate": 4.86091014500025e-06,
"loss": 0.4783,
"step": 1414
},
{
"epoch": 0.6842359767891683,
"grad_norm": 0.41911396384239197,
"learning_rate": 4.860700189131456e-06,
"loss": 0.455,
"step": 1415
},
{
"epoch": 0.6847195357833655,
"grad_norm": 0.7411275506019592,
"learning_rate": 4.8604900794593786e-06,
"loss": 0.4546,
"step": 1416
},
{
"epoch": 0.6852030947775629,
"grad_norm": 0.4487959146499634,
"learning_rate": 4.860279815997709e-06,
"loss": 0.4693,
"step": 1417
},
{
"epoch": 0.6856866537717602,
"grad_norm": 0.4634031057357788,
"learning_rate": 4.860069398760146e-06,
"loss": 0.4527,
"step": 1418
},
{
"epoch": 0.6861702127659575,
"grad_norm": 0.42531654238700867,
"learning_rate": 4.859858827760396e-06,
"loss": 0.4653,
"step": 1419
},
{
"epoch": 0.6866537717601547,
"grad_norm": 0.43067359924316406,
"learning_rate": 4.859648103012183e-06,
"loss": 0.4571,
"step": 1420
},
{
"epoch": 0.687137330754352,
"grad_norm": 0.41355863213539124,
"learning_rate": 4.859437224529231e-06,
"loss": 0.4402,
"step": 1421
},
{
"epoch": 0.6876208897485493,
"grad_norm": 0.512670636177063,
"learning_rate": 4.859226192325281e-06,
"loss": 0.4648,
"step": 1422
},
{
"epoch": 0.6881044487427466,
"grad_norm": 0.42603832483291626,
"learning_rate": 4.859015006414084e-06,
"loss": 0.4761,
"step": 1423
},
{
"epoch": 0.688588007736944,
"grad_norm": 0.4183642268180847,
"learning_rate": 4.858803666809396e-06,
"loss": 0.4457,
"step": 1424
},
{
"epoch": 0.6890715667311412,
"grad_norm": 0.41630667448043823,
"learning_rate": 4.858592173524988e-06,
"loss": 0.4729,
"step": 1425
},
{
"epoch": 0.6895551257253385,
"grad_norm": 0.4732525646686554,
"learning_rate": 4.858380526574639e-06,
"loss": 0.4431,
"step": 1426
},
{
"epoch": 0.6900386847195358,
"grad_norm": 0.4732808470726013,
"learning_rate": 4.8581687259721375e-06,
"loss": 0.4718,
"step": 1427
},
{
"epoch": 0.690522243713733,
"grad_norm": 0.5042643547058105,
"learning_rate": 4.857956771731282e-06,
"loss": 0.4574,
"step": 1428
},
{
"epoch": 0.6910058027079303,
"grad_norm": 0.4248146414756775,
"learning_rate": 4.857744663865883e-06,
"loss": 0.4548,
"step": 1429
},
{
"epoch": 0.6914893617021277,
"grad_norm": 0.4656713902950287,
"learning_rate": 4.85753240238976e-06,
"loss": 0.4588,
"step": 1430
},
{
"epoch": 0.691972920696325,
"grad_norm": 0.5286734104156494,
"learning_rate": 4.85731998731674e-06,
"loss": 0.4612,
"step": 1431
},
{
"epoch": 0.6924564796905223,
"grad_norm": 0.40691936016082764,
"learning_rate": 4.857107418660664e-06,
"loss": 0.4375,
"step": 1432
},
{
"epoch": 0.6929400386847195,
"grad_norm": 0.41909584403038025,
"learning_rate": 4.85689469643538e-06,
"loss": 0.4766,
"step": 1433
},
{
"epoch": 0.6934235976789168,
"grad_norm": 0.45573991537094116,
"learning_rate": 4.85668182065475e-06,
"loss": 0.4252,
"step": 1434
},
{
"epoch": 0.6939071566731141,
"grad_norm": 0.44050395488739014,
"learning_rate": 4.856468791332638e-06,
"loss": 0.4725,
"step": 1435
},
{
"epoch": 0.6943907156673114,
"grad_norm": 0.4421895742416382,
"learning_rate": 4.8562556084829285e-06,
"loss": 0.4481,
"step": 1436
},
{
"epoch": 0.6948742746615088,
"grad_norm": 0.41216611862182617,
"learning_rate": 4.856042272119508e-06,
"loss": 0.4545,
"step": 1437
},
{
"epoch": 0.695357833655706,
"grad_norm": 0.8512147068977356,
"learning_rate": 4.8558287822562755e-06,
"loss": 0.4703,
"step": 1438
},
{
"epoch": 0.6958413926499033,
"grad_norm": 0.48798003792762756,
"learning_rate": 4.855615138907141e-06,
"loss": 0.4742,
"step": 1439
},
{
"epoch": 0.6963249516441006,
"grad_norm": 0.7163406014442444,
"learning_rate": 4.855401342086024e-06,
"loss": 0.4574,
"step": 1440
},
{
"epoch": 0.6968085106382979,
"grad_norm": 0.4780515432357788,
"learning_rate": 4.8551873918068525e-06,
"loss": 0.4829,
"step": 1441
},
{
"epoch": 0.6972920696324951,
"grad_norm": 0.4141700863838196,
"learning_rate": 4.854973288083566e-06,
"loss": 0.4712,
"step": 1442
},
{
"epoch": 0.6977756286266924,
"grad_norm": 0.43325740098953247,
"learning_rate": 4.854759030930115e-06,
"loss": 0.4399,
"step": 1443
},
{
"epoch": 0.6982591876208898,
"grad_norm": 0.44285354018211365,
"learning_rate": 4.854544620360458e-06,
"loss": 0.4502,
"step": 1444
},
{
"epoch": 0.6987427466150871,
"grad_norm": 0.44633156061172485,
"learning_rate": 4.854330056388563e-06,
"loss": 0.4764,
"step": 1445
},
{
"epoch": 0.6992263056092843,
"grad_norm": 0.40800923109054565,
"learning_rate": 4.854115339028411e-06,
"loss": 0.4415,
"step": 1446
},
{
"epoch": 0.6997098646034816,
"grad_norm": 0.44708821177482605,
"learning_rate": 4.85390046829399e-06,
"loss": 0.4656,
"step": 1447
},
{
"epoch": 0.7001934235976789,
"grad_norm": 0.4148036241531372,
"learning_rate": 4.853685444199299e-06,
"loss": 0.4406,
"step": 1448
},
{
"epoch": 0.7006769825918762,
"grad_norm": 0.4434484839439392,
"learning_rate": 4.853470266758348e-06,
"loss": 0.4481,
"step": 1449
},
{
"epoch": 0.7011605415860735,
"grad_norm": 0.4499351382255554,
"learning_rate": 4.8532549359851555e-06,
"loss": 0.4581,
"step": 1450
},
{
"epoch": 0.7016441005802708,
"grad_norm": 0.5162652730941772,
"learning_rate": 4.853039451893752e-06,
"loss": 0.4687,
"step": 1451
},
{
"epoch": 0.7021276595744681,
"grad_norm": 0.45000189542770386,
"learning_rate": 4.8528238144981745e-06,
"loss": 0.4345,
"step": 1452
},
{
"epoch": 0.7026112185686654,
"grad_norm": 0.4324718117713928,
"learning_rate": 4.852608023812473e-06,
"loss": 0.4595,
"step": 1453
},
{
"epoch": 0.7030947775628626,
"grad_norm": 0.4326983392238617,
"learning_rate": 4.852392079850707e-06,
"loss": 0.4457,
"step": 1454
},
{
"epoch": 0.7035783365570599,
"grad_norm": 0.431071400642395,
"learning_rate": 4.852175982626945e-06,
"loss": 0.4737,
"step": 1455
},
{
"epoch": 0.7040618955512572,
"grad_norm": 0.4474363923072815,
"learning_rate": 4.8519597321552666e-06,
"loss": 0.4716,
"step": 1456
},
{
"epoch": 0.7045454545454546,
"grad_norm": 0.4463392198085785,
"learning_rate": 4.85174332844976e-06,
"loss": 0.4482,
"step": 1457
},
{
"epoch": 0.7050290135396519,
"grad_norm": 0.5177990794181824,
"learning_rate": 4.851526771524526e-06,
"loss": 0.4768,
"step": 1458
},
{
"epoch": 0.7055125725338491,
"grad_norm": 0.4245111346244812,
"learning_rate": 4.8513100613936725e-06,
"loss": 0.4732,
"step": 1459
},
{
"epoch": 0.7059961315280464,
"grad_norm": 0.4291518032550812,
"learning_rate": 4.851093198071318e-06,
"loss": 0.4747,
"step": 1460
},
{
"epoch": 0.7064796905222437,
"grad_norm": 0.42464911937713623,
"learning_rate": 4.850876181571592e-06,
"loss": 0.4702,
"step": 1461
},
{
"epoch": 0.706963249516441,
"grad_norm": 0.4318511486053467,
"learning_rate": 4.850659011908633e-06,
"loss": 0.4663,
"step": 1462
},
{
"epoch": 0.7074468085106383,
"grad_norm": 0.42098772525787354,
"learning_rate": 4.850441689096591e-06,
"loss": 0.4641,
"step": 1463
},
{
"epoch": 0.7079303675048356,
"grad_norm": 0.4562412202358246,
"learning_rate": 4.850224213149624e-06,
"loss": 0.4346,
"step": 1464
},
{
"epoch": 0.7084139264990329,
"grad_norm": 0.43204471468925476,
"learning_rate": 4.850006584081901e-06,
"loss": 0.4743,
"step": 1465
},
{
"epoch": 0.7088974854932302,
"grad_norm": 0.4134417772293091,
"learning_rate": 4.849788801907602e-06,
"loss": 0.4462,
"step": 1466
},
{
"epoch": 0.7093810444874274,
"grad_norm": 0.472341388463974,
"learning_rate": 4.8495708666409135e-06,
"loss": 0.4558,
"step": 1467
},
{
"epoch": 0.7098646034816247,
"grad_norm": 0.6017382740974426,
"learning_rate": 4.849352778296037e-06,
"loss": 0.4409,
"step": 1468
},
{
"epoch": 0.710348162475822,
"grad_norm": 0.6916100978851318,
"learning_rate": 4.849134536887179e-06,
"loss": 0.4521,
"step": 1469
},
{
"epoch": 0.7108317214700194,
"grad_norm": 0.44142261147499084,
"learning_rate": 4.84891614242856e-06,
"loss": 0.4614,
"step": 1470
},
{
"epoch": 0.7113152804642167,
"grad_norm": 0.45912495255470276,
"learning_rate": 4.848697594934407e-06,
"loss": 0.4604,
"step": 1471
},
{
"epoch": 0.7117988394584139,
"grad_norm": 0.43075597286224365,
"learning_rate": 4.848478894418961e-06,
"loss": 0.4434,
"step": 1472
},
{
"epoch": 0.7122823984526112,
"grad_norm": 0.43654683232307434,
"learning_rate": 4.848260040896469e-06,
"loss": 0.4868,
"step": 1473
},
{
"epoch": 0.7127659574468085,
"grad_norm": 0.42006418108940125,
"learning_rate": 4.84804103438119e-06,
"loss": 0.4593,
"step": 1474
},
{
"epoch": 0.7132495164410058,
"grad_norm": 0.4998641610145569,
"learning_rate": 4.847821874887393e-06,
"loss": 0.4583,
"step": 1475
},
{
"epoch": 0.7137330754352031,
"grad_norm": 0.4955298900604248,
"learning_rate": 4.847602562429356e-06,
"loss": 0.4709,
"step": 1476
},
{
"epoch": 0.7142166344294004,
"grad_norm": 0.4439548850059509,
"learning_rate": 4.847383097021368e-06,
"loss": 0.465,
"step": 1477
},
{
"epoch": 0.7147001934235977,
"grad_norm": 0.44863393902778625,
"learning_rate": 4.847163478677726e-06,
"loss": 0.45,
"step": 1478
},
{
"epoch": 0.715183752417795,
"grad_norm": 0.42579901218414307,
"learning_rate": 4.846943707412741e-06,
"loss": 0.4527,
"step": 1479
},
{
"epoch": 0.7156673114119922,
"grad_norm": 0.4317091703414917,
"learning_rate": 4.84672378324073e-06,
"loss": 0.444,
"step": 1480
},
{
"epoch": 0.7161508704061895,
"grad_norm": 0.48940885066986084,
"learning_rate": 4.846503706176021e-06,
"loss": 0.4844,
"step": 1481
},
{
"epoch": 0.7166344294003868,
"grad_norm": 0.4427342414855957,
"learning_rate": 4.846283476232954e-06,
"loss": 0.4432,
"step": 1482
},
{
"epoch": 0.7171179883945842,
"grad_norm": 0.42922696471214294,
"learning_rate": 4.846063093425876e-06,
"loss": 0.4236,
"step": 1483
},
{
"epoch": 0.7176015473887815,
"grad_norm": 0.4381331503391266,
"learning_rate": 4.845842557769146e-06,
"loss": 0.4499,
"step": 1484
},
{
"epoch": 0.7180851063829787,
"grad_norm": 0.4614485204219818,
"learning_rate": 4.845621869277131e-06,
"loss": 0.4674,
"step": 1485
},
{
"epoch": 0.718568665377176,
"grad_norm": 0.4606797993183136,
"learning_rate": 4.8454010279642105e-06,
"loss": 0.4485,
"step": 1486
},
{
"epoch": 0.7190522243713733,
"grad_norm": 0.4316904842853546,
"learning_rate": 4.845180033844772e-06,
"loss": 0.447,
"step": 1487
},
{
"epoch": 0.7195357833655706,
"grad_norm": 0.45125576853752136,
"learning_rate": 4.844958886933215e-06,
"loss": 0.4676,
"step": 1488
},
{
"epoch": 0.7200193423597679,
"grad_norm": 0.4640989899635315,
"learning_rate": 4.844737587243944e-06,
"loss": 0.4608,
"step": 1489
},
{
"epoch": 0.7205029013539652,
"grad_norm": 0.4750809371471405,
"learning_rate": 4.844516134791381e-06,
"loss": 0.4561,
"step": 1490
},
{
"epoch": 0.7209864603481625,
"grad_norm": 0.42236974835395813,
"learning_rate": 4.844294529589952e-06,
"loss": 0.4595,
"step": 1491
},
{
"epoch": 0.7214700193423598,
"grad_norm": 0.5446487665176392,
"learning_rate": 4.8440727716540944e-06,
"loss": 0.4485,
"step": 1492
},
{
"epoch": 0.721953578336557,
"grad_norm": 0.43948227167129517,
"learning_rate": 4.843850860998258e-06,
"loss": 0.4565,
"step": 1493
},
{
"epoch": 0.7224371373307543,
"grad_norm": 0.45492276549339294,
"learning_rate": 4.8436287976369e-06,
"loss": 0.4523,
"step": 1494
},
{
"epoch": 0.7229206963249516,
"grad_norm": 0.4555799961090088,
"learning_rate": 4.843406581584487e-06,
"loss": 0.4535,
"step": 1495
},
{
"epoch": 0.723404255319149,
"grad_norm": 0.4507673382759094,
"learning_rate": 4.843184212855498e-06,
"loss": 0.4795,
"step": 1496
},
{
"epoch": 0.7238878143133463,
"grad_norm": 0.4655287265777588,
"learning_rate": 4.842961691464419e-06,
"loss": 0.4492,
"step": 1497
},
{
"epoch": 0.7243713733075435,
"grad_norm": 0.46638283133506775,
"learning_rate": 4.842739017425749e-06,
"loss": 0.4439,
"step": 1498
},
{
"epoch": 0.7248549323017408,
"grad_norm": 0.8781645894050598,
"learning_rate": 4.842516190753996e-06,
"loss": 0.4528,
"step": 1499
},
{
"epoch": 0.7253384912959381,
"grad_norm": 0.5471943616867065,
"learning_rate": 4.842293211463677e-06,
"loss": 0.4521,
"step": 1500
},
{
"epoch": 0.7258220502901354,
"grad_norm": 0.5306839942932129,
"learning_rate": 4.842070079569319e-06,
"loss": 0.4478,
"step": 1501
},
{
"epoch": 0.7263056092843327,
"grad_norm": 0.48267731070518494,
"learning_rate": 4.841846795085459e-06,
"loss": 0.4689,
"step": 1502
},
{
"epoch": 0.72678916827853,
"grad_norm": 0.4297829270362854,
"learning_rate": 4.841623358026646e-06,
"loss": 0.4576,
"step": 1503
},
{
"epoch": 0.7272727272727273,
"grad_norm": 0.47736454010009766,
"learning_rate": 4.8413997684074355e-06,
"loss": 0.4657,
"step": 1504
},
{
"epoch": 0.7277562862669246,
"grad_norm": 0.47262609004974365,
"learning_rate": 4.841176026242396e-06,
"loss": 0.439,
"step": 1505
},
{
"epoch": 0.7282398452611218,
"grad_norm": 0.4317893981933594,
"learning_rate": 4.840952131546103e-06,
"loss": 0.4468,
"step": 1506
},
{
"epoch": 0.7287234042553191,
"grad_norm": 0.4293960630893707,
"learning_rate": 4.8407280843331456e-06,
"loss": 0.4464,
"step": 1507
},
{
"epoch": 0.7292069632495164,
"grad_norm": 0.4995443522930145,
"learning_rate": 4.8405038846181195e-06,
"loss": 0.4596,
"step": 1508
},
{
"epoch": 0.7296905222437138,
"grad_norm": 0.4192506670951843,
"learning_rate": 4.840279532415633e-06,
"loss": 0.4655,
"step": 1509
},
{
"epoch": 0.730174081237911,
"grad_norm": 0.43838778138160706,
"learning_rate": 4.840055027740301e-06,
"loss": 0.4441,
"step": 1510
},
{
"epoch": 0.7306576402321083,
"grad_norm": 0.4230515956878662,
"learning_rate": 4.839830370606751e-06,
"loss": 0.4616,
"step": 1511
},
{
"epoch": 0.7311411992263056,
"grad_norm": 0.4396006464958191,
"learning_rate": 4.839605561029622e-06,
"loss": 0.4462,
"step": 1512
},
{
"epoch": 0.7316247582205029,
"grad_norm": 0.436576783657074,
"learning_rate": 4.839380599023558e-06,
"loss": 0.4694,
"step": 1513
},
{
"epoch": 0.7321083172147002,
"grad_norm": 0.5072314143180847,
"learning_rate": 4.839155484603216e-06,
"loss": 0.4641,
"step": 1514
},
{
"epoch": 0.7325918762088974,
"grad_norm": 0.42909497022628784,
"learning_rate": 4.838930217783263e-06,
"loss": 0.4506,
"step": 1515
},
{
"epoch": 0.7330754352030948,
"grad_norm": 0.42269250750541687,
"learning_rate": 4.838704798578377e-06,
"loss": 0.4555,
"step": 1516
},
{
"epoch": 0.7335589941972921,
"grad_norm": 0.4334566295146942,
"learning_rate": 4.838479227003241e-06,
"loss": 0.4756,
"step": 1517
},
{
"epoch": 0.7340425531914894,
"grad_norm": 0.4950511157512665,
"learning_rate": 4.838253503072554e-06,
"loss": 0.4704,
"step": 1518
},
{
"epoch": 0.7345261121856866,
"grad_norm": 0.44494107365608215,
"learning_rate": 4.838027626801021e-06,
"loss": 0.4502,
"step": 1519
},
{
"epoch": 0.7350096711798839,
"grad_norm": 0.45712941884994507,
"learning_rate": 4.83780159820336e-06,
"loss": 0.4722,
"step": 1520
},
{
"epoch": 0.7354932301740812,
"grad_norm": 0.4459114670753479,
"learning_rate": 4.837575417294295e-06,
"loss": 0.4578,
"step": 1521
},
{
"epoch": 0.7359767891682786,
"grad_norm": 0.557133138179779,
"learning_rate": 4.837349084088563e-06,
"loss": 0.4691,
"step": 1522
},
{
"epoch": 0.7364603481624759,
"grad_norm": 0.4571788012981415,
"learning_rate": 4.8371225986009104e-06,
"loss": 0.4564,
"step": 1523
},
{
"epoch": 0.7369439071566731,
"grad_norm": 0.4255046546459198,
"learning_rate": 4.836895960846092e-06,
"loss": 0.4501,
"step": 1524
},
{
"epoch": 0.7374274661508704,
"grad_norm": 0.43040698766708374,
"learning_rate": 4.836669170838874e-06,
"loss": 0.4522,
"step": 1525
},
{
"epoch": 0.7379110251450677,
"grad_norm": 0.4929506778717041,
"learning_rate": 4.836442228594032e-06,
"loss": 0.4553,
"step": 1526
},
{
"epoch": 0.738394584139265,
"grad_norm": 0.4765097200870514,
"learning_rate": 4.8362151341263515e-06,
"loss": 0.4727,
"step": 1527
},
{
"epoch": 0.7388781431334622,
"grad_norm": 0.4605850279331207,
"learning_rate": 4.83598788745063e-06,
"loss": 0.4484,
"step": 1528
},
{
"epoch": 0.7393617021276596,
"grad_norm": 0.7684630751609802,
"learning_rate": 4.83576048858167e-06,
"loss": 0.4534,
"step": 1529
},
{
"epoch": 0.7398452611218569,
"grad_norm": 0.42882004380226135,
"learning_rate": 4.835532937534289e-06,
"loss": 0.458,
"step": 1530
},
{
"epoch": 0.7403288201160542,
"grad_norm": 0.43994221091270447,
"learning_rate": 4.835305234323311e-06,
"loss": 0.4485,
"step": 1531
},
{
"epoch": 0.7408123791102514,
"grad_norm": 0.4888148903846741,
"learning_rate": 4.835077378963573e-06,
"loss": 0.4437,
"step": 1532
},
{
"epoch": 0.7412959381044487,
"grad_norm": 0.4104488492012024,
"learning_rate": 4.834849371469917e-06,
"loss": 0.454,
"step": 1533
},
{
"epoch": 0.741779497098646,
"grad_norm": 0.45566514134407043,
"learning_rate": 4.834621211857202e-06,
"loss": 0.4422,
"step": 1534
},
{
"epoch": 0.7422630560928434,
"grad_norm": 0.4125506579875946,
"learning_rate": 4.83439290014029e-06,
"loss": 0.4513,
"step": 1535
},
{
"epoch": 0.7427466150870407,
"grad_norm": 0.5421583652496338,
"learning_rate": 4.834164436334057e-06,
"loss": 0.4367,
"step": 1536
},
{
"epoch": 0.7432301740812379,
"grad_norm": 0.4610116481781006,
"learning_rate": 4.8339358204533874e-06,
"loss": 0.485,
"step": 1537
},
{
"epoch": 0.7437137330754352,
"grad_norm": 0.45787009596824646,
"learning_rate": 4.8337070525131755e-06,
"loss": 0.4576,
"step": 1538
},
{
"epoch": 0.7441972920696325,
"grad_norm": 0.43323206901550293,
"learning_rate": 4.833478132528328e-06,
"loss": 0.4725,
"step": 1539
},
{
"epoch": 0.7446808510638298,
"grad_norm": 0.46115002036094666,
"learning_rate": 4.833249060513756e-06,
"loss": 0.4678,
"step": 1540
},
{
"epoch": 0.745164410058027,
"grad_norm": 0.43544140458106995,
"learning_rate": 4.833019836484387e-06,
"loss": 0.4791,
"step": 1541
},
{
"epoch": 0.7456479690522244,
"grad_norm": 0.4073650538921356,
"learning_rate": 4.832790460455153e-06,
"loss": 0.4429,
"step": 1542
},
{
"epoch": 0.7461315280464217,
"grad_norm": 0.4155426621437073,
"learning_rate": 4.832560932441e-06,
"loss": 0.4519,
"step": 1543
},
{
"epoch": 0.746615087040619,
"grad_norm": 0.43398088216781616,
"learning_rate": 4.8323312524568825e-06,
"loss": 0.4478,
"step": 1544
},
{
"epoch": 0.7470986460348162,
"grad_norm": 0.4437168836593628,
"learning_rate": 4.832101420517761e-06,
"loss": 0.4553,
"step": 1545
},
{
"epoch": 0.7475822050290135,
"grad_norm": 0.46254605054855347,
"learning_rate": 4.831871436638613e-06,
"loss": 0.4623,
"step": 1546
},
{
"epoch": 0.7480657640232108,
"grad_norm": 0.4210186004638672,
"learning_rate": 4.8316413008344206e-06,
"loss": 0.4508,
"step": 1547
},
{
"epoch": 0.7485493230174082,
"grad_norm": 0.5048815608024597,
"learning_rate": 4.831411013120179e-06,
"loss": 0.4592,
"step": 1548
},
{
"epoch": 0.7490328820116054,
"grad_norm": 0.644477128982544,
"learning_rate": 4.83118057351089e-06,
"loss": 0.4583,
"step": 1549
},
{
"epoch": 0.7495164410058027,
"grad_norm": 0.5244016647338867,
"learning_rate": 4.830949982021568e-06,
"loss": 0.4583,
"step": 1550
},
{
"epoch": 0.75,
"grad_norm": 0.41632694005966187,
"learning_rate": 4.8307192386672365e-06,
"loss": 0.4528,
"step": 1551
},
{
"epoch": 0.7504835589941973,
"grad_norm": 1.1326220035552979,
"learning_rate": 4.8304883434629276e-06,
"loss": 0.4665,
"step": 1552
},
{
"epoch": 0.7509671179883946,
"grad_norm": 0.42171135544776917,
"learning_rate": 4.830257296423686e-06,
"loss": 0.4496,
"step": 1553
},
{
"epoch": 0.7514506769825918,
"grad_norm": 0.45120969414711,
"learning_rate": 4.830026097564564e-06,
"loss": 0.4775,
"step": 1554
},
{
"epoch": 0.7519342359767892,
"grad_norm": 0.4309103786945343,
"learning_rate": 4.829794746900626e-06,
"loss": 0.4561,
"step": 1555
},
{
"epoch": 0.7524177949709865,
"grad_norm": 0.4334292411804199,
"learning_rate": 4.829563244446942e-06,
"loss": 0.4518,
"step": 1556
},
{
"epoch": 0.7529013539651838,
"grad_norm": 0.42995062470436096,
"learning_rate": 4.829331590218597e-06,
"loss": 0.479,
"step": 1557
},
{
"epoch": 0.753384912959381,
"grad_norm": 0.6324965953826904,
"learning_rate": 4.829099784230683e-06,
"loss": 0.4869,
"step": 1558
},
{
"epoch": 0.7538684719535783,
"grad_norm": 0.45993176102638245,
"learning_rate": 4.828867826498302e-06,
"loss": 0.4686,
"step": 1559
},
{
"epoch": 0.7543520309477756,
"grad_norm": 0.4506562352180481,
"learning_rate": 4.828635717036569e-06,
"loss": 0.4779,
"step": 1560
},
{
"epoch": 0.754835589941973,
"grad_norm": 1.4067552089691162,
"learning_rate": 4.828403455860602e-06,
"loss": 0.4255,
"step": 1561
},
{
"epoch": 0.7553191489361702,
"grad_norm": 0.40801528096199036,
"learning_rate": 4.828171042985536e-06,
"loss": 0.4418,
"step": 1562
},
{
"epoch": 0.7558027079303675,
"grad_norm": 0.42918655276298523,
"learning_rate": 4.8279384784265124e-06,
"loss": 0.4633,
"step": 1563
},
{
"epoch": 0.7562862669245648,
"grad_norm": 0.43478161096572876,
"learning_rate": 4.827705762198683e-06,
"loss": 0.452,
"step": 1564
},
{
"epoch": 0.7567698259187621,
"grad_norm": 0.46473461389541626,
"learning_rate": 4.8274728943172105e-06,
"loss": 0.453,
"step": 1565
},
{
"epoch": 0.7572533849129593,
"grad_norm": 0.41810253262519836,
"learning_rate": 4.827239874797266e-06,
"loss": 0.4643,
"step": 1566
},
{
"epoch": 0.7577369439071566,
"grad_norm": 0.4875235855579376,
"learning_rate": 4.8270067036540305e-06,
"loss": 0.468,
"step": 1567
},
{
"epoch": 0.758220502901354,
"grad_norm": 0.4377727210521698,
"learning_rate": 4.826773380902696e-06,
"loss": 0.452,
"step": 1568
},
{
"epoch": 0.7587040618955513,
"grad_norm": 0.46963176131248474,
"learning_rate": 4.826539906558464e-06,
"loss": 0.4712,
"step": 1569
},
{
"epoch": 0.7591876208897486,
"grad_norm": 0.42545080184936523,
"learning_rate": 4.826306280636545e-06,
"loss": 0.4779,
"step": 1570
},
{
"epoch": 0.7596711798839458,
"grad_norm": 0.4469500184059143,
"learning_rate": 4.826072503152161e-06,
"loss": 0.4632,
"step": 1571
},
{
"epoch": 0.7601547388781431,
"grad_norm": 0.6386076807975769,
"learning_rate": 4.825838574120543e-06,
"loss": 0.4561,
"step": 1572
},
{
"epoch": 0.7606382978723404,
"grad_norm": 0.42124369740486145,
"learning_rate": 4.825604493556931e-06,
"loss": 0.4579,
"step": 1573
},
{
"epoch": 0.7611218568665378,
"grad_norm": 0.4487420320510864,
"learning_rate": 4.825370261476576e-06,
"loss": 0.4544,
"step": 1574
},
{
"epoch": 0.761605415860735,
"grad_norm": 0.4654780328273773,
"learning_rate": 4.825135877894739e-06,
"loss": 0.44,
"step": 1575
},
{
"epoch": 0.7620889748549323,
"grad_norm": 0.45695367455482483,
"learning_rate": 4.82490134282669e-06,
"loss": 0.4618,
"step": 1576
},
{
"epoch": 0.7625725338491296,
"grad_norm": 0.4306217133998871,
"learning_rate": 4.824666656287709e-06,
"loss": 0.4574,
"step": 1577
},
{
"epoch": 0.7630560928433269,
"grad_norm": 0.4209129512310028,
"learning_rate": 4.824431818293088e-06,
"loss": 0.4506,
"step": 1578
},
{
"epoch": 0.7635396518375241,
"grad_norm": 0.4458996653556824,
"learning_rate": 4.824196828858124e-06,
"loss": 0.4359,
"step": 1579
},
{
"epoch": 0.7640232108317214,
"grad_norm": 0.46759527921676636,
"learning_rate": 4.82396168799813e-06,
"loss": 0.445,
"step": 1580
},
{
"epoch": 0.7645067698259188,
"grad_norm": 0.4400273859500885,
"learning_rate": 4.823726395728424e-06,
"loss": 0.4393,
"step": 1581
},
{
"epoch": 0.7649903288201161,
"grad_norm": 0.49958503246307373,
"learning_rate": 4.823490952064337e-06,
"loss": 0.4619,
"step": 1582
},
{
"epoch": 0.7654738878143134,
"grad_norm": 0.4214346408843994,
"learning_rate": 4.823255357021206e-06,
"loss": 0.4671,
"step": 1583
},
{
"epoch": 0.7659574468085106,
"grad_norm": 0.4155096411705017,
"learning_rate": 4.8230196106143835e-06,
"loss": 0.4803,
"step": 1584
},
{
"epoch": 0.7664410058027079,
"grad_norm": 0.45440050959587097,
"learning_rate": 4.822783712859227e-06,
"loss": 0.4746,
"step": 1585
},
{
"epoch": 0.7669245647969052,
"grad_norm": 0.42145711183547974,
"learning_rate": 4.8225476637711055e-06,
"loss": 0.4658,
"step": 1586
},
{
"epoch": 0.7674081237911026,
"grad_norm": 0.9519943594932556,
"learning_rate": 4.8223114633653975e-06,
"loss": 0.4358,
"step": 1587
},
{
"epoch": 0.7678916827852998,
"grad_norm": 0.4257833659648895,
"learning_rate": 4.822075111657494e-06,
"loss": 0.4566,
"step": 1588
},
{
"epoch": 0.7683752417794971,
"grad_norm": 0.41692131757736206,
"learning_rate": 4.821838608662792e-06,
"loss": 0.4732,
"step": 1589
},
{
"epoch": 0.7688588007736944,
"grad_norm": 0.4908398687839508,
"learning_rate": 4.821601954396701e-06,
"loss": 0.4633,
"step": 1590
},
{
"epoch": 0.7693423597678917,
"grad_norm": 0.4296092987060547,
"learning_rate": 4.821365148874637e-06,
"loss": 0.4597,
"step": 1591
},
{
"epoch": 0.769825918762089,
"grad_norm": 0.4359409511089325,
"learning_rate": 4.821128192112031e-06,
"loss": 0.4615,
"step": 1592
},
{
"epoch": 0.7703094777562862,
"grad_norm": 0.4576115608215332,
"learning_rate": 4.820891084124321e-06,
"loss": 0.4521,
"step": 1593
},
{
"epoch": 0.7707930367504836,
"grad_norm": 0.43757978081703186,
"learning_rate": 4.820653824926953e-06,
"loss": 0.4576,
"step": 1594
},
{
"epoch": 0.7712765957446809,
"grad_norm": 0.40932992100715637,
"learning_rate": 4.820416414535386e-06,
"loss": 0.4452,
"step": 1595
},
{
"epoch": 0.7717601547388782,
"grad_norm": 0.43040093779563904,
"learning_rate": 4.820178852965088e-06,
"loss": 0.4562,
"step": 1596
},
{
"epoch": 0.7722437137330754,
"grad_norm": 0.48488855361938477,
"learning_rate": 4.8199411402315356e-06,
"loss": 0.4612,
"step": 1597
},
{
"epoch": 0.7727272727272727,
"grad_norm": 0.43302029371261597,
"learning_rate": 4.819703276350217e-06,
"loss": 0.4573,
"step": 1598
},
{
"epoch": 0.77321083172147,
"grad_norm": 0.49940812587738037,
"learning_rate": 4.819465261336629e-06,
"loss": 0.4455,
"step": 1599
},
{
"epoch": 0.7736943907156673,
"grad_norm": 0.4195043444633484,
"learning_rate": 4.819227095206278e-06,
"loss": 0.4618,
"step": 1600
},
{
"epoch": 0.7741779497098646,
"grad_norm": 0.4266219735145569,
"learning_rate": 4.818988777974682e-06,
"loss": 0.4634,
"step": 1601
},
{
"epoch": 0.7746615087040619,
"grad_norm": 0.44857192039489746,
"learning_rate": 4.8187503096573674e-06,
"loss": 0.4555,
"step": 1602
},
{
"epoch": 0.7751450676982592,
"grad_norm": 0.444608211517334,
"learning_rate": 4.81851169026987e-06,
"loss": 0.4562,
"step": 1603
},
{
"epoch": 0.7756286266924565,
"grad_norm": 0.4272661507129669,
"learning_rate": 4.818272919827737e-06,
"loss": 0.4651,
"step": 1604
},
{
"epoch": 0.7761121856866537,
"grad_norm": 0.41964107751846313,
"learning_rate": 4.8180339983465256e-06,
"loss": 0.4425,
"step": 1605
},
{
"epoch": 0.776595744680851,
"grad_norm": 0.6876490116119385,
"learning_rate": 4.8177949258418e-06,
"loss": 0.4552,
"step": 1606
},
{
"epoch": 0.7770793036750484,
"grad_norm": 0.6213021874427795,
"learning_rate": 4.8175557023291365e-06,
"loss": 0.444,
"step": 1607
},
{
"epoch": 0.7775628626692457,
"grad_norm": 0.44791480898857117,
"learning_rate": 4.817316327824122e-06,
"loss": 0.4477,
"step": 1608
},
{
"epoch": 0.778046421663443,
"grad_norm": 0.43226101994514465,
"learning_rate": 4.817076802342352e-06,
"loss": 0.4505,
"step": 1609
},
{
"epoch": 0.7785299806576402,
"grad_norm": 0.42529061436653137,
"learning_rate": 4.8168371258994305e-06,
"loss": 0.4762,
"step": 1610
},
{
"epoch": 0.7790135396518375,
"grad_norm": 0.43460187315940857,
"learning_rate": 4.816597298510974e-06,
"loss": 0.4569,
"step": 1611
},
{
"epoch": 0.7794970986460348,
"grad_norm": 0.9211375117301941,
"learning_rate": 4.816357320192608e-06,
"loss": 0.4413,
"step": 1612
},
{
"epoch": 0.7799806576402321,
"grad_norm": 0.4256466031074524,
"learning_rate": 4.816117190959966e-06,
"loss": 0.4705,
"step": 1613
},
{
"epoch": 0.7804642166344294,
"grad_norm": 0.45035386085510254,
"learning_rate": 4.815876910828694e-06,
"loss": 0.4625,
"step": 1614
},
{
"epoch": 0.7809477756286267,
"grad_norm": 0.446243017911911,
"learning_rate": 4.815636479814447e-06,
"loss": 0.4587,
"step": 1615
},
{
"epoch": 0.781431334622824,
"grad_norm": 0.46117037534713745,
"learning_rate": 4.815395897932888e-06,
"loss": 0.4632,
"step": 1616
},
{
"epoch": 0.7819148936170213,
"grad_norm": 0.417878121137619,
"learning_rate": 4.815155165199692e-06,
"loss": 0.462,
"step": 1617
},
{
"epoch": 0.7823984526112185,
"grad_norm": 0.43329668045043945,
"learning_rate": 4.814914281630543e-06,
"loss": 0.4555,
"step": 1618
},
{
"epoch": 0.7828820116054158,
"grad_norm": 0.41156476736068726,
"learning_rate": 4.814673247241135e-06,
"loss": 0.4582,
"step": 1619
},
{
"epoch": 0.7833655705996132,
"grad_norm": 0.5099841356277466,
"learning_rate": 4.814432062047172e-06,
"loss": 0.4259,
"step": 1620
},
{
"epoch": 0.7838491295938105,
"grad_norm": 0.41067326068878174,
"learning_rate": 4.814190726064367e-06,
"loss": 0.4444,
"step": 1621
},
{
"epoch": 0.7843326885880078,
"grad_norm": 0.4209578335285187,
"learning_rate": 4.813949239308444e-06,
"loss": 0.4651,
"step": 1622
},
{
"epoch": 0.784816247582205,
"grad_norm": 0.42823606729507446,
"learning_rate": 4.813707601795136e-06,
"loss": 0.4594,
"step": 1623
},
{
"epoch": 0.7852998065764023,
"grad_norm": 0.4377743899822235,
"learning_rate": 4.813465813540186e-06,
"loss": 0.4661,
"step": 1624
},
{
"epoch": 0.7857833655705996,
"grad_norm": 0.4125308692455292,
"learning_rate": 4.8132238745593474e-06,
"loss": 0.4528,
"step": 1625
},
{
"epoch": 0.7862669245647969,
"grad_norm": 0.42725464701652527,
"learning_rate": 4.812981784868383e-06,
"loss": 0.4549,
"step": 1626
},
{
"epoch": 0.7867504835589942,
"grad_norm": 0.49512672424316406,
"learning_rate": 4.812739544483064e-06,
"loss": 0.4482,
"step": 1627
},
{
"epoch": 0.7872340425531915,
"grad_norm": 0.42611178755760193,
"learning_rate": 4.812497153419173e-06,
"loss": 0.4514,
"step": 1628
},
{
"epoch": 0.7877176015473888,
"grad_norm": 0.4450073838233948,
"learning_rate": 4.812254611692504e-06,
"loss": 0.4625,
"step": 1629
},
{
"epoch": 0.7882011605415861,
"grad_norm": 0.6700217723846436,
"learning_rate": 4.812011919318857e-06,
"loss": 0.4854,
"step": 1630
},
{
"epoch": 0.7886847195357833,
"grad_norm": 0.4856039881706238,
"learning_rate": 4.811769076314044e-06,
"loss": 0.4487,
"step": 1631
},
{
"epoch": 0.7891682785299806,
"grad_norm": 0.4218294024467468,
"learning_rate": 4.811526082693888e-06,
"loss": 0.4488,
"step": 1632
},
{
"epoch": 0.789651837524178,
"grad_norm": 0.42976853251457214,
"learning_rate": 4.811282938474219e-06,
"loss": 0.4566,
"step": 1633
},
{
"epoch": 0.7901353965183753,
"grad_norm": 0.4193245470523834,
"learning_rate": 4.811039643670878e-06,
"loss": 0.4567,
"step": 1634
},
{
"epoch": 0.7906189555125726,
"grad_norm": 0.4194696247577667,
"learning_rate": 4.810796198299717e-06,
"loss": 0.4405,
"step": 1635
},
{
"epoch": 0.7911025145067698,
"grad_norm": 0.4444844424724579,
"learning_rate": 4.810552602376597e-06,
"loss": 0.4647,
"step": 1636
},
{
"epoch": 0.7915860735009671,
"grad_norm": 0.4571591019630432,
"learning_rate": 4.810308855917388e-06,
"loss": 0.4434,
"step": 1637
},
{
"epoch": 0.7920696324951644,
"grad_norm": 0.399015337228775,
"learning_rate": 4.810064958937971e-06,
"loss": 0.431,
"step": 1638
},
{
"epoch": 0.7925531914893617,
"grad_norm": 0.45619863271713257,
"learning_rate": 4.809820911454236e-06,
"loss": 0.4509,
"step": 1639
},
{
"epoch": 0.793036750483559,
"grad_norm": 0.427679181098938,
"learning_rate": 4.809576713482082e-06,
"loss": 0.468,
"step": 1640
},
{
"epoch": 0.7935203094777563,
"grad_norm": 0.5514840483665466,
"learning_rate": 4.809332365037421e-06,
"loss": 0.4567,
"step": 1641
},
{
"epoch": 0.7940038684719536,
"grad_norm": 0.45355403423309326,
"learning_rate": 4.809087866136172e-06,
"loss": 0.4705,
"step": 1642
},
{
"epoch": 0.7944874274661509,
"grad_norm": 0.4178369343280792,
"learning_rate": 4.8088432167942625e-06,
"loss": 0.4618,
"step": 1643
},
{
"epoch": 0.7949709864603481,
"grad_norm": 0.42189842462539673,
"learning_rate": 4.808598417027634e-06,
"loss": 0.4481,
"step": 1644
},
{
"epoch": 0.7954545454545454,
"grad_norm": 0.4578203856945038,
"learning_rate": 4.8083534668522345e-06,
"loss": 0.4324,
"step": 1645
},
{
"epoch": 0.7959381044487428,
"grad_norm": 0.449862003326416,
"learning_rate": 4.808108366284024e-06,
"loss": 0.4495,
"step": 1646
},
{
"epoch": 0.7964216634429401,
"grad_norm": 0.4230690598487854,
"learning_rate": 4.807863115338971e-06,
"loss": 0.4615,
"step": 1647
},
{
"epoch": 0.7969052224371374,
"grad_norm": 0.4719443917274475,
"learning_rate": 4.807617714033053e-06,
"loss": 0.4599,
"step": 1648
},
{
"epoch": 0.7973887814313346,
"grad_norm": 0.44040265679359436,
"learning_rate": 4.807372162382258e-06,
"loss": 0.4459,
"step": 1649
},
{
"epoch": 0.7978723404255319,
"grad_norm": 0.45189377665519714,
"learning_rate": 4.807126460402585e-06,
"loss": 0.4593,
"step": 1650
},
{
"epoch": 0.7983558994197292,
"grad_norm": 0.4428291618824005,
"learning_rate": 4.806880608110042e-06,
"loss": 0.456,
"step": 1651
},
{
"epoch": 0.7988394584139265,
"grad_norm": 0.4866504371166229,
"learning_rate": 4.8066346055206465e-06,
"loss": 0.4172,
"step": 1652
},
{
"epoch": 0.7993230174081238,
"grad_norm": 0.43505120277404785,
"learning_rate": 4.806388452650426e-06,
"loss": 0.4516,
"step": 1653
},
{
"epoch": 0.7998065764023211,
"grad_norm": 0.45473140478134155,
"learning_rate": 4.806142149515416e-06,
"loss": 0.4614,
"step": 1654
},
{
"epoch": 0.8002901353965184,
"grad_norm": 0.5264351963996887,
"learning_rate": 4.8058956961316675e-06,
"loss": 0.4461,
"step": 1655
},
{
"epoch": 0.8007736943907157,
"grad_norm": 0.435194730758667,
"learning_rate": 4.805649092515232e-06,
"loss": 0.4469,
"step": 1656
},
{
"epoch": 0.8012572533849129,
"grad_norm": 0.45554107427597046,
"learning_rate": 4.805402338682181e-06,
"loss": 0.461,
"step": 1657
},
{
"epoch": 0.8017408123791102,
"grad_norm": 0.4560002088546753,
"learning_rate": 4.8051554346485885e-06,
"loss": 0.4592,
"step": 1658
},
{
"epoch": 0.8022243713733076,
"grad_norm": 0.47332048416137695,
"learning_rate": 4.804908380430542e-06,
"loss": 0.4441,
"step": 1659
},
{
"epoch": 0.8027079303675049,
"grad_norm": 0.43564561009407043,
"learning_rate": 4.804661176044134e-06,
"loss": 0.474,
"step": 1660
},
{
"epoch": 0.8031914893617021,
"grad_norm": 0.43015816807746887,
"learning_rate": 4.8044138215054755e-06,
"loss": 0.4503,
"step": 1661
},
{
"epoch": 0.8036750483558994,
"grad_norm": 0.4809452295303345,
"learning_rate": 4.804166316830678e-06,
"loss": 0.4514,
"step": 1662
},
{
"epoch": 0.8041586073500967,
"grad_norm": 0.4462874233722687,
"learning_rate": 4.803918662035868e-06,
"loss": 0.463,
"step": 1663
},
{
"epoch": 0.804642166344294,
"grad_norm": 0.41628143191337585,
"learning_rate": 4.803670857137181e-06,
"loss": 0.4339,
"step": 1664
},
{
"epoch": 0.8051257253384912,
"grad_norm": 0.4212723672389984,
"learning_rate": 4.803422902150762e-06,
"loss": 0.4722,
"step": 1665
},
{
"epoch": 0.8056092843326886,
"grad_norm": 0.44856181740760803,
"learning_rate": 4.8031747970927645e-06,
"loss": 0.4467,
"step": 1666
},
{
"epoch": 0.8060928433268859,
"grad_norm": 0.45024457573890686,
"learning_rate": 4.802926541979354e-06,
"loss": 0.4616,
"step": 1667
},
{
"epoch": 0.8065764023210832,
"grad_norm": 0.42908668518066406,
"learning_rate": 4.802678136826704e-06,
"loss": 0.4625,
"step": 1668
},
{
"epoch": 0.8070599613152805,
"grad_norm": 0.4267566204071045,
"learning_rate": 4.8024295816509995e-06,
"loss": 0.4548,
"step": 1669
},
{
"epoch": 0.8075435203094777,
"grad_norm": 0.4682227373123169,
"learning_rate": 4.802180876468433e-06,
"loss": 0.4555,
"step": 1670
},
{
"epoch": 0.808027079303675,
"grad_norm": 0.7029394507408142,
"learning_rate": 4.801932021295209e-06,
"loss": 0.4631,
"step": 1671
},
{
"epoch": 0.8085106382978723,
"grad_norm": 0.5212095975875854,
"learning_rate": 4.801683016147541e-06,
"loss": 0.4471,
"step": 1672
},
{
"epoch": 0.8089941972920697,
"grad_norm": 0.4278648793697357,
"learning_rate": 4.801433861041651e-06,
"loss": 0.4652,
"step": 1673
},
{
"epoch": 0.809477756286267,
"grad_norm": 0.45294439792633057,
"learning_rate": 4.801184555993772e-06,
"loss": 0.4419,
"step": 1674
},
{
"epoch": 0.8099613152804642,
"grad_norm": 0.4857720136642456,
"learning_rate": 4.800935101020148e-06,
"loss": 0.4649,
"step": 1675
},
{
"epoch": 0.8104448742746615,
"grad_norm": 0.4251551926136017,
"learning_rate": 4.800685496137029e-06,
"loss": 0.4667,
"step": 1676
},
{
"epoch": 0.8109284332688588,
"grad_norm": 0.5644770860671997,
"learning_rate": 4.800435741360679e-06,
"loss": 0.4417,
"step": 1677
},
{
"epoch": 0.811411992263056,
"grad_norm": 0.44446080923080444,
"learning_rate": 4.80018583670737e-06,
"loss": 0.4592,
"step": 1678
},
{
"epoch": 0.8118955512572534,
"grad_norm": 0.42635470628738403,
"learning_rate": 4.799935782193383e-06,
"loss": 0.4415,
"step": 1679
},
{
"epoch": 0.8123791102514507,
"grad_norm": 0.472168892621994,
"learning_rate": 4.799685577835009e-06,
"loss": 0.4577,
"step": 1680
},
{
"epoch": 0.812862669245648,
"grad_norm": 0.40940558910369873,
"learning_rate": 4.79943522364855e-06,
"loss": 0.4394,
"step": 1681
},
{
"epoch": 0.8133462282398453,
"grad_norm": 0.4531543254852295,
"learning_rate": 4.799184719650316e-06,
"loss": 0.458,
"step": 1682
},
{
"epoch": 0.8138297872340425,
"grad_norm": 0.5482609868049622,
"learning_rate": 4.79893406585663e-06,
"loss": 0.4498,
"step": 1683
},
{
"epoch": 0.8143133462282398,
"grad_norm": 0.4192523956298828,
"learning_rate": 4.7986832622838195e-06,
"loss": 0.4456,
"step": 1684
},
{
"epoch": 0.8147969052224371,
"grad_norm": 0.4526176452636719,
"learning_rate": 4.798432308948227e-06,
"loss": 0.4259,
"step": 1685
},
{
"epoch": 0.8152804642166345,
"grad_norm": 0.4501959979534149,
"learning_rate": 4.798181205866201e-06,
"loss": 0.442,
"step": 1686
},
{
"epoch": 0.8157640232108317,
"grad_norm": 0.42557254433631897,
"learning_rate": 4.797929953054102e-06,
"loss": 0.4685,
"step": 1687
},
{
"epoch": 0.816247582205029,
"grad_norm": 0.457403302192688,
"learning_rate": 4.7976785505283e-06,
"loss": 0.4717,
"step": 1688
},
{
"epoch": 0.8167311411992263,
"grad_norm": 0.5547893047332764,
"learning_rate": 4.797426998305172e-06,
"loss": 0.4608,
"step": 1689
},
{
"epoch": 0.8172147001934236,
"grad_norm": 0.43235495686531067,
"learning_rate": 4.7971752964011105e-06,
"loss": 0.469,
"step": 1690
},
{
"epoch": 0.8176982591876208,
"grad_norm": 0.6597929000854492,
"learning_rate": 4.796923444832512e-06,
"loss": 0.4495,
"step": 1691
},
{
"epoch": 0.8181818181818182,
"grad_norm": 0.6541535258293152,
"learning_rate": 4.796671443615785e-06,
"loss": 0.4659,
"step": 1692
},
{
"epoch": 0.8186653771760155,
"grad_norm": 0.45156872272491455,
"learning_rate": 4.796419292767349e-06,
"loss": 0.4666,
"step": 1693
},
{
"epoch": 0.8191489361702128,
"grad_norm": 0.41430842876434326,
"learning_rate": 4.7961669923036304e-06,
"loss": 0.4534,
"step": 1694
},
{
"epoch": 0.8196324951644101,
"grad_norm": 1.6840633153915405,
"learning_rate": 4.795914542241069e-06,
"loss": 0.4598,
"step": 1695
},
{
"epoch": 0.8201160541586073,
"grad_norm": 0.44279929995536804,
"learning_rate": 4.7956619425961095e-06,
"loss": 0.449,
"step": 1696
},
{
"epoch": 0.8205996131528046,
"grad_norm": 0.48392534255981445,
"learning_rate": 4.7954091933852124e-06,
"loss": 0.4634,
"step": 1697
},
{
"epoch": 0.8210831721470019,
"grad_norm": 0.4240199029445648,
"learning_rate": 4.795156294624842e-06,
"loss": 0.4418,
"step": 1698
},
{
"epoch": 0.8215667311411993,
"grad_norm": 0.4629557430744171,
"learning_rate": 4.794903246331477e-06,
"loss": 0.4592,
"step": 1699
},
{
"epoch": 0.8220502901353965,
"grad_norm": 0.41459253430366516,
"learning_rate": 4.794650048521603e-06,
"loss": 0.4622,
"step": 1700
},
{
"epoch": 0.8225338491295938,
"grad_norm": 0.4223582148551941,
"learning_rate": 4.794396701211715e-06,
"loss": 0.4542,
"step": 1701
},
{
"epoch": 0.8230174081237911,
"grad_norm": 0.41140425205230713,
"learning_rate": 4.794143204418322e-06,
"loss": 0.4479,
"step": 1702
},
{
"epoch": 0.8235009671179884,
"grad_norm": 0.45532315969467163,
"learning_rate": 4.793889558157937e-06,
"loss": 0.4446,
"step": 1703
},
{
"epoch": 0.8239845261121856,
"grad_norm": 0.4596729576587677,
"learning_rate": 4.793635762447086e-06,
"loss": 0.4658,
"step": 1704
},
{
"epoch": 0.824468085106383,
"grad_norm": 0.4528654217720032,
"learning_rate": 4.793381817302306e-06,
"loss": 0.4689,
"step": 1705
},
{
"epoch": 0.8249516441005803,
"grad_norm": 0.4420747458934784,
"learning_rate": 4.793127722740139e-06,
"loss": 0.4459,
"step": 1706
},
{
"epoch": 0.8254352030947776,
"grad_norm": 0.4936273992061615,
"learning_rate": 4.792873478777143e-06,
"loss": 0.4551,
"step": 1707
},
{
"epoch": 0.8259187620889749,
"grad_norm": 0.41701599955558777,
"learning_rate": 4.792619085429879e-06,
"loss": 0.4189,
"step": 1708
},
{
"epoch": 0.8264023210831721,
"grad_norm": 0.7268118262290955,
"learning_rate": 4.792364542714923e-06,
"loss": 0.4482,
"step": 1709
},
{
"epoch": 0.8268858800773694,
"grad_norm": 0.4214200973510742,
"learning_rate": 4.792109850648859e-06,
"loss": 0.4618,
"step": 1710
},
{
"epoch": 0.8273694390715667,
"grad_norm": 0.4370262920856476,
"learning_rate": 4.791855009248279e-06,
"loss": 0.4581,
"step": 1711
},
{
"epoch": 0.8278529980657641,
"grad_norm": 0.4265212118625641,
"learning_rate": 4.7916000185297885e-06,
"loss": 0.4483,
"step": 1712
},
{
"epoch": 0.8283365570599613,
"grad_norm": 0.4249550402164459,
"learning_rate": 4.791344878509999e-06,
"loss": 0.4545,
"step": 1713
},
{
"epoch": 0.8288201160541586,
"grad_norm": 0.4179043173789978,
"learning_rate": 4.791089589205534e-06,
"loss": 0.4481,
"step": 1714
},
{
"epoch": 0.8293036750483559,
"grad_norm": 0.42232656478881836,
"learning_rate": 4.790834150633025e-06,
"loss": 0.4495,
"step": 1715
},
{
"epoch": 0.8297872340425532,
"grad_norm": 0.4011172652244568,
"learning_rate": 4.790578562809116e-06,
"loss": 0.4571,
"step": 1716
},
{
"epoch": 0.8302707930367504,
"grad_norm": 0.701311469078064,
"learning_rate": 4.7903228257504574e-06,
"loss": 0.4622,
"step": 1717
},
{
"epoch": 0.8307543520309478,
"grad_norm": 0.4265159070491791,
"learning_rate": 4.790066939473711e-06,
"loss": 0.4398,
"step": 1718
},
{
"epoch": 0.8312379110251451,
"grad_norm": 0.48582372069358826,
"learning_rate": 4.78981090399555e-06,
"loss": 0.4812,
"step": 1719
},
{
"epoch": 0.8317214700193424,
"grad_norm": 0.4251188337802887,
"learning_rate": 4.789554719332652e-06,
"loss": 0.4429,
"step": 1720
},
{
"epoch": 0.8322050290135397,
"grad_norm": 0.4369942843914032,
"learning_rate": 4.789298385501712e-06,
"loss": 0.4698,
"step": 1721
},
{
"epoch": 0.8326885880077369,
"grad_norm": 0.4649386405944824,
"learning_rate": 4.789041902519427e-06,
"loss": 0.4595,
"step": 1722
},
{
"epoch": 0.8331721470019342,
"grad_norm": 0.43647441267967224,
"learning_rate": 4.788785270402508e-06,
"loss": 0.4301,
"step": 1723
},
{
"epoch": 0.8336557059961315,
"grad_norm": 0.4348291754722595,
"learning_rate": 4.788528489167677e-06,
"loss": 0.4698,
"step": 1724
},
{
"epoch": 0.8341392649903289,
"grad_norm": 0.4395955204963684,
"learning_rate": 4.788271558831663e-06,
"loss": 0.4602,
"step": 1725
},
{
"epoch": 0.8346228239845261,
"grad_norm": 0.4474172592163086,
"learning_rate": 4.788014479411203e-06,
"loss": 0.4531,
"step": 1726
},
{
"epoch": 0.8351063829787234,
"grad_norm": 0.4625321924686432,
"learning_rate": 4.787757250923049e-06,
"loss": 0.4687,
"step": 1727
},
{
"epoch": 0.8355899419729207,
"grad_norm": 0.4593714773654938,
"learning_rate": 4.7874998733839585e-06,
"loss": 0.4457,
"step": 1728
},
{
"epoch": 0.836073500967118,
"grad_norm": 0.46115872263908386,
"learning_rate": 4.7872423468107e-06,
"loss": 0.4609,
"step": 1729
},
{
"epoch": 0.8365570599613152,
"grad_norm": 0.4356870949268341,
"learning_rate": 4.786984671220053e-06,
"loss": 0.474,
"step": 1730
},
{
"epoch": 0.8370406189555126,
"grad_norm": 0.4250693619251251,
"learning_rate": 4.786726846628804e-06,
"loss": 0.448,
"step": 1731
},
{
"epoch": 0.8375241779497099,
"grad_norm": 0.433578222990036,
"learning_rate": 4.786468873053751e-06,
"loss": 0.4563,
"step": 1732
},
{
"epoch": 0.8380077369439072,
"grad_norm": 0.40608447790145874,
"learning_rate": 4.786210750511701e-06,
"loss": 0.4776,
"step": 1733
},
{
"epoch": 0.8384912959381045,
"grad_norm": 0.42531365156173706,
"learning_rate": 4.785952479019472e-06,
"loss": 0.4628,
"step": 1734
},
{
"epoch": 0.8389748549323017,
"grad_norm": 0.4353698790073395,
"learning_rate": 4.785694058593891e-06,
"loss": 0.4355,
"step": 1735
},
{
"epoch": 0.839458413926499,
"grad_norm": 0.48642927408218384,
"learning_rate": 4.785435489251794e-06,
"loss": 0.4357,
"step": 1736
},
{
"epoch": 0.8399419729206963,
"grad_norm": 0.4886138439178467,
"learning_rate": 4.785176771010026e-06,
"loss": 0.4432,
"step": 1737
},
{
"epoch": 0.8404255319148937,
"grad_norm": 0.45023098587989807,
"learning_rate": 4.784917903885445e-06,
"loss": 0.4606,
"step": 1738
},
{
"epoch": 0.8409090909090909,
"grad_norm": 0.432458758354187,
"learning_rate": 4.7846588878949155e-06,
"loss": 0.4678,
"step": 1739
},
{
"epoch": 0.8413926499032882,
"grad_norm": 0.8182100653648376,
"learning_rate": 4.784399723055313e-06,
"loss": 0.4517,
"step": 1740
},
{
"epoch": 0.8418762088974855,
"grad_norm": 0.40394729375839233,
"learning_rate": 4.784140409383522e-06,
"loss": 0.4487,
"step": 1741
},
{
"epoch": 0.8423597678916828,
"grad_norm": 0.4333181083202362,
"learning_rate": 4.783880946896438e-06,
"loss": 0.4647,
"step": 1742
},
{
"epoch": 0.84284332688588,
"grad_norm": 0.4345736801624298,
"learning_rate": 4.783621335610965e-06,
"loss": 0.4709,
"step": 1743
},
{
"epoch": 0.8433268858800773,
"grad_norm": 0.4557689130306244,
"learning_rate": 4.783361575544017e-06,
"loss": 0.4335,
"step": 1744
},
{
"epoch": 0.8438104448742747,
"grad_norm": 0.46906688809394836,
"learning_rate": 4.783101666712517e-06,
"loss": 0.4603,
"step": 1745
},
{
"epoch": 0.844294003868472,
"grad_norm": 0.41164135932922363,
"learning_rate": 4.7828416091334e-06,
"loss": 0.4465,
"step": 1746
},
{
"epoch": 0.8447775628626693,
"grad_norm": 0.4292045533657074,
"learning_rate": 4.782581402823608e-06,
"loss": 0.4614,
"step": 1747
},
{
"epoch": 0.8452611218568665,
"grad_norm": 0.4357629716396332,
"learning_rate": 4.782321047800094e-06,
"loss": 0.4654,
"step": 1748
},
{
"epoch": 0.8457446808510638,
"grad_norm": 0.4188762307167053,
"learning_rate": 4.782060544079822e-06,
"loss": 0.4631,
"step": 1749
},
{
"epoch": 0.8462282398452611,
"grad_norm": 0.42249998450279236,
"learning_rate": 4.781799891679763e-06,
"loss": 0.4467,
"step": 1750
},
{
"epoch": 0.8467117988394585,
"grad_norm": 0.42382362484931946,
"learning_rate": 4.781539090616898e-06,
"loss": 0.4286,
"step": 1751
},
{
"epoch": 0.8471953578336557,
"grad_norm": 0.45076820254325867,
"learning_rate": 4.78127814090822e-06,
"loss": 0.4471,
"step": 1752
},
{
"epoch": 0.847678916827853,
"grad_norm": 0.4701208770275116,
"learning_rate": 4.781017042570729e-06,
"loss": 0.4637,
"step": 1753
},
{
"epoch": 0.8481624758220503,
"grad_norm": 0.44319573044776917,
"learning_rate": 4.780755795621438e-06,
"loss": 0.4604,
"step": 1754
},
{
"epoch": 0.8486460348162476,
"grad_norm": 0.4308934509754181,
"learning_rate": 4.7804944000773665e-06,
"loss": 0.4554,
"step": 1755
},
{
"epoch": 0.8491295938104448,
"grad_norm": 0.4575637876987457,
"learning_rate": 4.780232855955544e-06,
"loss": 0.4506,
"step": 1756
},
{
"epoch": 0.8496131528046421,
"grad_norm": 0.43875017762184143,
"learning_rate": 4.779971163273012e-06,
"loss": 0.4441,
"step": 1757
},
{
"epoch": 0.8500967117988395,
"grad_norm": 0.4851578176021576,
"learning_rate": 4.779709322046818e-06,
"loss": 0.4551,
"step": 1758
},
{
"epoch": 0.8505802707930368,
"grad_norm": 0.46814751625061035,
"learning_rate": 4.7794473322940234e-06,
"loss": 0.407,
"step": 1759
},
{
"epoch": 0.851063829787234,
"grad_norm": 0.4497595727443695,
"learning_rate": 4.779185194031698e-06,
"loss": 0.4621,
"step": 1760
},
{
"epoch": 0.8515473887814313,
"grad_norm": 0.41520148515701294,
"learning_rate": 4.778922907276917e-06,
"loss": 0.4521,
"step": 1761
},
{
"epoch": 0.8520309477756286,
"grad_norm": 0.4281027019023895,
"learning_rate": 4.778660472046773e-06,
"loss": 0.4625,
"step": 1762
},
{
"epoch": 0.8525145067698259,
"grad_norm": 0.4215813875198364,
"learning_rate": 4.77839788835836e-06,
"loss": 0.4404,
"step": 1763
},
{
"epoch": 0.8529980657640233,
"grad_norm": 0.4445689022541046,
"learning_rate": 4.77813515622879e-06,
"loss": 0.4363,
"step": 1764
},
{
"epoch": 0.8534816247582205,
"grad_norm": 0.6075159311294556,
"learning_rate": 4.777872275675176e-06,
"loss": 0.4429,
"step": 1765
},
{
"epoch": 0.8539651837524178,
"grad_norm": 0.4195761978626251,
"learning_rate": 4.777609246714648e-06,
"loss": 0.4488,
"step": 1766
},
{
"epoch": 0.8544487427466151,
"grad_norm": 0.46260061860084534,
"learning_rate": 4.777346069364343e-06,
"loss": 0.4588,
"step": 1767
},
{
"epoch": 0.8549323017408124,
"grad_norm": 0.44670024514198303,
"learning_rate": 4.777082743641406e-06,
"loss": 0.4381,
"step": 1768
},
{
"epoch": 0.8554158607350096,
"grad_norm": 0.44087886810302734,
"learning_rate": 4.776819269562992e-06,
"loss": 0.4298,
"step": 1769
},
{
"epoch": 0.8558994197292069,
"grad_norm": 0.42692020535469055,
"learning_rate": 4.776555647146269e-06,
"loss": 0.4662,
"step": 1770
},
{
"epoch": 0.8563829787234043,
"grad_norm": 0.4216541349887848,
"learning_rate": 4.776291876408412e-06,
"loss": 0.4815,
"step": 1771
},
{
"epoch": 0.8568665377176016,
"grad_norm": 0.4323381781578064,
"learning_rate": 4.776027957366605e-06,
"loss": 0.4588,
"step": 1772
},
{
"epoch": 0.8573500967117988,
"grad_norm": 0.5027583241462708,
"learning_rate": 4.775763890038045e-06,
"loss": 0.4692,
"step": 1773
},
{
"epoch": 0.8578336557059961,
"grad_norm": 0.42592036724090576,
"learning_rate": 4.775499674439934e-06,
"loss": 0.4538,
"step": 1774
},
{
"epoch": 0.8583172147001934,
"grad_norm": 0.40400874614715576,
"learning_rate": 4.775235310589487e-06,
"loss": 0.4186,
"step": 1775
},
{
"epoch": 0.8588007736943907,
"grad_norm": 0.5430548191070557,
"learning_rate": 4.774970798503926e-06,
"loss": 0.453,
"step": 1776
},
{
"epoch": 0.8592843326885881,
"grad_norm": 0.43877115845680237,
"learning_rate": 4.774706138200488e-06,
"loss": 0.4208,
"step": 1777
},
{
"epoch": 0.8597678916827853,
"grad_norm": 0.47961005568504333,
"learning_rate": 4.774441329696413e-06,
"loss": 0.4471,
"step": 1778
},
{
"epoch": 0.8602514506769826,
"grad_norm": 0.4484774172306061,
"learning_rate": 4.774176373008955e-06,
"loss": 0.4639,
"step": 1779
},
{
"epoch": 0.8607350096711799,
"grad_norm": 0.4233231544494629,
"learning_rate": 4.7739112681553754e-06,
"loss": 0.4733,
"step": 1780
},
{
"epoch": 0.8612185686653772,
"grad_norm": 0.4225790500640869,
"learning_rate": 4.773646015152947e-06,
"loss": 0.475,
"step": 1781
},
{
"epoch": 0.8617021276595744,
"grad_norm": 0.45184823870658875,
"learning_rate": 4.773380614018952e-06,
"loss": 0.4395,
"step": 1782
},
{
"epoch": 0.8621856866537717,
"grad_norm": 0.4106232523918152,
"learning_rate": 4.773115064770681e-06,
"loss": 0.4586,
"step": 1783
},
{
"epoch": 0.8626692456479691,
"grad_norm": 0.4108845293521881,
"learning_rate": 4.772849367425434e-06,
"loss": 0.4443,
"step": 1784
},
{
"epoch": 0.8631528046421664,
"grad_norm": 0.43157774209976196,
"learning_rate": 4.7725835220005235e-06,
"loss": 0.4535,
"step": 1785
},
{
"epoch": 0.8636363636363636,
"grad_norm": 0.4426245391368866,
"learning_rate": 4.772317528513268e-06,
"loss": 0.4661,
"step": 1786
},
{
"epoch": 0.8641199226305609,
"grad_norm": 0.4643140137195587,
"learning_rate": 4.772051386980998e-06,
"loss": 0.4537,
"step": 1787
},
{
"epoch": 0.8646034816247582,
"grad_norm": 0.4075002372264862,
"learning_rate": 4.7717850974210536e-06,
"loss": 0.4448,
"step": 1788
},
{
"epoch": 0.8650870406189555,
"grad_norm": 0.42226505279541016,
"learning_rate": 4.771518659850784e-06,
"loss": 0.4657,
"step": 1789
},
{
"epoch": 0.8655705996131529,
"grad_norm": 0.5288849472999573,
"learning_rate": 4.7712520742875465e-06,
"loss": 0.444,
"step": 1790
},
{
"epoch": 0.8660541586073501,
"grad_norm": 0.5637319087982178,
"learning_rate": 4.7709853407487105e-06,
"loss": 0.4764,
"step": 1791
},
{
"epoch": 0.8665377176015474,
"grad_norm": 0.4180159866809845,
"learning_rate": 4.770718459251655e-06,
"loss": 0.4605,
"step": 1792
},
{
"epoch": 0.8670212765957447,
"grad_norm": 0.543476402759552,
"learning_rate": 4.770451429813767e-06,
"loss": 0.4558,
"step": 1793
},
{
"epoch": 0.867504835589942,
"grad_norm": 0.42809557914733887,
"learning_rate": 4.770184252452443e-06,
"loss": 0.4566,
"step": 1794
},
{
"epoch": 0.8679883945841392,
"grad_norm": 0.4013577103614807,
"learning_rate": 4.769916927185092e-06,
"loss": 0.4363,
"step": 1795
},
{
"epoch": 0.8684719535783365,
"grad_norm": 0.40591180324554443,
"learning_rate": 4.7696494540291295e-06,
"loss": 0.4361,
"step": 1796
},
{
"epoch": 0.8689555125725339,
"grad_norm": 0.4472590386867523,
"learning_rate": 4.769381833001981e-06,
"loss": 0.4439,
"step": 1797
},
{
"epoch": 0.8694390715667312,
"grad_norm": 0.435757040977478,
"learning_rate": 4.769114064121083e-06,
"loss": 0.4494,
"step": 1798
},
{
"epoch": 0.8699226305609284,
"grad_norm": 0.45399346947669983,
"learning_rate": 4.768846147403883e-06,
"loss": 0.4543,
"step": 1799
},
{
"epoch": 0.8704061895551257,
"grad_norm": 0.46117937564849854,
"learning_rate": 4.768578082867833e-06,
"loss": 0.4442,
"step": 1800
},
{
"epoch": 0.870889748549323,
"grad_norm": 0.42407071590423584,
"learning_rate": 4.7683098705303995e-06,
"loss": 0.4448,
"step": 1801
},
{
"epoch": 0.8713733075435203,
"grad_norm": 0.44393467903137207,
"learning_rate": 4.7680415104090576e-06,
"loss": 0.4615,
"step": 1802
},
{
"epoch": 0.8718568665377177,
"grad_norm": 0.424594908952713,
"learning_rate": 4.767773002521289e-06,
"loss": 0.4566,
"step": 1803
},
{
"epoch": 0.8723404255319149,
"grad_norm": 0.4583720862865448,
"learning_rate": 4.76750434688459e-06,
"loss": 0.4618,
"step": 1804
},
{
"epoch": 0.8728239845261122,
"grad_norm": 0.3963436484336853,
"learning_rate": 4.767235543516463e-06,
"loss": 0.4199,
"step": 1805
},
{
"epoch": 0.8733075435203095,
"grad_norm": 0.41985222697257996,
"learning_rate": 4.7669665924344205e-06,
"loss": 0.4495,
"step": 1806
},
{
"epoch": 0.8737911025145068,
"grad_norm": 0.47478315234184265,
"learning_rate": 4.766697493655985e-06,
"loss": 0.4612,
"step": 1807
},
{
"epoch": 0.874274661508704,
"grad_norm": 0.42178353667259216,
"learning_rate": 4.76642824719869e-06,
"loss": 0.4431,
"step": 1808
},
{
"epoch": 0.8747582205029013,
"grad_norm": 0.42118799686431885,
"learning_rate": 4.766158853080076e-06,
"loss": 0.465,
"step": 1809
},
{
"epoch": 0.8752417794970987,
"grad_norm": 0.42525774240493774,
"learning_rate": 4.765889311317695e-06,
"loss": 0.4516,
"step": 1810
},
{
"epoch": 0.875725338491296,
"grad_norm": 0.4339666962623596,
"learning_rate": 4.765619621929108e-06,
"loss": 0.4615,
"step": 1811
},
{
"epoch": 0.8762088974854932,
"grad_norm": 0.43213528394699097,
"learning_rate": 4.765349784931885e-06,
"loss": 0.4276,
"step": 1812
},
{
"epoch": 0.8766924564796905,
"grad_norm": 0.406515896320343,
"learning_rate": 4.765079800343608e-06,
"loss": 0.4411,
"step": 1813
},
{
"epoch": 0.8771760154738878,
"grad_norm": 0.47391143441200256,
"learning_rate": 4.764809668181866e-06,
"loss": 0.4577,
"step": 1814
},
{
"epoch": 0.8776595744680851,
"grad_norm": 0.4145541489124298,
"learning_rate": 4.764539388464257e-06,
"loss": 0.4672,
"step": 1815
},
{
"epoch": 0.8781431334622823,
"grad_norm": 0.4604397416114807,
"learning_rate": 4.764268961208393e-06,
"loss": 0.4697,
"step": 1816
},
{
"epoch": 0.8786266924564797,
"grad_norm": 0.4669099748134613,
"learning_rate": 4.76399838643189e-06,
"loss": 0.4518,
"step": 1817
},
{
"epoch": 0.879110251450677,
"grad_norm": 0.4663965702056885,
"learning_rate": 4.763727664152378e-06,
"loss": 0.4523,
"step": 1818
},
{
"epoch": 0.8795938104448743,
"grad_norm": 0.44860151410102844,
"learning_rate": 4.763456794387495e-06,
"loss": 0.459,
"step": 1819
},
{
"epoch": 0.8800773694390716,
"grad_norm": 0.47004613280296326,
"learning_rate": 4.7631857771548875e-06,
"loss": 0.4593,
"step": 1820
},
{
"epoch": 0.8805609284332688,
"grad_norm": 0.4314551055431366,
"learning_rate": 4.762914612472214e-06,
"loss": 0.4233,
"step": 1821
},
{
"epoch": 0.8810444874274661,
"grad_norm": 0.416069895029068,
"learning_rate": 4.762643300357141e-06,
"loss": 0.4555,
"step": 1822
},
{
"epoch": 0.8815280464216635,
"grad_norm": 0.41660386323928833,
"learning_rate": 4.762371840827344e-06,
"loss": 0.4466,
"step": 1823
},
{
"epoch": 0.8820116054158608,
"grad_norm": 0.4352911412715912,
"learning_rate": 4.76210023390051e-06,
"loss": 0.4431,
"step": 1824
},
{
"epoch": 0.882495164410058,
"grad_norm": 0.4292300343513489,
"learning_rate": 4.761828479594334e-06,
"loss": 0.4661,
"step": 1825
},
{
"epoch": 0.8829787234042553,
"grad_norm": 0.4347653388977051,
"learning_rate": 4.761556577926522e-06,
"loss": 0.4352,
"step": 1826
},
{
"epoch": 0.8834622823984526,
"grad_norm": 0.4411576986312866,
"learning_rate": 4.761284528914787e-06,
"loss": 0.4621,
"step": 1827
},
{
"epoch": 0.8839458413926499,
"grad_norm": 0.43334683775901794,
"learning_rate": 4.761012332576856e-06,
"loss": 0.4608,
"step": 1828
},
{
"epoch": 0.8844294003868471,
"grad_norm": 0.450206995010376,
"learning_rate": 4.76073998893046e-06,
"loss": 0.46,
"step": 1829
},
{
"epoch": 0.8849129593810445,
"grad_norm": 0.42695486545562744,
"learning_rate": 4.760467497993347e-06,
"loss": 0.4461,
"step": 1830
},
{
"epoch": 0.8853965183752418,
"grad_norm": 0.4570879638195038,
"learning_rate": 4.760194859783266e-06,
"loss": 0.4412,
"step": 1831
},
{
"epoch": 0.8858800773694391,
"grad_norm": 0.44017043709754944,
"learning_rate": 4.759922074317981e-06,
"loss": 0.4665,
"step": 1832
},
{
"epoch": 0.8863636363636364,
"grad_norm": 0.42846250534057617,
"learning_rate": 4.759649141615265e-06,
"loss": 0.4623,
"step": 1833
},
{
"epoch": 0.8868471953578336,
"grad_norm": 0.4540502727031708,
"learning_rate": 4.759376061692899e-06,
"loss": 0.4458,
"step": 1834
},
{
"epoch": 0.8873307543520309,
"grad_norm": 0.42256003618240356,
"learning_rate": 4.7591028345686765e-06,
"loss": 0.4248,
"step": 1835
},
{
"epoch": 0.8878143133462283,
"grad_norm": 0.4467964470386505,
"learning_rate": 4.758829460260397e-06,
"loss": 0.4554,
"step": 1836
},
{
"epoch": 0.8882978723404256,
"grad_norm": 0.5088194608688354,
"learning_rate": 4.758555938785872e-06,
"loss": 0.4552,
"step": 1837
},
{
"epoch": 0.8887814313346228,
"grad_norm": 0.44091877341270447,
"learning_rate": 4.758282270162921e-06,
"loss": 0.4683,
"step": 1838
},
{
"epoch": 0.8892649903288201,
"grad_norm": 0.4545842111110687,
"learning_rate": 4.758008454409374e-06,
"loss": 0.4618,
"step": 1839
},
{
"epoch": 0.8897485493230174,
"grad_norm": 0.4302009046077728,
"learning_rate": 4.757734491543072e-06,
"loss": 0.4503,
"step": 1840
},
{
"epoch": 0.8902321083172147,
"grad_norm": 0.41503089666366577,
"learning_rate": 4.7574603815818624e-06,
"loss": 0.4373,
"step": 1841
},
{
"epoch": 0.8907156673114119,
"grad_norm": 0.5335795283317566,
"learning_rate": 4.7571861245436054e-06,
"loss": 0.4632,
"step": 1842
},
{
"epoch": 0.8911992263056093,
"grad_norm": 0.557690441608429,
"learning_rate": 4.756911720446168e-06,
"loss": 0.4666,
"step": 1843
},
{
"epoch": 0.8916827852998066,
"grad_norm": 0.40970084071159363,
"learning_rate": 4.756637169307429e-06,
"loss": 0.4389,
"step": 1844
},
{
"epoch": 0.8921663442940039,
"grad_norm": 0.4315798282623291,
"learning_rate": 4.756362471145275e-06,
"loss": 0.4389,
"step": 1845
},
{
"epoch": 0.8926499032882012,
"grad_norm": 0.4490053057670593,
"learning_rate": 4.756087625977603e-06,
"loss": 0.4486,
"step": 1846
},
{
"epoch": 0.8931334622823984,
"grad_norm": 0.41619452834129333,
"learning_rate": 4.755812633822321e-06,
"loss": 0.475,
"step": 1847
},
{
"epoch": 0.8936170212765957,
"grad_norm": 0.4929364323616028,
"learning_rate": 4.755537494697343e-06,
"loss": 0.4422,
"step": 1848
},
{
"epoch": 0.8941005802707931,
"grad_norm": 0.4437929093837738,
"learning_rate": 4.755262208620597e-06,
"loss": 0.4742,
"step": 1849
},
{
"epoch": 0.8945841392649904,
"grad_norm": 0.48250436782836914,
"learning_rate": 4.7549867756100155e-06,
"loss": 0.4388,
"step": 1850
},
{
"epoch": 0.8950676982591876,
"grad_norm": 0.41630637645721436,
"learning_rate": 4.754711195683547e-06,
"loss": 0.4263,
"step": 1851
},
{
"epoch": 0.8955512572533849,
"grad_norm": 0.4048013389110565,
"learning_rate": 4.754435468859143e-06,
"loss": 0.4483,
"step": 1852
},
{
"epoch": 0.8960348162475822,
"grad_norm": 0.4773472249507904,
"learning_rate": 4.754159595154768e-06,
"loss": 0.445,
"step": 1853
},
{
"epoch": 0.8965183752417795,
"grad_norm": 0.43505337834358215,
"learning_rate": 4.753883574588397e-06,
"loss": 0.4539,
"step": 1854
},
{
"epoch": 0.8970019342359767,
"grad_norm": 0.42329397797584534,
"learning_rate": 4.753607407178012e-06,
"loss": 0.4382,
"step": 1855
},
{
"epoch": 0.8974854932301741,
"grad_norm": 0.4487643241882324,
"learning_rate": 4.753331092941606e-06,
"loss": 0.4593,
"step": 1856
},
{
"epoch": 0.8979690522243714,
"grad_norm": 0.43058884143829346,
"learning_rate": 4.753054631897183e-06,
"loss": 0.4562,
"step": 1857
},
{
"epoch": 0.8984526112185687,
"grad_norm": 0.440266877412796,
"learning_rate": 4.752778024062752e-06,
"loss": 0.4274,
"step": 1858
},
{
"epoch": 0.898936170212766,
"grad_norm": 0.44400760531425476,
"learning_rate": 4.752501269456336e-06,
"loss": 0.4624,
"step": 1859
},
{
"epoch": 0.8994197292069632,
"grad_norm": 0.4999360740184784,
"learning_rate": 4.752224368095965e-06,
"loss": 0.4312,
"step": 1860
},
{
"epoch": 0.8999032882011605,
"grad_norm": 0.4733029901981354,
"learning_rate": 4.7519473199996806e-06,
"loss": 0.4529,
"step": 1861
},
{
"epoch": 0.9003868471953579,
"grad_norm": 0.41087576746940613,
"learning_rate": 4.751670125185532e-06,
"loss": 0.4427,
"step": 1862
},
{
"epoch": 0.9008704061895552,
"grad_norm": 0.42453569173812866,
"learning_rate": 4.75139278367158e-06,
"loss": 0.4547,
"step": 1863
},
{
"epoch": 0.9013539651837524,
"grad_norm": 0.4299073815345764,
"learning_rate": 4.751115295475893e-06,
"loss": 0.4491,
"step": 1864
},
{
"epoch": 0.9018375241779497,
"grad_norm": 0.4565071165561676,
"learning_rate": 4.75083766061655e-06,
"loss": 0.4319,
"step": 1865
},
{
"epoch": 0.902321083172147,
"grad_norm": 0.4117959141731262,
"learning_rate": 4.7505598791116395e-06,
"loss": 0.4272,
"step": 1866
},
{
"epoch": 0.9028046421663443,
"grad_norm": 0.435011625289917,
"learning_rate": 4.75028195097926e-06,
"loss": 0.4468,
"step": 1867
},
{
"epoch": 0.9032882011605415,
"grad_norm": 0.45105016231536865,
"learning_rate": 4.750003876237517e-06,
"loss": 0.4576,
"step": 1868
},
{
"epoch": 0.9037717601547389,
"grad_norm": 0.425686776638031,
"learning_rate": 4.749725654904529e-06,
"loss": 0.4793,
"step": 1869
},
{
"epoch": 0.9042553191489362,
"grad_norm": 0.5148751735687256,
"learning_rate": 4.749447286998422e-06,
"loss": 0.4732,
"step": 1870
},
{
"epoch": 0.9047388781431335,
"grad_norm": 0.41432178020477295,
"learning_rate": 4.749168772537333e-06,
"loss": 0.44,
"step": 1871
},
{
"epoch": 0.9052224371373307,
"grad_norm": 0.4313696324825287,
"learning_rate": 4.748890111539407e-06,
"loss": 0.4627,
"step": 1872
},
{
"epoch": 0.905705996131528,
"grad_norm": 0.43462052941322327,
"learning_rate": 4.748611304022799e-06,
"loss": 0.4246,
"step": 1873
},
{
"epoch": 0.9061895551257253,
"grad_norm": 0.4207950234413147,
"learning_rate": 4.748332350005674e-06,
"loss": 0.4393,
"step": 1874
},
{
"epoch": 0.9066731141199227,
"grad_norm": 0.42336976528167725,
"learning_rate": 4.748053249506206e-06,
"loss": 0.4682,
"step": 1875
},
{
"epoch": 0.90715667311412,
"grad_norm": 0.4382014572620392,
"learning_rate": 4.74777400254258e-06,
"loss": 0.4258,
"step": 1876
},
{
"epoch": 0.9076402321083172,
"grad_norm": 0.48035499453544617,
"learning_rate": 4.747494609132987e-06,
"loss": 0.4445,
"step": 1877
},
{
"epoch": 0.9081237911025145,
"grad_norm": 0.4145694077014923,
"learning_rate": 4.747215069295632e-06,
"loss": 0.4587,
"step": 1878
},
{
"epoch": 0.9086073500967118,
"grad_norm": 0.4445246756076813,
"learning_rate": 4.746935383048728e-06,
"loss": 0.4547,
"step": 1879
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.40601107478141785,
"learning_rate": 4.746655550410494e-06,
"loss": 0.4509,
"step": 1880
},
{
"epoch": 0.9095744680851063,
"grad_norm": 0.45228636264801025,
"learning_rate": 4.746375571399164e-06,
"loss": 0.4469,
"step": 1881
},
{
"epoch": 0.9100580270793037,
"grad_norm": 0.4875979721546173,
"learning_rate": 4.7460954460329775e-06,
"loss": 0.4196,
"step": 1882
},
{
"epoch": 0.910541586073501,
"grad_norm": 0.46213170886039734,
"learning_rate": 4.7458151743301876e-06,
"loss": 0.4709,
"step": 1883
},
{
"epoch": 0.9110251450676983,
"grad_norm": 0.47638139128685,
"learning_rate": 4.745534756309052e-06,
"loss": 0.4437,
"step": 1884
},
{
"epoch": 0.9115087040618955,
"grad_norm": 0.43488895893096924,
"learning_rate": 4.745254191987842e-06,
"loss": 0.4387,
"step": 1885
},
{
"epoch": 0.9119922630560928,
"grad_norm": 0.43625906109809875,
"learning_rate": 4.744973481384834e-06,
"loss": 0.4461,
"step": 1886
},
{
"epoch": 0.9124758220502901,
"grad_norm": 0.4320892095565796,
"learning_rate": 4.7446926245183215e-06,
"loss": 0.4588,
"step": 1887
},
{
"epoch": 0.9129593810444874,
"grad_norm": 0.4240933358669281,
"learning_rate": 4.744411621406598e-06,
"loss": 0.449,
"step": 1888
},
{
"epoch": 0.9134429400386848,
"grad_norm": 0.4831683337688446,
"learning_rate": 4.744130472067974e-06,
"loss": 0.4752,
"step": 1889
},
{
"epoch": 0.913926499032882,
"grad_norm": 0.4373767375946045,
"learning_rate": 4.743849176520766e-06,
"loss": 0.4529,
"step": 1890
},
{
"epoch": 0.9144100580270793,
"grad_norm": 0.42155158519744873,
"learning_rate": 4.743567734783301e-06,
"loss": 0.4264,
"step": 1891
},
{
"epoch": 0.9148936170212766,
"grad_norm": 0.42250940203666687,
"learning_rate": 4.7432861468739156e-06,
"loss": 0.4627,
"step": 1892
},
{
"epoch": 0.9153771760154739,
"grad_norm": 0.8477792739868164,
"learning_rate": 4.743004412810956e-06,
"loss": 0.4652,
"step": 1893
},
{
"epoch": 0.9158607350096711,
"grad_norm": 0.42357683181762695,
"learning_rate": 4.742722532612775e-06,
"loss": 0.4729,
"step": 1894
},
{
"epoch": 0.9163442940038685,
"grad_norm": 0.4757964611053467,
"learning_rate": 4.7424405062977404e-06,
"loss": 0.4491,
"step": 1895
},
{
"epoch": 0.9168278529980658,
"grad_norm": 0.4081232249736786,
"learning_rate": 4.742158333884227e-06,
"loss": 0.4434,
"step": 1896
},
{
"epoch": 0.9173114119922631,
"grad_norm": 0.4559086263179779,
"learning_rate": 4.741876015390616e-06,
"loss": 0.4418,
"step": 1897
},
{
"epoch": 0.9177949709864603,
"grad_norm": 0.405843049287796,
"learning_rate": 4.741593550835303e-06,
"loss": 0.4567,
"step": 1898
},
{
"epoch": 0.9182785299806576,
"grad_norm": 0.41238856315612793,
"learning_rate": 4.741310940236691e-06,
"loss": 0.4312,
"step": 1899
},
{
"epoch": 0.9187620889748549,
"grad_norm": 0.4161728620529175,
"learning_rate": 4.741028183613192e-06,
"loss": 0.4549,
"step": 1900
},
{
"epoch": 0.9192456479690522,
"grad_norm": 0.4297192394733429,
"learning_rate": 4.7407452809832275e-06,
"loss": 0.4578,
"step": 1901
},
{
"epoch": 0.9197292069632496,
"grad_norm": 0.5853244066238403,
"learning_rate": 4.7404622323652296e-06,
"loss": 0.4455,
"step": 1902
},
{
"epoch": 0.9202127659574468,
"grad_norm": 0.4154956638813019,
"learning_rate": 4.740179037777639e-06,
"loss": 0.4658,
"step": 1903
},
{
"epoch": 0.9206963249516441,
"grad_norm": 0.41744962334632874,
"learning_rate": 4.7398956972389074e-06,
"loss": 0.4431,
"step": 1904
},
{
"epoch": 0.9211798839458414,
"grad_norm": 0.44503679871559143,
"learning_rate": 4.7396122107674935e-06,
"loss": 0.4396,
"step": 1905
},
{
"epoch": 0.9216634429400387,
"grad_norm": 0.4330887794494629,
"learning_rate": 4.739328578381868e-06,
"loss": 0.4341,
"step": 1906
},
{
"epoch": 0.9221470019342359,
"grad_norm": 0.4313158690929413,
"learning_rate": 4.739044800100509e-06,
"loss": 0.4565,
"step": 1907
},
{
"epoch": 0.9226305609284333,
"grad_norm": 0.43022119998931885,
"learning_rate": 4.738760875941905e-06,
"loss": 0.4479,
"step": 1908
},
{
"epoch": 0.9231141199226306,
"grad_norm": 0.4282204806804657,
"learning_rate": 4.738476805924555e-06,
"loss": 0.468,
"step": 1909
},
{
"epoch": 0.9235976789168279,
"grad_norm": 0.5046052932739258,
"learning_rate": 4.738192590066967e-06,
"loss": 0.4633,
"step": 1910
},
{
"epoch": 0.9240812379110251,
"grad_norm": 0.4380984902381897,
"learning_rate": 4.737908228387656e-06,
"loss": 0.451,
"step": 1911
},
{
"epoch": 0.9245647969052224,
"grad_norm": 0.41640403866767883,
"learning_rate": 4.737623720905151e-06,
"loss": 0.4307,
"step": 1912
},
{
"epoch": 0.9250483558994197,
"grad_norm": 0.4518442153930664,
"learning_rate": 4.737339067637987e-06,
"loss": 0.4715,
"step": 1913
},
{
"epoch": 0.925531914893617,
"grad_norm": 0.4299834966659546,
"learning_rate": 4.737054268604709e-06,
"loss": 0.4693,
"step": 1914
},
{
"epoch": 0.9260154738878144,
"grad_norm": 0.41833609342575073,
"learning_rate": 4.736769323823873e-06,
"loss": 0.4533,
"step": 1915
},
{
"epoch": 0.9264990328820116,
"grad_norm": 0.4507361054420471,
"learning_rate": 4.7364842333140436e-06,
"loss": 0.4674,
"step": 1916
},
{
"epoch": 0.9269825918762089,
"grad_norm": 0.4408568739891052,
"learning_rate": 4.736198997093795e-06,
"loss": 0.4366,
"step": 1917
},
{
"epoch": 0.9274661508704062,
"grad_norm": 0.457612007856369,
"learning_rate": 4.7359136151817095e-06,
"loss": 0.4611,
"step": 1918
},
{
"epoch": 0.9279497098646035,
"grad_norm": 0.45070797204971313,
"learning_rate": 4.7356280875963814e-06,
"loss": 0.4364,
"step": 1919
},
{
"epoch": 0.9284332688588007,
"grad_norm": 0.7539900541305542,
"learning_rate": 4.735342414356413e-06,
"loss": 0.4404,
"step": 1920
},
{
"epoch": 0.9289168278529981,
"grad_norm": 0.43759679794311523,
"learning_rate": 4.735056595480417e-06,
"loss": 0.4603,
"step": 1921
},
{
"epoch": 0.9294003868471954,
"grad_norm": 0.44140011072158813,
"learning_rate": 4.734770630987013e-06,
"loss": 0.4505,
"step": 1922
},
{
"epoch": 0.9298839458413927,
"grad_norm": 0.43896445631980896,
"learning_rate": 4.734484520894834e-06,
"loss": 0.4397,
"step": 1923
},
{
"epoch": 0.9303675048355899,
"grad_norm": 0.4464511573314667,
"learning_rate": 4.73419826522252e-06,
"loss": 0.4594,
"step": 1924
},
{
"epoch": 0.9308510638297872,
"grad_norm": 0.5006834864616394,
"learning_rate": 4.7339118639887204e-06,
"loss": 0.4569,
"step": 1925
},
{
"epoch": 0.9313346228239845,
"grad_norm": 0.4364486038684845,
"learning_rate": 4.733625317212095e-06,
"loss": 0.4611,
"step": 1926
},
{
"epoch": 0.9318181818181818,
"grad_norm": 0.4558824598789215,
"learning_rate": 4.733338624911313e-06,
"loss": 0.4313,
"step": 1927
},
{
"epoch": 0.9323017408123792,
"grad_norm": 0.4253576099872589,
"learning_rate": 4.733051787105053e-06,
"loss": 0.4533,
"step": 1928
},
{
"epoch": 0.9327852998065764,
"grad_norm": 0.44522860646247864,
"learning_rate": 4.732764803812002e-06,
"loss": 0.4526,
"step": 1929
},
{
"epoch": 0.9332688588007737,
"grad_norm": 0.4420976936817169,
"learning_rate": 4.73247767505086e-06,
"loss": 0.452,
"step": 1930
},
{
"epoch": 0.933752417794971,
"grad_norm": 0.493862122297287,
"learning_rate": 4.73219040084033e-06,
"loss": 0.4504,
"step": 1931
},
{
"epoch": 0.9342359767891683,
"grad_norm": 0.4543628990650177,
"learning_rate": 4.73190298119913e-06,
"loss": 0.4687,
"step": 1932
},
{
"epoch": 0.9347195357833655,
"grad_norm": 0.46449723839759827,
"learning_rate": 4.731615416145987e-06,
"loss": 0.454,
"step": 1933
},
{
"epoch": 0.9352030947775629,
"grad_norm": 0.4249908924102783,
"learning_rate": 4.731327705699636e-06,
"loss": 0.4437,
"step": 1934
},
{
"epoch": 0.9356866537717602,
"grad_norm": 0.43488219380378723,
"learning_rate": 4.73103984987882e-06,
"loss": 0.4578,
"step": 1935
},
{
"epoch": 0.9361702127659575,
"grad_norm": 0.4787844121456146,
"learning_rate": 4.730751848702294e-06,
"loss": 0.4676,
"step": 1936
},
{
"epoch": 0.9366537717601547,
"grad_norm": 0.4363221824169159,
"learning_rate": 4.730463702188824e-06,
"loss": 0.4416,
"step": 1937
},
{
"epoch": 0.937137330754352,
"grad_norm": 0.42467185854911804,
"learning_rate": 4.73017541035718e-06,
"loss": 0.456,
"step": 1938
},
{
"epoch": 0.9376208897485493,
"grad_norm": 0.41111892461776733,
"learning_rate": 4.729886973226146e-06,
"loss": 0.4532,
"step": 1939
},
{
"epoch": 0.9381044487427466,
"grad_norm": 0.44362354278564453,
"learning_rate": 4.729598390814515e-06,
"loss": 0.4422,
"step": 1940
},
{
"epoch": 0.938588007736944,
"grad_norm": 0.43182167410850525,
"learning_rate": 4.7293096631410875e-06,
"loss": 0.4481,
"step": 1941
},
{
"epoch": 0.9390715667311412,
"grad_norm": 0.4451581835746765,
"learning_rate": 4.729020790224675e-06,
"loss": 0.4526,
"step": 1942
},
{
"epoch": 0.9395551257253385,
"grad_norm": 0.4309098422527313,
"learning_rate": 4.7287317720840974e-06,
"loss": 0.4394,
"step": 1943
},
{
"epoch": 0.9400386847195358,
"grad_norm": 0.4834847152233124,
"learning_rate": 4.728442608738185e-06,
"loss": 0.4521,
"step": 1944
},
{
"epoch": 0.940522243713733,
"grad_norm": 0.4676097631454468,
"learning_rate": 4.728153300205778e-06,
"loss": 0.4322,
"step": 1945
},
{
"epoch": 0.9410058027079303,
"grad_norm": 0.46261999011039734,
"learning_rate": 4.727863846505725e-06,
"loss": 0.4534,
"step": 1946
},
{
"epoch": 0.9414893617021277,
"grad_norm": 0.5883946418762207,
"learning_rate": 4.727574247656883e-06,
"loss": 0.4541,
"step": 1947
},
{
"epoch": 0.941972920696325,
"grad_norm": 0.41333869099617004,
"learning_rate": 4.727284503678121e-06,
"loss": 0.453,
"step": 1948
},
{
"epoch": 0.9424564796905223,
"grad_norm": 0.45521169900894165,
"learning_rate": 4.726994614588316e-06,
"loss": 0.4537,
"step": 1949
},
{
"epoch": 0.9429400386847195,
"grad_norm": 0.40193212032318115,
"learning_rate": 4.726704580406355e-06,
"loss": 0.4308,
"step": 1950
},
{
"epoch": 0.9434235976789168,
"grad_norm": 0.44298529624938965,
"learning_rate": 4.726414401151135e-06,
"loss": 0.466,
"step": 1951
},
{
"epoch": 0.9439071566731141,
"grad_norm": 0.436103880405426,
"learning_rate": 4.7261240768415595e-06,
"loss": 0.4465,
"step": 1952
},
{
"epoch": 0.9443907156673114,
"grad_norm": 0.414192795753479,
"learning_rate": 4.725833607496545e-06,
"loss": 0.4574,
"step": 1953
},
{
"epoch": 0.9448742746615088,
"grad_norm": 0.41243085265159607,
"learning_rate": 4.725542993135015e-06,
"loss": 0.4458,
"step": 1954
},
{
"epoch": 0.945357833655706,
"grad_norm": 0.4826000928878784,
"learning_rate": 4.725252233775905e-06,
"loss": 0.4551,
"step": 1955
},
{
"epoch": 0.9458413926499033,
"grad_norm": 0.42552217841148376,
"learning_rate": 4.724961329438158e-06,
"loss": 0.4476,
"step": 1956
},
{
"epoch": 0.9463249516441006,
"grad_norm": 0.4663180410861969,
"learning_rate": 4.724670280140726e-06,
"loss": 0.4567,
"step": 1957
},
{
"epoch": 0.9468085106382979,
"grad_norm": 0.6530309319496155,
"learning_rate": 4.7243790859025715e-06,
"loss": 0.4212,
"step": 1958
},
{
"epoch": 0.9472920696324951,
"grad_norm": 0.4663379192352295,
"learning_rate": 4.724087746742667e-06,
"loss": 0.4397,
"step": 1959
},
{
"epoch": 0.9477756286266924,
"grad_norm": 0.4135099947452545,
"learning_rate": 4.723796262679994e-06,
"loss": 0.4371,
"step": 1960
},
{
"epoch": 0.9482591876208898,
"grad_norm": 0.44753551483154297,
"learning_rate": 4.7235046337335415e-06,
"loss": 0.4543,
"step": 1961
},
{
"epoch": 0.9487427466150871,
"grad_norm": 0.42846837639808655,
"learning_rate": 4.7232128599223106e-06,
"loss": 0.4447,
"step": 1962
},
{
"epoch": 0.9492263056092843,
"grad_norm": 0.44100338220596313,
"learning_rate": 4.72292094126531e-06,
"loss": 0.463,
"step": 1963
},
{
"epoch": 0.9497098646034816,
"grad_norm": 0.4311354160308838,
"learning_rate": 4.722628877781561e-06,
"loss": 0.4407,
"step": 1964
},
{
"epoch": 0.9501934235976789,
"grad_norm": 0.44324174523353577,
"learning_rate": 4.722336669490089e-06,
"loss": 0.449,
"step": 1965
},
{
"epoch": 0.9506769825918762,
"grad_norm": 0.4119623601436615,
"learning_rate": 4.7220443164099335e-06,
"loss": 0.4584,
"step": 1966
},
{
"epoch": 0.9511605415860735,
"grad_norm": 0.41248446702957153,
"learning_rate": 4.721751818560142e-06,
"loss": 0.4311,
"step": 1967
},
{
"epoch": 0.9516441005802708,
"grad_norm": 0.40915167331695557,
"learning_rate": 4.721459175959769e-06,
"loss": 0.4563,
"step": 1968
},
{
"epoch": 0.9521276595744681,
"grad_norm": 0.41791391372680664,
"learning_rate": 4.721166388627884e-06,
"loss": 0.4518,
"step": 1969
},
{
"epoch": 0.9526112185686654,
"grad_norm": 0.4374818205833435,
"learning_rate": 4.72087345658356e-06,
"loss": 0.4581,
"step": 1970
},
{
"epoch": 0.9530947775628626,
"grad_norm": 0.38834431767463684,
"learning_rate": 4.720580379845884e-06,
"loss": 0.4391,
"step": 1971
},
{
"epoch": 0.9535783365570599,
"grad_norm": 0.4291421175003052,
"learning_rate": 4.720287158433947e-06,
"loss": 0.4566,
"step": 1972
},
{
"epoch": 0.9540618955512572,
"grad_norm": 0.45957908034324646,
"learning_rate": 4.719993792366857e-06,
"loss": 0.4325,
"step": 1973
},
{
"epoch": 0.9545454545454546,
"grad_norm": 0.4221996068954468,
"learning_rate": 4.7197002816637235e-06,
"loss": 0.4362,
"step": 1974
},
{
"epoch": 0.9550290135396519,
"grad_norm": 0.4279549717903137,
"learning_rate": 4.719406626343672e-06,
"loss": 0.4326,
"step": 1975
},
{
"epoch": 0.9555125725338491,
"grad_norm": 0.4264352023601532,
"learning_rate": 4.719112826425834e-06,
"loss": 0.455,
"step": 1976
},
{
"epoch": 0.9559961315280464,
"grad_norm": 0.43064695596694946,
"learning_rate": 4.71881888192935e-06,
"loss": 0.4504,
"step": 1977
},
{
"epoch": 0.9564796905222437,
"grad_norm": 0.4660034477710724,
"learning_rate": 4.718524792873371e-06,
"loss": 0.4453,
"step": 1978
},
{
"epoch": 0.956963249516441,
"grad_norm": 0.4238419830799103,
"learning_rate": 4.718230559277059e-06,
"loss": 0.4575,
"step": 1979
},
{
"epoch": 0.9574468085106383,
"grad_norm": 0.425527423620224,
"learning_rate": 4.717936181159581e-06,
"loss": 0.4301,
"step": 1980
},
{
"epoch": 0.9579303675048356,
"grad_norm": 0.39952564239501953,
"learning_rate": 4.7176416585401195e-06,
"loss": 0.4511,
"step": 1981
},
{
"epoch": 0.9584139264990329,
"grad_norm": 0.47706300020217896,
"learning_rate": 4.717346991437861e-06,
"loss": 0.4705,
"step": 1982
},
{
"epoch": 0.9588974854932302,
"grad_norm": 0.44544240832328796,
"learning_rate": 4.717052179872004e-06,
"loss": 0.4625,
"step": 1983
},
{
"epoch": 0.9593810444874274,
"grad_norm": 0.422228068113327,
"learning_rate": 4.716757223861755e-06,
"loss": 0.4447,
"step": 1984
},
{
"epoch": 0.9598646034816247,
"grad_norm": 1.7472119331359863,
"learning_rate": 4.7164621234263324e-06,
"loss": 0.4393,
"step": 1985
},
{
"epoch": 0.960348162475822,
"grad_norm": 0.45185375213623047,
"learning_rate": 4.716166878584962e-06,
"loss": 0.4627,
"step": 1986
},
{
"epoch": 0.9608317214700194,
"grad_norm": 0.9011144638061523,
"learning_rate": 4.715871489356879e-06,
"loss": 0.4511,
"step": 1987
},
{
"epoch": 0.9613152804642167,
"grad_norm": 0.4393264949321747,
"learning_rate": 4.715575955761328e-06,
"loss": 0.4374,
"step": 1988
},
{
"epoch": 0.9617988394584139,
"grad_norm": 0.4429907500743866,
"learning_rate": 4.715280277817565e-06,
"loss": 0.4458,
"step": 1989
},
{
"epoch": 0.9622823984526112,
"grad_norm": 0.5249834060668945,
"learning_rate": 4.714984455544853e-06,
"loss": 0.4126,
"step": 1990
},
{
"epoch": 0.9627659574468085,
"grad_norm": 1.696258544921875,
"learning_rate": 4.714688488962465e-06,
"loss": 0.4596,
"step": 1991
},
{
"epoch": 0.9632495164410058,
"grad_norm": 0.4275670647621155,
"learning_rate": 4.714392378089684e-06,
"loss": 0.4515,
"step": 1992
},
{
"epoch": 0.9637330754352031,
"grad_norm": 0.4399387538433075,
"learning_rate": 4.7140961229458025e-06,
"loss": 0.4523,
"step": 1993
},
{
"epoch": 0.9642166344294004,
"grad_norm": 0.4412606358528137,
"learning_rate": 4.713799723550121e-06,
"loss": 0.431,
"step": 1994
},
{
"epoch": 0.9647001934235977,
"grad_norm": 0.4216758906841278,
"learning_rate": 4.713503179921951e-06,
"loss": 0.4459,
"step": 1995
},
{
"epoch": 0.965183752417795,
"grad_norm": 0.432327002286911,
"learning_rate": 4.713206492080613e-06,
"loss": 0.4641,
"step": 1996
},
{
"epoch": 0.9656673114119922,
"grad_norm": 0.4335901141166687,
"learning_rate": 4.7129096600454375e-06,
"loss": 0.45,
"step": 1997
},
{
"epoch": 0.9661508704061895,
"grad_norm": 0.4222829341888428,
"learning_rate": 4.712612683835761e-06,
"loss": 0.4307,
"step": 1998
},
{
"epoch": 0.9666344294003868,
"grad_norm": 0.5455124378204346,
"learning_rate": 4.712315563470934e-06,
"loss": 0.4572,
"step": 1999
},
{
"epoch": 0.9671179883945842,
"grad_norm": 0.4141272008419037,
"learning_rate": 4.7120182989703136e-06,
"loss": 0.4536,
"step": 2000
}
],
"logging_steps": 1,
"max_steps": 12408,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.7155155942178816e+20,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}