{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 2115,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00047281323877068556,
"grad_norm": 5.163570880889893,
"learning_rate": 5.0000000000000004e-08,
"loss": 1.4628,
"step": 1
},
{
"epoch": 0.0009456264775413711,
"grad_norm": 6.298020839691162,
"learning_rate": 1.0000000000000001e-07,
"loss": 1.5003,
"step": 2
},
{
"epoch": 0.0014184397163120568,
"grad_norm": 5.853623390197754,
"learning_rate": 1.5000000000000002e-07,
"loss": 1.4495,
"step": 3
},
{
"epoch": 0.0018912529550827422,
"grad_norm": 5.456025123596191,
"learning_rate": 2.0000000000000002e-07,
"loss": 1.3798,
"step": 4
},
{
"epoch": 0.002364066193853428,
"grad_norm": 5.757407188415527,
"learning_rate": 2.5000000000000004e-07,
"loss": 1.4515,
"step": 5
},
{
"epoch": 0.0028368794326241137,
"grad_norm": 5.872277736663818,
"learning_rate": 3.0000000000000004e-07,
"loss": 1.4424,
"step": 6
},
{
"epoch": 0.003309692671394799,
"grad_norm": 6.7816009521484375,
"learning_rate": 3.5000000000000004e-07,
"loss": 1.4004,
"step": 7
},
{
"epoch": 0.0037825059101654845,
"grad_norm": 6.229667663574219,
"learning_rate": 4.0000000000000003e-07,
"loss": 1.4494,
"step": 8
},
{
"epoch": 0.00425531914893617,
"grad_norm": 5.336202621459961,
"learning_rate": 4.5000000000000003e-07,
"loss": 1.3916,
"step": 9
},
{
"epoch": 0.004728132387706856,
"grad_norm": 5.589445114135742,
"learning_rate": 5.000000000000001e-07,
"loss": 1.2318,
"step": 10
},
{
"epoch": 0.005200945626477541,
"grad_norm": 5.720539569854736,
"learning_rate": 5.5e-07,
"loss": 1.4367,
"step": 11
},
{
"epoch": 0.005673758865248227,
"grad_norm": 5.913913726806641,
"learning_rate": 6.000000000000001e-07,
"loss": 1.342,
"step": 12
},
{
"epoch": 0.006146572104018913,
"grad_norm": 5.899744987487793,
"learning_rate": 6.5e-07,
"loss": 1.4307,
"step": 13
},
{
"epoch": 0.006619385342789598,
"grad_norm": 5.571037292480469,
"learning_rate": 7.000000000000001e-07,
"loss": 1.3372,
"step": 14
},
{
"epoch": 0.0070921985815602835,
"grad_norm": 5.480010509490967,
"learning_rate": 7.5e-07,
"loss": 1.3923,
"step": 15
},
{
"epoch": 0.007565011820330969,
"grad_norm": 5.254702091217041,
"learning_rate": 8.000000000000001e-07,
"loss": 1.2928,
"step": 16
},
{
"epoch": 0.008037825059101654,
"grad_norm": 6.090312480926514,
"learning_rate": 8.500000000000001e-07,
"loss": 1.4984,
"step": 17
},
{
"epoch": 0.00851063829787234,
"grad_norm": 5.689319610595703,
"learning_rate": 9.000000000000001e-07,
"loss": 1.4108,
"step": 18
},
{
"epoch": 0.008983451536643027,
"grad_norm": 5.386685848236084,
"learning_rate": 9.500000000000001e-07,
"loss": 1.425,
"step": 19
},
{
"epoch": 0.009456264775413711,
"grad_norm": 6.451584815979004,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.5507,
"step": 20
},
{
"epoch": 0.009929078014184398,
"grad_norm": 5.37647008895874,
"learning_rate": 1.0500000000000001e-06,
"loss": 1.4109,
"step": 21
},
{
"epoch": 0.010401891252955082,
"grad_norm": 4.716553211212158,
"learning_rate": 1.1e-06,
"loss": 1.2028,
"step": 22
},
{
"epoch": 0.010874704491725768,
"grad_norm": 4.950989723205566,
"learning_rate": 1.1500000000000002e-06,
"loss": 1.3043,
"step": 23
},
{
"epoch": 0.011347517730496455,
"grad_norm": 4.688975811004639,
"learning_rate": 1.2000000000000002e-06,
"loss": 1.2708,
"step": 24
},
{
"epoch": 0.01182033096926714,
"grad_norm": 4.905868053436279,
"learning_rate": 1.25e-06,
"loss": 1.3268,
"step": 25
},
{
"epoch": 0.012293144208037825,
"grad_norm": 4.503395080566406,
"learning_rate": 1.3e-06,
"loss": 1.1799,
"step": 26
},
{
"epoch": 0.01276595744680851,
"grad_norm": 4.77382230758667,
"learning_rate": 1.3500000000000002e-06,
"loss": 1.3882,
"step": 27
},
{
"epoch": 0.013238770685579196,
"grad_norm": 4.734329700469971,
"learning_rate": 1.4000000000000001e-06,
"loss": 1.3476,
"step": 28
},
{
"epoch": 0.013711583924349883,
"grad_norm": 4.775066375732422,
"learning_rate": 1.45e-06,
"loss": 1.2429,
"step": 29
},
{
"epoch": 0.014184397163120567,
"grad_norm": 4.978334426879883,
"learning_rate": 1.5e-06,
"loss": 1.2119,
"step": 30
},
{
"epoch": 0.014657210401891253,
"grad_norm": 4.506785869598389,
"learning_rate": 1.5500000000000002e-06,
"loss": 1.3157,
"step": 31
},
{
"epoch": 0.015130023640661938,
"grad_norm": 4.007757186889648,
"learning_rate": 1.6000000000000001e-06,
"loss": 1.1451,
"step": 32
},
{
"epoch": 0.015602836879432624,
"grad_norm": 3.6621618270874023,
"learning_rate": 1.6500000000000003e-06,
"loss": 1.093,
"step": 33
},
{
"epoch": 0.01607565011820331,
"grad_norm": 3.8733766078948975,
"learning_rate": 1.7000000000000002e-06,
"loss": 1.2289,
"step": 34
},
{
"epoch": 0.016548463356973995,
"grad_norm": 4.3391900062561035,
"learning_rate": 1.75e-06,
"loss": 1.1453,
"step": 35
},
{
"epoch": 0.01702127659574468,
"grad_norm": 3.287623643875122,
"learning_rate": 1.8000000000000001e-06,
"loss": 1.0257,
"step": 36
},
{
"epoch": 0.017494089834515367,
"grad_norm": 3.591721773147583,
"learning_rate": 1.85e-06,
"loss": 0.9976,
"step": 37
},
{
"epoch": 0.017966903073286054,
"grad_norm": 4.028271675109863,
"learning_rate": 1.9000000000000002e-06,
"loss": 1.0773,
"step": 38
},
{
"epoch": 0.018439716312056736,
"grad_norm": 3.3543951511383057,
"learning_rate": 1.9500000000000004e-06,
"loss": 1.1677,
"step": 39
},
{
"epoch": 0.018912529550827423,
"grad_norm": 3.807624340057373,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.1232,
"step": 40
},
{
"epoch": 0.01938534278959811,
"grad_norm": 4.242797374725342,
"learning_rate": 2.05e-06,
"loss": 1.1819,
"step": 41
},
{
"epoch": 0.019858156028368795,
"grad_norm": 3.4574992656707764,
"learning_rate": 2.1000000000000002e-06,
"loss": 0.9878,
"step": 42
},
{
"epoch": 0.02033096926713948,
"grad_norm": 3.906695604324341,
"learning_rate": 2.15e-06,
"loss": 1.0592,
"step": 43
},
{
"epoch": 0.020803782505910164,
"grad_norm": 3.7543163299560547,
"learning_rate": 2.2e-06,
"loss": 1.0309,
"step": 44
},
{
"epoch": 0.02127659574468085,
"grad_norm": 3.3777148723602295,
"learning_rate": 2.25e-06,
"loss": 1.0664,
"step": 45
},
{
"epoch": 0.021749408983451537,
"grad_norm": 3.6003634929656982,
"learning_rate": 2.3000000000000004e-06,
"loss": 1.0482,
"step": 46
},
{
"epoch": 0.022222222222222223,
"grad_norm": 3.3961377143859863,
"learning_rate": 2.35e-06,
"loss": 1.0252,
"step": 47
},
{
"epoch": 0.02269503546099291,
"grad_norm": 3.1601035594940186,
"learning_rate": 2.4000000000000003e-06,
"loss": 1.0435,
"step": 48
},
{
"epoch": 0.023167848699763592,
"grad_norm": 3.4192967414855957,
"learning_rate": 2.4500000000000003e-06,
"loss": 1.0935,
"step": 49
},
{
"epoch": 0.02364066193853428,
"grad_norm": 3.1225922107696533,
"learning_rate": 2.5e-06,
"loss": 0.8988,
"step": 50
},
{
"epoch": 0.024113475177304965,
"grad_norm": 3.1423380374908447,
"learning_rate": 2.55e-06,
"loss": 1.0159,
"step": 51
},
{
"epoch": 0.02458628841607565,
"grad_norm": 3.4782402515411377,
"learning_rate": 2.6e-06,
"loss": 1.0231,
"step": 52
},
{
"epoch": 0.025059101654846337,
"grad_norm": 3.8362693786621094,
"learning_rate": 2.6500000000000005e-06,
"loss": 1.0725,
"step": 53
},
{
"epoch": 0.02553191489361702,
"grad_norm": 3.033294916152954,
"learning_rate": 2.7000000000000004e-06,
"loss": 0.9377,
"step": 54
},
{
"epoch": 0.026004728132387706,
"grad_norm": 3.849741220474243,
"learning_rate": 2.7500000000000004e-06,
"loss": 1.0046,
"step": 55
},
{
"epoch": 0.026477541371158392,
"grad_norm": 3.141876220703125,
"learning_rate": 2.8000000000000003e-06,
"loss": 0.9226,
"step": 56
},
{
"epoch": 0.02695035460992908,
"grad_norm": 2.773594856262207,
"learning_rate": 2.85e-06,
"loss": 0.8662,
"step": 57
},
{
"epoch": 0.027423167848699765,
"grad_norm": 3.1460225582122803,
"learning_rate": 2.9e-06,
"loss": 0.9304,
"step": 58
},
{
"epoch": 0.027895981087470448,
"grad_norm": 3.293583631515503,
"learning_rate": 2.95e-06,
"loss": 1.0374,
"step": 59
},
{
"epoch": 0.028368794326241134,
"grad_norm": 3.8190863132476807,
"learning_rate": 3e-06,
"loss": 0.971,
"step": 60
},
{
"epoch": 0.02884160756501182,
"grad_norm": 3.4566776752471924,
"learning_rate": 3.05e-06,
"loss": 0.9631,
"step": 61
},
{
"epoch": 0.029314420803782507,
"grad_norm": 3.355741500854492,
"learning_rate": 3.1000000000000004e-06,
"loss": 1.0097,
"step": 62
},
{
"epoch": 0.029787234042553193,
"grad_norm": 3.29746675491333,
"learning_rate": 3.1500000000000003e-06,
"loss": 0.9459,
"step": 63
},
{
"epoch": 0.030260047281323876,
"grad_norm": 3.3122968673706055,
"learning_rate": 3.2000000000000003e-06,
"loss": 0.8594,
"step": 64
},
{
"epoch": 0.030732860520094562,
"grad_norm": 3.477701187133789,
"learning_rate": 3.2500000000000002e-06,
"loss": 0.9197,
"step": 65
},
{
"epoch": 0.031205673758865248,
"grad_norm": 3.3363406658172607,
"learning_rate": 3.3000000000000006e-06,
"loss": 0.9478,
"step": 66
},
{
"epoch": 0.03167848699763593,
"grad_norm": 4.143295764923096,
"learning_rate": 3.3500000000000005e-06,
"loss": 1.0534,
"step": 67
},
{
"epoch": 0.03215130023640662,
"grad_norm": 3.2363274097442627,
"learning_rate": 3.4000000000000005e-06,
"loss": 0.9454,
"step": 68
},
{
"epoch": 0.032624113475177303,
"grad_norm": 3.198746681213379,
"learning_rate": 3.45e-06,
"loss": 0.9388,
"step": 69
},
{
"epoch": 0.03309692671394799,
"grad_norm": 3.5751023292541504,
"learning_rate": 3.5e-06,
"loss": 0.9444,
"step": 70
},
{
"epoch": 0.033569739952718676,
"grad_norm": 3.1745729446411133,
"learning_rate": 3.5500000000000003e-06,
"loss": 0.8683,
"step": 71
},
{
"epoch": 0.03404255319148936,
"grad_norm": 3.3210883140563965,
"learning_rate": 3.6000000000000003e-06,
"loss": 0.8811,
"step": 72
},
{
"epoch": 0.03451536643026005,
"grad_norm": 3.2502429485321045,
"learning_rate": 3.65e-06,
"loss": 1.0012,
"step": 73
},
{
"epoch": 0.034988179669030735,
"grad_norm": 3.44598126411438,
"learning_rate": 3.7e-06,
"loss": 0.9217,
"step": 74
},
{
"epoch": 0.03546099290780142,
"grad_norm": 3.439117431640625,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.8976,
"step": 75
},
{
"epoch": 0.03593380614657211,
"grad_norm": 3.523627758026123,
"learning_rate": 3.8000000000000005e-06,
"loss": 0.8996,
"step": 76
},
{
"epoch": 0.03640661938534279,
"grad_norm": 3.3716015815734863,
"learning_rate": 3.85e-06,
"loss": 0.9061,
"step": 77
},
{
"epoch": 0.03687943262411347,
"grad_norm": 3.33518385887146,
"learning_rate": 3.900000000000001e-06,
"loss": 0.9371,
"step": 78
},
{
"epoch": 0.03735224586288416,
"grad_norm": 3.833829879760742,
"learning_rate": 3.95e-06,
"loss": 0.9669,
"step": 79
},
{
"epoch": 0.037825059101654845,
"grad_norm": 3.260446786880493,
"learning_rate": 4.000000000000001e-06,
"loss": 0.9449,
"step": 80
},
{
"epoch": 0.03829787234042553,
"grad_norm": 3.532451629638672,
"learning_rate": 4.05e-06,
"loss": 0.897,
"step": 81
},
{
"epoch": 0.03877068557919622,
"grad_norm": 3.1156492233276367,
"learning_rate": 4.1e-06,
"loss": 0.8463,
"step": 82
},
{
"epoch": 0.039243498817966904,
"grad_norm": 2.8801751136779785,
"learning_rate": 4.15e-06,
"loss": 0.8616,
"step": 83
},
{
"epoch": 0.03971631205673759,
"grad_norm": 3.072476863861084,
"learning_rate": 4.2000000000000004e-06,
"loss": 0.8387,
"step": 84
},
{
"epoch": 0.04018912529550828,
"grad_norm": 2.9601376056671143,
"learning_rate": 4.25e-06,
"loss": 0.8538,
"step": 85
},
{
"epoch": 0.04066193853427896,
"grad_norm": 3.521664619445801,
"learning_rate": 4.3e-06,
"loss": 0.8894,
"step": 86
},
{
"epoch": 0.04113475177304964,
"grad_norm": 3.2670981884002686,
"learning_rate": 4.350000000000001e-06,
"loss": 0.8387,
"step": 87
},
{
"epoch": 0.04160756501182033,
"grad_norm": 3.422089099884033,
"learning_rate": 4.4e-06,
"loss": 0.7728,
"step": 88
},
{
"epoch": 0.042080378250591015,
"grad_norm": 3.414034128189087,
"learning_rate": 4.450000000000001e-06,
"loss": 0.7968,
"step": 89
},
{
"epoch": 0.0425531914893617,
"grad_norm": 4.234285354614258,
"learning_rate": 4.5e-06,
"loss": 0.8502,
"step": 90
},
{
"epoch": 0.04302600472813239,
"grad_norm": 3.1446919441223145,
"learning_rate": 4.5500000000000005e-06,
"loss": 0.8236,
"step": 91
},
{
"epoch": 0.043498817966903074,
"grad_norm": 3.683443307876587,
"learning_rate": 4.600000000000001e-06,
"loss": 0.9792,
"step": 92
},
{
"epoch": 0.04397163120567376,
"grad_norm": 3.664219617843628,
"learning_rate": 4.65e-06,
"loss": 0.8743,
"step": 93
},
{
"epoch": 0.044444444444444446,
"grad_norm": 3.369479179382324,
"learning_rate": 4.7e-06,
"loss": 0.8741,
"step": 94
},
{
"epoch": 0.04491725768321513,
"grad_norm": 3.694949150085449,
"learning_rate": 4.75e-06,
"loss": 0.7574,
"step": 95
},
{
"epoch": 0.04539007092198582,
"grad_norm": 3.5144498348236084,
"learning_rate": 4.800000000000001e-06,
"loss": 0.9934,
"step": 96
},
{
"epoch": 0.0458628841607565,
"grad_norm": 3.164451837539673,
"learning_rate": 4.85e-06,
"loss": 0.7463,
"step": 97
},
{
"epoch": 0.046335697399527184,
"grad_norm": 3.222785472869873,
"learning_rate": 4.9000000000000005e-06,
"loss": 0.7698,
"step": 98
},
{
"epoch": 0.04680851063829787,
"grad_norm": 2.9129555225372314,
"learning_rate": 4.95e-06,
"loss": 0.7856,
"step": 99
},
{
"epoch": 0.04728132387706856,
"grad_norm": 3.5061235427856445,
"learning_rate": 5e-06,
"loss": 0.8588,
"step": 100
},
{
"epoch": 0.04775413711583924,
"grad_norm": 3.2805044651031494,
"learning_rate": 4.999999922167982e-06,
"loss": 0.7643,
"step": 101
},
{
"epoch": 0.04822695035460993,
"grad_norm": 3.5461678504943848,
"learning_rate": 4.999999688671929e-06,
"loss": 0.8253,
"step": 102
},
{
"epoch": 0.048699763593380616,
"grad_norm": 3.2238264083862305,
"learning_rate": 4.99999929951186e-06,
"loss": 0.7622,
"step": 103
},
{
"epoch": 0.0491725768321513,
"grad_norm": 3.818955898284912,
"learning_rate": 4.999998754687795e-06,
"loss": 0.8471,
"step": 104
},
{
"epoch": 0.04964539007092199,
"grad_norm": 3.1252424716949463,
"learning_rate": 4.99999805419977e-06,
"loss": 0.8409,
"step": 105
},
{
"epoch": 0.050118203309692674,
"grad_norm": 3.604283571243286,
"learning_rate": 4.999997198047828e-06,
"loss": 0.9027,
"step": 106
},
{
"epoch": 0.050591016548463354,
"grad_norm": 3.6752424240112305,
"learning_rate": 4.999996186232023e-06,
"loss": 0.9336,
"step": 107
},
{
"epoch": 0.05106382978723404,
"grad_norm": 3.517557144165039,
"learning_rate": 4.9999950187524184e-06,
"loss": 0.8351,
"step": 108
},
{
"epoch": 0.051536643026004726,
"grad_norm": 3.427285671234131,
"learning_rate": 4.999993695609085e-06,
"loss": 0.8457,
"step": 109
},
{
"epoch": 0.05200945626477541,
"grad_norm": 3.2792510986328125,
"learning_rate": 4.999992216802107e-06,
"loss": 0.8391,
"step": 110
},
{
"epoch": 0.0524822695035461,
"grad_norm": 3.581094741821289,
"learning_rate": 4.999990582331576e-06,
"loss": 0.7533,
"step": 111
},
{
"epoch": 0.052955082742316785,
"grad_norm": 3.1667377948760986,
"learning_rate": 4.999988792197593e-06,
"loss": 0.9562,
"step": 112
},
{
"epoch": 0.05342789598108747,
"grad_norm": 3.3609890937805176,
"learning_rate": 4.99998684640027e-06,
"loss": 0.8181,
"step": 113
},
{
"epoch": 0.05390070921985816,
"grad_norm": 3.260627269744873,
"learning_rate": 4.999984744939729e-06,
"loss": 0.8012,
"step": 114
},
{
"epoch": 0.054373522458628844,
"grad_norm": 3.4535653591156006,
"learning_rate": 4.9999824878160985e-06,
"loss": 0.919,
"step": 115
},
{
"epoch": 0.05484633569739953,
"grad_norm": 3.4880740642547607,
"learning_rate": 4.999980075029522e-06,
"loss": 0.8114,
"step": 116
},
{
"epoch": 0.05531914893617021,
"grad_norm": 3.2546932697296143,
"learning_rate": 4.999977506580147e-06,
"loss": 0.8274,
"step": 117
},
{
"epoch": 0.055791962174940896,
"grad_norm": 3.2762744426727295,
"learning_rate": 4.999974782468136e-06,
"loss": 0.9018,
"step": 118
},
{
"epoch": 0.05626477541371158,
"grad_norm": 3.42825984954834,
"learning_rate": 4.999971902693657e-06,
"loss": 0.8262,
"step": 119
},
{
"epoch": 0.05673758865248227,
"grad_norm": 3.082496404647827,
"learning_rate": 4.99996886725689e-06,
"loss": 0.8181,
"step": 120
},
{
"epoch": 0.057210401891252954,
"grad_norm": 3.322869300842285,
"learning_rate": 4.9999656761580225e-06,
"loss": 0.8382,
"step": 121
},
{
"epoch": 0.05768321513002364,
"grad_norm": 3.6365339756011963,
"learning_rate": 4.9999623293972555e-06,
"loss": 0.7489,
"step": 122
},
{
"epoch": 0.05815602836879433,
"grad_norm": 3.376352548599243,
"learning_rate": 4.999958826974796e-06,
"loss": 0.9012,
"step": 123
},
{
"epoch": 0.05862884160756501,
"grad_norm": 3.49088716506958,
"learning_rate": 4.999955168890862e-06,
"loss": 0.8999,
"step": 124
},
{
"epoch": 0.0591016548463357,
"grad_norm": 3.3265068531036377,
"learning_rate": 4.999951355145682e-06,
"loss": 0.8161,
"step": 125
},
{
"epoch": 0.059574468085106386,
"grad_norm": 3.697282314300537,
"learning_rate": 4.999947385739493e-06,
"loss": 0.9623,
"step": 126
},
{
"epoch": 0.06004728132387707,
"grad_norm": 2.7901928424835205,
"learning_rate": 4.999943260672542e-06,
"loss": 0.7371,
"step": 127
},
{
"epoch": 0.06052009456264775,
"grad_norm": 3.110319137573242,
"learning_rate": 4.999938979945086e-06,
"loss": 0.715,
"step": 128
},
{
"epoch": 0.06099290780141844,
"grad_norm": 3.2211520671844482,
"learning_rate": 4.999934543557392e-06,
"loss": 0.8888,
"step": 129
},
{
"epoch": 0.061465721040189124,
"grad_norm": 3.2466187477111816,
"learning_rate": 4.999929951509735e-06,
"loss": 0.9389,
"step": 130
},
{
"epoch": 0.06193853427895981,
"grad_norm": 3.3574399948120117,
"learning_rate": 4.999925203802403e-06,
"loss": 0.8263,
"step": 131
},
{
"epoch": 0.062411347517730496,
"grad_norm": 3.275601625442505,
"learning_rate": 4.99992030043569e-06,
"loss": 0.8338,
"step": 132
},
{
"epoch": 0.06288416075650118,
"grad_norm": 3.6011312007904053,
"learning_rate": 4.999915241409902e-06,
"loss": 0.8351,
"step": 133
},
{
"epoch": 0.06335697399527186,
"grad_norm": 2.969011068344116,
"learning_rate": 4.999910026725352e-06,
"loss": 0.79,
"step": 134
},
{
"epoch": 0.06382978723404255,
"grad_norm": 3.690784454345703,
"learning_rate": 4.999904656382369e-06,
"loss": 0.8209,
"step": 135
},
{
"epoch": 0.06430260047281323,
"grad_norm": 3.3363115787506104,
"learning_rate": 4.999899130381283e-06,
"loss": 0.858,
"step": 136
},
{
"epoch": 0.06477541371158392,
"grad_norm": 3.206881523132324,
"learning_rate": 4.9998934487224405e-06,
"loss": 0.834,
"step": 137
},
{
"epoch": 0.06524822695035461,
"grad_norm": 2.773146152496338,
"learning_rate": 4.999887611406195e-06,
"loss": 0.7576,
"step": 138
},
{
"epoch": 0.0657210401891253,
"grad_norm": 3.307725667953491,
"learning_rate": 4.999881618432908e-06,
"loss": 0.7487,
"step": 139
},
{
"epoch": 0.06619385342789598,
"grad_norm": 4.273657321929932,
"learning_rate": 4.999875469802956e-06,
"loss": 0.8176,
"step": 140
},
{
"epoch": 0.06666666666666667,
"grad_norm": 3.0898005962371826,
"learning_rate": 4.999869165516719e-06,
"loss": 0.7578,
"step": 141
},
{
"epoch": 0.06713947990543735,
"grad_norm": 3.25150990486145,
"learning_rate": 4.9998627055745915e-06,
"loss": 0.7873,
"step": 142
},
{
"epoch": 0.06761229314420804,
"grad_norm": 2.9705755710601807,
"learning_rate": 4.999856089976974e-06,
"loss": 0.6473,
"step": 143
},
{
"epoch": 0.06808510638297872,
"grad_norm": 3.5658507347106934,
"learning_rate": 4.9998493187242804e-06,
"loss": 0.855,
"step": 144
},
{
"epoch": 0.06855791962174941,
"grad_norm": 3.3994076251983643,
"learning_rate": 4.99984239181693e-06,
"loss": 0.7926,
"step": 145
},
{
"epoch": 0.0690307328605201,
"grad_norm": 2.8266260623931885,
"learning_rate": 4.999835309255357e-06,
"loss": 0.7564,
"step": 146
},
{
"epoch": 0.06950354609929078,
"grad_norm": 3.1143875122070312,
"learning_rate": 4.999828071039999e-06,
"loss": 0.8398,
"step": 147
},
{
"epoch": 0.06997635933806147,
"grad_norm": 2.9364278316497803,
"learning_rate": 4.99982067717131e-06,
"loss": 0.7381,
"step": 148
},
{
"epoch": 0.07044917257683216,
"grad_norm": 3.4155616760253906,
"learning_rate": 4.999813127649748e-06,
"loss": 0.7933,
"step": 149
},
{
"epoch": 0.07092198581560284,
"grad_norm": 4.371236324310303,
"learning_rate": 4.999805422475784e-06,
"loss": 0.8292,
"step": 150
},
{
"epoch": 0.07139479905437353,
"grad_norm": 3.3967185020446777,
"learning_rate": 4.999797561649897e-06,
"loss": 0.8712,
"step": 151
},
{
"epoch": 0.07186761229314421,
"grad_norm": 3.343303680419922,
"learning_rate": 4.999789545172578e-06,
"loss": 0.8177,
"step": 152
},
{
"epoch": 0.07234042553191489,
"grad_norm": 3.040235757827759,
"learning_rate": 4.999781373044325e-06,
"loss": 0.7379,
"step": 153
},
{
"epoch": 0.07281323877068557,
"grad_norm": 3.4069204330444336,
"learning_rate": 4.999773045265647e-06,
"loss": 0.7939,
"step": 154
},
{
"epoch": 0.07328605200945626,
"grad_norm": 3.1939475536346436,
"learning_rate": 4.999764561837063e-06,
"loss": 0.8037,
"step": 155
},
{
"epoch": 0.07375886524822695,
"grad_norm": 4.452004909515381,
"learning_rate": 4.999755922759101e-06,
"loss": 0.8421,
"step": 156
},
{
"epoch": 0.07423167848699763,
"grad_norm": 3.2031240463256836,
"learning_rate": 4.999747128032298e-06,
"loss": 0.794,
"step": 157
},
{
"epoch": 0.07470449172576832,
"grad_norm": 3.175920009613037,
"learning_rate": 4.999738177657203e-06,
"loss": 0.759,
"step": 158
},
{
"epoch": 0.075177304964539,
"grad_norm": 3.7679688930511475,
"learning_rate": 4.9997290716343725e-06,
"loss": 0.8174,
"step": 159
},
{
"epoch": 0.07565011820330969,
"grad_norm": 3.7020037174224854,
"learning_rate": 4.999719809964373e-06,
"loss": 0.7116,
"step": 160
},
{
"epoch": 0.07612293144208038,
"grad_norm": 4.357471942901611,
"learning_rate": 4.999710392647783e-06,
"loss": 0.7649,
"step": 161
},
{
"epoch": 0.07659574468085106,
"grad_norm": 3.3439087867736816,
"learning_rate": 4.999700819685187e-06,
"loss": 0.7907,
"step": 162
},
{
"epoch": 0.07706855791962175,
"grad_norm": 3.210815191268921,
"learning_rate": 4.999691091077182e-06,
"loss": 0.8446,
"step": 163
},
{
"epoch": 0.07754137115839244,
"grad_norm": 3.1029553413391113,
"learning_rate": 4.9996812068243735e-06,
"loss": 0.7232,
"step": 164
},
{
"epoch": 0.07801418439716312,
"grad_norm": 2.9389400482177734,
"learning_rate": 4.999671166927378e-06,
"loss": 0.7413,
"step": 165
},
{
"epoch": 0.07848699763593381,
"grad_norm": 3.7062697410583496,
"learning_rate": 4.9996609713868185e-06,
"loss": 0.8773,
"step": 166
},
{
"epoch": 0.0789598108747045,
"grad_norm": 3.2768924236297607,
"learning_rate": 4.999650620203332e-06,
"loss": 0.8046,
"step": 167
},
{
"epoch": 0.07943262411347518,
"grad_norm": 3.380373001098633,
"learning_rate": 4.999640113377561e-06,
"loss": 0.7529,
"step": 168
},
{
"epoch": 0.07990543735224587,
"grad_norm": 3.520022392272949,
"learning_rate": 4.999629450910162e-06,
"loss": 0.7352,
"step": 169
},
{
"epoch": 0.08037825059101655,
"grad_norm": 3.43269419670105,
"learning_rate": 4.999618632801796e-06,
"loss": 0.9371,
"step": 170
},
{
"epoch": 0.08085106382978724,
"grad_norm": 3.555877923965454,
"learning_rate": 4.99960765905314e-06,
"loss": 0.8276,
"step": 171
},
{
"epoch": 0.08132387706855793,
"grad_norm": 3.597050189971924,
"learning_rate": 4.999596529664874e-06,
"loss": 0.8164,
"step": 172
},
{
"epoch": 0.0817966903073286,
"grad_norm": 3.2002956867218018,
"learning_rate": 4.999585244637693e-06,
"loss": 0.7824,
"step": 173
},
{
"epoch": 0.08226950354609928,
"grad_norm": 3.527275562286377,
"learning_rate": 4.999573803972299e-06,
"loss": 0.8033,
"step": 174
},
{
"epoch": 0.08274231678486997,
"grad_norm": 3.5184452533721924,
"learning_rate": 4.999562207669405e-06,
"loss": 0.724,
"step": 175
},
{
"epoch": 0.08321513002364066,
"grad_norm": 3.6635067462921143,
"learning_rate": 4.999550455729732e-06,
"loss": 0.819,
"step": 176
},
{
"epoch": 0.08368794326241134,
"grad_norm": 3.192399740219116,
"learning_rate": 4.999538548154012e-06,
"loss": 0.7999,
"step": 177
},
{
"epoch": 0.08416075650118203,
"grad_norm": 3.0946953296661377,
"learning_rate": 4.999526484942988e-06,
"loss": 0.7367,
"step": 178
},
{
"epoch": 0.08463356973995272,
"grad_norm": 2.847198009490967,
"learning_rate": 4.99951426609741e-06,
"loss": 0.7536,
"step": 179
},
{
"epoch": 0.0851063829787234,
"grad_norm": 2.7674827575683594,
"learning_rate": 4.999501891618037e-06,
"loss": 0.701,
"step": 180
},
{
"epoch": 0.08557919621749409,
"grad_norm": 3.357933521270752,
"learning_rate": 4.999489361505643e-06,
"loss": 0.8331,
"step": 181
},
{
"epoch": 0.08605200945626477,
"grad_norm": 3.1464426517486572,
"learning_rate": 4.999476675761004e-06,
"loss": 0.7931,
"step": 182
},
{
"epoch": 0.08652482269503546,
"grad_norm": 3.310697078704834,
"learning_rate": 4.999463834384915e-06,
"loss": 0.753,
"step": 183
},
{
"epoch": 0.08699763593380615,
"grad_norm": 2.9794881343841553,
"learning_rate": 4.999450837378171e-06,
"loss": 0.7091,
"step": 184
},
{
"epoch": 0.08747044917257683,
"grad_norm": 3.0776889324188232,
"learning_rate": 4.999437684741584e-06,
"loss": 0.7226,
"step": 185
},
{
"epoch": 0.08794326241134752,
"grad_norm": 3.6657519340515137,
"learning_rate": 4.999424376475972e-06,
"loss": 0.845,
"step": 186
},
{
"epoch": 0.0884160756501182,
"grad_norm": 3.872718572616577,
"learning_rate": 4.999410912582164e-06,
"loss": 0.812,
"step": 187
},
{
"epoch": 0.08888888888888889,
"grad_norm": 2.9184508323669434,
"learning_rate": 4.9993972930609976e-06,
"loss": 0.6823,
"step": 188
},
{
"epoch": 0.08936170212765958,
"grad_norm": 3.5567142963409424,
"learning_rate": 4.999383517913321e-06,
"loss": 0.7614,
"step": 189
},
{
"epoch": 0.08983451536643026,
"grad_norm": 3.3688533306121826,
"learning_rate": 4.999369587139992e-06,
"loss": 0.858,
"step": 190
},
{
"epoch": 0.09030732860520095,
"grad_norm": 2.893223524093628,
"learning_rate": 4.99935550074188e-06,
"loss": 0.6761,
"step": 191
},
{
"epoch": 0.09078014184397164,
"grad_norm": 3.400225877761841,
"learning_rate": 4.999341258719859e-06,
"loss": 0.7531,
"step": 192
},
{
"epoch": 0.09125295508274232,
"grad_norm": 3.6167714595794678,
"learning_rate": 4.999326861074817e-06,
"loss": 0.8164,
"step": 193
},
{
"epoch": 0.091725768321513,
"grad_norm": 4.325016498565674,
"learning_rate": 4.9993123078076506e-06,
"loss": 0.7069,
"step": 194
},
{
"epoch": 0.09219858156028368,
"grad_norm": 3.195317029953003,
"learning_rate": 4.999297598919266e-06,
"loss": 0.726,
"step": 195
},
{
"epoch": 0.09267139479905437,
"grad_norm": 3.146530866622925,
"learning_rate": 4.999282734410579e-06,
"loss": 0.7888,
"step": 196
},
{
"epoch": 0.09314420803782505,
"grad_norm": 3.5166752338409424,
"learning_rate": 4.999267714282515e-06,
"loss": 0.8473,
"step": 197
},
{
"epoch": 0.09361702127659574,
"grad_norm": 3.3140196800231934,
"learning_rate": 4.99925253853601e-06,
"loss": 0.7233,
"step": 198
},
{
"epoch": 0.09408983451536643,
"grad_norm": 3.0318164825439453,
"learning_rate": 4.999237207172008e-06,
"loss": 0.7543,
"step": 199
},
{
"epoch": 0.09456264775413711,
"grad_norm": 3.662214756011963,
"learning_rate": 4.999221720191464e-06,
"loss": 0.7783,
"step": 200
},
{
"epoch": 0.0950354609929078,
"grad_norm": 3.452078104019165,
"learning_rate": 4.9992060775953425e-06,
"loss": 0.7868,
"step": 201
},
{
"epoch": 0.09550827423167849,
"grad_norm": 3.4051287174224854,
"learning_rate": 4.999190279384617e-06,
"loss": 0.7849,
"step": 202
},
{
"epoch": 0.09598108747044917,
"grad_norm": 3.1377196311950684,
"learning_rate": 4.999174325560271e-06,
"loss": 0.8364,
"step": 203
},
{
"epoch": 0.09645390070921986,
"grad_norm": 3.129473924636841,
"learning_rate": 4.999158216123299e-06,
"loss": 0.7458,
"step": 204
},
{
"epoch": 0.09692671394799054,
"grad_norm": 3.169548749923706,
"learning_rate": 4.999141951074703e-06,
"loss": 0.7256,
"step": 205
},
{
"epoch": 0.09739952718676123,
"grad_norm": 3.186009168624878,
"learning_rate": 4.999125530415495e-06,
"loss": 0.783,
"step": 206
},
{
"epoch": 0.09787234042553192,
"grad_norm": 3.0995123386383057,
"learning_rate": 4.9991089541467e-06,
"loss": 0.7519,
"step": 207
},
{
"epoch": 0.0983451536643026,
"grad_norm": 3.1854088306427,
"learning_rate": 4.999092222269348e-06,
"loss": 0.7444,
"step": 208
},
{
"epoch": 0.09881796690307329,
"grad_norm": 3.1512246131896973,
"learning_rate": 4.999075334784482e-06,
"loss": 0.7882,
"step": 209
},
{
"epoch": 0.09929078014184398,
"grad_norm": 3.6199698448181152,
"learning_rate": 4.999058291693153e-06,
"loss": 0.8048,
"step": 210
},
{
"epoch": 0.09976359338061466,
"grad_norm": 2.956907272338867,
"learning_rate": 4.999041092996422e-06,
"loss": 0.7663,
"step": 211
},
{
"epoch": 0.10023640661938535,
"grad_norm": 3.3493971824645996,
"learning_rate": 4.99902373869536e-06,
"loss": 0.7639,
"step": 212
},
{
"epoch": 0.10070921985815603,
"grad_norm": 3.144812822341919,
"learning_rate": 4.9990062287910475e-06,
"loss": 0.7953,
"step": 213
},
{
"epoch": 0.10118203309692671,
"grad_norm": 3.5986971855163574,
"learning_rate": 4.998988563284576e-06,
"loss": 0.8297,
"step": 214
},
{
"epoch": 0.1016548463356974,
"grad_norm": 3.447584867477417,
"learning_rate": 4.998970742177044e-06,
"loss": 0.808,
"step": 215
},
{
"epoch": 0.10212765957446808,
"grad_norm": 3.791353940963745,
"learning_rate": 4.998952765469562e-06,
"loss": 0.8005,
"step": 216
},
{
"epoch": 0.10260047281323877,
"grad_norm": 3.4490807056427,
"learning_rate": 4.998934633163247e-06,
"loss": 0.8135,
"step": 217
},
{
"epoch": 0.10307328605200945,
"grad_norm": 3.1053314208984375,
"learning_rate": 4.998916345259232e-06,
"loss": 0.7888,
"step": 218
},
{
"epoch": 0.10354609929078014,
"grad_norm": 3.407862663269043,
"learning_rate": 4.9988979017586514e-06,
"loss": 0.7099,
"step": 219
},
{
"epoch": 0.10401891252955082,
"grad_norm": 3.116656541824341,
"learning_rate": 4.998879302662658e-06,
"loss": 0.8344,
"step": 220
},
{
"epoch": 0.10449172576832151,
"grad_norm": 3.339264154434204,
"learning_rate": 4.998860547972406e-06,
"loss": 0.8496,
"step": 221
},
{
"epoch": 0.1049645390070922,
"grad_norm": 3.251892566680908,
"learning_rate": 4.998841637689066e-06,
"loss": 0.7455,
"step": 222
},
{
"epoch": 0.10543735224586288,
"grad_norm": 4.098135471343994,
"learning_rate": 4.998822571813814e-06,
"loss": 0.7772,
"step": 223
},
{
"epoch": 0.10591016548463357,
"grad_norm": 3.9871134757995605,
"learning_rate": 4.998803350347837e-06,
"loss": 0.8261,
"step": 224
},
{
"epoch": 0.10638297872340426,
"grad_norm": 3.2822303771972656,
"learning_rate": 4.998783973292333e-06,
"loss": 0.8623,
"step": 225
},
{
"epoch": 0.10685579196217494,
"grad_norm": 3.0356857776641846,
"learning_rate": 4.998764440648507e-06,
"loss": 0.7426,
"step": 226
},
{
"epoch": 0.10732860520094563,
"grad_norm": 2.8932785987854004,
"learning_rate": 4.998744752417576e-06,
"loss": 0.6741,
"step": 227
},
{
"epoch": 0.10780141843971631,
"grad_norm": 3.085820436477661,
"learning_rate": 4.998724908600767e-06,
"loss": 0.6549,
"step": 228
},
{
"epoch": 0.108274231678487,
"grad_norm": 3.135829210281372,
"learning_rate": 4.998704909199314e-06,
"loss": 0.6702,
"step": 229
},
{
"epoch": 0.10874704491725769,
"grad_norm": 5.016134262084961,
"learning_rate": 4.9986847542144625e-06,
"loss": 0.7852,
"step": 230
},
{
"epoch": 0.10921985815602837,
"grad_norm": 3.9056200981140137,
"learning_rate": 4.998664443647468e-06,
"loss": 0.9654,
"step": 231
},
{
"epoch": 0.10969267139479906,
"grad_norm": 3.0880749225616455,
"learning_rate": 4.998643977499595e-06,
"loss": 0.7579,
"step": 232
},
{
"epoch": 0.11016548463356975,
"grad_norm": 3.6893601417541504,
"learning_rate": 4.998623355772118e-06,
"loss": 0.713,
"step": 233
},
{
"epoch": 0.11063829787234042,
"grad_norm": 4.181536674499512,
"learning_rate": 4.998602578466319e-06,
"loss": 0.7331,
"step": 234
},
{
"epoch": 0.1111111111111111,
"grad_norm": 3.036386728286743,
"learning_rate": 4.998581645583496e-06,
"loss": 0.7115,
"step": 235
},
{
"epoch": 0.11158392434988179,
"grad_norm": 3.6333255767822266,
"learning_rate": 4.998560557124948e-06,
"loss": 0.7544,
"step": 236
},
{
"epoch": 0.11205673758865248,
"grad_norm": 2.926417827606201,
"learning_rate": 4.9985393130919915e-06,
"loss": 0.715,
"step": 237
},
{
"epoch": 0.11252955082742316,
"grad_norm": 2.969158172607422,
"learning_rate": 4.998517913485946e-06,
"loss": 0.7304,
"step": 238
},
{
"epoch": 0.11300236406619385,
"grad_norm": 3.5254971981048584,
"learning_rate": 4.9984963583081466e-06,
"loss": 0.7725,
"step": 239
},
{
"epoch": 0.11347517730496454,
"grad_norm": 3.7840335369110107,
"learning_rate": 4.998474647559936e-06,
"loss": 0.8685,
"step": 240
},
{
"epoch": 0.11394799054373522,
"grad_norm": 3.0333125591278076,
"learning_rate": 4.9984527812426625e-06,
"loss": 0.7793,
"step": 241
},
{
"epoch": 0.11442080378250591,
"grad_norm": 3.290159225463867,
"learning_rate": 4.99843075935769e-06,
"loss": 0.7158,
"step": 242
},
{
"epoch": 0.1148936170212766,
"grad_norm": 3.3935494422912598,
"learning_rate": 4.99840858190639e-06,
"loss": 0.7643,
"step": 243
},
{
"epoch": 0.11536643026004728,
"grad_norm": 3.333965539932251,
"learning_rate": 4.998386248890142e-06,
"loss": 0.7255,
"step": 244
},
{
"epoch": 0.11583924349881797,
"grad_norm": 2.8129613399505615,
"learning_rate": 4.998363760310339e-06,
"loss": 0.768,
"step": 245
},
{
"epoch": 0.11631205673758865,
"grad_norm": 2.8678107261657715,
"learning_rate": 4.998341116168378e-06,
"loss": 0.7403,
"step": 246
},
{
"epoch": 0.11678486997635934,
"grad_norm": 2.8898239135742188,
"learning_rate": 4.998318316465672e-06,
"loss": 0.6844,
"step": 247
},
{
"epoch": 0.11725768321513003,
"grad_norm": 3.139777898788452,
"learning_rate": 4.998295361203637e-06,
"loss": 0.7936,
"step": 248
},
{
"epoch": 0.11773049645390071,
"grad_norm": 3.393721103668213,
"learning_rate": 4.998272250383707e-06,
"loss": 0.8173,
"step": 249
},
{
"epoch": 0.1182033096926714,
"grad_norm": 3.240973949432373,
"learning_rate": 4.998248984007318e-06,
"loss": 0.8252,
"step": 250
},
{
"epoch": 0.11867612293144209,
"grad_norm": 3.384855031967163,
"learning_rate": 4.998225562075918e-06,
"loss": 0.7244,
"step": 251
},
{
"epoch": 0.11914893617021277,
"grad_norm": 3.1881816387176514,
"learning_rate": 4.9982019845909675e-06,
"loss": 0.6818,
"step": 252
},
{
"epoch": 0.11962174940898346,
"grad_norm": 2.888364553451538,
"learning_rate": 4.998178251553934e-06,
"loss": 0.6753,
"step": 253
},
{
"epoch": 0.12009456264775414,
"grad_norm": 3.630093812942505,
"learning_rate": 4.9981543629662944e-06,
"loss": 0.7995,
"step": 254
},
{
"epoch": 0.12056737588652482,
"grad_norm": 2.9820947647094727,
"learning_rate": 4.998130318829537e-06,
"loss": 0.7478,
"step": 255
},
{
"epoch": 0.1210401891252955,
"grad_norm": 2.7094738483428955,
"learning_rate": 4.998106119145159e-06,
"loss": 0.7237,
"step": 256
},
{
"epoch": 0.12151300236406619,
"grad_norm": 3.1808104515075684,
"learning_rate": 4.9980817639146665e-06,
"loss": 0.7915,
"step": 257
},
{
"epoch": 0.12198581560283688,
"grad_norm": 3.1661291122436523,
"learning_rate": 4.998057253139575e-06,
"loss": 0.8053,
"step": 258
},
{
"epoch": 0.12245862884160756,
"grad_norm": 3.528749942779541,
"learning_rate": 4.998032586821413e-06,
"loss": 0.7946,
"step": 259
},
{
"epoch": 0.12293144208037825,
"grad_norm": 3.125964879989624,
"learning_rate": 4.998007764961716e-06,
"loss": 0.7569,
"step": 260
},
{
"epoch": 0.12340425531914893,
"grad_norm": 3.0778942108154297,
"learning_rate": 4.997982787562029e-06,
"loss": 0.7184,
"step": 261
},
{
"epoch": 0.12387706855791962,
"grad_norm": 3.3531930446624756,
"learning_rate": 4.997957654623906e-06,
"loss": 0.7586,
"step": 262
},
{
"epoch": 0.1243498817966903,
"grad_norm": 3.229278564453125,
"learning_rate": 4.997932366148913e-06,
"loss": 0.6092,
"step": 263
},
{
"epoch": 0.12482269503546099,
"grad_norm": 3.7286155223846436,
"learning_rate": 4.997906922138626e-06,
"loss": 0.7965,
"step": 264
},
{
"epoch": 0.12529550827423167,
"grad_norm": 3.300311803817749,
"learning_rate": 4.997881322594628e-06,
"loss": 0.7665,
"step": 265
},
{
"epoch": 0.12576832151300235,
"grad_norm": 3.411482572555542,
"learning_rate": 4.9978555675185115e-06,
"loss": 0.7253,
"step": 266
},
{
"epoch": 0.12624113475177304,
"grad_norm": 3.0884511470794678,
"learning_rate": 4.9978296569118825e-06,
"loss": 0.659,
"step": 267
},
{
"epoch": 0.12671394799054372,
"grad_norm": 3.0652925968170166,
"learning_rate": 4.9978035907763535e-06,
"loss": 0.6739,
"step": 268
},
{
"epoch": 0.1271867612293144,
"grad_norm": 3.280555009841919,
"learning_rate": 4.997777369113547e-06,
"loss": 0.8003,
"step": 269
},
{
"epoch": 0.1276595744680851,
"grad_norm": 2.980860948562622,
"learning_rate": 4.997750991925096e-06,
"loss": 0.7097,
"step": 270
},
{
"epoch": 0.12813238770685578,
"grad_norm": 3.301760673522949,
"learning_rate": 4.997724459212644e-06,
"loss": 0.7894,
"step": 271
},
{
"epoch": 0.12860520094562647,
"grad_norm": 2.9584903717041016,
"learning_rate": 4.997697770977841e-06,
"loss": 0.733,
"step": 272
},
{
"epoch": 0.12907801418439716,
"grad_norm": 3.5632214546203613,
"learning_rate": 4.99767092722235e-06,
"loss": 0.7228,
"step": 273
},
{
"epoch": 0.12955082742316784,
"grad_norm": 3.5900983810424805,
"learning_rate": 4.997643927947843e-06,
"loss": 0.7634,
"step": 274
},
{
"epoch": 0.13002364066193853,
"grad_norm": 3.332650661468506,
"learning_rate": 4.997616773156e-06,
"loss": 0.797,
"step": 275
},
{
"epoch": 0.13049645390070921,
"grad_norm": 3.1094167232513428,
"learning_rate": 4.997589462848512e-06,
"loss": 0.7849,
"step": 276
},
{
"epoch": 0.1309692671394799,
"grad_norm": 3.5359463691711426,
"learning_rate": 4.99756199702708e-06,
"loss": 0.6871,
"step": 277
},
{
"epoch": 0.1314420803782506,
"grad_norm": 3.190441846847534,
"learning_rate": 4.997534375693414e-06,
"loss": 0.6883,
"step": 278
},
{
"epoch": 0.13191489361702127,
"grad_norm": 3.063518762588501,
"learning_rate": 4.997506598849234e-06,
"loss": 0.7586,
"step": 279
},
{
"epoch": 0.13238770685579196,
"grad_norm": 3.4112050533294678,
"learning_rate": 4.997478666496269e-06,
"loss": 0.796,
"step": 280
},
{
"epoch": 0.13286052009456265,
"grad_norm": 3.231886386871338,
"learning_rate": 4.997450578636259e-06,
"loss": 0.7714,
"step": 281
},
{
"epoch": 0.13333333333333333,
"grad_norm": 3.279425621032715,
"learning_rate": 4.9974223352709515e-06,
"loss": 0.7793,
"step": 282
},
{
"epoch": 0.13380614657210402,
"grad_norm": 3.2154316902160645,
"learning_rate": 4.9973939364021075e-06,
"loss": 0.791,
"step": 283
},
{
"epoch": 0.1342789598108747,
"grad_norm": 3.2090768814086914,
"learning_rate": 4.9973653820314925e-06,
"loss": 0.6433,
"step": 284
},
{
"epoch": 0.1347517730496454,
"grad_norm": 3.1712026596069336,
"learning_rate": 4.997336672160886e-06,
"loss": 0.8128,
"step": 285
},
{
"epoch": 0.13522458628841608,
"grad_norm": 2.929229497909546,
"learning_rate": 4.997307806792076e-06,
"loss": 0.7594,
"step": 286
},
{
"epoch": 0.13569739952718676,
"grad_norm": 3.0363314151763916,
"learning_rate": 4.997278785926859e-06,
"loss": 0.7336,
"step": 287
},
{
"epoch": 0.13617021276595745,
"grad_norm": 3.1352357864379883,
"learning_rate": 4.997249609567042e-06,
"loss": 0.7225,
"step": 288
},
{
"epoch": 0.13664302600472814,
"grad_norm": 3.3171157836914062,
"learning_rate": 4.997220277714442e-06,
"loss": 0.7777,
"step": 289
},
{
"epoch": 0.13711583924349882,
"grad_norm": 3.050717353820801,
"learning_rate": 4.997190790370885e-06,
"loss": 0.6836,
"step": 290
},
{
"epoch": 0.1375886524822695,
"grad_norm": 3.0297694206237793,
"learning_rate": 4.997161147538208e-06,
"loss": 0.6883,
"step": 291
},
{
"epoch": 0.1380614657210402,
"grad_norm": 3.0566554069519043,
"learning_rate": 4.997131349218256e-06,
"loss": 0.6674,
"step": 292
},
{
"epoch": 0.13853427895981088,
"grad_norm": 3.799111843109131,
"learning_rate": 4.997101395412885e-06,
"loss": 0.8256,
"step": 293
},
{
"epoch": 0.13900709219858157,
"grad_norm": 3.1394248008728027,
"learning_rate": 4.9970712861239576e-06,
"loss": 0.7306,
"step": 294
},
{
"epoch": 0.13947990543735225,
"grad_norm": 3.0605666637420654,
"learning_rate": 4.997041021353352e-06,
"loss": 0.7212,
"step": 295
},
{
"epoch": 0.13995271867612294,
"grad_norm": 3.8813397884368896,
"learning_rate": 4.997010601102951e-06,
"loss": 0.769,
"step": 296
},
{
"epoch": 0.14042553191489363,
"grad_norm": 3.0514819622039795,
"learning_rate": 4.996980025374649e-06,
"loss": 0.7422,
"step": 297
},
{
"epoch": 0.1408983451536643,
"grad_norm": 2.9544146060943604,
"learning_rate": 4.99694929417035e-06,
"loss": 0.6912,
"step": 298
},
{
"epoch": 0.141371158392435,
"grad_norm": 3.2635602951049805,
"learning_rate": 4.996918407491966e-06,
"loss": 0.7395,
"step": 299
},
{
"epoch": 0.14184397163120568,
"grad_norm": 3.373882532119751,
"learning_rate": 4.996887365341423e-06,
"loss": 0.7799,
"step": 300
},
{
"epoch": 0.14231678486997637,
"grad_norm": 3.001128673553467,
"learning_rate": 4.996856167720652e-06,
"loss": 0.7168,
"step": 301
},
{
"epoch": 0.14278959810874706,
"grad_norm": 3.1026835441589355,
"learning_rate": 4.996824814631595e-06,
"loss": 0.7492,
"step": 302
},
{
"epoch": 0.14326241134751774,
"grad_norm": 3.41947603225708,
"learning_rate": 4.996793306076205e-06,
"loss": 0.6659,
"step": 303
},
{
"epoch": 0.14373522458628843,
"grad_norm": 3.2272400856018066,
"learning_rate": 4.996761642056444e-06,
"loss": 0.7184,
"step": 304
},
{
"epoch": 0.14420803782505912,
"grad_norm": 2.9488935470581055,
"learning_rate": 4.996729822574284e-06,
"loss": 0.7451,
"step": 305
},
{
"epoch": 0.14468085106382977,
"grad_norm": 3.268231153488159,
"learning_rate": 4.9966978476317065e-06,
"loss": 0.7798,
"step": 306
},
{
"epoch": 0.14515366430260046,
"grad_norm": 3.9086556434631348,
"learning_rate": 4.996665717230701e-06,
"loss": 0.7871,
"step": 307
},
{
"epoch": 0.14562647754137115,
"grad_norm": 3.3483879566192627,
"learning_rate": 4.996633431373269e-06,
"loss": 0.7415,
"step": 308
},
{
"epoch": 0.14609929078014183,
"grad_norm": 2.839400053024292,
"learning_rate": 4.99660099006142e-06,
"loss": 0.7192,
"step": 309
},
{
"epoch": 0.14657210401891252,
"grad_norm": 3.177302598953247,
"learning_rate": 4.996568393297175e-06,
"loss": 0.755,
"step": 310
},
{
"epoch": 0.1470449172576832,
"grad_norm": 3.5477044582366943,
"learning_rate": 4.996535641082563e-06,
"loss": 0.7531,
"step": 311
},
{
"epoch": 0.1475177304964539,
"grad_norm": 3.418576717376709,
"learning_rate": 4.996502733419624e-06,
"loss": 0.8009,
"step": 312
},
{
"epoch": 0.14799054373522458,
"grad_norm": 3.711341619491577,
"learning_rate": 4.996469670310407e-06,
"loss": 0.7362,
"step": 313
},
{
"epoch": 0.14846335697399526,
"grad_norm": 3.2419373989105225,
"learning_rate": 4.99643645175697e-06,
"loss": 0.7761,
"step": 314
},
{
"epoch": 0.14893617021276595,
"grad_norm": 3.121858835220337,
"learning_rate": 4.996403077761381e-06,
"loss": 0.6495,
"step": 315
},
{
"epoch": 0.14940898345153664,
"grad_norm": 3.123054265975952,
"learning_rate": 4.996369548325719e-06,
"loss": 0.7444,
"step": 316
},
{
"epoch": 0.14988179669030732,
"grad_norm": 2.780880928039551,
"learning_rate": 4.996335863452072e-06,
"loss": 0.672,
"step": 317
},
{
"epoch": 0.150354609929078,
"grad_norm": 3.3738629817962646,
"learning_rate": 4.996302023142536e-06,
"loss": 0.7972,
"step": 318
},
{
"epoch": 0.1508274231678487,
"grad_norm": 3.4874777793884277,
"learning_rate": 4.99626802739922e-06,
"loss": 0.8252,
"step": 319
},
{
"epoch": 0.15130023640661938,
"grad_norm": 3.7074787616729736,
"learning_rate": 4.9962338762242395e-06,
"loss": 0.8216,
"step": 320
},
{
"epoch": 0.15177304964539007,
"grad_norm": 3.281912326812744,
"learning_rate": 4.996199569619721e-06,
"loss": 0.8175,
"step": 321
},
{
"epoch": 0.15224586288416075,
"grad_norm": 2.9485340118408203,
"learning_rate": 4.996165107587801e-06,
"loss": 0.707,
"step": 322
},
{
"epoch": 0.15271867612293144,
"grad_norm": 3.3757646083831787,
"learning_rate": 4.996130490130625e-06,
"loss": 0.7955,
"step": 323
},
{
"epoch": 0.15319148936170213,
"grad_norm": 2.962181568145752,
"learning_rate": 4.996095717250349e-06,
"loss": 0.7067,
"step": 324
},
{
"epoch": 0.1536643026004728,
"grad_norm": 3.114272356033325,
"learning_rate": 4.996060788949136e-06,
"loss": 0.7486,
"step": 325
},
{
"epoch": 0.1541371158392435,
"grad_norm": 3.0621590614318848,
"learning_rate": 4.996025705229165e-06,
"loss": 0.6547,
"step": 326
},
{
"epoch": 0.15460992907801419,
"grad_norm": 2.8745882511138916,
"learning_rate": 4.995990466092616e-06,
"loss": 0.6435,
"step": 327
},
{
"epoch": 0.15508274231678487,
"grad_norm": 2.90841007232666,
"learning_rate": 4.995955071541686e-06,
"loss": 0.7331,
"step": 328
},
{
"epoch": 0.15555555555555556,
"grad_norm": 2.694580316543579,
"learning_rate": 4.9959195215785784e-06,
"loss": 0.6731,
"step": 329
},
{
"epoch": 0.15602836879432624,
"grad_norm": 3.158083438873291,
"learning_rate": 4.995883816205507e-06,
"loss": 0.7257,
"step": 330
},
{
"epoch": 0.15650118203309693,
"grad_norm": 3.3234715461730957,
"learning_rate": 4.995847955424694e-06,
"loss": 0.7389,
"step": 331
},
{
"epoch": 0.15697399527186762,
"grad_norm": 2.9406495094299316,
"learning_rate": 4.995811939238373e-06,
"loss": 0.643,
"step": 332
},
{
"epoch": 0.1574468085106383,
"grad_norm": 3.3191726207733154,
"learning_rate": 4.995775767648785e-06,
"loss": 0.7879,
"step": 333
},
{
"epoch": 0.157919621749409,
"grad_norm": 3.711925745010376,
"learning_rate": 4.995739440658185e-06,
"loss": 0.7586,
"step": 334
},
{
"epoch": 0.15839243498817968,
"grad_norm": 9.573421478271484,
"learning_rate": 4.995702958268833e-06,
"loss": 0.7842,
"step": 335
},
{
"epoch": 0.15886524822695036,
"grad_norm": 3.4154508113861084,
"learning_rate": 4.995666320483001e-06,
"loss": 0.6735,
"step": 336
},
{
"epoch": 0.15933806146572105,
"grad_norm": 3.4169859886169434,
"learning_rate": 4.995629527302971e-06,
"loss": 0.741,
"step": 337
},
{
"epoch": 0.15981087470449173,
"grad_norm": 3.287503242492676,
"learning_rate": 4.9955925787310335e-06,
"loss": 0.7139,
"step": 338
},
{
"epoch": 0.16028368794326242,
"grad_norm": 3.288409471511841,
"learning_rate": 4.995555474769488e-06,
"loss": 0.7636,
"step": 339
},
{
"epoch": 0.1607565011820331,
"grad_norm": 2.8021693229675293,
"learning_rate": 4.995518215420646e-06,
"loss": 0.5883,
"step": 340
},
{
"epoch": 0.1612293144208038,
"grad_norm": 2.7038564682006836,
"learning_rate": 4.995480800686827e-06,
"loss": 0.657,
"step": 341
},
{
"epoch": 0.16170212765957448,
"grad_norm": 3.2370235919952393,
"learning_rate": 4.9954432305703615e-06,
"loss": 0.6999,
"step": 342
},
{
"epoch": 0.16217494089834517,
"grad_norm": 2.8666412830352783,
"learning_rate": 4.995405505073588e-06,
"loss": 0.7199,
"step": 343
},
{
"epoch": 0.16264775413711585,
"grad_norm": 3.6467232704162598,
"learning_rate": 4.995367624198856e-06,
"loss": 0.7317,
"step": 344
},
{
"epoch": 0.16312056737588654,
"grad_norm": 2.7576327323913574,
"learning_rate": 4.9953295879485246e-06,
"loss": 0.647,
"step": 345
},
{
"epoch": 0.1635933806146572,
"grad_norm": 2.922232151031494,
"learning_rate": 4.995291396324959e-06,
"loss": 0.6686,
"step": 346
},
{
"epoch": 0.16406619385342788,
"grad_norm": 2.8693501949310303,
"learning_rate": 4.995253049330542e-06,
"loss": 0.6756,
"step": 347
},
{
"epoch": 0.16453900709219857,
"grad_norm": 3.671865701675415,
"learning_rate": 4.995214546967658e-06,
"loss": 0.7347,
"step": 348
},
{
"epoch": 0.16501182033096926,
"grad_norm": 3.024219274520874,
"learning_rate": 4.995175889238706e-06,
"loss": 0.7547,
"step": 349
},
{
"epoch": 0.16548463356973994,
"grad_norm": 2.8470778465270996,
"learning_rate": 4.995137076146091e-06,
"loss": 0.6764,
"step": 350
},
{
"epoch": 0.16595744680851063,
"grad_norm": 2.905057907104492,
"learning_rate": 4.9950981076922324e-06,
"loss": 0.6814,
"step": 351
},
{
"epoch": 0.16643026004728131,
"grad_norm": 3.504377841949463,
"learning_rate": 4.995058983879555e-06,
"loss": 0.7145,
"step": 352
},
{
"epoch": 0.166903073286052,
"grad_norm": 3.0029661655426025,
"learning_rate": 4.995019704710495e-06,
"loss": 0.7114,
"step": 353
},
{
"epoch": 0.1673758865248227,
"grad_norm": 2.8666274547576904,
"learning_rate": 4.994980270187499e-06,
"loss": 0.7416,
"step": 354
},
{
"epoch": 0.16784869976359337,
"grad_norm": 3.1644718647003174,
"learning_rate": 4.994940680313021e-06,
"loss": 0.661,
"step": 355
},
{
"epoch": 0.16832151300236406,
"grad_norm": 3.050391674041748,
"learning_rate": 4.994900935089527e-06,
"loss": 0.7243,
"step": 356
},
{
"epoch": 0.16879432624113475,
"grad_norm": 2.985466480255127,
"learning_rate": 4.994861034519491e-06,
"loss": 0.6917,
"step": 357
},
{
"epoch": 0.16926713947990543,
"grad_norm": 2.909342050552368,
"learning_rate": 4.9948209786053995e-06,
"loss": 0.6636,
"step": 358
},
{
"epoch": 0.16973995271867612,
"grad_norm": 3.2214784622192383,
"learning_rate": 4.9947807673497435e-06,
"loss": 0.7903,
"step": 359
},
{
"epoch": 0.1702127659574468,
"grad_norm": 2.5654983520507812,
"learning_rate": 4.994740400755029e-06,
"loss": 0.6129,
"step": 360
},
{
"epoch": 0.1706855791962175,
"grad_norm": 3.775646448135376,
"learning_rate": 4.99469987882377e-06,
"loss": 0.7145,
"step": 361
},
{
"epoch": 0.17115839243498818,
"grad_norm": 2.8965413570404053,
"learning_rate": 4.994659201558487e-06,
"loss": 0.7177,
"step": 362
},
{
"epoch": 0.17163120567375886,
"grad_norm": 3.485597848892212,
"learning_rate": 4.9946183689617146e-06,
"loss": 0.8107,
"step": 363
},
{
"epoch": 0.17210401891252955,
"grad_norm": 3.277839183807373,
"learning_rate": 4.994577381035995e-06,
"loss": 0.691,
"step": 364
},
{
"epoch": 0.17257683215130024,
"grad_norm": 2.8807685375213623,
"learning_rate": 4.99453623778388e-06,
"loss": 0.7627,
"step": 365
},
{
"epoch": 0.17304964539007092,
"grad_norm": 3.0659940242767334,
"learning_rate": 4.994494939207932e-06,
"loss": 0.6858,
"step": 366
},
{
"epoch": 0.1735224586288416,
"grad_norm": 3.0881855487823486,
"learning_rate": 4.994453485310723e-06,
"loss": 0.8212,
"step": 367
},
{
"epoch": 0.1739952718676123,
"grad_norm": 2.7199201583862305,
"learning_rate": 4.994411876094832e-06,
"loss": 0.6516,
"step": 368
},
{
"epoch": 0.17446808510638298,
"grad_norm": 2.955889940261841,
"learning_rate": 4.994370111562851e-06,
"loss": 0.6579,
"step": 369
},
{
"epoch": 0.17494089834515367,
"grad_norm": 3.1321663856506348,
"learning_rate": 4.994328191717382e-06,
"loss": 0.6891,
"step": 370
},
{
"epoch": 0.17541371158392435,
"grad_norm": 3.0560388565063477,
"learning_rate": 4.994286116561034e-06,
"loss": 0.7243,
"step": 371
},
{
"epoch": 0.17588652482269504,
"grad_norm": 3.1560704708099365,
"learning_rate": 4.994243886096425e-06,
"loss": 0.7262,
"step": 372
},
{
"epoch": 0.17635933806146573,
"grad_norm": 2.913541316986084,
"learning_rate": 4.994201500326187e-06,
"loss": 0.7318,
"step": 373
},
{
"epoch": 0.1768321513002364,
"grad_norm": 3.098376512527466,
"learning_rate": 4.994158959252958e-06,
"loss": 0.6419,
"step": 374
},
{
"epoch": 0.1773049645390071,
"grad_norm": 2.977508544921875,
"learning_rate": 4.994116262879387e-06,
"loss": 0.6709,
"step": 375
},
{
"epoch": 0.17777777777777778,
"grad_norm": 3.168186902999878,
"learning_rate": 4.994073411208133e-06,
"loss": 0.6608,
"step": 376
},
{
"epoch": 0.17825059101654847,
"grad_norm": 3.436844825744629,
"learning_rate": 4.994030404241864e-06,
"loss": 0.7227,
"step": 377
},
{
"epoch": 0.17872340425531916,
"grad_norm": 2.8998289108276367,
"learning_rate": 4.993987241983258e-06,
"loss": 0.6512,
"step": 378
},
{
"epoch": 0.17919621749408984,
"grad_norm": 3.407191514968872,
"learning_rate": 4.993943924435002e-06,
"loss": 0.616,
"step": 379
},
{
"epoch": 0.17966903073286053,
"grad_norm": 3.744858741760254,
"learning_rate": 4.993900451599793e-06,
"loss": 0.8599,
"step": 380
},
{
"epoch": 0.18014184397163122,
"grad_norm": 3.486283779144287,
"learning_rate": 4.993856823480338e-06,
"loss": 0.6634,
"step": 381
},
{
"epoch": 0.1806146572104019,
"grad_norm": 2.895719051361084,
"learning_rate": 4.993813040079355e-06,
"loss": 0.6972,
"step": 382
},
{
"epoch": 0.1810874704491726,
"grad_norm": 2.814133882522583,
"learning_rate": 4.993769101399569e-06,
"loss": 0.6271,
"step": 383
},
{
"epoch": 0.18156028368794327,
"grad_norm": 2.8609800338745117,
"learning_rate": 4.993725007443715e-06,
"loss": 0.6481,
"step": 384
},
{
"epoch": 0.18203309692671396,
"grad_norm": 3.2829644680023193,
"learning_rate": 4.99368075821454e-06,
"loss": 0.7999,
"step": 385
},
{
"epoch": 0.18250591016548465,
"grad_norm": 3.1417458057403564,
"learning_rate": 4.993636353714798e-06,
"loss": 0.6972,
"step": 386
},
{
"epoch": 0.1829787234042553,
"grad_norm": 3.0679385662078857,
"learning_rate": 4.993591793947256e-06,
"loss": 0.667,
"step": 387
},
{
"epoch": 0.183451536643026,
"grad_norm": 3.1387410163879395,
"learning_rate": 4.993547078914686e-06,
"loss": 0.7618,
"step": 388
},
{
"epoch": 0.18392434988179668,
"grad_norm": 2.9181406497955322,
"learning_rate": 4.993502208619872e-06,
"loss": 0.7391,
"step": 389
},
{
"epoch": 0.18439716312056736,
"grad_norm": 2.8952157497406006,
"learning_rate": 4.993457183065611e-06,
"loss": 0.6988,
"step": 390
},
{
"epoch": 0.18486997635933805,
"grad_norm": 3.2274813652038574,
"learning_rate": 4.993412002254704e-06,
"loss": 0.688,
"step": 391
},
{
"epoch": 0.18534278959810874,
"grad_norm": 3.4693779945373535,
"learning_rate": 4.993366666189965e-06,
"loss": 0.6634,
"step": 392
},
{
"epoch": 0.18581560283687942,
"grad_norm": 3.5358526706695557,
"learning_rate": 4.993321174874217e-06,
"loss": 0.7343,
"step": 393
},
{
"epoch": 0.1862884160756501,
"grad_norm": 3.013338088989258,
"learning_rate": 4.993275528310292e-06,
"loss": 0.7579,
"step": 394
},
{
"epoch": 0.1867612293144208,
"grad_norm": 2.694772720336914,
"learning_rate": 4.993229726501033e-06,
"loss": 0.718,
"step": 395
},
{
"epoch": 0.18723404255319148,
"grad_norm": 3.070612907409668,
"learning_rate": 4.9931837694492915e-06,
"loss": 0.6438,
"step": 396
},
{
"epoch": 0.18770685579196217,
"grad_norm": 2.9193027019500732,
"learning_rate": 4.993137657157928e-06,
"loss": 0.6788,
"step": 397
},
{
"epoch": 0.18817966903073285,
"grad_norm": 3.047682046890259,
"learning_rate": 4.993091389629816e-06,
"loss": 0.6826,
"step": 398
},
{
"epoch": 0.18865248226950354,
"grad_norm": 2.9629905223846436,
"learning_rate": 4.993044966867834e-06,
"loss": 0.7196,
"step": 399
},
{
"epoch": 0.18912529550827423,
"grad_norm": 3.0692050457000732,
"learning_rate": 4.992998388874874e-06,
"loss": 0.7015,
"step": 400
},
{
"epoch": 0.1895981087470449,
"grad_norm": 3.5427212715148926,
"learning_rate": 4.992951655653836e-06,
"loss": 0.8292,
"step": 401
},
{
"epoch": 0.1900709219858156,
"grad_norm": 2.643526554107666,
"learning_rate": 4.992904767207629e-06,
"loss": 0.624,
"step": 402
},
{
"epoch": 0.19054373522458629,
"grad_norm": 3.1185996532440186,
"learning_rate": 4.992857723539173e-06,
"loss": 0.7354,
"step": 403
},
{
"epoch": 0.19101654846335697,
"grad_norm": 3.006856679916382,
"learning_rate": 4.992810524651398e-06,
"loss": 0.7752,
"step": 404
},
{
"epoch": 0.19148936170212766,
"grad_norm": 2.9913275241851807,
"learning_rate": 4.9927631705472425e-06,
"loss": 0.7306,
"step": 405
},
{
"epoch": 0.19196217494089834,
"grad_norm": 2.6794071197509766,
"learning_rate": 4.992715661229655e-06,
"loss": 0.6136,
"step": 406
},
{
"epoch": 0.19243498817966903,
"grad_norm": 3.5933966636657715,
"learning_rate": 4.992667996701593e-06,
"loss": 0.7024,
"step": 407
},
{
"epoch": 0.19290780141843972,
"grad_norm": 2.862187623977661,
"learning_rate": 4.992620176966025e-06,
"loss": 0.692,
"step": 408
},
{
"epoch": 0.1933806146572104,
"grad_norm": 3.076845407485962,
"learning_rate": 4.9925722020259286e-06,
"loss": 0.7475,
"step": 409
},
{
"epoch": 0.1938534278959811,
"grad_norm": 3.372919797897339,
"learning_rate": 4.9925240718842895e-06,
"loss": 0.6886,
"step": 410
},
{
"epoch": 0.19432624113475178,
"grad_norm": 2.922977924346924,
"learning_rate": 4.992475786544108e-06,
"loss": 0.7049,
"step": 411
},
{
"epoch": 0.19479905437352246,
"grad_norm": 2.908034324645996,
"learning_rate": 4.992427346008387e-06,
"loss": 0.6498,
"step": 412
},
{
"epoch": 0.19527186761229315,
"grad_norm": 3.096723794937134,
"learning_rate": 4.992378750280144e-06,
"loss": 0.7151,
"step": 413
},
{
"epoch": 0.19574468085106383,
"grad_norm": 2.895237684249878,
"learning_rate": 4.992329999362405e-06,
"loss": 0.7277,
"step": 414
},
{
"epoch": 0.19621749408983452,
"grad_norm": 2.718230724334717,
"learning_rate": 4.9922810932582065e-06,
"loss": 0.6375,
"step": 415
},
{
"epoch": 0.1966903073286052,
"grad_norm": 3.187743663787842,
"learning_rate": 4.992232031970592e-06,
"loss": 0.6528,
"step": 416
},
{
"epoch": 0.1971631205673759,
"grad_norm": 2.996406316757202,
"learning_rate": 4.992182815502616e-06,
"loss": 0.6552,
"step": 417
},
{
"epoch": 0.19763593380614658,
"grad_norm": 3.301084041595459,
"learning_rate": 4.992133443857345e-06,
"loss": 0.7061,
"step": 418
},
{
"epoch": 0.19810874704491727,
"grad_norm": 3.7874677181243896,
"learning_rate": 4.992083917037853e-06,
"loss": 0.7859,
"step": 419
},
{
"epoch": 0.19858156028368795,
"grad_norm": 3.124253511428833,
"learning_rate": 4.992034235047222e-06,
"loss": 0.7615,
"step": 420
},
{
"epoch": 0.19905437352245864,
"grad_norm": 3.0488970279693604,
"learning_rate": 4.991984397888546e-06,
"loss": 0.6916,
"step": 421
},
{
"epoch": 0.19952718676122932,
"grad_norm": 3.1241321563720703,
"learning_rate": 4.991934405564929e-06,
"loss": 0.7055,
"step": 422
},
{
"epoch": 0.2,
"grad_norm": 3.396632432937622,
"learning_rate": 4.991884258079484e-06,
"loss": 0.7675,
"step": 423
},
{
"epoch": 0.2004728132387707,
"grad_norm": 3.7776873111724854,
"learning_rate": 4.9918339554353316e-06,
"loss": 0.7371,
"step": 424
},
{
"epoch": 0.20094562647754138,
"grad_norm": 3.3356032371520996,
"learning_rate": 4.991783497635606e-06,
"loss": 0.6778,
"step": 425
},
{
"epoch": 0.20141843971631207,
"grad_norm": 2.988856792449951,
"learning_rate": 4.9917328846834474e-06,
"loss": 0.6795,
"step": 426
},
{
"epoch": 0.20189125295508276,
"grad_norm": 3.264183282852173,
"learning_rate": 4.99168211658201e-06,
"loss": 0.7707,
"step": 427
},
{
"epoch": 0.20236406619385341,
"grad_norm": 3.878068208694458,
"learning_rate": 4.991631193334451e-06,
"loss": 0.857,
"step": 428
},
{
"epoch": 0.2028368794326241,
"grad_norm": 3.6377553939819336,
"learning_rate": 4.991580114943943e-06,
"loss": 0.8033,
"step": 429
},
{
"epoch": 0.2033096926713948,
"grad_norm": 2.95393967628479,
"learning_rate": 4.991528881413667e-06,
"loss": 0.6809,
"step": 430
},
{
"epoch": 0.20378250591016547,
"grad_norm": 3.058704376220703,
"learning_rate": 4.9914774927468125e-06,
"loss": 0.6664,
"step": 431
},
{
"epoch": 0.20425531914893616,
"grad_norm": 2.7783217430114746,
"learning_rate": 4.9914259489465795e-06,
"loss": 0.6478,
"step": 432
},
{
"epoch": 0.20472813238770685,
"grad_norm": 2.4825217723846436,
"learning_rate": 4.991374250016177e-06,
"loss": 0.6598,
"step": 433
},
{
"epoch": 0.20520094562647753,
"grad_norm": 2.8753600120544434,
"learning_rate": 4.991322395958824e-06,
"loss": 0.6947,
"step": 434
},
{
"epoch": 0.20567375886524822,
"grad_norm": 3.2339367866516113,
"learning_rate": 4.99127038677775e-06,
"loss": 0.8201,
"step": 435
},
{
"epoch": 0.2061465721040189,
"grad_norm": 2.9065537452697754,
"learning_rate": 4.991218222476193e-06,
"loss": 0.6679,
"step": 436
},
{
"epoch": 0.2066193853427896,
"grad_norm": 3.283228874206543,
"learning_rate": 4.991165903057401e-06,
"loss": 0.8039,
"step": 437
},
{
"epoch": 0.20709219858156028,
"grad_norm": 3.429872751235962,
"learning_rate": 4.991113428524631e-06,
"loss": 0.7392,
"step": 438
},
{
"epoch": 0.20756501182033096,
"grad_norm": 3.118943452835083,
"learning_rate": 4.991060798881152e-06,
"loss": 0.6794,
"step": 439
},
{
"epoch": 0.20803782505910165,
"grad_norm": 3.395970106124878,
"learning_rate": 4.99100801413024e-06,
"loss": 0.6862,
"step": 440
},
{
"epoch": 0.20851063829787234,
"grad_norm": 2.869191884994507,
"learning_rate": 4.99095507427518e-06,
"loss": 0.6076,
"step": 441
},
{
"epoch": 0.20898345153664302,
"grad_norm": 3.1934661865234375,
"learning_rate": 4.990901979319272e-06,
"loss": 0.6927,
"step": 442
},
{
"epoch": 0.2094562647754137,
"grad_norm": 2.9068603515625,
"learning_rate": 4.990848729265819e-06,
"loss": 0.6864,
"step": 443
},
{
"epoch": 0.2099290780141844,
"grad_norm": 3.0535948276519775,
"learning_rate": 4.9907953241181375e-06,
"loss": 0.6396,
"step": 444
},
{
"epoch": 0.21040189125295508,
"grad_norm": 2.871511459350586,
"learning_rate": 4.990741763879554e-06,
"loss": 0.6743,
"step": 445
},
{
"epoch": 0.21087470449172577,
"grad_norm": 2.9184393882751465,
"learning_rate": 4.9906880485534015e-06,
"loss": 0.6786,
"step": 446
},
{
"epoch": 0.21134751773049645,
"grad_norm": 3.0628271102905273,
"learning_rate": 4.990634178143026e-06,
"loss": 0.6326,
"step": 447
},
{
"epoch": 0.21182033096926714,
"grad_norm": 3.7878305912017822,
"learning_rate": 4.990580152651782e-06,
"loss": 0.7944,
"step": 448
},
{
"epoch": 0.21229314420803783,
"grad_norm": 2.8577189445495605,
"learning_rate": 4.990525972083031e-06,
"loss": 0.71,
"step": 449
},
{
"epoch": 0.2127659574468085,
"grad_norm": 3.307769775390625,
"learning_rate": 4.99047163644015e-06,
"loss": 0.6893,
"step": 450
},
{
"epoch": 0.2132387706855792,
"grad_norm": 2.7391717433929443,
"learning_rate": 4.990417145726519e-06,
"loss": 0.712,
"step": 451
},
{
"epoch": 0.21371158392434988,
"grad_norm": 2.938044786453247,
"learning_rate": 4.990362499945534e-06,
"loss": 0.7516,
"step": 452
},
{
"epoch": 0.21418439716312057,
"grad_norm": 2.7831056118011475,
"learning_rate": 4.990307699100595e-06,
"loss": 0.6168,
"step": 453
},
{
"epoch": 0.21465721040189126,
"grad_norm": 2.907977342605591,
"learning_rate": 4.990252743195116e-06,
"loss": 0.6706,
"step": 454
},
{
"epoch": 0.21513002364066194,
"grad_norm": 3.7882161140441895,
"learning_rate": 4.990197632232517e-06,
"loss": 0.6847,
"step": 455
},
{
"epoch": 0.21560283687943263,
"grad_norm": 2.899716854095459,
"learning_rate": 4.990142366216232e-06,
"loss": 0.6699,
"step": 456
},
{
"epoch": 0.21607565011820332,
"grad_norm": 2.907003879547119,
"learning_rate": 4.990086945149701e-06,
"loss": 0.6864,
"step": 457
},
{
"epoch": 0.216548463356974,
"grad_norm": 3.2407333850860596,
"learning_rate": 4.9900313690363736e-06,
"loss": 0.692,
"step": 458
},
{
"epoch": 0.2170212765957447,
"grad_norm": 2.9055583477020264,
"learning_rate": 4.989975637879712e-06,
"loss": 0.7113,
"step": 459
},
{
"epoch": 0.21749408983451538,
"grad_norm": 2.9836206436157227,
"learning_rate": 4.989919751683184e-06,
"loss": 0.6673,
"step": 460
},
{
"epoch": 0.21796690307328606,
"grad_norm": 3.371035575866699,
"learning_rate": 4.989863710450273e-06,
"loss": 0.7181,
"step": 461
},
{
"epoch": 0.21843971631205675,
"grad_norm": 2.9636635780334473,
"learning_rate": 4.989807514184465e-06,
"loss": 0.6082,
"step": 462
},
{
"epoch": 0.21891252955082743,
"grad_norm": 2.9634664058685303,
"learning_rate": 4.9897511628892615e-06,
"loss": 0.7086,
"step": 463
},
{
"epoch": 0.21938534278959812,
"grad_norm": 3.154763698577881,
"learning_rate": 4.98969465656817e-06,
"loss": 0.7027,
"step": 464
},
{
"epoch": 0.2198581560283688,
"grad_norm": 2.9959890842437744,
"learning_rate": 4.98963799522471e-06,
"loss": 0.6498,
"step": 465
},
{
"epoch": 0.2203309692671395,
"grad_norm": 3.5470590591430664,
"learning_rate": 4.989581178862408e-06,
"loss": 0.7199,
"step": 466
},
{
"epoch": 0.22080378250591018,
"grad_norm": 7.1873369216918945,
"learning_rate": 4.989524207484802e-06,
"loss": 0.6676,
"step": 467
},
{
"epoch": 0.22127659574468084,
"grad_norm": 3.1099541187286377,
"learning_rate": 4.98946708109544e-06,
"loss": 0.6785,
"step": 468
},
{
"epoch": 0.22174940898345152,
"grad_norm": 2.830991506576538,
"learning_rate": 4.9894097996978795e-06,
"loss": 0.6456,
"step": 469
},
{
"epoch": 0.2222222222222222,
"grad_norm": 3.0212316513061523,
"learning_rate": 4.989352363295687e-06,
"loss": 0.6048,
"step": 470
},
{
"epoch": 0.2226950354609929,
"grad_norm": 3.18776798248291,
"learning_rate": 4.989294771892437e-06,
"loss": 0.7078,
"step": 471
},
{
"epoch": 0.22316784869976358,
"grad_norm": 2.9972598552703857,
"learning_rate": 4.989237025491717e-06,
"loss": 0.7082,
"step": 472
},
{
"epoch": 0.22364066193853427,
"grad_norm": 3.4935688972473145,
"learning_rate": 4.989179124097123e-06,
"loss": 0.8199,
"step": 473
},
{
"epoch": 0.22411347517730495,
"grad_norm": 2.6485543251037598,
"learning_rate": 4.9891210677122595e-06,
"loss": 0.6371,
"step": 474
},
{
"epoch": 0.22458628841607564,
"grad_norm": 2.969233512878418,
"learning_rate": 4.989062856340742e-06,
"loss": 0.6879,
"step": 475
},
{
"epoch": 0.22505910165484633,
"grad_norm": 2.881875514984131,
"learning_rate": 4.989004489986194e-06,
"loss": 0.7415,
"step": 476
},
{
"epoch": 0.225531914893617,
"grad_norm": 2.624540090560913,
"learning_rate": 4.98894596865225e-06,
"loss": 0.6522,
"step": 477
},
{
"epoch": 0.2260047281323877,
"grad_norm": 3.61075496673584,
"learning_rate": 4.988887292342555e-06,
"loss": 0.7109,
"step": 478
},
{
"epoch": 0.2264775413711584,
"grad_norm": 2.9368972778320312,
"learning_rate": 4.988828461060762e-06,
"loss": 0.6843,
"step": 479
},
{
"epoch": 0.22695035460992907,
"grad_norm": 3.0670197010040283,
"learning_rate": 4.988769474810533e-06,
"loss": 0.6807,
"step": 480
},
{
"epoch": 0.22742316784869976,
"grad_norm": 2.9662792682647705,
"learning_rate": 4.988710333595542e-06,
"loss": 0.6796,
"step": 481
},
{
"epoch": 0.22789598108747045,
"grad_norm": 2.971235752105713,
"learning_rate": 4.988651037419472e-06,
"loss": 0.696,
"step": 482
},
{
"epoch": 0.22836879432624113,
"grad_norm": 2.931884527206421,
"learning_rate": 4.988591586286013e-06,
"loss": 0.7323,
"step": 483
},
{
"epoch": 0.22884160756501182,
"grad_norm": 2.8114213943481445,
"learning_rate": 4.988531980198868e-06,
"loss": 0.6584,
"step": 484
},
{
"epoch": 0.2293144208037825,
"grad_norm": 3.2785916328430176,
"learning_rate": 4.98847221916175e-06,
"loss": 0.7514,
"step": 485
},
{
"epoch": 0.2297872340425532,
"grad_norm": 3.0520215034484863,
"learning_rate": 4.988412303178377e-06,
"loss": 0.7564,
"step": 486
},
{
"epoch": 0.23026004728132388,
"grad_norm": 3.181002616882324,
"learning_rate": 4.988352232252483e-06,
"loss": 0.6768,
"step": 487
},
{
"epoch": 0.23073286052009456,
"grad_norm": 3.4953625202178955,
"learning_rate": 4.988292006387805e-06,
"loss": 0.7143,
"step": 488
},
{
"epoch": 0.23120567375886525,
"grad_norm": 3.326571226119995,
"learning_rate": 4.988231625588096e-06,
"loss": 0.7318,
"step": 489
},
{
"epoch": 0.23167848699763594,
"grad_norm": 3.09614634513855,
"learning_rate": 4.988171089857113e-06,
"loss": 0.6574,
"step": 490
},
{
"epoch": 0.23215130023640662,
"grad_norm": 2.7439446449279785,
"learning_rate": 4.9881103991986265e-06,
"loss": 0.6637,
"step": 491
},
{
"epoch": 0.2326241134751773,
"grad_norm": 3.0681190490722656,
"learning_rate": 4.988049553616416e-06,
"loss": 0.6326,
"step": 492
},
{
"epoch": 0.233096926713948,
"grad_norm": 3.0757341384887695,
"learning_rate": 4.98798855311427e-06,
"loss": 0.695,
"step": 493
},
{
"epoch": 0.23356973995271868,
"grad_norm": 2.8637635707855225,
"learning_rate": 4.987927397695985e-06,
"loss": 0.6598,
"step": 494
},
{
"epoch": 0.23404255319148937,
"grad_norm": 3.3641068935394287,
"learning_rate": 4.9878660873653715e-06,
"loss": 0.7435,
"step": 495
},
{
"epoch": 0.23451536643026005,
"grad_norm": 3.5025596618652344,
"learning_rate": 4.987804622126245e-06,
"loss": 0.735,
"step": 496
},
{
"epoch": 0.23498817966903074,
"grad_norm": 2.9298837184906006,
"learning_rate": 4.987743001982434e-06,
"loss": 0.7063,
"step": 497
},
{
"epoch": 0.23546099290780143,
"grad_norm": 2.70358943939209,
"learning_rate": 4.987681226937774e-06,
"loss": 0.6799,
"step": 498
},
{
"epoch": 0.2359338061465721,
"grad_norm": 3.027871608734131,
"learning_rate": 4.9876192969961125e-06,
"loss": 0.6881,
"step": 499
},
{
"epoch": 0.2364066193853428,
"grad_norm": 3.362306594848633,
"learning_rate": 4.987557212161304e-06,
"loss": 0.7906,
"step": 500
},
{
"epoch": 0.23687943262411348,
"grad_norm": 3.3136050701141357,
"learning_rate": 4.987494972437217e-06,
"loss": 0.6878,
"step": 501
},
{
"epoch": 0.23735224586288417,
"grad_norm": 3.017089605331421,
"learning_rate": 4.9874325778277255e-06,
"loss": 0.7279,
"step": 502
},
{
"epoch": 0.23782505910165486,
"grad_norm": 2.8300516605377197,
"learning_rate": 4.987370028336714e-06,
"loss": 0.6864,
"step": 503
},
{
"epoch": 0.23829787234042554,
"grad_norm": 3.201860189437866,
"learning_rate": 4.987307323968077e-06,
"loss": 0.7531,
"step": 504
},
{
"epoch": 0.23877068557919623,
"grad_norm": 2.685396194458008,
"learning_rate": 4.987244464725721e-06,
"loss": 0.5849,
"step": 505
},
{
"epoch": 0.23924349881796692,
"grad_norm": 2.8715312480926514,
"learning_rate": 4.987181450613557e-06,
"loss": 0.675,
"step": 506
},
{
"epoch": 0.2397163120567376,
"grad_norm": 2.813908815383911,
"learning_rate": 4.987118281635511e-06,
"loss": 0.6841,
"step": 507
},
{
"epoch": 0.2401891252955083,
"grad_norm": 3.2738473415374756,
"learning_rate": 4.987054957795514e-06,
"loss": 0.7158,
"step": 508
},
{
"epoch": 0.24066193853427895,
"grad_norm": 2.896134376525879,
"learning_rate": 4.986991479097511e-06,
"loss": 0.7542,
"step": 509
},
{
"epoch": 0.24113475177304963,
"grad_norm": 3.0390403270721436,
"learning_rate": 4.986927845545454e-06,
"loss": 0.6733,
"step": 510
},
{
"epoch": 0.24160756501182032,
"grad_norm": 3.0300254821777344,
"learning_rate": 4.9868640571433044e-06,
"loss": 0.722,
"step": 511
},
{
"epoch": 0.242080378250591,
"grad_norm": 3.3037352561950684,
"learning_rate": 4.986800113895035e-06,
"loss": 0.6811,
"step": 512
},
{
"epoch": 0.2425531914893617,
"grad_norm": 3.0358474254608154,
"learning_rate": 4.986736015804627e-06,
"loss": 0.7348,
"step": 513
},
{
"epoch": 0.24302600472813238,
"grad_norm": 3.108792304992676,
"learning_rate": 4.986671762876071e-06,
"loss": 0.6096,
"step": 514
},
{
"epoch": 0.24349881796690306,
"grad_norm": 3.1316237449645996,
"learning_rate": 4.986607355113367e-06,
"loss": 0.6357,
"step": 515
},
{
"epoch": 0.24397163120567375,
"grad_norm": 3.3095219135284424,
"learning_rate": 4.986542792520528e-06,
"loss": 0.7515,
"step": 516
},
{
"epoch": 0.24444444444444444,
"grad_norm": 3.4775984287261963,
"learning_rate": 4.986478075101572e-06,
"loss": 0.7104,
"step": 517
},
{
"epoch": 0.24491725768321512,
"grad_norm": 3.341708183288574,
"learning_rate": 4.986413202860528e-06,
"loss": 0.7339,
"step": 518
},
{
"epoch": 0.2453900709219858,
"grad_norm": 2.9646966457366943,
"learning_rate": 4.986348175801438e-06,
"loss": 0.6032,
"step": 519
},
{
"epoch": 0.2458628841607565,
"grad_norm": 3.1853902339935303,
"learning_rate": 4.986282993928349e-06,
"loss": 0.6925,
"step": 520
},
{
"epoch": 0.24633569739952718,
"grad_norm": 3.286909818649292,
"learning_rate": 4.98621765724532e-06,
"loss": 0.7447,
"step": 521
},
{
"epoch": 0.24680851063829787,
"grad_norm": 3.2255051136016846,
"learning_rate": 4.986152165756419e-06,
"loss": 0.7747,
"step": 522
},
{
"epoch": 0.24728132387706855,
"grad_norm": 3.002352237701416,
"learning_rate": 4.986086519465724e-06,
"loss": 0.6472,
"step": 523
},
{
"epoch": 0.24775413711583924,
"grad_norm": 3.4738974571228027,
"learning_rate": 4.986020718377322e-06,
"loss": 0.7381,
"step": 524
},
{
"epoch": 0.24822695035460993,
"grad_norm": 3.4470200538635254,
"learning_rate": 4.985954762495312e-06,
"loss": 0.6878,
"step": 525
},
{
"epoch": 0.2486997635933806,
"grad_norm": 2.9219350814819336,
"learning_rate": 4.985888651823799e-06,
"loss": 0.6317,
"step": 526
},
{
"epoch": 0.2491725768321513,
"grad_norm": 3.061767101287842,
"learning_rate": 4.985822386366899e-06,
"loss": 0.6842,
"step": 527
},
{
"epoch": 0.24964539007092199,
"grad_norm": 3.0291247367858887,
"learning_rate": 4.985755966128742e-06,
"loss": 0.6852,
"step": 528
},
{
"epoch": 0.25011820330969264,
"grad_norm": 2.964280843734741,
"learning_rate": 4.985689391113457e-06,
"loss": 0.7738,
"step": 529
},
{
"epoch": 0.25059101654846333,
"grad_norm": 3.058302164077759,
"learning_rate": 4.9856226613251955e-06,
"loss": 0.6677,
"step": 530
},
{
"epoch": 0.251063829787234,
"grad_norm": 3.345141649246216,
"learning_rate": 4.985555776768109e-06,
"loss": 0.7837,
"step": 531
},
{
"epoch": 0.2515366430260047,
"grad_norm": 3.565031051635742,
"learning_rate": 4.9854887374463636e-06,
"loss": 0.7231,
"step": 532
},
{
"epoch": 0.2520094562647754,
"grad_norm": 2.7953789234161377,
"learning_rate": 4.985421543364132e-06,
"loss": 0.6102,
"step": 533
},
{
"epoch": 0.2524822695035461,
"grad_norm": 2.887606620788574,
"learning_rate": 4.9853541945256e-06,
"loss": 0.6289,
"step": 534
},
{
"epoch": 0.25295508274231676,
"grad_norm": 3.1480495929718018,
"learning_rate": 4.985286690934961e-06,
"loss": 0.6348,
"step": 535
},
{
"epoch": 0.25342789598108745,
"grad_norm": 2.8912761211395264,
"learning_rate": 4.985219032596416e-06,
"loss": 0.595,
"step": 536
},
{
"epoch": 0.25390070921985813,
"grad_norm": 2.947936534881592,
"learning_rate": 4.98515121951418e-06,
"loss": 0.6196,
"step": 537
},
{
"epoch": 0.2543735224586288,
"grad_norm": 3.1085827350616455,
"learning_rate": 4.985083251692474e-06,
"loss": 0.6387,
"step": 538
},
{
"epoch": 0.2548463356973995,
"grad_norm": 3.1688334941864014,
"learning_rate": 4.985015129135531e-06,
"loss": 0.7055,
"step": 539
},
{
"epoch": 0.2553191489361702,
"grad_norm": 3.075042963027954,
"learning_rate": 4.984946851847593e-06,
"loss": 0.7515,
"step": 540
},
{
"epoch": 0.2557919621749409,
"grad_norm": 3.1933093070983887,
"learning_rate": 4.98487841983291e-06,
"loss": 0.7054,
"step": 541
},
{
"epoch": 0.25626477541371157,
"grad_norm": 3.043473958969116,
"learning_rate": 4.984809833095744e-06,
"loss": 0.6281,
"step": 542
},
{
"epoch": 0.25673758865248225,
"grad_norm": 3.0532584190368652,
"learning_rate": 4.9847410916403645e-06,
"loss": 0.6155,
"step": 543
},
{
"epoch": 0.25721040189125294,
"grad_norm": 3.608480215072632,
"learning_rate": 4.984672195471053e-06,
"loss": 0.7363,
"step": 544
},
{
"epoch": 0.2576832151300236,
"grad_norm": 2.7491862773895264,
"learning_rate": 4.9846031445921e-06,
"loss": 0.6594,
"step": 545
},
{
"epoch": 0.2581560283687943,
"grad_norm": 2.8602418899536133,
"learning_rate": 4.984533939007802e-06,
"loss": 0.6742,
"step": 546
},
{
"epoch": 0.258628841607565,
"grad_norm": 3.1782007217407227,
"learning_rate": 4.98446457872247e-06,
"loss": 0.731,
"step": 547
},
{
"epoch": 0.2591016548463357,
"grad_norm": 2.796147584915161,
"learning_rate": 4.984395063740423e-06,
"loss": 0.6617,
"step": 548
},
{
"epoch": 0.25957446808510637,
"grad_norm": 2.8392202854156494,
"learning_rate": 4.984325394065991e-06,
"loss": 0.6753,
"step": 549
},
{
"epoch": 0.26004728132387706,
"grad_norm": 3.134672164916992,
"learning_rate": 4.984255569703508e-06,
"loss": 0.7222,
"step": 550
},
{
"epoch": 0.26052009456264774,
"grad_norm": 2.734330177307129,
"learning_rate": 4.984185590657325e-06,
"loss": 0.6098,
"step": 551
},
{
"epoch": 0.26099290780141843,
"grad_norm": 3.739010810852051,
"learning_rate": 4.984115456931798e-06,
"loss": 0.7457,
"step": 552
},
{
"epoch": 0.2614657210401891,
"grad_norm": 2.8412528038024902,
"learning_rate": 4.9840451685312925e-06,
"loss": 0.6972,
"step": 553
},
{
"epoch": 0.2619385342789598,
"grad_norm": 3.017395496368408,
"learning_rate": 4.983974725460188e-06,
"loss": 0.6887,
"step": 554
},
{
"epoch": 0.2624113475177305,
"grad_norm": 3.2746949195861816,
"learning_rate": 4.98390412772287e-06,
"loss": 0.7047,
"step": 555
},
{
"epoch": 0.2628841607565012,
"grad_norm": 3.1561965942382812,
"learning_rate": 4.983833375323732e-06,
"loss": 0.7726,
"step": 556
},
{
"epoch": 0.26335697399527186,
"grad_norm": 3.2367217540740967,
"learning_rate": 4.9837624682671816e-06,
"loss": 0.6348,
"step": 557
},
{
"epoch": 0.26382978723404255,
"grad_norm": 2.8195858001708984,
"learning_rate": 4.983691406557633e-06,
"loss": 0.6387,
"step": 558
},
{
"epoch": 0.26430260047281323,
"grad_norm": 3.349820852279663,
"learning_rate": 4.983620190199511e-06,
"loss": 0.6776,
"step": 559
},
{
"epoch": 0.2647754137115839,
"grad_norm": 2.8025588989257812,
"learning_rate": 4.98354881919725e-06,
"loss": 0.6512,
"step": 560
},
{
"epoch": 0.2652482269503546,
"grad_norm": 2.9125499725341797,
"learning_rate": 4.983477293555295e-06,
"loss": 0.7024,
"step": 561
},
{
"epoch": 0.2657210401891253,
"grad_norm": 3.3479275703430176,
"learning_rate": 4.983405613278098e-06,
"loss": 0.688,
"step": 562
},
{
"epoch": 0.266193853427896,
"grad_norm": 3.123971462249756,
"learning_rate": 4.983333778370123e-06,
"loss": 0.6743,
"step": 563
},
{
"epoch": 0.26666666666666666,
"grad_norm": 2.891625165939331,
"learning_rate": 4.983261788835843e-06,
"loss": 0.5971,
"step": 564
},
{
"epoch": 0.26713947990543735,
"grad_norm": 3.5066864490509033,
"learning_rate": 4.98318964467974e-06,
"loss": 0.6958,
"step": 565
},
{
"epoch": 0.26761229314420804,
"grad_norm": 2.570547342300415,
"learning_rate": 4.983117345906306e-06,
"loss": 0.609,
"step": 566
},
{
"epoch": 0.2680851063829787,
"grad_norm": 3.005106210708618,
"learning_rate": 4.983044892520044e-06,
"loss": 0.6791,
"step": 567
},
{
"epoch": 0.2685579196217494,
"grad_norm": 3.429675340652466,
"learning_rate": 4.982972284525463e-06,
"loss": 0.6625,
"step": 568
},
{
"epoch": 0.2690307328605201,
"grad_norm": 3.825657367706299,
"learning_rate": 4.982899521927086e-06,
"loss": 0.6368,
"step": 569
},
{
"epoch": 0.2695035460992908,
"grad_norm": 2.8699095249176025,
"learning_rate": 4.982826604729443e-06,
"loss": 0.6425,
"step": 570
},
{
"epoch": 0.26997635933806147,
"grad_norm": 3.1688714027404785,
"learning_rate": 4.982753532937074e-06,
"loss": 0.6904,
"step": 571
},
{
"epoch": 0.27044917257683215,
"grad_norm": 3.3889992237091064,
"learning_rate": 4.98268030655453e-06,
"loss": 0.7575,
"step": 572
},
{
"epoch": 0.27092198581560284,
"grad_norm": 3.108315944671631,
"learning_rate": 4.982606925586367e-06,
"loss": 0.6648,
"step": 573
},
{
"epoch": 0.2713947990543735,
"grad_norm": 3.209831953048706,
"learning_rate": 4.982533390037159e-06,
"loss": 0.657,
"step": 574
},
{
"epoch": 0.2718676122931442,
"grad_norm": 3.1740927696228027,
"learning_rate": 4.982459699911482e-06,
"loss": 0.7262,
"step": 575
},
{
"epoch": 0.2723404255319149,
"grad_norm": 3.0190417766571045,
"learning_rate": 4.982385855213924e-06,
"loss": 0.6368,
"step": 576
},
{
"epoch": 0.2728132387706856,
"grad_norm": 3.05049467086792,
"learning_rate": 4.982311855949084e-06,
"loss": 0.72,
"step": 577
},
{
"epoch": 0.27328605200945627,
"grad_norm": 2.984816551208496,
"learning_rate": 4.98223770212157e-06,
"loss": 0.6856,
"step": 578
},
{
"epoch": 0.27375886524822696,
"grad_norm": 2.744969606399536,
"learning_rate": 4.982163393735998e-06,
"loss": 0.6023,
"step": 579
},
{
"epoch": 0.27423167848699764,
"grad_norm": 3.170564889907837,
"learning_rate": 4.982088930796996e-06,
"loss": 0.6678,
"step": 580
},
{
"epoch": 0.27470449172576833,
"grad_norm": 2.8686118125915527,
"learning_rate": 4.982014313309199e-06,
"loss": 0.6157,
"step": 581
},
{
"epoch": 0.275177304964539,
"grad_norm": 2.8768694400787354,
"learning_rate": 4.981939541277254e-06,
"loss": 0.6566,
"step": 582
},
{
"epoch": 0.2756501182033097,
"grad_norm": 2.621481418609619,
"learning_rate": 4.981864614705818e-06,
"loss": 0.7372,
"step": 583
},
{
"epoch": 0.2761229314420804,
"grad_norm": 3.527374267578125,
"learning_rate": 4.981789533599554e-06,
"loss": 0.6485,
"step": 584
},
{
"epoch": 0.2765957446808511,
"grad_norm": 3.3141074180603027,
"learning_rate": 4.981714297963138e-06,
"loss": 0.6816,
"step": 585
},
{
"epoch": 0.27706855791962176,
"grad_norm": 2.9247069358825684,
"learning_rate": 4.981638907801255e-06,
"loss": 0.7217,
"step": 586
},
{
"epoch": 0.27754137115839245,
"grad_norm": 2.875236749649048,
"learning_rate": 4.981563363118599e-06,
"loss": 0.6662,
"step": 587
},
{
"epoch": 0.27801418439716313,
"grad_norm": 2.9540364742279053,
"learning_rate": 4.981487663919874e-06,
"loss": 0.7225,
"step": 588
},
{
"epoch": 0.2784869976359338,
"grad_norm": 2.90889310836792,
"learning_rate": 4.981411810209793e-06,
"loss": 0.6054,
"step": 589
},
{
"epoch": 0.2789598108747045,
"grad_norm": 2.8541409969329834,
"learning_rate": 4.981335801993078e-06,
"loss": 0.6539,
"step": 590
},
{
"epoch": 0.2794326241134752,
"grad_norm": 3.1600730419158936,
"learning_rate": 4.981259639274465e-06,
"loss": 0.6415,
"step": 591
},
{
"epoch": 0.2799054373522459,
"grad_norm": 3.569376230239868,
"learning_rate": 4.981183322058693e-06,
"loss": 0.6944,
"step": 592
},
{
"epoch": 0.28037825059101656,
"grad_norm": 3.067667007446289,
"learning_rate": 4.981106850350515e-06,
"loss": 0.7378,
"step": 593
},
{
"epoch": 0.28085106382978725,
"grad_norm": 3.082073450088501,
"learning_rate": 4.981030224154693e-06,
"loss": 0.693,
"step": 594
},
{
"epoch": 0.28132387706855794,
"grad_norm": 2.902932643890381,
"learning_rate": 4.980953443475998e-06,
"loss": 0.6549,
"step": 595
},
{
"epoch": 0.2817966903073286,
"grad_norm": 2.6821181774139404,
"learning_rate": 4.980876508319211e-06,
"loss": 0.6231,
"step": 596
},
{
"epoch": 0.2822695035460993,
"grad_norm": 3.1747355461120605,
"learning_rate": 4.9807994186891215e-06,
"loss": 0.6826,
"step": 597
},
{
"epoch": 0.28274231678487,
"grad_norm": 2.6975860595703125,
"learning_rate": 4.980722174590531e-06,
"loss": 0.6669,
"step": 598
},
{
"epoch": 0.2832151300236407,
"grad_norm": 2.924285650253296,
"learning_rate": 4.9806447760282486e-06,
"loss": 0.689,
"step": 599
},
{
"epoch": 0.28368794326241137,
"grad_norm": 2.941417694091797,
"learning_rate": 4.980567223007093e-06,
"loss": 0.6672,
"step": 600
},
{
"epoch": 0.28416075650118205,
"grad_norm": 2.8582186698913574,
"learning_rate": 4.980489515531892e-06,
"loss": 0.6229,
"step": 601
},
{
"epoch": 0.28463356973995274,
"grad_norm": 2.6462013721466064,
"learning_rate": 4.9804116536074865e-06,
"loss": 0.606,
"step": 602
},
{
"epoch": 0.2851063829787234,
"grad_norm": 2.9029998779296875,
"learning_rate": 4.980333637238723e-06,
"loss": 0.5915,
"step": 603
},
{
"epoch": 0.2855791962174941,
"grad_norm": 3.9359042644500732,
"learning_rate": 4.980255466430462e-06,
"loss": 0.7035,
"step": 604
},
{
"epoch": 0.2860520094562648,
"grad_norm": 3.200524091720581,
"learning_rate": 4.980177141187566e-06,
"loss": 0.7156,
"step": 605
},
{
"epoch": 0.2865248226950355,
"grad_norm": 3.1708686351776123,
"learning_rate": 4.980098661514916e-06,
"loss": 0.746,
"step": 606
},
{
"epoch": 0.28699763593380617,
"grad_norm": 2.8926830291748047,
"learning_rate": 4.980020027417397e-06,
"loss": 0.6282,
"step": 607
},
{
"epoch": 0.28747044917257686,
"grad_norm": 3.0526294708251953,
"learning_rate": 4.979941238899906e-06,
"loss": 0.6594,
"step": 608
},
{
"epoch": 0.28794326241134754,
"grad_norm": 2.9869306087493896,
"learning_rate": 4.9798622959673486e-06,
"loss": 0.7771,
"step": 609
},
{
"epoch": 0.28841607565011823,
"grad_norm": 2.7894513607025146,
"learning_rate": 4.979783198624638e-06,
"loss": 0.6819,
"step": 610
},
{
"epoch": 0.28888888888888886,
"grad_norm": 2.958575963973999,
"learning_rate": 4.9797039468767025e-06,
"loss": 0.6474,
"step": 611
},
{
"epoch": 0.28936170212765955,
"grad_norm": 3.423748016357422,
"learning_rate": 4.979624540728475e-06,
"loss": 0.7389,
"step": 612
},
{
"epoch": 0.28983451536643023,
"grad_norm": 2.9641635417938232,
"learning_rate": 4.9795449801849e-06,
"loss": 0.6005,
"step": 613
},
{
"epoch": 0.2903073286052009,
"grad_norm": 3.02274227142334,
"learning_rate": 4.979465265250933e-06,
"loss": 0.6358,
"step": 614
},
{
"epoch": 0.2907801418439716,
"grad_norm": 3.0562758445739746,
"learning_rate": 4.979385395931534e-06,
"loss": 0.6313,
"step": 615
},
{
"epoch": 0.2912529550827423,
"grad_norm": 3.301816701889038,
"learning_rate": 4.97930537223168e-06,
"loss": 0.7264,
"step": 616
},
{
"epoch": 0.291725768321513,
"grad_norm": 2.975360870361328,
"learning_rate": 4.979225194156351e-06,
"loss": 0.613,
"step": 617
},
{
"epoch": 0.29219858156028367,
"grad_norm": 2.9245030879974365,
"learning_rate": 4.97914486171054e-06,
"loss": 0.6646,
"step": 618
},
{
"epoch": 0.29267139479905435,
"grad_norm": 3.1336188316345215,
"learning_rate": 4.979064374899249e-06,
"loss": 0.6421,
"step": 619
},
{
"epoch": 0.29314420803782504,
"grad_norm": 3.6298763751983643,
"learning_rate": 4.978983733727491e-06,
"loss": 0.6433,
"step": 620
},
{
"epoch": 0.2936170212765957,
"grad_norm": 2.919597625732422,
"learning_rate": 4.9789029382002845e-06,
"loss": 0.6288,
"step": 621
},
{
"epoch": 0.2940898345153664,
"grad_norm": 3.2206127643585205,
"learning_rate": 4.978821988322662e-06,
"loss": 0.7102,
"step": 622
},
{
"epoch": 0.2945626477541371,
"grad_norm": 3.1767101287841797,
"learning_rate": 4.978740884099664e-06,
"loss": 0.6722,
"step": 623
},
{
"epoch": 0.2950354609929078,
"grad_norm": 3.3425452709198,
"learning_rate": 4.97865962553634e-06,
"loss": 0.6492,
"step": 624
},
{
"epoch": 0.29550827423167847,
"grad_norm": 3.0408358573913574,
"learning_rate": 4.97857821263775e-06,
"loss": 0.6522,
"step": 625
},
{
"epoch": 0.29598108747044916,
"grad_norm": 2.8144783973693848,
"learning_rate": 4.978496645408963e-06,
"loss": 0.7237,
"step": 626
},
{
"epoch": 0.29645390070921984,
"grad_norm": 3.7010560035705566,
"learning_rate": 4.978414923855057e-06,
"loss": 0.7509,
"step": 627
},
{
"epoch": 0.29692671394799053,
"grad_norm": 2.9438371658325195,
"learning_rate": 4.978333047981122e-06,
"loss": 0.6244,
"step": 628
},
{
"epoch": 0.2973995271867612,
"grad_norm": 3.285982370376587,
"learning_rate": 4.978251017792255e-06,
"loss": 0.7553,
"step": 629
},
{
"epoch": 0.2978723404255319,
"grad_norm": 3.7021138668060303,
"learning_rate": 4.978168833293564e-06,
"loss": 0.7859,
"step": 630
},
{
"epoch": 0.2983451536643026,
"grad_norm": 3.481858730316162,
"learning_rate": 4.9780864944901654e-06,
"loss": 0.7146,
"step": 631
},
{
"epoch": 0.2988179669030733,
"grad_norm": 3.693824529647827,
"learning_rate": 4.978004001387188e-06,
"loss": 0.6608,
"step": 632
},
{
"epoch": 0.29929078014184396,
"grad_norm": 3.0069146156311035,
"learning_rate": 4.9779213539897665e-06,
"loss": 0.6506,
"step": 633
},
{
"epoch": 0.29976359338061465,
"grad_norm": 3.037644147872925,
"learning_rate": 4.977838552303048e-06,
"loss": 0.6487,
"step": 634
},
{
"epoch": 0.30023640661938533,
"grad_norm": 3.018554449081421,
"learning_rate": 4.977755596332188e-06,
"loss": 0.6128,
"step": 635
},
{
"epoch": 0.300709219858156,
"grad_norm": 3.000312089920044,
"learning_rate": 4.977672486082351e-06,
"loss": 0.6431,
"step": 636
},
{
"epoch": 0.3011820330969267,
"grad_norm": 2.836803913116455,
"learning_rate": 4.977589221558713e-06,
"loss": 0.5914,
"step": 637
},
{
"epoch": 0.3016548463356974,
"grad_norm": 3.080469846725464,
"learning_rate": 4.977505802766457e-06,
"loss": 0.7265,
"step": 638
},
{
"epoch": 0.3021276595744681,
"grad_norm": 3.2245471477508545,
"learning_rate": 4.97742222971078e-06,
"loss": 0.6895,
"step": 639
},
{
"epoch": 0.30260047281323876,
"grad_norm": 3.559006452560425,
"learning_rate": 4.977338502396882e-06,
"loss": 0.7439,
"step": 640
},
{
"epoch": 0.30307328605200945,
"grad_norm": 2.9116289615631104,
"learning_rate": 4.9772546208299795e-06,
"loss": 0.6907,
"step": 641
},
{
"epoch": 0.30354609929078014,
"grad_norm": 3.3645524978637695,
"learning_rate": 4.977170585015295e-06,
"loss": 0.6983,
"step": 642
},
{
"epoch": 0.3040189125295508,
"grad_norm": 3.080148458480835,
"learning_rate": 4.977086394958058e-06,
"loss": 0.7016,
"step": 643
},
{
"epoch": 0.3044917257683215,
"grad_norm": 2.9276750087738037,
"learning_rate": 4.977002050663515e-06,
"loss": 0.6509,
"step": 644
},
{
"epoch": 0.3049645390070922,
"grad_norm": 3.183609962463379,
"learning_rate": 4.976917552136914e-06,
"loss": 0.6814,
"step": 645
},
{
"epoch": 0.3054373522458629,
"grad_norm": 3.0980000495910645,
"learning_rate": 4.976832899383519e-06,
"loss": 0.6319,
"step": 646
},
{
"epoch": 0.30591016548463357,
"grad_norm": 3.211376190185547,
"learning_rate": 4.9767480924086e-06,
"loss": 0.6365,
"step": 647
},
{
"epoch": 0.30638297872340425,
"grad_norm": 3.214430093765259,
"learning_rate": 4.976663131217437e-06,
"loss": 0.6006,
"step": 648
},
{
"epoch": 0.30685579196217494,
"grad_norm": 3.0914318561553955,
"learning_rate": 4.976578015815321e-06,
"loss": 0.7162,
"step": 649
},
{
"epoch": 0.3073286052009456,
"grad_norm": 2.7644500732421875,
"learning_rate": 4.976492746207551e-06,
"loss": 0.6045,
"step": 650
},
{
"epoch": 0.3078014184397163,
"grad_norm": 3.1913280487060547,
"learning_rate": 4.9764073223994374e-06,
"loss": 0.6796,
"step": 651
},
{
"epoch": 0.308274231678487,
"grad_norm": 2.8919692039489746,
"learning_rate": 4.976321744396299e-06,
"loss": 0.6683,
"step": 652
},
{
"epoch": 0.3087470449172577,
"grad_norm": 2.862234115600586,
"learning_rate": 4.976236012203463e-06,
"loss": 0.6631,
"step": 653
},
{
"epoch": 0.30921985815602837,
"grad_norm": 2.9708092212677,
"learning_rate": 4.976150125826268e-06,
"loss": 0.6326,
"step": 654
},
{
"epoch": 0.30969267139479906,
"grad_norm": 2.892465353012085,
"learning_rate": 4.976064085270063e-06,
"loss": 0.6574,
"step": 655
},
{
"epoch": 0.31016548463356974,
"grad_norm": 3.9215126037597656,
"learning_rate": 4.975977890540205e-06,
"loss": 0.7351,
"step": 656
},
{
"epoch": 0.31063829787234043,
"grad_norm": 2.9544081687927246,
"learning_rate": 4.975891541642059e-06,
"loss": 0.7264,
"step": 657
},
{
"epoch": 0.3111111111111111,
"grad_norm": 2.995035409927368,
"learning_rate": 4.975805038581005e-06,
"loss": 0.7405,
"step": 658
},
{
"epoch": 0.3115839243498818,
"grad_norm": 2.9653120040893555,
"learning_rate": 4.975718381362427e-06,
"loss": 0.679,
"step": 659
},
{
"epoch": 0.3120567375886525,
"grad_norm": 2.93976092338562,
"learning_rate": 4.9756315699917205e-06,
"loss": 0.627,
"step": 660
},
{
"epoch": 0.3125295508274232,
"grad_norm": 3.106522560119629,
"learning_rate": 4.9755446044742915e-06,
"loss": 0.6329,
"step": 661
},
{
"epoch": 0.31300236406619386,
"grad_norm": 3.0238280296325684,
"learning_rate": 4.975457484815554e-06,
"loss": 0.6643,
"step": 662
},
{
"epoch": 0.31347517730496455,
"grad_norm": 2.943528175354004,
"learning_rate": 4.9753702110209356e-06,
"loss": 0.668,
"step": 663
},
{
"epoch": 0.31394799054373523,
"grad_norm": 2.6840121746063232,
"learning_rate": 4.9752827830958676e-06,
"loss": 0.5482,
"step": 664
},
{
"epoch": 0.3144208037825059,
"grad_norm": 2.823875904083252,
"learning_rate": 4.975195201045794e-06,
"loss": 0.7017,
"step": 665
},
{
"epoch": 0.3148936170212766,
"grad_norm": 3.148181200027466,
"learning_rate": 4.975107464876168e-06,
"loss": 0.747,
"step": 666
},
{
"epoch": 0.3153664302600473,
"grad_norm": 2.630584478378296,
"learning_rate": 4.9750195745924545e-06,
"loss": 0.5987,
"step": 667
},
{
"epoch": 0.315839243498818,
"grad_norm": 3.075866460800171,
"learning_rate": 4.974931530200124e-06,
"loss": 0.664,
"step": 668
},
{
"epoch": 0.31631205673758866,
"grad_norm": 2.947197914123535,
"learning_rate": 4.974843331704659e-06,
"loss": 0.631,
"step": 669
},
{
"epoch": 0.31678486997635935,
"grad_norm": 3.519646644592285,
"learning_rate": 4.974754979111552e-06,
"loss": 0.7154,
"step": 670
},
{
"epoch": 0.31725768321513004,
"grad_norm": 2.8687186241149902,
"learning_rate": 4.974666472426305e-06,
"loss": 0.6366,
"step": 671
},
{
"epoch": 0.3177304964539007,
"grad_norm": 2.6966612339019775,
"learning_rate": 4.974577811654426e-06,
"loss": 0.7112,
"step": 672
},
{
"epoch": 0.3182033096926714,
"grad_norm": 3.1390228271484375,
"learning_rate": 4.974488996801439e-06,
"loss": 0.6882,
"step": 673
},
{
"epoch": 0.3186761229314421,
"grad_norm": 3.4667599201202393,
"learning_rate": 4.974400027872871e-06,
"loss": 0.7153,
"step": 674
},
{
"epoch": 0.3191489361702128,
"grad_norm": 2.9632184505462646,
"learning_rate": 4.974310904874265e-06,
"loss": 0.7081,
"step": 675
},
{
"epoch": 0.31962174940898347,
"grad_norm": 3.46150279045105,
"learning_rate": 4.9742216278111666e-06,
"loss": 0.6242,
"step": 676
},
{
"epoch": 0.32009456264775416,
"grad_norm": 3.380403757095337,
"learning_rate": 4.974132196689137e-06,
"loss": 0.6863,
"step": 677
},
{
"epoch": 0.32056737588652484,
"grad_norm": 3.4279606342315674,
"learning_rate": 4.974042611513746e-06,
"loss": 0.6388,
"step": 678
},
{
"epoch": 0.3210401891252955,
"grad_norm": 2.634523391723633,
"learning_rate": 4.973952872290568e-06,
"loss": 0.6038,
"step": 679
},
{
"epoch": 0.3215130023640662,
"grad_norm": 3.19693922996521,
"learning_rate": 4.973862979025194e-06,
"loss": 0.6383,
"step": 680
},
{
"epoch": 0.3219858156028369,
"grad_norm": 3.437692165374756,
"learning_rate": 4.973772931723218e-06,
"loss": 0.7288,
"step": 681
},
{
"epoch": 0.3224586288416076,
"grad_norm": 2.506301164627075,
"learning_rate": 4.97368273039025e-06,
"loss": 0.5707,
"step": 682
},
{
"epoch": 0.3229314420803783,
"grad_norm": 3.0942845344543457,
"learning_rate": 4.9735923750319044e-06,
"loss": 0.6348,
"step": 683
},
{
"epoch": 0.32340425531914896,
"grad_norm": 3.0889835357666016,
"learning_rate": 4.973501865653809e-06,
"loss": 0.6697,
"step": 684
},
{
"epoch": 0.32387706855791965,
"grad_norm": 3.0391931533813477,
"learning_rate": 4.973411202261598e-06,
"loss": 0.7091,
"step": 685
},
{
"epoch": 0.32434988179669033,
"grad_norm": 3.0333497524261475,
"learning_rate": 4.973320384860917e-06,
"loss": 0.6403,
"step": 686
},
{
"epoch": 0.324822695035461,
"grad_norm": 2.9714622497558594,
"learning_rate": 4.973229413457421e-06,
"loss": 0.6977,
"step": 687
},
{
"epoch": 0.3252955082742317,
"grad_norm": 3.057558298110962,
"learning_rate": 4.973138288056774e-06,
"loss": 0.7236,
"step": 688
},
{
"epoch": 0.3257683215130024,
"grad_norm": 2.921093463897705,
"learning_rate": 4.97304700866465e-06,
"loss": 0.576,
"step": 689
},
{
"epoch": 0.3262411347517731,
"grad_norm": 3.0287256240844727,
"learning_rate": 4.972955575286732e-06,
"loss": 0.7077,
"step": 690
},
{
"epoch": 0.32671394799054376,
"grad_norm": 2.8621346950531006,
"learning_rate": 4.972863987928716e-06,
"loss": 0.6952,
"step": 691
},
{
"epoch": 0.3271867612293144,
"grad_norm": 2.631359100341797,
"learning_rate": 4.9727722465963006e-06,
"loss": 0.6931,
"step": 692
},
{
"epoch": 0.3276595744680851,
"grad_norm": 2.8484320640563965,
"learning_rate": 4.972680351295201e-06,
"loss": 0.6292,
"step": 693
},
{
"epoch": 0.32813238770685577,
"grad_norm": 2.593001365661621,
"learning_rate": 4.972588302031138e-06,
"loss": 0.5942,
"step": 694
},
{
"epoch": 0.32860520094562645,
"grad_norm": 2.6321065425872803,
"learning_rate": 4.972496098809844e-06,
"loss": 0.65,
"step": 695
},
{
"epoch": 0.32907801418439714,
"grad_norm": 3.2516732215881348,
"learning_rate": 4.972403741637059e-06,
"loss": 0.7385,
"step": 696
},
{
"epoch": 0.3295508274231678,
"grad_norm": 3.180854320526123,
"learning_rate": 4.972311230518535e-06,
"loss": 0.6569,
"step": 697
},
{
"epoch": 0.3300236406619385,
"grad_norm": 4.161016941070557,
"learning_rate": 4.972218565460031e-06,
"loss": 0.6416,
"step": 698
},
{
"epoch": 0.3304964539007092,
"grad_norm": 3.153897762298584,
"learning_rate": 4.972125746467317e-06,
"loss": 0.7196,
"step": 699
},
{
"epoch": 0.3309692671394799,
"grad_norm": 2.9595556259155273,
"learning_rate": 4.972032773546173e-06,
"loss": 0.7093,
"step": 700
},
{
"epoch": 0.33144208037825057,
"grad_norm": 3.1086833477020264,
"learning_rate": 4.9719396467023875e-06,
"loss": 0.6963,
"step": 701
},
{
"epoch": 0.33191489361702126,
"grad_norm": 2.958921432495117,
"learning_rate": 4.971846365941759e-06,
"loss": 0.6518,
"step": 702
},
{
"epoch": 0.33238770685579194,
"grad_norm": 2.8745479583740234,
"learning_rate": 4.971752931270096e-06,
"loss": 0.696,
"step": 703
},
{
"epoch": 0.33286052009456263,
"grad_norm": 3.224358558654785,
"learning_rate": 4.971659342693217e-06,
"loss": 0.6769,
"step": 704
},
{
"epoch": 0.3333333333333333,
"grad_norm": 2.696319580078125,
"learning_rate": 4.9715656002169486e-06,
"loss": 0.6833,
"step": 705
},
{
"epoch": 0.333806146572104,
"grad_norm": 2.9283502101898193,
"learning_rate": 4.971471703847127e-06,
"loss": 0.6784,
"step": 706
},
{
"epoch": 0.3342789598108747,
"grad_norm": 2.654914140701294,
"learning_rate": 4.9713776535896e-06,
"loss": 0.6337,
"step": 707
},
{
"epoch": 0.3347517730496454,
"grad_norm": 3.041555643081665,
"learning_rate": 4.971283449450224e-06,
"loss": 0.6227,
"step": 708
},
{
"epoch": 0.33522458628841606,
"grad_norm": 2.893008232116699,
"learning_rate": 4.971189091434863e-06,
"loss": 0.655,
"step": 709
},
{
"epoch": 0.33569739952718675,
"grad_norm": 2.8806653022766113,
"learning_rate": 4.971094579549393e-06,
"loss": 0.7077,
"step": 710
},
{
"epoch": 0.33617021276595743,
"grad_norm": 3.4830048084259033,
"learning_rate": 4.9709999137996986e-06,
"loss": 0.7461,
"step": 711
},
{
"epoch": 0.3366430260047281,
"grad_norm": 3.155444860458374,
"learning_rate": 4.970905094191674e-06,
"loss": 0.652,
"step": 712
},
{
"epoch": 0.3371158392434988,
"grad_norm": 2.7608706951141357,
"learning_rate": 4.970810120731225e-06,
"loss": 0.684,
"step": 713
},
{
"epoch": 0.3375886524822695,
"grad_norm": 2.8209474086761475,
"learning_rate": 4.970714993424265e-06,
"loss": 0.6009,
"step": 714
},
{
"epoch": 0.3380614657210402,
"grad_norm": 3.6532654762268066,
"learning_rate": 4.9706197122767145e-06,
"loss": 0.702,
"step": 715
},
{
"epoch": 0.33853427895981086,
"grad_norm": 2.6276566982269287,
"learning_rate": 4.970524277294508e-06,
"loss": 0.6338,
"step": 716
},
{
"epoch": 0.33900709219858155,
"grad_norm": 3.509871482849121,
"learning_rate": 4.970428688483589e-06,
"loss": 0.6853,
"step": 717
},
{
"epoch": 0.33947990543735224,
"grad_norm": 5.332682132720947,
"learning_rate": 4.970332945849906e-06,
"loss": 0.6684,
"step": 718
},
{
"epoch": 0.3399527186761229,
"grad_norm": 2.718801975250244,
"learning_rate": 4.970237049399424e-06,
"loss": 0.6676,
"step": 719
},
{
"epoch": 0.3404255319148936,
"grad_norm": 3.891003131866455,
"learning_rate": 4.970140999138112e-06,
"loss": 0.7043,
"step": 720
},
{
"epoch": 0.3408983451536643,
"grad_norm": 2.8863155841827393,
"learning_rate": 4.970044795071951e-06,
"loss": 0.6563,
"step": 721
},
{
"epoch": 0.341371158392435,
"grad_norm": 3.2527518272399902,
"learning_rate": 4.969948437206932e-06,
"loss": 0.7244,
"step": 722
},
{
"epoch": 0.34184397163120567,
"grad_norm": 2.9726758003234863,
"learning_rate": 4.969851925549054e-06,
"loss": 0.6548,
"step": 723
},
{
"epoch": 0.34231678486997635,
"grad_norm": 3.118309497833252,
"learning_rate": 4.969755260104327e-06,
"loss": 0.7293,
"step": 724
},
{
"epoch": 0.34278959810874704,
"grad_norm": 3.373068332672119,
"learning_rate": 4.969658440878769e-06,
"loss": 0.6444,
"step": 725
},
{
"epoch": 0.3432624113475177,
"grad_norm": 2.7157437801361084,
"learning_rate": 4.969561467878409e-06,
"loss": 0.642,
"step": 726
},
{
"epoch": 0.3437352245862884,
"grad_norm": 2.58929705619812,
"learning_rate": 4.969464341109285e-06,
"loss": 0.6165,
"step": 727
},
{
"epoch": 0.3442080378250591,
"grad_norm": 2.8811306953430176,
"learning_rate": 4.969367060577445e-06,
"loss": 0.7127,
"step": 728
},
{
"epoch": 0.3446808510638298,
"grad_norm": 3.494358539581299,
"learning_rate": 4.969269626288946e-06,
"loss": 0.7103,
"step": 729
},
{
"epoch": 0.34515366430260047,
"grad_norm": 2.9753928184509277,
"learning_rate": 4.969172038249855e-06,
"loss": 0.6911,
"step": 730
},
{
"epoch": 0.34562647754137116,
"grad_norm": 3.2885913848876953,
"learning_rate": 4.969074296466247e-06,
"loss": 0.6968,
"step": 731
},
{
"epoch": 0.34609929078014184,
"grad_norm": 2.7564568519592285,
"learning_rate": 4.968976400944211e-06,
"loss": 0.6843,
"step": 732
},
{
"epoch": 0.34657210401891253,
"grad_norm": 2.9255006313323975,
"learning_rate": 4.96887835168984e-06,
"loss": 0.6024,
"step": 733
},
{
"epoch": 0.3470449172576832,
"grad_norm": 3.1808290481567383,
"learning_rate": 4.968780148709239e-06,
"loss": 0.7377,
"step": 734
},
{
"epoch": 0.3475177304964539,
"grad_norm": 2.956666946411133,
"learning_rate": 4.968681792008523e-06,
"loss": 0.65,
"step": 735
},
{
"epoch": 0.3479905437352246,
"grad_norm": 2.9631855487823486,
"learning_rate": 4.9685832815938175e-06,
"loss": 0.677,
"step": 736
},
{
"epoch": 0.3484633569739953,
"grad_norm": 2.501917600631714,
"learning_rate": 4.968484617471256e-06,
"loss": 0.6282,
"step": 737
},
{
"epoch": 0.34893617021276596,
"grad_norm": 2.750779628753662,
"learning_rate": 4.968385799646981e-06,
"loss": 0.6507,
"step": 738
},
{
"epoch": 0.34940898345153665,
"grad_norm": 2.872300624847412,
"learning_rate": 4.968286828127146e-06,
"loss": 0.5949,
"step": 739
},
{
"epoch": 0.34988179669030733,
"grad_norm": 2.6316142082214355,
"learning_rate": 4.9681877029179124e-06,
"loss": 0.6328,
"step": 740
},
{
"epoch": 0.350354609929078,
"grad_norm": 3.244364023208618,
"learning_rate": 4.968088424025454e-06,
"loss": 0.7393,
"step": 741
},
{
"epoch": 0.3508274231678487,
"grad_norm": 2.620465040206909,
"learning_rate": 4.967988991455951e-06,
"loss": 0.6797,
"step": 742
},
{
"epoch": 0.3513002364066194,
"grad_norm": 2.854513645172119,
"learning_rate": 4.967889405215596e-06,
"loss": 0.6368,
"step": 743
},
{
"epoch": 0.3517730496453901,
"grad_norm": 2.579854726791382,
"learning_rate": 4.9677896653105886e-06,
"loss": 0.6489,
"step": 744
},
{
"epoch": 0.35224586288416077,
"grad_norm": 3.0697381496429443,
"learning_rate": 4.96768977174714e-06,
"loss": 0.6313,
"step": 745
},
{
"epoch": 0.35271867612293145,
"grad_norm": 3.369338035583496,
"learning_rate": 4.96758972453147e-06,
"loss": 0.7416,
"step": 746
},
{
"epoch": 0.35319148936170214,
"grad_norm": 2.836221933364868,
"learning_rate": 4.967489523669807e-06,
"loss": 0.6422,
"step": 747
},
{
"epoch": 0.3536643026004728,
"grad_norm": 2.929579496383667,
"learning_rate": 4.967389169168392e-06,
"loss": 0.6482,
"step": 748
},
{
"epoch": 0.3541371158392435,
"grad_norm": 2.9243831634521484,
"learning_rate": 4.967288661033472e-06,
"loss": 0.5813,
"step": 749
},
{
"epoch": 0.3546099290780142,
"grad_norm": 3.7555336952209473,
"learning_rate": 4.967187999271306e-06,
"loss": 0.6501,
"step": 750
},
{
"epoch": 0.3550827423167849,
"grad_norm": 3.4279143810272217,
"learning_rate": 4.9670871838881615e-06,
"loss": 0.6326,
"step": 751
},
{
"epoch": 0.35555555555555557,
"grad_norm": 2.875066041946411,
"learning_rate": 4.9669862148903166e-06,
"loss": 0.664,
"step": 752
},
{
"epoch": 0.35602836879432626,
"grad_norm": 3.130394697189331,
"learning_rate": 4.966885092284057e-06,
"loss": 0.706,
"step": 753
},
{
"epoch": 0.35650118203309694,
"grad_norm": 2.9606287479400635,
"learning_rate": 4.96678381607568e-06,
"loss": 0.693,
"step": 754
},
{
"epoch": 0.35697399527186763,
"grad_norm": 3.0584909915924072,
"learning_rate": 4.966682386271491e-06,
"loss": 0.6034,
"step": 755
},
{
"epoch": 0.3574468085106383,
"grad_norm": 2.8215200901031494,
"learning_rate": 4.966580802877805e-06,
"loss": 0.6217,
"step": 756
},
{
"epoch": 0.357919621749409,
"grad_norm": 2.7348055839538574,
"learning_rate": 4.966479065900949e-06,
"loss": 0.6194,
"step": 757
},
{
"epoch": 0.3583924349881797,
"grad_norm": 3.2347466945648193,
"learning_rate": 4.966377175347257e-06,
"loss": 0.6377,
"step": 758
},
{
"epoch": 0.3588652482269504,
"grad_norm": 3.311845302581787,
"learning_rate": 4.966275131223072e-06,
"loss": 0.6234,
"step": 759
},
{
"epoch": 0.35933806146572106,
"grad_norm": 3.0384368896484375,
"learning_rate": 4.96617293353475e-06,
"loss": 0.609,
"step": 760
},
{
"epoch": 0.35981087470449175,
"grad_norm": 3.516854763031006,
"learning_rate": 4.966070582288653e-06,
"loss": 0.6627,
"step": 761
},
{
"epoch": 0.36028368794326243,
"grad_norm": 3.2425215244293213,
"learning_rate": 4.9659680774911534e-06,
"loss": 0.7355,
"step": 762
},
{
"epoch": 0.3607565011820331,
"grad_norm": 3.2665750980377197,
"learning_rate": 4.965865419148636e-06,
"loss": 0.6787,
"step": 763
},
{
"epoch": 0.3612293144208038,
"grad_norm": 2.729428291320801,
"learning_rate": 4.96576260726749e-06,
"loss": 0.6272,
"step": 764
},
{
"epoch": 0.3617021276595745,
"grad_norm": 3.299969434738159,
"learning_rate": 4.965659641854119e-06,
"loss": 0.6552,
"step": 765
},
{
"epoch": 0.3621749408983452,
"grad_norm": 2.7090916633605957,
"learning_rate": 4.965556522914934e-06,
"loss": 0.6661,
"step": 766
},
{
"epoch": 0.36264775413711586,
"grad_norm": 2.488846778869629,
"learning_rate": 4.965453250456355e-06,
"loss": 0.5821,
"step": 767
},
{
"epoch": 0.36312056737588655,
"grad_norm": 2.5267233848571777,
"learning_rate": 4.965349824484813e-06,
"loss": 0.5593,
"step": 768
},
{
"epoch": 0.36359338061465724,
"grad_norm": 3.0646679401397705,
"learning_rate": 4.965246245006748e-06,
"loss": 0.6341,
"step": 769
},
{
"epoch": 0.3640661938534279,
"grad_norm": 2.9877712726593018,
"learning_rate": 4.965142512028609e-06,
"loss": 0.7202,
"step": 770
},
{
"epoch": 0.3645390070921986,
"grad_norm": 3.7494113445281982,
"learning_rate": 4.965038625556854e-06,
"loss": 0.7643,
"step": 771
},
{
"epoch": 0.3650118203309693,
"grad_norm": 2.8382890224456787,
"learning_rate": 4.964934585597954e-06,
"loss": 0.6522,
"step": 772
},
{
"epoch": 0.3654846335697399,
"grad_norm": 3.091655731201172,
"learning_rate": 4.9648303921583854e-06,
"loss": 0.7117,
"step": 773
},
{
"epoch": 0.3659574468085106,
"grad_norm": 3.0608325004577637,
"learning_rate": 4.964726045244635e-06,
"loss": 0.6538,
"step": 774
},
{
"epoch": 0.3664302600472813,
"grad_norm": 2.8492867946624756,
"learning_rate": 4.964621544863203e-06,
"loss": 0.6079,
"step": 775
},
{
"epoch": 0.366903073286052,
"grad_norm": 3.0669894218444824,
"learning_rate": 4.964516891020594e-06,
"loss": 0.6223,
"step": 776
},
{
"epoch": 0.36737588652482267,
"grad_norm": 3.089984893798828,
"learning_rate": 4.964412083723325e-06,
"loss": 0.671,
"step": 777
},
{
"epoch": 0.36784869976359336,
"grad_norm": 2.905242443084717,
"learning_rate": 4.964307122977921e-06,
"loss": 0.62,
"step": 778
},
{
"epoch": 0.36832151300236404,
"grad_norm": 3.954436779022217,
"learning_rate": 4.964202008790918e-06,
"loss": 0.6535,
"step": 779
},
{
"epoch": 0.36879432624113473,
"grad_norm": 2.6026058197021484,
"learning_rate": 4.9640967411688615e-06,
"loss": 0.5865,
"step": 780
},
{
"epoch": 0.3692671394799054,
"grad_norm": 2.9876346588134766,
"learning_rate": 4.963991320118306e-06,
"loss": 0.6698,
"step": 781
},
{
"epoch": 0.3697399527186761,
"grad_norm": 2.9411263465881348,
"learning_rate": 4.963885745645815e-06,
"loss": 0.6173,
"step": 782
},
{
"epoch": 0.3702127659574468,
"grad_norm": 2.5679805278778076,
"learning_rate": 4.963780017757962e-06,
"loss": 0.6285,
"step": 783
},
{
"epoch": 0.3706855791962175,
"grad_norm": 3.3100640773773193,
"learning_rate": 4.963674136461332e-06,
"loss": 0.5968,
"step": 784
},
{
"epoch": 0.37115839243498816,
"grad_norm": 3.1293699741363525,
"learning_rate": 4.963568101762515e-06,
"loss": 0.697,
"step": 785
},
{
"epoch": 0.37163120567375885,
"grad_norm": 3.043853759765625,
"learning_rate": 4.963461913668115e-06,
"loss": 0.5881,
"step": 786
},
{
"epoch": 0.37210401891252953,
"grad_norm": 3.07351016998291,
"learning_rate": 4.963355572184744e-06,
"loss": 0.6307,
"step": 787
},
{
"epoch": 0.3725768321513002,
"grad_norm": 2.7381317615509033,
"learning_rate": 4.9632490773190225e-06,
"loss": 0.716,
"step": 788
},
{
"epoch": 0.3730496453900709,
"grad_norm": 2.892221450805664,
"learning_rate": 4.963142429077582e-06,
"loss": 0.6867,
"step": 789
},
{
"epoch": 0.3735224586288416,
"grad_norm": 3.133122205734253,
"learning_rate": 4.963035627467064e-06,
"loss": 0.659,
"step": 790
},
{
"epoch": 0.3739952718676123,
"grad_norm": 3.032599925994873,
"learning_rate": 4.962928672494116e-06,
"loss": 0.6848,
"step": 791
},
{
"epoch": 0.37446808510638296,
"grad_norm": 3.0076355934143066,
"learning_rate": 4.9628215641654e-06,
"loss": 0.6549,
"step": 792
},
{
"epoch": 0.37494089834515365,
"grad_norm": 2.8904454708099365,
"learning_rate": 4.962714302487585e-06,
"loss": 0.6484,
"step": 793
},
{
"epoch": 0.37541371158392434,
"grad_norm": 2.881364107131958,
"learning_rate": 4.9626068874673486e-06,
"loss": 0.721,
"step": 794
},
{
"epoch": 0.375886524822695,
"grad_norm": 3.11668062210083,
"learning_rate": 4.962499319111379e-06,
"loss": 0.7824,
"step": 795
},
{
"epoch": 0.3763593380614657,
"grad_norm": 2.9201436042785645,
"learning_rate": 4.962391597426374e-06,
"loss": 0.6911,
"step": 796
},
{
"epoch": 0.3768321513002364,
"grad_norm": 2.926598072052002,
"learning_rate": 4.962283722419043e-06,
"loss": 0.6715,
"step": 797
},
{
"epoch": 0.3773049645390071,
"grad_norm": 2.7267675399780273,
"learning_rate": 4.962175694096101e-06,
"loss": 0.6111,
"step": 798
},
{
"epoch": 0.37777777777777777,
"grad_norm": 3.194031000137329,
"learning_rate": 4.962067512464275e-06,
"loss": 0.6558,
"step": 799
},
{
"epoch": 0.37825059101654845,
"grad_norm": 2.6249136924743652,
"learning_rate": 4.9619591775303e-06,
"loss": 0.6166,
"step": 800
},
{
"epoch": 0.37872340425531914,
"grad_norm": 2.6356167793273926,
"learning_rate": 4.961850689300923e-06,
"loss": 0.6112,
"step": 801
},
{
"epoch": 0.3791962174940898,
"grad_norm": 3.030724287033081,
"learning_rate": 4.961742047782898e-06,
"loss": 0.6511,
"step": 802
},
{
"epoch": 0.3796690307328605,
"grad_norm": 3.4987757205963135,
"learning_rate": 4.96163325298299e-06,
"loss": 0.5888,
"step": 803
},
{
"epoch": 0.3801418439716312,
"grad_norm": 3.0371780395507812,
"learning_rate": 4.961524304907974e-06,
"loss": 0.6385,
"step": 804
},
{
"epoch": 0.3806146572104019,
"grad_norm": 3.302570104598999,
"learning_rate": 4.961415203564632e-06,
"loss": 0.6515,
"step": 805
},
{
"epoch": 0.38108747044917257,
"grad_norm": 2.7597038745880127,
"learning_rate": 4.961305948959759e-06,
"loss": 0.6126,
"step": 806
},
{
"epoch": 0.38156028368794326,
"grad_norm": 2.789811849594116,
"learning_rate": 4.9611965411001575e-06,
"loss": 0.6601,
"step": 807
},
{
"epoch": 0.38203309692671394,
"grad_norm": 3.0403921604156494,
"learning_rate": 4.961086979992639e-06,
"loss": 0.6947,
"step": 808
},
{
"epoch": 0.38250591016548463,
"grad_norm": 3.2139980792999268,
"learning_rate": 4.960977265644026e-06,
"loss": 0.6876,
"step": 809
},
{
"epoch": 0.3829787234042553,
"grad_norm": 2.918515205383301,
"learning_rate": 4.960867398061149e-06,
"loss": 0.5997,
"step": 810
},
{
"epoch": 0.383451536643026,
"grad_norm": 3.197636604309082,
"learning_rate": 4.9607573772508495e-06,
"loss": 0.5754,
"step": 811
},
{
"epoch": 0.3839243498817967,
"grad_norm": 2.8848466873168945,
"learning_rate": 4.960647203219979e-06,
"loss": 0.6424,
"step": 812
},
{
"epoch": 0.3843971631205674,
"grad_norm": 3.4810187816619873,
"learning_rate": 4.960536875975397e-06,
"loss": 0.6851,
"step": 813
},
{
"epoch": 0.38486997635933806,
"grad_norm": 3.713934898376465,
"learning_rate": 4.960426395523972e-06,
"loss": 0.6122,
"step": 814
},
{
"epoch": 0.38534278959810875,
"grad_norm": 2.862600803375244,
"learning_rate": 4.960315761872585e-06,
"loss": 0.6493,
"step": 815
},
{
"epoch": 0.38581560283687943,
"grad_norm": 3.133882522583008,
"learning_rate": 4.960204975028123e-06,
"loss": 0.7535,
"step": 816
},
{
"epoch": 0.3862884160756501,
"grad_norm": 3.1526732444763184,
"learning_rate": 4.960094034997485e-06,
"loss": 0.6512,
"step": 817
},
{
"epoch": 0.3867612293144208,
"grad_norm": 2.7213544845581055,
"learning_rate": 4.959982941787579e-06,
"loss": 0.6121,
"step": 818
},
{
"epoch": 0.3872340425531915,
"grad_norm": 3.4935851097106934,
"learning_rate": 4.9598716954053214e-06,
"loss": 0.7852,
"step": 819
},
{
"epoch": 0.3877068557919622,
"grad_norm": 2.691016435623169,
"learning_rate": 4.9597602958576395e-06,
"loss": 0.6861,
"step": 820
},
{
"epoch": 0.38817966903073287,
"grad_norm": 2.8621015548706055,
"learning_rate": 4.959648743151469e-06,
"loss": 0.6262,
"step": 821
},
{
"epoch": 0.38865248226950355,
"grad_norm": 3.3887462615966797,
"learning_rate": 4.959537037293758e-06,
"loss": 0.7103,
"step": 822
},
{
"epoch": 0.38912529550827424,
"grad_norm": 2.7565438747406006,
"learning_rate": 4.95942517829146e-06,
"loss": 0.6471,
"step": 823
},
{
"epoch": 0.3895981087470449,
"grad_norm": 2.7920358180999756,
"learning_rate": 4.959313166151541e-06,
"loss": 0.6239,
"step": 824
},
{
"epoch": 0.3900709219858156,
"grad_norm": 3.18904185295105,
"learning_rate": 4.959201000880973e-06,
"loss": 0.7461,
"step": 825
},
{
"epoch": 0.3905437352245863,
"grad_norm": 2.727872371673584,
"learning_rate": 4.959088682486743e-06,
"loss": 0.6333,
"step": 826
},
{
"epoch": 0.391016548463357,
"grad_norm": 2.906378746032715,
"learning_rate": 4.958976210975844e-06,
"loss": 0.7547,
"step": 827
},
{
"epoch": 0.39148936170212767,
"grad_norm": 2.96482515335083,
"learning_rate": 4.958863586355278e-06,
"loss": 0.6312,
"step": 828
},
{
"epoch": 0.39196217494089836,
"grad_norm": 3.2890889644622803,
"learning_rate": 4.958750808632059e-06,
"loss": 0.6943,
"step": 829
},
{
"epoch": 0.39243498817966904,
"grad_norm": 2.7004311084747314,
"learning_rate": 4.958637877813207e-06,
"loss": 0.5918,
"step": 830
},
{
"epoch": 0.39290780141843973,
"grad_norm": 2.7487950325012207,
"learning_rate": 4.9585247939057566e-06,
"loss": 0.6201,
"step": 831
},
{
"epoch": 0.3933806146572104,
"grad_norm": 2.7873897552490234,
"learning_rate": 4.958411556916747e-06,
"loss": 0.6268,
"step": 832
},
{
"epoch": 0.3938534278959811,
"grad_norm": 2.8501343727111816,
"learning_rate": 4.958298166853229e-06,
"loss": 0.7119,
"step": 833
},
{
"epoch": 0.3943262411347518,
"grad_norm": 3.0391547679901123,
"learning_rate": 4.958184623722265e-06,
"loss": 0.6375,
"step": 834
},
{
"epoch": 0.3947990543735225,
"grad_norm": 2.850520133972168,
"learning_rate": 4.958070927530922e-06,
"loss": 0.5962,
"step": 835
},
{
"epoch": 0.39527186761229316,
"grad_norm": 3.351914644241333,
"learning_rate": 4.957957078286281e-06,
"loss": 0.7247,
"step": 836
},
{
"epoch": 0.39574468085106385,
"grad_norm": 2.9559543132781982,
"learning_rate": 4.957843075995431e-06,
"loss": 0.6571,
"step": 837
},
{
"epoch": 0.39621749408983453,
"grad_norm": 3.225785255432129,
"learning_rate": 4.95772892066547e-06,
"loss": 0.7074,
"step": 838
},
{
"epoch": 0.3966903073286052,
"grad_norm": 2.7842373847961426,
"learning_rate": 4.957614612303505e-06,
"loss": 0.6469,
"step": 839
},
{
"epoch": 0.3971631205673759,
"grad_norm": 4.249724864959717,
"learning_rate": 4.957500150916655e-06,
"loss": 0.741,
"step": 840
},
{
"epoch": 0.3976359338061466,
"grad_norm": 3.138221263885498,
"learning_rate": 4.957385536512046e-06,
"loss": 0.6676,
"step": 841
},
{
"epoch": 0.3981087470449173,
"grad_norm": 3.456423759460449,
"learning_rate": 4.957270769096816e-06,
"loss": 0.6877,
"step": 842
},
{
"epoch": 0.39858156028368796,
"grad_norm": 2.8676278591156006,
"learning_rate": 4.957155848678109e-06,
"loss": 0.5986,
"step": 843
},
{
"epoch": 0.39905437352245865,
"grad_norm": 2.705324411392212,
"learning_rate": 4.957040775263082e-06,
"loss": 0.6356,
"step": 844
},
{
"epoch": 0.39952718676122934,
"grad_norm": 3.0767486095428467,
"learning_rate": 4.9569255488589e-06,
"loss": 0.6844,
"step": 845
},
{
"epoch": 0.4,
"grad_norm": 2.7787704467773438,
"learning_rate": 4.956810169472736e-06,
"loss": 0.6641,
"step": 846
},
{
"epoch": 0.4004728132387707,
"grad_norm": 2.584277868270874,
"learning_rate": 4.956694637111777e-06,
"loss": 0.6256,
"step": 847
},
{
"epoch": 0.4009456264775414,
"grad_norm": 2.751641273498535,
"learning_rate": 4.956578951783215e-06,
"loss": 0.5954,
"step": 848
},
{
"epoch": 0.4014184397163121,
"grad_norm": 3.0181658267974854,
"learning_rate": 4.956463113494253e-06,
"loss": 0.6569,
"step": 849
},
{
"epoch": 0.40189125295508277,
"grad_norm": 3.0933220386505127,
"learning_rate": 4.956347122252104e-06,
"loss": 0.6248,
"step": 850
},
{
"epoch": 0.40236406619385345,
"grad_norm": 3.3767428398132324,
"learning_rate": 4.956230978063991e-06,
"loss": 0.719,
"step": 851
},
{
"epoch": 0.40283687943262414,
"grad_norm": 3.7666573524475098,
"learning_rate": 4.956114680937145e-06,
"loss": 0.6467,
"step": 852
},
{
"epoch": 0.4033096926713948,
"grad_norm": 2.9836843013763428,
"learning_rate": 4.955998230878808e-06,
"loss": 0.6993,
"step": 853
},
{
"epoch": 0.4037825059101655,
"grad_norm": 2.981497049331665,
"learning_rate": 4.955881627896229e-06,
"loss": 0.6578,
"step": 854
},
{
"epoch": 0.40425531914893614,
"grad_norm": 3.1369056701660156,
"learning_rate": 4.955764871996672e-06,
"loss": 0.6763,
"step": 855
},
{
"epoch": 0.40472813238770683,
"grad_norm": 2.7675817012786865,
"learning_rate": 4.9556479631874036e-06,
"loss": 0.6488,
"step": 856
},
{
"epoch": 0.4052009456264775,
"grad_norm": 3.035334825515747,
"learning_rate": 4.9555309014757034e-06,
"loss": 0.7076,
"step": 857
},
{
"epoch": 0.4056737588652482,
"grad_norm": 3.493704319000244,
"learning_rate": 4.955413686868862e-06,
"loss": 0.6773,
"step": 858
},
{
"epoch": 0.4061465721040189,
"grad_norm": 3.245487928390503,
"learning_rate": 4.9552963193741765e-06,
"loss": 0.6915,
"step": 859
},
{
"epoch": 0.4066193853427896,
"grad_norm": 3.189969539642334,
"learning_rate": 4.955178798998956e-06,
"loss": 0.7318,
"step": 860
},
{
"epoch": 0.40709219858156026,
"grad_norm": 2.7987146377563477,
"learning_rate": 4.955061125750517e-06,
"loss": 0.6162,
"step": 861
},
{
"epoch": 0.40756501182033095,
"grad_norm": 3.020118474960327,
"learning_rate": 4.954943299636187e-06,
"loss": 0.6678,
"step": 862
},
{
"epoch": 0.40803782505910163,
"grad_norm": 2.715463876724243,
"learning_rate": 4.954825320663302e-06,
"loss": 0.668,
"step": 863
},
{
"epoch": 0.4085106382978723,
"grad_norm": 2.595050096511841,
"learning_rate": 4.9547071888392085e-06,
"loss": 0.6557,
"step": 864
},
{
"epoch": 0.408983451536643,
"grad_norm": 3.131596088409424,
"learning_rate": 4.954588904171261e-06,
"loss": 0.6548,
"step": 865
},
{
"epoch": 0.4094562647754137,
"grad_norm": 2.5742313861846924,
"learning_rate": 4.954470466666827e-06,
"loss": 0.6592,
"step": 866
},
{
"epoch": 0.4099290780141844,
"grad_norm": 2.8612802028656006,
"learning_rate": 4.9543518763332785e-06,
"loss": 0.5391,
"step": 867
},
{
"epoch": 0.41040189125295506,
"grad_norm": 2.8973186016082764,
"learning_rate": 4.954233133178001e-06,
"loss": 0.6649,
"step": 868
},
{
"epoch": 0.41087470449172575,
"grad_norm": 2.802525043487549,
"learning_rate": 4.954114237208388e-06,
"loss": 0.6212,
"step": 869
},
{
"epoch": 0.41134751773049644,
"grad_norm": 2.5919506549835205,
"learning_rate": 4.953995188431843e-06,
"loss": 0.6596,
"step": 870
},
{
"epoch": 0.4118203309692671,
"grad_norm": 3.139169454574585,
"learning_rate": 4.953875986855777e-06,
"loss": 0.6799,
"step": 871
},
{
"epoch": 0.4122931442080378,
"grad_norm": 3.99727725982666,
"learning_rate": 4.953756632487614e-06,
"loss": 0.6519,
"step": 872
},
{
"epoch": 0.4127659574468085,
"grad_norm": 3.238706350326538,
"learning_rate": 4.953637125334784e-06,
"loss": 0.7361,
"step": 873
},
{
"epoch": 0.4132387706855792,
"grad_norm": 2.780019998550415,
"learning_rate": 4.9535174654047295e-06,
"loss": 0.6406,
"step": 874
},
{
"epoch": 0.41371158392434987,
"grad_norm": 2.7629551887512207,
"learning_rate": 4.953397652704901e-06,
"loss": 0.6131,
"step": 875
},
{
"epoch": 0.41418439716312055,
"grad_norm": 2.8008246421813965,
"learning_rate": 4.9532776872427585e-06,
"loss": 0.6464,
"step": 876
},
{
"epoch": 0.41465721040189124,
"grad_norm": 3.0970115661621094,
"learning_rate": 4.953157569025772e-06,
"loss": 0.7066,
"step": 877
},
{
"epoch": 0.4151300236406619,
"grad_norm": 2.8375589847564697,
"learning_rate": 4.9530372980614195e-06,
"loss": 0.6551,
"step": 878
},
{
"epoch": 0.4156028368794326,
"grad_norm": 2.718843936920166,
"learning_rate": 4.952916874357191e-06,
"loss": 0.5947,
"step": 879
},
{
"epoch": 0.4160756501182033,
"grad_norm": 2.7104697227478027,
"learning_rate": 4.952796297920585e-06,
"loss": 0.6708,
"step": 880
},
{
"epoch": 0.416548463356974,
"grad_norm": 2.8223445415496826,
"learning_rate": 4.952675568759108e-06,
"loss": 0.6214,
"step": 881
},
{
"epoch": 0.41702127659574467,
"grad_norm": 2.6598153114318848,
"learning_rate": 4.952554686880279e-06,
"loss": 0.6116,
"step": 882
},
{
"epoch": 0.41749408983451536,
"grad_norm": 2.8639824390411377,
"learning_rate": 4.952433652291623e-06,
"loss": 0.5971,
"step": 883
},
{
"epoch": 0.41796690307328604,
"grad_norm": 2.9578304290771484,
"learning_rate": 4.952312465000677e-06,
"loss": 0.6785,
"step": 884
},
{
"epoch": 0.41843971631205673,
"grad_norm": 2.872144937515259,
"learning_rate": 4.952191125014987e-06,
"loss": 0.6772,
"step": 885
},
{
"epoch": 0.4189125295508274,
"grad_norm": 2.7513675689697266,
"learning_rate": 4.952069632342108e-06,
"loss": 0.702,
"step": 886
},
{
"epoch": 0.4193853427895981,
"grad_norm": 2.9275078773498535,
"learning_rate": 4.951947986989606e-06,
"loss": 0.589,
"step": 887
},
{
"epoch": 0.4198581560283688,
"grad_norm": 2.740549325942993,
"learning_rate": 4.951826188965053e-06,
"loss": 0.5942,
"step": 888
},
{
"epoch": 0.4203309692671395,
"grad_norm": 2.92452073097229,
"learning_rate": 4.951704238276035e-06,
"loss": 0.6819,
"step": 889
},
{
"epoch": 0.42080378250591016,
"grad_norm": 2.842491865158081,
"learning_rate": 4.951582134930144e-06,
"loss": 0.6304,
"step": 890
},
{
"epoch": 0.42127659574468085,
"grad_norm": 2.613478422164917,
"learning_rate": 4.951459878934983e-06,
"loss": 0.6912,
"step": 891
},
{
"epoch": 0.42174940898345153,
"grad_norm": 3.2408607006073,
"learning_rate": 4.951337470298165e-06,
"loss": 0.6755,
"step": 892
},
{
"epoch": 0.4222222222222222,
"grad_norm": 3.1022439002990723,
"learning_rate": 4.9512149090273125e-06,
"loss": 0.6138,
"step": 893
},
{
"epoch": 0.4226950354609929,
"grad_norm": 2.6418895721435547,
"learning_rate": 4.951092195130055e-06,
"loss": 0.639,
"step": 894
},
{
"epoch": 0.4231678486997636,
"grad_norm": 3.010744333267212,
"learning_rate": 4.950969328614035e-06,
"loss": 0.7102,
"step": 895
},
{
"epoch": 0.4236406619385343,
"grad_norm": 2.673292636871338,
"learning_rate": 4.950846309486901e-06,
"loss": 0.5676,
"step": 896
},
{
"epoch": 0.42411347517730497,
"grad_norm": 3.6974737644195557,
"learning_rate": 4.950723137756314e-06,
"loss": 0.5722,
"step": 897
},
{
"epoch": 0.42458628841607565,
"grad_norm": 3.69028902053833,
"learning_rate": 4.9505998134299435e-06,
"loss": 0.6337,
"step": 898
},
{
"epoch": 0.42505910165484634,
"grad_norm": 3.2136125564575195,
"learning_rate": 4.950476336515469e-06,
"loss": 0.6469,
"step": 899
},
{
"epoch": 0.425531914893617,
"grad_norm": 2.7396016120910645,
"learning_rate": 4.950352707020577e-06,
"loss": 0.6656,
"step": 900
},
{
"epoch": 0.4260047281323877,
"grad_norm": 2.825416088104248,
"learning_rate": 4.950228924952967e-06,
"loss": 0.6298,
"step": 901
},
{
"epoch": 0.4264775413711584,
"grad_norm": 3.401658535003662,
"learning_rate": 4.950104990320345e-06,
"loss": 0.778,
"step": 902
},
{
"epoch": 0.4269503546099291,
"grad_norm": 2.7002272605895996,
"learning_rate": 4.9499809031304294e-06,
"loss": 0.6536,
"step": 903
},
{
"epoch": 0.42742316784869977,
"grad_norm": 2.62386417388916,
"learning_rate": 4.949856663390945e-06,
"loss": 0.6629,
"step": 904
},
{
"epoch": 0.42789598108747046,
"grad_norm": 2.584247589111328,
"learning_rate": 4.94973227110963e-06,
"loss": 0.5813,
"step": 905
},
{
"epoch": 0.42836879432624114,
"grad_norm": 3.4365768432617188,
"learning_rate": 4.9496077262942265e-06,
"loss": 0.7648,
"step": 906
},
{
"epoch": 0.42884160756501183,
"grad_norm": 2.8993639945983887,
"learning_rate": 4.949483028952492e-06,
"loss": 0.6696,
"step": 907
},
{
"epoch": 0.4293144208037825,
"grad_norm": 2.922809362411499,
"learning_rate": 4.94935817909219e-06,
"loss": 0.6892,
"step": 908
},
{
"epoch": 0.4297872340425532,
"grad_norm": 2.85478138923645,
"learning_rate": 4.9492331767210944e-06,
"loss": 0.536,
"step": 909
},
{
"epoch": 0.4302600472813239,
"grad_norm": 2.8639259338378906,
"learning_rate": 4.949108021846988e-06,
"loss": 0.634,
"step": 910
},
{
"epoch": 0.4307328605200946,
"grad_norm": 3.0533697605133057,
"learning_rate": 4.948982714477664e-06,
"loss": 0.6318,
"step": 911
},
{
"epoch": 0.43120567375886526,
"grad_norm": 2.331674814224243,
"learning_rate": 4.9488572546209255e-06,
"loss": 0.6562,
"step": 912
},
{
"epoch": 0.43167848699763595,
"grad_norm": 3.0154623985290527,
"learning_rate": 4.9487316422845835e-06,
"loss": 0.6675,
"step": 913
},
{
"epoch": 0.43215130023640663,
"grad_norm": 2.7354514598846436,
"learning_rate": 4.948605877476459e-06,
"loss": 0.6012,
"step": 914
},
{
"epoch": 0.4326241134751773,
"grad_norm": 2.863736629486084,
"learning_rate": 4.948479960204383e-06,
"loss": 0.6062,
"step": 915
},
{
"epoch": 0.433096926713948,
"grad_norm": 3.01998233795166,
"learning_rate": 4.948353890476197e-06,
"loss": 0.6749,
"step": 916
},
{
"epoch": 0.4335697399527187,
"grad_norm": 2.7550456523895264,
"learning_rate": 4.94822766829975e-06,
"loss": 0.6507,
"step": 917
},
{
"epoch": 0.4340425531914894,
"grad_norm": 3.370572805404663,
"learning_rate": 4.948101293682901e-06,
"loss": 0.714,
"step": 918
},
{
"epoch": 0.43451536643026006,
"grad_norm": 2.9736790657043457,
"learning_rate": 4.947974766633519e-06,
"loss": 0.729,
"step": 919
},
{
"epoch": 0.43498817966903075,
"grad_norm": 3.1036548614501953,
"learning_rate": 4.947848087159483e-06,
"loss": 0.7547,
"step": 920
},
{
"epoch": 0.43546099290780144,
"grad_norm": 2.895094871520996,
"learning_rate": 4.947721255268679e-06,
"loss": 0.6089,
"step": 921
},
{
"epoch": 0.4359338061465721,
"grad_norm": 2.798476219177246,
"learning_rate": 4.947594270969005e-06,
"loss": 0.5432,
"step": 922
},
{
"epoch": 0.4364066193853428,
"grad_norm": 2.7675702571868896,
"learning_rate": 4.94746713426837e-06,
"loss": 0.5693,
"step": 923
},
{
"epoch": 0.4368794326241135,
"grad_norm": 2.6851553916931152,
"learning_rate": 4.947339845174687e-06,
"loss": 0.6503,
"step": 924
},
{
"epoch": 0.4373522458628842,
"grad_norm": 2.909635543823242,
"learning_rate": 4.947212403695883e-06,
"loss": 0.6494,
"step": 925
},
{
"epoch": 0.43782505910165487,
"grad_norm": 2.604526996612549,
"learning_rate": 4.947084809839894e-06,
"loss": 0.6349,
"step": 926
},
{
"epoch": 0.43829787234042555,
"grad_norm": 3.118149518966675,
"learning_rate": 4.946957063614664e-06,
"loss": 0.6219,
"step": 927
},
{
"epoch": 0.43877068557919624,
"grad_norm": 2.7452616691589355,
"learning_rate": 4.9468291650281465e-06,
"loss": 0.6096,
"step": 928
},
{
"epoch": 0.4392434988179669,
"grad_norm": 3.30098819732666,
"learning_rate": 4.946701114088307e-06,
"loss": 0.6277,
"step": 929
},
{
"epoch": 0.4397163120567376,
"grad_norm": 2.789482593536377,
"learning_rate": 4.946572910803116e-06,
"loss": 0.7,
"step": 930
},
{
"epoch": 0.4401891252955083,
"grad_norm": 2.7283935546875,
"learning_rate": 4.946444555180559e-06,
"loss": 0.5375,
"step": 931
},
{
"epoch": 0.440661938534279,
"grad_norm": 3.101304054260254,
"learning_rate": 4.946316047228627e-06,
"loss": 0.6131,
"step": 932
},
{
"epoch": 0.44113475177304967,
"grad_norm": 3.573908805847168,
"learning_rate": 4.946187386955321e-06,
"loss": 0.7073,
"step": 933
},
{
"epoch": 0.44160756501182036,
"grad_norm": 3.214979648590088,
"learning_rate": 4.946058574368653e-06,
"loss": 0.6508,
"step": 934
},
{
"epoch": 0.44208037825059104,
"grad_norm": 3.145082712173462,
"learning_rate": 4.945929609476643e-06,
"loss": 0.64,
"step": 935
},
{
"epoch": 0.4425531914893617,
"grad_norm": 2.991780996322632,
"learning_rate": 4.945800492287321e-06,
"loss": 0.6315,
"step": 936
},
{
"epoch": 0.44302600472813236,
"grad_norm": 3.2441139221191406,
"learning_rate": 4.945671222808727e-06,
"loss": 0.7144,
"step": 937
},
{
"epoch": 0.44349881796690305,
"grad_norm": 2.9397029876708984,
"learning_rate": 4.94554180104891e-06,
"loss": 0.6818,
"step": 938
},
{
"epoch": 0.44397163120567373,
"grad_norm": 3.2471461296081543,
"learning_rate": 4.945412227015929e-06,
"loss": 0.6921,
"step": 939
},
{
"epoch": 0.4444444444444444,
"grad_norm": 3.0882487297058105,
"learning_rate": 4.945282500717851e-06,
"loss": 0.718,
"step": 940
},
{
"epoch": 0.4449172576832151,
"grad_norm": 2.6035783290863037,
"learning_rate": 4.945152622162753e-06,
"loss": 0.621,
"step": 941
},
{
"epoch": 0.4453900709219858,
"grad_norm": 2.83659029006958,
"learning_rate": 4.945022591358724e-06,
"loss": 0.6403,
"step": 942
},
{
"epoch": 0.4458628841607565,
"grad_norm": 2.824463129043579,
"learning_rate": 4.944892408313859e-06,
"loss": 0.6594,
"step": 943
},
{
"epoch": 0.44633569739952716,
"grad_norm": 2.753735065460205,
"learning_rate": 4.9447620730362645e-06,
"loss": 0.6116,
"step": 944
},
{
"epoch": 0.44680851063829785,
"grad_norm": 3.0659725666046143,
"learning_rate": 4.944631585534056e-06,
"loss": 0.5983,
"step": 945
},
{
"epoch": 0.44728132387706854,
"grad_norm": 2.969113349914551,
"learning_rate": 4.944500945815357e-06,
"loss": 0.6859,
"step": 946
},
{
"epoch": 0.4477541371158392,
"grad_norm": 2.810303211212158,
"learning_rate": 4.944370153888303e-06,
"loss": 0.7025,
"step": 947
},
{
"epoch": 0.4482269503546099,
"grad_norm": 3.027721643447876,
"learning_rate": 4.944239209761038e-06,
"loss": 0.7268,
"step": 948
},
{
"epoch": 0.4486997635933806,
"grad_norm": 2.661503314971924,
"learning_rate": 4.944108113441716e-06,
"loss": 0.6702,
"step": 949
},
{
"epoch": 0.4491725768321513,
"grad_norm": 2.738591432571411,
"learning_rate": 4.943976864938498e-06,
"loss": 0.6728,
"step": 950
},
{
"epoch": 0.44964539007092197,
"grad_norm": 3.447505474090576,
"learning_rate": 4.943845464259557e-06,
"loss": 0.6586,
"step": 951
},
{
"epoch": 0.45011820330969265,
"grad_norm": 3.0968854427337646,
"learning_rate": 4.943713911413075e-06,
"loss": 0.7666,
"step": 952
},
{
"epoch": 0.45059101654846334,
"grad_norm": 2.4113779067993164,
"learning_rate": 4.943582206407244e-06,
"loss": 0.6173,
"step": 953
},
{
"epoch": 0.451063829787234,
"grad_norm": 2.6357979774475098,
"learning_rate": 4.943450349250263e-06,
"loss": 0.5589,
"step": 954
},
{
"epoch": 0.4515366430260047,
"grad_norm": 2.9182233810424805,
"learning_rate": 4.9433183399503425e-06,
"loss": 0.6252,
"step": 955
},
{
"epoch": 0.4520094562647754,
"grad_norm": 2.832740306854248,
"learning_rate": 4.943186178515703e-06,
"loss": 0.6882,
"step": 956
},
{
"epoch": 0.4524822695035461,
"grad_norm": 2.9508981704711914,
"learning_rate": 4.943053864954574e-06,
"loss": 0.5722,
"step": 957
},
{
"epoch": 0.4529550827423168,
"grad_norm": 3.044729471206665,
"learning_rate": 4.9429213992751925e-06,
"loss": 0.6772,
"step": 958
},
{
"epoch": 0.45342789598108746,
"grad_norm": 2.606003522872925,
"learning_rate": 4.9427887814858075e-06,
"loss": 0.6445,
"step": 959
},
{
"epoch": 0.45390070921985815,
"grad_norm": 2.4634225368499756,
"learning_rate": 4.942656011594676e-06,
"loss": 0.6151,
"step": 960
},
{
"epoch": 0.45437352245862883,
"grad_norm": 2.8872334957122803,
"learning_rate": 4.942523089610066e-06,
"loss": 0.6255,
"step": 961
},
{
"epoch": 0.4548463356973995,
"grad_norm": 2.870605707168579,
"learning_rate": 4.942390015540253e-06,
"loss": 0.7481,
"step": 962
},
{
"epoch": 0.4553191489361702,
"grad_norm": 2.952680826187134,
"learning_rate": 4.942256789393524e-06,
"loss": 0.5556,
"step": 963
},
{
"epoch": 0.4557919621749409,
"grad_norm": 2.623680353164673,
"learning_rate": 4.9421234111781725e-06,
"loss": 0.6115,
"step": 964
},
{
"epoch": 0.4562647754137116,
"grad_norm": 2.6933600902557373,
"learning_rate": 4.941989880902505e-06,
"loss": 0.6102,
"step": 965
},
{
"epoch": 0.45673758865248226,
"grad_norm": 2.6047189235687256,
"learning_rate": 4.941856198574836e-06,
"loss": 0.612,
"step": 966
},
{
"epoch": 0.45721040189125295,
"grad_norm": 2.779186725616455,
"learning_rate": 4.9417223642034885e-06,
"loss": 0.5424,
"step": 967
},
{
"epoch": 0.45768321513002364,
"grad_norm": 2.6177165508270264,
"learning_rate": 4.941588377796795e-06,
"loss": 0.4661,
"step": 968
},
{
"epoch": 0.4581560283687943,
"grad_norm": 2.959676742553711,
"learning_rate": 4.941454239363101e-06,
"loss": 0.6966,
"step": 969
},
{
"epoch": 0.458628841607565,
"grad_norm": 2.9788379669189453,
"learning_rate": 4.941319948910756e-06,
"loss": 0.6181,
"step": 970
},
{
"epoch": 0.4591016548463357,
"grad_norm": 4.642750263214111,
"learning_rate": 4.941185506448122e-06,
"loss": 0.5602,
"step": 971
},
{
"epoch": 0.4595744680851064,
"grad_norm": 2.793002128601074,
"learning_rate": 4.941050911983572e-06,
"loss": 0.602,
"step": 972
},
{
"epoch": 0.46004728132387707,
"grad_norm": 2.6833035945892334,
"learning_rate": 4.9409161655254845e-06,
"loss": 0.5549,
"step": 973
},
{
"epoch": 0.46052009456264775,
"grad_norm": 3.905032157897949,
"learning_rate": 4.94078126708225e-06,
"loss": 0.6335,
"step": 974
},
{
"epoch": 0.46099290780141844,
"grad_norm": 2.922609329223633,
"learning_rate": 4.94064621666227e-06,
"loss": 0.5839,
"step": 975
},
{
"epoch": 0.4614657210401891,
"grad_norm": 2.8277416229248047,
"learning_rate": 4.940511014273952e-06,
"loss": 0.629,
"step": 976
},
{
"epoch": 0.4619385342789598,
"grad_norm": 3.07511043548584,
"learning_rate": 4.940375659925714e-06,
"loss": 0.7058,
"step": 977
},
{
"epoch": 0.4624113475177305,
"grad_norm": 3.65043044090271,
"learning_rate": 4.940240153625984e-06,
"loss": 0.7174,
"step": 978
},
{
"epoch": 0.4628841607565012,
"grad_norm": 2.755167245864868,
"learning_rate": 4.9401044953832e-06,
"loss": 0.6548,
"step": 979
},
{
"epoch": 0.46335697399527187,
"grad_norm": 2.9881057739257812,
"learning_rate": 4.939968685205808e-06,
"loss": 0.6245,
"step": 980
},
{
"epoch": 0.46382978723404256,
"grad_norm": 2.9484212398529053,
"learning_rate": 4.939832723102266e-06,
"loss": 0.655,
"step": 981
},
{
"epoch": 0.46430260047281324,
"grad_norm": 2.898918628692627,
"learning_rate": 4.939696609081038e-06,
"loss": 0.6178,
"step": 982
},
{
"epoch": 0.46477541371158393,
"grad_norm": 2.7052435874938965,
"learning_rate": 4.9395603431506e-06,
"loss": 0.6393,
"step": 983
},
{
"epoch": 0.4652482269503546,
"grad_norm": 2.5610013008117676,
"learning_rate": 4.939423925319436e-06,
"loss": 0.4847,
"step": 984
},
{
"epoch": 0.4657210401891253,
"grad_norm": 3.229083299636841,
"learning_rate": 4.939287355596042e-06,
"loss": 0.6473,
"step": 985
},
{
"epoch": 0.466193853427896,
"grad_norm": 2.907097816467285,
"learning_rate": 4.9391506339889195e-06,
"loss": 0.652,
"step": 986
},
{
"epoch": 0.4666666666666667,
"grad_norm": 2.6929478645324707,
"learning_rate": 4.939013760506582e-06,
"loss": 0.6175,
"step": 987
},
{
"epoch": 0.46713947990543736,
"grad_norm": 3.414813280105591,
"learning_rate": 4.938876735157554e-06,
"loss": 0.7597,
"step": 988
},
{
"epoch": 0.46761229314420805,
"grad_norm": 3.297360420227051,
"learning_rate": 4.938739557950365e-06,
"loss": 0.6824,
"step": 989
},
{
"epoch": 0.46808510638297873,
"grad_norm": 3.083155393600464,
"learning_rate": 4.938602228893557e-06,
"loss": 0.6505,
"step": 990
},
{
"epoch": 0.4685579196217494,
"grad_norm": 2.9781153202056885,
"learning_rate": 4.938464747995681e-06,
"loss": 0.666,
"step": 991
},
{
"epoch": 0.4690307328605201,
"grad_norm": 3.1494534015655518,
"learning_rate": 4.9383271152652975e-06,
"loss": 0.6422,
"step": 992
},
{
"epoch": 0.4695035460992908,
"grad_norm": 2.547868490219116,
"learning_rate": 4.938189330710976e-06,
"loss": 0.5766,
"step": 993
},
{
"epoch": 0.4699763593380615,
"grad_norm": 2.684736967086792,
"learning_rate": 4.938051394341297e-06,
"loss": 0.6407,
"step": 994
},
{
"epoch": 0.47044917257683216,
"grad_norm": 2.9619693756103516,
"learning_rate": 4.937913306164847e-06,
"loss": 0.6936,
"step": 995
},
{
"epoch": 0.47092198581560285,
"grad_norm": 2.9698498249053955,
"learning_rate": 4.937775066190227e-06,
"loss": 0.6464,
"step": 996
},
{
"epoch": 0.47139479905437354,
"grad_norm": 3.121049642562866,
"learning_rate": 4.937636674426042e-06,
"loss": 0.6383,
"step": 997
},
{
"epoch": 0.4718676122931442,
"grad_norm": 3.113672971725464,
"learning_rate": 4.93749813088091e-06,
"loss": 0.6892,
"step": 998
},
{
"epoch": 0.4723404255319149,
"grad_norm": 3.126113176345825,
"learning_rate": 4.937359435563458e-06,
"loss": 0.6728,
"step": 999
},
{
"epoch": 0.4728132387706856,
"grad_norm": 3.353966236114502,
"learning_rate": 4.937220588482321e-06,
"loss": 0.6041,
"step": 1000
},
{
"epoch": 0.4732860520094563,
"grad_norm": 2.8860628604888916,
"learning_rate": 4.937081589646144e-06,
"loss": 0.6798,
"step": 1001
},
{
"epoch": 0.47375886524822697,
"grad_norm": 3.0510590076446533,
"learning_rate": 4.936942439063584e-06,
"loss": 0.5841,
"step": 1002
},
{
"epoch": 0.47423167848699765,
"grad_norm": 2.6998369693756104,
"learning_rate": 4.936803136743303e-06,
"loss": 0.6403,
"step": 1003
},
{
"epoch": 0.47470449172576834,
"grad_norm": 2.875347137451172,
"learning_rate": 4.9366636826939765e-06,
"loss": 0.5811,
"step": 1004
},
{
"epoch": 0.475177304964539,
"grad_norm": 2.9122262001037598,
"learning_rate": 4.936524076924287e-06,
"loss": 0.6852,
"step": 1005
},
{
"epoch": 0.4756501182033097,
"grad_norm": 2.5167057514190674,
"learning_rate": 4.9363843194429265e-06,
"loss": 0.5367,
"step": 1006
},
{
"epoch": 0.4761229314420804,
"grad_norm": 2.5745551586151123,
"learning_rate": 4.9362444102585985e-06,
"loss": 0.6241,
"step": 1007
},
{
"epoch": 0.4765957446808511,
"grad_norm": 2.5024216175079346,
"learning_rate": 4.9361043493800125e-06,
"loss": 0.6133,
"step": 1008
},
{
"epoch": 0.47706855791962177,
"grad_norm": 2.7281384468078613,
"learning_rate": 4.935964136815892e-06,
"loss": 0.6834,
"step": 1009
},
{
"epoch": 0.47754137115839246,
"grad_norm": 3.0118913650512695,
"learning_rate": 4.935823772574965e-06,
"loss": 0.6922,
"step": 1010
},
{
"epoch": 0.47801418439716314,
"grad_norm": 3.016216993331909,
"learning_rate": 4.935683256665973e-06,
"loss": 0.6653,
"step": 1011
},
{
"epoch": 0.47848699763593383,
"grad_norm": 2.9526784420013428,
"learning_rate": 4.9355425890976636e-06,
"loss": 0.6423,
"step": 1012
},
{
"epoch": 0.4789598108747045,
"grad_norm": 6.222797393798828,
"learning_rate": 4.9354017698787985e-06,
"loss": 0.5884,
"step": 1013
},
{
"epoch": 0.4794326241134752,
"grad_norm": 2.6553597450256348,
"learning_rate": 4.935260799018143e-06,
"loss": 0.6624,
"step": 1014
},
{
"epoch": 0.4799054373522459,
"grad_norm": 3.0942065715789795,
"learning_rate": 4.935119676524475e-06,
"loss": 0.6623,
"step": 1015
},
{
"epoch": 0.4803782505910166,
"grad_norm": 2.626359224319458,
"learning_rate": 4.934978402406585e-06,
"loss": 0.6195,
"step": 1016
},
{
"epoch": 0.4808510638297872,
"grad_norm": 2.7954699993133545,
"learning_rate": 4.934836976673265e-06,
"loss": 0.5545,
"step": 1017
},
{
"epoch": 0.4813238770685579,
"grad_norm": 2.913557291030884,
"learning_rate": 4.934695399333324e-06,
"loss": 0.6288,
"step": 1018
},
{
"epoch": 0.4817966903073286,
"grad_norm": 3.1043739318847656,
"learning_rate": 4.9345536703955746e-06,
"loss": 0.6771,
"step": 1019
},
{
"epoch": 0.48226950354609927,
"grad_norm": 2.789357900619507,
"learning_rate": 4.934411789868845e-06,
"loss": 0.6227,
"step": 1020
},
{
"epoch": 0.48274231678486995,
"grad_norm": 2.480609655380249,
"learning_rate": 4.934269757761967e-06,
"loss": 0.5779,
"step": 1021
},
{
"epoch": 0.48321513002364064,
"grad_norm": 2.7946252822875977,
"learning_rate": 4.934127574083785e-06,
"loss": 0.6166,
"step": 1022
},
{
"epoch": 0.4836879432624113,
"grad_norm": 3.0670509338378906,
"learning_rate": 4.933985238843153e-06,
"loss": 0.7766,
"step": 1023
},
{
"epoch": 0.484160756501182,
"grad_norm": 2.8567559719085693,
"learning_rate": 4.933842752048932e-06,
"loss": 0.5088,
"step": 1024
},
{
"epoch": 0.4846335697399527,
"grad_norm": 2.5674657821655273,
"learning_rate": 4.933700113709996e-06,
"loss": 0.6036,
"step": 1025
},
{
"epoch": 0.4851063829787234,
"grad_norm": 2.782339096069336,
"learning_rate": 4.933557323835224e-06,
"loss": 0.5335,
"step": 1026
},
{
"epoch": 0.48557919621749407,
"grad_norm": 2.6334071159362793,
"learning_rate": 4.93341438243351e-06,
"loss": 0.6327,
"step": 1027
},
{
"epoch": 0.48605200945626476,
"grad_norm": 3.0853965282440186,
"learning_rate": 4.933271289513751e-06,
"loss": 0.7102,
"step": 1028
},
{
"epoch": 0.48652482269503544,
"grad_norm": 2.619997501373291,
"learning_rate": 4.933128045084859e-06,
"loss": 0.6138,
"step": 1029
},
{
"epoch": 0.48699763593380613,
"grad_norm": 2.8316116333007812,
"learning_rate": 4.932984649155753e-06,
"loss": 0.6346,
"step": 1030
},
{
"epoch": 0.4874704491725768,
"grad_norm": 3.153486490249634,
"learning_rate": 4.932841101735361e-06,
"loss": 0.7626,
"step": 1031
},
{
"epoch": 0.4879432624113475,
"grad_norm": 3.1831274032592773,
"learning_rate": 4.9326974028326214e-06,
"loss": 0.6607,
"step": 1032
},
{
"epoch": 0.4884160756501182,
"grad_norm": 2.791078567504883,
"learning_rate": 4.932553552456481e-06,
"loss": 0.6141,
"step": 1033
},
{
"epoch": 0.4888888888888889,
"grad_norm": 2.627263307571411,
"learning_rate": 4.932409550615898e-06,
"loss": 0.6777,
"step": 1034
},
{
"epoch": 0.48936170212765956,
"grad_norm": 2.8550007343292236,
"learning_rate": 4.932265397319838e-06,
"loss": 0.6379,
"step": 1035
},
{
"epoch": 0.48983451536643025,
"grad_norm": 4.505824089050293,
"learning_rate": 4.932121092577276e-06,
"loss": 0.5892,
"step": 1036
},
{
"epoch": 0.49030732860520093,
"grad_norm": 3.100191116333008,
"learning_rate": 4.931976636397199e-06,
"loss": 0.6443,
"step": 1037
},
{
"epoch": 0.4907801418439716,
"grad_norm": 2.921494245529175,
"learning_rate": 4.9318320287886e-06,
"loss": 0.6821,
"step": 1038
},
{
"epoch": 0.4912529550827423,
"grad_norm": 4.577807903289795,
"learning_rate": 4.931687269760485e-06,
"loss": 0.5946,
"step": 1039
},
{
"epoch": 0.491725768321513,
"grad_norm": 2.7347636222839355,
"learning_rate": 4.931542359321865e-06,
"loss": 0.5689,
"step": 1040
},
{
"epoch": 0.4921985815602837,
"grad_norm": 2.5289158821105957,
"learning_rate": 4.931397297481765e-06,
"loss": 0.5632,
"step": 1041
},
{
"epoch": 0.49267139479905436,
"grad_norm": 3.3518471717834473,
"learning_rate": 4.9312520842492165e-06,
"loss": 0.6349,
"step": 1042
},
{
"epoch": 0.49314420803782505,
"grad_norm": 3.0469748973846436,
"learning_rate": 4.931106719633261e-06,
"loss": 0.5734,
"step": 1043
},
{
"epoch": 0.49361702127659574,
"grad_norm": 3.104682445526123,
"learning_rate": 4.930961203642951e-06,
"loss": 0.6101,
"step": 1044
},
{
"epoch": 0.4940898345153664,
"grad_norm": 2.776705503463745,
"learning_rate": 4.930815536287346e-06,
"loss": 0.6397,
"step": 1045
},
{
"epoch": 0.4945626477541371,
"grad_norm": 2.760380983352661,
"learning_rate": 4.930669717575516e-06,
"loss": 0.668,
"step": 1046
},
{
"epoch": 0.4950354609929078,
"grad_norm": 2.70084547996521,
"learning_rate": 4.930523747516541e-06,
"loss": 0.5729,
"step": 1047
},
{
"epoch": 0.4955082742316785,
"grad_norm": 2.7319583892822266,
"learning_rate": 4.930377626119511e-06,
"loss": 0.6258,
"step": 1048
},
{
"epoch": 0.49598108747044917,
"grad_norm": 3.2515223026275635,
"learning_rate": 4.930231353393521e-06,
"loss": 0.7412,
"step": 1049
},
{
"epoch": 0.49645390070921985,
"grad_norm": 3.0646486282348633,
"learning_rate": 4.930084929347682e-06,
"loss": 0.5809,
"step": 1050
},
{
"epoch": 0.49692671394799054,
"grad_norm": 3.1621921062469482,
"learning_rate": 4.9299383539911096e-06,
"loss": 0.6282,
"step": 1051
},
{
"epoch": 0.4973995271867612,
"grad_norm": 2.864713191986084,
"learning_rate": 4.929791627332931e-06,
"loss": 0.6263,
"step": 1052
},
{
"epoch": 0.4978723404255319,
"grad_norm": 3.181016683578491,
"learning_rate": 4.929644749382283e-06,
"loss": 0.5697,
"step": 1053
},
{
"epoch": 0.4983451536643026,
"grad_norm": 2.9064836502075195,
"learning_rate": 4.929497720148309e-06,
"loss": 0.6161,
"step": 1054
},
{
"epoch": 0.4988179669030733,
"grad_norm": 3.058112859725952,
"learning_rate": 4.9293505396401655e-06,
"loss": 0.6477,
"step": 1055
},
{
"epoch": 0.49929078014184397,
"grad_norm": 2.5227596759796143,
"learning_rate": 4.929203207867016e-06,
"loss": 0.5819,
"step": 1056
},
{
"epoch": 0.49976359338061466,
"grad_norm": 3.386862277984619,
"learning_rate": 4.929055724838035e-06,
"loss": 0.7342,
"step": 1057
},
{
"epoch": 0.5002364066193853,
"grad_norm": 3.368346929550171,
"learning_rate": 4.928908090562404e-06,
"loss": 0.6622,
"step": 1058
},
{
"epoch": 0.500709219858156,
"grad_norm": 2.9108314514160156,
"learning_rate": 4.928760305049317e-06,
"loss": 0.6598,
"step": 1059
},
{
"epoch": 0.5011820330969267,
"grad_norm": 2.822305917739868,
"learning_rate": 4.928612368307977e-06,
"loss": 0.5841,
"step": 1060
},
{
"epoch": 0.5016548463356973,
"grad_norm": 2.689131259918213,
"learning_rate": 4.928464280347592e-06,
"loss": 0.6631,
"step": 1061
},
{
"epoch": 0.502127659574468,
"grad_norm": 3.337214946746826,
"learning_rate": 4.9283160411773864e-06,
"loss": 0.6105,
"step": 1062
},
{
"epoch": 0.5026004728132387,
"grad_norm": 3.035911798477173,
"learning_rate": 4.928167650806588e-06,
"loss": 0.6981,
"step": 1063
},
{
"epoch": 0.5030732860520094,
"grad_norm": 2.8820855617523193,
"learning_rate": 4.9280191092444375e-06,
"loss": 0.6408,
"step": 1064
},
{
"epoch": 0.5035460992907801,
"grad_norm": 3.080432415008545,
"learning_rate": 4.927870416500183e-06,
"loss": 0.6398,
"step": 1065
},
{
"epoch": 0.5040189125295508,
"grad_norm": 2.761612892150879,
"learning_rate": 4.927721572583084e-06,
"loss": 0.6126,
"step": 1066
},
{
"epoch": 0.5044917257683215,
"grad_norm": 2.8561882972717285,
"learning_rate": 4.927572577502408e-06,
"loss": 0.584,
"step": 1067
},
{
"epoch": 0.5049645390070922,
"grad_norm": 3.3386311531066895,
"learning_rate": 4.927423431267432e-06,
"loss": 0.6666,
"step": 1068
},
{
"epoch": 0.5054373522458628,
"grad_norm": 2.632906675338745,
"learning_rate": 4.927274133887443e-06,
"loss": 0.632,
"step": 1069
},
{
"epoch": 0.5059101654846335,
"grad_norm": 2.8737308979034424,
"learning_rate": 4.927124685371737e-06,
"loss": 0.6051,
"step": 1070
},
{
"epoch": 0.5063829787234042,
"grad_norm": 3.042222738265991,
"learning_rate": 4.926975085729619e-06,
"loss": 0.6954,
"step": 1071
},
{
"epoch": 0.5068557919621749,
"grad_norm": 3.3341481685638428,
"learning_rate": 4.926825334970404e-06,
"loss": 0.7148,
"step": 1072
},
{
"epoch": 0.5073286052009456,
"grad_norm": 2.7415387630462646,
"learning_rate": 4.926675433103418e-06,
"loss": 0.5456,
"step": 1073
},
{
"epoch": 0.5078014184397163,
"grad_norm": 2.7545325756073,
"learning_rate": 4.926525380137993e-06,
"loss": 0.6213,
"step": 1074
},
{
"epoch": 0.508274231678487,
"grad_norm": 2.9153690338134766,
"learning_rate": 4.926375176083472e-06,
"loss": 0.6466,
"step": 1075
},
{
"epoch": 0.5087470449172576,
"grad_norm": 4.210638523101807,
"learning_rate": 4.926224820949209e-06,
"loss": 0.6192,
"step": 1076
},
{
"epoch": 0.5092198581560283,
"grad_norm": 2.4357898235321045,
"learning_rate": 4.926074314744565e-06,
"loss": 0.594,
"step": 1077
},
{
"epoch": 0.509692671394799,
"grad_norm": 2.8004701137542725,
"learning_rate": 4.92592365747891e-06,
"loss": 0.6276,
"step": 1078
},
{
"epoch": 0.5101654846335697,
"grad_norm": 2.920675039291382,
"learning_rate": 4.925772849161628e-06,
"loss": 0.6043,
"step": 1079
},
{
"epoch": 0.5106382978723404,
"grad_norm": 2.791555404663086,
"learning_rate": 4.9256218898021055e-06,
"loss": 0.6837,
"step": 1080
},
{
"epoch": 0.5111111111111111,
"grad_norm": 3.1702463626861572,
"learning_rate": 4.925470779409746e-06,
"loss": 0.668,
"step": 1081
},
{
"epoch": 0.5115839243498818,
"grad_norm": 2.7149479389190674,
"learning_rate": 4.925319517993955e-06,
"loss": 0.5842,
"step": 1082
},
{
"epoch": 0.5120567375886524,
"grad_norm": 2.916311025619507,
"learning_rate": 4.925168105564153e-06,
"loss": 0.6893,
"step": 1083
},
{
"epoch": 0.5125295508274231,
"grad_norm": 2.917654514312744,
"learning_rate": 4.925016542129767e-06,
"loss": 0.6513,
"step": 1084
},
{
"epoch": 0.5130023640661938,
"grad_norm": 2.5568928718566895,
"learning_rate": 4.924864827700234e-06,
"loss": 0.6177,
"step": 1085
},
{
"epoch": 0.5134751773049645,
"grad_norm": 2.816720485687256,
"learning_rate": 4.924712962285001e-06,
"loss": 0.5833,
"step": 1086
},
{
"epoch": 0.5139479905437352,
"grad_norm": 2.6989188194274902,
"learning_rate": 4.9245609458935235e-06,
"loss": 0.6332,
"step": 1087
},
{
"epoch": 0.5144208037825059,
"grad_norm": 2.959599494934082,
"learning_rate": 4.924408778535268e-06,
"loss": 0.626,
"step": 1088
},
{
"epoch": 0.5148936170212766,
"grad_norm": 2.872814416885376,
"learning_rate": 4.924256460219708e-06,
"loss": 0.6407,
"step": 1089
},
{
"epoch": 0.5153664302600472,
"grad_norm": 2.6989097595214844,
"learning_rate": 4.924103990956329e-06,
"loss": 0.6391,
"step": 1090
},
{
"epoch": 0.5158392434988179,
"grad_norm": 2.986492156982422,
"learning_rate": 4.9239513707546235e-06,
"loss": 0.6911,
"step": 1091
},
{
"epoch": 0.5163120567375886,
"grad_norm": 3.069920301437378,
"learning_rate": 4.9237985996240954e-06,
"loss": 0.671,
"step": 1092
},
{
"epoch": 0.5167848699763593,
"grad_norm": 2.8214917182922363,
"learning_rate": 4.9236456775742555e-06,
"loss": 0.5885,
"step": 1093
},
{
"epoch": 0.51725768321513,
"grad_norm": 2.9416961669921875,
"learning_rate": 4.923492604614627e-06,
"loss": 0.6293,
"step": 1094
},
{
"epoch": 0.5177304964539007,
"grad_norm": 2.761780023574829,
"learning_rate": 4.923339380754741e-06,
"loss": 0.649,
"step": 1095
},
{
"epoch": 0.5182033096926714,
"grad_norm": 2.7648792266845703,
"learning_rate": 4.923186006004138e-06,
"loss": 0.5906,
"step": 1096
},
{
"epoch": 0.518676122931442,
"grad_norm": 3.5535428524017334,
"learning_rate": 4.923032480372367e-06,
"loss": 0.7138,
"step": 1097
},
{
"epoch": 0.5191489361702127,
"grad_norm": 2.6252479553222656,
"learning_rate": 4.922878803868988e-06,
"loss": 0.5499,
"step": 1098
},
{
"epoch": 0.5196217494089834,
"grad_norm": 2.901002883911133,
"learning_rate": 4.9227249765035715e-06,
"loss": 0.6991,
"step": 1099
},
{
"epoch": 0.5200945626477541,
"grad_norm": 2.621877431869507,
"learning_rate": 4.9225709982856925e-06,
"loss": 0.6269,
"step": 1100
},
{
"epoch": 0.5205673758865248,
"grad_norm": 2.872483015060425,
"learning_rate": 4.92241686922494e-06,
"loss": 0.6657,
"step": 1101
},
{
"epoch": 0.5210401891252955,
"grad_norm": 2.730447769165039,
"learning_rate": 4.922262589330912e-06,
"loss": 0.6061,
"step": 1102
},
{
"epoch": 0.5215130023640662,
"grad_norm": 2.646247386932373,
"learning_rate": 4.922108158613213e-06,
"loss": 0.5923,
"step": 1103
},
{
"epoch": 0.5219858156028369,
"grad_norm": 2.6488895416259766,
"learning_rate": 4.92195357708146e-06,
"loss": 0.6293,
"step": 1104
},
{
"epoch": 0.5224586288416075,
"grad_norm": 2.756338357925415,
"learning_rate": 4.921798844745278e-06,
"loss": 0.6374,
"step": 1105
},
{
"epoch": 0.5229314420803782,
"grad_norm": 3.1441280841827393,
"learning_rate": 4.921643961614301e-06,
"loss": 0.6652,
"step": 1106
},
{
"epoch": 0.5234042553191489,
"grad_norm": 3.050002098083496,
"learning_rate": 4.921488927698172e-06,
"loss": 0.6809,
"step": 1107
},
{
"epoch": 0.5238770685579196,
"grad_norm": 2.71750807762146,
"learning_rate": 4.921333743006547e-06,
"loss": 0.6266,
"step": 1108
},
{
"epoch": 0.5243498817966903,
"grad_norm": 2.8439245223999023,
"learning_rate": 4.921178407549086e-06,
"loss": 0.5663,
"step": 1109
},
{
"epoch": 0.524822695035461,
"grad_norm": 3.0722241401672363,
"learning_rate": 4.921022921335464e-06,
"loss": 0.6791,
"step": 1110
},
{
"epoch": 0.5252955082742317,
"grad_norm": 3.4381656646728516,
"learning_rate": 4.920867284375358e-06,
"loss": 0.6687,
"step": 1111
},
{
"epoch": 0.5257683215130023,
"grad_norm": 2.819812774658203,
"learning_rate": 4.920711496678463e-06,
"loss": 0.6299,
"step": 1112
},
{
"epoch": 0.526241134751773,
"grad_norm": 3.6587414741516113,
"learning_rate": 4.9205555582544765e-06,
"loss": 0.7392,
"step": 1113
},
{
"epoch": 0.5267139479905437,
"grad_norm": 2.774296522140503,
"learning_rate": 4.920399469113109e-06,
"loss": 0.6652,
"step": 1114
},
{
"epoch": 0.5271867612293144,
"grad_norm": 2.7480580806732178,
"learning_rate": 4.920243229264081e-06,
"loss": 0.596,
"step": 1115
},
{
"epoch": 0.5276595744680851,
"grad_norm": 3.213057518005371,
"learning_rate": 4.920086838717119e-06,
"loss": 0.6986,
"step": 1116
},
{
"epoch": 0.5281323877068558,
"grad_norm": 2.940546989440918,
"learning_rate": 4.919930297481962e-06,
"loss": 0.6481,
"step": 1117
},
{
"epoch": 0.5286052009456265,
"grad_norm": 2.5970494747161865,
"learning_rate": 4.9197736055683555e-06,
"loss": 0.5658,
"step": 1118
},
{
"epoch": 0.5290780141843971,
"grad_norm": 4.49385404586792,
"learning_rate": 4.919616762986057e-06,
"loss": 0.605,
"step": 1119
},
{
"epoch": 0.5295508274231678,
"grad_norm": 2.971857786178589,
"learning_rate": 4.919459769744833e-06,
"loss": 0.6539,
"step": 1120
},
{
"epoch": 0.5300236406619385,
"grad_norm": 2.6192965507507324,
"learning_rate": 4.919302625854457e-06,
"loss": 0.6226,
"step": 1121
},
{
"epoch": 0.5304964539007092,
"grad_norm": 2.665088176727295,
"learning_rate": 4.919145331324716e-06,
"loss": 0.6647,
"step": 1122
},
{
"epoch": 0.5309692671394799,
"grad_norm": 2.612126111984253,
"learning_rate": 4.918987886165403e-06,
"loss": 0.6965,
"step": 1123
},
{
"epoch": 0.5314420803782506,
"grad_norm": 3.80017352104187,
"learning_rate": 4.9188302903863205e-06,
"loss": 0.7396,
"step": 1124
},
{
"epoch": 0.5319148936170213,
"grad_norm": 2.781752824783325,
"learning_rate": 4.918672543997282e-06,
"loss": 0.5985,
"step": 1125
},
{
"epoch": 0.532387706855792,
"grad_norm": 2.6067914962768555,
"learning_rate": 4.91851464700811e-06,
"loss": 0.6159,
"step": 1126
},
{
"epoch": 0.5328605200945626,
"grad_norm": 2.670807123184204,
"learning_rate": 4.918356599428636e-06,
"loss": 0.5958,
"step": 1127
},
{
"epoch": 0.5333333333333333,
"grad_norm": 2.608611822128296,
"learning_rate": 4.9181984012687e-06,
"loss": 0.5768,
"step": 1128
},
{
"epoch": 0.533806146572104,
"grad_norm": 2.586764097213745,
"learning_rate": 4.918040052538154e-06,
"loss": 0.661,
"step": 1129
},
{
"epoch": 0.5342789598108747,
"grad_norm": 3.1317451000213623,
"learning_rate": 4.917881553246856e-06,
"loss": 0.6626,
"step": 1130
},
{
"epoch": 0.5347517730496454,
"grad_norm": 2.7135281562805176,
"learning_rate": 4.917722903404676e-06,
"loss": 0.6572,
"step": 1131
},
{
"epoch": 0.5352245862884161,
"grad_norm": 3.4546358585357666,
"learning_rate": 4.917564103021493e-06,
"loss": 0.5597,
"step": 1132
},
{
"epoch": 0.5356973995271868,
"grad_norm": 3.0943493843078613,
"learning_rate": 4.917405152107193e-06,
"loss": 0.7258,
"step": 1133
},
{
"epoch": 0.5361702127659574,
"grad_norm": 2.6069352626800537,
"learning_rate": 4.917246050671674e-06,
"loss": 0.6209,
"step": 1134
},
{
"epoch": 0.5366430260047281,
"grad_norm": 2.584883689880371,
"learning_rate": 4.917086798724844e-06,
"loss": 0.658,
"step": 1135
},
{
"epoch": 0.5371158392434988,
"grad_norm": 3.001976490020752,
"learning_rate": 4.9169273962766166e-06,
"loss": 0.6306,
"step": 1136
},
{
"epoch": 0.5375886524822695,
"grad_norm": 2.5013928413391113,
"learning_rate": 4.916767843336918e-06,
"loss": 0.572,
"step": 1137
},
{
"epoch": 0.5380614657210402,
"grad_norm": 2.9114553928375244,
"learning_rate": 4.916608139915684e-06,
"loss": 0.5841,
"step": 1138
},
{
"epoch": 0.5385342789598109,
"grad_norm": 2.8878467082977295,
"learning_rate": 4.9164482860228564e-06,
"loss": 0.6654,
"step": 1139
},
{
"epoch": 0.5390070921985816,
"grad_norm": 2.9827866554260254,
"learning_rate": 4.91628828166839e-06,
"loss": 0.6674,
"step": 1140
},
{
"epoch": 0.5394799054373522,
"grad_norm": 3.8696281909942627,
"learning_rate": 4.916128126862248e-06,
"loss": 0.6241,
"step": 1141
},
{
"epoch": 0.5399527186761229,
"grad_norm": 2.9556291103363037,
"learning_rate": 4.915967821614402e-06,
"loss": 0.6478,
"step": 1142
},
{
"epoch": 0.5404255319148936,
"grad_norm": 2.392942428588867,
"learning_rate": 4.915807365934834e-06,
"loss": 0.6097,
"step": 1143
},
{
"epoch": 0.5408983451536643,
"grad_norm": 3.032235860824585,
"learning_rate": 4.915646759833534e-06,
"loss": 0.7193,
"step": 1144
},
{
"epoch": 0.541371158392435,
"grad_norm": 2.840416193008423,
"learning_rate": 4.915486003320501e-06,
"loss": 0.5506,
"step": 1145
},
{
"epoch": 0.5418439716312057,
"grad_norm": 2.5438895225524902,
"learning_rate": 4.915325096405747e-06,
"loss": 0.6487,
"step": 1146
},
{
"epoch": 0.5423167848699764,
"grad_norm": 2.544334650039673,
"learning_rate": 4.9151640390992905e-06,
"loss": 0.6168,
"step": 1147
},
{
"epoch": 0.542789598108747,
"grad_norm": 2.8535678386688232,
"learning_rate": 4.91500283141116e-06,
"loss": 0.678,
"step": 1148
},
{
"epoch": 0.5432624113475177,
"grad_norm": 2.8086955547332764,
"learning_rate": 4.9148414733513915e-06,
"loss": 0.6473,
"step": 1149
},
{
"epoch": 0.5437352245862884,
"grad_norm": 2.4709885120391846,
"learning_rate": 4.914679964930034e-06,
"loss": 0.6797,
"step": 1150
},
{
"epoch": 0.5442080378250591,
"grad_norm": 2.8546934127807617,
"learning_rate": 4.9145183061571435e-06,
"loss": 0.6247,
"step": 1151
},
{
"epoch": 0.5446808510638298,
"grad_norm": 2.991184711456299,
"learning_rate": 4.9143564970427844e-06,
"loss": 0.5977,
"step": 1152
},
{
"epoch": 0.5451536643026005,
"grad_norm": 3.011216402053833,
"learning_rate": 4.914194537597033e-06,
"loss": 0.7005,
"step": 1153
},
{
"epoch": 0.5456264775413712,
"grad_norm": 2.807521343231201,
"learning_rate": 4.9140324278299744e-06,
"loss": 0.5412,
"step": 1154
},
{
"epoch": 0.5460992907801419,
"grad_norm": 3.0401229858398438,
"learning_rate": 4.913870167751701e-06,
"loss": 0.6394,
"step": 1155
},
{
"epoch": 0.5465721040189125,
"grad_norm": 2.853914976119995,
"learning_rate": 4.913707757372317e-06,
"loss": 0.6745,
"step": 1156
},
{
"epoch": 0.5470449172576832,
"grad_norm": 4.505620956420898,
"learning_rate": 4.913545196701935e-06,
"loss": 0.6668,
"step": 1157
},
{
"epoch": 0.5475177304964539,
"grad_norm": 3.0505781173706055,
"learning_rate": 4.913382485750676e-06,
"loss": 0.6926,
"step": 1158
},
{
"epoch": 0.5479905437352246,
"grad_norm": 2.798435688018799,
"learning_rate": 4.913219624528672e-06,
"loss": 0.605,
"step": 1159
},
{
"epoch": 0.5484633569739953,
"grad_norm": 2.7814908027648926,
"learning_rate": 4.913056613046065e-06,
"loss": 0.6678,
"step": 1160
},
{
"epoch": 0.548936170212766,
"grad_norm": 3.2089321613311768,
"learning_rate": 4.9128934513130025e-06,
"loss": 0.5995,
"step": 1161
},
{
"epoch": 0.5494089834515367,
"grad_norm": 2.7699952125549316,
"learning_rate": 4.9127301393396455e-06,
"loss": 0.7062,
"step": 1162
},
{
"epoch": 0.5498817966903073,
"grad_norm": 2.859368324279785,
"learning_rate": 4.912566677136162e-06,
"loss": 0.6063,
"step": 1163
},
{
"epoch": 0.550354609929078,
"grad_norm": 2.727334499359131,
"learning_rate": 4.91240306471273e-06,
"loss": 0.6848,
"step": 1164
},
{
"epoch": 0.5508274231678487,
"grad_norm": 2.6017510890960693,
"learning_rate": 4.912239302079537e-06,
"loss": 0.5808,
"step": 1165
},
{
"epoch": 0.5513002364066194,
"grad_norm": 3.539583206176758,
"learning_rate": 4.912075389246781e-06,
"loss": 0.7053,
"step": 1166
},
{
"epoch": 0.5517730496453901,
"grad_norm": 2.918280601501465,
"learning_rate": 4.911911326224666e-06,
"loss": 0.5904,
"step": 1167
},
{
"epoch": 0.5522458628841608,
"grad_norm": 3.0067362785339355,
"learning_rate": 4.9117471130234095e-06,
"loss": 0.6392,
"step": 1168
},
{
"epoch": 0.5527186761229315,
"grad_norm": 2.4374797344207764,
"learning_rate": 4.911582749653236e-06,
"loss": 0.5793,
"step": 1169
},
{
"epoch": 0.5531914893617021,
"grad_norm": 3.121182918548584,
"learning_rate": 4.911418236124378e-06,
"loss": 0.6636,
"step": 1170
},
{
"epoch": 0.5536643026004728,
"grad_norm": 3.1289851665496826,
"learning_rate": 4.91125357244708e-06,
"loss": 0.656,
"step": 1171
},
{
"epoch": 0.5541371158392435,
"grad_norm": 2.7034592628479004,
"learning_rate": 4.911088758631596e-06,
"loss": 0.6001,
"step": 1172
},
{
"epoch": 0.5546099290780142,
"grad_norm": 2.710146188735962,
"learning_rate": 4.910923794688187e-06,
"loss": 0.6007,
"step": 1173
},
{
"epoch": 0.5550827423167849,
"grad_norm": 2.5424487590789795,
"learning_rate": 4.910758680627124e-06,
"loss": 0.5193,
"step": 1174
},
{
"epoch": 0.5555555555555556,
"grad_norm": 2.615893602371216,
"learning_rate": 4.91059341645869e-06,
"loss": 0.5525,
"step": 1175
},
{
"epoch": 0.5560283687943263,
"grad_norm": 3.3179728984832764,
"learning_rate": 4.910428002193174e-06,
"loss": 0.7285,
"step": 1176
},
{
"epoch": 0.556501182033097,
"grad_norm": 2.7234175205230713,
"learning_rate": 4.910262437840875e-06,
"loss": 0.574,
"step": 1177
},
{
"epoch": 0.5569739952718676,
"grad_norm": 3.0416605472564697,
"learning_rate": 4.9100967234121034e-06,
"loss": 0.5623,
"step": 1178
},
{
"epoch": 0.5574468085106383,
"grad_norm": 3.067786455154419,
"learning_rate": 4.909930858917177e-06,
"loss": 0.6491,
"step": 1179
},
{
"epoch": 0.557919621749409,
"grad_norm": 3.0037379264831543,
"learning_rate": 4.909764844366422e-06,
"loss": 0.5696,
"step": 1180
},
{
"epoch": 0.5583924349881797,
"grad_norm": 2.966179609298706,
"learning_rate": 4.909598679770178e-06,
"loss": 0.6042,
"step": 1181
},
{
"epoch": 0.5588652482269504,
"grad_norm": 2.6000657081604004,
"learning_rate": 4.909432365138789e-06,
"loss": 0.5883,
"step": 1182
},
{
"epoch": 0.5593380614657211,
"grad_norm": 2.6794495582580566,
"learning_rate": 4.909265900482612e-06,
"loss": 0.6809,
"step": 1183
},
{
"epoch": 0.5598108747044918,
"grad_norm": 2.6765122413635254,
"learning_rate": 4.9090992858120115e-06,
"loss": 0.6601,
"step": 1184
},
{
"epoch": 0.5602836879432624,
"grad_norm": 2.6051928997039795,
"learning_rate": 4.908932521137363e-06,
"loss": 0.5946,
"step": 1185
},
{
"epoch": 0.5607565011820331,
"grad_norm": 3.0405542850494385,
"learning_rate": 4.908765606469048e-06,
"loss": 0.6998,
"step": 1186
},
{
"epoch": 0.5612293144208038,
"grad_norm": 2.7975668907165527,
"learning_rate": 4.908598541817462e-06,
"loss": 0.6218,
"step": 1187
},
{
"epoch": 0.5617021276595745,
"grad_norm": 2.5367627143859863,
"learning_rate": 4.908431327193005e-06,
"loss": 0.6354,
"step": 1188
},
{
"epoch": 0.5621749408983452,
"grad_norm": 3.7939631938934326,
"learning_rate": 4.908263962606091e-06,
"loss": 0.6376,
"step": 1189
},
{
"epoch": 0.5626477541371159,
"grad_norm": 2.864079475402832,
"learning_rate": 4.908096448067139e-06,
"loss": 0.5485,
"step": 1190
},
{
"epoch": 0.5631205673758866,
"grad_norm": 2.7855563163757324,
"learning_rate": 4.9079287835865804e-06,
"loss": 0.6645,
"step": 1191
},
{
"epoch": 0.5635933806146572,
"grad_norm": 2.6156625747680664,
"learning_rate": 4.9077609691748556e-06,
"loss": 0.5751,
"step": 1192
},
{
"epoch": 0.5640661938534279,
"grad_norm": 3.0475659370422363,
"learning_rate": 4.907593004842412e-06,
"loss": 0.6739,
"step": 1193
},
{
"epoch": 0.5645390070921986,
"grad_norm": 2.9176738262176514,
"learning_rate": 4.9074248905997104e-06,
"loss": 0.6493,
"step": 1194
},
{
"epoch": 0.5650118203309693,
"grad_norm": 2.6168384552001953,
"learning_rate": 4.907256626457216e-06,
"loss": 0.6154,
"step": 1195
},
{
"epoch": 0.56548463356974,
"grad_norm": 2.893980026245117,
"learning_rate": 4.907088212425408e-06,
"loss": 0.5808,
"step": 1196
},
{
"epoch": 0.5659574468085107,
"grad_norm": 3.3832836151123047,
"learning_rate": 4.90691964851477e-06,
"loss": 0.7888,
"step": 1197
},
{
"epoch": 0.5664302600472814,
"grad_norm": 3.088932752609253,
"learning_rate": 4.906750934735801e-06,
"loss": 0.6516,
"step": 1198
},
{
"epoch": 0.566903073286052,
"grad_norm": 2.494471549987793,
"learning_rate": 4.906582071099004e-06,
"loss": 0.6286,
"step": 1199
},
{
"epoch": 0.5673758865248227,
"grad_norm": 2.716550588607788,
"learning_rate": 4.906413057614895e-06,
"loss": 0.5939,
"step": 1200
},
{
"epoch": 0.5678486997635934,
"grad_norm": 2.5821073055267334,
"learning_rate": 4.906243894293995e-06,
"loss": 0.6668,
"step": 1201
},
{
"epoch": 0.5683215130023641,
"grad_norm": 3.651787042617798,
"learning_rate": 4.90607458114684e-06,
"loss": 0.6124,
"step": 1202
},
{
"epoch": 0.5687943262411348,
"grad_norm": 2.7567858695983887,
"learning_rate": 4.9059051181839705e-06,
"loss": 0.6656,
"step": 1203
},
{
"epoch": 0.5692671394799055,
"grad_norm": 2.8067586421966553,
"learning_rate": 4.90573550541594e-06,
"loss": 0.6306,
"step": 1204
},
{
"epoch": 0.5697399527186762,
"grad_norm": 2.6136393547058105,
"learning_rate": 4.905565742853307e-06,
"loss": 0.5992,
"step": 1205
},
{
"epoch": 0.5702127659574469,
"grad_norm": 2.899049758911133,
"learning_rate": 4.905395830506644e-06,
"loss": 0.621,
"step": 1206
},
{
"epoch": 0.5706855791962175,
"grad_norm": 3.036583185195923,
"learning_rate": 4.9052257683865294e-06,
"loss": 0.652,
"step": 1207
},
{
"epoch": 0.5711583924349882,
"grad_norm": 2.7947216033935547,
"learning_rate": 4.905055556503553e-06,
"loss": 0.6636,
"step": 1208
},
{
"epoch": 0.5716312056737589,
"grad_norm": 3.1646955013275146,
"learning_rate": 4.9048851948683135e-06,
"loss": 0.6376,
"step": 1209
},
{
"epoch": 0.5721040189125296,
"grad_norm": 2.8175766468048096,
"learning_rate": 4.904714683491417e-06,
"loss": 0.5929,
"step": 1210
},
{
"epoch": 0.5725768321513003,
"grad_norm": 2.923923969268799,
"learning_rate": 4.904544022383483e-06,
"loss": 0.6633,
"step": 1211
},
{
"epoch": 0.573049645390071,
"grad_norm": 2.7471134662628174,
"learning_rate": 4.9043732115551356e-06,
"loss": 0.6551,
"step": 1212
},
{
"epoch": 0.5735224586288417,
"grad_norm": 2.8660807609558105,
"learning_rate": 4.90420225101701e-06,
"loss": 0.6423,
"step": 1213
},
{
"epoch": 0.5739952718676123,
"grad_norm": 2.769247531890869,
"learning_rate": 4.904031140779754e-06,
"loss": 0.5982,
"step": 1214
},
{
"epoch": 0.574468085106383,
"grad_norm": 2.9043145179748535,
"learning_rate": 4.90385988085402e-06,
"loss": 0.5843,
"step": 1215
},
{
"epoch": 0.5749408983451537,
"grad_norm": 2.6639609336853027,
"learning_rate": 4.903688471250471e-06,
"loss": 0.5858,
"step": 1216
},
{
"epoch": 0.5754137115839244,
"grad_norm": 2.6967573165893555,
"learning_rate": 4.903516911979781e-06,
"loss": 0.5755,
"step": 1217
},
{
"epoch": 0.5758865248226951,
"grad_norm": 2.8865857124328613,
"learning_rate": 4.903345203052633e-06,
"loss": 0.6051,
"step": 1218
},
{
"epoch": 0.5763593380614658,
"grad_norm": 2.381979465484619,
"learning_rate": 4.903173344479717e-06,
"loss": 0.5727,
"step": 1219
},
{
"epoch": 0.5768321513002365,
"grad_norm": 2.7717981338500977,
"learning_rate": 4.903001336271734e-06,
"loss": 0.6406,
"step": 1220
},
{
"epoch": 0.577304964539007,
"grad_norm": 2.6431570053100586,
"learning_rate": 4.902829178439395e-06,
"loss": 0.6226,
"step": 1221
},
{
"epoch": 0.5777777777777777,
"grad_norm": 2.8090415000915527,
"learning_rate": 4.902656870993419e-06,
"loss": 0.5761,
"step": 1222
},
{
"epoch": 0.5782505910165484,
"grad_norm": 2.4769368171691895,
"learning_rate": 4.902484413944535e-06,
"loss": 0.5602,
"step": 1223
},
{
"epoch": 0.5787234042553191,
"grad_norm": 2.693316698074341,
"learning_rate": 4.902311807303481e-06,
"loss": 0.5222,
"step": 1224
},
{
"epoch": 0.5791962174940898,
"grad_norm": 2.7623913288116455,
"learning_rate": 4.902139051081004e-06,
"loss": 0.6978,
"step": 1225
},
{
"epoch": 0.5796690307328605,
"grad_norm": 2.6133766174316406,
"learning_rate": 4.901966145287863e-06,
"loss": 0.5802,
"step": 1226
},
{
"epoch": 0.5801418439716312,
"grad_norm": 2.7345972061157227,
"learning_rate": 4.901793089934821e-06,
"loss": 0.6294,
"step": 1227
},
{
"epoch": 0.5806146572104018,
"grad_norm": 2.7545835971832275,
"learning_rate": 4.9016198850326555e-06,
"loss": 0.6085,
"step": 1228
},
{
"epoch": 0.5810874704491725,
"grad_norm": 2.6947758197784424,
"learning_rate": 4.90144653059215e-06,
"loss": 0.6025,
"step": 1229
},
{
"epoch": 0.5815602836879432,
"grad_norm": 2.692967414855957,
"learning_rate": 4.901273026624099e-06,
"loss": 0.5715,
"step": 1230
},
{
"epoch": 0.5820330969267139,
"grad_norm": 2.78347110748291,
"learning_rate": 4.901099373139307e-06,
"loss": 0.6063,
"step": 1231
},
{
"epoch": 0.5825059101654846,
"grad_norm": 2.346496343612671,
"learning_rate": 4.900925570148585e-06,
"loss": 0.5869,
"step": 1232
},
{
"epoch": 0.5829787234042553,
"grad_norm": 2.606639862060547,
"learning_rate": 4.900751617662755e-06,
"loss": 0.6197,
"step": 1233
},
{
"epoch": 0.583451536643026,
"grad_norm": 2.5825929641723633,
"learning_rate": 4.900577515692649e-06,
"loss": 0.6721,
"step": 1234
},
{
"epoch": 0.5839243498817966,
"grad_norm": 2.731349468231201,
"learning_rate": 4.900403264249107e-06,
"loss": 0.6273,
"step": 1235
},
{
"epoch": 0.5843971631205673,
"grad_norm": 3.2133874893188477,
"learning_rate": 4.90022886334298e-06,
"loss": 0.6231,
"step": 1236
},
{
"epoch": 0.584869976359338,
"grad_norm": 2.9213852882385254,
"learning_rate": 4.900054312985127e-06,
"loss": 0.6677,
"step": 1237
},
{
"epoch": 0.5853427895981087,
"grad_norm": 2.815425157546997,
"learning_rate": 4.899879613186414e-06,
"loss": 0.6405,
"step": 1238
},
{
"epoch": 0.5858156028368794,
"grad_norm": 2.730782985687256,
"learning_rate": 4.899704763957721e-06,
"loss": 0.6233,
"step": 1239
},
{
"epoch": 0.5862884160756501,
"grad_norm": 2.6432766914367676,
"learning_rate": 4.899529765309936e-06,
"loss": 0.6267,
"step": 1240
},
{
"epoch": 0.5867612293144208,
"grad_norm": 2.616215229034424,
"learning_rate": 4.899354617253953e-06,
"loss": 0.6268,
"step": 1241
},
{
"epoch": 0.5872340425531914,
"grad_norm": 2.7630255222320557,
"learning_rate": 4.899179319800679e-06,
"loss": 0.6348,
"step": 1242
},
{
"epoch": 0.5877068557919621,
"grad_norm": 2.785095453262329,
"learning_rate": 4.899003872961029e-06,
"loss": 0.5839,
"step": 1243
},
{
"epoch": 0.5881796690307328,
"grad_norm": 2.9050328731536865,
"learning_rate": 4.898828276745927e-06,
"loss": 0.651,
"step": 1244
},
{
"epoch": 0.5886524822695035,
"grad_norm": 2.958092212677002,
"learning_rate": 4.8986525311663065e-06,
"loss": 0.6395,
"step": 1245
},
{
"epoch": 0.5891252955082742,
"grad_norm": 2.952310800552368,
"learning_rate": 4.898476636233111e-06,
"loss": 0.6731,
"step": 1246
},
{
"epoch": 0.5895981087470449,
"grad_norm": 2.9876346588134766,
"learning_rate": 4.898300591957293e-06,
"loss": 0.7015,
"step": 1247
},
{
"epoch": 0.5900709219858156,
"grad_norm": 2.8941752910614014,
"learning_rate": 4.898124398349813e-06,
"loss": 0.6452,
"step": 1248
},
{
"epoch": 0.5905437352245863,
"grad_norm": 2.9809536933898926,
"learning_rate": 4.897948055421642e-06,
"loss": 0.5736,
"step": 1249
},
{
"epoch": 0.5910165484633569,
"grad_norm": 2.927046775817871,
"learning_rate": 4.897771563183761e-06,
"loss": 0.5918,
"step": 1250
},
{
"epoch": 0.5914893617021276,
"grad_norm": 2.865020275115967,
"learning_rate": 4.897594921647158e-06,
"loss": 0.6924,
"step": 1251
},
{
"epoch": 0.5919621749408983,
"grad_norm": 2.7406699657440186,
"learning_rate": 4.897418130822832e-06,
"loss": 0.509,
"step": 1252
},
{
"epoch": 0.592434988179669,
"grad_norm": 2.781606912612915,
"learning_rate": 4.897241190721791e-06,
"loss": 0.5555,
"step": 1253
},
{
"epoch": 0.5929078014184397,
"grad_norm": 2.79209303855896,
"learning_rate": 4.8970641013550535e-06,
"loss": 0.6722,
"step": 1254
},
{
"epoch": 0.5933806146572104,
"grad_norm": 3.0672268867492676,
"learning_rate": 4.896886862733645e-06,
"loss": 0.6366,
"step": 1255
},
{
"epoch": 0.5938534278959811,
"grad_norm": 2.7456953525543213,
"learning_rate": 4.896709474868602e-06,
"loss": 0.6246,
"step": 1256
},
{
"epoch": 0.5943262411347517,
"grad_norm": 3.6731202602386475,
"learning_rate": 4.896531937770968e-06,
"loss": 0.668,
"step": 1257
},
{
"epoch": 0.5947990543735224,
"grad_norm": 2.6056087017059326,
"learning_rate": 4.8963542514518e-06,
"loss": 0.5815,
"step": 1258
},
{
"epoch": 0.5952718676122931,
"grad_norm": 2.719698905944824,
"learning_rate": 4.89617641592216e-06,
"loss": 0.6058,
"step": 1259
},
{
"epoch": 0.5957446808510638,
"grad_norm": 2.625838279724121,
"learning_rate": 4.895998431193121e-06,
"loss": 0.6143,
"step": 1260
},
{
"epoch": 0.5962174940898345,
"grad_norm": 2.7166085243225098,
"learning_rate": 4.895820297275767e-06,
"loss": 0.5187,
"step": 1261
},
{
"epoch": 0.5966903073286052,
"grad_norm": 2.7544102668762207,
"learning_rate": 4.8956420141811875e-06,
"loss": 0.5928,
"step": 1262
},
{
"epoch": 0.5971631205673759,
"grad_norm": 2.6678333282470703,
"learning_rate": 4.895463581920484e-06,
"loss": 0.611,
"step": 1263
},
{
"epoch": 0.5976359338061465,
"grad_norm": 2.853384494781494,
"learning_rate": 4.895285000504768e-06,
"loss": 0.642,
"step": 1264
},
{
"epoch": 0.5981087470449172,
"grad_norm": 2.637852430343628,
"learning_rate": 4.895106269945158e-06,
"loss": 0.6308,
"step": 1265
},
{
"epoch": 0.5985815602836879,
"grad_norm": 2.9880387783050537,
"learning_rate": 4.8949273902527826e-06,
"loss": 0.5781,
"step": 1266
},
{
"epoch": 0.5990543735224586,
"grad_norm": 3.5984015464782715,
"learning_rate": 4.89474836143878e-06,
"loss": 0.5865,
"step": 1267
},
{
"epoch": 0.5995271867612293,
"grad_norm": 2.719855546951294,
"learning_rate": 4.8945691835142975e-06,
"loss": 0.6393,
"step": 1268
},
{
"epoch": 0.6,
"grad_norm": 2.7885141372680664,
"learning_rate": 4.894389856490492e-06,
"loss": 0.66,
"step": 1269
},
{
"epoch": 0.6004728132387707,
"grad_norm": 2.698819875717163,
"learning_rate": 4.894210380378529e-06,
"loss": 0.6144,
"step": 1270
},
{
"epoch": 0.6009456264775414,
"grad_norm": 2.278045654296875,
"learning_rate": 4.894030755189584e-06,
"loss": 0.5609,
"step": 1271
},
{
"epoch": 0.601418439716312,
"grad_norm": 2.8729357719421387,
"learning_rate": 4.893850980934841e-06,
"loss": 0.6715,
"step": 1272
},
{
"epoch": 0.6018912529550827,
"grad_norm": 2.8541221618652344,
"learning_rate": 4.893671057625495e-06,
"loss": 0.6787,
"step": 1273
},
{
"epoch": 0.6023640661938534,
"grad_norm": 2.4561476707458496,
"learning_rate": 4.893490985272748e-06,
"loss": 0.6331,
"step": 1274
},
{
"epoch": 0.6028368794326241,
"grad_norm": 2.565739154815674,
"learning_rate": 4.893310763887812e-06,
"loss": 0.587,
"step": 1275
},
{
"epoch": 0.6033096926713948,
"grad_norm": 2.384951591491699,
"learning_rate": 4.8931303934819095e-06,
"loss": 0.5358,
"step": 1276
},
{
"epoch": 0.6037825059101655,
"grad_norm": 2.380808115005493,
"learning_rate": 4.89294987406627e-06,
"loss": 0.5402,
"step": 1277
},
{
"epoch": 0.6042553191489362,
"grad_norm": 2.764815092086792,
"learning_rate": 4.892769205652136e-06,
"loss": 0.6103,
"step": 1278
},
{
"epoch": 0.6047281323877068,
"grad_norm": 2.463350296020508,
"learning_rate": 4.892588388250754e-06,
"loss": 0.5937,
"step": 1279
},
{
"epoch": 0.6052009456264775,
"grad_norm": 3.099689245223999,
"learning_rate": 4.8924074218733855e-06,
"loss": 0.6354,
"step": 1280
},
{
"epoch": 0.6056737588652482,
"grad_norm": 2.804450035095215,
"learning_rate": 4.892226306531297e-06,
"loss": 0.6595,
"step": 1281
},
{
"epoch": 0.6061465721040189,
"grad_norm": 3.1559767723083496,
"learning_rate": 4.892045042235765e-06,
"loss": 0.6664,
"step": 1282
},
{
"epoch": 0.6066193853427896,
"grad_norm": 2.844341993331909,
"learning_rate": 4.891863628998079e-06,
"loss": 0.7454,
"step": 1283
},
{
"epoch": 0.6070921985815603,
"grad_norm": 2.686602830886841,
"learning_rate": 4.891682066829532e-06,
"loss": 0.6755,
"step": 1284
},
{
"epoch": 0.607565011820331,
"grad_norm": 2.736457347869873,
"learning_rate": 4.8915003557414285e-06,
"loss": 0.6305,
"step": 1285
},
{
"epoch": 0.6080378250591016,
"grad_norm": 2.661362409591675,
"learning_rate": 4.891318495745086e-06,
"loss": 0.5958,
"step": 1286
},
{
"epoch": 0.6085106382978723,
"grad_norm": 2.707348108291626,
"learning_rate": 4.8911364868518255e-06,
"loss": 0.5824,
"step": 1287
},
{
"epoch": 0.608983451536643,
"grad_norm": 2.9798858165740967,
"learning_rate": 4.890954329072981e-06,
"loss": 0.5981,
"step": 1288
},
{
"epoch": 0.6094562647754137,
"grad_norm": 2.6285455226898193,
"learning_rate": 4.890772022419895e-06,
"loss": 0.6194,
"step": 1289
},
{
"epoch": 0.6099290780141844,
"grad_norm": 2.9254322052001953,
"learning_rate": 4.890589566903917e-06,
"loss": 0.6002,
"step": 1290
},
{
"epoch": 0.6104018912529551,
"grad_norm": 2.6458325386047363,
"learning_rate": 4.89040696253641e-06,
"loss": 0.5457,
"step": 1291
},
{
"epoch": 0.6108747044917258,
"grad_norm": 2.508242607116699,
"learning_rate": 4.890224209328743e-06,
"loss": 0.6168,
"step": 1292
},
{
"epoch": 0.6113475177304964,
"grad_norm": 3.034785509109497,
"learning_rate": 4.890041307292296e-06,
"loss": 0.664,
"step": 1293
},
{
"epoch": 0.6118203309692671,
"grad_norm": 3.52469539642334,
"learning_rate": 4.889858256438455e-06,
"loss": 0.7301,
"step": 1294
},
{
"epoch": 0.6122931442080378,
"grad_norm": 2.9145348072052,
"learning_rate": 4.889675056778622e-06,
"loss": 0.6494,
"step": 1295
},
{
"epoch": 0.6127659574468085,
"grad_norm": 2.831829071044922,
"learning_rate": 4.8894917083242e-06,
"loss": 0.6064,
"step": 1296
},
{
"epoch": 0.6132387706855792,
"grad_norm": 2.6883130073547363,
"learning_rate": 4.889308211086608e-06,
"loss": 0.5642,
"step": 1297
},
{
"epoch": 0.6137115839243499,
"grad_norm": 3.0605485439300537,
"learning_rate": 4.889124565077269e-06,
"loss": 0.6695,
"step": 1298
},
{
"epoch": 0.6141843971631206,
"grad_norm": 3.44062876701355,
"learning_rate": 4.88894077030762e-06,
"loss": 0.6415,
"step": 1299
},
{
"epoch": 0.6146572104018913,
"grad_norm": 2.5970818996429443,
"learning_rate": 4.888756826789105e-06,
"loss": 0.6518,
"step": 1300
},
{
"epoch": 0.6151300236406619,
"grad_norm": 4.2233567237854,
"learning_rate": 4.8885727345331755e-06,
"loss": 0.6555,
"step": 1301
},
{
"epoch": 0.6156028368794326,
"grad_norm": 2.645385503768921,
"learning_rate": 4.888388493551297e-06,
"loss": 0.6762,
"step": 1302
},
{
"epoch": 0.6160756501182033,
"grad_norm": 2.907954454421997,
"learning_rate": 4.8882041038549385e-06,
"loss": 0.6526,
"step": 1303
},
{
"epoch": 0.616548463356974,
"grad_norm": 2.482771873474121,
"learning_rate": 4.888019565455583e-06,
"loss": 0.628,
"step": 1304
},
{
"epoch": 0.6170212765957447,
"grad_norm": 2.7165915966033936,
"learning_rate": 4.88783487836472e-06,
"loss": 0.5743,
"step": 1305
},
{
"epoch": 0.6174940898345154,
"grad_norm": 3.095627546310425,
"learning_rate": 4.88765004259385e-06,
"loss": 0.627,
"step": 1306
},
{
"epoch": 0.6179669030732861,
"grad_norm": 2.5018465518951416,
"learning_rate": 4.8874650581544805e-06,
"loss": 0.5215,
"step": 1307
},
{
"epoch": 0.6184397163120567,
"grad_norm": 3.094337224960327,
"learning_rate": 4.8872799250581316e-06,
"loss": 0.6979,
"step": 1308
},
{
"epoch": 0.6189125295508274,
"grad_norm": 3.1002209186553955,
"learning_rate": 4.887094643316329e-06,
"loss": 0.6565,
"step": 1309
},
{
"epoch": 0.6193853427895981,
"grad_norm": 2.551431894302368,
"learning_rate": 4.88690921294061e-06,
"loss": 0.5748,
"step": 1310
},
{
"epoch": 0.6198581560283688,
"grad_norm": 2.8282904624938965,
"learning_rate": 4.886723633942521e-06,
"loss": 0.676,
"step": 1311
},
{
"epoch": 0.6203309692671395,
"grad_norm": 2.8887810707092285,
"learning_rate": 4.886537906333617e-06,
"loss": 0.5971,
"step": 1312
},
{
"epoch": 0.6208037825059102,
"grad_norm": 2.9989118576049805,
"learning_rate": 4.886352030125462e-06,
"loss": 0.6341,
"step": 1313
},
{
"epoch": 0.6212765957446809,
"grad_norm": 2.8042776584625244,
"learning_rate": 4.886166005329629e-06,
"loss": 0.6578,
"step": 1314
},
{
"epoch": 0.6217494089834515,
"grad_norm": 2.4980967044830322,
"learning_rate": 4.8859798319577026e-06,
"loss": 0.6711,
"step": 1315
},
{
"epoch": 0.6222222222222222,
"grad_norm": 2.762369155883789,
"learning_rate": 4.885793510021274e-06,
"loss": 0.5747,
"step": 1316
},
{
"epoch": 0.6226950354609929,
"grad_norm": 3.136327028274536,
"learning_rate": 4.885607039531945e-06,
"loss": 0.7544,
"step": 1317
},
{
"epoch": 0.6231678486997636,
"grad_norm": 2.8736963272094727,
"learning_rate": 4.885420420501327e-06,
"loss": 0.6603,
"step": 1318
},
{
"epoch": 0.6236406619385343,
"grad_norm": 2.766237497329712,
"learning_rate": 4.885233652941039e-06,
"loss": 0.581,
"step": 1319
},
{
"epoch": 0.624113475177305,
"grad_norm": 2.4740939140319824,
"learning_rate": 4.88504673686271e-06,
"loss": 0.6335,
"step": 1320
},
{
"epoch": 0.6245862884160757,
"grad_norm": 3.324795961380005,
"learning_rate": 4.884859672277978e-06,
"loss": 0.6019,
"step": 1321
},
{
"epoch": 0.6250591016548463,
"grad_norm": 3.521327257156372,
"learning_rate": 4.884672459198493e-06,
"loss": 0.6104,
"step": 1322
},
{
"epoch": 0.625531914893617,
"grad_norm": 2.7728071212768555,
"learning_rate": 4.884485097635909e-06,
"loss": 0.6714,
"step": 1323
},
{
"epoch": 0.6260047281323877,
"grad_norm": 3.0738155841827393,
"learning_rate": 4.884297587601895e-06,
"loss": 0.604,
"step": 1324
},
{
"epoch": 0.6264775413711584,
"grad_norm": 2.719240427017212,
"learning_rate": 4.884109929108124e-06,
"loss": 0.6795,
"step": 1325
},
{
"epoch": 0.6269503546099291,
"grad_norm": 2.4108200073242188,
"learning_rate": 4.883922122166282e-06,
"loss": 0.5846,
"step": 1326
},
{
"epoch": 0.6274231678486998,
"grad_norm": 2.393899917602539,
"learning_rate": 4.883734166788063e-06,
"loss": 0.6188,
"step": 1327
},
{
"epoch": 0.6278959810874705,
"grad_norm": 4.555255889892578,
"learning_rate": 4.883546062985169e-06,
"loss": 0.5962,
"step": 1328
},
{
"epoch": 0.6283687943262412,
"grad_norm": 2.571075439453125,
"learning_rate": 4.883357810769315e-06,
"loss": 0.6165,
"step": 1329
},
{
"epoch": 0.6288416075650118,
"grad_norm": 2.553115129470825,
"learning_rate": 4.8831694101522185e-06,
"loss": 0.6787,
"step": 1330
},
{
"epoch": 0.6293144208037825,
"grad_norm": 3.2564642429351807,
"learning_rate": 4.882980861145614e-06,
"loss": 0.659,
"step": 1331
},
{
"epoch": 0.6297872340425532,
"grad_norm": 2.535216808319092,
"learning_rate": 4.882792163761241e-06,
"loss": 0.6176,
"step": 1332
},
{
"epoch": 0.6302600472813239,
"grad_norm": 3.097921848297119,
"learning_rate": 4.882603318010847e-06,
"loss": 0.6822,
"step": 1333
},
{
"epoch": 0.6307328605200946,
"grad_norm": 2.8135175704956055,
"learning_rate": 4.882414323906192e-06,
"loss": 0.6782,
"step": 1334
},
{
"epoch": 0.6312056737588653,
"grad_norm": 2.724634885787964,
"learning_rate": 4.882225181459044e-06,
"loss": 0.6545,
"step": 1335
},
{
"epoch": 0.631678486997636,
"grad_norm": 2.9585227966308594,
"learning_rate": 4.882035890681179e-06,
"loss": 0.6218,
"step": 1336
},
{
"epoch": 0.6321513002364066,
"grad_norm": 2.6952011585235596,
"learning_rate": 4.881846451584385e-06,
"loss": 0.6,
"step": 1337
},
{
"epoch": 0.6326241134751773,
"grad_norm": 3.1400704383850098,
"learning_rate": 4.881656864180455e-06,
"loss": 0.6687,
"step": 1338
},
{
"epoch": 0.633096926713948,
"grad_norm": 2.8382487297058105,
"learning_rate": 4.881467128481197e-06,
"loss": 0.574,
"step": 1339
},
{
"epoch": 0.6335697399527187,
"grad_norm": 2.8520095348358154,
"learning_rate": 4.881277244498422e-06,
"loss": 0.6582,
"step": 1340
},
{
"epoch": 0.6340425531914894,
"grad_norm": 2.703498363494873,
"learning_rate": 4.881087212243956e-06,
"loss": 0.7224,
"step": 1341
},
{
"epoch": 0.6345153664302601,
"grad_norm": 3.697205066680908,
"learning_rate": 4.880897031729629e-06,
"loss": 0.6582,
"step": 1342
},
{
"epoch": 0.6349881796690308,
"grad_norm": 2.7625808715820312,
"learning_rate": 4.880706702967284e-06,
"loss": 0.574,
"step": 1343
},
{
"epoch": 0.6354609929078014,
"grad_norm": 2.949984073638916,
"learning_rate": 4.880516225968771e-06,
"loss": 0.66,
"step": 1344
},
{
"epoch": 0.6359338061465721,
"grad_norm": 2.548269748687744,
"learning_rate": 4.8803256007459525e-06,
"loss": 0.642,
"step": 1345
},
{
"epoch": 0.6364066193853428,
"grad_norm": 2.5102174282073975,
"learning_rate": 4.8801348273106945e-06,
"loss": 0.6238,
"step": 1346
},
{
"epoch": 0.6368794326241135,
"grad_norm": 2.9847946166992188,
"learning_rate": 4.8799439056748786e-06,
"loss": 0.5416,
"step": 1347
},
{
"epoch": 0.6373522458628842,
"grad_norm": 2.8711049556732178,
"learning_rate": 4.879752835850391e-06,
"loss": 0.6427,
"step": 1348
},
{
"epoch": 0.6378250591016549,
"grad_norm": 2.7901716232299805,
"learning_rate": 4.879561617849129e-06,
"loss": 0.6026,
"step": 1349
},
{
"epoch": 0.6382978723404256,
"grad_norm": 2.659778356552124,
"learning_rate": 4.879370251682999e-06,
"loss": 0.6623,
"step": 1350
},
{
"epoch": 0.6387706855791963,
"grad_norm": 3.224386692047119,
"learning_rate": 4.879178737363917e-06,
"loss": 0.6485,
"step": 1351
},
{
"epoch": 0.6392434988179669,
"grad_norm": 2.6385605335235596,
"learning_rate": 4.8789870749038076e-06,
"loss": 0.5866,
"step": 1352
},
{
"epoch": 0.6397163120567376,
"grad_norm": 2.807713270187378,
"learning_rate": 4.8787952643146045e-06,
"loss": 0.6537,
"step": 1353
},
{
"epoch": 0.6401891252955083,
"grad_norm": 2.5689280033111572,
"learning_rate": 4.878603305608251e-06,
"loss": 0.6216,
"step": 1354
},
{
"epoch": 0.640661938534279,
"grad_norm": 2.7347843647003174,
"learning_rate": 4.8784111987967e-06,
"loss": 0.6318,
"step": 1355
},
{
"epoch": 0.6411347517730497,
"grad_norm": 2.5210378170013428,
"learning_rate": 4.878218943891911e-06,
"loss": 0.5472,
"step": 1356
},
{
"epoch": 0.6416075650118204,
"grad_norm": 2.866785764694214,
"learning_rate": 4.878026540905858e-06,
"loss": 0.7108,
"step": 1357
},
{
"epoch": 0.642080378250591,
"grad_norm": 2.923314332962036,
"learning_rate": 4.877833989850519e-06,
"loss": 0.5557,
"step": 1358
},
{
"epoch": 0.6425531914893617,
"grad_norm": 2.925463914871216,
"learning_rate": 4.8776412907378845e-06,
"loss": 0.6382,
"step": 1359
},
{
"epoch": 0.6430260047281324,
"grad_norm": 2.909644365310669,
"learning_rate": 4.877448443579952e-06,
"loss": 0.5603,
"step": 1360
},
{
"epoch": 0.6434988179669031,
"grad_norm": 3.501148223876953,
"learning_rate": 4.8772554483887306e-06,
"loss": 0.6722,
"step": 1361
},
{
"epoch": 0.6439716312056738,
"grad_norm": 2.823765516281128,
"learning_rate": 4.877062305176235e-06,
"loss": 0.6408,
"step": 1362
},
{
"epoch": 0.6444444444444445,
"grad_norm": 2.9807584285736084,
"learning_rate": 4.8768690139544935e-06,
"loss": 0.5984,
"step": 1363
},
{
"epoch": 0.6449172576832152,
"grad_norm": 2.8411378860473633,
"learning_rate": 4.8766755747355405e-06,
"loss": 0.6231,
"step": 1364
},
{
"epoch": 0.6453900709219859,
"grad_norm": 3.158952236175537,
"learning_rate": 4.8764819875314215e-06,
"loss": 0.6441,
"step": 1365
},
{
"epoch": 0.6458628841607565,
"grad_norm": 2.9614369869232178,
"learning_rate": 4.876288252354189e-06,
"loss": 0.6308,
"step": 1366
},
{
"epoch": 0.6463356973995272,
"grad_norm": 3.073805570602417,
"learning_rate": 4.876094369215907e-06,
"loss": 0.6046,
"step": 1367
},
{
"epoch": 0.6468085106382979,
"grad_norm": 2.719189405441284,
"learning_rate": 4.875900338128648e-06,
"loss": 0.6082,
"step": 1368
},
{
"epoch": 0.6472813238770686,
"grad_norm": 2.676726818084717,
"learning_rate": 4.8757061591044914e-06,
"loss": 0.6344,
"step": 1369
},
{
"epoch": 0.6477541371158393,
"grad_norm": 2.955256938934326,
"learning_rate": 4.87551183215553e-06,
"loss": 0.6506,
"step": 1370
},
{
"epoch": 0.64822695035461,
"grad_norm": 2.5672218799591064,
"learning_rate": 4.875317357293864e-06,
"loss": 0.5284,
"step": 1371
},
{
"epoch": 0.6486997635933807,
"grad_norm": 2.5860238075256348,
"learning_rate": 4.875122734531602e-06,
"loss": 0.667,
"step": 1372
},
{
"epoch": 0.6491725768321513,
"grad_norm": 3.1037003993988037,
"learning_rate": 4.8749279638808605e-06,
"loss": 0.6902,
"step": 1373
},
{
"epoch": 0.649645390070922,
"grad_norm": 2.7715282440185547,
"learning_rate": 4.874733045353769e-06,
"loss": 0.6291,
"step": 1374
},
{
"epoch": 0.6501182033096927,
"grad_norm": 2.527071475982666,
"learning_rate": 4.874537978962463e-06,
"loss": 0.5565,
"step": 1375
},
{
"epoch": 0.6505910165484634,
"grad_norm": 2.722092628479004,
"learning_rate": 4.874342764719091e-06,
"loss": 0.5724,
"step": 1376
},
{
"epoch": 0.6510638297872341,
"grad_norm": 2.6342411041259766,
"learning_rate": 4.874147402635805e-06,
"loss": 0.6308,
"step": 1377
},
{
"epoch": 0.6515366430260048,
"grad_norm": 2.3850719928741455,
"learning_rate": 4.8739518927247695e-06,
"loss": 0.5692,
"step": 1378
},
{
"epoch": 0.6520094562647755,
"grad_norm": 2.9787259101867676,
"learning_rate": 4.873756234998161e-06,
"loss": 0.6953,
"step": 1379
},
{
"epoch": 0.6524822695035462,
"grad_norm": 2.634141683578491,
"learning_rate": 4.873560429468159e-06,
"loss": 0.6077,
"step": 1380
},
{
"epoch": 0.6529550827423168,
"grad_norm": 2.803046941757202,
"learning_rate": 4.873364476146958e-06,
"loss": 0.6657,
"step": 1381
},
{
"epoch": 0.6534278959810875,
"grad_norm": 2.762827157974243,
"learning_rate": 4.8731683750467574e-06,
"loss": 0.6061,
"step": 1382
},
{
"epoch": 0.6539007092198581,
"grad_norm": 2.6654391288757324,
"learning_rate": 4.872972126179768e-06,
"loss": 0.6387,
"step": 1383
},
{
"epoch": 0.6543735224586288,
"grad_norm": 2.4363625049591064,
"learning_rate": 4.872775729558209e-06,
"loss": 0.5623,
"step": 1384
},
{
"epoch": 0.6548463356973995,
"grad_norm": 2.528959035873413,
"learning_rate": 4.87257918519431e-06,
"loss": 0.5609,
"step": 1385
},
{
"epoch": 0.6553191489361702,
"grad_norm": 2.718383312225342,
"learning_rate": 4.872382493100309e-06,
"loss": 0.5575,
"step": 1386
},
{
"epoch": 0.6557919621749408,
"grad_norm": 2.660841226577759,
"learning_rate": 4.872185653288453e-06,
"loss": 0.6106,
"step": 1387
},
{
"epoch": 0.6562647754137115,
"grad_norm": 2.508753538131714,
"learning_rate": 4.871988665770997e-06,
"loss": 0.5705,
"step": 1388
},
{
"epoch": 0.6567375886524822,
"grad_norm": 2.5134334564208984,
"learning_rate": 4.871791530560208e-06,
"loss": 0.5592,
"step": 1389
},
{
"epoch": 0.6572104018912529,
"grad_norm": 2.7475597858428955,
"learning_rate": 4.871594247668361e-06,
"loss": 0.6277,
"step": 1390
},
{
"epoch": 0.6576832151300236,
"grad_norm": 2.793616533279419,
"learning_rate": 4.871396817107739e-06,
"loss": 0.595,
"step": 1391
},
{
"epoch": 0.6581560283687943,
"grad_norm": 2.8285086154937744,
"learning_rate": 4.871199238890635e-06,
"loss": 0.6094,
"step": 1392
},
{
"epoch": 0.658628841607565,
"grad_norm": 2.74124813079834,
"learning_rate": 4.871001513029352e-06,
"loss": 0.6296,
"step": 1393
},
{
"epoch": 0.6591016548463356,
"grad_norm": 2.761237621307373,
"learning_rate": 4.870803639536202e-06,
"loss": 0.5702,
"step": 1394
},
{
"epoch": 0.6595744680851063,
"grad_norm": 2.761038064956665,
"learning_rate": 4.870605618423504e-06,
"loss": 0.6195,
"step": 1395
},
{
"epoch": 0.660047281323877,
"grad_norm": 2.8812482357025146,
"learning_rate": 4.870407449703589e-06,
"loss": 0.616,
"step": 1396
},
{
"epoch": 0.6605200945626477,
"grad_norm": 2.9966578483581543,
"learning_rate": 4.870209133388797e-06,
"loss": 0.6547,
"step": 1397
},
{
"epoch": 0.6609929078014184,
"grad_norm": 2.7969017028808594,
"learning_rate": 4.870010669491474e-06,
"loss": 0.5762,
"step": 1398
},
{
"epoch": 0.6614657210401891,
"grad_norm": 2.557783842086792,
"learning_rate": 4.86981205802398e-06,
"loss": 0.6184,
"step": 1399
},
{
"epoch": 0.6619385342789598,
"grad_norm": 2.5393927097320557,
"learning_rate": 4.86961329899868e-06,
"loss": 0.5953,
"step": 1400
},
{
"epoch": 0.6624113475177305,
"grad_norm": 2.7745981216430664,
"learning_rate": 4.86941439242795e-06,
"loss": 0.5967,
"step": 1401
},
{
"epoch": 0.6628841607565011,
"grad_norm": 2.650381326675415,
"learning_rate": 4.869215338324176e-06,
"loss": 0.5667,
"step": 1402
},
{
"epoch": 0.6633569739952718,
"grad_norm": 2.583169937133789,
"learning_rate": 4.869016136699751e-06,
"loss": 0.549,
"step": 1403
},
{
"epoch": 0.6638297872340425,
"grad_norm": 2.984978437423706,
"learning_rate": 4.868816787567079e-06,
"loss": 0.5931,
"step": 1404
},
{
"epoch": 0.6643026004728132,
"grad_norm": 3.1947181224823,
"learning_rate": 4.868617290938573e-06,
"loss": 0.5473,
"step": 1405
},
{
"epoch": 0.6647754137115839,
"grad_norm": 2.562927007675171,
"learning_rate": 4.868417646826654e-06,
"loss": 0.6878,
"step": 1406
},
{
"epoch": 0.6652482269503546,
"grad_norm": 2.8741261959075928,
"learning_rate": 4.868217855243754e-06,
"loss": 0.6312,
"step": 1407
},
{
"epoch": 0.6657210401891253,
"grad_norm": 2.9834797382354736,
"learning_rate": 4.868017916202312e-06,
"loss": 0.5624,
"step": 1408
},
{
"epoch": 0.6661938534278959,
"grad_norm": 2.6935982704162598,
"learning_rate": 4.8678178297147785e-06,
"loss": 0.5857,
"step": 1409
},
{
"epoch": 0.6666666666666666,
"grad_norm": 2.8200576305389404,
"learning_rate": 4.86761759579361e-06,
"loss": 0.6153,
"step": 1410
},
{
"epoch": 0.6671394799054373,
"grad_norm": 2.831425189971924,
"learning_rate": 4.867417214451276e-06,
"loss": 0.6495,
"step": 1411
},
{
"epoch": 0.667612293144208,
"grad_norm": 2.733565092086792,
"learning_rate": 4.867216685700253e-06,
"loss": 0.6036,
"step": 1412
},
{
"epoch": 0.6680851063829787,
"grad_norm": 3.0609400272369385,
"learning_rate": 4.867016009553027e-06,
"loss": 0.6773,
"step": 1413
},
{
"epoch": 0.6685579196217494,
"grad_norm": 2.665452241897583,
"learning_rate": 4.866815186022093e-06,
"loss": 0.6256,
"step": 1414
},
{
"epoch": 0.6690307328605201,
"grad_norm": 2.9480721950531006,
"learning_rate": 4.866614215119956e-06,
"loss": 0.535,
"step": 1415
},
{
"epoch": 0.6695035460992907,
"grad_norm": 2.5514180660247803,
"learning_rate": 4.866413096859128e-06,
"loss": 0.6588,
"step": 1416
},
{
"epoch": 0.6699763593380614,
"grad_norm": 3.3442373275756836,
"learning_rate": 4.866211831252134e-06,
"loss": 0.5754,
"step": 1417
},
{
"epoch": 0.6704491725768321,
"grad_norm": 2.521467685699463,
"learning_rate": 4.866010418311504e-06,
"loss": 0.5546,
"step": 1418
},
{
"epoch": 0.6709219858156028,
"grad_norm": 2.930706262588501,
"learning_rate": 4.865808858049781e-06,
"loss": 0.589,
"step": 1419
},
{
"epoch": 0.6713947990543735,
"grad_norm": 2.6298375129699707,
"learning_rate": 4.865607150479513e-06,
"loss": 0.5915,
"step": 1420
},
{
"epoch": 0.6718676122931442,
"grad_norm": 2.9554293155670166,
"learning_rate": 4.8654052956132615e-06,
"loss": 0.6654,
"step": 1421
},
{
"epoch": 0.6723404255319149,
"grad_norm": 3.2706902027130127,
"learning_rate": 4.865203293463593e-06,
"loss": 0.7115,
"step": 1422
},
{
"epoch": 0.6728132387706856,
"grad_norm": 3.041539430618286,
"learning_rate": 4.865001144043088e-06,
"loss": 0.5818,
"step": 1423
},
{
"epoch": 0.6732860520094562,
"grad_norm": 3.1314544677734375,
"learning_rate": 4.864798847364331e-06,
"loss": 0.5822,
"step": 1424
},
{
"epoch": 0.6737588652482269,
"grad_norm": 2.5301461219787598,
"learning_rate": 4.86459640343992e-06,
"loss": 0.5525,
"step": 1425
},
{
"epoch": 0.6742316784869976,
"grad_norm": 2.809295892715454,
"learning_rate": 4.864393812282458e-06,
"loss": 0.6768,
"step": 1426
},
{
"epoch": 0.6747044917257683,
"grad_norm": 2.794664144515991,
"learning_rate": 4.864191073904562e-06,
"loss": 0.5793,
"step": 1427
},
{
"epoch": 0.675177304964539,
"grad_norm": 2.7771105766296387,
"learning_rate": 4.863988188318854e-06,
"loss": 0.6453,
"step": 1428
},
{
"epoch": 0.6756501182033097,
"grad_norm": 2.6431946754455566,
"learning_rate": 4.863785155537967e-06,
"loss": 0.5877,
"step": 1429
},
{
"epoch": 0.6761229314420804,
"grad_norm": 2.951353073120117,
"learning_rate": 4.863581975574544e-06,
"loss": 0.6793,
"step": 1430
},
{
"epoch": 0.676595744680851,
"grad_norm": 3.1336071491241455,
"learning_rate": 4.863378648441235e-06,
"loss": 0.6695,
"step": 1431
},
{
"epoch": 0.6770685579196217,
"grad_norm": 2.735982656478882,
"learning_rate": 4.8631751741507e-06,
"loss": 0.5239,
"step": 1432
},
{
"epoch": 0.6775413711583924,
"grad_norm": 2.7085206508636475,
"learning_rate": 4.862971552715611e-06,
"loss": 0.6837,
"step": 1433
},
{
"epoch": 0.6780141843971631,
"grad_norm": 3.136528730392456,
"learning_rate": 4.8627677841486436e-06,
"loss": 0.683,
"step": 1434
},
{
"epoch": 0.6784869976359338,
"grad_norm": 2.7879369258880615,
"learning_rate": 4.862563868462486e-06,
"loss": 0.608,
"step": 1435
},
{
"epoch": 0.6789598108747045,
"grad_norm": 2.7937729358673096,
"learning_rate": 4.862359805669837e-06,
"loss": 0.6131,
"step": 1436
},
{
"epoch": 0.6794326241134752,
"grad_norm": 2.5988364219665527,
"learning_rate": 4.862155595783401e-06,
"loss": 0.6303,
"step": 1437
},
{
"epoch": 0.6799054373522458,
"grad_norm": 3.251070499420166,
"learning_rate": 4.861951238815894e-06,
"loss": 0.7246,
"step": 1438
},
{
"epoch": 0.6803782505910165,
"grad_norm": 2.646759271621704,
"learning_rate": 4.861746734780039e-06,
"loss": 0.6313,
"step": 1439
},
{
"epoch": 0.6808510638297872,
"grad_norm": 2.773866891860962,
"learning_rate": 4.861542083688573e-06,
"loss": 0.6463,
"step": 1440
},
{
"epoch": 0.6813238770685579,
"grad_norm": 2.759965658187866,
"learning_rate": 4.861337285554235e-06,
"loss": 0.5428,
"step": 1441
},
{
"epoch": 0.6817966903073286,
"grad_norm": 3.3250818252563477,
"learning_rate": 4.861132340389779e-06,
"loss": 0.6522,
"step": 1442
},
{
"epoch": 0.6822695035460993,
"grad_norm": 2.661797523498535,
"learning_rate": 4.860927248207965e-06,
"loss": 0.5871,
"step": 1443
},
{
"epoch": 0.68274231678487,
"grad_norm": 2.706289052963257,
"learning_rate": 4.860722009021563e-06,
"loss": 0.6651,
"step": 1444
},
{
"epoch": 0.6832151300236406,
"grad_norm": 2.8459298610687256,
"learning_rate": 4.860516622843354e-06,
"loss": 0.5827,
"step": 1445
},
{
"epoch": 0.6836879432624113,
"grad_norm": 3.1041831970214844,
"learning_rate": 4.860311089686125e-06,
"loss": 0.6727,
"step": 1446
},
{
"epoch": 0.684160756501182,
"grad_norm": 2.9382801055908203,
"learning_rate": 4.8601054095626746e-06,
"loss": 0.6002,
"step": 1447
},
{
"epoch": 0.6846335697399527,
"grad_norm": 2.782475471496582,
"learning_rate": 4.859899582485808e-06,
"loss": 0.6951,
"step": 1448
},
{
"epoch": 0.6851063829787234,
"grad_norm": 3.313894510269165,
"learning_rate": 4.859693608468343e-06,
"loss": 0.6363,
"step": 1449
},
{
"epoch": 0.6855791962174941,
"grad_norm": 3.1639695167541504,
"learning_rate": 4.8594874875231045e-06,
"loss": 0.7002,
"step": 1450
},
{
"epoch": 0.6860520094562648,
"grad_norm": 2.6762218475341797,
"learning_rate": 4.859281219662926e-06,
"loss": 0.6246,
"step": 1451
},
{
"epoch": 0.6865248226950355,
"grad_norm": 2.8368663787841797,
"learning_rate": 4.85907480490065e-06,
"loss": 0.5906,
"step": 1452
},
{
"epoch": 0.6869976359338061,
"grad_norm": 2.887373208999634,
"learning_rate": 4.858868243249131e-06,
"loss": 0.5931,
"step": 1453
},
{
"epoch": 0.6874704491725768,
"grad_norm": 2.8115322589874268,
"learning_rate": 4.858661534721229e-06,
"loss": 0.6337,
"step": 1454
},
{
"epoch": 0.6879432624113475,
"grad_norm": 2.8470499515533447,
"learning_rate": 4.8584546793298174e-06,
"loss": 0.632,
"step": 1455
},
{
"epoch": 0.6884160756501182,
"grad_norm": 2.8229613304138184,
"learning_rate": 4.8582476770877725e-06,
"loss": 0.6494,
"step": 1456
},
{
"epoch": 0.6888888888888889,
"grad_norm": 2.4235479831695557,
"learning_rate": 4.858040528007987e-06,
"loss": 0.5709,
"step": 1457
},
{
"epoch": 0.6893617021276596,
"grad_norm": 2.9348199367523193,
"learning_rate": 4.857833232103356e-06,
"loss": 0.5404,
"step": 1458
},
{
"epoch": 0.6898345153664303,
"grad_norm": 2.8274219036102295,
"learning_rate": 4.857625789386789e-06,
"loss": 0.701,
"step": 1459
},
{
"epoch": 0.6903073286052009,
"grad_norm": 3.136929988861084,
"learning_rate": 4.857418199871203e-06,
"loss": 0.6971,
"step": 1460
},
{
"epoch": 0.6907801418439716,
"grad_norm": 2.8987185955047607,
"learning_rate": 4.8572104635695214e-06,
"loss": 0.6613,
"step": 1461
},
{
"epoch": 0.6912529550827423,
"grad_norm": 2.5073442459106445,
"learning_rate": 4.857002580494681e-06,
"loss": 0.6032,
"step": 1462
},
{
"epoch": 0.691725768321513,
"grad_norm": 2.7019522190093994,
"learning_rate": 4.856794550659625e-06,
"loss": 0.567,
"step": 1463
},
{
"epoch": 0.6921985815602837,
"grad_norm": 2.4795594215393066,
"learning_rate": 4.8565863740773054e-06,
"loss": 0.5777,
"step": 1464
},
{
"epoch": 0.6926713947990544,
"grad_norm": 3.032506227493286,
"learning_rate": 4.856378050760687e-06,
"loss": 0.607,
"step": 1465
},
{
"epoch": 0.6931442080378251,
"grad_norm": 3.052091121673584,
"learning_rate": 4.85616958072274e-06,
"loss": 0.591,
"step": 1466
},
{
"epoch": 0.6936170212765957,
"grad_norm": 2.704831838607788,
"learning_rate": 4.855960963976443e-06,
"loss": 0.6528,
"step": 1467
},
{
"epoch": 0.6940898345153664,
"grad_norm": 2.680995225906372,
"learning_rate": 4.855752200534788e-06,
"loss": 0.6294,
"step": 1468
},
{
"epoch": 0.6945626477541371,
"grad_norm": 2.3948659896850586,
"learning_rate": 4.855543290410774e-06,
"loss": 0.6091,
"step": 1469
},
{
"epoch": 0.6950354609929078,
"grad_norm": 2.6407411098480225,
"learning_rate": 4.855334233617407e-06,
"loss": 0.5572,
"step": 1470
},
{
"epoch": 0.6955082742316785,
"grad_norm": 2.5526835918426514,
"learning_rate": 4.8551250301677064e-06,
"loss": 0.5432,
"step": 1471
},
{
"epoch": 0.6959810874704492,
"grad_norm": 3.1237430572509766,
"learning_rate": 4.8549156800746965e-06,
"loss": 0.5944,
"step": 1472
},
{
"epoch": 0.6964539007092199,
"grad_norm": 2.8112540245056152,
"learning_rate": 4.854706183351412e-06,
"loss": 0.604,
"step": 1473
},
{
"epoch": 0.6969267139479906,
"grad_norm": 2.664644479751587,
"learning_rate": 4.8544965400109e-06,
"loss": 0.5647,
"step": 1474
},
{
"epoch": 0.6973995271867612,
"grad_norm": 3.26310133934021,
"learning_rate": 4.854286750066212e-06,
"loss": 0.6999,
"step": 1475
},
{
"epoch": 0.6978723404255319,
"grad_norm": 2.9717442989349365,
"learning_rate": 4.8540768135304115e-06,
"loss": 0.6655,
"step": 1476
},
{
"epoch": 0.6983451536643026,
"grad_norm": 2.5302982330322266,
"learning_rate": 4.85386673041657e-06,
"loss": 0.6384,
"step": 1477
},
{
"epoch": 0.6988179669030733,
"grad_norm": 2.864877700805664,
"learning_rate": 4.853656500737769e-06,
"loss": 0.6834,
"step": 1478
},
{
"epoch": 0.699290780141844,
"grad_norm": 2.5522031784057617,
"learning_rate": 4.853446124507098e-06,
"loss": 0.5929,
"step": 1479
},
{
"epoch": 0.6997635933806147,
"grad_norm": 3.096477746963501,
"learning_rate": 4.853235601737656e-06,
"loss": 0.5737,
"step": 1480
},
{
"epoch": 0.7002364066193854,
"grad_norm": 2.884779214859009,
"learning_rate": 4.853024932442552e-06,
"loss": 0.6362,
"step": 1481
},
{
"epoch": 0.700709219858156,
"grad_norm": 3.368558406829834,
"learning_rate": 4.852814116634903e-06,
"loss": 0.6721,
"step": 1482
},
{
"epoch": 0.7011820330969267,
"grad_norm": 2.742414951324463,
"learning_rate": 4.852603154327837e-06,
"loss": 0.6212,
"step": 1483
},
{
"epoch": 0.7016548463356974,
"grad_norm": 2.53454852104187,
"learning_rate": 4.8523920455344864e-06,
"loss": 0.6675,
"step": 1484
},
{
"epoch": 0.7021276595744681,
"grad_norm": 2.9354238510131836,
"learning_rate": 4.852180790267999e-06,
"loss": 0.6692,
"step": 1485
},
{
"epoch": 0.7026004728132388,
"grad_norm": 2.585070848464966,
"learning_rate": 4.8519693885415274e-06,
"loss": 0.6215,
"step": 1486
},
{
"epoch": 0.7030732860520095,
"grad_norm": 2.9047999382019043,
"learning_rate": 4.851757840368235e-06,
"loss": 0.6231,
"step": 1487
},
{
"epoch": 0.7035460992907802,
"grad_norm": 3.0930933952331543,
"learning_rate": 4.851546145761295e-06,
"loss": 0.7267,
"step": 1488
},
{
"epoch": 0.7040189125295508,
"grad_norm": 3.0224719047546387,
"learning_rate": 4.8513343047338875e-06,
"loss": 0.6293,
"step": 1489
},
{
"epoch": 0.7044917257683215,
"grad_norm": 2.5758471488952637,
"learning_rate": 4.851122317299203e-06,
"loss": 0.5855,
"step": 1490
},
{
"epoch": 0.7049645390070922,
"grad_norm": 2.579272508621216,
"learning_rate": 4.850910183470441e-06,
"loss": 0.582,
"step": 1491
},
{
"epoch": 0.7054373522458629,
"grad_norm": 2.8148300647735596,
"learning_rate": 4.85069790326081e-06,
"loss": 0.6396,
"step": 1492
},
{
"epoch": 0.7059101654846336,
"grad_norm": 2.6380527019500732,
"learning_rate": 4.850485476683528e-06,
"loss": 0.6114,
"step": 1493
},
{
"epoch": 0.7063829787234043,
"grad_norm": 2.7736263275146484,
"learning_rate": 4.850272903751823e-06,
"loss": 0.6683,
"step": 1494
},
{
"epoch": 0.706855791962175,
"grad_norm": 3.1958179473876953,
"learning_rate": 4.8500601844789285e-06,
"loss": 0.6265,
"step": 1495
},
{
"epoch": 0.7073286052009456,
"grad_norm": 3.783212423324585,
"learning_rate": 4.8498473188780916e-06,
"loss": 0.6078,
"step": 1496
},
{
"epoch": 0.7078014184397163,
"grad_norm": 2.6656646728515625,
"learning_rate": 4.849634306962566e-06,
"loss": 0.5756,
"step": 1497
},
{
"epoch": 0.708274231678487,
"grad_norm": 2.757141590118408,
"learning_rate": 4.849421148745615e-06,
"loss": 0.5596,
"step": 1498
},
{
"epoch": 0.7087470449172577,
"grad_norm": 3.0391886234283447,
"learning_rate": 4.849207844240511e-06,
"loss": 0.5293,
"step": 1499
},
{
"epoch": 0.7092198581560284,
"grad_norm": 2.981912851333618,
"learning_rate": 4.848994393460535e-06,
"loss": 0.598,
"step": 1500
},
{
"epoch": 0.7096926713947991,
"grad_norm": 2.5470798015594482,
"learning_rate": 4.848780796418978e-06,
"loss": 0.6266,
"step": 1501
},
{
"epoch": 0.7101654846335698,
"grad_norm": 2.8394415378570557,
"learning_rate": 4.8485670531291415e-06,
"loss": 0.6844,
"step": 1502
},
{
"epoch": 0.7106382978723405,
"grad_norm": 3.2023508548736572,
"learning_rate": 4.848353163604331e-06,
"loss": 0.6134,
"step": 1503
},
{
"epoch": 0.7111111111111111,
"grad_norm": 2.98245906829834,
"learning_rate": 4.848139127857867e-06,
"loss": 0.7084,
"step": 1504
},
{
"epoch": 0.7115839243498818,
"grad_norm": 2.5917441844940186,
"learning_rate": 4.847924945903076e-06,
"loss": 0.5676,
"step": 1505
},
{
"epoch": 0.7120567375886525,
"grad_norm": 2.8736681938171387,
"learning_rate": 4.847710617753294e-06,
"loss": 0.6304,
"step": 1506
},
{
"epoch": 0.7125295508274232,
"grad_norm": 2.7832682132720947,
"learning_rate": 4.847496143421866e-06,
"loss": 0.5705,
"step": 1507
},
{
"epoch": 0.7130023640661939,
"grad_norm": 2.480560779571533,
"learning_rate": 4.847281522922147e-06,
"loss": 0.5595,
"step": 1508
},
{
"epoch": 0.7134751773049646,
"grad_norm": 2.357675313949585,
"learning_rate": 4.847066756267499e-06,
"loss": 0.5065,
"step": 1509
},
{
"epoch": 0.7139479905437353,
"grad_norm": 2.632669448852539,
"learning_rate": 4.846851843471296e-06,
"loss": 0.6949,
"step": 1510
},
{
"epoch": 0.7144208037825059,
"grad_norm": 2.7691073417663574,
"learning_rate": 4.84663678454692e-06,
"loss": 0.6638,
"step": 1511
},
{
"epoch": 0.7148936170212766,
"grad_norm": 2.5647685527801514,
"learning_rate": 4.846421579507761e-06,
"loss": 0.6098,
"step": 1512
},
{
"epoch": 0.7153664302600473,
"grad_norm": 2.476701021194458,
"learning_rate": 4.846206228367218e-06,
"loss": 0.592,
"step": 1513
},
{
"epoch": 0.715839243498818,
"grad_norm": 2.805727958679199,
"learning_rate": 4.845990731138702e-06,
"loss": 0.5466,
"step": 1514
},
{
"epoch": 0.7163120567375887,
"grad_norm": 2.551392078399658,
"learning_rate": 4.84577508783563e-06,
"loss": 0.6039,
"step": 1515
},
{
"epoch": 0.7167848699763594,
"grad_norm": 2.6861350536346436,
"learning_rate": 4.845559298471429e-06,
"loss": 0.6427,
"step": 1516
},
{
"epoch": 0.7172576832151301,
"grad_norm": 3.1908371448516846,
"learning_rate": 4.845343363059535e-06,
"loss": 0.5447,
"step": 1517
},
{
"epoch": 0.7177304964539007,
"grad_norm": 2.9021761417388916,
"learning_rate": 4.845127281613394e-06,
"loss": 0.5836,
"step": 1518
},
{
"epoch": 0.7182033096926714,
"grad_norm": 2.476670742034912,
"learning_rate": 4.844911054146461e-06,
"loss": 0.5863,
"step": 1519
},
{
"epoch": 0.7186761229314421,
"grad_norm": 2.662935495376587,
"learning_rate": 4.844694680672198e-06,
"loss": 0.5678,
"step": 1520
},
{
"epoch": 0.7191489361702128,
"grad_norm": 2.677896738052368,
"learning_rate": 4.844478161204079e-06,
"loss": 0.6195,
"step": 1521
},
{
"epoch": 0.7196217494089835,
"grad_norm": 2.781921863555908,
"learning_rate": 4.844261495755585e-06,
"loss": 0.643,
"step": 1522
},
{
"epoch": 0.7200945626477542,
"grad_norm": 3.0157392024993896,
"learning_rate": 4.844044684340206e-06,
"loss": 0.7559,
"step": 1523
},
{
"epoch": 0.7205673758865249,
"grad_norm": 2.8109354972839355,
"learning_rate": 4.843827726971444e-06,
"loss": 0.6264,
"step": 1524
},
{
"epoch": 0.7210401891252955,
"grad_norm": 3.0953569412231445,
"learning_rate": 4.8436106236628064e-06,
"loss": 0.6429,
"step": 1525
},
{
"epoch": 0.7215130023640662,
"grad_norm": 2.6850643157958984,
"learning_rate": 4.843393374427812e-06,
"loss": 0.6598,
"step": 1526
},
{
"epoch": 0.7219858156028369,
"grad_norm": 3.043480634689331,
"learning_rate": 4.8431759792799874e-06,
"loss": 0.6331,
"step": 1527
},
{
"epoch": 0.7224586288416076,
"grad_norm": 2.723870038986206,
"learning_rate": 4.842958438232868e-06,
"loss": 0.6259,
"step": 1528
},
{
"epoch": 0.7229314420803783,
"grad_norm": 2.822492837905884,
"learning_rate": 4.842740751300002e-06,
"loss": 0.6554,
"step": 1529
},
{
"epoch": 0.723404255319149,
"grad_norm": 2.7866315841674805,
"learning_rate": 4.842522918494941e-06,
"loss": 0.6991,
"step": 1530
},
{
"epoch": 0.7238770685579197,
"grad_norm": 2.8881826400756836,
"learning_rate": 4.84230493983125e-06,
"loss": 0.5876,
"step": 1531
},
{
"epoch": 0.7243498817966904,
"grad_norm": 2.7456939220428467,
"learning_rate": 4.8420868153225e-06,
"loss": 0.6188,
"step": 1532
},
{
"epoch": 0.724822695035461,
"grad_norm": 3.0257532596588135,
"learning_rate": 4.841868544982274e-06,
"loss": 0.63,
"step": 1533
},
{
"epoch": 0.7252955082742317,
"grad_norm": 3.1581954956054688,
"learning_rate": 4.841650128824164e-06,
"loss": 0.7214,
"step": 1534
},
{
"epoch": 0.7257683215130024,
"grad_norm": 2.9174306392669678,
"learning_rate": 4.841431566861767e-06,
"loss": 0.704,
"step": 1535
},
{
"epoch": 0.7262411347517731,
"grad_norm": 2.5019054412841797,
"learning_rate": 4.8412128591086935e-06,
"loss": 0.6298,
"step": 1536
},
{
"epoch": 0.7267139479905438,
"grad_norm": 2.724285125732422,
"learning_rate": 4.840994005578562e-06,
"loss": 0.6289,
"step": 1537
},
{
"epoch": 0.7271867612293145,
"grad_norm": 2.5882341861724854,
"learning_rate": 4.840775006284998e-06,
"loss": 0.6355,
"step": 1538
},
{
"epoch": 0.7276595744680852,
"grad_norm": 3.1281991004943848,
"learning_rate": 4.840555861241638e-06,
"loss": 0.5551,
"step": 1539
},
{
"epoch": 0.7281323877068558,
"grad_norm": 2.6064817905426025,
"learning_rate": 4.840336570462127e-06,
"loss": 0.5543,
"step": 1540
},
{
"epoch": 0.7286052009456265,
"grad_norm": 2.67112398147583,
"learning_rate": 4.840117133960122e-06,
"loss": 0.6044,
"step": 1541
},
{
"epoch": 0.7290780141843972,
"grad_norm": 2.838022232055664,
"learning_rate": 4.839897551749282e-06,
"loss": 0.6814,
"step": 1542
},
{
"epoch": 0.7295508274231679,
"grad_norm": 2.8897151947021484,
"learning_rate": 4.839677823843283e-06,
"loss": 0.593,
"step": 1543
},
{
"epoch": 0.7300236406619386,
"grad_norm": 2.9238014221191406,
"learning_rate": 4.839457950255805e-06,
"loss": 0.5544,
"step": 1544
},
{
"epoch": 0.7304964539007093,
"grad_norm": 3.016876459121704,
"learning_rate": 4.839237931000538e-06,
"loss": 0.6099,
"step": 1545
},
{
"epoch": 0.7309692671394799,
"grad_norm": 2.9415392875671387,
"learning_rate": 4.839017766091182e-06,
"loss": 0.6413,
"step": 1546
},
{
"epoch": 0.7314420803782505,
"grad_norm": 2.658067226409912,
"learning_rate": 4.838797455541446e-06,
"loss": 0.6534,
"step": 1547
},
{
"epoch": 0.7319148936170212,
"grad_norm": 2.460358142852783,
"learning_rate": 4.838576999365049e-06,
"loss": 0.5307,
"step": 1548
},
{
"epoch": 0.7323877068557919,
"grad_norm": 2.5818674564361572,
"learning_rate": 4.838356397575716e-06,
"loss": 0.6265,
"step": 1549
},
{
"epoch": 0.7328605200945626,
"grad_norm": 3.009197473526001,
"learning_rate": 4.838135650187183e-06,
"loss": 0.6957,
"step": 1550
},
{
"epoch": 0.7333333333333333,
"grad_norm": 2.738543748855591,
"learning_rate": 4.837914757213196e-06,
"loss": 0.646,
"step": 1551
},
{
"epoch": 0.733806146572104,
"grad_norm": 2.8208494186401367,
"learning_rate": 4.837693718667508e-06,
"loss": 0.5936,
"step": 1552
},
{
"epoch": 0.7342789598108747,
"grad_norm": 3.1574649810791016,
"learning_rate": 4.837472534563883e-06,
"loss": 0.6455,
"step": 1553
},
{
"epoch": 0.7347517730496453,
"grad_norm": 2.6737420558929443,
"learning_rate": 4.837251204916093e-06,
"loss": 0.5921,
"step": 1554
},
{
"epoch": 0.735224586288416,
"grad_norm": 2.424983024597168,
"learning_rate": 4.837029729737918e-06,
"loss": 0.6346,
"step": 1555
},
{
"epoch": 0.7356973995271867,
"grad_norm": 2.5163493156433105,
"learning_rate": 4.836808109043151e-06,
"loss": 0.6061,
"step": 1556
},
{
"epoch": 0.7361702127659574,
"grad_norm": 2.8377044200897217,
"learning_rate": 4.836586342845588e-06,
"loss": 0.611,
"step": 1557
},
{
"epoch": 0.7366430260047281,
"grad_norm": 2.5929181575775146,
"learning_rate": 4.83636443115904e-06,
"loss": 0.5496,
"step": 1558
},
{
"epoch": 0.7371158392434988,
"grad_norm": 2.5017223358154297,
"learning_rate": 4.836142373997323e-06,
"loss": 0.6235,
"step": 1559
},
{
"epoch": 0.7375886524822695,
"grad_norm": 2.822500228881836,
"learning_rate": 4.835920171374265e-06,
"loss": 0.6147,
"step": 1560
},
{
"epoch": 0.7380614657210401,
"grad_norm": 2.7234230041503906,
"learning_rate": 4.8356978233037e-06,
"loss": 0.6228,
"step": 1561
},
{
"epoch": 0.7385342789598108,
"grad_norm": 2.9565515518188477,
"learning_rate": 4.835475329799472e-06,
"loss": 0.5728,
"step": 1562
},
{
"epoch": 0.7390070921985815,
"grad_norm": 2.4356038570404053,
"learning_rate": 4.835252690875438e-06,
"loss": 0.6723,
"step": 1563
},
{
"epoch": 0.7394799054373522,
"grad_norm": 2.765913248062134,
"learning_rate": 4.835029906545458e-06,
"loss": 0.5805,
"step": 1564
},
{
"epoch": 0.7399527186761229,
"grad_norm": 2.4481914043426514,
"learning_rate": 4.834806976823405e-06,
"loss": 0.599,
"step": 1565
},
{
"epoch": 0.7404255319148936,
"grad_norm": 2.620779514312744,
"learning_rate": 4.834583901723158e-06,
"loss": 0.63,
"step": 1566
},
{
"epoch": 0.7408983451536643,
"grad_norm": 2.654426097869873,
"learning_rate": 4.83436068125861e-06,
"loss": 0.6544,
"step": 1567
},
{
"epoch": 0.741371158392435,
"grad_norm": 2.589623212814331,
"learning_rate": 4.834137315443656e-06,
"loss": 0.5596,
"step": 1568
},
{
"epoch": 0.7418439716312056,
"grad_norm": 2.572883129119873,
"learning_rate": 4.833913804292209e-06,
"loss": 0.5974,
"step": 1569
},
{
"epoch": 0.7423167848699763,
"grad_norm": 2.8744914531707764,
"learning_rate": 4.833690147818181e-06,
"loss": 0.5364,
"step": 1570
},
{
"epoch": 0.742789598108747,
"grad_norm": 2.9800851345062256,
"learning_rate": 4.833466346035502e-06,
"loss": 0.6287,
"step": 1571
},
{
"epoch": 0.7432624113475177,
"grad_norm": 2.627784490585327,
"learning_rate": 4.833242398958105e-06,
"loss": 0.621,
"step": 1572
},
{
"epoch": 0.7437352245862884,
"grad_norm": 2.5187721252441406,
"learning_rate": 4.833018306599933e-06,
"loss": 0.5901,
"step": 1573
},
{
"epoch": 0.7442080378250591,
"grad_norm": 2.4843688011169434,
"learning_rate": 4.832794068974944e-06,
"loss": 0.6336,
"step": 1574
},
{
"epoch": 0.7446808510638298,
"grad_norm": 2.774911880493164,
"learning_rate": 4.832569686097096e-06,
"loss": 0.6091,
"step": 1575
},
{
"epoch": 0.7451536643026004,
"grad_norm": 3.2562527656555176,
"learning_rate": 4.8323451579803615e-06,
"loss": 0.7686,
"step": 1576
},
{
"epoch": 0.7456264775413711,
"grad_norm": 2.799570083618164,
"learning_rate": 4.832120484638721e-06,
"loss": 0.6233,
"step": 1577
},
{
"epoch": 0.7460992907801418,
"grad_norm": 2.661893367767334,
"learning_rate": 4.831895666086164e-06,
"loss": 0.5841,
"step": 1578
},
{
"epoch": 0.7465721040189125,
"grad_norm": 3.0382652282714844,
"learning_rate": 4.831670702336689e-06,
"loss": 0.5769,
"step": 1579
},
{
"epoch": 0.7470449172576832,
"grad_norm": 2.676398515701294,
"learning_rate": 4.831445593404304e-06,
"loss": 0.619,
"step": 1580
},
{
"epoch": 0.7475177304964539,
"grad_norm": 2.717916965484619,
"learning_rate": 4.831220339303024e-06,
"loss": 0.5787,
"step": 1581
},
{
"epoch": 0.7479905437352246,
"grad_norm": 2.3918066024780273,
"learning_rate": 4.830994940046876e-06,
"loss": 0.5108,
"step": 1582
},
{
"epoch": 0.7484633569739952,
"grad_norm": 2.709144115447998,
"learning_rate": 4.830769395649895e-06,
"loss": 0.6875,
"step": 1583
},
{
"epoch": 0.7489361702127659,
"grad_norm": 2.8711116313934326,
"learning_rate": 4.830543706126123e-06,
"loss": 0.6745,
"step": 1584
},
{
"epoch": 0.7494089834515366,
"grad_norm": 2.612339496612549,
"learning_rate": 4.830317871489614e-06,
"loss": 0.5738,
"step": 1585
},
{
"epoch": 0.7498817966903073,
"grad_norm": 2.4355857372283936,
"learning_rate": 4.830091891754429e-06,
"loss": 0.5907,
"step": 1586
},
{
"epoch": 0.750354609929078,
"grad_norm": 2.676051378250122,
"learning_rate": 4.829865766934638e-06,
"loss": 0.6628,
"step": 1587
},
{
"epoch": 0.7508274231678487,
"grad_norm": 2.66489839553833,
"learning_rate": 4.829639497044323e-06,
"loss": 0.5984,
"step": 1588
},
{
"epoch": 0.7513002364066194,
"grad_norm": 2.5358035564422607,
"learning_rate": 4.829413082097572e-06,
"loss": 0.5867,
"step": 1589
},
{
"epoch": 0.75177304964539,
"grad_norm": 2.6530144214630127,
"learning_rate": 4.8291865221084815e-06,
"loss": 0.5917,
"step": 1590
},
{
"epoch": 0.7522458628841607,
"grad_norm": 2.5160958766937256,
"learning_rate": 4.82895981709116e-06,
"loss": 0.6347,
"step": 1591
},
{
"epoch": 0.7527186761229314,
"grad_norm": 2.61592698097229,
"learning_rate": 4.8287329670597225e-06,
"loss": 0.5472,
"step": 1592
},
{
"epoch": 0.7531914893617021,
"grad_norm": 2.7528622150421143,
"learning_rate": 4.828505972028296e-06,
"loss": 0.5842,
"step": 1593
},
{
"epoch": 0.7536643026004728,
"grad_norm": 2.8154072761535645,
"learning_rate": 4.828278832011011e-06,
"loss": 0.5757,
"step": 1594
},
{
"epoch": 0.7541371158392435,
"grad_norm": 3.118515729904175,
"learning_rate": 4.828051547022013e-06,
"loss": 0.6472,
"step": 1595
},
{
"epoch": 0.7546099290780142,
"grad_norm": 2.452033758163452,
"learning_rate": 4.827824117075453e-06,
"loss": 0.5571,
"step": 1596
},
{
"epoch": 0.7550827423167848,
"grad_norm": 2.984388828277588,
"learning_rate": 4.827596542185492e-06,
"loss": 0.6656,
"step": 1597
},
{
"epoch": 0.7555555555555555,
"grad_norm": 2.61356782913208,
"learning_rate": 4.8273688223663014e-06,
"loss": 0.6444,
"step": 1598
},
{
"epoch": 0.7560283687943262,
"grad_norm": 2.8967196941375732,
"learning_rate": 4.8271409576320595e-06,
"loss": 0.6457,
"step": 1599
},
{
"epoch": 0.7565011820330969,
"grad_norm": 2.852367639541626,
"learning_rate": 4.826912947996954e-06,
"loss": 0.5629,
"step": 1600
},
{
"epoch": 0.7569739952718676,
"grad_norm": 2.905280590057373,
"learning_rate": 4.826684793475182e-06,
"loss": 0.6245,
"step": 1601
},
{
"epoch": 0.7574468085106383,
"grad_norm": 2.6156530380249023,
"learning_rate": 4.826456494080951e-06,
"loss": 0.5869,
"step": 1602
},
{
"epoch": 0.757919621749409,
"grad_norm": 2.6490228176116943,
"learning_rate": 4.826228049828475e-06,
"loss": 0.5461,
"step": 1603
},
{
"epoch": 0.7583924349881797,
"grad_norm": 2.9626693725585938,
"learning_rate": 4.825999460731978e-06,
"loss": 0.6842,
"step": 1604
},
{
"epoch": 0.7588652482269503,
"grad_norm": 2.6866023540496826,
"learning_rate": 4.825770726805695e-06,
"loss": 0.5726,
"step": 1605
},
{
"epoch": 0.759338061465721,
"grad_norm": 2.5525858402252197,
"learning_rate": 4.825541848063866e-06,
"loss": 0.6061,
"step": 1606
},
{
"epoch": 0.7598108747044917,
"grad_norm": 2.703977584838867,
"learning_rate": 4.825312824520743e-06,
"loss": 0.6726,
"step": 1607
},
{
"epoch": 0.7602836879432624,
"grad_norm": 2.856534957885742,
"learning_rate": 4.825083656190588e-06,
"loss": 0.625,
"step": 1608
},
{
"epoch": 0.7607565011820331,
"grad_norm": 2.8564887046813965,
"learning_rate": 4.824854343087668e-06,
"loss": 0.7251,
"step": 1609
},
{
"epoch": 0.7612293144208038,
"grad_norm": 2.327650308609009,
"learning_rate": 4.824624885226262e-06,
"loss": 0.526,
"step": 1610
},
{
"epoch": 0.7617021276595745,
"grad_norm": 3.0025737285614014,
"learning_rate": 4.824395282620659e-06,
"loss": 0.6043,
"step": 1611
},
{
"epoch": 0.7621749408983451,
"grad_norm": 2.5441737174987793,
"learning_rate": 4.824165535285152e-06,
"loss": 0.6276,
"step": 1612
},
{
"epoch": 0.7626477541371158,
"grad_norm": 2.4177372455596924,
"learning_rate": 4.823935643234049e-06,
"loss": 0.6419,
"step": 1613
},
{
"epoch": 0.7631205673758865,
"grad_norm": 2.9210550785064697,
"learning_rate": 4.823705606481664e-06,
"loss": 0.5663,
"step": 1614
},
{
"epoch": 0.7635933806146572,
"grad_norm": 2.6353724002838135,
"learning_rate": 4.82347542504232e-06,
"loss": 0.5669,
"step": 1615
},
{
"epoch": 0.7640661938534279,
"grad_norm": 2.419081926345825,
"learning_rate": 4.823245098930349e-06,
"loss": 0.5777,
"step": 1616
},
{
"epoch": 0.7645390070921986,
"grad_norm": 2.5077571868896484,
"learning_rate": 4.823014628160093e-06,
"loss": 0.5924,
"step": 1617
},
{
"epoch": 0.7650118203309693,
"grad_norm": 2.816056251525879,
"learning_rate": 4.822784012745902e-06,
"loss": 0.7273,
"step": 1618
},
{
"epoch": 0.76548463356974,
"grad_norm": 2.7163147926330566,
"learning_rate": 4.8225532527021366e-06,
"loss": 0.5545,
"step": 1619
},
{
"epoch": 0.7659574468085106,
"grad_norm": 2.4784302711486816,
"learning_rate": 4.822322348043164e-06,
"loss": 0.556,
"step": 1620
},
{
"epoch": 0.7664302600472813,
"grad_norm": 2.712467670440674,
"learning_rate": 4.822091298783361e-06,
"loss": 0.6501,
"step": 1621
},
{
"epoch": 0.766903073286052,
"grad_norm": 2.7217724323272705,
"learning_rate": 4.821860104937115e-06,
"loss": 0.5989,
"step": 1622
},
{
"epoch": 0.7673758865248227,
"grad_norm": 2.5622854232788086,
"learning_rate": 4.821628766518821e-06,
"loss": 0.5263,
"step": 1623
},
{
"epoch": 0.7678486997635934,
"grad_norm": 3.230923891067505,
"learning_rate": 4.821397283542884e-06,
"loss": 0.6707,
"step": 1624
},
{
"epoch": 0.7683215130023641,
"grad_norm": 2.37929105758667,
"learning_rate": 4.821165656023718e-06,
"loss": 0.6124,
"step": 1625
},
{
"epoch": 0.7687943262411348,
"grad_norm": 2.9811325073242188,
"learning_rate": 4.820933883975745e-06,
"loss": 0.6435,
"step": 1626
},
{
"epoch": 0.7692671394799054,
"grad_norm": 2.887380838394165,
"learning_rate": 4.820701967413395e-06,
"loss": 0.621,
"step": 1627
},
{
"epoch": 0.7697399527186761,
"grad_norm": 2.6762876510620117,
"learning_rate": 4.820469906351109e-06,
"loss": 0.5713,
"step": 1628
},
{
"epoch": 0.7702127659574468,
"grad_norm": 2.7347512245178223,
"learning_rate": 4.820237700803337e-06,
"loss": 0.6136,
"step": 1629
},
{
"epoch": 0.7706855791962175,
"grad_norm": 2.7244746685028076,
"learning_rate": 4.820005350784539e-06,
"loss": 0.5816,
"step": 1630
},
{
"epoch": 0.7711583924349882,
"grad_norm": 2.9293999671936035,
"learning_rate": 4.8197728563091795e-06,
"loss": 0.6649,
"step": 1631
},
{
"epoch": 0.7716312056737589,
"grad_norm": 2.4402127265930176,
"learning_rate": 4.819540217391736e-06,
"loss": 0.6481,
"step": 1632
},
{
"epoch": 0.7721040189125296,
"grad_norm": 3.083941698074341,
"learning_rate": 4.819307434046694e-06,
"loss": 0.6951,
"step": 1633
},
{
"epoch": 0.7725768321513002,
"grad_norm": 2.544952392578125,
"learning_rate": 4.819074506288548e-06,
"loss": 0.539,
"step": 1634
},
{
"epoch": 0.7730496453900709,
"grad_norm": 2.7791268825531006,
"learning_rate": 4.818841434131801e-06,
"loss": 0.5827,
"step": 1635
},
{
"epoch": 0.7735224586288416,
"grad_norm": 2.7349796295166016,
"learning_rate": 4.818608217590967e-06,
"loss": 0.5584,
"step": 1636
},
{
"epoch": 0.7739952718676123,
"grad_norm": 2.637652635574341,
"learning_rate": 4.818374856680565e-06,
"loss": 0.6386,
"step": 1637
},
{
"epoch": 0.774468085106383,
"grad_norm": 2.9821584224700928,
"learning_rate": 4.818141351415127e-06,
"loss": 0.6734,
"step": 1638
},
{
"epoch": 0.7749408983451537,
"grad_norm": 2.992938995361328,
"learning_rate": 4.817907701809192e-06,
"loss": 0.5899,
"step": 1639
},
{
"epoch": 0.7754137115839244,
"grad_norm": 4.35719633102417,
"learning_rate": 4.8176739078773076e-06,
"loss": 0.6281,
"step": 1640
},
{
"epoch": 0.775886524822695,
"grad_norm": 2.838146209716797,
"learning_rate": 4.8174399696340315e-06,
"loss": 0.5766,
"step": 1641
},
{
"epoch": 0.7763593380614657,
"grad_norm": 3.3116989135742188,
"learning_rate": 4.81720588709393e-06,
"loss": 0.6409,
"step": 1642
},
{
"epoch": 0.7768321513002364,
"grad_norm": 2.9843590259552,
"learning_rate": 4.816971660271579e-06,
"loss": 0.6108,
"step": 1643
},
{
"epoch": 0.7773049645390071,
"grad_norm": 2.843770742416382,
"learning_rate": 4.816737289181562e-06,
"loss": 0.6053,
"step": 1644
},
{
"epoch": 0.7777777777777778,
"grad_norm": 2.7608556747436523,
"learning_rate": 4.816502773838473e-06,
"loss": 0.5854,
"step": 1645
},
{
"epoch": 0.7782505910165485,
"grad_norm": 3.343682289123535,
"learning_rate": 4.816268114256914e-06,
"loss": 0.6329,
"step": 1646
},
{
"epoch": 0.7787234042553192,
"grad_norm": 2.769768476486206,
"learning_rate": 4.816033310451496e-06,
"loss": 0.6242,
"step": 1647
},
{
"epoch": 0.7791962174940898,
"grad_norm": 2.989851713180542,
"learning_rate": 4.815798362436838e-06,
"loss": 0.6493,
"step": 1648
},
{
"epoch": 0.7796690307328605,
"grad_norm": 3.170736312866211,
"learning_rate": 4.8155632702275716e-06,
"loss": 0.6341,
"step": 1649
},
{
"epoch": 0.7801418439716312,
"grad_norm": 2.7372522354125977,
"learning_rate": 4.815328033838334e-06,
"loss": 0.5445,
"step": 1650
},
{
"epoch": 0.7806146572104019,
"grad_norm": 2.6947238445281982,
"learning_rate": 4.8150926532837715e-06,
"loss": 0.6437,
"step": 1651
},
{
"epoch": 0.7810874704491726,
"grad_norm": 2.472323179244995,
"learning_rate": 4.81485712857854e-06,
"loss": 0.5751,
"step": 1652
},
{
"epoch": 0.7815602836879433,
"grad_norm": 2.791114091873169,
"learning_rate": 4.814621459737308e-06,
"loss": 0.5996,
"step": 1653
},
{
"epoch": 0.782033096926714,
"grad_norm": 3.1957521438598633,
"learning_rate": 4.814385646774745e-06,
"loss": 0.5803,
"step": 1654
},
{
"epoch": 0.7825059101654847,
"grad_norm": 2.4120798110961914,
"learning_rate": 4.8141496897055364e-06,
"loss": 0.5814,
"step": 1655
},
{
"epoch": 0.7829787234042553,
"grad_norm": 2.9262423515319824,
"learning_rate": 4.813913588544374e-06,
"loss": 0.6292,
"step": 1656
},
{
"epoch": 0.783451536643026,
"grad_norm": 2.8251047134399414,
"learning_rate": 4.813677343305959e-06,
"loss": 0.6787,
"step": 1657
},
{
"epoch": 0.7839243498817967,
"grad_norm": 2.931659698486328,
"learning_rate": 4.8134409540050005e-06,
"loss": 0.6163,
"step": 1658
},
{
"epoch": 0.7843971631205674,
"grad_norm": 2.7160706520080566,
"learning_rate": 4.813204420656219e-06,
"loss": 0.6831,
"step": 1659
},
{
"epoch": 0.7848699763593381,
"grad_norm": 3.2134454250335693,
"learning_rate": 4.81296774327434e-06,
"loss": 0.6002,
"step": 1660
},
{
"epoch": 0.7853427895981088,
"grad_norm": 2.4002513885498047,
"learning_rate": 4.812730921874103e-06,
"loss": 0.5488,
"step": 1661
},
{
"epoch": 0.7858156028368795,
"grad_norm": 2.5559282302856445,
"learning_rate": 4.812493956470251e-06,
"loss": 0.5802,
"step": 1662
},
{
"epoch": 0.7862884160756501,
"grad_norm": 2.57478404045105,
"learning_rate": 4.812256847077541e-06,
"loss": 0.646,
"step": 1663
},
{
"epoch": 0.7867612293144208,
"grad_norm": 2.811851978302002,
"learning_rate": 4.812019593710736e-06,
"loss": 0.6245,
"step": 1664
},
{
"epoch": 0.7872340425531915,
"grad_norm": 2.5228829383850098,
"learning_rate": 4.811782196384609e-06,
"loss": 0.5949,
"step": 1665
},
{
"epoch": 0.7877068557919622,
"grad_norm": 2.744096040725708,
"learning_rate": 4.8115446551139415e-06,
"loss": 0.6006,
"step": 1666
},
{
"epoch": 0.7881796690307329,
"grad_norm": 3.129242420196533,
"learning_rate": 4.811306969913524e-06,
"loss": 0.7251,
"step": 1667
},
{
"epoch": 0.7886524822695036,
"grad_norm": 2.7855660915374756,
"learning_rate": 4.811069140798156e-06,
"loss": 0.6534,
"step": 1668
},
{
"epoch": 0.7891252955082743,
"grad_norm": 2.836603879928589,
"learning_rate": 4.810831167782647e-06,
"loss": 0.6661,
"step": 1669
},
{
"epoch": 0.789598108747045,
"grad_norm": 2.5339887142181396,
"learning_rate": 4.810593050881813e-06,
"loss": 0.5354,
"step": 1670
},
{
"epoch": 0.7900709219858156,
"grad_norm": 2.9553709030151367,
"learning_rate": 4.810354790110482e-06,
"loss": 0.6001,
"step": 1671
},
{
"epoch": 0.7905437352245863,
"grad_norm": 2.6581788063049316,
"learning_rate": 4.8101163854834885e-06,
"loss": 0.6802,
"step": 1672
},
{
"epoch": 0.791016548463357,
"grad_norm": 3.2002551555633545,
"learning_rate": 4.809877837015677e-06,
"loss": 0.6641,
"step": 1673
},
{
"epoch": 0.7914893617021277,
"grad_norm": 2.918792963027954,
"learning_rate": 4.809639144721902e-06,
"loss": 0.6758,
"step": 1674
},
{
"epoch": 0.7919621749408984,
"grad_norm": 2.7993946075439453,
"learning_rate": 4.8094003086170245e-06,
"loss": 0.5889,
"step": 1675
},
{
"epoch": 0.7924349881796691,
"grad_norm": 2.3698952198028564,
"learning_rate": 4.809161328715916e-06,
"loss": 0.6244,
"step": 1676
},
{
"epoch": 0.7929078014184398,
"grad_norm": 2.8891594409942627,
"learning_rate": 4.808922205033458e-06,
"loss": 0.5835,
"step": 1677
},
{
"epoch": 0.7933806146572104,
"grad_norm": 2.838345766067505,
"learning_rate": 4.808682937584537e-06,
"loss": 0.6907,
"step": 1678
},
{
"epoch": 0.7938534278959811,
"grad_norm": 2.8443174362182617,
"learning_rate": 4.808443526384053e-06,
"loss": 0.6692,
"step": 1679
},
{
"epoch": 0.7943262411347518,
"grad_norm": 2.7355034351348877,
"learning_rate": 4.808203971446913e-06,
"loss": 0.5799,
"step": 1680
},
{
"epoch": 0.7947990543735225,
"grad_norm": 2.7108020782470703,
"learning_rate": 4.807964272788033e-06,
"loss": 0.652,
"step": 1681
},
{
"epoch": 0.7952718676122932,
"grad_norm": 2.397650957107544,
"learning_rate": 4.807724430422338e-06,
"loss": 0.5418,
"step": 1682
},
{
"epoch": 0.7957446808510639,
"grad_norm": 2.4981582164764404,
"learning_rate": 4.807484444364762e-06,
"loss": 0.5731,
"step": 1683
},
{
"epoch": 0.7962174940898346,
"grad_norm": 2.7943713665008545,
"learning_rate": 4.8072443146302475e-06,
"loss": 0.5913,
"step": 1684
},
{
"epoch": 0.7966903073286052,
"grad_norm": 2.5691423416137695,
"learning_rate": 4.807004041233746e-06,
"loss": 0.6475,
"step": 1685
},
{
"epoch": 0.7971631205673759,
"grad_norm": 3.2367498874664307,
"learning_rate": 4.8067636241902195e-06,
"loss": 0.675,
"step": 1686
},
{
"epoch": 0.7976359338061466,
"grad_norm": 3.000595808029175,
"learning_rate": 4.806523063514637e-06,
"loss": 0.5481,
"step": 1687
},
{
"epoch": 0.7981087470449173,
"grad_norm": 2.702014207839966,
"learning_rate": 4.806282359221976e-06,
"loss": 0.5993,
"step": 1688
},
{
"epoch": 0.798581560283688,
"grad_norm": 2.383671998977661,
"learning_rate": 4.806041511327226e-06,
"loss": 0.562,
"step": 1689
},
{
"epoch": 0.7990543735224587,
"grad_norm": 2.6965041160583496,
"learning_rate": 4.8058005198453834e-06,
"loss": 0.5955,
"step": 1690
},
{
"epoch": 0.7995271867612294,
"grad_norm": 2.5906765460968018,
"learning_rate": 4.805559384791453e-06,
"loss": 0.5151,
"step": 1691
},
{
"epoch": 0.8,
"grad_norm": 2.5454652309417725,
"learning_rate": 4.8053181061804475e-06,
"loss": 0.5843,
"step": 1692
},
{
"epoch": 0.8004728132387707,
"grad_norm": 2.661343812942505,
"learning_rate": 4.8050766840273935e-06,
"loss": 0.5995,
"step": 1693
},
{
"epoch": 0.8009456264775414,
"grad_norm": 2.5635924339294434,
"learning_rate": 4.8048351183473215e-06,
"loss": 0.5676,
"step": 1694
},
{
"epoch": 0.8014184397163121,
"grad_norm": 2.5936667919158936,
"learning_rate": 4.804593409155274e-06,
"loss": 0.6291,
"step": 1695
},
{
"epoch": 0.8018912529550828,
"grad_norm": 2.6902432441711426,
"learning_rate": 4.804351556466299e-06,
"loss": 0.6114,
"step": 1696
},
{
"epoch": 0.8023640661938535,
"grad_norm": 2.7764673233032227,
"learning_rate": 4.804109560295457e-06,
"loss": 0.5768,
"step": 1697
},
{
"epoch": 0.8028368794326242,
"grad_norm": 2.9587221145629883,
"learning_rate": 4.803867420657816e-06,
"loss": 0.6048,
"step": 1698
},
{
"epoch": 0.8033096926713948,
"grad_norm": 2.9238998889923096,
"learning_rate": 4.803625137568453e-06,
"loss": 0.6329,
"step": 1699
},
{
"epoch": 0.8037825059101655,
"grad_norm": 2.70473313331604,
"learning_rate": 4.803382711042455e-06,
"loss": 0.5427,
"step": 1700
},
{
"epoch": 0.8042553191489362,
"grad_norm": 3.1604979038238525,
"learning_rate": 4.803140141094914e-06,
"loss": 0.626,
"step": 1701
},
{
"epoch": 0.8047281323877069,
"grad_norm": 2.9567699432373047,
"learning_rate": 4.802897427740936e-06,
"loss": 0.5319,
"step": 1702
},
{
"epoch": 0.8052009456264776,
"grad_norm": 2.90983247756958,
"learning_rate": 4.802654570995632e-06,
"loss": 0.586,
"step": 1703
},
{
"epoch": 0.8056737588652483,
"grad_norm": 2.783480167388916,
"learning_rate": 4.8024115708741255e-06,
"loss": 0.5773,
"step": 1704
},
{
"epoch": 0.806146572104019,
"grad_norm": 3.3307793140411377,
"learning_rate": 4.802168427391547e-06,
"loss": 0.6257,
"step": 1705
},
{
"epoch": 0.8066193853427897,
"grad_norm": 3.0475001335144043,
"learning_rate": 4.801925140563034e-06,
"loss": 0.6612,
"step": 1706
},
{
"epoch": 0.8070921985815603,
"grad_norm": 2.8278894424438477,
"learning_rate": 4.8016817104037375e-06,
"loss": 0.6449,
"step": 1707
},
{
"epoch": 0.807565011820331,
"grad_norm": 2.760244369506836,
"learning_rate": 4.801438136928812e-06,
"loss": 0.7007,
"step": 1708
},
{
"epoch": 0.8080378250591016,
"grad_norm": 2.827634572982788,
"learning_rate": 4.801194420153427e-06,
"loss": 0.6418,
"step": 1709
},
{
"epoch": 0.8085106382978723,
"grad_norm": 2.8655009269714355,
"learning_rate": 4.800950560092754e-06,
"loss": 0.6231,
"step": 1710
},
{
"epoch": 0.808983451536643,
"grad_norm": 2.738112688064575,
"learning_rate": 4.800706556761981e-06,
"loss": 0.6463,
"step": 1711
},
{
"epoch": 0.8094562647754137,
"grad_norm": 2.4781179428100586,
"learning_rate": 4.800462410176296e-06,
"loss": 0.5365,
"step": 1712
},
{
"epoch": 0.8099290780141843,
"grad_norm": 2.6049838066101074,
"learning_rate": 4.800218120350906e-06,
"loss": 0.6035,
"step": 1713
},
{
"epoch": 0.810401891252955,
"grad_norm": 2.9089980125427246,
"learning_rate": 4.79997368730102e-06,
"loss": 0.5828,
"step": 1714
},
{
"epoch": 0.8108747044917257,
"grad_norm": 2.831871747970581,
"learning_rate": 4.799729111041857e-06,
"loss": 0.5953,
"step": 1715
},
{
"epoch": 0.8113475177304964,
"grad_norm": 2.5611300468444824,
"learning_rate": 4.799484391588647e-06,
"loss": 0.6302,
"step": 1716
},
{
"epoch": 0.8118203309692671,
"grad_norm": 2.744070053100586,
"learning_rate": 4.799239528956625e-06,
"loss": 0.5561,
"step": 1717
},
{
"epoch": 0.8122931442080378,
"grad_norm": 2.7344231605529785,
"learning_rate": 4.798994523161041e-06,
"loss": 0.6317,
"step": 1718
},
{
"epoch": 0.8127659574468085,
"grad_norm": 2.3420889377593994,
"learning_rate": 4.798749374217149e-06,
"loss": 0.5415,
"step": 1719
},
{
"epoch": 0.8132387706855791,
"grad_norm": 2.57384991645813,
"learning_rate": 4.798504082140212e-06,
"loss": 0.6383,
"step": 1720
},
{
"epoch": 0.8137115839243498,
"grad_norm": 2.8819844722747803,
"learning_rate": 4.798258646945505e-06,
"loss": 0.6355,
"step": 1721
},
{
"epoch": 0.8141843971631205,
"grad_norm": 2.908123254776001,
"learning_rate": 4.79801306864831e-06,
"loss": 0.701,
"step": 1722
},
{
"epoch": 0.8146572104018912,
"grad_norm": 2.6500701904296875,
"learning_rate": 4.797767347263917e-06,
"loss": 0.6152,
"step": 1723
},
{
"epoch": 0.8151300236406619,
"grad_norm": 2.5513017177581787,
"learning_rate": 4.797521482807628e-06,
"loss": 0.6241,
"step": 1724
},
{
"epoch": 0.8156028368794326,
"grad_norm": 2.6239185333251953,
"learning_rate": 4.7972754752947495e-06,
"loss": 0.6072,
"step": 1725
},
{
"epoch": 0.8160756501182033,
"grad_norm": 2.673436403274536,
"learning_rate": 4.797029324740601e-06,
"loss": 0.5802,
"step": 1726
},
{
"epoch": 0.816548463356974,
"grad_norm": 2.533831834793091,
"learning_rate": 4.796783031160508e-06,
"loss": 0.5566,
"step": 1727
},
{
"epoch": 0.8170212765957446,
"grad_norm": 2.9806582927703857,
"learning_rate": 4.796536594569807e-06,
"loss": 0.6945,
"step": 1728
},
{
"epoch": 0.8174940898345153,
"grad_norm": 2.7093560695648193,
"learning_rate": 4.796290014983842e-06,
"loss": 0.7143,
"step": 1729
},
{
"epoch": 0.817966903073286,
"grad_norm": 2.814507246017456,
"learning_rate": 4.796043292417967e-06,
"loss": 0.6122,
"step": 1730
},
{
"epoch": 0.8184397163120567,
"grad_norm": 2.537156820297241,
"learning_rate": 4.795796426887543e-06,
"loss": 0.6229,
"step": 1731
},
{
"epoch": 0.8189125295508274,
"grad_norm": 2.4878013134002686,
"learning_rate": 4.795549418407944e-06,
"loss": 0.5442,
"step": 1732
},
{
"epoch": 0.8193853427895981,
"grad_norm": 2.839383363723755,
"learning_rate": 4.795302266994548e-06,
"loss": 0.6717,
"step": 1733
},
{
"epoch": 0.8198581560283688,
"grad_norm": 3.1981801986694336,
"learning_rate": 4.795054972662744e-06,
"loss": 0.6596,
"step": 1734
},
{
"epoch": 0.8203309692671394,
"grad_norm": 2.781730890274048,
"learning_rate": 4.79480753542793e-06,
"loss": 0.5845,
"step": 1735
},
{
"epoch": 0.8208037825059101,
"grad_norm": 2.689948558807373,
"learning_rate": 4.794559955305513e-06,
"loss": 0.5928,
"step": 1736
},
{
"epoch": 0.8212765957446808,
"grad_norm": 2.7267637252807617,
"learning_rate": 4.7943122323109105e-06,
"loss": 0.5224,
"step": 1737
},
{
"epoch": 0.8217494089834515,
"grad_norm": 2.4346601963043213,
"learning_rate": 4.794064366459544e-06,
"loss": 0.6431,
"step": 1738
},
{
"epoch": 0.8222222222222222,
"grad_norm": 2.7440176010131836,
"learning_rate": 4.793816357766849e-06,
"loss": 0.6083,
"step": 1739
},
{
"epoch": 0.8226950354609929,
"grad_norm": 2.6558027267456055,
"learning_rate": 4.793568206248268e-06,
"loss": 0.698,
"step": 1740
},
{
"epoch": 0.8231678486997636,
"grad_norm": 2.591658353805542,
"learning_rate": 4.793319911919251e-06,
"loss": 0.6601,
"step": 1741
},
{
"epoch": 0.8236406619385342,
"grad_norm": 2.5431172847747803,
"learning_rate": 4.79307147479526e-06,
"loss": 0.5917,
"step": 1742
},
{
"epoch": 0.8241134751773049,
"grad_norm": 2.7335588932037354,
"learning_rate": 4.792822894891762e-06,
"loss": 0.5925,
"step": 1743
},
{
"epoch": 0.8245862884160756,
"grad_norm": 2.2500839233398438,
"learning_rate": 4.792574172224237e-06,
"loss": 0.4984,
"step": 1744
},
{
"epoch": 0.8250591016548463,
"grad_norm": 2.691343069076538,
"learning_rate": 4.79232530680817e-06,
"loss": 0.6262,
"step": 1745
},
{
"epoch": 0.825531914893617,
"grad_norm": 2.612204074859619,
"learning_rate": 4.792076298659058e-06,
"loss": 0.5822,
"step": 1746
},
{
"epoch": 0.8260047281323877,
"grad_norm": 3.0163519382476807,
"learning_rate": 4.791827147792406e-06,
"loss": 0.6263,
"step": 1747
},
{
"epoch": 0.8264775413711584,
"grad_norm": 2.742183208465576,
"learning_rate": 4.791577854223727e-06,
"loss": 0.6628,
"step": 1748
},
{
"epoch": 0.826950354609929,
"grad_norm": 2.872213840484619,
"learning_rate": 4.791328417968542e-06,
"loss": 0.6332,
"step": 1749
},
{
"epoch": 0.8274231678486997,
"grad_norm": 2.725006580352783,
"learning_rate": 4.7910788390423844e-06,
"loss": 0.6266,
"step": 1750
},
{
"epoch": 0.8278959810874704,
"grad_norm": 3.0366697311401367,
"learning_rate": 4.790829117460793e-06,
"loss": 0.6403,
"step": 1751
},
{
"epoch": 0.8283687943262411,
"grad_norm": 2.594881772994995,
"learning_rate": 4.790579253239318e-06,
"loss": 0.521,
"step": 1752
},
{
"epoch": 0.8288416075650118,
"grad_norm": 2.4496347904205322,
"learning_rate": 4.790329246393517e-06,
"loss": 0.54,
"step": 1753
},
{
"epoch": 0.8293144208037825,
"grad_norm": 3.102278470993042,
"learning_rate": 4.790079096938956e-06,
"loss": 0.6142,
"step": 1754
},
{
"epoch": 0.8297872340425532,
"grad_norm": 2.4645912647247314,
"learning_rate": 4.789828804891212e-06,
"loss": 0.5212,
"step": 1755
},
{
"epoch": 0.8302600472813239,
"grad_norm": 2.7482516765594482,
"learning_rate": 4.789578370265868e-06,
"loss": 0.6712,
"step": 1756
},
{
"epoch": 0.8307328605200945,
"grad_norm": 2.61360502243042,
"learning_rate": 4.7893277930785195e-06,
"loss": 0.6367,
"step": 1757
},
{
"epoch": 0.8312056737588652,
"grad_norm": 2.79028058052063,
"learning_rate": 4.789077073344767e-06,
"loss": 0.5099,
"step": 1758
},
{
"epoch": 0.8316784869976359,
"grad_norm": 2.647662401199341,
"learning_rate": 4.788826211080222e-06,
"loss": 0.6698,
"step": 1759
},
{
"epoch": 0.8321513002364066,
"grad_norm": 3.0214831829071045,
"learning_rate": 4.7885752063005055e-06,
"loss": 0.6121,
"step": 1760
},
{
"epoch": 0.8326241134751773,
"grad_norm": 2.8244032859802246,
"learning_rate": 4.788324059021247e-06,
"loss": 0.6921,
"step": 1761
},
{
"epoch": 0.833096926713948,
"grad_norm": 3.1501076221466064,
"learning_rate": 4.788072769258082e-06,
"loss": 0.6872,
"step": 1762
},
{
"epoch": 0.8335697399527187,
"grad_norm": 2.6989903450012207,
"learning_rate": 4.7878213370266594e-06,
"loss": 0.5884,
"step": 1763
},
{
"epoch": 0.8340425531914893,
"grad_norm": 2.6982665061950684,
"learning_rate": 4.787569762342633e-06,
"loss": 0.6112,
"step": 1764
},
{
"epoch": 0.83451536643026,
"grad_norm": 2.6918323040008545,
"learning_rate": 4.7873180452216685e-06,
"loss": 0.5315,
"step": 1765
},
{
"epoch": 0.8349881796690307,
"grad_norm": 2.5494401454925537,
"learning_rate": 4.78706618567944e-06,
"loss": 0.5909,
"step": 1766
},
{
"epoch": 0.8354609929078014,
"grad_norm": 2.7532095909118652,
"learning_rate": 4.786814183731627e-06,
"loss": 0.5566,
"step": 1767
},
{
"epoch": 0.8359338061465721,
"grad_norm": 2.550865888595581,
"learning_rate": 4.786562039393923e-06,
"loss": 0.555,
"step": 1768
},
{
"epoch": 0.8364066193853428,
"grad_norm": 2.4477791786193848,
"learning_rate": 4.786309752682028e-06,
"loss": 0.5844,
"step": 1769
},
{
"epoch": 0.8368794326241135,
"grad_norm": 2.6982262134552,
"learning_rate": 4.7860573236116485e-06,
"loss": 0.6136,
"step": 1770
},
{
"epoch": 0.8373522458628841,
"grad_norm": 2.456263542175293,
"learning_rate": 4.785804752198503e-06,
"loss": 0.5055,
"step": 1771
},
{
"epoch": 0.8378250591016548,
"grad_norm": 2.428544521331787,
"learning_rate": 4.78555203845832e-06,
"loss": 0.5859,
"step": 1772
},
{
"epoch": 0.8382978723404255,
"grad_norm": 2.1782307624816895,
"learning_rate": 4.785299182406833e-06,
"loss": 0.5325,
"step": 1773
},
{
"epoch": 0.8387706855791962,
"grad_norm": 3.137956142425537,
"learning_rate": 4.785046184059786e-06,
"loss": 0.6097,
"step": 1774
},
{
"epoch": 0.8392434988179669,
"grad_norm": 2.6269001960754395,
"learning_rate": 4.7847930434329336e-06,
"loss": 0.5972,
"step": 1775
},
{
"epoch": 0.8397163120567376,
"grad_norm": 2.732659339904785,
"learning_rate": 4.784539760542037e-06,
"loss": 0.6054,
"step": 1776
},
{
"epoch": 0.8401891252955083,
"grad_norm": 2.5346736907958984,
"learning_rate": 4.784286335402866e-06,
"loss": 0.5521,
"step": 1777
},
{
"epoch": 0.840661938534279,
"grad_norm": 3.1420228481292725,
"learning_rate": 4.784032768031202e-06,
"loss": 0.6165,
"step": 1778
},
{
"epoch": 0.8411347517730496,
"grad_norm": 3.073793411254883,
"learning_rate": 4.783779058442831e-06,
"loss": 0.6414,
"step": 1779
},
{
"epoch": 0.8416075650118203,
"grad_norm": 2.6621336936950684,
"learning_rate": 4.783525206653554e-06,
"loss": 0.5836,
"step": 1780
},
{
"epoch": 0.842080378250591,
"grad_norm": 2.7029049396514893,
"learning_rate": 4.7832712126791745e-06,
"loss": 0.5897,
"step": 1781
},
{
"epoch": 0.8425531914893617,
"grad_norm": 2.4733822345733643,
"learning_rate": 4.783017076535509e-06,
"loss": 0.5913,
"step": 1782
},
{
"epoch": 0.8430260047281324,
"grad_norm": 2.8119473457336426,
"learning_rate": 4.782762798238381e-06,
"loss": 0.6105,
"step": 1783
},
{
"epoch": 0.8434988179669031,
"grad_norm": 2.5290818214416504,
"learning_rate": 4.782508377803622e-06,
"loss": 0.6119,
"step": 1784
},
{
"epoch": 0.8439716312056738,
"grad_norm": 3.193472385406494,
"learning_rate": 4.782253815247076e-06,
"loss": 0.6665,
"step": 1785
},
{
"epoch": 0.8444444444444444,
"grad_norm": 3.206759452819824,
"learning_rate": 4.781999110584592e-06,
"loss": 0.6012,
"step": 1786
},
{
"epoch": 0.8449172576832151,
"grad_norm": 2.6227457523345947,
"learning_rate": 4.781744263832029e-06,
"loss": 0.5845,
"step": 1787
},
{
"epoch": 0.8453900709219858,
"grad_norm": 2.838365316390991,
"learning_rate": 4.781489275005257e-06,
"loss": 0.5695,
"step": 1788
},
{
"epoch": 0.8458628841607565,
"grad_norm": 2.8348326683044434,
"learning_rate": 4.78123414412015e-06,
"loss": 0.6136,
"step": 1789
},
{
"epoch": 0.8463356973995272,
"grad_norm": 2.5698344707489014,
"learning_rate": 4.780978871192597e-06,
"loss": 0.6576,
"step": 1790
},
{
"epoch": 0.8468085106382979,
"grad_norm": 2.5198330879211426,
"learning_rate": 4.780723456238492e-06,
"loss": 0.5521,
"step": 1791
},
{
"epoch": 0.8472813238770686,
"grad_norm": 3.001325845718384,
"learning_rate": 4.780467899273737e-06,
"loss": 0.6075,
"step": 1792
},
{
"epoch": 0.8477541371158392,
"grad_norm": 2.7732746601104736,
"learning_rate": 4.780212200314247e-06,
"loss": 0.6245,
"step": 1793
},
{
"epoch": 0.8482269503546099,
"grad_norm": 2.6950337886810303,
"learning_rate": 4.77995635937594e-06,
"loss": 0.5723,
"step": 1794
},
{
"epoch": 0.8486997635933806,
"grad_norm": 2.82051420211792,
"learning_rate": 4.779700376474749e-06,
"loss": 0.6184,
"step": 1795
},
{
"epoch": 0.8491725768321513,
"grad_norm": 2.757791757583618,
"learning_rate": 4.779444251626611e-06,
"loss": 0.608,
"step": 1796
},
{
"epoch": 0.849645390070922,
"grad_norm": 2.394108533859253,
"learning_rate": 4.779187984847475e-06,
"loss": 0.6174,
"step": 1797
},
{
"epoch": 0.8501182033096927,
"grad_norm": 2.427562713623047,
"learning_rate": 4.778931576153296e-06,
"loss": 0.5618,
"step": 1798
},
{
"epoch": 0.8505910165484634,
"grad_norm": 2.891268491744995,
"learning_rate": 4.778675025560042e-06,
"loss": 0.6865,
"step": 1799
},
{
"epoch": 0.851063829787234,
"grad_norm": 2.665534257888794,
"learning_rate": 4.778418333083685e-06,
"loss": 0.5852,
"step": 1800
},
{
"epoch": 0.8515366430260047,
"grad_norm": 2.5492889881134033,
"learning_rate": 4.7781614987402095e-06,
"loss": 0.5161,
"step": 1801
},
{
"epoch": 0.8520094562647754,
"grad_norm": 2.400177001953125,
"learning_rate": 4.777904522545607e-06,
"loss": 0.5128,
"step": 1802
},
{
"epoch": 0.8524822695035461,
"grad_norm": 2.3949809074401855,
"learning_rate": 4.777647404515878e-06,
"loss": 0.571,
"step": 1803
},
{
"epoch": 0.8529550827423168,
"grad_norm": 2.3624472618103027,
"learning_rate": 4.7773901446670325e-06,
"loss": 0.5486,
"step": 1804
},
{
"epoch": 0.8534278959810875,
"grad_norm": 2.711366891860962,
"learning_rate": 4.7771327430150885e-06,
"loss": 0.5667,
"step": 1805
},
{
"epoch": 0.8539007092198582,
"grad_norm": 2.7681493759155273,
"learning_rate": 4.776875199576073e-06,
"loss": 0.5686,
"step": 1806
},
{
"epoch": 0.8543735224586289,
"grad_norm": 3.0369436740875244,
"learning_rate": 4.776617514366023e-06,
"loss": 0.6635,
"step": 1807
},
{
"epoch": 0.8548463356973995,
"grad_norm": 2.919649600982666,
"learning_rate": 4.776359687400983e-06,
"loss": 0.5749,
"step": 1808
},
{
"epoch": 0.8553191489361702,
"grad_norm": 2.7986185550689697,
"learning_rate": 4.776101718697007e-06,
"loss": 0.559,
"step": 1809
},
{
"epoch": 0.8557919621749409,
"grad_norm": 2.5951223373413086,
"learning_rate": 4.775843608270158e-06,
"loss": 0.5654,
"step": 1810
},
{
"epoch": 0.8562647754137116,
"grad_norm": 2.674138069152832,
"learning_rate": 4.775585356136505e-06,
"loss": 0.5286,
"step": 1811
},
{
"epoch": 0.8567375886524823,
"grad_norm": 3.045437812805176,
"learning_rate": 4.775326962312131e-06,
"loss": 0.6185,
"step": 1812
},
{
"epoch": 0.857210401891253,
"grad_norm": 2.6145293712615967,
"learning_rate": 4.775068426813124e-06,
"loss": 0.6075,
"step": 1813
},
{
"epoch": 0.8576832151300237,
"grad_norm": 2.6320106983184814,
"learning_rate": 4.7748097496555824e-06,
"loss": 0.561,
"step": 1814
},
{
"epoch": 0.8581560283687943,
"grad_norm": 2.5038623809814453,
"learning_rate": 4.774550930855612e-06,
"loss": 0.593,
"step": 1815
},
{
"epoch": 0.858628841607565,
"grad_norm": 2.8168089389801025,
"learning_rate": 4.774291970429329e-06,
"loss": 0.5196,
"step": 1816
},
{
"epoch": 0.8591016548463357,
"grad_norm": 2.778130292892456,
"learning_rate": 4.774032868392858e-06,
"loss": 0.5984,
"step": 1817
},
{
"epoch": 0.8595744680851064,
"grad_norm": 2.536458730697632,
"learning_rate": 4.7737736247623305e-06,
"loss": 0.568,
"step": 1818
},
{
"epoch": 0.8600472813238771,
"grad_norm": 2.6669719219207764,
"learning_rate": 4.77351423955389e-06,
"loss": 0.6233,
"step": 1819
},
{
"epoch": 0.8605200945626478,
"grad_norm": 2.578242540359497,
"learning_rate": 4.773254712783687e-06,
"loss": 0.579,
"step": 1820
},
{
"epoch": 0.8609929078014185,
"grad_norm": 2.816664457321167,
"learning_rate": 4.772995044467881e-06,
"loss": 0.6635,
"step": 1821
},
{
"epoch": 0.8614657210401891,
"grad_norm": 3.1111979484558105,
"learning_rate": 4.77273523462264e-06,
"loss": 0.6372,
"step": 1822
},
{
"epoch": 0.8619385342789598,
"grad_norm": 2.764552354812622,
"learning_rate": 4.772475283264142e-06,
"loss": 0.6216,
"step": 1823
},
{
"epoch": 0.8624113475177305,
"grad_norm": 2.9126830101013184,
"learning_rate": 4.772215190408572e-06,
"loss": 0.6396,
"step": 1824
},
{
"epoch": 0.8628841607565012,
"grad_norm": 2.7502307891845703,
"learning_rate": 4.7719549560721264e-06,
"loss": 0.6186,
"step": 1825
},
{
"epoch": 0.8633569739952719,
"grad_norm": 2.6279006004333496,
"learning_rate": 4.771694580271007e-06,
"loss": 0.5557,
"step": 1826
},
{
"epoch": 0.8638297872340426,
"grad_norm": 2.996563196182251,
"learning_rate": 4.7714340630214276e-06,
"loss": 0.6259,
"step": 1827
},
{
"epoch": 0.8643026004728133,
"grad_norm": 3.231323480606079,
"learning_rate": 4.771173404339609e-06,
"loss": 0.5473,
"step": 1828
},
{
"epoch": 0.864775413711584,
"grad_norm": 3.143519878387451,
"learning_rate": 4.770912604241781e-06,
"loss": 0.593,
"step": 1829
},
{
"epoch": 0.8652482269503546,
"grad_norm": 2.515484094619751,
"learning_rate": 4.770651662744184e-06,
"loss": 0.538,
"step": 1830
},
{
"epoch": 0.8657210401891253,
"grad_norm": 2.629058837890625,
"learning_rate": 4.770390579863064e-06,
"loss": 0.5745,
"step": 1831
},
{
"epoch": 0.866193853427896,
"grad_norm": 2.5826802253723145,
"learning_rate": 4.770129355614677e-06,
"loss": 0.6397,
"step": 1832
},
{
"epoch": 0.8666666666666667,
"grad_norm": 2.954623222351074,
"learning_rate": 4.769867990015289e-06,
"loss": 0.6106,
"step": 1833
},
{
"epoch": 0.8671394799054374,
"grad_norm": 2.742192268371582,
"learning_rate": 4.769606483081175e-06,
"loss": 0.6902,
"step": 1834
},
{
"epoch": 0.8676122931442081,
"grad_norm": 2.2619097232818604,
"learning_rate": 4.769344834828618e-06,
"loss": 0.5414,
"step": 1835
},
{
"epoch": 0.8680851063829788,
"grad_norm": 2.7384188175201416,
"learning_rate": 4.769083045273908e-06,
"loss": 0.5787,
"step": 1836
},
{
"epoch": 0.8685579196217494,
"grad_norm": 2.6734485626220703,
"learning_rate": 4.768821114433346e-06,
"loss": 0.5923,
"step": 1837
},
{
"epoch": 0.8690307328605201,
"grad_norm": 2.286140203475952,
"learning_rate": 4.768559042323243e-06,
"loss": 0.5822,
"step": 1838
},
{
"epoch": 0.8695035460992908,
"grad_norm": 3.0243725776672363,
"learning_rate": 4.768296828959915e-06,
"loss": 0.6623,
"step": 1839
},
{
"epoch": 0.8699763593380615,
"grad_norm": 2.4026312828063965,
"learning_rate": 4.768034474359689e-06,
"loss": 0.5554,
"step": 1840
},
{
"epoch": 0.8704491725768322,
"grad_norm": 2.7469029426574707,
"learning_rate": 4.767771978538903e-06,
"loss": 0.6316,
"step": 1841
},
{
"epoch": 0.8709219858156029,
"grad_norm": 2.729659080505371,
"learning_rate": 4.767509341513899e-06,
"loss": 0.5807,
"step": 1842
},
{
"epoch": 0.8713947990543736,
"grad_norm": 2.5336945056915283,
"learning_rate": 4.76724656330103e-06,
"loss": 0.6109,
"step": 1843
},
{
"epoch": 0.8718676122931442,
"grad_norm": 2.519880533218384,
"learning_rate": 4.76698364391666e-06,
"loss": 0.5313,
"step": 1844
},
{
"epoch": 0.8723404255319149,
"grad_norm": 2.698862075805664,
"learning_rate": 4.766720583377159e-06,
"loss": 0.5953,
"step": 1845
},
{
"epoch": 0.8728132387706856,
"grad_norm": 3.0195560455322266,
"learning_rate": 4.766457381698907e-06,
"loss": 0.5965,
"step": 1846
},
{
"epoch": 0.8732860520094563,
"grad_norm": 2.5972697734832764,
"learning_rate": 4.766194038898291e-06,
"loss": 0.6014,
"step": 1847
},
{
"epoch": 0.873758865248227,
"grad_norm": 2.7132294178009033,
"learning_rate": 4.76593055499171e-06,
"loss": 0.5638,
"step": 1848
},
{
"epoch": 0.8742316784869977,
"grad_norm": 2.7134575843811035,
"learning_rate": 4.765666929995568e-06,
"loss": 0.52,
"step": 1849
},
{
"epoch": 0.8747044917257684,
"grad_norm": 2.3804993629455566,
"learning_rate": 4.765403163926282e-06,
"loss": 0.5435,
"step": 1850
},
{
"epoch": 0.875177304964539,
"grad_norm": 2.8782761096954346,
"learning_rate": 4.765139256800274e-06,
"loss": 0.5843,
"step": 1851
},
{
"epoch": 0.8756501182033097,
"grad_norm": 2.836209774017334,
"learning_rate": 4.764875208633977e-06,
"loss": 0.6667,
"step": 1852
},
{
"epoch": 0.8761229314420804,
"grad_norm": 2.608851194381714,
"learning_rate": 4.764611019443831e-06,
"loss": 0.5436,
"step": 1853
},
{
"epoch": 0.8765957446808511,
"grad_norm": 2.788738965988159,
"learning_rate": 4.764346689246288e-06,
"loss": 0.7331,
"step": 1854
},
{
"epoch": 0.8770685579196218,
"grad_norm": 2.524277687072754,
"learning_rate": 4.764082218057805e-06,
"loss": 0.5067,
"step": 1855
},
{
"epoch": 0.8775413711583925,
"grad_norm": 3.7559316158294678,
"learning_rate": 4.763817605894851e-06,
"loss": 0.6809,
"step": 1856
},
{
"epoch": 0.8780141843971632,
"grad_norm": 2.9070613384246826,
"learning_rate": 4.763552852773899e-06,
"loss": 0.5913,
"step": 1857
},
{
"epoch": 0.8784869976359339,
"grad_norm": 2.7050609588623047,
"learning_rate": 4.7632879587114386e-06,
"loss": 0.6074,
"step": 1858
},
{
"epoch": 0.8789598108747045,
"grad_norm": 2.891134262084961,
"learning_rate": 4.76302292372396e-06,
"loss": 0.5939,
"step": 1859
},
{
"epoch": 0.8794326241134752,
"grad_norm": 2.8581702709198,
"learning_rate": 4.762757747827968e-06,
"loss": 0.5972,
"step": 1860
},
{
"epoch": 0.8799054373522459,
"grad_norm": 2.8266196250915527,
"learning_rate": 4.762492431039971e-06,
"loss": 0.5993,
"step": 1861
},
{
"epoch": 0.8803782505910166,
"grad_norm": 2.4853954315185547,
"learning_rate": 4.762226973376493e-06,
"loss": 0.6388,
"step": 1862
},
{
"epoch": 0.8808510638297873,
"grad_norm": 3.2212886810302734,
"learning_rate": 4.761961374854059e-06,
"loss": 0.6698,
"step": 1863
},
{
"epoch": 0.881323877068558,
"grad_norm": 3.1254501342773438,
"learning_rate": 4.761695635489211e-06,
"loss": 0.5263,
"step": 1864
},
{
"epoch": 0.8817966903073287,
"grad_norm": 2.6891462802886963,
"learning_rate": 4.761429755298491e-06,
"loss": 0.5359,
"step": 1865
},
{
"epoch": 0.8822695035460993,
"grad_norm": 2.8557538986206055,
"learning_rate": 4.761163734298457e-06,
"loss": 0.5933,
"step": 1866
},
{
"epoch": 0.88274231678487,
"grad_norm": 2.53548264503479,
"learning_rate": 4.7608975725056724e-06,
"loss": 0.6397,
"step": 1867
},
{
"epoch": 0.8832151300236407,
"grad_norm": 3.0237956047058105,
"learning_rate": 4.76063126993671e-06,
"loss": 0.6845,
"step": 1868
},
{
"epoch": 0.8836879432624114,
"grad_norm": 3.222886800765991,
"learning_rate": 4.76036482660815e-06,
"loss": 0.6055,
"step": 1869
},
{
"epoch": 0.8841607565011821,
"grad_norm": 3.1867551803588867,
"learning_rate": 4.760098242536584e-06,
"loss": 0.6592,
"step": 1870
},
{
"epoch": 0.8846335697399527,
"grad_norm": 2.782209873199463,
"learning_rate": 4.7598315177386115e-06,
"loss": 0.5833,
"step": 1871
},
{
"epoch": 0.8851063829787233,
"grad_norm": 2.899871587753296,
"learning_rate": 4.759564652230838e-06,
"loss": 0.6129,
"step": 1872
},
{
"epoch": 0.885579196217494,
"grad_norm": 2.5690579414367676,
"learning_rate": 4.759297646029882e-06,
"loss": 0.5827,
"step": 1873
},
{
"epoch": 0.8860520094562647,
"grad_norm": 2.666130304336548,
"learning_rate": 4.759030499152368e-06,
"loss": 0.5272,
"step": 1874
},
{
"epoch": 0.8865248226950354,
"grad_norm": 2.7030911445617676,
"learning_rate": 4.758763211614932e-06,
"loss": 0.6415,
"step": 1875
},
{
"epoch": 0.8869976359338061,
"grad_norm": 2.717512845993042,
"learning_rate": 4.7584957834342135e-06,
"loss": 0.5827,
"step": 1876
},
{
"epoch": 0.8874704491725768,
"grad_norm": 2.665823459625244,
"learning_rate": 4.758228214626867e-06,
"loss": 0.6209,
"step": 1877
},
{
"epoch": 0.8879432624113475,
"grad_norm": 2.636653184890747,
"learning_rate": 4.75796050520955e-06,
"loss": 0.6413,
"step": 1878
},
{
"epoch": 0.8884160756501182,
"grad_norm": 2.585115671157837,
"learning_rate": 4.7576926551989345e-06,
"loss": 0.5518,
"step": 1879
},
{
"epoch": 0.8888888888888888,
"grad_norm": 2.808526039123535,
"learning_rate": 4.757424664611697e-06,
"loss": 0.5717,
"step": 1880
},
{
"epoch": 0.8893617021276595,
"grad_norm": 3.5957939624786377,
"learning_rate": 4.757156533464524e-06,
"loss": 0.6323,
"step": 1881
},
{
"epoch": 0.8898345153664302,
"grad_norm": 2.5003883838653564,
"learning_rate": 4.756888261774111e-06,
"loss": 0.5937,
"step": 1882
},
{
"epoch": 0.8903073286052009,
"grad_norm": 2.749061346054077,
"learning_rate": 4.756619849557161e-06,
"loss": 0.6642,
"step": 1883
},
{
"epoch": 0.8907801418439716,
"grad_norm": 2.6757891178131104,
"learning_rate": 4.756351296830389e-06,
"loss": 0.5887,
"step": 1884
},
{
"epoch": 0.8912529550827423,
"grad_norm": 2.811925172805786,
"learning_rate": 4.756082603610516e-06,
"loss": 0.6571,
"step": 1885
},
{
"epoch": 0.891725768321513,
"grad_norm": 2.5054616928100586,
"learning_rate": 4.755813769914271e-06,
"loss": 0.6312,
"step": 1886
},
{
"epoch": 0.8921985815602836,
"grad_norm": 2.7518467903137207,
"learning_rate": 4.755544795758395e-06,
"loss": 0.6685,
"step": 1887
},
{
"epoch": 0.8926713947990543,
"grad_norm": 2.7527287006378174,
"learning_rate": 4.755275681159634e-06,
"loss": 0.5886,
"step": 1888
},
{
"epoch": 0.893144208037825,
"grad_norm": 2.6162452697753906,
"learning_rate": 4.755006426134745e-06,
"loss": 0.546,
"step": 1889
},
{
"epoch": 0.8936170212765957,
"grad_norm": 2.4016737937927246,
"learning_rate": 4.754737030700495e-06,
"loss": 0.5726,
"step": 1890
},
{
"epoch": 0.8940898345153664,
"grad_norm": 2.528327703475952,
"learning_rate": 4.754467494873656e-06,
"loss": 0.5682,
"step": 1891
},
{
"epoch": 0.8945626477541371,
"grad_norm": 2.3139286041259766,
"learning_rate": 4.7541978186710115e-06,
"loss": 0.6108,
"step": 1892
},
{
"epoch": 0.8950354609929078,
"grad_norm": 2.7269136905670166,
"learning_rate": 4.753928002109354e-06,
"loss": 0.5875,
"step": 1893
},
{
"epoch": 0.8955082742316784,
"grad_norm": 4.425495147705078,
"learning_rate": 4.753658045205482e-06,
"loss": 0.5572,
"step": 1894
},
{
"epoch": 0.8959810874704491,
"grad_norm": 2.535409927368164,
"learning_rate": 4.753387947976206e-06,
"loss": 0.5868,
"step": 1895
},
{
"epoch": 0.8964539007092198,
"grad_norm": 2.722458600997925,
"learning_rate": 4.753117710438343e-06,
"loss": 0.5935,
"step": 1896
},
{
"epoch": 0.8969267139479905,
"grad_norm": 2.743861436843872,
"learning_rate": 4.75284733260872e-06,
"loss": 0.572,
"step": 1897
},
{
"epoch": 0.8973995271867612,
"grad_norm": 2.60640549659729,
"learning_rate": 4.752576814504173e-06,
"loss": 0.567,
"step": 1898
},
{
"epoch": 0.8978723404255319,
"grad_norm": 2.7486042976379395,
"learning_rate": 4.7523061561415435e-06,
"loss": 0.5768,
"step": 1899
},
{
"epoch": 0.8983451536643026,
"grad_norm": 3.8410251140594482,
"learning_rate": 4.752035357537686e-06,
"loss": 0.6034,
"step": 1900
},
{
"epoch": 0.8988179669030733,
"grad_norm": 3.0935890674591064,
"learning_rate": 4.751764418709462e-06,
"loss": 0.5644,
"step": 1901
},
{
"epoch": 0.8992907801418439,
"grad_norm": 2.7989892959594727,
"learning_rate": 4.751493339673742e-06,
"loss": 0.656,
"step": 1902
},
{
"epoch": 0.8997635933806146,
"grad_norm": 3.6940557956695557,
"learning_rate": 4.751222120447403e-06,
"loss": 0.6632,
"step": 1903
},
{
"epoch": 0.9002364066193853,
"grad_norm": 2.3428797721862793,
"learning_rate": 4.750950761047335e-06,
"loss": 0.4485,
"step": 1904
},
{
"epoch": 0.900709219858156,
"grad_norm": 2.622544050216675,
"learning_rate": 4.750679261490432e-06,
"loss": 0.5857,
"step": 1905
},
{
"epoch": 0.9011820330969267,
"grad_norm": 2.4911322593688965,
"learning_rate": 4.750407621793601e-06,
"loss": 0.5618,
"step": 1906
},
{
"epoch": 0.9016548463356974,
"grad_norm": 2.6434662342071533,
"learning_rate": 4.750135841973755e-06,
"loss": 0.6057,
"step": 1907
},
{
"epoch": 0.902127659574468,
"grad_norm": 3.115443706512451,
"learning_rate": 4.749863922047817e-06,
"loss": 0.6064,
"step": 1908
},
{
"epoch": 0.9026004728132387,
"grad_norm": 2.5671091079711914,
"learning_rate": 4.749591862032718e-06,
"loss": 0.5625,
"step": 1909
},
{
"epoch": 0.9030732860520094,
"grad_norm": 3.2008655071258545,
"learning_rate": 4.749319661945398e-06,
"loss": 0.5547,
"step": 1910
},
{
"epoch": 0.9035460992907801,
"grad_norm": 2.905987024307251,
"learning_rate": 4.749047321802805e-06,
"loss": 0.6033,
"step": 1911
},
{
"epoch": 0.9040189125295508,
"grad_norm": 3.1456053256988525,
"learning_rate": 4.748774841621897e-06,
"loss": 0.5651,
"step": 1912
},
{
"epoch": 0.9044917257683215,
"grad_norm": 2.8116416931152344,
"learning_rate": 4.748502221419641e-06,
"loss": 0.5853,
"step": 1913
},
{
"epoch": 0.9049645390070922,
"grad_norm": 3.123835325241089,
"learning_rate": 4.748229461213011e-06,
"loss": 0.5427,
"step": 1914
},
{
"epoch": 0.9054373522458629,
"grad_norm": 2.4750146865844727,
"learning_rate": 4.747956561018989e-06,
"loss": 0.6517,
"step": 1915
},
{
"epoch": 0.9059101654846335,
"grad_norm": 2.6174299716949463,
"learning_rate": 4.7476835208545705e-06,
"loss": 0.6119,
"step": 1916
},
{
"epoch": 0.9063829787234042,
"grad_norm": 2.7390382289886475,
"learning_rate": 4.747410340736755e-06,
"loss": 0.5664,
"step": 1917
},
{
"epoch": 0.9068557919621749,
"grad_norm": 2.7940444946289062,
"learning_rate": 4.747137020682552e-06,
"loss": 0.5628,
"step": 1918
},
{
"epoch": 0.9073286052009456,
"grad_norm": 2.477365016937256,
"learning_rate": 4.7468635607089795e-06,
"loss": 0.5261,
"step": 1919
},
{
"epoch": 0.9078014184397163,
"grad_norm": 2.7016685009002686,
"learning_rate": 4.746589960833066e-06,
"loss": 0.5576,
"step": 1920
},
{
"epoch": 0.908274231678487,
"grad_norm": 2.8806519508361816,
"learning_rate": 4.746316221071846e-06,
"loss": 0.5925,
"step": 1921
},
{
"epoch": 0.9087470449172577,
"grad_norm": 3.0315234661102295,
"learning_rate": 4.746042341442365e-06,
"loss": 0.6142,
"step": 1922
},
{
"epoch": 0.9092198581560283,
"grad_norm": 4.2446160316467285,
"learning_rate": 4.745768321961676e-06,
"loss": 0.5352,
"step": 1923
},
{
"epoch": 0.909692671394799,
"grad_norm": 2.6517012119293213,
"learning_rate": 4.745494162646841e-06,
"loss": 0.6118,
"step": 1924
},
{
"epoch": 0.9101654846335697,
"grad_norm": 2.774900197982788,
"learning_rate": 4.7452198635149304e-06,
"loss": 0.572,
"step": 1925
},
{
"epoch": 0.9106382978723404,
"grad_norm": 3.0133683681488037,
"learning_rate": 4.744945424583024e-06,
"loss": 0.5897,
"step": 1926
},
{
"epoch": 0.9111111111111111,
"grad_norm": 2.7344839572906494,
"learning_rate": 4.744670845868211e-06,
"loss": 0.6207,
"step": 1927
},
{
"epoch": 0.9115839243498818,
"grad_norm": 2.636578321456909,
"learning_rate": 4.744396127387586e-06,
"loss": 0.6687,
"step": 1928
},
{
"epoch": 0.9120567375886525,
"grad_norm": 2.8663458824157715,
"learning_rate": 4.744121269158255e-06,
"loss": 0.5002,
"step": 1929
},
{
"epoch": 0.9125295508274232,
"grad_norm": 2.661079168319702,
"learning_rate": 4.743846271197333e-06,
"loss": 0.5848,
"step": 1930
},
{
"epoch": 0.9130023640661938,
"grad_norm": 2.881256341934204,
"learning_rate": 4.743571133521943e-06,
"loss": 0.5911,
"step": 1931
},
{
"epoch": 0.9134751773049645,
"grad_norm": 2.5540573596954346,
"learning_rate": 4.743295856149217e-06,
"loss": 0.5647,
"step": 1932
},
{
"epoch": 0.9139479905437352,
"grad_norm": 2.7060387134552,
"learning_rate": 4.743020439096293e-06,
"loss": 0.6267,
"step": 1933
},
{
"epoch": 0.9144208037825059,
"grad_norm": 2.694481372833252,
"learning_rate": 4.742744882380323e-06,
"loss": 0.6283,
"step": 1934
},
{
"epoch": 0.9148936170212766,
"grad_norm": 2.711555242538452,
"learning_rate": 4.7424691860184625e-06,
"loss": 0.5784,
"step": 1935
},
{
"epoch": 0.9153664302600473,
"grad_norm": 2.9077224731445312,
"learning_rate": 4.742193350027879e-06,
"loss": 0.5948,
"step": 1936
},
{
"epoch": 0.915839243498818,
"grad_norm": 2.9824187755584717,
"learning_rate": 4.7419173744257476e-06,
"loss": 0.6115,
"step": 1937
},
{
"epoch": 0.9163120567375886,
"grad_norm": 2.5127830505371094,
"learning_rate": 4.7416412592292515e-06,
"loss": 0.5803,
"step": 1938
},
{
"epoch": 0.9167848699763593,
"grad_norm": 3.1307175159454346,
"learning_rate": 4.741365004455583e-06,
"loss": 0.5657,
"step": 1939
},
{
"epoch": 0.91725768321513,
"grad_norm": 2.8205273151397705,
"learning_rate": 4.741088610121944e-06,
"loss": 0.6145,
"step": 1940
},
{
"epoch": 0.9177304964539007,
"grad_norm": 2.6119720935821533,
"learning_rate": 4.7408120762455444e-06,
"loss": 0.6058,
"step": 1941
},
{
"epoch": 0.9182033096926714,
"grad_norm": 2.421276092529297,
"learning_rate": 4.7405354028436025e-06,
"loss": 0.5973,
"step": 1942
},
{
"epoch": 0.9186761229314421,
"grad_norm": 2.9846808910369873,
"learning_rate": 4.740258589933346e-06,
"loss": 0.6892,
"step": 1943
},
{
"epoch": 0.9191489361702128,
"grad_norm": 2.6899871826171875,
"learning_rate": 4.739981637532009e-06,
"loss": 0.5705,
"step": 1944
},
{
"epoch": 0.9196217494089834,
"grad_norm": 2.8636131286621094,
"learning_rate": 4.739704545656839e-06,
"loss": 0.5775,
"step": 1945
},
{
"epoch": 0.9200945626477541,
"grad_norm": 2.7659449577331543,
"learning_rate": 4.739427314325087e-06,
"loss": 0.5823,
"step": 1946
},
{
"epoch": 0.9205673758865248,
"grad_norm": 4.71295166015625,
"learning_rate": 4.739149943554016e-06,
"loss": 0.5601,
"step": 1947
},
{
"epoch": 0.9210401891252955,
"grad_norm": 2.642636775970459,
"learning_rate": 4.738872433360896e-06,
"loss": 0.5278,
"step": 1948
},
{
"epoch": 0.9215130023640662,
"grad_norm": 2.4658217430114746,
"learning_rate": 4.7385947837630065e-06,
"loss": 0.6392,
"step": 1949
},
{
"epoch": 0.9219858156028369,
"grad_norm": 2.851602792739868,
"learning_rate": 4.738316994777636e-06,
"loss": 0.6164,
"step": 1950
},
{
"epoch": 0.9224586288416076,
"grad_norm": 2.394226551055908,
"learning_rate": 4.738039066422081e-06,
"loss": 0.5556,
"step": 1951
},
{
"epoch": 0.9229314420803783,
"grad_norm": 2.7985100746154785,
"learning_rate": 4.737760998713647e-06,
"loss": 0.5799,
"step": 1952
},
{
"epoch": 0.9234042553191489,
"grad_norm": 2.5974674224853516,
"learning_rate": 4.737482791669648e-06,
"loss": 0.6984,
"step": 1953
},
{
"epoch": 0.9238770685579196,
"grad_norm": 2.707636594772339,
"learning_rate": 4.737204445307406e-06,
"loss": 0.5548,
"step": 1954
},
{
"epoch": 0.9243498817966903,
"grad_norm": 2.7882707118988037,
"learning_rate": 4.736925959644254e-06,
"loss": 0.6026,
"step": 1955
},
{
"epoch": 0.924822695035461,
"grad_norm": 2.474482774734497,
"learning_rate": 4.7366473346975304e-06,
"loss": 0.5832,
"step": 1956
},
{
"epoch": 0.9252955082742317,
"grad_norm": 2.6196324825286865,
"learning_rate": 4.736368570484585e-06,
"loss": 0.5861,
"step": 1957
},
{
"epoch": 0.9257683215130024,
"grad_norm": 2.826864004135132,
"learning_rate": 4.736089667022775e-06,
"loss": 0.6173,
"step": 1958
},
{
"epoch": 0.926241134751773,
"grad_norm": 2.414473056793213,
"learning_rate": 4.735810624329466e-06,
"loss": 0.5753,
"step": 1959
},
{
"epoch": 0.9267139479905437,
"grad_norm": 2.8037970066070557,
"learning_rate": 4.7355314424220335e-06,
"loss": 0.6207,
"step": 1960
},
{
"epoch": 0.9271867612293144,
"grad_norm": 2.645458698272705,
"learning_rate": 4.735252121317861e-06,
"loss": 0.5959,
"step": 1961
},
{
"epoch": 0.9276595744680851,
"grad_norm": 2.7983884811401367,
"learning_rate": 4.734972661034339e-06,
"loss": 0.5696,
"step": 1962
},
{
"epoch": 0.9281323877068558,
"grad_norm": 3.0568997859954834,
"learning_rate": 4.73469306158887e-06,
"loss": 0.6194,
"step": 1963
},
{
"epoch": 0.9286052009456265,
"grad_norm": 2.7205135822296143,
"learning_rate": 4.734413322998863e-06,
"loss": 0.5292,
"step": 1964
},
{
"epoch": 0.9290780141843972,
"grad_norm": 3.3168489933013916,
"learning_rate": 4.734133445281735e-06,
"loss": 0.5654,
"step": 1965
},
{
"epoch": 0.9295508274231679,
"grad_norm": 3.0095653533935547,
"learning_rate": 4.733853428454916e-06,
"loss": 0.6508,
"step": 1966
},
{
"epoch": 0.9300236406619385,
"grad_norm": 2.7726712226867676,
"learning_rate": 4.733573272535838e-06,
"loss": 0.644,
"step": 1967
},
{
"epoch": 0.9304964539007092,
"grad_norm": 2.474397659301758,
"learning_rate": 4.7332929775419456e-06,
"loss": 0.5479,
"step": 1968
},
{
"epoch": 0.9309692671394799,
"grad_norm": 2.4518635272979736,
"learning_rate": 4.733012543490693e-06,
"loss": 0.6,
"step": 1969
},
{
"epoch": 0.9314420803782506,
"grad_norm": 2.9292192459106445,
"learning_rate": 4.73273197039954e-06,
"loss": 0.6647,
"step": 1970
},
{
"epoch": 0.9319148936170213,
"grad_norm": 2.425004720687866,
"learning_rate": 4.732451258285958e-06,
"loss": 0.6338,
"step": 1971
},
{
"epoch": 0.932387706855792,
"grad_norm": 2.904479503631592,
"learning_rate": 4.7321704071674255e-06,
"loss": 0.5923,
"step": 1972
},
{
"epoch": 0.9328605200945627,
"grad_norm": 2.477085590362549,
"learning_rate": 4.731889417061428e-06,
"loss": 0.5984,
"step": 1973
},
{
"epoch": 0.9333333333333333,
"grad_norm": 2.585240364074707,
"learning_rate": 4.731608287985465e-06,
"loss": 0.558,
"step": 1974
},
{
"epoch": 0.933806146572104,
"grad_norm": 2.658714532852173,
"learning_rate": 4.731327019957039e-06,
"loss": 0.5567,
"step": 1975
},
{
"epoch": 0.9342789598108747,
"grad_norm": 2.7593026161193848,
"learning_rate": 4.731045612993662e-06,
"loss": 0.5772,
"step": 1976
},
{
"epoch": 0.9347517730496454,
"grad_norm": 2.4386026859283447,
"learning_rate": 4.7307640671128585e-06,
"loss": 0.6199,
"step": 1977
},
{
"epoch": 0.9352245862884161,
"grad_norm": 2.681910514831543,
"learning_rate": 4.730482382332158e-06,
"loss": 0.5971,
"step": 1978
},
{
"epoch": 0.9356973995271868,
"grad_norm": 3.7593860626220703,
"learning_rate": 4.7302005586691e-06,
"loss": 0.6346,
"step": 1979
},
{
"epoch": 0.9361702127659575,
"grad_norm": 2.5789096355438232,
"learning_rate": 4.729918596141232e-06,
"loss": 0.5684,
"step": 1980
},
{
"epoch": 0.9366430260047282,
"grad_norm": 3.0607335567474365,
"learning_rate": 4.729636494766111e-06,
"loss": 0.6223,
"step": 1981
},
{
"epoch": 0.9371158392434988,
"grad_norm": 2.906643867492676,
"learning_rate": 4.729354254561303e-06,
"loss": 0.6513,
"step": 1982
},
{
"epoch": 0.9375886524822695,
"grad_norm": 3.192430019378662,
"learning_rate": 4.7290718755443795e-06,
"loss": 0.5095,
"step": 1983
},
{
"epoch": 0.9380614657210402,
"grad_norm": 2.661536931991577,
"learning_rate": 4.7287893577329255e-06,
"loss": 0.5525,
"step": 1984
},
{
"epoch": 0.9385342789598109,
"grad_norm": 2.8436734676361084,
"learning_rate": 4.728506701144531e-06,
"loss": 0.6323,
"step": 1985
},
{
"epoch": 0.9390070921985816,
"grad_norm": 2.75544810295105,
"learning_rate": 4.728223905796796e-06,
"loss": 0.6018,
"step": 1986
},
{
"epoch": 0.9394799054373523,
"grad_norm": 3.0652759075164795,
"learning_rate": 4.727940971707329e-06,
"loss": 0.62,
"step": 1987
},
{
"epoch": 0.939952718676123,
"grad_norm": 2.802567720413208,
"learning_rate": 4.727657898893747e-06,
"loss": 0.5809,
"step": 1988
},
{
"epoch": 0.9404255319148936,
"grad_norm": 2.6208512783050537,
"learning_rate": 4.7273746873736745e-06,
"loss": 0.5762,
"step": 1989
},
{
"epoch": 0.9408983451536643,
"grad_norm": 2.5901873111724854,
"learning_rate": 4.727091337164748e-06,
"loss": 0.6111,
"step": 1990
},
{
"epoch": 0.941371158392435,
"grad_norm": 3.002347707748413,
"learning_rate": 4.726807848284609e-06,
"loss": 0.6419,
"step": 1991
},
{
"epoch": 0.9418439716312057,
"grad_norm": 2.522151470184326,
"learning_rate": 4.72652422075091e-06,
"loss": 0.642,
"step": 1992
},
{
"epoch": 0.9423167848699764,
"grad_norm": 2.5571532249450684,
"learning_rate": 4.726240454581311e-06,
"loss": 0.5729,
"step": 1993
},
{
"epoch": 0.9427895981087471,
"grad_norm": 2.7704918384552,
"learning_rate": 4.72595654979348e-06,
"loss": 0.6816,
"step": 1994
},
{
"epoch": 0.9432624113475178,
"grad_norm": 2.517040491104126,
"learning_rate": 4.7256725064050955e-06,
"loss": 0.5782,
"step": 1995
},
{
"epoch": 0.9437352245862884,
"grad_norm": 2.613955020904541,
"learning_rate": 4.725388324433843e-06,
"loss": 0.6291,
"step": 1996
},
{
"epoch": 0.9442080378250591,
"grad_norm": 2.848891258239746,
"learning_rate": 4.725104003897418e-06,
"loss": 0.6544,
"step": 1997
},
{
"epoch": 0.9446808510638298,
"grad_norm": 3.0162429809570312,
"learning_rate": 4.724819544813523e-06,
"loss": 0.6301,
"step": 1998
},
{
"epoch": 0.9451536643026005,
"grad_norm": 2.613614559173584,
"learning_rate": 4.72453494719987e-06,
"loss": 0.5829,
"step": 1999
},
{
"epoch": 0.9456264775413712,
"grad_norm": 2.4838767051696777,
"learning_rate": 4.724250211074182e-06,
"loss": 0.6042,
"step": 2000
},
{
"epoch": 0.9460992907801419,
"grad_norm": 2.526470899581909,
"learning_rate": 4.723965336454185e-06,
"loss": 0.6167,
"step": 2001
},
{
"epoch": 0.9465721040189126,
"grad_norm": 2.504506826400757,
"learning_rate": 4.723680323357618e-06,
"loss": 0.6061,
"step": 2002
},
{
"epoch": 0.9470449172576832,
"grad_norm": 3.0547544956207275,
"learning_rate": 4.723395171802228e-06,
"loss": 0.6619,
"step": 2003
},
{
"epoch": 0.9475177304964539,
"grad_norm": 2.8692407608032227,
"learning_rate": 4.723109881805771e-06,
"loss": 0.5985,
"step": 2004
},
{
"epoch": 0.9479905437352246,
"grad_norm": 2.7929654121398926,
"learning_rate": 4.7228244533860094e-06,
"loss": 0.5869,
"step": 2005
},
{
"epoch": 0.9484633569739953,
"grad_norm": 2.764869451522827,
"learning_rate": 4.7225388865607146e-06,
"loss": 0.6288,
"step": 2006
},
{
"epoch": 0.948936170212766,
"grad_norm": 2.7656404972076416,
"learning_rate": 4.722253181347671e-06,
"loss": 0.5831,
"step": 2007
},
{
"epoch": 0.9494089834515367,
"grad_norm": 2.6698336601257324,
"learning_rate": 4.7219673377646635e-06,
"loss": 0.6087,
"step": 2008
},
{
"epoch": 0.9498817966903074,
"grad_norm": 2.524935722351074,
"learning_rate": 4.7216813558294946e-06,
"loss": 0.5675,
"step": 2009
},
{
"epoch": 0.950354609929078,
"grad_norm": 2.5998785495758057,
"learning_rate": 4.721395235559969e-06,
"loss": 0.5667,
"step": 2010
},
{
"epoch": 0.9508274231678487,
"grad_norm": 2.758021354675293,
"learning_rate": 4.721108976973902e-06,
"loss": 0.4931,
"step": 2011
},
{
"epoch": 0.9513002364066194,
"grad_norm": 2.767695903778076,
"learning_rate": 4.72082258008912e-06,
"loss": 0.5778,
"step": 2012
},
{
"epoch": 0.9517730496453901,
"grad_norm": 2.982314348220825,
"learning_rate": 4.720536044923453e-06,
"loss": 0.6096,
"step": 2013
},
{
"epoch": 0.9522458628841608,
"grad_norm": 2.7608799934387207,
"learning_rate": 4.720249371494743e-06,
"loss": 0.6242,
"step": 2014
},
{
"epoch": 0.9527186761229315,
"grad_norm": 2.60054349899292,
"learning_rate": 4.71996255982084e-06,
"loss": 0.6249,
"step": 2015
},
{
"epoch": 0.9531914893617022,
"grad_norm": 2.654355764389038,
"learning_rate": 4.719675609919603e-06,
"loss": 0.6327,
"step": 2016
},
{
"epoch": 0.9536643026004729,
"grad_norm": 2.589404582977295,
"learning_rate": 4.719388521808899e-06,
"loss": 0.6357,
"step": 2017
},
{
"epoch": 0.9541371158392435,
"grad_norm": 2.8016581535339355,
"learning_rate": 4.719101295506603e-06,
"loss": 0.5901,
"step": 2018
},
{
"epoch": 0.9546099290780142,
"grad_norm": 3.1408045291900635,
"learning_rate": 4.7188139310306e-06,
"loss": 0.598,
"step": 2019
},
{
"epoch": 0.9550827423167849,
"grad_norm": 2.7432665824890137,
"learning_rate": 4.718526428398783e-06,
"loss": 0.5508,
"step": 2020
},
{
"epoch": 0.9555555555555556,
"grad_norm": 2.947800874710083,
"learning_rate": 4.718238787629053e-06,
"loss": 0.6439,
"step": 2021
},
{
"epoch": 0.9560283687943263,
"grad_norm": 2.50828218460083,
"learning_rate": 4.71795100873932e-06,
"loss": 0.5441,
"step": 2022
},
{
"epoch": 0.956501182033097,
"grad_norm": 2.8558974266052246,
"learning_rate": 4.717663091747503e-06,
"loss": 0.5416,
"step": 2023
},
{
"epoch": 0.9569739952718677,
"grad_norm": 2.4803316593170166,
"learning_rate": 4.71737503667153e-06,
"loss": 0.5317,
"step": 2024
},
{
"epoch": 0.9574468085106383,
"grad_norm": 4.36754035949707,
"learning_rate": 4.717086843529336e-06,
"loss": 0.5808,
"step": 2025
},
{
"epoch": 0.957919621749409,
"grad_norm": 2.730185031890869,
"learning_rate": 4.7167985123388665e-06,
"loss": 0.5257,
"step": 2026
},
{
"epoch": 0.9583924349881797,
"grad_norm": 2.8136069774627686,
"learning_rate": 4.716510043118074e-06,
"loss": 0.5836,
"step": 2027
},
{
"epoch": 0.9588652482269504,
"grad_norm": 2.793975353240967,
"learning_rate": 4.71622143588492e-06,
"loss": 0.5706,
"step": 2028
},
{
"epoch": 0.9593380614657211,
"grad_norm": 2.3883821964263916,
"learning_rate": 4.7159326906573745e-06,
"loss": 0.5291,
"step": 2029
},
{
"epoch": 0.9598108747044918,
"grad_norm": 2.6135976314544678,
"learning_rate": 4.715643807453417e-06,
"loss": 0.6199,
"step": 2030
},
{
"epoch": 0.9602836879432625,
"grad_norm": 2.6245670318603516,
"learning_rate": 4.715354786291035e-06,
"loss": 0.5585,
"step": 2031
},
{
"epoch": 0.9607565011820332,
"grad_norm": 2.7870967388153076,
"learning_rate": 4.715065627188225e-06,
"loss": 0.6196,
"step": 2032
},
{
"epoch": 0.9612293144208038,
"grad_norm": 2.6983911991119385,
"learning_rate": 4.714776330162991e-06,
"loss": 0.6424,
"step": 2033
},
{
"epoch": 0.9617021276595744,
"grad_norm": 2.3221919536590576,
"learning_rate": 4.7144868952333465e-06,
"loss": 0.568,
"step": 2034
},
{
"epoch": 0.9621749408983451,
"grad_norm": 2.9408178329467773,
"learning_rate": 4.714197322417314e-06,
"loss": 0.6175,
"step": 2035
},
{
"epoch": 0.9626477541371158,
"grad_norm": 2.404057264328003,
"learning_rate": 4.713907611732921e-06,
"loss": 0.4943,
"step": 2036
},
{
"epoch": 0.9631205673758865,
"grad_norm": 3.547607660293579,
"learning_rate": 4.71361776319821e-06,
"loss": 0.5488,
"step": 2037
},
{
"epoch": 0.9635933806146572,
"grad_norm": 2.679614543914795,
"learning_rate": 4.713327776831227e-06,
"loss": 0.6234,
"step": 2038
},
{
"epoch": 0.9640661938534278,
"grad_norm": 2.526914119720459,
"learning_rate": 4.7130376526500286e-06,
"loss": 0.5891,
"step": 2039
},
{
"epoch": 0.9645390070921985,
"grad_norm": 2.6953470706939697,
"learning_rate": 4.71274739067268e-06,
"loss": 0.69,
"step": 2040
},
{
"epoch": 0.9650118203309692,
"grad_norm": 2.546660900115967,
"learning_rate": 4.712456990917254e-06,
"loss": 0.6185,
"step": 2041
},
{
"epoch": 0.9654846335697399,
"grad_norm": 3.3920490741729736,
"learning_rate": 4.712166453401832e-06,
"loss": 0.587,
"step": 2042
},
{
"epoch": 0.9659574468085106,
"grad_norm": 2.5961573123931885,
"learning_rate": 4.711875778144504e-06,
"loss": 0.6105,
"step": 2043
},
{
"epoch": 0.9664302600472813,
"grad_norm": 2.5111498832702637,
"learning_rate": 4.711584965163372e-06,
"loss": 0.5533,
"step": 2044
},
{
"epoch": 0.966903073286052,
"grad_norm": 2.4878132343292236,
"learning_rate": 4.7112940144765405e-06,
"loss": 0.5604,
"step": 2045
},
{
"epoch": 0.9673758865248226,
"grad_norm": 2.5714077949523926,
"learning_rate": 4.711002926102128e-06,
"loss": 0.5794,
"step": 2046
},
{
"epoch": 0.9678486997635933,
"grad_norm": 2.7069091796875,
"learning_rate": 4.710711700058257e-06,
"loss": 0.594,
"step": 2047
},
{
"epoch": 0.968321513002364,
"grad_norm": 2.8104631900787354,
"learning_rate": 4.710420336363063e-06,
"loss": 0.6247,
"step": 2048
},
{
"epoch": 0.9687943262411347,
"grad_norm": 2.8464386463165283,
"learning_rate": 4.7101288350346865e-06,
"loss": 0.6162,
"step": 2049
},
{
"epoch": 0.9692671394799054,
"grad_norm": 2.7187976837158203,
"learning_rate": 4.709837196091279e-06,
"loss": 0.6109,
"step": 2050
},
{
"epoch": 0.9697399527186761,
"grad_norm": 2.556734085083008,
"learning_rate": 4.709545419550999e-06,
"loss": 0.6297,
"step": 2051
},
{
"epoch": 0.9702127659574468,
"grad_norm": 2.937195062637329,
"learning_rate": 4.709253505432014e-06,
"loss": 0.6862,
"step": 2052
},
{
"epoch": 0.9706855791962175,
"grad_norm": 2.792175531387329,
"learning_rate": 4.7089614537525015e-06,
"loss": 0.6105,
"step": 2053
},
{
"epoch": 0.9711583924349881,
"grad_norm": 2.625636100769043,
"learning_rate": 4.708669264530644e-06,
"loss": 0.5849,
"step": 2054
},
{
"epoch": 0.9716312056737588,
"grad_norm": 2.6752610206604004,
"learning_rate": 4.708376937784637e-06,
"loss": 0.5949,
"step": 2055
},
{
"epoch": 0.9721040189125295,
"grad_norm": 2.6072793006896973,
"learning_rate": 4.708084473532681e-06,
"loss": 0.5776,
"step": 2056
},
{
"epoch": 0.9725768321513002,
"grad_norm": 2.728632926940918,
"learning_rate": 4.707791871792988e-06,
"loss": 0.6352,
"step": 2057
},
{
"epoch": 0.9730496453900709,
"grad_norm": 2.5841758251190186,
"learning_rate": 4.707499132583775e-06,
"loss": 0.5488,
"step": 2058
},
{
"epoch": 0.9735224586288416,
"grad_norm": 2.8464293479919434,
"learning_rate": 4.707206255923271e-06,
"loss": 0.7051,
"step": 2059
},
{
"epoch": 0.9739952718676123,
"grad_norm": 2.547297239303589,
"learning_rate": 4.706913241829712e-06,
"loss": 0.5937,
"step": 2060
},
{
"epoch": 0.9744680851063829,
"grad_norm": 2.6572306156158447,
"learning_rate": 4.706620090321341e-06,
"loss": 0.6041,
"step": 2061
},
{
"epoch": 0.9749408983451536,
"grad_norm": 2.3262805938720703,
"learning_rate": 4.706326801416414e-06,
"loss": 0.5144,
"step": 2062
},
{
"epoch": 0.9754137115839243,
"grad_norm": 2.9693965911865234,
"learning_rate": 4.706033375133191e-06,
"loss": 0.551,
"step": 2063
},
{
"epoch": 0.975886524822695,
"grad_norm": 2.5993731021881104,
"learning_rate": 4.7057398114899435e-06,
"loss": 0.6143,
"step": 2064
},
{
"epoch": 0.9763593380614657,
"grad_norm": 2.453336477279663,
"learning_rate": 4.70544611050495e-06,
"loss": 0.6093,
"step": 2065
},
{
"epoch": 0.9768321513002364,
"grad_norm": 2.898629665374756,
"learning_rate": 4.705152272196497e-06,
"loss": 0.6007,
"step": 2066
},
{
"epoch": 0.9773049645390071,
"grad_norm": 2.7990612983703613,
"learning_rate": 4.7048582965828815e-06,
"loss": 0.6687,
"step": 2067
},
{
"epoch": 0.9777777777777777,
"grad_norm": 2.635284423828125,
"learning_rate": 4.704564183682408e-06,
"loss": 0.5564,
"step": 2068
},
{
"epoch": 0.9782505910165484,
"grad_norm": 3.014547109603882,
"learning_rate": 4.704269933513389e-06,
"loss": 0.6084,
"step": 2069
},
{
"epoch": 0.9787234042553191,
"grad_norm": 2.659357786178589,
"learning_rate": 4.703975546094147e-06,
"loss": 0.6031,
"step": 2070
},
{
"epoch": 0.9791962174940898,
"grad_norm": 2.326932668685913,
"learning_rate": 4.703681021443013e-06,
"loss": 0.5859,
"step": 2071
},
{
"epoch": 0.9796690307328605,
"grad_norm": 2.958803653717041,
"learning_rate": 4.7033863595783235e-06,
"loss": 0.5586,
"step": 2072
},
{
"epoch": 0.9801418439716312,
"grad_norm": 2.921386957168579,
"learning_rate": 4.703091560518427e-06,
"loss": 0.6126,
"step": 2073
},
{
"epoch": 0.9806146572104019,
"grad_norm": 2.6500775814056396,
"learning_rate": 4.702796624281679e-06,
"loss": 0.5678,
"step": 2074
},
{
"epoch": 0.9810874704491725,
"grad_norm": 2.7740228176116943,
"learning_rate": 4.702501550886445e-06,
"loss": 0.6067,
"step": 2075
},
{
"epoch": 0.9815602836879432,
"grad_norm": 2.3296213150024414,
"learning_rate": 4.702206340351096e-06,
"loss": 0.5247,
"step": 2076
},
{
"epoch": 0.9820330969267139,
"grad_norm": 2.748300790786743,
"learning_rate": 4.701910992694016e-06,
"loss": 0.5197,
"step": 2077
},
{
"epoch": 0.9825059101654846,
"grad_norm": 2.250985622406006,
"learning_rate": 4.7016155079335926e-06,
"loss": 0.5214,
"step": 2078
},
{
"epoch": 0.9829787234042553,
"grad_norm": 2.389845848083496,
"learning_rate": 4.701319886088226e-06,
"loss": 0.519,
"step": 2079
},
{
"epoch": 0.983451536643026,
"grad_norm": 2.818220853805542,
"learning_rate": 4.701024127176322e-06,
"loss": 0.607,
"step": 2080
},
{
"epoch": 0.9839243498817967,
"grad_norm": 3.4058034420013428,
"learning_rate": 4.700728231216297e-06,
"loss": 0.5711,
"step": 2081
},
{
"epoch": 0.9843971631205674,
"grad_norm": 2.5297787189483643,
"learning_rate": 4.700432198226575e-06,
"loss": 0.5979,
"step": 2082
},
{
"epoch": 0.984869976359338,
"grad_norm": 3.0548105239868164,
"learning_rate": 4.7001360282255885e-06,
"loss": 0.6041,
"step": 2083
},
{
"epoch": 0.9853427895981087,
"grad_norm": 2.8983733654022217,
"learning_rate": 4.699839721231779e-06,
"loss": 0.5926,
"step": 2084
},
{
"epoch": 0.9858156028368794,
"grad_norm": 3.2717764377593994,
"learning_rate": 4.699543277263596e-06,
"loss": 0.6477,
"step": 2085
},
{
"epoch": 0.9862884160756501,
"grad_norm": 3.03729248046875,
"learning_rate": 4.699246696339497e-06,
"loss": 0.6786,
"step": 2086
},
{
"epoch": 0.9867612293144208,
"grad_norm": 2.852301597595215,
"learning_rate": 4.698949978477951e-06,
"loss": 0.6565,
"step": 2087
},
{
"epoch": 0.9872340425531915,
"grad_norm": 2.843485116958618,
"learning_rate": 4.698653123697431e-06,
"loss": 0.6627,
"step": 2088
},
{
"epoch": 0.9877068557919622,
"grad_norm": 2.6315064430236816,
"learning_rate": 4.698356132016423e-06,
"loss": 0.6577,
"step": 2089
},
{
"epoch": 0.9881796690307328,
"grad_norm": 2.7482151985168457,
"learning_rate": 4.698059003453417e-06,
"loss": 0.5514,
"step": 2090
},
{
"epoch": 0.9886524822695035,
"grad_norm": 2.826673746109009,
"learning_rate": 4.6977617380269145e-06,
"loss": 0.565,
"step": 2091
},
{
"epoch": 0.9891252955082742,
"grad_norm": 3.0273752212524414,
"learning_rate": 4.697464335755427e-06,
"loss": 0.6331,
"step": 2092
},
{
"epoch": 0.9895981087470449,
"grad_norm": 2.7551653385162354,
"learning_rate": 4.6971667966574695e-06,
"loss": 0.6486,
"step": 2093
},
{
"epoch": 0.9900709219858156,
"grad_norm": 2.656299114227295,
"learning_rate": 4.696869120751571e-06,
"loss": 0.6562,
"step": 2094
},
{
"epoch": 0.9905437352245863,
"grad_norm": 2.785322904586792,
"learning_rate": 4.696571308056265e-06,
"loss": 0.5892,
"step": 2095
},
{
"epoch": 0.991016548463357,
"grad_norm": 2.9334635734558105,
"learning_rate": 4.696273358590095e-06,
"loss": 0.6346,
"step": 2096
},
{
"epoch": 0.9914893617021276,
"grad_norm": 2.7944300174713135,
"learning_rate": 4.695975272371613e-06,
"loss": 0.5859,
"step": 2097
},
{
"epoch": 0.9919621749408983,
"grad_norm": 2.5416972637176514,
"learning_rate": 4.695677049419381e-06,
"loss": 0.5658,
"step": 2098
},
{
"epoch": 0.992434988179669,
"grad_norm": 2.4056856632232666,
"learning_rate": 4.695378689751966e-06,
"loss": 0.5121,
"step": 2099
},
{
"epoch": 0.9929078014184397,
"grad_norm": 2.614548683166504,
"learning_rate": 4.695080193387948e-06,
"loss": 0.5961,
"step": 2100
},
{
"epoch": 0.9933806146572104,
"grad_norm": 2.8966517448425293,
"learning_rate": 4.69478156034591e-06,
"loss": 0.5985,
"step": 2101
},
{
"epoch": 0.9938534278959811,
"grad_norm": 2.9514098167419434,
"learning_rate": 4.694482790644448e-06,
"loss": 0.5677,
"step": 2102
},
{
"epoch": 0.9943262411347518,
"grad_norm": 2.4326791763305664,
"learning_rate": 4.694183884302165e-06,
"loss": 0.5698,
"step": 2103
},
{
"epoch": 0.9947990543735225,
"grad_norm": 2.9242892265319824,
"learning_rate": 4.6938848413376735e-06,
"loss": 0.6245,
"step": 2104
},
{
"epoch": 0.9952718676122931,
"grad_norm": 2.9134104251861572,
"learning_rate": 4.693585661769593e-06,
"loss": 0.6164,
"step": 2105
},
{
"epoch": 0.9957446808510638,
"grad_norm": 2.472564458847046,
"learning_rate": 4.693286345616551e-06,
"loss": 0.5616,
"step": 2106
},
{
"epoch": 0.9962174940898345,
"grad_norm": 3.2456448078155518,
"learning_rate": 4.692986892897186e-06,
"loss": 0.6977,
"step": 2107
},
{
"epoch": 0.9966903073286052,
"grad_norm": 3.4032769203186035,
"learning_rate": 4.692687303630143e-06,
"loss": 0.643,
"step": 2108
},
{
"epoch": 0.9971631205673759,
"grad_norm": 2.722200870513916,
"learning_rate": 4.692387577834076e-06,
"loss": 0.5873,
"step": 2109
},
{
"epoch": 0.9976359338061466,
"grad_norm": 2.687532663345337,
"learning_rate": 4.692087715527648e-06,
"loss": 0.5423,
"step": 2110
},
{
"epoch": 0.9981087470449173,
"grad_norm": 2.578613042831421,
"learning_rate": 4.6917877167295305e-06,
"loss": 0.5689,
"step": 2111
},
{
"epoch": 0.9985815602836879,
"grad_norm": 3.1806094646453857,
"learning_rate": 4.691487581458402e-06,
"loss": 0.6133,
"step": 2112
},
{
"epoch": 0.9990543735224586,
"grad_norm": 2.4449520111083984,
"learning_rate": 4.691187309732952e-06,
"loss": 0.5841,
"step": 2113
},
{
"epoch": 0.9995271867612293,
"grad_norm": 2.908749580383301,
"learning_rate": 4.690886901571875e-06,
"loss": 0.534,
"step": 2114
},
{
"epoch": 1.0,
"grad_norm": 4.019968032836914,
"learning_rate": 4.6905863569938785e-06,
"loss": 0.596,
"step": 2115
}
],
"logging_steps": 1,
"max_steps": 12690,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 2115,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.341936104473887e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}