{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.985401459854015,
"eval_steps": 500,
"global_step": 912,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004379562043795621,
"grad_norm": 34.64235305786133,
"learning_rate": 5.0000000000000004e-08,
"loss": 2.6583,
"step": 1
},
{
"epoch": 0.008759124087591242,
"grad_norm": 33.89678192138672,
"learning_rate": 1.0000000000000001e-07,
"loss": 2.5074,
"step": 2
},
{
"epoch": 0.013138686131386862,
"grad_norm": 35.2148551940918,
"learning_rate": 1.5000000000000002e-07,
"loss": 2.7094,
"step": 3
},
{
"epoch": 0.017518248175182483,
"grad_norm": 35.11457061767578,
"learning_rate": 2.0000000000000002e-07,
"loss": 2.7266,
"step": 4
},
{
"epoch": 0.021897810218978103,
"grad_norm": 35.70753479003906,
"learning_rate": 2.5000000000000004e-07,
"loss": 2.7442,
"step": 5
},
{
"epoch": 0.026277372262773723,
"grad_norm": 34.34943771362305,
"learning_rate": 3.0000000000000004e-07,
"loss": 2.5578,
"step": 6
},
{
"epoch": 0.030656934306569343,
"grad_norm": 34.31540298461914,
"learning_rate": 3.5000000000000004e-07,
"loss": 2.5893,
"step": 7
},
{
"epoch": 0.035036496350364967,
"grad_norm": 32.545223236083984,
"learning_rate": 4.0000000000000003e-07,
"loss": 2.5039,
"step": 8
},
{
"epoch": 0.03941605839416058,
"grad_norm": 35.70431137084961,
"learning_rate": 4.5000000000000003e-07,
"loss": 2.6719,
"step": 9
},
{
"epoch": 0.043795620437956206,
"grad_norm": 34.14265441894531,
"learning_rate": 5.000000000000001e-07,
"loss": 2.5764,
"step": 10
},
{
"epoch": 0.04817518248175182,
"grad_norm": 32.08097839355469,
"learning_rate": 5.5e-07,
"loss": 2.4564,
"step": 11
},
{
"epoch": 0.052554744525547446,
"grad_norm": 32.66060256958008,
"learning_rate": 6.000000000000001e-07,
"loss": 2.458,
"step": 12
},
{
"epoch": 0.05693430656934306,
"grad_norm": 33.21636962890625,
"learning_rate": 6.5e-07,
"loss": 2.4835,
"step": 13
},
{
"epoch": 0.061313868613138686,
"grad_norm": 33.92257308959961,
"learning_rate": 7.000000000000001e-07,
"loss": 2.4288,
"step": 14
},
{
"epoch": 0.06569343065693431,
"grad_norm": 32.19805145263672,
"learning_rate": 7.5e-07,
"loss": 2.2411,
"step": 15
},
{
"epoch": 0.07007299270072993,
"grad_norm": 32.355220794677734,
"learning_rate": 8.000000000000001e-07,
"loss": 2.1597,
"step": 16
},
{
"epoch": 0.07445255474452554,
"grad_norm": 33.08480453491211,
"learning_rate": 8.500000000000001e-07,
"loss": 2.1377,
"step": 17
},
{
"epoch": 0.07883211678832117,
"grad_norm": 33.459957122802734,
"learning_rate": 9.000000000000001e-07,
"loss": 2.0306,
"step": 18
},
{
"epoch": 0.08321167883211679,
"grad_norm": 32.897315979003906,
"learning_rate": 9.500000000000001e-07,
"loss": 1.8697,
"step": 19
},
{
"epoch": 0.08759124087591241,
"grad_norm": 33.81785202026367,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.8147,
"step": 20
},
{
"epoch": 0.09197080291970802,
"grad_norm": 32.52595520019531,
"learning_rate": 1.0500000000000001e-06,
"loss": 1.6526,
"step": 21
},
{
"epoch": 0.09635036496350365,
"grad_norm": 34.09442138671875,
"learning_rate": 1.1e-06,
"loss": 1.6127,
"step": 22
},
{
"epoch": 0.10072992700729927,
"grad_norm": 30.89822769165039,
"learning_rate": 1.1500000000000002e-06,
"loss": 1.3872,
"step": 23
},
{
"epoch": 0.10510948905109489,
"grad_norm": 29.566524505615234,
"learning_rate": 1.2000000000000002e-06,
"loss": 1.2755,
"step": 24
},
{
"epoch": 0.10948905109489052,
"grad_norm": 28.26628875732422,
"learning_rate": 1.25e-06,
"loss": 1.1409,
"step": 25
},
{
"epoch": 0.11386861313868613,
"grad_norm": 30.7103328704834,
"learning_rate": 1.3e-06,
"loss": 0.966,
"step": 26
},
{
"epoch": 0.11824817518248175,
"grad_norm": 28.975385665893555,
"learning_rate": 1.3500000000000002e-06,
"loss": 0.7579,
"step": 27
},
{
"epoch": 0.12262773722627737,
"grad_norm": 26.821529388427734,
"learning_rate": 1.4000000000000001e-06,
"loss": 0.6013,
"step": 28
},
{
"epoch": 0.12700729927007298,
"grad_norm": 23.804439544677734,
"learning_rate": 1.45e-06,
"loss": 0.4978,
"step": 29
},
{
"epoch": 0.13138686131386862,
"grad_norm": 21.404451370239258,
"learning_rate": 1.5e-06,
"loss": 0.3926,
"step": 30
},
{
"epoch": 0.13576642335766423,
"grad_norm": 17.63161849975586,
"learning_rate": 1.5500000000000002e-06,
"loss": 0.2568,
"step": 31
},
{
"epoch": 0.14014598540145987,
"grad_norm": 10.998854637145996,
"learning_rate": 1.6000000000000001e-06,
"loss": 0.2373,
"step": 32
},
{
"epoch": 0.14452554744525548,
"grad_norm": 6.9544997215271,
"learning_rate": 1.6500000000000003e-06,
"loss": 0.1689,
"step": 33
},
{
"epoch": 0.14890510948905109,
"grad_norm": 5.1013102531433105,
"learning_rate": 1.7000000000000002e-06,
"loss": 0.1471,
"step": 34
},
{
"epoch": 0.15328467153284672,
"grad_norm": 4.501709461212158,
"learning_rate": 1.75e-06,
"loss": 0.132,
"step": 35
},
{
"epoch": 0.15766423357664233,
"grad_norm": 3.198529005050659,
"learning_rate": 1.8000000000000001e-06,
"loss": 0.1065,
"step": 36
},
{
"epoch": 0.16204379562043797,
"grad_norm": 3.2325005531311035,
"learning_rate": 1.85e-06,
"loss": 0.0907,
"step": 37
},
{
"epoch": 0.16642335766423358,
"grad_norm": 1.5125375986099243,
"learning_rate": 1.9000000000000002e-06,
"loss": 0.0782,
"step": 38
},
{
"epoch": 0.1708029197080292,
"grad_norm": 1.9160635471343994,
"learning_rate": 1.9500000000000004e-06,
"loss": 0.0852,
"step": 39
},
{
"epoch": 0.17518248175182483,
"grad_norm": 1.6062333583831787,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.074,
"step": 40
},
{
"epoch": 0.17956204379562044,
"grad_norm": 1.5675855875015259,
"learning_rate": 2.05e-06,
"loss": 0.0704,
"step": 41
},
{
"epoch": 0.18394160583941604,
"grad_norm": 1.440182089805603,
"learning_rate": 2.1000000000000002e-06,
"loss": 0.0674,
"step": 42
},
{
"epoch": 0.18832116788321168,
"grad_norm": 1.1466726064682007,
"learning_rate": 2.15e-06,
"loss": 0.0702,
"step": 43
},
{
"epoch": 0.1927007299270073,
"grad_norm": 1.2195515632629395,
"learning_rate": 2.2e-06,
"loss": 0.0723,
"step": 44
},
{
"epoch": 0.19708029197080293,
"grad_norm": 1.743561029434204,
"learning_rate": 2.25e-06,
"loss": 0.0875,
"step": 45
},
{
"epoch": 0.20145985401459854,
"grad_norm": 0.9764343500137329,
"learning_rate": 2.3000000000000004e-06,
"loss": 0.062,
"step": 46
},
{
"epoch": 0.20583941605839415,
"grad_norm": 0.8891277313232422,
"learning_rate": 2.35e-06,
"loss": 0.0576,
"step": 47
},
{
"epoch": 0.21021897810218979,
"grad_norm": 0.9648666977882385,
"learning_rate": 2.4000000000000003e-06,
"loss": 0.0656,
"step": 48
},
{
"epoch": 0.2145985401459854,
"grad_norm": 0.784566342830658,
"learning_rate": 2.4500000000000003e-06,
"loss": 0.0548,
"step": 49
},
{
"epoch": 0.21897810218978103,
"grad_norm": 0.9402966499328613,
"learning_rate": 2.5e-06,
"loss": 0.0626,
"step": 50
},
{
"epoch": 0.22335766423357664,
"grad_norm": 1.3284685611724854,
"learning_rate": 2.55e-06,
"loss": 0.0632,
"step": 51
},
{
"epoch": 0.22773722627737225,
"grad_norm": 1.0913968086242676,
"learning_rate": 2.6e-06,
"loss": 0.0675,
"step": 52
},
{
"epoch": 0.2321167883211679,
"grad_norm": 1.1069140434265137,
"learning_rate": 2.6500000000000005e-06,
"loss": 0.0541,
"step": 53
},
{
"epoch": 0.2364963503649635,
"grad_norm": 0.8529757857322693,
"learning_rate": 2.7000000000000004e-06,
"loss": 0.0657,
"step": 54
},
{
"epoch": 0.24087591240875914,
"grad_norm": 0.7182446718215942,
"learning_rate": 2.7500000000000004e-06,
"loss": 0.0607,
"step": 55
},
{
"epoch": 0.24525547445255474,
"grad_norm": 1.0538653135299683,
"learning_rate": 2.8000000000000003e-06,
"loss": 0.0556,
"step": 56
},
{
"epoch": 0.24963503649635035,
"grad_norm": 1.2083594799041748,
"learning_rate": 2.85e-06,
"loss": 0.0532,
"step": 57
},
{
"epoch": 0.25401459854014596,
"grad_norm": 0.8183572888374329,
"learning_rate": 2.9e-06,
"loss": 0.0529,
"step": 58
},
{
"epoch": 0.2583941605839416,
"grad_norm": 0.9014842510223389,
"learning_rate": 2.95e-06,
"loss": 0.0601,
"step": 59
},
{
"epoch": 0.26277372262773724,
"grad_norm": 0.9017247557640076,
"learning_rate": 3e-06,
"loss": 0.0584,
"step": 60
},
{
"epoch": 0.2671532846715328,
"grad_norm": 1.1078683137893677,
"learning_rate": 3.05e-06,
"loss": 0.0635,
"step": 61
},
{
"epoch": 0.27153284671532846,
"grad_norm": 1.174526572227478,
"learning_rate": 3.1000000000000004e-06,
"loss": 0.0523,
"step": 62
},
{
"epoch": 0.2759124087591241,
"grad_norm": 0.9296770095825195,
"learning_rate": 3.1500000000000003e-06,
"loss": 0.0588,
"step": 63
},
{
"epoch": 0.28029197080291973,
"grad_norm": 0.8549372553825378,
"learning_rate": 3.2000000000000003e-06,
"loss": 0.0639,
"step": 64
},
{
"epoch": 0.2846715328467153,
"grad_norm": 0.8956279158592224,
"learning_rate": 3.2500000000000002e-06,
"loss": 0.059,
"step": 65
},
{
"epoch": 0.28905109489051095,
"grad_norm": 0.7937710285186768,
"learning_rate": 3.3000000000000006e-06,
"loss": 0.0579,
"step": 66
},
{
"epoch": 0.2934306569343066,
"grad_norm": 0.7786620855331421,
"learning_rate": 3.3500000000000005e-06,
"loss": 0.0586,
"step": 67
},
{
"epoch": 0.29781021897810217,
"grad_norm": 0.7562637329101562,
"learning_rate": 3.4000000000000005e-06,
"loss": 0.046,
"step": 68
},
{
"epoch": 0.3021897810218978,
"grad_norm": 0.8958250880241394,
"learning_rate": 3.45e-06,
"loss": 0.0566,
"step": 69
},
{
"epoch": 0.30656934306569344,
"grad_norm": 0.9434528946876526,
"learning_rate": 3.5e-06,
"loss": 0.0548,
"step": 70
},
{
"epoch": 0.310948905109489,
"grad_norm": 1.0564453601837158,
"learning_rate": 3.5500000000000003e-06,
"loss": 0.0529,
"step": 71
},
{
"epoch": 0.31532846715328466,
"grad_norm": 0.896443247795105,
"learning_rate": 3.6000000000000003e-06,
"loss": 0.0517,
"step": 72
},
{
"epoch": 0.3197080291970803,
"grad_norm": 1.1364223957061768,
"learning_rate": 3.65e-06,
"loss": 0.0489,
"step": 73
},
{
"epoch": 0.32408759124087594,
"grad_norm": 1.1319010257720947,
"learning_rate": 3.7e-06,
"loss": 0.0548,
"step": 74
},
{
"epoch": 0.3284671532846715,
"grad_norm": 0.9694503545761108,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0525,
"step": 75
},
{
"epoch": 0.33284671532846716,
"grad_norm": 0.8128111958503723,
"learning_rate": 3.8000000000000005e-06,
"loss": 0.0566,
"step": 76
},
{
"epoch": 0.3372262773722628,
"grad_norm": 0.9068273901939392,
"learning_rate": 3.85e-06,
"loss": 0.0475,
"step": 77
},
{
"epoch": 0.3416058394160584,
"grad_norm": 0.9689438343048096,
"learning_rate": 3.900000000000001e-06,
"loss": 0.048,
"step": 78
},
{
"epoch": 0.345985401459854,
"grad_norm": 0.940131664276123,
"learning_rate": 3.95e-06,
"loss": 0.0567,
"step": 79
},
{
"epoch": 0.35036496350364965,
"grad_norm": 0.8836082220077515,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0542,
"step": 80
},
{
"epoch": 0.35474452554744523,
"grad_norm": 0.9325949549674988,
"learning_rate": 4.05e-06,
"loss": 0.0551,
"step": 81
},
{
"epoch": 0.35912408759124087,
"grad_norm": 0.8954764008522034,
"learning_rate": 4.1e-06,
"loss": 0.0517,
"step": 82
},
{
"epoch": 0.3635036496350365,
"grad_norm": 0.6444959044456482,
"learning_rate": 4.15e-06,
"loss": 0.0434,
"step": 83
},
{
"epoch": 0.3678832116788321,
"grad_norm": 0.9097581505775452,
"learning_rate": 4.2000000000000004e-06,
"loss": 0.0471,
"step": 84
},
{
"epoch": 0.3722627737226277,
"grad_norm": 0.849006712436676,
"learning_rate": 4.25e-06,
"loss": 0.0529,
"step": 85
},
{
"epoch": 0.37664233576642336,
"grad_norm": 0.8611392378807068,
"learning_rate": 4.3e-06,
"loss": 0.0513,
"step": 86
},
{
"epoch": 0.381021897810219,
"grad_norm": 0.7885357737541199,
"learning_rate": 4.350000000000001e-06,
"loss": 0.0523,
"step": 87
},
{
"epoch": 0.3854014598540146,
"grad_norm": 0.7642116546630859,
"learning_rate": 4.4e-06,
"loss": 0.0407,
"step": 88
},
{
"epoch": 0.3897810218978102,
"grad_norm": 0.8920945525169373,
"learning_rate": 4.450000000000001e-06,
"loss": 0.0485,
"step": 89
},
{
"epoch": 0.39416058394160586,
"grad_norm": 0.9801046848297119,
"learning_rate": 4.5e-06,
"loss": 0.0404,
"step": 90
},
{
"epoch": 0.39854014598540144,
"grad_norm": 1.0874953269958496,
"learning_rate": 4.5500000000000005e-06,
"loss": 0.0588,
"step": 91
},
{
"epoch": 0.4029197080291971,
"grad_norm": 0.9019029140472412,
"learning_rate": 4.600000000000001e-06,
"loss": 0.0466,
"step": 92
},
{
"epoch": 0.4072992700729927,
"grad_norm": 0.7258988618850708,
"learning_rate": 4.65e-06,
"loss": 0.0493,
"step": 93
},
{
"epoch": 0.4116788321167883,
"grad_norm": 1.103407859802246,
"learning_rate": 4.7e-06,
"loss": 0.0495,
"step": 94
},
{
"epoch": 0.41605839416058393,
"grad_norm": 0.751805305480957,
"learning_rate": 4.75e-06,
"loss": 0.0484,
"step": 95
},
{
"epoch": 0.42043795620437957,
"grad_norm": 0.7717764973640442,
"learning_rate": 4.800000000000001e-06,
"loss": 0.0447,
"step": 96
},
{
"epoch": 0.4248175182481752,
"grad_norm": 0.7147190570831299,
"learning_rate": 4.85e-06,
"loss": 0.0523,
"step": 97
},
{
"epoch": 0.4291970802919708,
"grad_norm": 0.9990110993385315,
"learning_rate": 4.9000000000000005e-06,
"loss": 0.0454,
"step": 98
},
{
"epoch": 0.4335766423357664,
"grad_norm": 0.7766187191009521,
"learning_rate": 4.95e-06,
"loss": 0.0472,
"step": 99
},
{
"epoch": 0.43795620437956206,
"grad_norm": 0.7124347686767578,
"learning_rate": 5e-06,
"loss": 0.0473,
"step": 100
},
{
"epoch": 0.44233576642335765,
"grad_norm": 0.9340270757675171,
"learning_rate": 4.99999232689698e-06,
"loss": 0.0499,
"step": 101
},
{
"epoch": 0.4467153284671533,
"grad_norm": 0.7429985404014587,
"learning_rate": 4.999969307635021e-06,
"loss": 0.042,
"step": 102
},
{
"epoch": 0.4510948905109489,
"grad_norm": 0.9131317138671875,
"learning_rate": 4.999930942355425e-06,
"loss": 0.0519,
"step": 103
},
{
"epoch": 0.4554744525547445,
"grad_norm": 0.9970843195915222,
"learning_rate": 4.999877231293698e-06,
"loss": 0.0428,
"step": 104
},
{
"epoch": 0.45985401459854014,
"grad_norm": 0.7625145316123962,
"learning_rate": 4.999808174779543e-06,
"loss": 0.0442,
"step": 105
},
{
"epoch": 0.4642335766423358,
"grad_norm": 0.6059474945068359,
"learning_rate": 4.999723773236865e-06,
"loss": 0.0456,
"step": 106
},
{
"epoch": 0.4686131386861314,
"grad_norm": 0.6798833608627319,
"learning_rate": 4.999624027183758e-06,
"loss": 0.0408,
"step": 107
},
{
"epoch": 0.472992700729927,
"grad_norm": 1.0250803232192993,
"learning_rate": 4.999508937232514e-06,
"loss": 0.0471,
"step": 108
},
{
"epoch": 0.47737226277372263,
"grad_norm": 0.8457198739051819,
"learning_rate": 4.999378504089609e-06,
"loss": 0.0425,
"step": 109
},
{
"epoch": 0.48175182481751827,
"grad_norm": 0.9417868852615356,
"learning_rate": 4.999232728555705e-06,
"loss": 0.0388,
"step": 110
},
{
"epoch": 0.48613138686131385,
"grad_norm": 0.8558921813964844,
"learning_rate": 4.999071611525643e-06,
"loss": 0.0423,
"step": 111
},
{
"epoch": 0.4905109489051095,
"grad_norm": 0.7070104479789734,
"learning_rate": 4.998895153988437e-06,
"loss": 0.0354,
"step": 112
},
{
"epoch": 0.4948905109489051,
"grad_norm": 0.8162719011306763,
"learning_rate": 4.998703357027268e-06,
"loss": 0.0465,
"step": 113
},
{
"epoch": 0.4992700729927007,
"grad_norm": 0.9140358567237854,
"learning_rate": 4.998496221819479e-06,
"loss": 0.0457,
"step": 114
},
{
"epoch": 0.5036496350364964,
"grad_norm": 0.6447531580924988,
"learning_rate": 4.998273749636564e-06,
"loss": 0.039,
"step": 115
},
{
"epoch": 0.5080291970802919,
"grad_norm": 0.9157156944274902,
"learning_rate": 4.998035941844167e-06,
"loss": 0.0469,
"step": 116
},
{
"epoch": 0.5124087591240876,
"grad_norm": 0.7706230878829956,
"learning_rate": 4.997782799902065e-06,
"loss": 0.0325,
"step": 117
},
{
"epoch": 0.5167883211678832,
"grad_norm": 0.9391443729400635,
"learning_rate": 4.997514325364168e-06,
"loss": 0.0397,
"step": 118
},
{
"epoch": 0.5211678832116788,
"grad_norm": 1.0085054636001587,
"learning_rate": 4.997230519878499e-06,
"loss": 0.0403,
"step": 119
},
{
"epoch": 0.5255474452554745,
"grad_norm": 1.8318824768066406,
"learning_rate": 4.996931385187195e-06,
"loss": 0.0463,
"step": 120
},
{
"epoch": 0.5299270072992701,
"grad_norm": 1.0216630697250366,
"learning_rate": 4.9966169231264885e-06,
"loss": 0.0406,
"step": 121
},
{
"epoch": 0.5343065693430656,
"grad_norm": 1.4819082021713257,
"learning_rate": 4.9962871356267e-06,
"loss": 0.0485,
"step": 122
},
{
"epoch": 0.5386861313868613,
"grad_norm": 0.9435060024261475,
"learning_rate": 4.995942024712222e-06,
"loss": 0.04,
"step": 123
},
{
"epoch": 0.5430656934306569,
"grad_norm": 0.7887905240058899,
"learning_rate": 4.995581592501514e-06,
"loss": 0.0397,
"step": 124
},
{
"epoch": 0.5474452554744526,
"grad_norm": 0.8321148753166199,
"learning_rate": 4.995205841207082e-06,
"loss": 0.0413,
"step": 125
},
{
"epoch": 0.5518248175182482,
"grad_norm": 1.0303553342819214,
"learning_rate": 4.99481477313547e-06,
"loss": 0.0422,
"step": 126
},
{
"epoch": 0.5562043795620438,
"grad_norm": 0.7056427001953125,
"learning_rate": 4.994408390687241e-06,
"loss": 0.0362,
"step": 127
},
{
"epoch": 0.5605839416058395,
"grad_norm": 0.9762740135192871,
"learning_rate": 4.993986696356966e-06,
"loss": 0.0385,
"step": 128
},
{
"epoch": 0.564963503649635,
"grad_norm": 0.9447624683380127,
"learning_rate": 4.9935496927332095e-06,
"loss": 0.0402,
"step": 129
},
{
"epoch": 0.5693430656934306,
"grad_norm": 0.6106760501861572,
"learning_rate": 4.993097382498511e-06,
"loss": 0.0319,
"step": 130
},
{
"epoch": 0.5737226277372263,
"grad_norm": 1.0554594993591309,
"learning_rate": 4.992629768429367e-06,
"loss": 0.0437,
"step": 131
},
{
"epoch": 0.5781021897810219,
"grad_norm": 1.066218376159668,
"learning_rate": 4.992146853396219e-06,
"loss": 0.0382,
"step": 132
},
{
"epoch": 0.5824817518248175,
"grad_norm": 0.7517623901367188,
"learning_rate": 4.991648640363434e-06,
"loss": 0.0317,
"step": 133
},
{
"epoch": 0.5868613138686132,
"grad_norm": 0.8136976957321167,
"learning_rate": 4.991135132389282e-06,
"loss": 0.0339,
"step": 134
},
{
"epoch": 0.5912408759124088,
"grad_norm": 0.9254240989685059,
"learning_rate": 4.990606332625923e-06,
"loss": 0.0413,
"step": 135
},
{
"epoch": 0.5956204379562043,
"grad_norm": 0.6778447031974792,
"learning_rate": 4.990062244319387e-06,
"loss": 0.0377,
"step": 136
},
{
"epoch": 0.6,
"grad_norm": 1.1036059856414795,
"learning_rate": 4.989502870809547e-06,
"loss": 0.0376,
"step": 137
},
{
"epoch": 0.6043795620437956,
"grad_norm": 0.8054158091545105,
"learning_rate": 4.988928215530111e-06,
"loss": 0.0367,
"step": 138
},
{
"epoch": 0.6087591240875913,
"grad_norm": 0.9227175116539001,
"learning_rate": 4.988338282008588e-06,
"loss": 0.0374,
"step": 139
},
{
"epoch": 0.6131386861313869,
"grad_norm": 0.8502228260040283,
"learning_rate": 4.9877330738662755e-06,
"loss": 0.0384,
"step": 140
},
{
"epoch": 0.6175182481751825,
"grad_norm": 0.684752881526947,
"learning_rate": 4.987112594818232e-06,
"loss": 0.0366,
"step": 141
},
{
"epoch": 0.621897810218978,
"grad_norm": 0.7456391453742981,
"learning_rate": 4.9864768486732585e-06,
"loss": 0.037,
"step": 142
},
{
"epoch": 0.6262773722627737,
"grad_norm": 0.6797431111335754,
"learning_rate": 4.985825839333872e-06,
"loss": 0.0325,
"step": 143
},
{
"epoch": 0.6306569343065693,
"grad_norm": 0.8098205924034119,
"learning_rate": 4.985159570796279e-06,
"loss": 0.0343,
"step": 144
},
{
"epoch": 0.635036496350365,
"grad_norm": 0.8089592456817627,
"learning_rate": 4.984478047150361e-06,
"loss": 0.026,
"step": 145
},
{
"epoch": 0.6394160583941606,
"grad_norm": 0.9282512664794922,
"learning_rate": 4.983781272579637e-06,
"loss": 0.0334,
"step": 146
},
{
"epoch": 0.6437956204379562,
"grad_norm": 0.802608072757721,
"learning_rate": 4.9830692513612445e-06,
"loss": 0.0259,
"step": 147
},
{
"epoch": 0.6481751824817519,
"grad_norm": 1.3046361207962036,
"learning_rate": 4.982341987865914e-06,
"loss": 0.045,
"step": 148
},
{
"epoch": 0.6525547445255474,
"grad_norm": 1.0812411308288574,
"learning_rate": 4.9815994865579405e-06,
"loss": 0.0329,
"step": 149
},
{
"epoch": 0.656934306569343,
"grad_norm": 0.7856137156486511,
"learning_rate": 4.980841751995155e-06,
"loss": 0.0341,
"step": 150
},
{
"epoch": 0.6613138686131387,
"grad_norm": 1.0517083406448364,
"learning_rate": 4.980068788828897e-06,
"loss": 0.0299,
"step": 151
},
{
"epoch": 0.6656934306569343,
"grad_norm": 0.6148231029510498,
"learning_rate": 4.979280601803988e-06,
"loss": 0.0304,
"step": 152
},
{
"epoch": 0.67007299270073,
"grad_norm": 0.7572031021118164,
"learning_rate": 4.9784771957586995e-06,
"loss": 0.0309,
"step": 153
},
{
"epoch": 0.6744525547445256,
"grad_norm": 2.0948777198791504,
"learning_rate": 4.977658575624727e-06,
"loss": 0.0307,
"step": 154
},
{
"epoch": 0.6788321167883211,
"grad_norm": 0.624940037727356,
"learning_rate": 4.976824746427153e-06,
"loss": 0.03,
"step": 155
},
{
"epoch": 0.6832116788321168,
"grad_norm": 0.8346346616744995,
"learning_rate": 4.975975713284426e-06,
"loss": 0.036,
"step": 156
},
{
"epoch": 0.6875912408759124,
"grad_norm": 0.742098867893219,
"learning_rate": 4.975111481408319e-06,
"loss": 0.0325,
"step": 157
},
{
"epoch": 0.691970802919708,
"grad_norm": 0.8000304102897644,
"learning_rate": 4.9742320561039055e-06,
"loss": 0.0332,
"step": 158
},
{
"epoch": 0.6963503649635037,
"grad_norm": 1.063854694366455,
"learning_rate": 4.973337442769523e-06,
"loss": 0.0366,
"step": 159
},
{
"epoch": 0.7007299270072993,
"grad_norm": 0.965560257434845,
"learning_rate": 4.972427646896738e-06,
"loss": 0.0331,
"step": 160
},
{
"epoch": 0.7051094890510949,
"grad_norm": 1.5070244073867798,
"learning_rate": 4.971502674070317e-06,
"loss": 0.0446,
"step": 161
},
{
"epoch": 0.7094890510948905,
"grad_norm": 0.8810545206069946,
"learning_rate": 4.970562529968189e-06,
"loss": 0.0299,
"step": 162
},
{
"epoch": 0.7138686131386861,
"grad_norm": 0.7683446407318115,
"learning_rate": 4.969607220361414e-06,
"loss": 0.0244,
"step": 163
},
{
"epoch": 0.7182481751824817,
"grad_norm": 0.7444891929626465,
"learning_rate": 4.968636751114141e-06,
"loss": 0.0338,
"step": 164
},
{
"epoch": 0.7226277372262774,
"grad_norm": 0.7077688574790955,
"learning_rate": 4.96765112818358e-06,
"loss": 0.0285,
"step": 165
},
{
"epoch": 0.727007299270073,
"grad_norm": 0.5648500919342041,
"learning_rate": 4.9666503576199574e-06,
"loss": 0.026,
"step": 166
},
{
"epoch": 0.7313868613138687,
"grad_norm": 0.763556718826294,
"learning_rate": 4.965634445566489e-06,
"loss": 0.0299,
"step": 167
},
{
"epoch": 0.7357664233576642,
"grad_norm": 0.6892725825309753,
"learning_rate": 4.9646033982593315e-06,
"loss": 0.023,
"step": 168
},
{
"epoch": 0.7401459854014598,
"grad_norm": 1.0332573652267456,
"learning_rate": 4.963557222027551e-06,
"loss": 0.0313,
"step": 169
},
{
"epoch": 0.7445255474452555,
"grad_norm": 1.214428424835205,
"learning_rate": 4.962495923293081e-06,
"loss": 0.027,
"step": 170
},
{
"epoch": 0.7489051094890511,
"grad_norm": 0.9823130965232849,
"learning_rate": 4.961419508570686e-06,
"loss": 0.0231,
"step": 171
},
{
"epoch": 0.7532846715328467,
"grad_norm": 1.2535115480422974,
"learning_rate": 4.960327984467919e-06,
"loss": 0.0326,
"step": 172
},
{
"epoch": 0.7576642335766424,
"grad_norm": 0.9383441209793091,
"learning_rate": 4.959221357685081e-06,
"loss": 0.0286,
"step": 173
},
{
"epoch": 0.762043795620438,
"grad_norm": 1.0426976680755615,
"learning_rate": 4.958099635015182e-06,
"loss": 0.0298,
"step": 174
},
{
"epoch": 0.7664233576642335,
"grad_norm": 0.9159742593765259,
"learning_rate": 4.956962823343895e-06,
"loss": 0.025,
"step": 175
},
{
"epoch": 0.7708029197080292,
"grad_norm": 0.8746912479400635,
"learning_rate": 4.95581092964952e-06,
"loss": 0.0299,
"step": 176
},
{
"epoch": 0.7751824817518248,
"grad_norm": 0.9875199198722839,
"learning_rate": 4.954643961002936e-06,
"loss": 0.0309,
"step": 177
},
{
"epoch": 0.7795620437956204,
"grad_norm": 0.7389516234397888,
"learning_rate": 4.953461924567559e-06,
"loss": 0.0291,
"step": 178
},
{
"epoch": 0.7839416058394161,
"grad_norm": 0.790238082408905,
"learning_rate": 4.952264827599299e-06,
"loss": 0.0236,
"step": 179
},
{
"epoch": 0.7883211678832117,
"grad_norm": 0.6766819953918457,
"learning_rate": 4.951052677446515e-06,
"loss": 0.0238,
"step": 180
},
{
"epoch": 0.7927007299270074,
"grad_norm": 0.8832846283912659,
"learning_rate": 4.94982548154997e-06,
"loss": 0.0259,
"step": 181
},
{
"epoch": 0.7970802919708029,
"grad_norm": 0.7298055291175842,
"learning_rate": 4.948583247442783e-06,
"loss": 0.023,
"step": 182
},
{
"epoch": 0.8014598540145985,
"grad_norm": 0.911920428276062,
"learning_rate": 4.947325982750387e-06,
"loss": 0.0272,
"step": 183
},
{
"epoch": 0.8058394160583942,
"grad_norm": 0.9145316481590271,
"learning_rate": 4.946053695190479e-06,
"loss": 0.0248,
"step": 184
},
{
"epoch": 0.8102189781021898,
"grad_norm": 0.8759565353393555,
"learning_rate": 4.9447663925729735e-06,
"loss": 0.0263,
"step": 185
},
{
"epoch": 0.8145985401459854,
"grad_norm": 1.1927592754364014,
"learning_rate": 4.943464082799956e-06,
"loss": 0.0305,
"step": 186
},
{
"epoch": 0.8189781021897811,
"grad_norm": 0.752566933631897,
"learning_rate": 4.942146773865631e-06,
"loss": 0.0247,
"step": 187
},
{
"epoch": 0.8233576642335766,
"grad_norm": 1.1121447086334229,
"learning_rate": 4.940814473856278e-06,
"loss": 0.0293,
"step": 188
},
{
"epoch": 0.8277372262773722,
"grad_norm": 1.0319955348968506,
"learning_rate": 4.939467190950195e-06,
"loss": 0.0247,
"step": 189
},
{
"epoch": 0.8321167883211679,
"grad_norm": 0.7960589528083801,
"learning_rate": 4.938104933417655e-06,
"loss": 0.0232,
"step": 190
},
{
"epoch": 0.8364963503649635,
"grad_norm": 0.593197226524353,
"learning_rate": 4.936727709620853e-06,
"loss": 0.0232,
"step": 191
},
{
"epoch": 0.8408759124087591,
"grad_norm": 0.6710584759712219,
"learning_rate": 4.9353355280138525e-06,
"loss": 0.0278,
"step": 192
},
{
"epoch": 0.8452554744525548,
"grad_norm": 0.7627159357070923,
"learning_rate": 4.933928397142535e-06,
"loss": 0.0291,
"step": 193
},
{
"epoch": 0.8496350364963504,
"grad_norm": 0.4998359680175781,
"learning_rate": 4.93250632564455e-06,
"loss": 0.018,
"step": 194
},
{
"epoch": 0.8540145985401459,
"grad_norm": 0.8028760552406311,
"learning_rate": 4.931069322249258e-06,
"loss": 0.0193,
"step": 195
},
{
"epoch": 0.8583941605839416,
"grad_norm": 0.6061640977859497,
"learning_rate": 4.929617395777678e-06,
"loss": 0.0142,
"step": 196
},
{
"epoch": 0.8627737226277372,
"grad_norm": 0.5901748538017273,
"learning_rate": 4.928150555142436e-06,
"loss": 0.0177,
"step": 197
},
{
"epoch": 0.8671532846715329,
"grad_norm": 0.7800254225730896,
"learning_rate": 4.926668809347707e-06,
"loss": 0.0264,
"step": 198
},
{
"epoch": 0.8715328467153285,
"grad_norm": 0.9308339357376099,
"learning_rate": 4.925172167489162e-06,
"loss": 0.0247,
"step": 199
},
{
"epoch": 0.8759124087591241,
"grad_norm": 0.9651213884353638,
"learning_rate": 4.923660638753911e-06,
"loss": 0.0216,
"step": 200
},
{
"epoch": 0.8802919708029197,
"grad_norm": 1.1258251667022705,
"learning_rate": 4.9221342324204455e-06,
"loss": 0.0249,
"step": 201
},
{
"epoch": 0.8846715328467153,
"grad_norm": 1.0175387859344482,
"learning_rate": 4.9205929578585845e-06,
"loss": 0.0201,
"step": 202
},
{
"epoch": 0.8890510948905109,
"grad_norm": 1.5190610885620117,
"learning_rate": 4.9190368245294155e-06,
"loss": 0.0319,
"step": 203
},
{
"epoch": 0.8934306569343066,
"grad_norm": 0.9947767853736877,
"learning_rate": 4.917465841985234e-06,
"loss": 0.0228,
"step": 204
},
{
"epoch": 0.8978102189781022,
"grad_norm": 0.6416967511177063,
"learning_rate": 4.91588001986949e-06,
"loss": 0.0198,
"step": 205
},
{
"epoch": 0.9021897810218978,
"grad_norm": 0.6980161666870117,
"learning_rate": 4.914279367916724e-06,
"loss": 0.0172,
"step": 206
},
{
"epoch": 0.9065693430656935,
"grad_norm": 0.5301483869552612,
"learning_rate": 4.912663895952511e-06,
"loss": 0.0208,
"step": 207
},
{
"epoch": 0.910948905109489,
"grad_norm": 0.6047857999801636,
"learning_rate": 4.911033613893397e-06,
"loss": 0.0227,
"step": 208
},
{
"epoch": 0.9153284671532846,
"grad_norm": 0.6069537997245789,
"learning_rate": 4.909388531746837e-06,
"loss": 0.0195,
"step": 209
},
{
"epoch": 0.9197080291970803,
"grad_norm": 0.6859843730926514,
"learning_rate": 4.907728659611143e-06,
"loss": 0.0244,
"step": 210
},
{
"epoch": 0.9240875912408759,
"grad_norm": 0.6074005365371704,
"learning_rate": 4.906054007675408e-06,
"loss": 0.0195,
"step": 211
},
{
"epoch": 0.9284671532846716,
"grad_norm": 1.1983692646026611,
"learning_rate": 4.9043645862194545e-06,
"loss": 0.023,
"step": 212
},
{
"epoch": 0.9328467153284672,
"grad_norm": 0.8806214928627014,
"learning_rate": 4.902660405613767e-06,
"loss": 0.0243,
"step": 213
},
{
"epoch": 0.9372262773722628,
"grad_norm": 0.6523962616920471,
"learning_rate": 4.900941476319426e-06,
"loss": 0.016,
"step": 214
},
{
"epoch": 0.9416058394160584,
"grad_norm": 0.5673899054527283,
"learning_rate": 4.899207808888051e-06,
"loss": 0.0158,
"step": 215
},
{
"epoch": 0.945985401459854,
"grad_norm": 0.9643133282661438,
"learning_rate": 4.897459413961729e-06,
"loss": 0.0194,
"step": 216
},
{
"epoch": 0.9503649635036496,
"grad_norm": 0.6007612347602844,
"learning_rate": 4.8956963022729495e-06,
"loss": 0.0187,
"step": 217
},
{
"epoch": 0.9547445255474453,
"grad_norm": 0.968173623085022,
"learning_rate": 4.893918484644545e-06,
"loss": 0.0223,
"step": 218
},
{
"epoch": 0.9591240875912409,
"grad_norm": 0.6649457216262817,
"learning_rate": 4.892125971989616e-06,
"loss": 0.0205,
"step": 219
},
{
"epoch": 0.9635036496350365,
"grad_norm": 0.48259082436561584,
"learning_rate": 4.890318775311471e-06,
"loss": 0.0121,
"step": 220
},
{
"epoch": 0.9678832116788321,
"grad_norm": 0.8284991383552551,
"learning_rate": 4.888496905703554e-06,
"loss": 0.0176,
"step": 221
},
{
"epoch": 0.9722627737226277,
"grad_norm": 0.5141683220863342,
"learning_rate": 4.8866603743493805e-06,
"loss": 0.0154,
"step": 222
},
{
"epoch": 0.9766423357664233,
"grad_norm": 1.0223891735076904,
"learning_rate": 4.884809192522466e-06,
"loss": 0.0111,
"step": 223
},
{
"epoch": 0.981021897810219,
"grad_norm": 0.844782292842865,
"learning_rate": 4.882943371586256e-06,
"loss": 0.016,
"step": 224
},
{
"epoch": 0.9854014598540146,
"grad_norm": 0.6978311538696289,
"learning_rate": 4.881062922994061e-06,
"loss": 0.0129,
"step": 225
},
{
"epoch": 0.9897810218978103,
"grad_norm": 0.8764100074768066,
"learning_rate": 4.879167858288982e-06,
"loss": 0.0213,
"step": 226
},
{
"epoch": 0.9941605839416059,
"grad_norm": 1.0449023246765137,
"learning_rate": 4.877258189103839e-06,
"loss": 0.015,
"step": 227
},
{
"epoch": 0.9985401459854014,
"grad_norm": 0.7534664869308472,
"learning_rate": 4.875333927161104e-06,
"loss": 0.0144,
"step": 228
},
{
"epoch": 1.0,
"grad_norm": 0.7534664869308472,
"learning_rate": 4.8733950842728236e-06,
"loss": 0.0186,
"step": 229
},
{
"epoch": 1.0043795620437956,
"grad_norm": 1.4982736110687256,
"learning_rate": 4.871441672340551e-06,
"loss": 0.0126,
"step": 230
},
{
"epoch": 1.0087591240875913,
"grad_norm": 1.206292986869812,
"learning_rate": 4.869473703355273e-06,
"loss": 0.0165,
"step": 231
},
{
"epoch": 1.013138686131387,
"grad_norm": 0.4586186408996582,
"learning_rate": 4.867491189397331e-06,
"loss": 0.0089,
"step": 232
},
{
"epoch": 1.0175182481751825,
"grad_norm": 0.5647240281105042,
"learning_rate": 4.8654941426363525e-06,
"loss": 0.0122,
"step": 233
},
{
"epoch": 1.0218978102189782,
"grad_norm": 0.6478530764579773,
"learning_rate": 4.863482575331173e-06,
"loss": 0.012,
"step": 234
},
{
"epoch": 1.0262773722627738,
"grad_norm": 0.48696213960647583,
"learning_rate": 4.861456499829764e-06,
"loss": 0.0092,
"step": 235
},
{
"epoch": 1.0306569343065692,
"grad_norm": 0.6736640334129333,
"learning_rate": 4.859415928569154e-06,
"loss": 0.0149,
"step": 236
},
{
"epoch": 1.0350364963503649,
"grad_norm": 0.6518754363059998,
"learning_rate": 4.857360874075355e-06,
"loss": 0.0085,
"step": 237
},
{
"epoch": 1.0394160583941605,
"grad_norm": 0.5145443677902222,
"learning_rate": 4.855291348963281e-06,
"loss": 0.0102,
"step": 238
},
{
"epoch": 1.0437956204379562,
"grad_norm": 0.5647151470184326,
"learning_rate": 4.853207365936676e-06,
"loss": 0.0065,
"step": 239
},
{
"epoch": 1.0481751824817518,
"grad_norm": 0.46668219566345215,
"learning_rate": 4.8511089377880334e-06,
"loss": 0.0081,
"step": 240
},
{
"epoch": 1.0525547445255474,
"grad_norm": 0.9103809595108032,
"learning_rate": 4.848996077398518e-06,
"loss": 0.0107,
"step": 241
},
{
"epoch": 1.056934306569343,
"grad_norm": 0.5947101712226868,
"learning_rate": 4.8468687977378855e-06,
"loss": 0.0095,
"step": 242
},
{
"epoch": 1.0613138686131387,
"grad_norm": 0.7154219150543213,
"learning_rate": 4.844727111864405e-06,
"loss": 0.0097,
"step": 243
},
{
"epoch": 1.0656934306569343,
"grad_norm": 0.9023681282997131,
"learning_rate": 4.842571032924778e-06,
"loss": 0.0105,
"step": 244
},
{
"epoch": 1.07007299270073,
"grad_norm": 0.6020027995109558,
"learning_rate": 4.840400574154056e-06,
"loss": 0.0065,
"step": 245
},
{
"epoch": 1.0744525547445256,
"grad_norm": 0.7602945566177368,
"learning_rate": 4.838215748875562e-06,
"loss": 0.0121,
"step": 246
},
{
"epoch": 1.0788321167883212,
"grad_norm": 0.8768120408058167,
"learning_rate": 4.83601657050081e-06,
"loss": 0.0146,
"step": 247
},
{
"epoch": 1.0832116788321169,
"grad_norm": 0.7482877373695374,
"learning_rate": 4.833803052529414e-06,
"loss": 0.0076,
"step": 248
},
{
"epoch": 1.0875912408759123,
"grad_norm": 0.4619101881980896,
"learning_rate": 4.831575208549018e-06,
"loss": 0.0114,
"step": 249
},
{
"epoch": 1.091970802919708,
"grad_norm": 0.7442188262939453,
"learning_rate": 4.829333052235202e-06,
"loss": 0.0119,
"step": 250
},
{
"epoch": 1.0963503649635036,
"grad_norm": 0.754559338092804,
"learning_rate": 4.827076597351403e-06,
"loss": 0.011,
"step": 251
},
{
"epoch": 1.1007299270072992,
"grad_norm": 0.8147054314613342,
"learning_rate": 4.824805857748831e-06,
"loss": 0.0098,
"step": 252
},
{
"epoch": 1.1051094890510949,
"grad_norm": 0.814437985420227,
"learning_rate": 4.82252084736638e-06,
"loss": 0.0077,
"step": 253
},
{
"epoch": 1.1094890510948905,
"grad_norm": 0.7731255888938904,
"learning_rate": 4.820221580230545e-06,
"loss": 0.0129,
"step": 254
},
{
"epoch": 1.1138686131386861,
"grad_norm": 0.7589200139045715,
"learning_rate": 4.8179080704553386e-06,
"loss": 0.0095,
"step": 255
},
{
"epoch": 1.1182481751824818,
"grad_norm": 0.455625057220459,
"learning_rate": 4.815580332242199e-06,
"loss": 0.0088,
"step": 256
},
{
"epoch": 1.1226277372262774,
"grad_norm": 0.51591956615448,
"learning_rate": 4.8132383798799075e-06,
"loss": 0.0071,
"step": 257
},
{
"epoch": 1.127007299270073,
"grad_norm": 0.6024675965309143,
"learning_rate": 4.810882227744495e-06,
"loss": 0.0108,
"step": 258
},
{
"epoch": 1.1313868613138687,
"grad_norm": 0.6686123609542847,
"learning_rate": 4.808511890299163e-06,
"loss": 0.0139,
"step": 259
},
{
"epoch": 1.1357664233576643,
"grad_norm": 0.7872790694236755,
"learning_rate": 4.806127382094184e-06,
"loss": 0.0113,
"step": 260
},
{
"epoch": 1.14014598540146,
"grad_norm": 0.6551967263221741,
"learning_rate": 4.803728717766822e-06,
"loss": 0.0069,
"step": 261
},
{
"epoch": 1.1445255474452556,
"grad_norm": 0.7421084642410278,
"learning_rate": 4.801315912041232e-06,
"loss": 0.0083,
"step": 262
},
{
"epoch": 1.148905109489051,
"grad_norm": 0.6349561810493469,
"learning_rate": 4.798888979728382e-06,
"loss": 0.0097,
"step": 263
},
{
"epoch": 1.1532846715328466,
"grad_norm": 0.6274579167366028,
"learning_rate": 4.796447935725954e-06,
"loss": 0.0089,
"step": 264
},
{
"epoch": 1.1576642335766423,
"grad_norm": 0.5055127739906311,
"learning_rate": 4.793992795018253e-06,
"loss": 0.0062,
"step": 265
},
{
"epoch": 1.162043795620438,
"grad_norm": 1.1284935474395752,
"learning_rate": 4.791523572676115e-06,
"loss": 0.0118,
"step": 266
},
{
"epoch": 1.1664233576642336,
"grad_norm": 0.6343486905097961,
"learning_rate": 4.789040283856822e-06,
"loss": 0.0058,
"step": 267
},
{
"epoch": 1.1708029197080292,
"grad_norm": 0.9384168982505798,
"learning_rate": 4.7865429438039955e-06,
"loss": 0.0096,
"step": 268
},
{
"epoch": 1.1751824817518248,
"grad_norm": 0.879307746887207,
"learning_rate": 4.784031567847515e-06,
"loss": 0.0129,
"step": 269
},
{
"epoch": 1.1795620437956205,
"grad_norm": 0.5268783569335938,
"learning_rate": 4.781506171403416e-06,
"loss": 0.0073,
"step": 270
},
{
"epoch": 1.183941605839416,
"grad_norm": 1.332766056060791,
"learning_rate": 4.778966769973802e-06,
"loss": 0.0129,
"step": 271
},
{
"epoch": 1.1883211678832117,
"grad_norm": 0.7192438244819641,
"learning_rate": 4.7764133791467434e-06,
"loss": 0.0083,
"step": 272
},
{
"epoch": 1.1927007299270074,
"grad_norm": 0.5047981142997742,
"learning_rate": 4.773846014596185e-06,
"loss": 0.0057,
"step": 273
},
{
"epoch": 1.197080291970803,
"grad_norm": 0.5075733661651611,
"learning_rate": 4.7712646920818486e-06,
"loss": 0.0098,
"step": 274
},
{
"epoch": 1.2014598540145984,
"grad_norm": 0.5874909162521362,
"learning_rate": 4.7686694274491375e-06,
"loss": 0.0072,
"step": 275
},
{
"epoch": 1.205839416058394,
"grad_norm": 0.511114239692688,
"learning_rate": 4.766060236629037e-06,
"loss": 0.0058,
"step": 276
},
{
"epoch": 1.2102189781021897,
"grad_norm": 0.5427272915840149,
"learning_rate": 4.763437135638021e-06,
"loss": 0.0094,
"step": 277
},
{
"epoch": 1.2145985401459853,
"grad_norm": 0.6207345724105835,
"learning_rate": 4.760800140577947e-06,
"loss": 0.0117,
"step": 278
},
{
"epoch": 1.218978102189781,
"grad_norm": 0.9132710695266724,
"learning_rate": 4.758149267635963e-06,
"loss": 0.0085,
"step": 279
},
{
"epoch": 1.2233576642335766,
"grad_norm": 0.500217080116272,
"learning_rate": 4.755484533084407e-06,
"loss": 0.01,
"step": 280
},
{
"epoch": 1.2277372262773723,
"grad_norm": 0.38535866141319275,
"learning_rate": 4.7528059532807045e-06,
"loss": 0.0038,
"step": 281
},
{
"epoch": 1.2321167883211679,
"grad_norm": 0.5505772233009338,
"learning_rate": 4.750113544667271e-06,
"loss": 0.0064,
"step": 282
},
{
"epoch": 1.2364963503649635,
"grad_norm": 0.5370091795921326,
"learning_rate": 4.747407323771408e-06,
"loss": 0.0083,
"step": 283
},
{
"epoch": 1.2408759124087592,
"grad_norm": 0.6680497527122498,
"learning_rate": 4.744687307205207e-06,
"loss": 0.006,
"step": 284
},
{
"epoch": 1.2452554744525548,
"grad_norm": 0.5799117088317871,
"learning_rate": 4.74195351166544e-06,
"loss": 0.0067,
"step": 285
},
{
"epoch": 1.2496350364963504,
"grad_norm": 0.3809143304824829,
"learning_rate": 4.739205953933464e-06,
"loss": 0.0081,
"step": 286
},
{
"epoch": 1.254014598540146,
"grad_norm": 0.8633838891983032,
"learning_rate": 4.736444650875114e-06,
"loss": 0.0083,
"step": 287
},
{
"epoch": 1.2583941605839417,
"grad_norm": 0.4796256124973297,
"learning_rate": 4.7336696194405995e-06,
"loss": 0.0083,
"step": 288
},
{
"epoch": 1.2627737226277373,
"grad_norm": 0.8990418314933777,
"learning_rate": 4.730880876664402e-06,
"loss": 0.0053,
"step": 289
},
{
"epoch": 1.2671532846715328,
"grad_norm": 0.21372799575328827,
"learning_rate": 4.72807843966517e-06,
"loss": 0.0042,
"step": 290
},
{
"epoch": 1.2715328467153284,
"grad_norm": 1.0377510786056519,
"learning_rate": 4.725262325645615e-06,
"loss": 0.0083,
"step": 291
},
{
"epoch": 1.275912408759124,
"grad_norm": 0.29527121782302856,
"learning_rate": 4.722432551892402e-06,
"loss": 0.0023,
"step": 292
},
{
"epoch": 1.2802919708029197,
"grad_norm": 1.5753306150436401,
"learning_rate": 4.719589135776048e-06,
"loss": 0.0089,
"step": 293
},
{
"epoch": 1.2846715328467153,
"grad_norm": 0.3794252574443817,
"learning_rate": 4.716732094750813e-06,
"loss": 0.003,
"step": 294
},
{
"epoch": 1.289051094890511,
"grad_norm": 0.5407822132110596,
"learning_rate": 4.7138614463545926e-06,
"loss": 0.0075,
"step": 295
},
{
"epoch": 1.2934306569343066,
"grad_norm": 0.8722830414772034,
"learning_rate": 4.710977208208812e-06,
"loss": 0.0051,
"step": 296
},
{
"epoch": 1.2978102189781022,
"grad_norm": 0.6819527745246887,
"learning_rate": 4.708079398018316e-06,
"loss": 0.0094,
"step": 297
},
{
"epoch": 1.3021897810218979,
"grad_norm": 0.7198041677474976,
"learning_rate": 4.7051680335712626e-06,
"loss": 0.0068,
"step": 298
},
{
"epoch": 1.3065693430656935,
"grad_norm": 0.467638224363327,
"learning_rate": 4.70224313273901e-06,
"loss": 0.0059,
"step": 299
},
{
"epoch": 1.310948905109489,
"grad_norm": 0.4593437612056732,
"learning_rate": 4.699304713476009e-06,
"loss": 0.0039,
"step": 300
},
{
"epoch": 1.3153284671532846,
"grad_norm": 0.5790926814079285,
"learning_rate": 4.696352793819698e-06,
"loss": 0.0057,
"step": 301
},
{
"epoch": 1.3197080291970802,
"grad_norm": 0.3413192331790924,
"learning_rate": 4.693387391890382e-06,
"loss": 0.0055,
"step": 302
},
{
"epoch": 1.3240875912408758,
"grad_norm": 0.5049291849136353,
"learning_rate": 4.690408525891129e-06,
"loss": 0.0061,
"step": 303
},
{
"epoch": 1.3284671532846715,
"grad_norm": 0.25111323595046997,
"learning_rate": 4.687416214107655e-06,
"loss": 0.0041,
"step": 304
},
{
"epoch": 1.332846715328467,
"grad_norm": 0.5559152364730835,
"learning_rate": 4.684410474908214e-06,
"loss": 0.0093,
"step": 305
},
{
"epoch": 1.3372262773722627,
"grad_norm": 0.3842668831348419,
"learning_rate": 4.681391326743484e-06,
"loss": 0.0051,
"step": 306
},
{
"epoch": 1.3416058394160584,
"grad_norm": 1.6264209747314453,
"learning_rate": 4.67835878814645e-06,
"loss": 0.0063,
"step": 307
},
{
"epoch": 1.345985401459854,
"grad_norm": 0.5829497575759888,
"learning_rate": 4.6753128777323e-06,
"loss": 0.0054,
"step": 308
},
{
"epoch": 1.3503649635036497,
"grad_norm": 0.6949307322502136,
"learning_rate": 4.6722536141982995e-06,
"loss": 0.0055,
"step": 309
},
{
"epoch": 1.3547445255474453,
"grad_norm": 0.6198911070823669,
"learning_rate": 4.669181016323686e-06,
"loss": 0.0063,
"step": 310
},
{
"epoch": 1.359124087591241,
"grad_norm": 0.4557003080844879,
"learning_rate": 4.666095102969545e-06,
"loss": 0.0053,
"step": 311
},
{
"epoch": 1.3635036496350366,
"grad_norm": 0.7198585271835327,
"learning_rate": 4.662995893078702e-06,
"loss": 0.0048,
"step": 312
},
{
"epoch": 1.3678832116788322,
"grad_norm": 0.4380558431148529,
"learning_rate": 4.659883405675604e-06,
"loss": 0.0057,
"step": 313
},
{
"epoch": 1.3722627737226278,
"grad_norm": 0.986754298210144,
"learning_rate": 4.656757659866199e-06,
"loss": 0.0091,
"step": 314
},
{
"epoch": 1.3766423357664235,
"grad_norm": 1.1282256841659546,
"learning_rate": 4.6536186748378236e-06,
"loss": 0.0058,
"step": 315
},
{
"epoch": 1.3810218978102191,
"grad_norm": 0.3973119854927063,
"learning_rate": 4.6504664698590795e-06,
"loss": 0.0048,
"step": 316
},
{
"epoch": 1.3854014598540145,
"grad_norm": 0.4406156837940216,
"learning_rate": 4.647301064279725e-06,
"loss": 0.0039,
"step": 317
},
{
"epoch": 1.3897810218978102,
"grad_norm": 0.8249232172966003,
"learning_rate": 4.644122477530545e-06,
"loss": 0.0084,
"step": 318
},
{
"epoch": 1.3941605839416058,
"grad_norm": 1.4877322912216187,
"learning_rate": 4.640930729123237e-06,
"loss": 0.0054,
"step": 319
},
{
"epoch": 1.3985401459854014,
"grad_norm": 0.4890510141849518,
"learning_rate": 4.6377258386502956e-06,
"loss": 0.0021,
"step": 320
},
{
"epoch": 1.402919708029197,
"grad_norm": 0.36471042037010193,
"learning_rate": 4.634507825784882e-06,
"loss": 0.004,
"step": 321
},
{
"epoch": 1.4072992700729927,
"grad_norm": 1.1714568138122559,
"learning_rate": 4.631276710280713e-06,
"loss": 0.0079,
"step": 322
},
{
"epoch": 1.4116788321167884,
"grad_norm": 0.509325385093689,
"learning_rate": 4.628032511971934e-06,
"loss": 0.0027,
"step": 323
},
{
"epoch": 1.416058394160584,
"grad_norm": 0.34730231761932373,
"learning_rate": 4.624775250772999e-06,
"loss": 0.004,
"step": 324
},
{
"epoch": 1.4204379562043796,
"grad_norm": 0.4304009974002838,
"learning_rate": 4.6215049466785484e-06,
"loss": 0.0046,
"step": 325
},
{
"epoch": 1.4248175182481753,
"grad_norm": 0.721092700958252,
"learning_rate": 4.618221619763287e-06,
"loss": 0.0042,
"step": 326
},
{
"epoch": 1.4291970802919707,
"grad_norm": 0.9019221067428589,
"learning_rate": 4.6149252901818585e-06,
"loss": 0.008,
"step": 327
},
{
"epoch": 1.4335766423357663,
"grad_norm": 3.142669439315796,
"learning_rate": 4.611615978168725e-06,
"loss": 0.0053,
"step": 328
},
{
"epoch": 1.437956204379562,
"grad_norm": 0.8218545317649841,
"learning_rate": 4.608293704038039e-06,
"loss": 0.007,
"step": 329
},
{
"epoch": 1.4423357664233576,
"grad_norm": 0.49122154712677,
"learning_rate": 4.604958488183523e-06,
"loss": 0.0056,
"step": 330
},
{
"epoch": 1.4467153284671532,
"grad_norm": 0.7947913408279419,
"learning_rate": 4.6016103510783405e-06,
"loss": 0.0069,
"step": 331
},
{
"epoch": 1.4510948905109489,
"grad_norm": 0.38262632489204407,
"learning_rate": 4.598249313274972e-06,
"loss": 0.0054,
"step": 332
},
{
"epoch": 1.4554744525547445,
"grad_norm": 0.7605669498443604,
"learning_rate": 4.59487539540509e-06,
"loss": 0.0074,
"step": 333
},
{
"epoch": 1.4598540145985401,
"grad_norm": 0.4355056583881378,
"learning_rate": 4.591488618179428e-06,
"loss": 0.0027,
"step": 334
},
{
"epoch": 1.4642335766423358,
"grad_norm": 0.4696539640426636,
"learning_rate": 4.58808900238766e-06,
"loss": 0.0063,
"step": 335
},
{
"epoch": 1.4686131386861314,
"grad_norm": 0.4078298807144165,
"learning_rate": 4.584676568898267e-06,
"loss": 0.0039,
"step": 336
},
{
"epoch": 1.472992700729927,
"grad_norm": 0.22500784695148468,
"learning_rate": 4.581251338658412e-06,
"loss": 0.0027,
"step": 337
},
{
"epoch": 1.4773722627737227,
"grad_norm": 0.28224533796310425,
"learning_rate": 4.577813332693812e-06,
"loss": 0.0037,
"step": 338
},
{
"epoch": 1.4817518248175183,
"grad_norm": 0.4234824478626251,
"learning_rate": 4.574362572108604e-06,
"loss": 0.0057,
"step": 339
},
{
"epoch": 1.486131386861314,
"grad_norm": 0.4610466957092285,
"learning_rate": 4.570899078085223e-06,
"loss": 0.0033,
"step": 340
},
{
"epoch": 1.4905109489051096,
"grad_norm": 0.8538670539855957,
"learning_rate": 4.567422871884265e-06,
"loss": 0.0044,
"step": 341
},
{
"epoch": 1.4948905109489052,
"grad_norm": 0.4335832893848419,
"learning_rate": 4.563933974844361e-06,
"loss": 0.0041,
"step": 342
},
{
"epoch": 1.4992700729927007,
"grad_norm": 0.4888335168361664,
"learning_rate": 4.560432408382045e-06,
"loss": 0.003,
"step": 343
},
{
"epoch": 1.5036496350364965,
"grad_norm": 0.545806884765625,
"learning_rate": 4.5569181939916195e-06,
"loss": 0.0062,
"step": 344
},
{
"epoch": 1.508029197080292,
"grad_norm": 0.7364339828491211,
"learning_rate": 4.553391353245029e-06,
"loss": 0.0068,
"step": 345
},
{
"epoch": 1.5124087591240876,
"grad_norm": 0.7074061036109924,
"learning_rate": 4.549851907791722e-06,
"loss": 0.0034,
"step": 346
},
{
"epoch": 1.5167883211678832,
"grad_norm": 0.39756596088409424,
"learning_rate": 4.546299879358524e-06,
"loss": 0.0032,
"step": 347
},
{
"epoch": 1.5211678832116788,
"grad_norm": 0.6966583728790283,
"learning_rate": 4.542735289749498e-06,
"loss": 0.0013,
"step": 348
},
{
"epoch": 1.5255474452554745,
"grad_norm": 0.19892163574695587,
"learning_rate": 4.5391581608458144e-06,
"loss": 0.0011,
"step": 349
},
{
"epoch": 1.5299270072992701,
"grad_norm": 0.718493640422821,
"learning_rate": 4.535568514605617e-06,
"loss": 0.0026,
"step": 350
},
{
"epoch": 1.5343065693430655,
"grad_norm": 0.8941331505775452,
"learning_rate": 4.5319663730638865e-06,
"loss": 0.0034,
"step": 351
},
{
"epoch": 1.5386861313868612,
"grad_norm": 0.33956244587898254,
"learning_rate": 4.528351758332303e-06,
"loss": 0.002,
"step": 352
},
{
"epoch": 1.5430656934306568,
"grad_norm": 0.557651937007904,
"learning_rate": 4.5247246925991185e-06,
"loss": 0.0013,
"step": 353
},
{
"epoch": 1.5474452554744524,
"grad_norm": 0.7165636420249939,
"learning_rate": 4.5210851981290096e-06,
"loss": 0.003,
"step": 354
},
{
"epoch": 1.551824817518248,
"grad_norm": 0.36456218361854553,
"learning_rate": 4.5174332972629505e-06,
"loss": 0.0022,
"step": 355
},
{
"epoch": 1.5562043795620437,
"grad_norm": 0.1896594613790512,
"learning_rate": 4.5137690124180714e-06,
"loss": 0.0056,
"step": 356
},
{
"epoch": 1.5605839416058394,
"grad_norm": 0.6159863471984863,
"learning_rate": 4.510092366087518e-06,
"loss": 0.0057,
"step": 357
},
{
"epoch": 1.564963503649635,
"grad_norm": 1.0295354127883911,
"learning_rate": 4.506403380840321e-06,
"loss": 0.0011,
"step": 358
},
{
"epoch": 1.5693430656934306,
"grad_norm": 0.33694684505462646,
"learning_rate": 4.50270207932125e-06,
"loss": 0.0024,
"step": 359
},
{
"epoch": 1.5737226277372263,
"grad_norm": 0.8961917757987976,
"learning_rate": 4.498988484250681e-06,
"loss": 0.0058,
"step": 360
},
{
"epoch": 1.578102189781022,
"grad_norm": 1.736559510231018,
"learning_rate": 4.4952626184244504e-06,
"loss": 0.006,
"step": 361
},
{
"epoch": 1.5824817518248175,
"grad_norm": 0.41748425364494324,
"learning_rate": 4.491524504713722e-06,
"loss": 0.0017,
"step": 362
},
{
"epoch": 1.5868613138686132,
"grad_norm": 0.501815140247345,
"learning_rate": 4.487774166064839e-06,
"loss": 0.0018,
"step": 363
},
{
"epoch": 1.5912408759124088,
"grad_norm": 0.4359874427318573,
"learning_rate": 4.48401162549919e-06,
"loss": 0.0044,
"step": 364
},
{
"epoch": 1.5956204379562045,
"grad_norm": 0.3699054718017578,
"learning_rate": 4.480236906113066e-06,
"loss": 0.0036,
"step": 365
},
{
"epoch": 1.6,
"grad_norm": 0.5684164762496948,
"learning_rate": 4.476450031077512e-06,
"loss": 0.0023,
"step": 366
},
{
"epoch": 1.6043795620437957,
"grad_norm": 0.6451728343963623,
"learning_rate": 4.4726510236381956e-06,
"loss": 0.0044,
"step": 367
},
{
"epoch": 1.6087591240875914,
"grad_norm": 2.3887782096862793,
"learning_rate": 4.468839907115259e-06,
"loss": 0.0059,
"step": 368
},
{
"epoch": 1.613138686131387,
"grad_norm": 0.6304333806037903,
"learning_rate": 4.465016704903171e-06,
"loss": 0.0023,
"step": 369
},
{
"epoch": 1.6175182481751826,
"grad_norm": 0.38788676261901855,
"learning_rate": 4.461181440470592e-06,
"loss": 0.0027,
"step": 370
},
{
"epoch": 1.621897810218978,
"grad_norm": 0.3805489242076874,
"learning_rate": 4.457334137360226e-06,
"loss": 0.0012,
"step": 371
},
{
"epoch": 1.6262773722627737,
"grad_norm": 0.3548617660999298,
"learning_rate": 4.453474819188676e-06,
"loss": 0.0032,
"step": 372
},
{
"epoch": 1.6306569343065693,
"grad_norm": 0.8332701921463013,
"learning_rate": 4.449603509646297e-06,
"loss": 0.0028,
"step": 373
},
{
"epoch": 1.635036496350365,
"grad_norm": 0.7843290567398071,
"learning_rate": 4.445720232497055e-06,
"loss": 0.0037,
"step": 374
},
{
"epoch": 1.6394160583941606,
"grad_norm": 0.7074784636497498,
"learning_rate": 4.44182501157838e-06,
"loss": 0.003,
"step": 375
},
{
"epoch": 1.6437956204379562,
"grad_norm": 0.6076835989952087,
"learning_rate": 4.4379178708010155e-06,
"loss": 0.0019,
"step": 376
},
{
"epoch": 1.6481751824817519,
"grad_norm": 0.5793138146400452,
"learning_rate": 4.433998834148877e-06,
"loss": 0.0036,
"step": 377
},
{
"epoch": 1.6525547445255473,
"grad_norm": 0.7881670594215393,
"learning_rate": 4.430067925678902e-06,
"loss": 0.0025,
"step": 378
},
{
"epoch": 1.656934306569343,
"grad_norm": 0.24594959616661072,
"learning_rate": 4.426125169520903e-06,
"loss": 0.0022,
"step": 379
},
{
"epoch": 1.6613138686131386,
"grad_norm": 0.2806392312049866,
"learning_rate": 4.42217058987742e-06,
"loss": 0.0005,
"step": 380
},
{
"epoch": 1.6656934306569342,
"grad_norm": 0.4979081153869629,
"learning_rate": 4.418204211023569e-06,
"loss": 0.0021,
"step": 381
},
{
"epoch": 1.6700729927007298,
"grad_norm": 0.42502567172050476,
"learning_rate": 4.4142260573068995e-06,
"loss": 0.0053,
"step": 382
},
{
"epoch": 1.6744525547445255,
"grad_norm": 1.1811860799789429,
"learning_rate": 4.410236153147235e-06,
"loss": 0.0026,
"step": 383
},
{
"epoch": 1.6788321167883211,
"grad_norm": 0.4582519829273224,
"learning_rate": 4.4062345230365345e-06,
"loss": 0.0024,
"step": 384
},
{
"epoch": 1.6832116788321168,
"grad_norm": 0.30464282631874084,
"learning_rate": 4.402221191538733e-06,
"loss": 0.0055,
"step": 385
},
{
"epoch": 1.6875912408759124,
"grad_norm": 0.22526738047599792,
"learning_rate": 4.3981961832895945e-06,
"loss": 0.0003,
"step": 386
},
{
"epoch": 1.691970802919708,
"grad_norm": 0.32826468348503113,
"learning_rate": 4.394159522996564e-06,
"loss": 0.0009,
"step": 387
},
{
"epoch": 1.6963503649635037,
"grad_norm": 0.5943058133125305,
"learning_rate": 4.390111235438606e-06,
"loss": 0.0028,
"step": 388
},
{
"epoch": 1.7007299270072993,
"grad_norm": 1.7098802328109741,
"learning_rate": 4.3860513454660666e-06,
"loss": 0.0035,
"step": 389
},
{
"epoch": 1.705109489051095,
"grad_norm": 0.36092230677604675,
"learning_rate": 4.381979878000506e-06,
"loss": 0.0037,
"step": 390
},
{
"epoch": 1.7094890510948906,
"grad_norm": 0.2771202027797699,
"learning_rate": 4.377896858034557e-06,
"loss": 0.0018,
"step": 391
},
{
"epoch": 1.7138686131386862,
"grad_norm": 0.12323533743619919,
"learning_rate": 4.373802310631765e-06,
"loss": 0.0008,
"step": 392
},
{
"epoch": 1.7182481751824819,
"grad_norm": 0.19630667567253113,
"learning_rate": 4.3696962609264375e-06,
"loss": 0.0008,
"step": 393
},
{
"epoch": 1.7226277372262775,
"grad_norm": 0.4139691889286041,
"learning_rate": 4.365578734123489e-06,
"loss": 0.0031,
"step": 394
},
{
"epoch": 1.7270072992700731,
"grad_norm": 0.6594070196151733,
"learning_rate": 4.3614497554982845e-06,
"loss": 0.0044,
"step": 395
},
{
"epoch": 1.7313868613138688,
"grad_norm": 0.2723977863788605,
"learning_rate": 4.357309350396488e-06,
"loss": 0.0018,
"step": 396
},
{
"epoch": 1.7357664233576642,
"grad_norm": 0.16032417118549347,
"learning_rate": 4.3531575442339025e-06,
"loss": 0.0005,
"step": 397
},
{
"epoch": 1.7401459854014598,
"grad_norm": 0.3799298107624054,
"learning_rate": 4.348994362496316e-06,
"loss": 0.006,
"step": 398
},
{
"epoch": 1.7445255474452555,
"grad_norm": 0.28333285450935364,
"learning_rate": 4.344819830739349e-06,
"loss": 0.0015,
"step": 399
},
{
"epoch": 1.748905109489051,
"grad_norm": 0.3942627012729645,
"learning_rate": 4.34063397458829e-06,
"loss": 0.0018,
"step": 400
},
{
"epoch": 1.7532846715328467,
"grad_norm": 0.8048702478408813,
"learning_rate": 4.336436819737942e-06,
"loss": 0.0021,
"step": 401
},
{
"epoch": 1.7576642335766424,
"grad_norm": 0.1157551184296608,
"learning_rate": 4.332228391952469e-06,
"loss": 0.0009,
"step": 402
},
{
"epoch": 1.762043795620438,
"grad_norm": 0.18697626888751984,
"learning_rate": 4.328008717065228e-06,
"loss": 0.0031,
"step": 403
},
{
"epoch": 1.7664233576642334,
"grad_norm": 0.6587929129600525,
"learning_rate": 4.323777820978622e-06,
"loss": 0.0011,
"step": 404
},
{
"epoch": 1.770802919708029,
"grad_norm": 0.40322232246398926,
"learning_rate": 4.319535729663929e-06,
"loss": 0.0013,
"step": 405
},
{
"epoch": 1.7751824817518247,
"grad_norm": 0.33533793687820435,
"learning_rate": 4.315282469161156e-06,
"loss": 0.0008,
"step": 406
},
{
"epoch": 1.7795620437956203,
"grad_norm": 0.2024499624967575,
"learning_rate": 4.3110180655788645e-06,
"loss": 0.0022,
"step": 407
},
{
"epoch": 1.783941605839416,
"grad_norm": 0.5895872116088867,
"learning_rate": 4.306742545094022e-06,
"loss": 0.0019,
"step": 408
},
{
"epoch": 1.7883211678832116,
"grad_norm": 0.3792962431907654,
"learning_rate": 4.3024559339518355e-06,
"loss": 0.0017,
"step": 409
},
{
"epoch": 1.7927007299270072,
"grad_norm": 0.7945428490638733,
"learning_rate": 4.298158258465593e-06,
"loss": 0.0027,
"step": 410
},
{
"epoch": 1.7970802919708029,
"grad_norm": 0.37964075803756714,
"learning_rate": 4.2938495450164984e-06,
"loss": 0.0014,
"step": 411
},
{
"epoch": 1.8014598540145985,
"grad_norm": 0.08326616883277893,
"learning_rate": 4.289529820053515e-06,
"loss": 0.0005,
"step": 412
},
{
"epoch": 1.8058394160583942,
"grad_norm": 0.14445550739765167,
"learning_rate": 4.285199110093198e-06,
"loss": 0.0021,
"step": 413
},
{
"epoch": 1.8102189781021898,
"grad_norm": 0.24620558321475983,
"learning_rate": 4.280857441719533e-06,
"loss": 0.0007,
"step": 414
},
{
"epoch": 1.8145985401459854,
"grad_norm": 0.2617506980895996,
"learning_rate": 4.276504841583778e-06,
"loss": 0.0011,
"step": 415
},
{
"epoch": 1.818978102189781,
"grad_norm": 0.22467154264450073,
"learning_rate": 4.27214133640429e-06,
"loss": 0.0006,
"step": 416
},
{
"epoch": 1.8233576642335767,
"grad_norm": 0.25831958651542664,
"learning_rate": 4.267766952966369e-06,
"loss": 0.0029,
"step": 417
},
{
"epoch": 1.8277372262773723,
"grad_norm": 0.30368125438690186,
"learning_rate": 4.263381718122092e-06,
"loss": 0.0016,
"step": 418
},
{
"epoch": 1.832116788321168,
"grad_norm": 0.6697282195091248,
"learning_rate": 4.258985658790144e-06,
"loss": 0.0044,
"step": 419
},
{
"epoch": 1.8364963503649636,
"grad_norm": 0.4912242293357849,
"learning_rate": 4.25457880195566e-06,
"loss": 0.0014,
"step": 420
},
{
"epoch": 1.8408759124087593,
"grad_norm": 0.17477519810199738,
"learning_rate": 4.2501611746700526e-06,
"loss": 0.0002,
"step": 421
},
{
"epoch": 1.845255474452555,
"grad_norm": 0.09962823987007141,
"learning_rate": 4.245732804050848e-06,
"loss": 0.0009,
"step": 422
},
{
"epoch": 1.8496350364963505,
"grad_norm": 0.5256549119949341,
"learning_rate": 4.241293717281523e-06,
"loss": 0.0005,
"step": 423
},
{
"epoch": 1.854014598540146,
"grad_norm": 0.1596180498600006,
"learning_rate": 4.236843941611332e-06,
"loss": 0.001,
"step": 424
},
{
"epoch": 1.8583941605839416,
"grad_norm": 0.3437536656856537,
"learning_rate": 4.232383504355147e-06,
"loss": 0.002,
"step": 425
},
{
"epoch": 1.8627737226277372,
"grad_norm": 0.32742857933044434,
"learning_rate": 4.227912432893282e-06,
"loss": 0.0018,
"step": 426
},
{
"epoch": 1.8671532846715329,
"grad_norm": 0.5527262091636658,
"learning_rate": 4.223430754671331e-06,
"loss": 0.0004,
"step": 427
},
{
"epoch": 1.8715328467153285,
"grad_norm": 0.11191878467798233,
"learning_rate": 4.218938497199996e-06,
"loss": 0.0003,
"step": 428
},
{
"epoch": 1.8759124087591241,
"grad_norm": 0.09846347570419312,
"learning_rate": 4.214435688054922e-06,
"loss": 0.0004,
"step": 429
},
{
"epoch": 1.8802919708029195,
"grad_norm": 0.16991831362247467,
"learning_rate": 4.209922354876523e-06,
"loss": 0.0008,
"step": 430
},
{
"epoch": 1.8846715328467152,
"grad_norm": 0.126469686627388,
"learning_rate": 4.2053985253698155e-06,
"loss": 0.0004,
"step": 431
},
{
"epoch": 1.8890510948905108,
"grad_norm": 0.3232942521572113,
"learning_rate": 4.200864227304247e-06,
"loss": 0.0022,
"step": 432
},
{
"epoch": 1.8934306569343065,
"grad_norm": 0.3737439811229706,
"learning_rate": 4.196319488513528e-06,
"loss": 0.0017,
"step": 433
},
{
"epoch": 1.897810218978102,
"grad_norm": 0.14488628506660461,
"learning_rate": 4.191764336895455e-06,
"loss": 0.0002,
"step": 434
},
{
"epoch": 1.9021897810218977,
"grad_norm": 0.16040323674678802,
"learning_rate": 4.187198800411748e-06,
"loss": 0.0005,
"step": 435
},
{
"epoch": 1.9065693430656934,
"grad_norm": 0.19812235236167908,
"learning_rate": 4.182622907087872e-06,
"loss": 0.0002,
"step": 436
},
{
"epoch": 1.910948905109489,
"grad_norm": 0.059883181005716324,
"learning_rate": 4.178036685012869e-06,
"loss": 0.0005,
"step": 437
},
{
"epoch": 1.9153284671532846,
"grad_norm": 0.2905563414096832,
"learning_rate": 4.1734401623391794e-06,
"loss": 0.001,
"step": 438
},
{
"epoch": 1.9197080291970803,
"grad_norm": 0.38163650035858154,
"learning_rate": 4.168833367282479e-06,
"loss": 0.0007,
"step": 439
},
{
"epoch": 1.924087591240876,
"grad_norm": 0.04273957014083862,
"learning_rate": 4.164216328121499e-06,
"loss": 0.0001,
"step": 440
},
{
"epoch": 1.9284671532846716,
"grad_norm": 0.02129952795803547,
"learning_rate": 4.15958907319785e-06,
"loss": 0.0001,
"step": 441
},
{
"epoch": 1.9328467153284672,
"grad_norm": 0.016533153131604195,
"learning_rate": 4.154951630915859e-06,
"loss": 0.0001,
"step": 442
},
{
"epoch": 1.9372262773722628,
"grad_norm": 0.11019770801067352,
"learning_rate": 4.150304029742381e-06,
"loss": 0.0002,
"step": 443
},
{
"epoch": 1.9416058394160585,
"grad_norm": 0.05474651977419853,
"learning_rate": 4.145646298206636e-06,
"loss": 0.0002,
"step": 444
},
{
"epoch": 1.945985401459854,
"grad_norm": 0.10142989456653595,
"learning_rate": 4.1409784649000255e-06,
"loss": 0.0001,
"step": 445
},
{
"epoch": 1.9503649635036497,
"grad_norm": 0.21639519929885864,
"learning_rate": 4.136300558475962e-06,
"loss": 0.0019,
"step": 446
},
{
"epoch": 1.9547445255474454,
"grad_norm": 0.45263969898223877,
"learning_rate": 4.131612607649694e-06,
"loss": 0.0034,
"step": 447
},
{
"epoch": 1.959124087591241,
"grad_norm": 0.38673898577690125,
"learning_rate": 4.126914641198123e-06,
"loss": 0.0005,
"step": 448
},
{
"epoch": 1.9635036496350367,
"grad_norm": 0.29815611243247986,
"learning_rate": 4.1222066879596344e-06,
"loss": 0.0004,
"step": 449
},
{
"epoch": 1.967883211678832,
"grad_norm": 0.029003242030739784,
"learning_rate": 4.1174887768339165e-06,
"loss": 0.0002,
"step": 450
},
{
"epoch": 1.9722627737226277,
"grad_norm": 0.21172675490379333,
"learning_rate": 4.112760936781783e-06,
"loss": 0.0002,
"step": 451
},
{
"epoch": 1.9766423357664233,
"grad_norm": 0.27625802159309387,
"learning_rate": 4.108023196824998e-06,
"loss": 0.0038,
"step": 452
},
{
"epoch": 1.981021897810219,
"grad_norm": 0.26207876205444336,
"learning_rate": 4.103275586046095e-06,
"loss": 0.0002,
"step": 453
},
{
"epoch": 1.9854014598540146,
"grad_norm": 0.6478922367095947,
"learning_rate": 4.098518133588198e-06,
"loss": 0.0015,
"step": 454
},
{
"epoch": 1.9897810218978103,
"grad_norm": 0.10226385295391083,
"learning_rate": 4.093750868654845e-06,
"loss": 0.0005,
"step": 455
},
{
"epoch": 1.994160583941606,
"grad_norm": 0.18696191906929016,
"learning_rate": 4.088973820509811e-06,
"loss": 0.0004,
"step": 456
},
{
"epoch": 1.9985401459854013,
"grad_norm": 0.26377999782562256,
"learning_rate": 4.0841870184769184e-06,
"loss": 0.0006,
"step": 457
},
{
"epoch": 2.0,
"grad_norm": 0.26377999782562256,
"learning_rate": 4.079390491939868e-06,
"loss": 0.0002,
"step": 458
},
{
"epoch": 2.0043795620437956,
"grad_norm": 0.1395256221294403,
"learning_rate": 4.074584270342057e-06,
"loss": 0.0001,
"step": 459
},
{
"epoch": 2.0087591240875913,
"grad_norm": 0.028063921257853508,
"learning_rate": 4.069768383186388e-06,
"loss": 0.0001,
"step": 460
},
{
"epoch": 2.013138686131387,
"grad_norm": 0.12402255833148956,
"learning_rate": 4.064942860035102e-06,
"loss": 0.0008,
"step": 461
},
{
"epoch": 2.0175182481751825,
"grad_norm": 1.1348239183425903,
"learning_rate": 4.060107730509587e-06,
"loss": 0.0002,
"step": 462
},
{
"epoch": 2.021897810218978,
"grad_norm": 0.08540225028991699,
"learning_rate": 4.055263024290201e-06,
"loss": 0.0001,
"step": 463
},
{
"epoch": 2.026277372262774,
"grad_norm": 0.021235302090644836,
"learning_rate": 4.0504087711160875e-06,
"loss": 0.0006,
"step": 464
},
{
"epoch": 2.0306569343065695,
"grad_norm": 0.11977211385965347,
"learning_rate": 4.045545000784995e-06,
"loss": 0.0001,
"step": 465
},
{
"epoch": 2.035036496350365,
"grad_norm": 0.03654933720827103,
"learning_rate": 4.040671743153091e-06,
"loss": 0.0,
"step": 466
},
{
"epoch": 2.0394160583941607,
"grad_norm": 0.2099592536687851,
"learning_rate": 4.035789028134782e-06,
"loss": 0.0015,
"step": 467
},
{
"epoch": 2.0437956204379564,
"grad_norm": 0.3698459267616272,
"learning_rate": 4.03089688570253e-06,
"loss": 0.0005,
"step": 468
},
{
"epoch": 2.048175182481752,
"grad_norm": 0.12982334196567535,
"learning_rate": 4.025995345886663e-06,
"loss": 0.0005,
"step": 469
},
{
"epoch": 2.0525547445255476,
"grad_norm": 0.2783415615558624,
"learning_rate": 4.021084438775199e-06,
"loss": 0.0001,
"step": 470
},
{
"epoch": 2.0569343065693433,
"grad_norm": 0.0721098855137825,
"learning_rate": 4.016164194513654e-06,
"loss": 0.0001,
"step": 471
},
{
"epoch": 2.0613138686131385,
"grad_norm": 0.013497601263225079,
"learning_rate": 4.01123464330486e-06,
"loss": 0.0001,
"step": 472
},
{
"epoch": 2.065693430656934,
"grad_norm": 0.025111181661486626,
"learning_rate": 4.006295815408781e-06,
"loss": 0.0,
"step": 473
},
{
"epoch": 2.0700729927007298,
"grad_norm": 0.01752329058945179,
"learning_rate": 4.001347741142327e-06,
"loss": 0.0016,
"step": 474
},
{
"epoch": 2.0744525547445254,
"grad_norm": 0.5526646971702576,
"learning_rate": 3.996390450879163e-06,
"loss": 0.0002,
"step": 475
},
{
"epoch": 2.078832116788321,
"grad_norm": 0.3007132112979889,
"learning_rate": 3.9914239750495276e-06,
"loss": 0.0003,
"step": 476
},
{
"epoch": 2.0832116788321167,
"grad_norm": 0.42960840463638306,
"learning_rate": 3.986448344140047e-06,
"loss": 0.0011,
"step": 477
},
{
"epoch": 2.0875912408759123,
"grad_norm": 0.396423876285553,
"learning_rate": 3.9814635886935425e-06,
"loss": 0.0001,
"step": 478
},
{
"epoch": 2.091970802919708,
"grad_norm": 0.04294263571500778,
"learning_rate": 3.976469739308849e-06,
"loss": 0.0001,
"step": 479
},
{
"epoch": 2.0963503649635036,
"grad_norm": 0.04891116917133331,
"learning_rate": 3.971466826640623e-06,
"loss": 0.0,
"step": 480
},
{
"epoch": 2.100729927007299,
"grad_norm": 0.028048237785696983,
"learning_rate": 3.966454881399155e-06,
"loss": 0.0002,
"step": 481
},
{
"epoch": 2.105109489051095,
"grad_norm": 0.08178327232599258,
"learning_rate": 3.961433934350183e-06,
"loss": 0.0001,
"step": 482
},
{
"epoch": 2.1094890510948905,
"grad_norm": 0.1565946638584137,
"learning_rate": 3.956404016314703e-06,
"loss": 0.0,
"step": 483
},
{
"epoch": 2.113868613138686,
"grad_norm": 0.017099319025874138,
"learning_rate": 3.951365158168778e-06,
"loss": 0.0,
"step": 484
},
{
"epoch": 2.1182481751824818,
"grad_norm": 0.013942725956439972,
"learning_rate": 3.9463173908433505e-06,
"loss": 0.0001,
"step": 485
},
{
"epoch": 2.1226277372262774,
"grad_norm": 0.031127380207180977,
"learning_rate": 3.94126074532405e-06,
"loss": 0.0001,
"step": 486
},
{
"epoch": 2.127007299270073,
"grad_norm": 0.0673493817448616,
"learning_rate": 3.936195252651008e-06,
"loss": 0.0,
"step": 487
},
{
"epoch": 2.1313868613138687,
"grad_norm": 0.009533442556858063,
"learning_rate": 3.931120943918661e-06,
"loss": 0.0001,
"step": 488
},
{
"epoch": 2.1357664233576643,
"grad_norm": 0.2884984016418457,
"learning_rate": 3.9260378502755644e-06,
"loss": 0.0,
"step": 489
},
{
"epoch": 2.14014598540146,
"grad_norm": 0.012546413578093052,
"learning_rate": 3.9209460029242e-06,
"loss": 0.0002,
"step": 490
},
{
"epoch": 2.1445255474452556,
"grad_norm": 0.06351834535598755,
"learning_rate": 3.915845433120781e-06,
"loss": 0.0001,
"step": 491
},
{
"epoch": 2.1489051094890512,
"grad_norm": 0.06691469997167587,
"learning_rate": 3.910736172175066e-06,
"loss": 0.0002,
"step": 492
},
{
"epoch": 2.153284671532847,
"grad_norm": 0.13160394132137299,
"learning_rate": 3.905618251450165e-06,
"loss": 0.0002,
"step": 493
},
{
"epoch": 2.1576642335766425,
"grad_norm": 0.13259641826152802,
"learning_rate": 3.900491702362344e-06,
"loss": 0.0001,
"step": 494
},
{
"epoch": 2.162043795620438,
"grad_norm": 0.0303142461925745,
"learning_rate": 3.895356556380833e-06,
"loss": 0.0,
"step": 495
},
{
"epoch": 2.1664233576642338,
"grad_norm": 0.013111322186887264,
"learning_rate": 3.890212845027637e-06,
"loss": 0.0001,
"step": 496
},
{
"epoch": 2.170802919708029,
"grad_norm": 0.10938696563243866,
"learning_rate": 3.8850605998773374e-06,
"loss": 0.0002,
"step": 497
},
{
"epoch": 2.1751824817518246,
"grad_norm": 0.032018087804317474,
"learning_rate": 3.8798998525568985e-06,
"loss": 0.0001,
"step": 498
},
{
"epoch": 2.1795620437956202,
"grad_norm": 0.0468863919377327,
"learning_rate": 3.87473063474548e-06,
"loss": 0.0001,
"step": 499
},
{
"epoch": 2.183941605839416,
"grad_norm": 0.07472304254770279,
"learning_rate": 3.869552978174233e-06,
"loss": 0.0013,
"step": 500
},
{
"epoch": 2.1883211678832115,
"grad_norm": 1.0564846992492676,
"learning_rate": 3.8643669146261105e-06,
"loss": 0.0,
"step": 501
},
{
"epoch": 2.192700729927007,
"grad_norm": 0.014464089646935463,
"learning_rate": 3.859172475935674e-06,
"loss": 0.0003,
"step": 502
},
{
"epoch": 2.197080291970803,
"grad_norm": 0.18908506631851196,
"learning_rate": 3.853969693988892e-06,
"loss": 0.0,
"step": 503
},
{
"epoch": 2.2014598540145984,
"grad_norm": 0.059129055589437485,
"learning_rate": 3.848758600722953e-06,
"loss": 0.0001,
"step": 504
},
{
"epoch": 2.205839416058394,
"grad_norm": 0.008117246441543102,
"learning_rate": 3.843539228126059e-06,
"loss": 0.0017,
"step": 505
},
{
"epoch": 2.2102189781021897,
"grad_norm": 0.8072256445884705,
"learning_rate": 3.838311608237239e-06,
"loss": 0.0036,
"step": 506
},
{
"epoch": 2.2145985401459853,
"grad_norm": 1.5086314678192139,
"learning_rate": 3.833075773146142e-06,
"loss": 0.0001,
"step": 507
},
{
"epoch": 2.218978102189781,
"grad_norm": 0.19763076305389404,
"learning_rate": 3.827831754992854e-06,
"loss": 0.0,
"step": 508
},
{
"epoch": 2.2233576642335766,
"grad_norm": 0.007341600954532623,
"learning_rate": 3.822579585967685e-06,
"loss": 0.0,
"step": 509
},
{
"epoch": 2.2277372262773723,
"grad_norm": 0.014308220706880093,
"learning_rate": 3.817319298310984e-06,
"loss": 0.0001,
"step": 510
},
{
"epoch": 2.232116788321168,
"grad_norm": 0.11574184894561768,
"learning_rate": 3.812050924312934e-06,
"loss": 0.0001,
"step": 511
},
{
"epoch": 2.2364963503649635,
"grad_norm": 0.04201903194189072,
"learning_rate": 3.8067744963133555e-06,
"loss": 0.0004,
"step": 512
},
{
"epoch": 2.240875912408759,
"grad_norm": 0.2121448516845703,
"learning_rate": 3.8014900467015093e-06,
"loss": 0.0008,
"step": 513
},
{
"epoch": 2.245255474452555,
"grad_norm": 0.41363075375556946,
"learning_rate": 3.7961976079158964e-06,
"loss": 0.0014,
"step": 514
},
{
"epoch": 2.2496350364963504,
"grad_norm": 0.0038762835320085287,
"learning_rate": 3.79089721244406e-06,
"loss": 0.0001,
"step": 515
},
{
"epoch": 2.254014598540146,
"grad_norm": 0.018642032518982887,
"learning_rate": 3.785588892822383e-06,
"loss": 0.0003,
"step": 516
},
{
"epoch": 2.2583941605839417,
"grad_norm": 0.12630635499954224,
"learning_rate": 3.780272681635894e-06,
"loss": 0.0,
"step": 517
},
{
"epoch": 2.2627737226277373,
"grad_norm": 0.34310439229011536,
"learning_rate": 3.77494861151806e-06,
"loss": 0.0013,
"step": 518
},
{
"epoch": 2.267153284671533,
"grad_norm": 0.044213853776454926,
"learning_rate": 3.769616715150593e-06,
"loss": 0.0005,
"step": 519
},
{
"epoch": 2.2715328467153286,
"grad_norm": 0.17094682157039642,
"learning_rate": 3.7642770252632444e-06,
"loss": 0.0001,
"step": 520
},
{
"epoch": 2.2759124087591243,
"grad_norm": 0.0257079117000103,
"learning_rate": 3.7589295746336074e-06,
"loss": 0.0,
"step": 521
},
{
"epoch": 2.28029197080292,
"grad_norm": 0.01195420790463686,
"learning_rate": 3.753574396086913e-06,
"loss": 0.0001,
"step": 522
},
{
"epoch": 2.2846715328467155,
"grad_norm": 0.426551878452301,
"learning_rate": 3.748211522495831e-06,
"loss": 0.001,
"step": 523
},
{
"epoch": 2.289051094890511,
"grad_norm": 0.02542410045862198,
"learning_rate": 3.742840986780266e-06,
"loss": 0.0002,
"step": 524
},
{
"epoch": 2.293430656934307,
"grad_norm": 0.2616007924079895,
"learning_rate": 3.737462821907158e-06,
"loss": 0.002,
"step": 525
},
{
"epoch": 2.297810218978102,
"grad_norm": 0.1279967725276947,
"learning_rate": 3.732077060890277e-06,
"loss": 0.0004,
"step": 526
},
{
"epoch": 2.3021897810218976,
"grad_norm": 0.21853570640087128,
"learning_rate": 3.7266837367900214e-06,
"loss": 0.0011,
"step": 527
},
{
"epoch": 2.3065693430656933,
"grad_norm": 0.21419601142406464,
"learning_rate": 3.721282882713218e-06,
"loss": 0.0005,
"step": 528
},
{
"epoch": 2.310948905109489,
"grad_norm": 0.2496281862258911,
"learning_rate": 3.7158745318129135e-06,
"loss": 0.0003,
"step": 529
},
{
"epoch": 2.3153284671532846,
"grad_norm": 0.10131111741065979,
"learning_rate": 3.710458717288176e-06,
"loss": 0.0003,
"step": 530
},
{
"epoch": 2.31970802919708,
"grad_norm": 0.023186631500720978,
"learning_rate": 3.7050354723838855e-06,
"loss": 0.0005,
"step": 531
},
{
"epoch": 2.324087591240876,
"grad_norm": 0.3965378403663635,
"learning_rate": 3.6996048303905373e-06,
"loss": 0.0003,
"step": 532
},
{
"epoch": 2.3284671532846715,
"grad_norm": 0.03234691172838211,
"learning_rate": 3.6941668246440323e-06,
"loss": 0.0002,
"step": 533
},
{
"epoch": 2.332846715328467,
"grad_norm": 0.14468444883823395,
"learning_rate": 3.688721488525471e-06,
"loss": 0.0003,
"step": 534
},
{
"epoch": 2.3372262773722627,
"grad_norm": 0.09114562720060349,
"learning_rate": 3.683268855460955e-06,
"loss": 0.0,
"step": 535
},
{
"epoch": 2.3416058394160584,
"grad_norm": 0.011452419683337212,
"learning_rate": 3.6778089589213756e-06,
"loss": 0.0,
"step": 536
},
{
"epoch": 2.345985401459854,
"grad_norm": 0.026728764176368713,
"learning_rate": 3.6723418324222126e-06,
"loss": 0.0,
"step": 537
},
{
"epoch": 2.3503649635036497,
"grad_norm": 0.010162679478526115,
"learning_rate": 3.666867509523325e-06,
"loss": 0.0003,
"step": 538
},
{
"epoch": 2.3547445255474453,
"grad_norm": 0.2717076241970062,
"learning_rate": 3.661386023828749e-06,
"loss": 0.0002,
"step": 539
},
{
"epoch": 2.359124087591241,
"grad_norm": 0.17123937606811523,
"learning_rate": 3.6558974089864875e-06,
"loss": 0.0,
"step": 540
},
{
"epoch": 2.3635036496350366,
"grad_norm": 0.05525152385234833,
"learning_rate": 3.650401698688305e-06,
"loss": 0.0001,
"step": 541
},
{
"epoch": 2.367883211678832,
"grad_norm": 0.0844765231013298,
"learning_rate": 3.644898926669524e-06,
"loss": 0.0001,
"step": 542
},
{
"epoch": 2.372262773722628,
"grad_norm": 0.05303291603922844,
"learning_rate": 3.6393891267088132e-06,
"loss": 0.0,
"step": 543
},
{
"epoch": 2.3766423357664235,
"grad_norm": 0.02489842288196087,
"learning_rate": 3.633872332627983e-06,
"loss": 0.0014,
"step": 544
},
{
"epoch": 2.381021897810219,
"grad_norm": 0.4054766893386841,
"learning_rate": 3.628348578291776e-06,
"loss": 0.0,
"step": 545
},
{
"epoch": 2.3854014598540147,
"grad_norm": 0.2564426064491272,
"learning_rate": 3.6228178976076626e-06,
"loss": 0.0005,
"step": 546
},
{
"epoch": 2.3897810218978104,
"grad_norm": 0.4229254722595215,
"learning_rate": 3.6172803245256283e-06,
"loss": 0.0001,
"step": 547
},
{
"epoch": 2.394160583941606,
"grad_norm": 0.05393270403146744,
"learning_rate": 3.611735893037967e-06,
"loss": 0.0,
"step": 548
},
{
"epoch": 2.398540145985401,
"grad_norm": 0.004804346710443497,
"learning_rate": 3.6061846371790754e-06,
"loss": 0.0001,
"step": 549
},
{
"epoch": 2.402919708029197,
"grad_norm": 0.2905990481376648,
"learning_rate": 3.6006265910252393e-06,
"loss": 0.0,
"step": 550
},
{
"epoch": 2.4072992700729925,
"grad_norm": 0.05537007004022598,
"learning_rate": 3.5950617886944272e-06,
"loss": 0.0001,
"step": 551
},
{
"epoch": 2.411678832116788,
"grad_norm": 0.006207960192114115,
"learning_rate": 3.5894902643460807e-06,
"loss": 0.0,
"step": 552
},
{
"epoch": 2.4160583941605838,
"grad_norm": 0.14528508484363556,
"learning_rate": 3.5839120521809036e-06,
"loss": 0.0003,
"step": 553
},
{
"epoch": 2.4204379562043794,
"grad_norm": 0.007411749102175236,
"learning_rate": 3.578327186440654e-06,
"loss": 0.0002,
"step": 554
},
{
"epoch": 2.424817518248175,
"grad_norm": 0.1338014453649521,
"learning_rate": 3.5727357014079306e-06,
"loss": 0.0001,
"step": 555
},
{
"epoch": 2.4291970802919707,
"grad_norm": 0.0037137684412300587,
"learning_rate": 3.5671376314059676e-06,
"loss": 0.0,
"step": 556
},
{
"epoch": 2.4335766423357663,
"grad_norm": 0.26890304684638977,
"learning_rate": 3.561533010798418e-06,
"loss": 0.0004,
"step": 557
},
{
"epoch": 2.437956204379562,
"grad_norm": 0.10123980045318604,
"learning_rate": 3.555921873989148e-06,
"loss": 0.0,
"step": 558
},
{
"epoch": 2.4423357664233576,
"grad_norm": 0.033905990421772,
"learning_rate": 3.5503042554220206e-06,
"loss": 0.0001,
"step": 559
},
{
"epoch": 2.4467153284671532,
"grad_norm": 0.2706269323825836,
"learning_rate": 3.5446801895806904e-06,
"loss": 0.0003,
"step": 560
},
{
"epoch": 2.451094890510949,
"grad_norm": 0.017258645966649055,
"learning_rate": 3.539049710988386e-06,
"loss": 0.0,
"step": 561
},
{
"epoch": 2.4554744525547445,
"grad_norm": 0.010611895471811295,
"learning_rate": 3.5334128542077007e-06,
"loss": 0.0,
"step": 562
},
{
"epoch": 2.45985401459854,
"grad_norm": 0.0026555017102509737,
"learning_rate": 3.527769653840381e-06,
"loss": 0.0,
"step": 563
},
{
"epoch": 2.4642335766423358,
"grad_norm": 0.15091587603092194,
"learning_rate": 3.5221201445271136e-06,
"loss": 0.0001,
"step": 564
},
{
"epoch": 2.4686131386861314,
"grad_norm": 0.05164916068315506,
"learning_rate": 3.5164643609473115e-06,
"loss": 0.0004,
"step": 565
},
{
"epoch": 2.472992700729927,
"grad_norm": 0.48653078079223633,
"learning_rate": 3.5108023378189036e-06,
"loss": 0.0,
"step": 566
},
{
"epoch": 2.4773722627737227,
"grad_norm": 0.015059034340083599,
"learning_rate": 3.5051341098981184e-06,
"loss": 0.0,
"step": 567
},
{
"epoch": 2.4817518248175183,
"grad_norm": 0.013922065496444702,
"learning_rate": 3.499459711979274e-06,
"loss": 0.0,
"step": 568
},
{
"epoch": 2.486131386861314,
"grad_norm": 0.2496362030506134,
"learning_rate": 3.493779178894561e-06,
"loss": 0.0004,
"step": 569
},
{
"epoch": 2.4905109489051096,
"grad_norm": 0.07800492644309998,
"learning_rate": 3.488092545513833e-06,
"loss": 0.0,
"step": 570
},
{
"epoch": 2.4948905109489052,
"grad_norm": 0.613246738910675,
"learning_rate": 3.4823998467443886e-06,
"loss": 0.001,
"step": 571
},
{
"epoch": 2.499270072992701,
"grad_norm": 0.5939133167266846,
"learning_rate": 3.4767011175307596e-06,
"loss": 0.0005,
"step": 572
},
{
"epoch": 2.5036496350364965,
"grad_norm": 0.02257866971194744,
"learning_rate": 3.4709963928544952e-06,
"loss": 0.0001,
"step": 573
},
{
"epoch": 2.508029197080292,
"grad_norm": 0.31688934564590454,
"learning_rate": 3.4652857077339464e-06,
"loss": 0.0016,
"step": 574
},
{
"epoch": 2.512408759124088,
"grad_norm": 0.24091926217079163,
"learning_rate": 3.459569097224054e-06,
"loss": 0.0,
"step": 575
},
{
"epoch": 2.5167883211678834,
"grad_norm": 0.0038259460125118494,
"learning_rate": 3.4538465964161315e-06,
"loss": 0.0009,
"step": 576
},
{
"epoch": 2.521167883211679,
"grad_norm": 0.8010972738265991,
"learning_rate": 3.448118240437649e-06,
"loss": 0.0,
"step": 577
},
{
"epoch": 2.5255474452554747,
"grad_norm": 0.10152903199195862,
"learning_rate": 3.442384064452019e-06,
"loss": 0.0,
"step": 578
},
{
"epoch": 2.5299270072992703,
"grad_norm": 0.0037650642916560173,
"learning_rate": 3.4366441036583803e-06,
"loss": 0.0,
"step": 579
},
{
"epoch": 2.5343065693430655,
"grad_norm": 0.002322736894711852,
"learning_rate": 3.4308983932913806e-06,
"loss": 0.0,
"step": 580
},
{
"epoch": 2.538686131386861,
"grad_norm": 0.010998697020113468,
"learning_rate": 3.4251469686209626e-06,
"loss": 0.0,
"step": 581
},
{
"epoch": 2.543065693430657,
"grad_norm": 0.009687647223472595,
"learning_rate": 3.419389864952145e-06,
"loss": 0.0,
"step": 582
},
{
"epoch": 2.5474452554744524,
"grad_norm": 0.003304305486381054,
"learning_rate": 3.413627117624808e-06,
"loss": 0.0001,
"step": 583
},
{
"epoch": 2.551824817518248,
"grad_norm": 0.04525240138173103,
"learning_rate": 3.4078587620134747e-06,
"loss": 0.0,
"step": 584
},
{
"epoch": 2.5562043795620437,
"grad_norm": 0.14142946898937225,
"learning_rate": 3.4020848335270946e-06,
"loss": 0.0013,
"step": 585
},
{
"epoch": 2.5605839416058394,
"grad_norm": 0.18268825113773346,
"learning_rate": 3.3963053676088253e-06,
"loss": 0.0,
"step": 586
},
{
"epoch": 2.564963503649635,
"grad_norm": 0.07787997275590897,
"learning_rate": 3.390520399735818e-06,
"loss": 0.0008,
"step": 587
},
{
"epoch": 2.5693430656934306,
"grad_norm": 0.6478791832923889,
"learning_rate": 3.3847299654189947e-06,
"loss": 0.0,
"step": 588
},
{
"epoch": 2.5737226277372263,
"grad_norm": 0.029990263283252716,
"learning_rate": 3.3789341002028364e-06,
"loss": 0.0,
"step": 589
},
{
"epoch": 2.578102189781022,
"grad_norm": 0.029525484889745712,
"learning_rate": 3.3731328396651586e-06,
"loss": 0.0,
"step": 590
},
{
"epoch": 2.5824817518248175,
"grad_norm": 0.008475772105157375,
"learning_rate": 3.3673262194168976e-06,
"loss": 0.0,
"step": 591
},
{
"epoch": 2.586861313868613,
"grad_norm": 0.013320226222276688,
"learning_rate": 3.3615142751018893e-06,
"loss": 0.0,
"step": 592
},
{
"epoch": 2.591240875912409,
"grad_norm": 0.009780370630323887,
"learning_rate": 3.3556970423966515e-06,
"loss": 0.0,
"step": 593
},
{
"epoch": 2.5956204379562045,
"grad_norm": 0.015174774453043938,
"learning_rate": 3.349874557010166e-06,
"loss": 0.0,
"step": 594
},
{
"epoch": 2.6,
"grad_norm": 0.011269648559391499,
"learning_rate": 3.3440468546836564e-06,
"loss": 0.0,
"step": 595
},
{
"epoch": 2.6043795620437957,
"grad_norm": 0.014948060736060143,
"learning_rate": 3.3382139711903707e-06,
"loss": 0.0,
"step": 596
},
{
"epoch": 2.6087591240875914,
"grad_norm": 0.005212848540395498,
"learning_rate": 3.3323759423353618e-06,
"loss": 0.0001,
"step": 597
},
{
"epoch": 2.613138686131387,
"grad_norm": 0.3330608308315277,
"learning_rate": 3.3265328039552676e-06,
"loss": 0.0013,
"step": 598
},
{
"epoch": 2.6175182481751826,
"grad_norm": 0.01612473465502262,
"learning_rate": 3.320684591918089e-06,
"loss": 0.002,
"step": 599
},
{
"epoch": 2.621897810218978,
"grad_norm": 1.6749413013458252,
"learning_rate": 3.3148313421229743e-06,
"loss": 0.0,
"step": 600
},
{
"epoch": 2.6262773722627735,
"grad_norm": 0.007137598004192114,
"learning_rate": 3.308973090499994e-06,
"loss": 0.0003,
"step": 601
},
{
"epoch": 2.630656934306569,
"grad_norm": 0.2040940374135971,
"learning_rate": 3.303109873009922e-06,
"loss": 0.0,
"step": 602
},
{
"epoch": 2.6350364963503647,
"grad_norm": 0.016712144017219543,
"learning_rate": 3.297241725644016e-06,
"loss": 0.0,
"step": 603
},
{
"epoch": 2.6394160583941604,
"grad_norm": 0.05457067862153053,
"learning_rate": 3.2913686844237963e-06,
"loss": 0.0002,
"step": 604
},
{
"epoch": 2.643795620437956,
"grad_norm": 0.05992881581187248,
"learning_rate": 3.2854907854008224e-06,
"loss": 0.0018,
"step": 605
},
{
"epoch": 2.6481751824817517,
"grad_norm": 0.2480854094028473,
"learning_rate": 3.2796080646564738e-06,
"loss": 0.0,
"step": 606
},
{
"epoch": 2.6525547445255473,
"grad_norm": 0.009658033028244972,
"learning_rate": 3.273720558301729e-06,
"loss": 0.0003,
"step": 607
},
{
"epoch": 2.656934306569343,
"grad_norm": 0.06701422482728958,
"learning_rate": 3.267828302476942e-06,
"loss": 0.0001,
"step": 608
},
{
"epoch": 2.6613138686131386,
"grad_norm": 0.04225367307662964,
"learning_rate": 3.2619313333516213e-06,
"loss": 0.0001,
"step": 609
},
{
"epoch": 2.665693430656934,
"grad_norm": 0.01937365159392357,
"learning_rate": 3.2560296871242085e-06,
"loss": 0.0,
"step": 610
},
{
"epoch": 2.67007299270073,
"grad_norm": 0.010443875566124916,
"learning_rate": 3.2501234000218558e-06,
"loss": 0.0001,
"step": 611
},
{
"epoch": 2.6744525547445255,
"grad_norm": 0.039485372602939606,
"learning_rate": 3.2442125083002014e-06,
"loss": 0.0001,
"step": 612
},
{
"epoch": 2.678832116788321,
"grad_norm": 0.01544400304555893,
"learning_rate": 3.238297048243151e-06,
"loss": 0.0,
"step": 613
},
{
"epoch": 2.6832116788321168,
"grad_norm": 0.014269383624196053,
"learning_rate": 3.2323770561626523e-06,
"loss": 0.0,
"step": 614
},
{
"epoch": 2.6875912408759124,
"grad_norm": 0.00998217985033989,
"learning_rate": 3.2264525683984717e-06,
"loss": 0.0001,
"step": 615
},
{
"epoch": 2.691970802919708,
"grad_norm": 0.0631580650806427,
"learning_rate": 3.2205236213179736e-06,
"loss": 0.0001,
"step": 616
},
{
"epoch": 2.6963503649635037,
"grad_norm": 0.022673817351460457,
"learning_rate": 3.2145902513158963e-06,
"loss": 0.0034,
"step": 617
},
{
"epoch": 2.7007299270072993,
"grad_norm": 0.24634726345539093,
"learning_rate": 3.2086524948141263e-06,
"loss": 0.0,
"step": 618
},
{
"epoch": 2.705109489051095,
"grad_norm": 0.018728157505393028,
"learning_rate": 3.2027103882614772e-06,
"loss": 0.0001,
"step": 619
},
{
"epoch": 2.7094890510948906,
"grad_norm": 1.7468042373657227,
"learning_rate": 3.1967639681334668e-06,
"loss": 0.0002,
"step": 620
},
{
"epoch": 2.713868613138686,
"grad_norm": 0.014051638543605804,
"learning_rate": 3.1908132709320895e-06,
"loss": 0.0003,
"step": 621
},
{
"epoch": 2.718248175182482,
"grad_norm": 0.1467590481042862,
"learning_rate": 3.1848583331855952e-06,
"loss": 0.0,
"step": 622
},
{
"epoch": 2.7226277372262775,
"grad_norm": 0.0301121287047863,
"learning_rate": 3.178899191448266e-06,
"loss": 0.0001,
"step": 623
},
{
"epoch": 2.727007299270073,
"grad_norm": 0.011421700939536095,
"learning_rate": 3.1729358823001873e-06,
"loss": 0.0,
"step": 624
},
{
"epoch": 2.7313868613138688,
"grad_norm": 0.018829964101314545,
"learning_rate": 3.1669684423470277e-06,
"loss": 0.0001,
"step": 625
},
{
"epoch": 2.7357664233576644,
"grad_norm": 0.2821716070175171,
"learning_rate": 3.1609969082198124e-06,
"loss": 0.0001,
"step": 626
},
{
"epoch": 2.74014598540146,
"grad_norm": 0.03946685045957565,
"learning_rate": 3.155021316574699e-06,
"loss": 0.0001,
"step": 627
},
{
"epoch": 2.7445255474452557,
"grad_norm": 0.03428944945335388,
"learning_rate": 3.1490417040927513e-06,
"loss": 0.0001,
"step": 628
},
{
"epoch": 2.7489051094890513,
"grad_norm": 0.0415036678314209,
"learning_rate": 3.143058107479716e-06,
"loss": 0.0001,
"step": 629
},
{
"epoch": 2.753284671532847,
"grad_norm": 0.011763577349483967,
"learning_rate": 3.1370705634657953e-06,
"loss": 0.0005,
"step": 630
},
{
"epoch": 2.7576642335766426,
"grad_norm": 0.2982889413833618,
"learning_rate": 3.1310791088054225e-06,
"loss": 0.0001,
"step": 631
},
{
"epoch": 2.7620437956204382,
"grad_norm": 0.07167187333106995,
"learning_rate": 3.1250837802770378e-06,
"loss": 0.0001,
"step": 632
},
{
"epoch": 2.7664233576642334,
"grad_norm": 0.009533251635730267,
"learning_rate": 3.1190846146828587e-06,
"loss": 0.0,
"step": 633
},
{
"epoch": 2.770802919708029,
"grad_norm": 0.09512810409069061,
"learning_rate": 3.1130816488486582e-06,
"loss": 0.0003,
"step": 634
},
{
"epoch": 2.7751824817518247,
"grad_norm": 0.024417169392108917,
"learning_rate": 3.1070749196235366e-06,
"loss": 0.0001,
"step": 635
},
{
"epoch": 2.7795620437956203,
"grad_norm": 0.026550231501460075,
"learning_rate": 3.1010644638796956e-06,
"loss": 0.0002,
"step": 636
},
{
"epoch": 2.783941605839416,
"grad_norm": 0.04400951415300369,
"learning_rate": 3.0950503185122116e-06,
"loss": 0.0001,
"step": 637
},
{
"epoch": 2.7883211678832116,
"grad_norm": 0.037575673311948776,
"learning_rate": 3.0890325204388107e-06,
"loss": 0.0001,
"step": 638
},
{
"epoch": 2.7927007299270072,
"grad_norm": 0.017704375088214874,
"learning_rate": 3.083011106599641e-06,
"loss": 0.0,
"step": 639
},
{
"epoch": 2.797080291970803,
"grad_norm": 0.5097067356109619,
"learning_rate": 3.0769861139570446e-06,
"loss": 0.0002,
"step": 640
},
{
"epoch": 2.8014598540145985,
"grad_norm": 0.03586648404598236,
"learning_rate": 3.0709575794953333e-06,
"loss": 0.0001,
"step": 641
},
{
"epoch": 2.805839416058394,
"grad_norm": 0.08737017959356308,
"learning_rate": 3.06492554022056e-06,
"loss": 0.0001,
"step": 642
},
{
"epoch": 2.81021897810219,
"grad_norm": 0.03558499738574028,
"learning_rate": 3.0588900331602915e-06,
"loss": 0.0,
"step": 643
},
{
"epoch": 2.8145985401459854,
"grad_norm": 0.015121969394385815,
"learning_rate": 3.0528510953633824e-06,
"loss": 0.0,
"step": 644
},
{
"epoch": 2.818978102189781,
"grad_norm": 0.005024883430451155,
"learning_rate": 3.046808763899745e-06,
"loss": 0.0,
"step": 645
},
{
"epoch": 2.8233576642335767,
"grad_norm": 0.10004691034555435,
"learning_rate": 3.0407630758601257e-06,
"loss": 0.0002,
"step": 646
},
{
"epoch": 2.8277372262773723,
"grad_norm": 0.0034653018228709698,
"learning_rate": 3.034714068355874e-06,
"loss": 0.0,
"step": 647
},
{
"epoch": 2.832116788321168,
"grad_norm": 0.005470994859933853,
"learning_rate": 3.0286617785187157e-06,
"loss": 0.0,
"step": 648
},
{
"epoch": 2.8364963503649636,
"grad_norm": 0.004967759363353252,
"learning_rate": 3.022606243500526e-06,
"loss": 0.0,
"step": 649
},
{
"epoch": 2.8408759124087593,
"grad_norm": 0.005472187884151936,
"learning_rate": 3.0165475004730994e-06,
"loss": 0.0,
"step": 650
},
{
"epoch": 2.845255474452555,
"grad_norm": 0.02719656005501747,
"learning_rate": 3.0104855866279244e-06,
"loss": 0.0,
"step": 651
},
{
"epoch": 2.8496350364963505,
"grad_norm": 0.006005981471389532,
"learning_rate": 3.0044205391759517e-06,
"loss": 0.0001,
"step": 652
},
{
"epoch": 2.8540145985401457,
"grad_norm": 0.07548055797815323,
"learning_rate": 2.9983523953473697e-06,
"loss": 0.0001,
"step": 653
},
{
"epoch": 2.8583941605839414,
"grad_norm": 0.022410407662391663,
"learning_rate": 2.9922811923913712e-06,
"loss": 0.0,
"step": 654
},
{
"epoch": 2.862773722627737,
"grad_norm": 0.0035277260467410088,
"learning_rate": 2.9862069675759297e-06,
"loss": 0.0,
"step": 655
},
{
"epoch": 2.8671532846715326,
"grad_norm": 0.00670978520065546,
"learning_rate": 2.980129758187567e-06,
"loss": 0.0,
"step": 656
},
{
"epoch": 2.8715328467153283,
"grad_norm": 0.0031098946928977966,
"learning_rate": 2.974049601531126e-06,
"loss": 0.0,
"step": 657
},
{
"epoch": 2.875912408759124,
"grad_norm": 0.013540991581976414,
"learning_rate": 2.9679665349295417e-06,
"loss": 0.0,
"step": 658
},
{
"epoch": 2.8802919708029195,
"grad_norm": 0.015702638775110245,
"learning_rate": 2.9618805957236113e-06,
"loss": 0.0,
"step": 659
},
{
"epoch": 2.884671532846715,
"grad_norm": 0.03669784590601921,
"learning_rate": 2.955791821271766e-06,
"loss": 0.0,
"step": 660
},
{
"epoch": 2.889051094890511,
"grad_norm": 0.011530515737831593,
"learning_rate": 2.9497002489498394e-06,
"loss": 0.0,
"step": 661
},
{
"epoch": 2.8934306569343065,
"grad_norm": 0.004896043334156275,
"learning_rate": 2.9436059161508425e-06,
"loss": 0.0,
"step": 662
},
{
"epoch": 2.897810218978102,
"grad_norm": 0.006100150756537914,
"learning_rate": 2.9375088602847303e-06,
"loss": 0.0,
"step": 663
},
{
"epoch": 2.9021897810218977,
"grad_norm": 0.0026638389099389315,
"learning_rate": 2.931409118778172e-06,
"loss": 0.0,
"step": 664
},
{
"epoch": 2.9065693430656934,
"grad_norm": 0.03918790817260742,
"learning_rate": 2.9253067290743237e-06,
"loss": 0.0001,
"step": 665
},
{
"epoch": 2.910948905109489,
"grad_norm": 0.7945866584777832,
"learning_rate": 2.9192017286325975e-06,
"loss": 0.0,
"step": 666
},
{
"epoch": 2.9153284671532846,
"grad_norm": 0.004381685517728329,
"learning_rate": 2.913094154928431e-06,
"loss": 0.0,
"step": 667
},
{
"epoch": 2.9197080291970803,
"grad_norm": 0.006551735103130341,
"learning_rate": 2.9069840454530583e-06,
"loss": 0.0,
"step": 668
},
{
"epoch": 2.924087591240876,
"grad_norm": 0.003194763557985425,
"learning_rate": 2.900871437713279e-06,
"loss": 0.0002,
"step": 669
},
{
"epoch": 2.9284671532846716,
"grad_norm": 0.05487615615129471,
"learning_rate": 2.894756369231228e-06,
"loss": 0.0,
"step": 670
},
{
"epoch": 2.932846715328467,
"grad_norm": 0.008219748735427856,
"learning_rate": 2.888638877544146e-06,
"loss": 0.0,
"step": 671
},
{
"epoch": 2.937226277372263,
"grad_norm": 0.0017953907372429967,
"learning_rate": 2.8825190002041475e-06,
"loss": 0.0,
"step": 672
},
{
"epoch": 2.9416058394160585,
"grad_norm": 0.00900351069867611,
"learning_rate": 2.8763967747779926e-06,
"loss": 0.0,
"step": 673
},
{
"epoch": 2.945985401459854,
"grad_norm": 0.0021545395720750093,
"learning_rate": 2.8702722388468544e-06,
"loss": 0.0,
"step": 674
},
{
"epoch": 2.9503649635036497,
"grad_norm": 0.01696695201098919,
"learning_rate": 2.864145430006089e-06,
"loss": 0.0,
"step": 675
},
{
"epoch": 2.9547445255474454,
"grad_norm": 0.018733413890004158,
"learning_rate": 2.858016385865004e-06,
"loss": 0.0001,
"step": 676
},
{
"epoch": 2.959124087591241,
"grad_norm": 0.0550026074051857,
"learning_rate": 2.85188514404663e-06,
"loss": 0.0,
"step": 677
},
{
"epoch": 2.9635036496350367,
"grad_norm": 0.00857064314186573,
"learning_rate": 2.845751742187487e-06,
"loss": 0.0,
"step": 678
},
{
"epoch": 2.9678832116788323,
"grad_norm": 0.008322341367602348,
"learning_rate": 2.839616217937354e-06,
"loss": 0.0,
"step": 679
},
{
"epoch": 2.972262773722628,
"grad_norm": 0.001857755589298904,
"learning_rate": 2.833478608959038e-06,
"loss": 0.0,
"step": 680
},
{
"epoch": 2.9766423357664236,
"grad_norm": 0.014696883969008923,
"learning_rate": 2.827338952928146e-06,
"loss": 0.0,
"step": 681
},
{
"epoch": 2.981021897810219,
"grad_norm": 0.0031583639793097973,
"learning_rate": 2.821197287532847e-06,
"loss": 0.0,
"step": 682
},
{
"epoch": 2.985401459854015,
"grad_norm": 0.0016461616614833474,
"learning_rate": 2.8150536504736457e-06,
"loss": 0.0,
"step": 683
},
{
"epoch": 2.9897810218978105,
"grad_norm": 0.0036859370302408934,
"learning_rate": 2.8089080794631514e-06,
"loss": 0.0,
"step": 684
},
{
"epoch": 2.994160583941606,
"grad_norm": 0.0019211465260013938,
"learning_rate": 2.8027606122258435e-06,
"loss": 0.0,
"step": 685
},
{
"epoch": 2.9985401459854013,
"grad_norm": 0.001531517249532044,
"learning_rate": 2.79661128649784e-06,
"loss": 0.0,
"step": 686
},
{
"epoch": 3.0,
"grad_norm": 0.002470475621521473,
"learning_rate": 2.7904601400266706e-06,
"loss": 0.0,
"step": 687
},
{
"epoch": 3.0043795620437956,
"grad_norm": 0.017151040956377983,
"learning_rate": 2.784307210571039e-06,
"loss": 0.0,
"step": 688
},
{
"epoch": 3.0087591240875913,
"grad_norm": 0.0010016716551035643,
"learning_rate": 2.7781525359005945e-06,
"loss": 0.0,
"step": 689
},
{
"epoch": 3.013138686131387,
"grad_norm": 0.0022327261976897717,
"learning_rate": 2.771996153795699e-06,
"loss": 0.0,
"step": 690
},
{
"epoch": 3.0175182481751825,
"grad_norm": 0.001771376933902502,
"learning_rate": 2.7658381020471965e-06,
"loss": 0.0,
"step": 691
},
{
"epoch": 3.021897810218978,
"grad_norm": 0.0039240955375134945,
"learning_rate": 2.7596784184561788e-06,
"loss": 0.0,
"step": 692
},
{
"epoch": 3.026277372262774,
"grad_norm": 0.002869370160624385,
"learning_rate": 2.7535171408337556e-06,
"loss": 0.0,
"step": 693
},
{
"epoch": 3.0306569343065695,
"grad_norm": 0.01324407197535038,
"learning_rate": 2.7473543070008213e-06,
"loss": 0.0,
"step": 694
},
{
"epoch": 3.035036496350365,
"grad_norm": 0.015128228813409805,
"learning_rate": 2.7411899547878223e-06,
"loss": 0.0,
"step": 695
},
{
"epoch": 3.0394160583941607,
"grad_norm": 0.0010945587418973446,
"learning_rate": 2.7350241220345273e-06,
"loss": 0.0,
"step": 696
},
{
"epoch": 3.0437956204379564,
"grad_norm": 0.006534193176776171,
"learning_rate": 2.7288568465897918e-06,
"loss": 0.0,
"step": 697
},
{
"epoch": 3.048175182481752,
"grad_norm": 0.0015636439202353358,
"learning_rate": 2.722688166311328e-06,
"loss": 0.0,
"step": 698
},
{
"epoch": 3.0525547445255476,
"grad_norm": 0.0023577925749123096,
"learning_rate": 2.7165181190654705e-06,
"loss": 0.0,
"step": 699
},
{
"epoch": 3.0569343065693433,
"grad_norm": 0.0021843581926077604,
"learning_rate": 2.7103467427269466e-06,
"loss": 0.0,
"step": 700
},
{
"epoch": 3.0613138686131385,
"grad_norm": 0.0018377688247710466,
"learning_rate": 2.704174075178641e-06,
"loss": 0.0,
"step": 701
},
{
"epoch": 3.065693430656934,
"grad_norm": 0.0018055844120681286,
"learning_rate": 2.6980001543113653e-06,
"loss": 0.0,
"step": 702
},
{
"epoch": 3.0700729927007298,
"grad_norm": 0.0028426465578377247,
"learning_rate": 2.691825018023624e-06,
"loss": 0.0,
"step": 703
},
{
"epoch": 3.0744525547445254,
"grad_norm": 0.006463038269430399,
"learning_rate": 2.6856487042213825e-06,
"loss": 0.0,
"step": 704
},
{
"epoch": 3.078832116788321,
"grad_norm": 0.0850614532828331,
"learning_rate": 2.6794712508178345e-06,
"loss": 0.0001,
"step": 705
},
{
"epoch": 3.0832116788321167,
"grad_norm": 0.0032790934201329947,
"learning_rate": 2.673292695733169e-06,
"loss": 0.0,
"step": 706
},
{
"epoch": 3.0875912408759123,
"grad_norm": 0.0014041299000382423,
"learning_rate": 2.6671130768943375e-06,
"loss": 0.0,
"step": 707
},
{
"epoch": 3.091970802919708,
"grad_norm": 0.0034296936355531216,
"learning_rate": 2.660932432234823e-06,
"loss": 0.0,
"step": 708
},
{
"epoch": 3.0963503649635036,
"grad_norm": 0.0021895640529692173,
"learning_rate": 2.654750799694402e-06,
"loss": 0.0,
"step": 709
},
{
"epoch": 3.100729927007299,
"grad_norm": 0.0013117026537656784,
"learning_rate": 2.648568217218919e-06,
"loss": 0.0,
"step": 710
},
{
"epoch": 3.105109489051095,
"grad_norm": 0.0008849436999298632,
"learning_rate": 2.6423847227600462e-06,
"loss": 0.0,
"step": 711
},
{
"epoch": 3.1094890510948905,
"grad_norm": 0.015677910298109055,
"learning_rate": 2.636200354275057e-06,
"loss": 0.0,
"step": 712
},
{
"epoch": 3.113868613138686,
"grad_norm": 0.0022430643439292908,
"learning_rate": 2.630015149726588e-06,
"loss": 0.0,
"step": 713
},
{
"epoch": 3.1182481751824818,
"grad_norm": 0.0011902175610885024,
"learning_rate": 2.6238291470824084e-06,
"loss": 0.0,
"step": 714
},
{
"epoch": 3.1226277372262774,
"grad_norm": 0.002818114822730422,
"learning_rate": 2.6176423843151866e-06,
"loss": 0.0,
"step": 715
},
{
"epoch": 3.127007299270073,
"grad_norm": 0.0025780019350349903,
"learning_rate": 2.6114548994022575e-06,
"loss": 0.0,
"step": 716
},
{
"epoch": 3.1313868613138687,
"grad_norm": 0.008761986158788204,
"learning_rate": 2.6052667303253886e-06,
"loss": 0.0,
"step": 717
},
{
"epoch": 3.1357664233576643,
"grad_norm": 0.0025233717169612646,
"learning_rate": 2.5990779150705454e-06,
"loss": 0.0,
"step": 718
},
{
"epoch": 3.14014598540146,
"grad_norm": 0.0033938682172447443,
"learning_rate": 2.5928884916276638e-06,
"loss": 0.0,
"step": 719
},
{
"epoch": 3.1445255474452556,
"grad_norm": 0.026820925995707512,
"learning_rate": 2.586698497990409e-06,
"loss": 0.0001,
"step": 720
},
{
"epoch": 3.1489051094890512,
"grad_norm": 0.0023744499776512384,
"learning_rate": 2.5805079721559496e-06,
"loss": 0.0,
"step": 721
},
{
"epoch": 3.153284671532847,
"grad_norm": 0.001529409782961011,
"learning_rate": 2.574316952124718e-06,
"loss": 0.0,
"step": 722
},
{
"epoch": 3.1576642335766425,
"grad_norm": 0.005077557172626257,
"learning_rate": 2.5681254759001828e-06,
"loss": 0.0,
"step": 723
},
{
"epoch": 3.162043795620438,
"grad_norm": 0.002494523301720619,
"learning_rate": 2.561933581488612e-06,
"loss": 0.0,
"step": 724
},
{
"epoch": 3.1664233576642338,
"grad_norm": 0.0017919996753335,
"learning_rate": 2.55574130689884e-06,
"loss": 0.0,
"step": 725
},
{
"epoch": 3.170802919708029,
"grad_norm": 0.0017449480947107077,
"learning_rate": 2.549548690142036e-06,
"loss": 0.0,
"step": 726
},
{
"epoch": 3.1751824817518246,
"grad_norm": 0.0017062796978279948,
"learning_rate": 2.5433557692314687e-06,
"loss": 0.0,
"step": 727
},
{
"epoch": 3.1795620437956202,
"grad_norm": 0.002210992621257901,
"learning_rate": 2.5371625821822743e-06,
"loss": 0.0,
"step": 728
},
{
"epoch": 3.183941605839416,
"grad_norm": 0.0025367604102939367,
"learning_rate": 2.530969167011222e-06,
"loss": 0.0,
"step": 729
},
{
"epoch": 3.1883211678832115,
"grad_norm": 0.0028825236950069666,
"learning_rate": 2.5247755617364826e-06,
"loss": 0.0,
"step": 730
},
{
"epoch": 3.192700729927007,
"grad_norm": 0.0011007302673533559,
"learning_rate": 2.5185818043773942e-06,
"loss": 0.0,
"step": 731
},
{
"epoch": 3.197080291970803,
"grad_norm": 0.008204109966754913,
"learning_rate": 2.5123879329542255e-06,
"loss": 0.0,
"step": 732
},
{
"epoch": 3.2014598540145984,
"grad_norm": 0.001251939800567925,
"learning_rate": 2.5061939854879485e-06,
"loss": 0.0,
"step": 733
},
{
"epoch": 3.205839416058394,
"grad_norm": 0.0024482880253344774,
"learning_rate": 2.5e-06,
"loss": 0.0,
"step": 734
},
{
"epoch": 3.2102189781021897,
"grad_norm": 0.002128303749486804,
"learning_rate": 2.4938060145120523e-06,
"loss": 0.0,
"step": 735
},
{
"epoch": 3.2145985401459853,
"grad_norm": 0.0013024493819102645,
"learning_rate": 2.4876120670457753e-06,
"loss": 0.0,
"step": 736
},
{
"epoch": 3.218978102189781,
"grad_norm": 0.0012688646093010902,
"learning_rate": 2.481418195622607e-06,
"loss": 0.0,
"step": 737
},
{
"epoch": 3.2233576642335766,
"grad_norm": 0.0021316749043762684,
"learning_rate": 2.475224438263518e-06,
"loss": 0.0,
"step": 738
},
{
"epoch": 3.2277372262773723,
"grad_norm": 0.0019681300036609173,
"learning_rate": 2.469030832988779e-06,
"loss": 0.0,
"step": 739
},
{
"epoch": 3.232116788321168,
"grad_norm": 0.0013115162728354335,
"learning_rate": 2.4628374178177274e-06,
"loss": 0.0,
"step": 740
},
{
"epoch": 3.2364963503649635,
"grad_norm": 0.002466258592903614,
"learning_rate": 2.4566442307685325e-06,
"loss": 0.0,
"step": 741
},
{
"epoch": 3.240875912408759,
"grad_norm": 0.005171903874725103,
"learning_rate": 2.450451309857965e-06,
"loss": 0.0,
"step": 742
},
{
"epoch": 3.245255474452555,
"grad_norm": 0.0015806729206815362,
"learning_rate": 2.4442586931011607e-06,
"loss": 0.0,
"step": 743
},
{
"epoch": 3.2496350364963504,
"grad_norm": 0.0023869157303124666,
"learning_rate": 2.438066418511389e-06,
"loss": 0.0,
"step": 744
},
{
"epoch": 3.254014598540146,
"grad_norm": 0.0015038090059533715,
"learning_rate": 2.431874524099818e-06,
"loss": 0.0,
"step": 745
},
{
"epoch": 3.2583941605839417,
"grad_norm": 0.0022007508669048548,
"learning_rate": 2.4256830478752823e-06,
"loss": 0.0,
"step": 746
},
{
"epoch": 3.2627737226277373,
"grad_norm": 0.000799484783783555,
"learning_rate": 2.419492027844051e-06,
"loss": 0.0,
"step": 747
},
{
"epoch": 3.267153284671533,
"grad_norm": 0.0014924613060429692,
"learning_rate": 2.413301502009591e-06,
"loss": 0.0,
"step": 748
},
{
"epoch": 3.2715328467153286,
"grad_norm": 0.0013423648197203875,
"learning_rate": 2.4071115083723367e-06,
"loss": 0.0,
"step": 749
},
{
"epoch": 3.2759124087591243,
"grad_norm": 0.0016152571188285947,
"learning_rate": 2.4009220849294546e-06,
"loss": 0.0,
"step": 750
},
{
"epoch": 3.28029197080292,
"grad_norm": 0.002237207954749465,
"learning_rate": 2.394733269674612e-06,
"loss": 0.0,
"step": 751
},
{
"epoch": 3.2846715328467155,
"grad_norm": 0.0030521079897880554,
"learning_rate": 2.388545100597743e-06,
"loss": 0.0,
"step": 752
},
{
"epoch": 3.289051094890511,
"grad_norm": 0.0032652418594807386,
"learning_rate": 2.3823576156848138e-06,
"loss": 0.0,
"step": 753
},
{
"epoch": 3.293430656934307,
"grad_norm": 0.0014183277962729335,
"learning_rate": 2.3761708529175924e-06,
"loss": 0.0,
"step": 754
},
{
"epoch": 3.297810218978102,
"grad_norm": 0.0078318752348423,
"learning_rate": 2.3699848502734126e-06,
"loss": 0.0,
"step": 755
},
{
"epoch": 3.3021897810218976,
"grad_norm": 0.003864576341584325,
"learning_rate": 2.3637996457249433e-06,
"loss": 0.0,
"step": 756
},
{
"epoch": 3.3065693430656933,
"grad_norm": 0.002102078404277563,
"learning_rate": 2.357615277239954e-06,
"loss": 0.0,
"step": 757
},
{
"epoch": 3.310948905109489,
"grad_norm": 0.0014949225587770343,
"learning_rate": 2.3514317827810816e-06,
"loss": 0.0,
"step": 758
},
{
"epoch": 3.3153284671532846,
"grad_norm": 0.00100362254306674,
"learning_rate": 2.3452492003055987e-06,
"loss": 0.0,
"step": 759
},
{
"epoch": 3.31970802919708,
"grad_norm": 0.2444867193698883,
"learning_rate": 2.3390675677651778e-06,
"loss": 0.0011,
"step": 760
},
{
"epoch": 3.324087591240876,
"grad_norm": 0.0020129296462982893,
"learning_rate": 2.332886923105663e-06,
"loss": 0.0,
"step": 761
},
{
"epoch": 3.3284671532846715,
"grad_norm": 0.22737674415111542,
"learning_rate": 2.326707304266832e-06,
"loss": 0.0006,
"step": 762
},
{
"epoch": 3.332846715328467,
"grad_norm": 0.008498843759298325,
"learning_rate": 2.3205287491821663e-06,
"loss": 0.0,
"step": 763
},
{
"epoch": 3.3372262773722627,
"grad_norm": 0.05480220168828964,
"learning_rate": 2.3143512957786184e-06,
"loss": 0.0001,
"step": 764
},
{
"epoch": 3.3416058394160584,
"grad_norm": 0.0011567015899345279,
"learning_rate": 2.308174981976377e-06,
"loss": 0.0,
"step": 765
},
{
"epoch": 3.345985401459854,
"grad_norm": 0.0017249404918402433,
"learning_rate": 2.301999845688635e-06,
"loss": 0.0,
"step": 766
},
{
"epoch": 3.3503649635036497,
"grad_norm": 0.04111481457948685,
"learning_rate": 2.2958259248213595e-06,
"loss": 0.0,
"step": 767
},
{
"epoch": 3.3547445255474453,
"grad_norm": 0.0038588643074035645,
"learning_rate": 2.2896532572730534e-06,
"loss": 0.0,
"step": 768
},
{
"epoch": 3.359124087591241,
"grad_norm": 0.001527044572867453,
"learning_rate": 2.2834818809345295e-06,
"loss": 0.0,
"step": 769
},
{
"epoch": 3.3635036496350366,
"grad_norm": 0.009431148879230022,
"learning_rate": 2.2773118336886723e-06,
"loss": 0.0,
"step": 770
},
{
"epoch": 3.367883211678832,
"grad_norm": 0.0027088054921478033,
"learning_rate": 2.271143153410208e-06,
"loss": 0.0,
"step": 771
},
{
"epoch": 3.372262773722628,
"grad_norm": 0.002571119461208582,
"learning_rate": 2.264975877965473e-06,
"loss": 0.0,
"step": 772
},
{
"epoch": 3.3766423357664235,
"grad_norm": 0.0013004938373342156,
"learning_rate": 2.258810045212178e-06,
"loss": 0.0,
"step": 773
},
{
"epoch": 3.381021897810219,
"grad_norm": 0.0035143231507390738,
"learning_rate": 2.2526456929991795e-06,
"loss": 0.0,
"step": 774
},
{
"epoch": 3.3854014598540147,
"grad_norm": 0.0021215202286839485,
"learning_rate": 2.2464828591662452e-06,
"loss": 0.0,
"step": 775
},
{
"epoch": 3.3897810218978104,
"grad_norm": 0.004478083923459053,
"learning_rate": 2.240321581543822e-06,
"loss": 0.0,
"step": 776
},
{
"epoch": 3.394160583941606,
"grad_norm": 0.0010630427859723568,
"learning_rate": 2.2341618979528044e-06,
"loss": 0.0,
"step": 777
},
{
"epoch": 3.398540145985401,
"grad_norm": 0.010118206031620502,
"learning_rate": 2.2280038462043017e-06,
"loss": 0.0,
"step": 778
},
{
"epoch": 3.402919708029197,
"grad_norm": 0.005418050102889538,
"learning_rate": 2.2218474640994064e-06,
"loss": 0.0,
"step": 779
},
{
"epoch": 3.4072992700729925,
"grad_norm": 0.00403578719124198,
"learning_rate": 2.215692789428962e-06,
"loss": 0.0,
"step": 780
},
{
"epoch": 3.411678832116788,
"grad_norm": 0.004099252633750439,
"learning_rate": 2.20953985997333e-06,
"loss": 0.0,
"step": 781
},
{
"epoch": 3.4160583941605838,
"grad_norm": 0.0032652115914970636,
"learning_rate": 2.2033887135021605e-06,
"loss": 0.0,
"step": 782
},
{
"epoch": 3.4204379562043794,
"grad_norm": 0.002060306491330266,
"learning_rate": 2.1972393877741578e-06,
"loss": 0.0,
"step": 783
},
{
"epoch": 3.424817518248175,
"grad_norm": 0.005281209945678711,
"learning_rate": 2.191091920536849e-06,
"loss": 0.0,
"step": 784
},
{
"epoch": 3.4291970802919707,
"grad_norm": 0.006397861056029797,
"learning_rate": 2.1849463495263547e-06,
"loss": 0.0,
"step": 785
},
{
"epoch": 3.4335766423357663,
"grad_norm": 0.008003026247024536,
"learning_rate": 2.1788027124671542e-06,
"loss": 0.0,
"step": 786
},
{
"epoch": 3.437956204379562,
"grad_norm": 0.005707223899662495,
"learning_rate": 2.1726610470718553e-06,
"loss": 0.0,
"step": 787
},
{
"epoch": 3.4423357664233576,
"grad_norm": 0.007127422373741865,
"learning_rate": 2.166521391040963e-06,
"loss": 0.0,
"step": 788
},
{
"epoch": 3.4467153284671532,
"grad_norm": 0.0022657839581370354,
"learning_rate": 2.1603837820626478e-06,
"loss": 0.0,
"step": 789
},
{
"epoch": 3.451094890510949,
"grad_norm": 0.004881497472524643,
"learning_rate": 2.1542482578125148e-06,
"loss": 0.0,
"step": 790
},
{
"epoch": 3.4554744525547445,
"grad_norm": 0.0031092618592083454,
"learning_rate": 2.1481148559533703e-06,
"loss": 0.0,
"step": 791
},
{
"epoch": 3.45985401459854,
"grad_norm": 0.010201151482760906,
"learning_rate": 2.1419836141349964e-06,
"loss": 0.0,
"step": 792
},
{
"epoch": 3.4642335766423358,
"grad_norm": 0.0022016458678990602,
"learning_rate": 2.1358545699939114e-06,
"loss": 0.0,
"step": 793
},
{
"epoch": 3.4686131386861314,
"grad_norm": 0.001897580805234611,
"learning_rate": 2.129727761153146e-06,
"loss": 0.0,
"step": 794
},
{
"epoch": 3.472992700729927,
"grad_norm": 0.0018193572759628296,
"learning_rate": 2.1236032252220074e-06,
"loss": 0.0,
"step": 795
},
{
"epoch": 3.4773722627737227,
"grad_norm": 0.0025425944477319717,
"learning_rate": 2.117480999795853e-06,
"loss": 0.0,
"step": 796
},
{
"epoch": 3.4817518248175183,
"grad_norm": 0.0025497935712337494,
"learning_rate": 2.1113611224558545e-06,
"loss": 0.0,
"step": 797
},
{
"epoch": 3.486131386861314,
"grad_norm": 0.002034541452303529,
"learning_rate": 2.1052436307687725e-06,
"loss": 0.0,
"step": 798
},
{
"epoch": 3.4905109489051096,
"grad_norm": 0.004022255074232817,
"learning_rate": 2.0991285622867215e-06,
"loss": 0.0,
"step": 799
},
{
"epoch": 3.4948905109489052,
"grad_norm": 0.003748381743207574,
"learning_rate": 2.093015954546942e-06,
"loss": 0.0,
"step": 800
},
{
"epoch": 3.499270072992701,
"grad_norm": 0.005397303961217403,
"learning_rate": 2.0869058450715694e-06,
"loss": 0.0,
"step": 801
},
{
"epoch": 3.5036496350364965,
"grad_norm": 0.0028435164131224155,
"learning_rate": 2.0807982713674037e-06,
"loss": 0.0,
"step": 802
},
{
"epoch": 3.508029197080292,
"grad_norm": 0.01242138259112835,
"learning_rate": 2.074693270925677e-06,
"loss": 0.0,
"step": 803
},
{
"epoch": 3.512408759124088,
"grad_norm": 0.08091861754655838,
"learning_rate": 2.068590881221829e-06,
"loss": 0.0002,
"step": 804
},
{
"epoch": 3.5167883211678834,
"grad_norm": 0.007172738667577505,
"learning_rate": 2.062491139715271e-06,
"loss": 0.0,
"step": 805
},
{
"epoch": 3.521167883211679,
"grad_norm": 0.0017966198502108455,
"learning_rate": 2.056394083849158e-06,
"loss": 0.0,
"step": 806
},
{
"epoch": 3.5255474452554747,
"grad_norm": 0.0038727717474102974,
"learning_rate": 2.0502997510501614e-06,
"loss": 0.0,
"step": 807
},
{
"epoch": 3.5299270072992703,
"grad_norm": 0.004094596020877361,
"learning_rate": 2.0442081787282354e-06,
"loss": 0.0,
"step": 808
},
{
"epoch": 3.5343065693430655,
"grad_norm": 0.006543958559632301,
"learning_rate": 2.03811940427639e-06,
"loss": 0.0,
"step": 809
},
{
"epoch": 3.538686131386861,
"grad_norm": 0.0020556284580379725,
"learning_rate": 2.0320334650704595e-06,
"loss": 0.0,
"step": 810
},
{
"epoch": 3.543065693430657,
"grad_norm": 0.0070542385801672935,
"learning_rate": 2.025950398468875e-06,
"loss": 0.0,
"step": 811
},
{
"epoch": 3.5474452554744524,
"grad_norm": 0.017733529210090637,
"learning_rate": 2.0198702418124345e-06,
"loss": 0.0,
"step": 812
},
{
"epoch": 3.551824817518248,
"grad_norm": 0.009895303286612034,
"learning_rate": 2.013793032424072e-06,
"loss": 0.0,
"step": 813
},
{
"epoch": 3.5562043795620437,
"grad_norm": 0.001627814257517457,
"learning_rate": 2.007718807608629e-06,
"loss": 0.0,
"step": 814
},
{
"epoch": 3.5605839416058394,
"grad_norm": 0.005018120631575584,
"learning_rate": 2.0016476046526308e-06,
"loss": 0.0,
"step": 815
},
{
"epoch": 3.564963503649635,
"grad_norm": 0.0010407187510281801,
"learning_rate": 1.995579460824048e-06,
"loss": 0.0,
"step": 816
},
{
"epoch": 3.5693430656934306,
"grad_norm": 0.0016328482888638973,
"learning_rate": 1.989514413372076e-06,
"loss": 0.0,
"step": 817
},
{
"epoch": 3.5737226277372263,
"grad_norm": 0.007556082680821419,
"learning_rate": 1.983452499526901e-06,
"loss": 0.0,
"step": 818
},
{
"epoch": 3.578102189781022,
"grad_norm": 0.0021091431844979525,
"learning_rate": 1.9773937564994747e-06,
"loss": 0.0,
"step": 819
},
{
"epoch": 3.5824817518248175,
"grad_norm": 0.0016330553917214274,
"learning_rate": 1.9713382214812847e-06,
"loss": 0.0,
"step": 820
},
{
"epoch": 3.586861313868613,
"grad_norm": 0.0024340248201042414,
"learning_rate": 1.9652859316441266e-06,
"loss": 0.0,
"step": 821
},
{
"epoch": 3.591240875912409,
"grad_norm": 0.001374023617245257,
"learning_rate": 1.9592369241398747e-06,
"loss": 0.0,
"step": 822
},
{
"epoch": 3.5956204379562045,
"grad_norm": 0.0006934819975867867,
"learning_rate": 1.9531912361002554e-06,
"loss": 0.0,
"step": 823
},
{
"epoch": 3.6,
"grad_norm": 0.0039897337555885315,
"learning_rate": 1.9471489046366184e-06,
"loss": 0.0,
"step": 824
},
{
"epoch": 3.6043795620437957,
"grad_norm": 0.002902815816923976,
"learning_rate": 1.941109966839709e-06,
"loss": 0.0,
"step": 825
},
{
"epoch": 3.6087591240875914,
"grad_norm": 0.009976208209991455,
"learning_rate": 1.9350744597794407e-06,
"loss": 0.0,
"step": 826
},
{
"epoch": 3.613138686131387,
"grad_norm": 0.0016394915292039514,
"learning_rate": 1.929042420504667e-06,
"loss": 0.0,
"step": 827
},
{
"epoch": 3.6175182481751826,
"grad_norm": 0.011417590081691742,
"learning_rate": 1.923013886042956e-06,
"loss": 0.0,
"step": 828
},
{
"epoch": 3.621897810218978,
"grad_norm": 0.006547160446643829,
"learning_rate": 1.91698889340036e-06,
"loss": 0.0,
"step": 829
},
{
"epoch": 3.6262773722627735,
"grad_norm": 0.0013417567824944854,
"learning_rate": 1.9109674795611897e-06,
"loss": 0.0,
"step": 830
},
{
"epoch": 3.630656934306569,
"grad_norm": 0.0021602141205221415,
"learning_rate": 1.9049496814877894e-06,
"loss": 0.0,
"step": 831
},
{
"epoch": 3.6350364963503647,
"grad_norm": 0.004333204589784145,
"learning_rate": 1.8989355361203057e-06,
"loss": 0.0,
"step": 832
},
{
"epoch": 3.6394160583941604,
"grad_norm": 0.0013988579157739878,
"learning_rate": 1.892925080376465e-06,
"loss": 0.0,
"step": 833
},
{
"epoch": 3.643795620437956,
"grad_norm": 0.0024188708048313856,
"learning_rate": 1.886918351151343e-06,
"loss": 0.0,
"step": 834
},
{
"epoch": 3.6481751824817517,
"grad_norm": 0.0826224833726883,
"learning_rate": 1.8809153853171428e-06,
"loss": 0.0,
"step": 835
},
{
"epoch": 3.6525547445255473,
"grad_norm": 0.0022762177977710962,
"learning_rate": 1.8749162197229626e-06,
"loss": 0.0,
"step": 836
},
{
"epoch": 3.656934306569343,
"grad_norm": 0.0012058281572535634,
"learning_rate": 1.8689208911945771e-06,
"loss": 0.0,
"step": 837
},
{
"epoch": 3.6613138686131386,
"grad_norm": 0.0028202789835631847,
"learning_rate": 1.8629294365342049e-06,
"loss": 0.0,
"step": 838
},
{
"epoch": 3.665693430656934,
"grad_norm": 0.0023832700680941343,
"learning_rate": 1.8569418925202841e-06,
"loss": 0.0,
"step": 839
},
{
"epoch": 3.67007299270073,
"grad_norm": 0.0048092082142829895,
"learning_rate": 1.8509582959072487e-06,
"loss": 0.0,
"step": 840
},
{
"epoch": 3.6744525547445255,
"grad_norm": 0.004336558748036623,
"learning_rate": 1.8449786834253016e-06,
"loss": 0.0,
"step": 841
},
{
"epoch": 3.678832116788321,
"grad_norm": 0.008016410283744335,
"learning_rate": 1.8390030917801883e-06,
"loss": 0.0,
"step": 842
},
{
"epoch": 3.6832116788321168,
"grad_norm": 0.003487163921818137,
"learning_rate": 1.8330315576529733e-06,
"loss": 0.0,
"step": 843
},
{
"epoch": 3.6875912408759124,
"grad_norm": 0.005607697181403637,
"learning_rate": 1.8270641176998138e-06,
"loss": 0.0,
"step": 844
},
{
"epoch": 3.691970802919708,
"grad_norm": 0.013032430782914162,
"learning_rate": 1.8211008085517348e-06,
"loss": 0.0,
"step": 845
},
{
"epoch": 3.6963503649635037,
"grad_norm": 0.0017879578517749906,
"learning_rate": 1.815141666814405e-06,
"loss": 0.0,
"step": 846
},
{
"epoch": 3.7007299270072993,
"grad_norm": 0.0024302792735397816,
"learning_rate": 1.809186729067911e-06,
"loss": 0.0,
"step": 847
},
{
"epoch": 3.705109489051095,
"grad_norm": 0.0009780308464542031,
"learning_rate": 1.8032360318665337e-06,
"loss": 0.0,
"step": 848
},
{
"epoch": 3.7094890510948906,
"grad_norm": 0.19592009484767914,
"learning_rate": 1.797289611738523e-06,
"loss": 0.0019,
"step": 849
},
{
"epoch": 3.713868613138686,
"grad_norm": 0.002372670453041792,
"learning_rate": 1.7913475051858746e-06,
"loss": 0.0,
"step": 850
},
{
"epoch": 3.718248175182482,
"grad_norm": 0.002017020247876644,
"learning_rate": 1.7854097486841043e-06,
"loss": 0.0,
"step": 851
},
{
"epoch": 3.7226277372262775,
"grad_norm": 0.0022830108646303415,
"learning_rate": 1.7794763786820268e-06,
"loss": 0.0,
"step": 852
},
{
"epoch": 3.727007299270073,
"grad_norm": 0.002295683603733778,
"learning_rate": 1.7735474316015294e-06,
"loss": 0.0,
"step": 853
},
{
"epoch": 3.7313868613138688,
"grad_norm": 0.001965237082913518,
"learning_rate": 1.767622943837349e-06,
"loss": 0.0,
"step": 854
},
{
"epoch": 3.7357664233576644,
"grad_norm": 0.0012728808214887977,
"learning_rate": 1.7617029517568502e-06,
"loss": 0.0,
"step": 855
},
{
"epoch": 3.74014598540146,
"grad_norm": 0.0020083924755454063,
"learning_rate": 1.7557874916997996e-06,
"loss": 0.0,
"step": 856
},
{
"epoch": 3.7445255474452557,
"grad_norm": 0.0020820496138185263,
"learning_rate": 1.7498765999781455e-06,
"loss": 0.0,
"step": 857
},
{
"epoch": 3.7489051094890513,
"grad_norm": 0.0020251362584531307,
"learning_rate": 1.7439703128757923e-06,
"loss": 0.0,
"step": 858
},
{
"epoch": 3.753284671532847,
"grad_norm": 0.0015859343111515045,
"learning_rate": 1.7380686666483793e-06,
"loss": 0.0,
"step": 859
},
{
"epoch": 3.7576642335766426,
"grad_norm": 0.001591153210029006,
"learning_rate": 1.7321716975230588e-06,
"loss": 0.0,
"step": 860
},
{
"epoch": 3.7620437956204382,
"grad_norm": 0.0018416156526654959,
"learning_rate": 1.7262794416982717e-06,
"loss": 0.0,
"step": 861
},
{
"epoch": 3.7664233576642334,
"grad_norm": 0.0017293720738962293,
"learning_rate": 1.7203919353435269e-06,
"loss": 0.0,
"step": 862
},
{
"epoch": 3.770802919708029,
"grad_norm": 0.0010803790064528584,
"learning_rate": 1.7145092145991786e-06,
"loss": 0.0,
"step": 863
},
{
"epoch": 3.7751824817518247,
"grad_norm": 0.002193450927734375,
"learning_rate": 1.7086313155762046e-06,
"loss": 0.0,
"step": 864
},
{
"epoch": 3.7795620437956203,
"grad_norm": 0.11664100736379623,
"learning_rate": 1.7027582743559845e-06,
"loss": 0.0008,
"step": 865
},
{
"epoch": 3.783941605839416,
"grad_norm": 0.0010278144618496299,
"learning_rate": 1.696890126990079e-06,
"loss": 0.0,
"step": 866
},
{
"epoch": 3.7883211678832116,
"grad_norm": 0.0027571087703108788,
"learning_rate": 1.691026909500007e-06,
"loss": 0.0,
"step": 867
},
{
"epoch": 3.7927007299270072,
"grad_norm": 0.001730642979964614,
"learning_rate": 1.6851686578770263e-06,
"loss": 0.0,
"step": 868
},
{
"epoch": 3.797080291970803,
"grad_norm": 0.0017755437875166535,
"learning_rate": 1.6793154080819112e-06,
"loss": 0.0,
"step": 869
},
{
"epoch": 3.8014598540145985,
"grad_norm": 0.0018059754511341453,
"learning_rate": 1.6734671960447335e-06,
"loss": 0.0,
"step": 870
},
{
"epoch": 3.805839416058394,
"grad_norm": 0.0010772132081910968,
"learning_rate": 1.6676240576646389e-06,
"loss": 0.0,
"step": 871
},
{
"epoch": 3.81021897810219,
"grad_norm": 0.0013907999964430928,
"learning_rate": 1.66178602880963e-06,
"loss": 0.0,
"step": 872
},
{
"epoch": 3.8145985401459854,
"grad_norm": 0.0017574954545125365,
"learning_rate": 1.655953145316344e-06,
"loss": 0.0,
"step": 873
},
{
"epoch": 3.818978102189781,
"grad_norm": 0.0023797417525202036,
"learning_rate": 1.6501254429898345e-06,
"loss": 0.0,
"step": 874
},
{
"epoch": 3.8233576642335767,
"grad_norm": 0.0010705847525969148,
"learning_rate": 1.6443029576033493e-06,
"loss": 0.0,
"step": 875
},
{
"epoch": 3.8277372262773723,
"grad_norm": 0.0013967688428238034,
"learning_rate": 1.6384857248981117e-06,
"loss": 0.0,
"step": 876
},
{
"epoch": 3.832116788321168,
"grad_norm": 0.0027486851904541254,
"learning_rate": 1.6326737805831039e-06,
"loss": 0.0,
"step": 877
},
{
"epoch": 3.8364963503649636,
"grad_norm": 0.0009487051866017282,
"learning_rate": 1.6268671603348428e-06,
"loss": 0.0,
"step": 878
},
{
"epoch": 3.8408759124087593,
"grad_norm": 0.0026726871728897095,
"learning_rate": 1.621065899797165e-06,
"loss": 0.0,
"step": 879
},
{
"epoch": 3.845255474452555,
"grad_norm": 0.0028086488600820303,
"learning_rate": 1.6152700345810063e-06,
"loss": 0.0,
"step": 880
},
{
"epoch": 3.8496350364963505,
"grad_norm": 0.005458397325128317,
"learning_rate": 1.6094796002641836e-06,
"loss": 0.0,
"step": 881
},
{
"epoch": 3.8540145985401457,
"grad_norm": 0.0014935546787455678,
"learning_rate": 1.6036946323911753e-06,
"loss": 0.0,
"step": 882
},
{
"epoch": 3.8583941605839414,
"grad_norm": 0.002109678229317069,
"learning_rate": 1.5979151664729063e-06,
"loss": 0.0,
"step": 883
},
{
"epoch": 3.862773722627737,
"grad_norm": 0.0017332076095044613,
"learning_rate": 1.5921412379865259e-06,
"loss": 0.0,
"step": 884
},
{
"epoch": 3.8671532846715326,
"grad_norm": 0.009105571545660496,
"learning_rate": 1.5863728823751922e-06,
"loss": 0.0,
"step": 885
},
{
"epoch": 3.8715328467153283,
"grad_norm": 0.0020738227758556604,
"learning_rate": 1.5806101350478552e-06,
"loss": 0.0,
"step": 886
},
{
"epoch": 3.875912408759124,
"grad_norm": 0.0010305409086868167,
"learning_rate": 1.5748530313790379e-06,
"loss": 0.0,
"step": 887
},
{
"epoch": 3.8802919708029195,
"grad_norm": 0.002297050319612026,
"learning_rate": 1.5691016067086198e-06,
"loss": 0.0,
"step": 888
},
{
"epoch": 3.884671532846715,
"grad_norm": 0.002647539833560586,
"learning_rate": 1.5633558963416203e-06,
"loss": 0.0,
"step": 889
},
{
"epoch": 3.889051094890511,
"grad_norm": 0.0012281707022339106,
"learning_rate": 1.5576159355479814e-06,
"loss": 0.0,
"step": 890
},
{
"epoch": 3.8934306569343065,
"grad_norm": 0.0018704022513702512,
"learning_rate": 1.5518817595623514e-06,
"loss": 0.0,
"step": 891
},
{
"epoch": 3.897810218978102,
"grad_norm": 0.001662683323957026,
"learning_rate": 1.546153403583869e-06,
"loss": 0.0,
"step": 892
},
{
"epoch": 3.9021897810218977,
"grad_norm": 0.0023420508950948715,
"learning_rate": 1.540430902775946e-06,
"loss": 0.0,
"step": 893
},
{
"epoch": 3.9065693430656934,
"grad_norm": 0.0019655104260891676,
"learning_rate": 1.534714292266054e-06,
"loss": 0.0,
"step": 894
},
{
"epoch": 3.910948905109489,
"grad_norm": 0.0009503703331574798,
"learning_rate": 1.5290036071455056e-06,
"loss": 0.0,
"step": 895
},
{
"epoch": 3.9153284671532846,
"grad_norm": 0.04980412498116493,
"learning_rate": 1.5232988824692406e-06,
"loss": 0.0,
"step": 896
},
{
"epoch": 3.9197080291970803,
"grad_norm": 0.0007013222202658653,
"learning_rate": 1.5176001532556118e-06,
"loss": 0.0,
"step": 897
},
{
"epoch": 3.924087591240876,
"grad_norm": 0.0010505859972909093,
"learning_rate": 1.511907454486168e-06,
"loss": 0.0,
"step": 898
},
{
"epoch": 3.9284671532846716,
"grad_norm": 0.0009482012246735394,
"learning_rate": 1.5062208211054398e-06,
"loss": 0.0,
"step": 899
},
{
"epoch": 3.932846715328467,
"grad_norm": 0.004321451764553785,
"learning_rate": 1.5005402880207272e-06,
"loss": 0.0,
"step": 900
},
{
"epoch": 3.937226277372263,
"grad_norm": 0.00210862560197711,
"learning_rate": 1.4948658901018826e-06,
"loss": 0.0,
"step": 901
},
{
"epoch": 3.9416058394160585,
"grad_norm": 0.00250151171348989,
"learning_rate": 1.4891976621810972e-06,
"loss": 0.0,
"step": 902
},
{
"epoch": 3.945985401459854,
"grad_norm": 0.0026350142434239388,
"learning_rate": 1.483535639052689e-06,
"loss": 0.0,
"step": 903
},
{
"epoch": 3.9503649635036497,
"grad_norm": 0.004716258030384779,
"learning_rate": 1.4778798554728866e-06,
"loss": 0.0,
"step": 904
},
{
"epoch": 3.9547445255474454,
"grad_norm": 0.0018248335691168904,
"learning_rate": 1.4722303461596192e-06,
"loss": 0.0,
"step": 905
},
{
"epoch": 3.959124087591241,
"grad_norm": 0.013006918132305145,
"learning_rate": 1.4665871457922997e-06,
"loss": 0.0,
"step": 906
},
{
"epoch": 3.9635036496350367,
"grad_norm": 0.0023916151840239763,
"learning_rate": 1.4609502890116146e-06,
"loss": 0.0,
"step": 907
},
{
"epoch": 3.9678832116788323,
"grad_norm": 0.001576004782691598,
"learning_rate": 1.4553198104193094e-06,
"loss": 0.0,
"step": 908
},
{
"epoch": 3.972262773722628,
"grad_norm": 0.0026557703968137503,
"learning_rate": 1.4496957445779792e-06,
"loss": 0.0,
"step": 909
},
{
"epoch": 3.9766423357664236,
"grad_norm": 0.002321658656001091,
"learning_rate": 1.4440781260108521e-06,
"loss": 0.0,
"step": 910
},
{
"epoch": 3.981021897810219,
"grad_norm": 0.0010188892483711243,
"learning_rate": 1.438466989201583e-06,
"loss": 0.0,
"step": 911
},
{
"epoch": 3.985401459854015,
"grad_norm": 0.0009046494378708303,
"learning_rate": 1.4328623685940335e-06,
"loss": 0.0,
"step": 912
}
],
"logging_steps": 1,
"max_steps": 1368,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 228,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.2978513818196378e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}