diff --git "a/sft_qwen_14B/checkpoints/checkpoint-2000/trainer_state.json" "b/sft_qwen_14B/checkpoints/checkpoint-2000/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/sft_qwen_14B/checkpoints/checkpoint-2000/trainer_state.json"
@@ -0,0 +1,7203 @@
+{
+  "best_global_step": 2000,
+  "best_metric": 0.8567262887954712,
+  "best_model_checkpoint": "runs/instruct_run_14b_v1/checkpoints/checkpoint-2000",
+  "epoch": 0.8629989212513485,
+  "eval_steps": 100,
+  "global_step": 2000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0008629989212513484,
+      "grad_norm": 0.36567428708076477,
+      "learning_rate": 1.7969451931716084e-07,
+      "loss": 1.6794371604919434,
+      "step": 2
+    },
+    {
+      "epoch": 0.001725997842502697,
+      "grad_norm": 0.4024646580219269,
+      "learning_rate": 5.390835579514825e-07,
+      "loss": 1.6853073835372925,
+      "step": 4
+    },
+    {
+      "epoch": 0.0025889967637540453,
+      "grad_norm": 0.40199393033981323,
+      "learning_rate": 8.984725965858042e-07,
+      "loss": 1.7621158361434937,
+      "step": 6
+    },
+    {
+      "epoch": 0.003451995685005394,
+      "grad_norm": 0.35409677028656006,
+      "learning_rate": 1.257861635220126e-06,
+      "loss": 1.633257269859314,
+      "step": 8
+    },
+    {
+      "epoch": 0.004314994606256742,
+      "grad_norm": 0.39087551832199097,
+      "learning_rate": 1.6172506738544475e-06,
+      "loss": 1.7374768257141113,
+      "step": 10
+    },
+    {
+      "epoch": 0.005177993527508091,
+      "grad_norm": 0.3586857318878174,
+      "learning_rate": 1.9766397124887693e-06,
+      "loss": 1.6955714225769043,
+      "step": 12
+    },
+    {
+      "epoch": 0.006040992448759439,
+      "grad_norm": 0.32755669951438904,
+      "learning_rate": 2.3360287511230908e-06,
+      "loss": 1.720664381980896,
+      "step": 14
+    },
+    {
+      "epoch": 0.006903991370010788,
+      "grad_norm": 0.4054872691631317,
+      "learning_rate": 2.6954177897574127e-06,
+      "loss": 1.6957035064697266,
+      "step": 16
+    },
+    {
+      "epoch": 0.007766990291262136,
+      "grad_norm": 0.37593814730644226,
+      "learning_rate": 3.0548068283917343e-06,
+      "loss": 1.7286947965621948,
+      "step": 18
+    },
+    {
+      "epoch": 0.008629989212513484,
+      "grad_norm": 0.3344813287258148,
+      "learning_rate": 3.414195867026056e-06,
+      "loss": 1.727295994758606,
+      "step": 20
+    },
+    {
+      "epoch": 0.009492988133764833,
+      "grad_norm": 0.357474148273468,
+      "learning_rate": 3.7735849056603773e-06,
+      "loss": 1.6727914810180664,
+      "step": 22
+    },
+    {
+      "epoch": 0.010355987055016181,
+      "grad_norm": 0.39115726947784424,
+      "learning_rate": 4.132973944294699e-06,
+      "loss": 1.6518884897232056,
+      "step": 24
+    },
+    {
+      "epoch": 0.01121898597626753,
+      "grad_norm": 0.4711727201938629,
+      "learning_rate": 4.492362982929021e-06,
+      "loss": 1.7868088483810425,
+      "step": 26
+    },
+    {
+      "epoch": 0.012081984897518877,
+      "grad_norm": 0.34112176299095154,
+      "learning_rate": 4.851752021563342e-06,
+      "loss": 1.6127634048461914,
+      "step": 28
+    },
+    {
+      "epoch": 0.012944983818770227,
+      "grad_norm": 0.5071991682052612,
+      "learning_rate": 5.211141060197664e-06,
+      "loss": 1.7858378887176514,
+      "step": 30
+    },
+    {
+      "epoch": 0.013807982740021575,
+      "grad_norm": 0.42048847675323486,
+      "learning_rate": 5.570530098831986e-06,
+      "loss": 1.7123326063156128,
+      "step": 32
+    },
+    {
+      "epoch": 0.014670981661272923,
+      "grad_norm": 0.48883870244026184,
+      "learning_rate": 5.929919137466308e-06,
+      "loss": 1.737749695777893,
+      "step": 34
+    },
+    {
+      "epoch": 0.015533980582524271,
+      "grad_norm": 0.3311465084552765,
+      "learning_rate": 6.289308176100629e-06,
+      "loss": 1.5578981637954712,
+      "step": 36
+    },
+    {
+      "epoch": 0.01639697950377562,
+      "grad_norm": 0.5178973078727722,
+      "learning_rate": 6.64869721473495e-06,
+      "loss": 1.719806432723999,
+      "step": 38
+    },
+    {
+      "epoch": 0.017259978425026967,
+      "grad_norm": 0.47097742557525635,
+      "learning_rate": 7.008086253369272e-06,
+      "loss": 1.728212833404541,
+      "step": 40
+    },
+    {
+      "epoch": 0.018122977346278317,
+      "grad_norm": 0.5051584243774414,
+      "learning_rate": 7.367475292003594e-06,
+      "loss": 1.6542466878890991,
+      "step": 42
+    },
+    {
+      "epoch": 0.018985976267529667,
+      "grad_norm": 0.4645111560821533,
+      "learning_rate": 7.726864330637915e-06,
+      "loss": 1.7087690830230713,
+      "step": 44
+    },
+    {
+      "epoch": 0.019848975188781013,
+      "grad_norm": 0.5184999704360962,
+      "learning_rate": 8.086253369272237e-06,
+      "loss": 1.7018946409225464,
+      "step": 46
+    },
+    {
+      "epoch": 0.020711974110032363,
+      "grad_norm": 0.4543815851211548,
+      "learning_rate": 8.44564240790656e-06,
+      "loss": 1.6818269491195679,
+      "step": 48
+    },
+    {
+      "epoch": 0.021574973031283712,
+      "grad_norm": 0.44411996006965637,
+      "learning_rate": 8.80503144654088e-06,
+      "loss": 1.5772877931594849,
+      "step": 50
+    },
+    {
+      "epoch": 0.02243797195253506,
+      "grad_norm": 0.3409404158592224,
+      "learning_rate": 9.164420485175203e-06,
+      "loss": 1.498152732849121,
+      "step": 52
+    },
+    {
+      "epoch": 0.02330097087378641,
+      "grad_norm": 0.42104434967041016,
+      "learning_rate": 9.523809523809523e-06,
+      "loss": 1.6189048290252686,
+      "step": 54
+    },
+    {
+      "epoch": 0.024163969795037755,
+      "grad_norm": 0.3756246268749237,
+      "learning_rate": 9.883198562443846e-06,
+      "loss": 1.4596441984176636,
+      "step": 56
+    },
+    {
+      "epoch": 0.025026968716289105,
+      "grad_norm": 0.36214128136634827,
+      "learning_rate": 1.0242587601078168e-05,
+      "loss": 1.503880500793457,
+      "step": 58
+    },
+    {
+      "epoch": 0.025889967637540454,
+      "grad_norm": 0.40893009305000305,
+      "learning_rate": 1.060197663971249e-05,
+      "loss": 1.5912823677062988,
+      "step": 60
+    },
+    {
+      "epoch": 0.0267529665587918,
+      "grad_norm": 0.28710272908210754,
+      "learning_rate": 1.0961365678346811e-05,
+      "loss": 1.2956721782684326,
+      "step": 62
+    },
+    {
+      "epoch": 0.02761596548004315,
+      "grad_norm": 0.304573118686676,
+      "learning_rate": 1.1320754716981132e-05,
+      "loss": 1.4648056030273438,
+      "step": 64
+    },
+    {
+      "epoch": 0.0284789644012945,
+      "grad_norm": 0.36523914337158203,
+      "learning_rate": 1.1680143755615454e-05,
+      "loss": 1.6078968048095703,
+      "step": 66
+    },
+    {
+      "epoch": 0.029341963322545846,
+      "grad_norm": 0.37929031252861023,
+      "learning_rate": 1.2039532794249775e-05,
+      "loss": 1.5969421863555908,
+      "step": 68
+    },
+    {
+      "epoch": 0.030204962243797196,
+      "grad_norm": 0.3053947389125824,
+      "learning_rate": 1.2398921832884097e-05,
+      "loss": 1.4312325716018677,
+      "step": 70
+    },
+    {
+      "epoch": 0.031067961165048542,
+      "grad_norm": 0.3028779923915863,
+      "learning_rate": 1.275831087151842e-05,
+      "loss": 1.4101300239562988,
+      "step": 72
+    },
+    {
+      "epoch": 0.03193096008629989,
+      "grad_norm": 0.29649803042411804,
+      "learning_rate": 1.3117699910152742e-05,
+      "loss": 1.4553817510604858,
+      "step": 74
+    },
+    {
+      "epoch": 0.03279395900755124,
+      "grad_norm": 0.26032644510269165,
+      "learning_rate": 1.3477088948787062e-05,
+      "loss": 1.4623000621795654,
+      "step": 76
+    },
+    {
+      "epoch": 0.03365695792880259,
+      "grad_norm": 0.33558446168899536,
+      "learning_rate": 1.3836477987421385e-05,
+      "loss": 1.5181745290756226,
+      "step": 78
+    },
+    {
+      "epoch": 0.034519956850053934,
+      "grad_norm": 0.28307804465293884,
+      "learning_rate": 1.4195867026055706e-05,
+      "loss": 1.4397861957550049,
+      "step": 80
+    },
+    {
+      "epoch": 0.035382955771305284,
+      "grad_norm": 0.3451690673828125,
+      "learning_rate": 1.455525606469003e-05,
+      "loss": 1.463841199874878,
+      "step": 82
+    },
+    {
+      "epoch": 0.036245954692556634,
+      "grad_norm": 0.3248669505119324,
+      "learning_rate": 1.4914645103324348e-05,
+      "loss": 1.3554227352142334,
+      "step": 84
+    },
+    {
+      "epoch": 0.037108953613807984,
+      "grad_norm": 0.2855011224746704,
+      "learning_rate": 1.527403414195867e-05,
+      "loss": 1.2810425758361816,
+      "step": 86
+    },
+    {
+      "epoch": 0.03797195253505933,
+      "grad_norm": 0.33365535736083984,
+      "learning_rate": 1.5633423180592992e-05,
+      "loss": 1.428163766860962,
+      "step": 88
+    },
+    {
+      "epoch": 0.038834951456310676,
+      "grad_norm": 0.34099438786506653,
+      "learning_rate": 1.5992812219227316e-05,
+      "loss": 1.3487578630447388,
+      "step": 90
+    },
+    {
+      "epoch": 0.039697950377562026,
+      "grad_norm": 0.39247506856918335,
+      "learning_rate": 1.6352201257861635e-05,
+      "loss": 1.30057954788208,
+      "step": 92
+    },
+    {
+      "epoch": 0.040560949298813376,
+      "grad_norm": 0.32692041993141174,
+      "learning_rate": 1.671159029649596e-05,
+      "loss": 1.2923580408096313,
+      "step": 94
+    },
+    {
+      "epoch": 0.041423948220064725,
+      "grad_norm": 0.43452519178390503,
+      "learning_rate": 1.707097933513028e-05,
+      "loss": 1.5002273321151733,
+      "step": 96
+    },
+    {
+      "epoch": 0.042286947141316075,
+      "grad_norm": 0.3251534402370453,
+      "learning_rate": 1.7430368373764602e-05,
+      "loss": 1.330254077911377,
+      "step": 98
+    },
+    {
+      "epoch": 0.043149946062567425,
+      "grad_norm": 0.3198273479938507,
+      "learning_rate": 1.778975741239892e-05,
+      "loss": 1.3054943084716797,
+      "step": 100
+    },
+    {
+      "epoch": 0.043149946062567425,
+      "eval_loss": 1.366738200187683,
+      "eval_runtime": 651.8198,
+      "eval_samples_per_second": 3.16,
+      "eval_steps_per_second": 3.16,
+      "step": 100
+    },
+    {
+      "epoch": 0.04401294498381877,
+      "grad_norm": 0.37364065647125244,
+      "learning_rate": 1.8149146451033245e-05,
+      "loss": 1.314281940460205,
+      "step": 102
+    },
+    {
+      "epoch": 0.04487594390507012,
+      "grad_norm": 0.39384758472442627,
+      "learning_rate": 1.8508535489667568e-05,
+      "loss": 1.2737246751785278,
+      "step": 104
+    },
+    {
+      "epoch": 0.04573894282632147,
+      "grad_norm": 0.3521905541419983,
+      "learning_rate": 1.8867924528301888e-05,
+      "loss": 1.3113226890563965,
+      "step": 106
+    },
+    {
+      "epoch": 0.04660194174757282,
+      "grad_norm": 0.33531463146209717,
+      "learning_rate": 1.9227313566936208e-05,
+      "loss": 1.3253653049468994,
+      "step": 108
+    },
+    {
+      "epoch": 0.04746494066882417,
+      "grad_norm": 0.35596340894699097,
+      "learning_rate": 1.958670260557053e-05,
+      "loss": 1.3236849308013916,
+      "step": 110
+    },
+    {
+      "epoch": 0.04832793959007551,
+      "grad_norm": 0.36028242111206055,
+      "learning_rate": 1.9946091644204854e-05,
+      "loss": 1.183128833770752,
+      "step": 112
+    },
+    {
+      "epoch": 0.04919093851132686,
+      "grad_norm": 0.42109814286231995,
+      "learning_rate": 2.0305480682839174e-05,
+      "loss": 1.2741888761520386,
+      "step": 114
+    },
+    {
+      "epoch": 0.05005393743257821,
+      "grad_norm": 0.39675939083099365,
+      "learning_rate": 2.0664869721473494e-05,
+      "loss": 1.3050109148025513,
+      "step": 116
+    },
+    {
+      "epoch": 0.05091693635382956,
+      "grad_norm": 0.4414141774177551,
+      "learning_rate": 2.1024258760107817e-05,
+      "loss": 1.2472094297409058,
+      "step": 118
+    },
+    {
+      "epoch": 0.05177993527508091,
+      "grad_norm": 0.42872729897499084,
+      "learning_rate": 2.138364779874214e-05,
+      "loss": 1.3338921070098877,
+      "step": 120
+    },
+    {
+      "epoch": 0.05264293419633225,
+      "grad_norm": 0.38336244225502014,
+      "learning_rate": 2.174303683737646e-05,
+      "loss": 1.322908878326416,
+      "step": 122
+    },
+    {
+      "epoch": 0.0535059331175836,
+      "grad_norm": 0.41046878695487976,
+      "learning_rate": 2.2102425876010783e-05,
+      "loss": 1.2169240713119507,
+      "step": 124
+    },
+    {
+      "epoch": 0.05436893203883495,
+      "grad_norm": 0.39460113644599915,
+      "learning_rate": 2.2461814914645103e-05,
+      "loss": 1.2085309028625488,
+      "step": 126
+    },
+    {
+      "epoch": 0.0552319309600863,
+      "grad_norm": 0.42829909920692444,
+      "learning_rate": 2.2821203953279426e-05,
+      "loss": 1.2969133853912354,
+      "step": 128
+    },
+    {
+      "epoch": 0.05609492988133765,
+      "grad_norm": 0.3940851390361786,
+      "learning_rate": 2.3180592991913746e-05,
+      "loss": 1.1892330646514893,
+      "step": 130
+    },
+    {
+      "epoch": 0.056957928802589,
+      "grad_norm": 0.45011839270591736,
+      "learning_rate": 2.353998203054807e-05,
+      "loss": 1.2082979679107666,
+      "step": 132
+    },
+    {
+      "epoch": 0.05782092772384034,
+      "grad_norm": 0.46059420704841614,
+      "learning_rate": 2.3899371069182393e-05,
+      "loss": 1.2388817071914673,
+      "step": 134
+    },
+    {
+      "epoch": 0.05868392664509169,
+      "grad_norm": 0.41085872054100037,
+      "learning_rate": 2.4258760107816713e-05,
+      "loss": 1.193917155265808,
+      "step": 136
+    },
+    {
+      "epoch": 0.05954692556634304,
+      "grad_norm": 0.4024205207824707,
+      "learning_rate": 2.4618149146451032e-05,
+      "loss": 1.1514034271240234,
+      "step": 138
+    },
+    {
+      "epoch": 0.06040992448759439,
+      "grad_norm": 0.3893793523311615,
+      "learning_rate": 2.4977538185085356e-05,
+      "loss": 1.1626157760620117,
+      "step": 140
+    },
+    {
+      "epoch": 0.06127292340884574,
+      "grad_norm": 0.4456317126750946,
+      "learning_rate": 2.5336927223719675e-05,
+      "loss": 1.1627076864242554,
+      "step": 142
+    },
+    {
+      "epoch": 0.062135922330097085,
+      "grad_norm": 0.5050215125083923,
+      "learning_rate": 2.5696316262354e-05,
+      "loss": 1.3038755655288696,
+      "step": 144
+    },
+    {
+      "epoch": 0.06299892125134844,
+      "grad_norm": 0.4071207642555237,
+      "learning_rate": 2.605570530098832e-05,
+      "loss": 1.1708844900131226,
+      "step": 146
+    },
+    {
+      "epoch": 0.06386192017259978,
+      "grad_norm": 0.4363228678703308,
+      "learning_rate": 2.641509433962264e-05,
+      "loss": 1.2149070501327515,
+      "step": 148
+    },
+    {
+      "epoch": 0.06472491909385113,
+      "grad_norm": 0.4436556398868561,
+      "learning_rate": 2.6774483378256965e-05,
+      "loss": 1.1942368745803833,
+      "step": 150
+    },
+    {
+      "epoch": 0.06558791801510248,
+      "grad_norm": 0.4068629741668701,
+      "learning_rate": 2.7133872416891288e-05,
+      "loss": 1.1799161434173584,
+      "step": 152
+    },
+    {
+      "epoch": 0.06645091693635383,
+      "grad_norm": 0.5291106700897217,
+      "learning_rate": 2.7493261455525608e-05,
+      "loss": 1.1832845211029053,
+      "step": 154
+    },
+    {
+      "epoch": 0.06731391585760518,
+      "grad_norm": 0.4410109221935272,
+      "learning_rate": 2.785265049415993e-05,
+      "loss": 1.1696993112564087,
+      "step": 156
+    },
+    {
+      "epoch": 0.06817691477885653,
+      "grad_norm": 0.4858371913433075,
+      "learning_rate": 2.8212039532794248e-05,
+      "loss": 1.2036973237991333,
+      "step": 158
+    },
+    {
+      "epoch": 0.06903991370010787,
+      "grad_norm": 0.45373693108558655,
+      "learning_rate": 2.857142857142857e-05,
+      "loss": 1.1145079135894775,
+      "step": 160
+    },
+    {
+      "epoch": 0.06990291262135923,
+      "grad_norm": 0.4881038963794708,
+      "learning_rate": 2.8930817610062894e-05,
+      "loss": 1.173502802848816,
+      "step": 162
+    },
+    {
+      "epoch": 0.07076591154261057,
+      "grad_norm": 0.576934814453125,
+      "learning_rate": 2.9290206648697217e-05,
+      "loss": 1.250414490699768,
+      "step": 164
+    },
+    {
+      "epoch": 0.07162891046386193,
+      "grad_norm": 0.4900001287460327,
+      "learning_rate": 2.9649595687331537e-05,
+      "loss": 1.0721495151519775,
+      "step": 166
+    },
+    {
+      "epoch": 0.07249190938511327,
+      "grad_norm": 0.4440019726753235,
+      "learning_rate": 3.000898472596586e-05,
+      "loss": 1.0689374208450317,
+      "step": 168
+    },
+    {
+      "epoch": 0.07335490830636461,
+      "grad_norm": 0.4267268180847168,
+      "learning_rate": 3.0368373764600184e-05,
+      "loss": 1.2095128297805786,
+      "step": 170
+    },
+    {
+      "epoch": 0.07421790722761597,
+      "grad_norm": 0.6062787771224976,
+      "learning_rate": 3.0727762803234503e-05,
+      "loss": 1.077776551246643,
+      "step": 172
+    },
+    {
+      "epoch": 0.07508090614886731,
+      "grad_norm": 0.49510180950164795,
+      "learning_rate": 3.108715184186882e-05,
+      "loss": 1.144006371498108,
+      "step": 174
+    },
+    {
+      "epoch": 0.07594390507011867,
+      "grad_norm": 0.4670701026916504,
+      "learning_rate": 3.144654088050314e-05,
+      "loss": 1.1663392782211304,
+      "step": 176
+    },
+    {
+      "epoch": 0.07680690399137001,
+      "grad_norm": 0.5615383386611938,
+      "learning_rate": 3.1805929919137466e-05,
+      "loss": 1.1665973663330078,
+      "step": 178
+    },
+    {
+      "epoch": 0.07766990291262135,
+      "grad_norm": 0.47305551171302795,
+      "learning_rate": 3.216531895777179e-05,
+      "loss": 1.1337063312530518,
+      "step": 180
+    },
+    {
+      "epoch": 0.07853290183387271,
+      "grad_norm": 0.5127068758010864,
+      "learning_rate": 3.252470799640611e-05,
+      "loss": 1.072874903678894,
+      "step": 182
+    },
+    {
+      "epoch": 0.07939590075512405,
+      "grad_norm": 0.632448136806488,
+      "learning_rate": 3.2884097035040436e-05,
+      "loss": 1.1577240228652954,
+      "step": 184
+    },
+    {
+      "epoch": 0.08025889967637541,
+      "grad_norm": 0.4041025638580322,
+      "learning_rate": 3.324348607367476e-05,
+      "loss": 1.1186822652816772,
+      "step": 186
+    },
+    {
+      "epoch": 0.08112189859762675,
+      "grad_norm": 0.5239102244377136,
+      "learning_rate": 3.3602875112309076e-05,
+      "loss": 1.1468429565429688,
+      "step": 188
+    },
+    {
+      "epoch": 0.08198489751887811,
+      "grad_norm": 0.4486575424671173,
+      "learning_rate": 3.39622641509434e-05,
+      "loss": 1.0017019510269165,
+      "step": 190
+    },
+    {
+      "epoch": 0.08284789644012945,
+      "grad_norm": 0.4994317293167114,
+      "learning_rate": 3.4321653189577715e-05,
+      "loss": 1.1901532411575317,
+      "step": 192
+    },
+    {
+      "epoch": 0.0837108953613808,
+      "grad_norm": 0.5023699998855591,
+      "learning_rate": 3.468104222821204e-05,
+      "loss": 1.1398564577102661,
+      "step": 194
+    },
+    {
+      "epoch": 0.08457389428263215,
+      "grad_norm": 0.5077701807022095,
+      "learning_rate": 3.504043126684636e-05,
+      "loss": 1.1390413045883179,
+      "step": 196
+    },
+    {
+      "epoch": 0.0854368932038835,
+      "grad_norm": 0.5527892112731934,
+      "learning_rate": 3.5399820305480685e-05,
+      "loss": 1.1411432027816772,
+      "step": 198
+    },
+    {
+      "epoch": 0.08629989212513485,
+      "grad_norm": 0.5572488903999329,
+      "learning_rate": 3.575920934411501e-05,
+      "loss": 1.071260690689087,
+      "step": 200
+    },
+    {
+      "epoch": 0.08629989212513485,
+      "eval_loss": 1.1519012451171875,
+      "eval_runtime": 654.6055,
+      "eval_samples_per_second": 3.147,
+      "eval_steps_per_second": 3.147,
+      "step": 200
+    },
+    {
+      "epoch": 0.08716289104638619,
+      "grad_norm": 0.5134095549583435,
+      "learning_rate": 3.611859838274933e-05,
+      "loss": 1.138135552406311,
+      "step": 202
+    },
+    {
+      "epoch": 0.08802588996763754,
+      "grad_norm": 0.5166040658950806,
+      "learning_rate": 3.647798742138365e-05,
+      "loss": 1.111999273300171,
+      "step": 204
+    },
+    {
+      "epoch": 0.08888888888888889,
+      "grad_norm": 0.5336993336677551,
+      "learning_rate": 3.683737646001797e-05,
+      "loss": 1.1031352281570435,
+      "step": 206
+    },
+    {
+      "epoch": 0.08975188781014024,
+      "grad_norm": 0.8289600014686584,
+      "learning_rate": 3.7196765498652294e-05,
+      "loss": 1.0388667583465576,
+      "step": 208
+    },
+    {
+      "epoch": 0.09061488673139159,
+      "grad_norm": 0.47992637753486633,
+      "learning_rate": 3.755615453728661e-05,
+      "loss": 1.0950241088867188,
+      "step": 210
+    },
+    {
+      "epoch": 0.09147788565264293,
+      "grad_norm": 0.5629691481590271,
+      "learning_rate": 3.7915543575920934e-05,
+      "loss": 1.0361733436584473,
+      "step": 212
+    },
+    {
+      "epoch": 0.09234088457389428,
+      "grad_norm": 0.5515111684799194,
+      "learning_rate": 3.827493261455526e-05,
+      "loss": 1.0922447443008423,
+      "step": 214
+    },
+    {
+      "epoch": 0.09320388349514563,
+      "grad_norm": 0.5078643560409546,
+      "learning_rate": 3.863432165318958e-05,
+      "loss": 1.0866856575012207,
+      "step": 216
+    },
+    {
+      "epoch": 0.09406688241639698,
+      "grad_norm": 0.6046127676963806,
+      "learning_rate": 3.8993710691823904e-05,
+      "loss": 1.1231595277786255,
+      "step": 218
+    },
+    {
+      "epoch": 0.09492988133764833,
+      "grad_norm": 0.6255762577056885,
+      "learning_rate": 3.935309973045822e-05,
+      "loss": 1.099171757698059,
+      "step": 220
+    },
+    {
+      "epoch": 0.09579288025889968,
+      "grad_norm": 0.6036638021469116,
+      "learning_rate": 3.971248876909254e-05,
+      "loss": 1.0557761192321777,
+      "step": 222
+    },
+    {
+      "epoch": 0.09665587918015102,
+      "grad_norm": 0.5520529747009277,
+      "learning_rate": 4.0071877807726867e-05,
+      "loss": 1.0467877388000488,
+      "step": 224
+    },
+    {
+      "epoch": 0.09751887810140238,
+      "grad_norm": 0.5958684682846069,
+      "learning_rate": 4.043126684636119e-05,
+      "loss": 1.17941153049469,
+      "step": 226
+    },
+    {
+      "epoch": 0.09838187702265372,
+      "grad_norm": 0.5283281803131104,
+      "learning_rate": 4.079065588499551e-05,
+      "loss": 1.104217767715454,
+      "step": 228
+    },
+    {
+      "epoch": 0.09924487594390508,
+      "grad_norm": 0.5608792901039124,
+      "learning_rate": 4.115004492362983e-05,
+      "loss": 1.0900640487670898,
+      "step": 230
+    },
+    {
+      "epoch": 0.10010787486515642,
+      "grad_norm": 0.555964469909668,
+      "learning_rate": 4.150943396226415e-05,
+      "loss": 0.9887422323226929,
+      "step": 232
+    },
+    {
+      "epoch": 0.10097087378640776,
+      "grad_norm": 0.5875785946846008,
+      "learning_rate": 4.1868823000898476e-05,
+      "loss": 1.1298567056655884,
+      "step": 234
+    },
+    {
+      "epoch": 0.10183387270765912,
+      "grad_norm": 0.4544795751571655,
+      "learning_rate": 4.222821203953279e-05,
+      "loss": 1.0957067012786865,
+      "step": 236
+    },
+    {
+      "epoch": 0.10269687162891046,
+      "grad_norm": 0.564145565032959,
+      "learning_rate": 4.2587601078167116e-05,
+      "loss": 1.0328738689422607,
+      "step": 238
+    },
+    {
+      "epoch": 0.10355987055016182,
+      "grad_norm": 0.6285979747772217,
+      "learning_rate": 4.294699011680144e-05,
+      "loss": 1.1085515022277832,
+      "step": 240
+    },
+    {
+      "epoch": 0.10442286947141316,
+      "grad_norm": 0.6442288756370544,
+      "learning_rate": 4.330637915543576e-05,
+      "loss": 1.1291271448135376,
+      "step": 242
+    },
+    {
+      "epoch": 0.1052858683926645,
+      "grad_norm": 0.6137154698371887,
+      "learning_rate": 4.3665768194070085e-05,
+      "loss": 1.1759567260742188,
+      "step": 244
+    },
+    {
+      "epoch": 0.10614886731391586,
+      "grad_norm": 0.5906805992126465,
+      "learning_rate": 4.402515723270441e-05,
+      "loss": 1.148414969444275,
+      "step": 246
+    },
+    {
+      "epoch": 0.1070118662351672,
+      "grad_norm": 0.5382888913154602,
+      "learning_rate": 4.438454627133873e-05,
+      "loss": 1.0749616622924805,
+      "step": 248
+    },
+    {
+      "epoch": 0.10787486515641856,
+      "grad_norm": 0.6185492873191833,
+      "learning_rate": 4.474393530997305e-05,
+      "loss": 1.2235801219940186,
+      "step": 250
+    },
+    {
+      "epoch": 0.1087378640776699,
+      "grad_norm": 0.5981597900390625,
+      "learning_rate": 4.5103324348607365e-05,
+      "loss": 1.1390639543533325,
+      "step": 252
+    },
+    {
+      "epoch": 0.10960086299892124,
+      "grad_norm": 0.5664694905281067,
+      "learning_rate": 4.546271338724169e-05,
+      "loss": 1.171774983406067,
+      "step": 254
+    },
+    {
+      "epoch": 0.1104638619201726,
+      "grad_norm": 0.7071851491928101,
+      "learning_rate": 4.582210242587601e-05,
+      "loss": 1.1704237461090088,
+      "step": 256
+    },
+    {
+      "epoch": 0.11132686084142394,
+      "grad_norm": 0.5815614461898804,
+      "learning_rate": 4.6181491464510334e-05,
+      "loss": 1.0619677305221558,
+      "step": 258
+    },
+    {
+      "epoch": 0.1121898597626753,
+      "grad_norm": 0.6481915712356567,
+      "learning_rate": 4.654088050314466e-05,
+      "loss": 1.0824390649795532,
+      "step": 260
+    },
+    {
+      "epoch": 0.11305285868392664,
+      "grad_norm": 0.5988591313362122,
+      "learning_rate": 4.690026954177898e-05,
+      "loss": 1.087929606437683,
+      "step": 262
+    },
+    {
+      "epoch": 0.113915857605178,
+      "grad_norm": 0.6545296311378479,
+      "learning_rate": 4.7259658580413304e-05,
+      "loss": 1.0936195850372314,
+      "step": 264
+    },
+    {
+      "epoch": 0.11477885652642934,
+      "grad_norm": 0.5826204419136047,
+      "learning_rate": 4.761904761904762e-05,
+      "loss": 1.0433681011199951,
+      "step": 266
+    },
+    {
+      "epoch": 0.11564185544768069,
+      "grad_norm": 0.5907514095306396,
+      "learning_rate": 4.7978436657681944e-05,
+      "loss": 1.0719536542892456,
+      "step": 268
+    },
+    {
+      "epoch": 0.11650485436893204,
+      "grad_norm": 0.524394154548645,
+      "learning_rate": 4.833782569631627e-05,
+      "loss": 1.0231504440307617,
+      "step": 270
+    },
+    {
+      "epoch": 0.11736785329018339,
+      "grad_norm": 0.5472846031188965,
+      "learning_rate": 4.869721473495058e-05,
+      "loss": 0.9905915260314941,
+      "step": 272
+    },
+    {
+      "epoch": 0.11823085221143474,
+      "grad_norm": 0.727922260761261,
+      "learning_rate": 4.9056603773584906e-05,
+      "loss": 1.213677167892456,
+      "step": 274
+    },
+    {
+      "epoch": 0.11909385113268608,
+      "grad_norm": 0.6009684801101685,
+      "learning_rate": 4.941599281221923e-05,
+      "loss": 1.0052144527435303,
+      "step": 276
+    },
+    {
+      "epoch": 0.11995685005393743,
+      "grad_norm": 0.6564669013023376,
+      "learning_rate": 4.977538185085355e-05,
+      "loss": 1.108136773109436,
+      "step": 278
+    },
+    {
+      "epoch": 0.12081984897518878,
+      "grad_norm": 0.650074303150177,
+      "learning_rate": 5.013477088948787e-05,
+      "loss": 0.9700815677642822,
+      "step": 280
+    },
+    {
+      "epoch": 0.12168284789644013,
+      "grad_norm": 0.5772947072982788,
+      "learning_rate": 5.04941599281222e-05,
+      "loss": 1.038031816482544,
+      "step": 282
+    },
+    {
+      "epoch": 0.12254584681769148,
+      "grad_norm": 0.7293002009391785,
+      "learning_rate": 5.0853548966756516e-05,
+      "loss": 1.1063730716705322,
+      "step": 284
+    },
+    {
+      "epoch": 0.12340884573894283,
+      "grad_norm": 0.7937333583831787,
+      "learning_rate": 5.1212938005390846e-05,
+      "loss": 1.128495693206787,
+      "step": 286
+    },
+    {
+      "epoch": 0.12427184466019417,
+      "grad_norm": 0.48499324917793274,
+      "learning_rate": 5.157232704402516e-05,
+      "loss": 0.9438712000846863,
+      "step": 288
+    },
+    {
+      "epoch": 0.12513484358144553,
+      "grad_norm": 0.6010656952857971,
+      "learning_rate": 5.193171608265948e-05,
+      "loss": 1.0872881412506104,
+      "step": 290
+    },
+    {
+      "epoch": 0.12599784250269688,
+      "grad_norm": 0.6240811944007874,
+      "learning_rate": 5.22911051212938e-05,
+      "loss": 1.110992193222046,
+      "step": 292
+    },
+    {
+      "epoch": 0.1268608414239482,
+      "grad_norm": 0.7172768712043762,
+      "learning_rate": 5.265049415992812e-05,
+      "loss": 1.1109752655029297,
+      "step": 294
+    },
+    {
+      "epoch": 0.12772384034519957,
+      "grad_norm": 0.6442400217056274,
+      "learning_rate": 5.300988319856245e-05,
+      "loss": 1.05553138256073,
+      "step": 296
+    },
+    {
+      "epoch": 0.12858683926645093,
+      "grad_norm": 0.7074702382087708,
+      "learning_rate": 5.3369272237196765e-05,
+      "loss": 1.0717648267745972,
+      "step": 298
+    },
+    {
+      "epoch": 0.12944983818770225,
+      "grad_norm": 0.5277591347694397,
+      "learning_rate": 5.3728661275831095e-05,
+      "loss": 0.9777541756629944,
+      "step": 300
+    },
+    {
+      "epoch": 0.12944983818770225,
+      "eval_loss": 1.0977506637573242,
+      "eval_runtime": 662.1728,
+      "eval_samples_per_second": 3.111,
+      "eval_steps_per_second": 3.111,
+      "step": 300
+    },
+    {
+      "epoch": 0.1303128371089536,
+      "grad_norm": 0.7252246737480164,
+      "learning_rate": 5.408805031446541e-05,
+      "loss": 1.075905203819275,
+      "step": 302
+    },
+    {
+      "epoch": 0.13117583603020497,
+      "grad_norm": 0.7003294229507446,
+      "learning_rate": 5.444743935309974e-05,
+      "loss": 1.1117515563964844,
+      "step": 304
+    },
+    {
+      "epoch": 0.13203883495145632,
+      "grad_norm": 0.5878211259841919,
+      "learning_rate": 5.480682839173406e-05,
+      "loss": 1.0289191007614136,
+      "step": 306
+    },
+    {
+      "epoch": 0.13290183387270765,
+      "grad_norm": 0.7133644223213196,
+      "learning_rate": 5.5166217430368374e-05,
+      "loss": 1.0199183225631714,
+      "step": 308
+    },
+    {
+      "epoch": 0.133764832793959,
+      "grad_norm": 0.6098423600196838,
+      "learning_rate": 5.55256064690027e-05,
+      "loss": 1.0132375955581665,
+      "step": 310
+    },
+    {
+      "epoch": 0.13462783171521037,
+      "grad_norm": 0.6386916041374207,
+      "learning_rate": 5.5884995507637014e-05,
+      "loss": 1.1595754623413086,
+      "step": 312
+    },
+    {
+      "epoch": 0.1354908306364617,
+      "grad_norm": 0.6563469767570496,
+      "learning_rate": 5.6244384546271344e-05,
+      "loss": 1.0921307802200317,
+      "step": 314
+    },
+    {
+      "epoch": 0.13635382955771305,
+      "grad_norm": 0.6388015747070312,
+      "learning_rate": 5.660377358490566e-05,
+      "loss": 1.0200815200805664,
+      "step": 316
+    },
+    {
+      "epoch": 0.1372168284789644,
+      "grad_norm": 0.6026274561882019,
+      "learning_rate": 5.696316262353999e-05,
+      "loss": 0.9339485764503479,
+      "step": 318
+    },
+    {
+      "epoch": 0.13807982740021574,
+      "grad_norm": 0.619800865650177,
+      "learning_rate": 5.732255166217431e-05,
+      "loss": 1.0268478393554688,
+      "step": 320
+    },
+    {
+      "epoch": 0.1389428263214671,
+      "grad_norm": 0.5924715399742126,
+      "learning_rate": 5.768194070080862e-05,
+      "loss": 1.1394236087799072,
+      "step": 322
+    },
+    {
+      "epoch": 0.13980582524271845,
+      "grad_norm": 0.6829012036323547,
+      "learning_rate": 5.804132973944295e-05,
+      "loss": 1.002437949180603,
+      "step": 324
+    },
+    {
+      "epoch": 0.1406688241639698,
+      "grad_norm": 0.7012544274330139,
+      "learning_rate": 5.840071877807727e-05,
+      "loss": 1.132503628730774,
+      "step": 326
+    },
+    {
+      "epoch": 0.14153182308522114,
+      "grad_norm": 0.7921599745750427,
+      "learning_rate": 5.876010781671159e-05,
+      "loss": 1.1859129667282104,
+      "step": 328
+    },
+    {
+      "epoch": 0.1423948220064725,
+      "grad_norm": 0.6373353004455566,
+      "learning_rate": 5.9119496855345916e-05,
+      "loss": 1.0896776914596558,
+      "step": 330
+    },
+    {
+      "epoch": 0.14325782092772385,
+      "grad_norm": 0.6174030900001526,
+      "learning_rate": 5.947888589398024e-05,
+      "loss": 1.0691723823547363,
+      "step": 332
+    },
+    {
+      "epoch": 0.14412081984897518,
+      "grad_norm": 0.5110617280006409,
+      "learning_rate": 5.9838274932614556e-05,
+      "loss": 1.0144777297973633,
+      "step": 334
+    },
+    {
+      "epoch": 0.14498381877022654,
+      "grad_norm": 0.5580511093139648,
+      "learning_rate": 6.019766397124887e-05,
+      "loss": 0.9955101609230042,
+      "step": 336
+    },
+    {
+      "epoch": 0.1458468176914779,
+      "grad_norm": 0.6427345275878906,
+      "learning_rate": 6.05570530098832e-05,
+      "loss": 0.9863013625144958,
+      "step": 338
+    },
+    {
+      "epoch": 0.14670981661272922,
+      "grad_norm": 0.7464537024497986,
+      "learning_rate": 6.091644204851752e-05,
+      "loss": 1.0682255029678345,
+      "step": 340
+    },
+    {
+      "epoch": 0.14757281553398058,
+      "grad_norm": 0.599926769733429,
+      "learning_rate": 6.127583108715184e-05,
+      "loss": 1.034083366394043,
+      "step": 342
+    },
+    {
+      "epoch": 0.14843581445523193,
+      "grad_norm": 0.6320257186889648,
+      "learning_rate": 6.163522012578616e-05,
+      "loss": 1.0776089429855347,
+      "step": 344
+    },
+    {
+      "epoch": 0.1492988133764833,
+      "grad_norm": 0.6565091013908386,
+      "learning_rate": 6.199460916442049e-05,
+      "loss": 1.0493087768554688,
+      "step": 346
+    },
+    {
+      "epoch": 0.15016181229773462,
+      "grad_norm": 0.6512171626091003,
+      "learning_rate": 6.23539982030548e-05,
+      "loss": 1.0469218492507935,
+      "step": 348
+    },
+    {
+      "epoch": 0.15102481121898598,
+      "grad_norm": 0.8487282991409302,
+      "learning_rate": 6.271338724168913e-05,
+      "loss": 1.0985081195831299,
+      "step": 350
+    },
+    {
+      "epoch": 0.15188781014023733,
+      "grad_norm": 0.6718961596488953,
+      "learning_rate": 6.307277628032345e-05,
+      "loss": 1.0714176893234253,
+      "step": 352
+    },
+    {
+      "epoch": 0.15275080906148866,
+      "grad_norm": 0.8175088167190552,
+      "learning_rate": 6.343216531895777e-05,
+      "loss": 1.0599322319030762,
+      "step": 354
+    },
+    {
+      "epoch": 0.15361380798274002,
+      "grad_norm": 0.6359215378761292,
+      "learning_rate": 6.37915543575921e-05,
+      "loss": 0.9268131256103516,
+      "step": 356
+    },
+    {
+      "epoch": 0.15447680690399138,
+      "grad_norm": 0.6423866748809814,
+      "learning_rate": 6.415094339622641e-05,
+      "loss": 0.9838354587554932,
+      "step": 358
+    },
+    {
+      "epoch": 0.1553398058252427,
+      "grad_norm": 0.6496716737747192,
+      "learning_rate": 6.451033243486074e-05,
+      "loss": 1.048566460609436,
+      "step": 360
+    },
+    {
+      "epoch": 0.15620280474649406,
+      "grad_norm": 0.6536920666694641,
+      "learning_rate": 6.486972147349506e-05,
+      "loss": 1.0910537242889404,
+      "step": 362
+    },
+    {
+      "epoch": 0.15706580366774542,
+      "grad_norm": 0.5832068920135498,
+      "learning_rate": 6.522911051212939e-05,
+      "loss": 0.9971448183059692,
+      "step": 364
+    },
+    {
+      "epoch": 0.15792880258899678,
+      "grad_norm": 0.6647719144821167,
+      "learning_rate": 6.558849955076371e-05,
+      "loss": 1.0496708154678345,
+      "step": 366
+    },
+    {
+      "epoch": 0.1587918015102481,
+      "grad_norm": 0.623252809047699,
+      "learning_rate": 6.594788858939802e-05,
+      "loss": 0.955894410610199,
+      "step": 368
+    },
+    {
+      "epoch": 0.15965480043149946,
+      "grad_norm": 0.6311860084533691,
+      "learning_rate": 6.630727762803235e-05,
+      "loss": 1.1304032802581787,
+      "step": 370
+    },
+    {
+      "epoch": 0.16051779935275082,
+      "grad_norm": 0.5306481122970581,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 0.8746405243873596,
+      "step": 372
+    },
+    {
+      "epoch": 0.16138079827400215,
+      "grad_norm": 0.6249631643295288,
+      "learning_rate": 6.7026055705301e-05,
+      "loss": 0.9104986786842346,
+      "step": 374
+    },
+    {
+      "epoch": 0.1622437971952535,
+      "grad_norm": 0.6243219971656799,
+      "learning_rate": 6.738544474393532e-05,
+      "loss": 1.043666124343872,
+      "step": 376
+    },
+    {
+      "epoch": 0.16310679611650486,
+      "grad_norm": 0.6833282113075256,
+      "learning_rate": 6.774483378256963e-05,
+      "loss": 1.0504906177520752,
+      "step": 378
+    },
+    {
+      "epoch": 0.16396979503775622,
+      "grad_norm": 0.7124452590942383,
+      "learning_rate": 6.810422282120395e-05,
+      "loss": 1.0608166456222534,
+      "step": 380
+    },
+    {
+      "epoch": 0.16483279395900755,
+      "grad_norm": 0.7520908117294312,
+      "learning_rate": 6.846361185983828e-05,
+      "loss": 1.1653732061386108,
+      "step": 382
+    },
+    {
+      "epoch": 0.1656957928802589,
+      "grad_norm": 0.7121814489364624,
+      "learning_rate": 6.88230008984726e-05,
+      "loss": 1.0626367330551147,
+      "step": 384
+    },
+    {
+      "epoch": 0.16655879180151026,
+      "grad_norm": 0.6825008988380432,
+      "learning_rate": 6.918238993710691e-05,
+      "loss": 1.012121319770813,
+      "step": 386
+    },
+    {
+      "epoch": 0.1674217907227616,
+      "grad_norm": 0.4922940135002136,
+      "learning_rate": 6.954177897574124e-05,
+      "loss": 1.0576211214065552,
+      "step": 388
+    },
+    {
+      "epoch": 0.16828478964401294,
+      "grad_norm": 0.6122089624404907,
+      "learning_rate": 6.990116801437556e-05,
+      "loss": 1.03916597366333,
+      "step": 390
+    },
+    {
+      "epoch": 0.1691477885652643,
+      "grad_norm": 0.6348981261253357,
+      "learning_rate": 7.026055705300989e-05,
+      "loss": 1.17647123336792,
+      "step": 392
+    },
+    {
+      "epoch": 0.17001078748651563,
+      "grad_norm": 0.6205878257751465,
+      "learning_rate": 7.06199460916442e-05,
+      "loss": 0.9095983505249023,
+      "step": 394
+    },
+    {
+      "epoch": 0.170873786407767,
+      "grad_norm": 0.61506187915802,
+      "learning_rate": 7.097933513027853e-05,
+      "loss": 1.082506775856018,
+      "step": 396
+    },
+    {
+      "epoch": 0.17173678532901834,
+      "grad_norm": 0.6481751799583435,
+      "learning_rate": 7.133872416891285e-05,
+      "loss": 1.0716280937194824,
+      "step": 398
+    },
+    {
+      "epoch": 0.1725997842502697,
+      "grad_norm": 0.4871014952659607,
+      "learning_rate": 7.169811320754717e-05,
+      "loss": 0.9616814851760864,
+      "step": 400
+    },
+    {
+      "epoch": 0.1725997842502697,
+      "eval_loss": 1.0649415254592896,
+      "eval_runtime": 668.6025,
+      "eval_samples_per_second": 3.081,
+      "eval_steps_per_second": 3.081,
+      "step": 400
+    },
+    {
+      "epoch": 0.17346278317152103,
+      "grad_norm": 0.5680040121078491,
+      "learning_rate": 7.20575022461815e-05,
+      "loss": 1.0475050210952759,
+      "step": 402
+    },
+    {
+      "epoch": 0.17432578209277239,
+      "grad_norm": 0.6417813897132874,
+      "learning_rate": 7.241689128481581e-05,
+      "loss": 0.9851161241531372,
+      "step": 404
+    },
+    {
+      "epoch": 0.17518878101402374,
+      "grad_norm": 0.6600468158721924,
+      "learning_rate": 7.277628032345014e-05,
+      "loss": 1.013339638710022,
+      "step": 406
+    },
+    {
+      "epoch": 0.17605177993527507,
+      "grad_norm": 0.6733932495117188,
+      "learning_rate": 7.313566936208446e-05,
+      "loss": 0.9346804022789001,
+      "step": 408
+    },
+    {
+      "epoch": 0.17691477885652643,
+      "grad_norm": 0.6812151074409485,
+      "learning_rate": 7.349505840071879e-05,
+      "loss": 0.9890368580818176,
+      "step": 410
+    },
+    {
+      "epoch": 0.17777777777777778,
+      "grad_norm": 0.6380394697189331,
+      "learning_rate": 7.385444743935311e-05,
+      "loss": 0.8787848949432373,
+      "step": 412
+    },
+    {
+      "epoch": 0.1786407766990291,
+      "grad_norm": 0.6004905700683594,
+      "learning_rate": 7.421383647798742e-05,
+      "loss": 1.0235728025436401,
+      "step": 414
+    },
+    {
+      "epoch": 0.17950377562028047,
+      "grad_norm": 0.6569193005561829,
+      "learning_rate": 7.457322551662175e-05,
+      "loss": 0.9972385168075562,
+      "step": 416
+    },
+    {
+      "epoch": 0.18036677454153183,
+      "grad_norm": 0.6761631369590759,
+      "learning_rate": 7.493261455525607e-05,
+      "loss": 0.9593698382377625,
+      "step": 418
+    },
+    {
+      "epoch": 0.18122977346278318,
+      "grad_norm": 0.7328561544418335,
+      "learning_rate": 7.529200359389039e-05,
+      "loss": 1.0426853895187378,
+      "step": 420
+    },
+    {
+      "epoch": 0.1820927723840345,
+      "grad_norm": 0.6256070137023926,
+      "learning_rate": 7.56513926325247e-05,
+      "loss": 0.9608182311058044,
+      "step": 422
+    },
+    {
+      "epoch": 0.18295577130528587,
+      "grad_norm": 1.2549844980239868,
+      "learning_rate": 7.601078167115903e-05,
+      "loss": 1.0162668228149414,
+      "step": 424
+    },
+    {
+      "epoch": 0.18381877022653723,
+      "grad_norm": 0.6751510500907898,
+      "learning_rate": 7.637017070979335e-05,
+      "loss": 1.130725383758545,
+      "step": 426
+    },
+    {
+      "epoch": 0.18468176914778855,
+      "grad_norm": 0.7029808163642883,
+      "learning_rate": 7.672955974842768e-05,
+      "loss": 1.0384817123413086,
+      "step": 428
+    },
+    {
+      "epoch": 0.1855447680690399,
+      "grad_norm": 0.644353449344635,
+      "learning_rate": 7.7088948787062e-05,
+      "loss": 1.017020344734192,
+      "step": 430
+    },
+    {
+      "epoch": 0.18640776699029127,
+      "grad_norm": 0.6784916520118713,
+      "learning_rate": 7.744833782569631e-05,
+      "loss": 1.005354404449463,
+      "step": 432
+    },
+    {
+      "epoch": 0.1872707659115426,
+      "grad_norm": 0.5989449620246887,
+      "learning_rate": 7.780772686433064e-05,
+      "loss": 1.026848316192627,
+      "step": 434
+    },
+    {
+      "epoch": 0.18813376483279395,
+      "grad_norm": 0.6502639651298523,
+      "learning_rate": 7.816711590296496e-05,
+      "loss": 0.9891080856323242,
+      "step": 436
+    },
+    {
+      "epoch": 0.1889967637540453,
+      "grad_norm": 0.6176205277442932,
+      "learning_rate": 7.852650494159929e-05,
+      "loss": 0.966316819190979,
+      "step": 438
+    },
+    {
+      "epoch": 0.18985976267529667,
+      "grad_norm": 0.6801626086235046,
+      "learning_rate": 7.88858939802336e-05,
+      "loss": 1.123063087463379,
+      "step": 440
+    },
+    {
+      "epoch": 0.190722761596548,
+      "grad_norm": 0.6718618273735046,
+      "learning_rate": 7.924528301886794e-05,
+      "loss": 1.0467073917388916,
+      "step": 442
+    },
+    {
+      "epoch": 0.19158576051779935,
+      "grad_norm": 0.6761009097099304,
+      "learning_rate": 7.960467205750225e-05,
+      "loss": 1.0952889919281006,
+      "step": 444
+    },
+    {
+      "epoch": 0.1924487594390507,
+      "grad_norm": 0.6356327533721924,
+      "learning_rate": 7.996406109613657e-05,
+      "loss": 0.954807698726654,
+      "step": 446
+    },
+    {
+      "epoch": 0.19331175836030204,
+      "grad_norm": 0.6798669695854187,
+      "learning_rate": 8.03234501347709e-05,
+      "loss": 0.9941422343254089,
+      "step": 448
+    },
+    {
+      "epoch": 0.1941747572815534,
+      "grad_norm": 0.6511302590370178,
+      "learning_rate": 8.068283917340521e-05,
+      "loss": 1.0351495742797852,
+      "step": 450
+    },
+    {
+      "epoch": 0.19503775620280475,
+      "grad_norm": 0.6061258912086487,
+      "learning_rate": 8.104222821203954e-05,
+      "loss": 1.00546133518219,
+      "step": 452
+    },
+    {
+      "epoch": 0.1959007551240561,
+      "grad_norm": 0.6278533935546875,
+      "learning_rate": 8.140161725067386e-05,
+      "loss": 1.0778460502624512,
+      "step": 454
+    },
+    {
+      "epoch": 0.19676375404530744,
+      "grad_norm": 0.6866298317909241,
+      "learning_rate": 8.176100628930818e-05,
+      "loss": 1.0344486236572266,
+      "step": 456
+    },
+    {
+      "epoch": 0.1976267529665588,
+      "grad_norm": 0.7338075041770935,
+      "learning_rate": 8.212039532794251e-05,
+      "loss": 1.0663033723831177,
+      "step": 458
+    },
+    {
+      "epoch": 0.19848975188781015,
+      "grad_norm": 0.6811459064483643,
+      "learning_rate": 8.247978436657682e-05,
+      "loss": 0.9665339589118958,
+      "step": 460
+    },
+    {
+      "epoch": 0.19935275080906148,
+      "grad_norm": 0.6779627799987793,
+      "learning_rate": 8.283917340521114e-05,
+      "loss": 1.024712324142456,
+      "step": 462
+    },
+    {
+      "epoch": 0.20021574973031284,
+      "grad_norm": 0.6486892700195312,
+      "learning_rate": 8.319856244384546e-05,
+      "loss": 0.9699305295944214,
+      "step": 464
+    },
+    {
+      "epoch": 0.2010787486515642,
+      "grad_norm": 0.7022278308868408,
+      "learning_rate": 8.355795148247979e-05,
+      "loss": 0.9540432095527649,
+      "step": 466
+    },
+    {
+      "epoch": 0.20194174757281552,
+      "grad_norm": 0.5922990441322327,
+      "learning_rate": 8.39173405211141e-05,
+      "loss": 0.9253339767456055,
+      "step": 468
+    },
+    {
+      "epoch": 0.20280474649406688,
+      "grad_norm": 0.7076792120933533,
+      "learning_rate": 8.427672955974843e-05,
+      "loss": 0.9987741112709045,
+      "step": 470
+    },
+    {
+      "epoch": 0.20366774541531824,
+      "grad_norm": 0.6491380333900452,
+      "learning_rate": 8.463611859838275e-05,
+      "loss": 1.0249329805374146,
+      "step": 472
+    },
+    {
+      "epoch": 0.2045307443365696,
+      "grad_norm": 0.6784211993217468,
+      "learning_rate": 8.499550763701708e-05,
+      "loss": 1.0577133893966675,
+      "step": 474
+    },
+    {
+      "epoch": 0.20539374325782092,
+      "grad_norm": 0.6453303694725037,
+      "learning_rate": 8.53548966756514e-05,
+      "loss": 1.1312458515167236,
+      "step": 476
+    },
+    {
+      "epoch": 0.20625674217907228,
+      "grad_norm": 0.7431377172470093,
+      "learning_rate": 8.571428571428571e-05,
+      "loss": 1.0592451095581055,
+      "step": 478
+    },
+    {
+      "epoch": 0.20711974110032363,
+      "grad_norm": 0.6097649931907654,
+      "learning_rate": 8.607367475292004e-05,
+      "loss": 0.9337235689163208,
+      "step": 480
+    },
+    {
+      "epoch": 0.20798274002157496,
+      "grad_norm": 0.5693124532699585,
+      "learning_rate": 8.643306379155436e-05,
+      "loss": 0.9088928699493408,
+      "step": 482
+    },
+    {
+      "epoch": 0.20884573894282632,
+      "grad_norm": 0.7377229332923889,
+      "learning_rate": 8.679245283018869e-05,
+      "loss": 1.0729358196258545,
+      "step": 484
+    },
+    {
+      "epoch": 0.20970873786407768,
+      "grad_norm": 0.7399470210075378,
+      "learning_rate": 8.7151841868823e-05,
+      "loss": 1.0428457260131836,
+      "step": 486
+    },
+    {
+      "epoch": 0.210571736785329,
+      "grad_norm": 0.677052915096283,
+      "learning_rate": 8.751123090745734e-05,
+      "loss": 0.9940266013145447,
+      "step": 488
+    },
+    {
+      "epoch": 0.21143473570658036,
+      "grad_norm": 0.7126721739768982,
+      "learning_rate": 8.787061994609165e-05,
+      "loss": 1.011808156967163,
+      "step": 490
+    },
+    {
+      "epoch": 0.21229773462783172,
+      "grad_norm": 0.6663792729377747,
+      "learning_rate": 8.823000898472597e-05,
+      "loss": 1.0054185390472412,
+      "step": 492
+    },
+    {
+      "epoch": 0.21316073354908308,
+      "grad_norm": 0.6661092042922974,
+      "learning_rate": 8.85893980233603e-05,
+      "loss": 1.0167138576507568,
+      "step": 494
+    },
+    {
+      "epoch": 0.2140237324703344,
+      "grad_norm": 0.6975740194320679,
+      "learning_rate": 8.894878706199461e-05,
+      "loss": 1.1470818519592285,
+      "step": 496
+    },
+    {
+      "epoch": 0.21488673139158576,
+      "grad_norm": 0.6594390869140625,
+      "learning_rate": 8.930817610062893e-05,
+      "loss": 0.9619631171226501,
+      "step": 498
+    },
+    {
+      "epoch": 0.21574973031283712,
+      "grad_norm": 0.7216679453849792,
+      "learning_rate": 8.966756513926325e-05,
+      "loss": 0.9971368312835693,
+      "step": 500
+    },
+    {
+      "epoch": 0.21574973031283712,
+      "eval_loss": 1.0417571067810059,
+      "eval_runtime": 659.3112,
+      "eval_samples_per_second": 3.124,
+      "eval_steps_per_second": 3.124,
+      "step": 500
+    },
+    {
+      "epoch": 0.21661272923408845,
+      "grad_norm": 0.6188210844993591,
+      "learning_rate": 9.002695417789758e-05,
+      "loss": 1.0307213068008423,
+      "step": 502
+    },
+    {
+      "epoch": 0.2174757281553398,
+      "grad_norm": 0.6716445088386536,
+      "learning_rate": 9.03863432165319e-05,
+      "loss": 1.0188794136047363,
+      "step": 504
+    },
+    {
+      "epoch": 0.21833872707659116,
+      "grad_norm": 0.6790863275527954,
+      "learning_rate": 9.074573225516622e-05,
+      "loss": 0.9764845967292786,
+      "step": 506
+    },
+    {
+      "epoch": 0.2192017259978425,
+      "grad_norm": 0.6764960289001465,
+      "learning_rate": 9.110512129380054e-05,
+      "loss": 0.948829174041748,
+      "step": 508
+    },
+    {
+      "epoch": 0.22006472491909385,
+      "grad_norm": 0.6210965514183044,
+      "learning_rate": 9.146451033243486e-05,
+      "loss": 1.008013129234314,
+      "step": 510
+    },
+    {
+      "epoch": 0.2209277238403452,
+      "grad_norm": 0.7739297747612,
+      "learning_rate": 9.182389937106919e-05,
+      "loss": 1.1662557125091553,
+      "step": 512
+    },
+    {
+      "epoch": 0.22179072276159656,
+      "grad_norm": 0.7055562138557434,
+      "learning_rate": 9.21832884097035e-05,
+      "loss": 1.0325161218643188,
+      "step": 514
+    },
+    {
+      "epoch": 0.2226537216828479,
+      "grad_norm": 0.6079210042953491,
+      "learning_rate": 9.254267744833783e-05,
+      "loss": 1.00056791305542,
+      "step": 516
+    },
+    {
+      "epoch": 0.22351672060409924,
+      "grad_norm": 0.5974318981170654,
+      "learning_rate": 9.290206648697215e-05,
+      "loss": 0.9422364234924316,
+      "step": 518
+    },
+    {
+      "epoch": 0.2243797195253506,
+      "grad_norm": 0.5963430404663086,
+      "learning_rate": 9.326145552560648e-05,
+      "loss": 0.936336100101471,
+      "step": 520
+    },
+    {
+      "epoch": 0.22524271844660193,
+      "grad_norm": 0.6823658347129822,
+      "learning_rate": 9.36208445642408e-05,
+      "loss": 1.0538607835769653,
+      "step": 522
+    },
+    {
+      "epoch": 0.2261057173678533,
+      "grad_norm": 0.6409855484962463,
+      "learning_rate": 9.398023360287511e-05,
+      "loss": 1.0483653545379639,
+      "step": 524
+    },
+    {
+      "epoch": 0.22696871628910464,
+      "grad_norm": 0.6867254376411438,
+      "learning_rate": 9.433962264150944e-05,
+      "loss": 0.9668049812316895,
+      "step": 526
+    },
+    {
+      "epoch": 0.227831715210356,
+      "grad_norm": 0.5690792798995972,
+      "learning_rate": 9.469901168014376e-05,
+      "loss": 1.008763313293457,
+      "step": 528
+    },
+    {
+      "epoch": 0.22869471413160733,
+      "grad_norm": 0.5964897274971008,
+      "learning_rate": 9.505840071877809e-05,
+      "loss": 1.0816441774368286,
+      "step": 530
+    },
+    {
+      "epoch": 0.2295577130528587,
+      "grad_norm": 0.627419114112854,
+      "learning_rate": 9.54177897574124e-05,
+      "loss": 0.9265700578689575,
+      "step": 532
+    },
+    {
+      "epoch": 0.23042071197411004,
+      "grad_norm": 0.5862151980400085,
+      "learning_rate": 9.577717879604674e-05,
+      "loss": 0.9804646372795105,
+      "step": 534
+    },
+    {
+      "epoch": 0.23128371089536137,
+      "grad_norm": 0.5573718547821045,
+      "learning_rate": 9.613656783468105e-05,
+      "loss": 0.9627988934516907,
+      "step": 536
+    },
+    {
+      "epoch": 0.23214670981661273,
+      "grad_norm": 0.6705166101455688,
+      "learning_rate": 9.649595687331537e-05,
+      "loss": 1.0012824535369873,
+      "step": 538
+    },
+    {
+      "epoch": 0.23300970873786409,
+      "grad_norm": 0.6251236796379089,
+      "learning_rate": 9.685534591194969e-05,
+      "loss": 0.9568162560462952,
+      "step": 540
+    },
+    {
+      "epoch": 0.23387270765911541,
+      "grad_norm": 0.6466493010520935,
+      "learning_rate": 9.7214734950584e-05,
+      "loss": 1.031549334526062,
+      "step": 542
+    },
+    {
+      "epoch": 0.23473570658036677,
+      "grad_norm": 0.5183866024017334,
+      "learning_rate": 9.757412398921833e-05,
+      "loss": 0.8603643774986267,
+      "step": 544
+    },
+    {
+      "epoch": 0.23559870550161813,
+      "grad_norm": 0.6725775599479675,
+      "learning_rate": 9.793351302785265e-05,
+      "loss": 1.0365077257156372,
+      "step": 546
+    },
+    {
+      "epoch": 0.23646170442286948,
+      "grad_norm": 0.5972357988357544,
+      "learning_rate": 9.829290206648698e-05,
+      "loss": 0.9304701089859009,
+      "step": 548
+    },
+    {
+      "epoch": 0.2373247033441208,
+      "grad_norm": 0.5319957733154297,
+      "learning_rate": 9.86522911051213e-05,
+      "loss": 0.9575805068016052,
+      "step": 550
+    },
+    {
+      "epoch": 0.23818770226537217,
+      "grad_norm": 0.6502835750579834,
+      "learning_rate": 9.901168014375562e-05,
+      "loss": 1.0307214260101318,
+      "step": 552
+    },
+    {
+      "epoch": 0.23905070118662353,
+      "grad_norm": 0.6734047532081604,
+      "learning_rate": 9.937106918238994e-05,
+      "loss": 1.05185067653656,
+      "step": 554
+    },
+    {
+      "epoch": 0.23991370010787486,
+      "grad_norm": 0.5667978525161743,
+      "learning_rate": 9.973045822102426e-05,
+      "loss": 1.0190176963806152,
+      "step": 556
+    },
+    {
+      "epoch": 0.2407766990291262,
+      "grad_norm": 0.6370418667793274,
+      "learning_rate": 0.00010008984725965857,
+      "loss": 1.076182246208191,
+      "step": 558
+    },
+    {
+      "epoch": 0.24163969795037757,
+      "grad_norm": 0.689719021320343,
+      "learning_rate": 0.0001004492362982929,
+      "loss": 1.0408724546432495,
+      "step": 560
+    },
+    {
+      "epoch": 0.2425026968716289,
+      "grad_norm": 0.6304254531860352,
+      "learning_rate": 0.00010080862533692723,
+      "loss": 0.9869902729988098,
+      "step": 562
+    },
+    {
+      "epoch": 0.24336569579288025,
+      "grad_norm": 0.6797420382499695,
+      "learning_rate": 0.00010116801437556156,
+      "loss": 1.0198370218276978,
+      "step": 564
+    },
+    {
+      "epoch": 0.2442286947141316,
+      "grad_norm": 0.5993657112121582,
+      "learning_rate": 0.00010152740341419587,
+      "loss": 0.9947441816329956,
+      "step": 566
+    },
+    {
+      "epoch": 0.24509169363538297,
+      "grad_norm": 0.6369836330413818,
+      "learning_rate": 0.0001018867924528302,
+      "loss": 0.9722896814346313,
+      "step": 568
+    },
+    {
+      "epoch": 0.2459546925566343,
+      "grad_norm": 0.6942457556724548,
+      "learning_rate": 0.00010224618149146453,
+      "loss": 0.9716570973396301,
+      "step": 570
+    },
+    {
+      "epoch": 0.24681769147788565,
+      "grad_norm": 0.5403370261192322,
+      "learning_rate": 0.00010260557053009883,
+      "loss": 0.9797524213790894,
+      "step": 572
+    },
+    {
+      "epoch": 0.247680690399137,
+      "grad_norm": 0.5207529067993164,
+      "learning_rate": 0.00010296495956873316,
+      "loss": 0.985367476940155,
+      "step": 574
+    },
+    {
+      "epoch": 0.24854368932038834,
+      "grad_norm": 0.6751103401184082,
+      "learning_rate": 0.00010332434860736748,
+      "loss": 1.075042724609375,
+      "step": 576
+    },
+    {
+      "epoch": 0.2494066882416397,
+      "grad_norm": 0.565331220626831,
+      "learning_rate": 0.0001036837376460018,
+      "loss": 0.9273878335952759,
+      "step": 578
+    },
+    {
+      "epoch": 0.25026968716289105,
+      "grad_norm": 0.6858948469161987,
+      "learning_rate": 0.00010404312668463612,
+      "loss": 0.9872279763221741,
+      "step": 580
+    },
+    {
+      "epoch": 0.2511326860841424,
+      "grad_norm": 0.7091426253318787,
+      "learning_rate": 0.00010440251572327044,
+      "loss": 1.0038671493530273,
+      "step": 582
+    },
+    {
+      "epoch": 0.25199568500539377,
+      "grad_norm": 0.6493771076202393,
+      "learning_rate": 0.00010476190476190477,
+      "loss": 1.0109868049621582,
+      "step": 584
+    },
+    {
+      "epoch": 0.25285868392664507,
+      "grad_norm": 0.6107586622238159,
+      "learning_rate": 0.00010512129380053907,
+      "loss": 1.0020402669906616,
+      "step": 586
+    },
+    {
+      "epoch": 0.2537216828478964,
+      "grad_norm": 0.6878048181533813,
+      "learning_rate": 0.0001054806828391734,
+      "loss": 0.961039662361145,
+      "step": 588
+    },
+    {
+      "epoch": 0.2545846817691478,
+      "grad_norm": 0.664034903049469,
+      "learning_rate": 0.00010584007187780773,
+      "loss": 0.9725209474563599,
+      "step": 590
+    },
+    {
+      "epoch": 0.25544768069039914,
+      "grad_norm": 0.6399680376052856,
+      "learning_rate": 0.00010619946091644206,
+      "loss": 0.9907437562942505,
+      "step": 592
+    },
+    {
+      "epoch": 0.2563106796116505,
+      "grad_norm": 0.6163286566734314,
+      "learning_rate": 0.00010655884995507636,
+      "loss": 0.9650095701217651,
+      "step": 594
+    },
+    {
+      "epoch": 0.25717367853290185,
+      "grad_norm": 0.6008322238922119,
+      "learning_rate": 0.0001069182389937107,
+      "loss": 1.0102758407592773,
+      "step": 596
+    },
+    {
+      "epoch": 0.2580366774541532,
+      "grad_norm": 0.6752071380615234,
+      "learning_rate": 0.00010727762803234502,
+      "loss": 0.9101885557174683,
+      "step": 598
+    },
+    {
+      "epoch": 0.2588996763754045,
+      "grad_norm": 0.6789175868034363,
+      "learning_rate": 0.00010763701707097935,
+      "loss": 1.0461398363113403,
+      "step": 600
+    },
+    {
+      "epoch": 0.2588996763754045,
+      "eval_loss": 1.021111011505127,
+      "eval_runtime": 648.1611,
+      "eval_samples_per_second": 3.178,
+      "eval_steps_per_second": 3.178,
+      "step": 600
+    },
+    {
+      "epoch": 0.25976267529665586,
+      "grad_norm": 0.5660730600357056,
+      "learning_rate": 0.00010799640610961366,
+      "loss": 0.9582418203353882,
+      "step": 602
+    },
+    {
+      "epoch": 0.2606256742179072,
+      "grad_norm": 0.6726544499397278,
+      "learning_rate": 0.00010835579514824799,
+      "loss": 1.0763746500015259,
+      "step": 604
+    },
+    {
+      "epoch": 0.2614886731391586,
+      "grad_norm": 0.6068508625030518,
+      "learning_rate": 0.00010871518418688232,
+      "loss": 1.0432032346725464,
+      "step": 606
+    },
+    {
+      "epoch": 0.26235167206040994,
+      "grad_norm": 0.5731637477874756,
+      "learning_rate": 0.00010907457322551662,
+      "loss": 0.9830516576766968,
+      "step": 608
+    },
+    {
+      "epoch": 0.2632146709816613,
+      "grad_norm": 0.6777567267417908,
+      "learning_rate": 0.00010943396226415095,
+      "loss": 1.0442042350769043,
+      "step": 610
+    },
+    {
+      "epoch": 0.26407766990291265,
+      "grad_norm": 0.6372506618499756,
+      "learning_rate": 0.00010979335130278528,
+      "loss": 1.0001944303512573,
+      "step": 612
+    },
+    {
+      "epoch": 0.26494066882416395,
+      "grad_norm": 0.6606221795082092,
+      "learning_rate": 0.0001101527403414196,
+      "loss": 1.035884141921997,
+      "step": 614
+    },
+    {
+      "epoch": 0.2658036677454153,
+      "grad_norm": 0.6083229780197144,
+      "learning_rate": 0.00011051212938005391,
+      "loss": 0.9403397440910339,
+      "step": 616
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 0.6318517923355103,
+      "learning_rate": 0.00011087151841868823,
+      "loss": 0.9274454116821289,
+      "step": 618
+    },
+    {
+      "epoch": 0.267529665587918,
+      "grad_norm": 0.628190279006958,
+      "learning_rate": 0.00011123090745732256,
+      "loss": 0.9883754253387451,
+      "step": 620
+    },
+    {
+      "epoch": 0.2683926645091694,
+      "grad_norm": 0.5961961150169373,
+      "learning_rate": 0.00011159029649595688,
+      "loss": 1.0317535400390625,
+      "step": 622
+    },
+    {
+      "epoch": 0.26925566343042073,
+      "grad_norm": 0.5995341539382935,
+      "learning_rate": 0.00011194968553459119,
+      "loss": 0.9776126742362976,
+      "step": 624
+    },
+    {
+      "epoch": 0.27011866235167203,
+      "grad_norm": 0.6639334559440613,
+      "learning_rate": 0.00011230907457322552,
+      "loss": 1.0112378597259521,
+      "step": 626
+    },
+    {
+      "epoch": 0.2709816612729234,
+      "grad_norm": 0.6348621249198914,
+      "learning_rate": 0.00011266846361185985,
+      "loss": 1.0553804636001587,
+      "step": 628
+    },
+    {
+      "epoch": 0.27184466019417475,
+      "grad_norm": 0.5929805040359497,
+      "learning_rate": 0.00011302785265049416,
+      "loss": 1.025888442993164,
+      "step": 630
+    },
+    {
+      "epoch": 0.2727076591154261,
+      "grad_norm": 0.6052366495132446,
+      "learning_rate": 0.00011338724168912849,
+      "loss": 1.02956223487854,
+      "step": 632
+    },
+    {
+      "epoch": 0.27357065803667746,
+      "grad_norm": 0.6494882106781006,
+      "learning_rate": 0.00011374663072776282,
+      "loss": 0.989752471446991,
+      "step": 634
+    },
+    {
+      "epoch": 0.2744336569579288,
+      "grad_norm": 0.6005767583847046,
+      "learning_rate": 0.00011410601976639712,
+      "loss": 1.0030683279037476,
+      "step": 636
+    },
+    {
+      "epoch": 0.2752966558791802,
+      "grad_norm": 0.6478356122970581,
+      "learning_rate": 0.00011446540880503145,
+      "loss": 1.002000093460083,
+      "step": 638
+    },
+    {
+      "epoch": 0.2761596548004315,
+      "grad_norm": 0.5804725289344788,
+      "learning_rate": 0.00011482479784366578,
+      "loss": 0.9807654023170471,
+      "step": 640
+    },
+    {
+      "epoch": 0.27702265372168283,
+      "grad_norm": 0.632530689239502,
+      "learning_rate": 0.00011518418688230011,
+      "loss": 0.9841892123222351,
+      "step": 642
+    },
+    {
+      "epoch": 0.2778856526429342,
+      "grad_norm": 0.5340113639831543,
+      "learning_rate": 0.00011554357592093441,
+      "loss": 0.8902478814125061,
+      "step": 644
+    },
+    {
+      "epoch": 0.27874865156418555,
+      "grad_norm": 0.5901665091514587,
+      "learning_rate": 0.00011590296495956874,
+      "loss": 0.9019404649734497,
+      "step": 646
+    },
+    {
+      "epoch": 0.2796116504854369,
+      "grad_norm": 0.666589617729187,
+      "learning_rate": 0.00011626235399820307,
+      "loss": 0.9384423494338989,
+      "step": 648
+    },
+    {
+      "epoch": 0.28047464940668826,
+      "grad_norm": 0.7000334858894348,
+      "learning_rate": 0.00011662174303683737,
+      "loss": 1.0666629076004028,
+      "step": 650
+    },
+    {
+      "epoch": 0.2813376483279396,
+      "grad_norm": 0.663663923740387,
+      "learning_rate": 0.0001169811320754717,
+      "loss": 1.000019907951355,
+      "step": 652
+    },
+    {
+      "epoch": 0.2822006472491909,
+      "grad_norm": 0.6097694039344788,
+      "learning_rate": 0.00011734052111410603,
+      "loss": 0.9450293183326721,
+      "step": 654
+    },
+    {
+      "epoch": 0.2830636461704423,
+      "grad_norm": 0.6130967140197754,
+      "learning_rate": 0.00011769991015274035,
+      "loss": 0.9480894207954407,
+      "step": 656
+    },
+    {
+      "epoch": 0.28392664509169363,
+      "grad_norm": 0.7091249227523804,
+      "learning_rate": 0.00011805929919137467,
+      "loss": 1.1377143859863281,
+      "step": 658
+    },
+    {
+      "epoch": 0.284789644012945,
+      "grad_norm": 0.6556766629219055,
+      "learning_rate": 0.00011841868823000898,
+      "loss": 0.9421243667602539,
+      "step": 660
+    },
+    {
+      "epoch": 0.28565264293419634,
+      "grad_norm": 0.6682968735694885,
+      "learning_rate": 0.00011877807726864331,
+      "loss": 0.9726828336715698,
+      "step": 662
+    },
+    {
+      "epoch": 0.2865156418554477,
+      "grad_norm": 0.5224708914756775,
+      "learning_rate": 0.00011913746630727762,
+      "loss": 0.8996511697769165,
+      "step": 664
+    },
+    {
+      "epoch": 0.287378640776699,
+      "grad_norm": 0.5914195775985718,
+      "learning_rate": 0.00011949685534591195,
+      "loss": 0.9679517149925232,
+      "step": 666
+    },
+    {
+      "epoch": 0.28824163969795036,
+      "grad_norm": 0.6175519824028015,
+      "learning_rate": 0.00011985624438454628,
+      "loss": 0.8743209838867188,
+      "step": 668
+    },
+    {
+      "epoch": 0.2891046386192017,
+      "grad_norm": 0.6019226312637329,
+      "learning_rate": 0.0001202156334231806,
+      "loss": 0.9741992354393005,
+      "step": 670
+    },
+    {
+      "epoch": 0.28996763754045307,
+      "grad_norm": 0.6080542206764221,
+      "learning_rate": 0.00012057502246181491,
+      "loss": 0.9516472816467285,
+      "step": 672
+    },
+    {
+      "epoch": 0.29083063646170443,
+      "grad_norm": 0.5885615944862366,
+      "learning_rate": 0.00012093441150044924,
+      "loss": 1.122761607170105,
+      "step": 674
+    },
+    {
+      "epoch": 0.2916936353829558,
+      "grad_norm": 0.6635209918022156,
+      "learning_rate": 0.00012129380053908357,
+      "loss": 1.0105189085006714,
+      "step": 676
+    },
+    {
+      "epoch": 0.29255663430420714,
+      "grad_norm": 0.5805009007453918,
+      "learning_rate": 0.0001216531895777179,
+      "loss": 0.906292200088501,
+      "step": 678
+    },
+    {
+      "epoch": 0.29341963322545844,
+      "grad_norm": 0.5980029702186584,
+      "learning_rate": 0.0001220125786163522,
+      "loss": 1.009568691253662,
+      "step": 680
+    },
+    {
+      "epoch": 0.2942826321467098,
+      "grad_norm": 0.6797705292701721,
+      "learning_rate": 0.00012237196765498652,
+      "loss": 1.0373667478561401,
+      "step": 682
+    },
+    {
+      "epoch": 0.29514563106796116,
+      "grad_norm": 0.6280547976493835,
+      "learning_rate": 0.00012273135669362085,
+      "loss": 0.9758188724517822,
+      "step": 684
+    },
+    {
+      "epoch": 0.2960086299892125,
+      "grad_norm": 0.511608898639679,
+      "learning_rate": 0.00012309074573225515,
+      "loss": 0.9111692905426025,
+      "step": 686
+    },
+    {
+      "epoch": 0.29687162891046387,
+      "grad_norm": 0.5781835317611694,
+      "learning_rate": 0.00012345013477088948,
+      "loss": 0.8865921497344971,
+      "step": 688
+    },
+    {
+      "epoch": 0.2977346278317152,
+      "grad_norm": 0.6514166593551636,
+      "learning_rate": 0.0001238095238095238,
+      "loss": 0.9768189191818237,
+      "step": 690
+    },
+    {
+      "epoch": 0.2985976267529666,
+      "grad_norm": 0.6109189987182617,
+      "learning_rate": 0.00012416891284815814,
+      "loss": 0.9991607069969177,
+      "step": 692
+    },
+    {
+      "epoch": 0.2994606256742179,
+      "grad_norm": 0.6598902344703674,
+      "learning_rate": 0.00012452830188679244,
+      "loss": 0.9548360705375671,
+      "step": 694
+    },
+    {
+      "epoch": 0.30032362459546924,
+      "grad_norm": 0.5633156895637512,
+      "learning_rate": 0.00012488769092542677,
+      "loss": 0.992988109588623,
+      "step": 696
+    },
+    {
+      "epoch": 0.3011866235167206,
+      "grad_norm": 0.6098802089691162,
+      "learning_rate": 0.0001252470799640611,
+      "loss": 0.9709890484809875,
+      "step": 698
+    },
+    {
+      "epoch": 0.30204962243797195,
+      "grad_norm": 0.6197102665901184,
+      "learning_rate": 0.0001256064690026954,
+      "loss": 1.018282175064087,
+      "step": 700
+    },
+    {
+      "epoch": 0.30204962243797195,
+      "eval_loss": 1.0030721426010132,
+      "eval_runtime": 655.4533,
+      "eval_samples_per_second": 3.143,
+      "eval_steps_per_second": 3.143,
+      "step": 700
+    },
+    {
+      "epoch": 0.3029126213592233,
+      "grad_norm": 0.5817480683326721,
+      "learning_rate": 0.00012596585804132974,
+      "loss": 0.9147283434867859,
+      "step": 702
+    },
+    {
+      "epoch": 0.30377562028047467,
+      "grad_norm": 0.5976696014404297,
+      "learning_rate": 0.00012632524707996407,
+      "loss": 0.9318362474441528,
+      "step": 704
+    },
+    {
+      "epoch": 0.304638619201726,
+      "grad_norm": 0.6389723420143127,
+      "learning_rate": 0.0001266846361185984,
+      "loss": 0.9500927925109863,
+      "step": 706
+    },
+    {
+      "epoch": 0.3055016181229773,
+      "grad_norm": 0.6485719084739685,
+      "learning_rate": 0.0001270440251572327,
+      "loss": 1.0271424055099487,
+      "step": 708
+    },
+    {
+      "epoch": 0.3063646170442287,
+      "grad_norm": 0.5802455544471741,
+      "learning_rate": 0.00012740341419586703,
+      "loss": 0.9781906008720398,
+      "step": 710
+    },
+    {
+      "epoch": 0.30722761596548004,
+      "grad_norm": 0.6359356641769409,
+      "learning_rate": 0.00012776280323450136,
+      "loss": 1.0195324420928955,
+      "step": 712
+    },
+    {
+      "epoch": 0.3080906148867314,
+      "grad_norm": 0.5975426435470581,
+      "learning_rate": 0.00012812219227313566,
+      "loss": 0.9250738024711609,
+      "step": 714
+    },
+    {
+      "epoch": 0.30895361380798275,
+      "grad_norm": 0.643110454082489,
+      "learning_rate": 0.00012848158131177,
+      "loss": 0.9888015985488892,
+      "step": 716
+    },
+    {
+      "epoch": 0.3098166127292341,
+      "grad_norm": 0.6043205261230469,
+      "learning_rate": 0.00012884097035040432,
+      "loss": 0.9709514379501343,
+      "step": 718
+    },
+    {
+      "epoch": 0.3106796116504854,
+      "grad_norm": 0.5687094926834106,
+      "learning_rate": 0.00012920035938903865,
+      "loss": 1.0272964239120483,
+      "step": 720
+    },
+    {
+      "epoch": 0.31154261057173677,
+      "grad_norm": 0.5688400864601135,
+      "learning_rate": 0.00012955974842767296,
+      "loss": 0.9370370507240295,
+      "step": 722
+    },
+    {
+      "epoch": 0.3124056094929881,
+      "grad_norm": 0.5610610246658325,
+      "learning_rate": 0.00012991913746630729,
+      "loss": 0.9535608291625977,
+      "step": 724
+    },
+    {
+      "epoch": 0.3132686084142395,
+      "grad_norm": 0.6338257193565369,
+      "learning_rate": 0.00013027852650494162,
+      "loss": 1.0188907384872437,
+      "step": 726
+    },
+    {
+      "epoch": 0.31413160733549084,
+      "grad_norm": 0.5365633368492126,
+      "learning_rate": 0.00013063791554357592,
+      "loss": 0.9253716468811035,
+      "step": 728
+    },
+    {
+      "epoch": 0.3149946062567422,
+      "grad_norm": 0.5599163174629211,
+      "learning_rate": 0.00013099730458221025,
+      "loss": 0.8941492438316345,
+      "step": 730
+    },
+    {
+      "epoch": 0.31585760517799355,
+      "grad_norm": 0.6059780716896057,
+      "learning_rate": 0.00013135669362084458,
+      "loss": 0.9831459522247314,
+      "step": 732
+    },
+    {
+      "epoch": 0.31672060409924485,
+      "grad_norm": 0.5596494078636169,
+      "learning_rate": 0.0001317160826594789,
+      "loss": 0.9332310557365417,
+      "step": 734
+    },
+    {
+      "epoch": 0.3175836030204962,
+      "grad_norm": 0.5618010759353638,
+      "learning_rate": 0.0001320754716981132,
+      "loss": 0.9082580208778381,
+      "step": 736
+    },
+    {
+      "epoch": 0.31844660194174756,
+      "grad_norm": 0.6412109732627869,
+      "learning_rate": 0.00013243486073674754,
+      "loss": 1.008690357208252,
+      "step": 738
+    },
+    {
+      "epoch": 0.3193096008629989,
+      "grad_norm": 0.5742355585098267,
+      "learning_rate": 0.00013279424977538187,
+      "loss": 0.9597798585891724,
+      "step": 740
+    },
+    {
+      "epoch": 0.3201725997842503,
+      "grad_norm": 0.6470226645469666,
+      "learning_rate": 0.00013315363881401617,
+      "loss": 0.989331841468811,
+      "step": 742
+    },
+    {
+      "epoch": 0.32103559870550163,
+      "grad_norm": 0.5598039031028748,
+      "learning_rate": 0.0001335130278526505,
+      "loss": 0.8677343130111694,
+      "step": 744
+    },
+    {
+      "epoch": 0.321898597626753,
+      "grad_norm": 0.5441372990608215,
+      "learning_rate": 0.00013387241689128483,
+      "loss": 0.9462730288505554,
+      "step": 746
+    },
+    {
+      "epoch": 0.3227615965480043,
+      "grad_norm": 0.5858626365661621,
+      "learning_rate": 0.00013423180592991916,
+      "loss": 0.994694173336029,
+      "step": 748
+    },
+    {
+      "epoch": 0.32362459546925565,
+      "grad_norm": 0.511372447013855,
+      "learning_rate": 0.00013459119496855347,
+      "loss": 0.9387269616127014,
+      "step": 750
+    },
+    {
+      "epoch": 0.324487594390507,
+      "grad_norm": 0.47798457741737366,
+      "learning_rate": 0.0001349505840071878,
+      "loss": 0.9473881721496582,
+      "step": 752
+    },
+    {
+      "epoch": 0.32535059331175836,
+      "grad_norm": 0.5907022953033447,
+      "learning_rate": 0.0001353099730458221,
+      "loss": 0.9375183582305908,
+      "step": 754
+    },
+    {
+      "epoch": 0.3262135922330097,
+      "grad_norm": 0.618733286857605,
+      "learning_rate": 0.00013566936208445643,
+      "loss": 1.028738260269165,
+      "step": 756
+    },
+    {
+      "epoch": 0.3270765911542611,
+      "grad_norm": 0.5234512090682983,
+      "learning_rate": 0.00013602875112309076,
+      "loss": 0.9420192241668701,
+      "step": 758
+    },
+    {
+      "epoch": 0.32793959007551243,
+      "grad_norm": 0.7036319971084595,
+      "learning_rate": 0.00013638814016172506,
+      "loss": 1.0252270698547363,
+      "step": 760
+    },
+    {
+      "epoch": 0.32880258899676373,
+      "grad_norm": 0.5543172359466553,
+      "learning_rate": 0.0001367475292003594,
+      "loss": 0.8453778028488159,
+      "step": 762
+    },
+    {
+      "epoch": 0.3296655879180151,
+      "grad_norm": 0.5438711643218994,
+      "learning_rate": 0.0001371069182389937,
+      "loss": 0.8659937977790833,
+      "step": 764
+    },
+    {
+      "epoch": 0.33052858683926645,
+      "grad_norm": 0.6390914916992188,
+      "learning_rate": 0.00013746630727762803,
+      "loss": 1.038142442703247,
+      "step": 766
+    },
+    {
+      "epoch": 0.3313915857605178,
+      "grad_norm": 0.50070720911026,
+      "learning_rate": 0.00013782569631626236,
+      "loss": 0.899932861328125,
+      "step": 768
+    },
+    {
+      "epoch": 0.33225458468176916,
+      "grad_norm": 0.5982286334037781,
+      "learning_rate": 0.00013818508535489669,
+      "loss": 0.9712884426116943,
+      "step": 770
+    },
+    {
+      "epoch": 0.3331175836030205,
+      "grad_norm": 0.6588822603225708,
+      "learning_rate": 0.000138544474393531,
+      "loss": 0.9427542686462402,
+      "step": 772
+    },
+    {
+      "epoch": 0.3339805825242718,
+      "grad_norm": 0.6022042632102966,
+      "learning_rate": 0.00013890386343216532,
+      "loss": 0.8961561918258667,
+      "step": 774
+    },
+    {
+      "epoch": 0.3348435814455232,
+      "grad_norm": 0.6595642566680908,
+      "learning_rate": 0.00013926325247079965,
+      "loss": 0.9525937438011169,
+      "step": 776
+    },
+    {
+      "epoch": 0.33570658036677453,
+      "grad_norm": 0.5210421681404114,
+      "learning_rate": 0.00013962264150943395,
+      "loss": 0.9218845367431641,
+      "step": 778
+    },
+    {
+      "epoch": 0.3365695792880259,
+      "grad_norm": 0.549669623374939,
+      "learning_rate": 0.00013998203054806828,
+      "loss": 0.877951443195343,
+      "step": 780
+    },
+    {
+      "epoch": 0.33743257820927725,
+      "grad_norm": 0.5360157489776611,
+      "learning_rate": 0.0001403414195867026,
+      "loss": 0.8670064210891724,
+      "step": 782
+    },
+    {
+      "epoch": 0.3382955771305286,
+      "grad_norm": 0.614734947681427,
+      "learning_rate": 0.00014070080862533694,
+      "loss": 0.9561367630958557,
+      "step": 784
+    },
+    {
+      "epoch": 0.33915857605177996,
+      "grad_norm": 0.5798251628875732,
+      "learning_rate": 0.00014106019766397124,
+      "loss": 0.9132505059242249,
+      "step": 786
+    },
+    {
+      "epoch": 0.34002157497303126,
+      "grad_norm": 0.6267077326774597,
+      "learning_rate": 0.00014141958670260557,
+      "loss": 0.9297707080841064,
+      "step": 788
+    },
+    {
+      "epoch": 0.3408845738942826,
+      "grad_norm": 0.6045349836349487,
+      "learning_rate": 0.0001417789757412399,
+      "loss": 0.9382412433624268,
+      "step": 790
+    },
+    {
+      "epoch": 0.341747572815534,
+      "grad_norm": 0.6125404834747314,
+      "learning_rate": 0.0001421383647798742,
+      "loss": 0.9078555107116699,
+      "step": 792
+    },
+    {
+      "epoch": 0.34261057173678533,
+      "grad_norm": 0.5927051901817322,
+      "learning_rate": 0.00014249775381850854,
+      "loss": 0.899101197719574,
+      "step": 794
+    },
+    {
+      "epoch": 0.3434735706580367,
+      "grad_norm": 0.6315743923187256,
+      "learning_rate": 0.00014285714285714287,
+      "loss": 1.028346300125122,
+      "step": 796
+    },
+    {
+      "epoch": 0.34433656957928804,
+      "grad_norm": 0.549271285533905,
+      "learning_rate": 0.0001432165318957772,
+      "loss": 0.8988189697265625,
+      "step": 798
+    },
+    {
+      "epoch": 0.3451995685005394,
+      "grad_norm": 0.6344245672225952,
+      "learning_rate": 0.0001435759209344115,
+      "loss": 1.0489003658294678,
+      "step": 800
+    },
+    {
+      "epoch": 0.3451995685005394,
+      "eval_loss": 0.9864639639854431,
+      "eval_runtime": 667.3516,
+      "eval_samples_per_second": 3.087,
+      "eval_steps_per_second": 3.087,
+      "step": 800
+    },
+    {
+      "epoch": 0.3460625674217907,
+      "grad_norm": 0.5625309348106384,
+      "learning_rate": 0.00014393530997304583,
+      "loss": 0.8773928284645081,
+      "step": 802
+    },
+    {
+      "epoch": 0.34692556634304206,
+      "grad_norm": 0.5931969285011292,
+      "learning_rate": 0.00014429469901168016,
+      "loss": 0.9116050004959106,
+      "step": 804
+    },
+    {
+      "epoch": 0.3477885652642934,
+      "grad_norm": 0.5189821720123291,
+      "learning_rate": 0.00014465408805031446,
+      "loss": 0.9124425649642944,
+      "step": 806
+    },
+    {
+      "epoch": 0.34865156418554477,
+      "grad_norm": 0.5392254590988159,
+      "learning_rate": 0.0001450134770889488,
+      "loss": 0.9517888426780701,
+      "step": 808
+    },
+    {
+      "epoch": 0.34951456310679613,
+      "grad_norm": 0.5584444999694824,
+      "learning_rate": 0.00014537286612758312,
+      "loss": 0.9947572350502014,
+      "step": 810
+    },
+    {
+      "epoch": 0.3503775620280475,
+      "grad_norm": 0.5188854932785034,
+      "learning_rate": 0.00014573225516621745,
+      "loss": 0.9314022660255432,
+      "step": 812
+    },
+    {
+      "epoch": 0.3512405609492988,
+      "grad_norm": 0.5783659815788269,
+      "learning_rate": 0.00014609164420485176,
+      "loss": 0.9135628938674927,
+      "step": 814
+    },
+    {
+      "epoch": 0.35210355987055014,
+      "grad_norm": 0.550959050655365,
+      "learning_rate": 0.0001464510332434861,
+      "loss": 0.9665075540542603,
+      "step": 816
+    },
+    {
+      "epoch": 0.3529665587918015,
+      "grad_norm": 0.6013346314430237,
+      "learning_rate": 0.00014681042228212042,
+      "loss": 0.9836555123329163,
+      "step": 818
+    },
+    {
+      "epoch": 0.35382955771305286,
+      "grad_norm": 0.49219194054603577,
+      "learning_rate": 0.00014716981132075472,
+      "loss": 0.8900108337402344,
+      "step": 820
+    },
+    {
+      "epoch": 0.3546925566343042,
+      "grad_norm": 0.5517411828041077,
+      "learning_rate": 0.00014752920035938905,
+      "loss": 0.8769304156303406,
+      "step": 822
+    },
+    {
+      "epoch": 0.35555555555555557,
+      "grad_norm": 0.6062695980072021,
+      "learning_rate": 0.00014788858939802338,
+      "loss": 0.9744759202003479,
+      "step": 824
+    },
+    {
+      "epoch": 0.3564185544768069,
+      "grad_norm": 0.5132041573524475,
+      "learning_rate": 0.0001482479784366577,
+      "loss": 0.8875447511672974,
+      "step": 826
+    },
+    {
+      "epoch": 0.3572815533980582,
+      "grad_norm": 0.551799476146698,
+      "learning_rate": 0.000148607367475292,
+      "loss": 0.962710440158844,
+      "step": 828
+    },
+    {
+      "epoch": 0.3581445523193096,
+      "grad_norm": 0.6046625971794128,
+      "learning_rate": 0.00014896675651392634,
+      "loss": 0.8997528553009033,
+      "step": 830
+    },
+    {
+      "epoch": 0.35900755124056094,
+      "grad_norm": 0.560025691986084,
+      "learning_rate": 0.00014932614555256067,
+      "loss": 0.9541417360305786,
+      "step": 832
+    },
+    {
+      "epoch": 0.3598705501618123,
+      "grad_norm": 0.6441047787666321,
+      "learning_rate": 0.00014968553459119498,
+      "loss": 0.907791018486023,
+      "step": 834
+    },
+    {
+      "epoch": 0.36073354908306365,
+      "grad_norm": 0.5636281967163086,
+      "learning_rate": 0.0001500449236298293,
+      "loss": 1.0295937061309814,
+      "step": 836
+    },
+    {
+      "epoch": 0.361596548004315,
+      "grad_norm": 0.5528303384780884,
+      "learning_rate": 0.0001504043126684636,
+      "loss": 0.8875265717506409,
+      "step": 838
+    },
+    {
+      "epoch": 0.36245954692556637,
+      "grad_norm": 0.5345163345336914,
+      "learning_rate": 0.00015076370170709794,
+      "loss": 0.9678915739059448,
+      "step": 840
+    },
+    {
+      "epoch": 0.36332254584681767,
+      "grad_norm": 0.5551225543022156,
+      "learning_rate": 0.00015112309074573224,
+      "loss": 0.9235162734985352,
+      "step": 842
+    },
+    {
+      "epoch": 0.364185544768069,
+      "grad_norm": 0.5131904482841492,
+      "learning_rate": 0.00015148247978436657,
+      "loss": 0.8624292016029358,
+      "step": 844
+    },
+    {
+      "epoch": 0.3650485436893204,
+      "grad_norm": 0.6811004281044006,
+      "learning_rate": 0.0001518418688230009,
+      "loss": 1.0360193252563477,
+      "step": 846
+    },
+    {
+      "epoch": 0.36591154261057174,
+      "grad_norm": 0.6409741640090942,
+      "learning_rate": 0.00015220125786163523,
+      "loss": 0.9254010915756226,
+      "step": 848
+    },
+    {
+      "epoch": 0.3667745415318231,
+      "grad_norm": 0.5534068942070007,
+      "learning_rate": 0.00015256064690026953,
+      "loss": 0.8900630474090576,
+      "step": 850
+    },
+    {
+      "epoch": 0.36763754045307445,
+      "grad_norm": 0.4999487102031708,
+      "learning_rate": 0.00015292003593890386,
+      "loss": 0.88521409034729,
+      "step": 852
+    },
+    {
+      "epoch": 0.3685005393743258,
+      "grad_norm": 0.5805923938751221,
+      "learning_rate": 0.0001532794249775382,
+      "loss": 0.9563921093940735,
+      "step": 854
+    },
+    {
+      "epoch": 0.3693635382955771,
+      "grad_norm": 0.5485470294952393,
+      "learning_rate": 0.0001536388140161725,
+      "loss": 0.8909372687339783,
+      "step": 856
+    },
+    {
+      "epoch": 0.37022653721682847,
+      "grad_norm": 0.5317923426628113,
+      "learning_rate": 0.00015399820305480683,
+      "loss": 0.9145731925964355,
+      "step": 858
+    },
+    {
+      "epoch": 0.3710895361380798,
+      "grad_norm": 0.6073495745658875,
+      "learning_rate": 0.00015435759209344116,
+      "loss": 1.01466965675354,
+      "step": 860
+    },
+    {
+      "epoch": 0.3719525350593312,
+      "grad_norm": 0.566655158996582,
+      "learning_rate": 0.0001547169811320755,
+      "loss": 0.9941825270652771,
+      "step": 862
+    },
+    {
+      "epoch": 0.37281553398058254,
+      "grad_norm": 0.5262459516525269,
+      "learning_rate": 0.0001550763701707098,
+      "loss": 1.0059782266616821,
+      "step": 864
+    },
+    {
+      "epoch": 0.3736785329018339,
+      "grad_norm": 0.6264083981513977,
+      "learning_rate": 0.00015543575920934412,
+      "loss": 1.0332856178283691,
+      "step": 866
+    },
+    {
+      "epoch": 0.3745415318230852,
+      "grad_norm": 0.6575480699539185,
+      "learning_rate": 0.00015579514824797845,
+      "loss": 1.022459626197815,
+      "step": 868
+    },
+    {
+      "epoch": 0.37540453074433655,
+      "grad_norm": 0.6291940212249756,
+      "learning_rate": 0.00015615453728661275,
+      "loss": 0.9550372362136841,
+      "step": 870
+    },
+    {
+      "epoch": 0.3762675296655879,
+      "grad_norm": 0.6710562109947205,
+      "learning_rate": 0.00015651392632524708,
+      "loss": 0.9861716628074646,
+      "step": 872
+    },
+    {
+      "epoch": 0.37713052858683926,
+      "grad_norm": 0.5505748987197876,
+      "learning_rate": 0.0001568733153638814,
+      "loss": 0.9719111919403076,
+      "step": 874
+    },
+    {
+      "epoch": 0.3779935275080906,
+      "grad_norm": 0.5055180788040161,
+      "learning_rate": 0.00015723270440251574,
+      "loss": 0.8698170185089111,
+      "step": 876
+    },
+    {
+      "epoch": 0.378856526429342,
+      "grad_norm": 0.5935947895050049,
+      "learning_rate": 0.00015759209344115005,
+      "loss": 1.029494285583496,
+      "step": 878
+    },
+    {
+      "epoch": 0.37971952535059333,
+      "grad_norm": 0.538325846195221,
+      "learning_rate": 0.00015795148247978438,
+      "loss": 0.923010528087616,
+      "step": 880
+    },
+    {
+      "epoch": 0.38058252427184464,
+      "grad_norm": 0.587297797203064,
+      "learning_rate": 0.0001583108715184187,
+      "loss": 0.9394056797027588,
+      "step": 882
+    },
+    {
+      "epoch": 0.381445523193096,
+      "grad_norm": 0.5910462737083435,
+      "learning_rate": 0.000158670260557053,
+      "loss": 0.9472483992576599,
+      "step": 884
+    },
+    {
+      "epoch": 0.38230852211434735,
+      "grad_norm": 0.629048764705658,
+      "learning_rate": 0.00015902964959568734,
+      "loss": 0.9028263688087463,
+      "step": 886
+    },
+    {
+      "epoch": 0.3831715210355987,
+      "grad_norm": 0.5028086304664612,
+      "learning_rate": 0.00015938903863432167,
+      "loss": 0.9579087495803833,
+      "step": 888
+    },
+    {
+      "epoch": 0.38403451995685006,
+      "grad_norm": 0.5372384190559387,
+      "learning_rate": 0.000159748427672956,
+      "loss": 0.8318673372268677,
+      "step": 890
+    },
+    {
+      "epoch": 0.3848975188781014,
+      "grad_norm": 0.6314184665679932,
+      "learning_rate": 0.0001601078167115903,
+      "loss": 0.9804943203926086,
+      "step": 892
+    },
+    {
+      "epoch": 0.3857605177993528,
+      "grad_norm": 0.5545229911804199,
+      "learning_rate": 0.00016046720575022463,
+      "loss": 1.0078438520431519,
+      "step": 894
+    },
+    {
+      "epoch": 0.3866235167206041,
+      "grad_norm": 0.4674014151096344,
+      "learning_rate": 0.00016082659478885896,
+      "loss": 0.9269036650657654,
+      "step": 896
+    },
+    {
+      "epoch": 0.38748651564185543,
+      "grad_norm": 1.5887153148651123,
+      "learning_rate": 0.00016118598382749326,
+      "loss": 0.8927953243255615,
+      "step": 898
+    },
+    {
+      "epoch": 0.3883495145631068,
+      "grad_norm": 0.5217035412788391,
+      "learning_rate": 0.0001615453728661276,
+      "loss": 0.908074140548706,
+      "step": 900
+    },
+    {
+      "epoch": 0.3883495145631068,
+      "eval_loss": 0.9741895794868469,
+      "eval_runtime": 667.2236,
+      "eval_samples_per_second": 3.087,
+      "eval_steps_per_second": 3.087,
+      "step": 900
+    },
+    {
+      "epoch": 0.38921251348435815,
+      "grad_norm": 0.470498651266098,
+      "learning_rate": 0.00016190476190476192,
+      "loss": 0.9660369157791138,
+      "step": 902
+    },
+    {
+      "epoch": 0.3900755124056095,
+      "grad_norm": 0.5111004114151001,
+      "learning_rate": 0.00016226415094339625,
+      "loss": 0.9236379265785217,
+      "step": 904
+    },
+    {
+      "epoch": 0.39093851132686086,
+      "grad_norm": 0.5872815251350403,
+      "learning_rate": 0.00016262353998203056,
+      "loss": 1.0061595439910889,
+      "step": 906
+    },
+    {
+      "epoch": 0.3918015102481122,
+      "grad_norm": 0.5150740742683411,
+      "learning_rate": 0.0001629829290206649,
+      "loss": 0.8347328901290894,
+      "step": 908
+    },
+    {
+      "epoch": 0.3926645091693635,
+      "grad_norm": 0.46554985642433167,
+      "learning_rate": 0.00016334231805929922,
+      "loss": 0.9091183543205261,
+      "step": 910
+    },
+    {
+      "epoch": 0.3935275080906149,
+      "grad_norm": 0.5292875170707703,
+      "learning_rate": 0.00016370170709793352,
+      "loss": 0.9299798011779785,
+      "step": 912
+    },
+    {
+      "epoch": 0.39439050701186623,
+      "grad_norm": 0.5177125930786133,
+      "learning_rate": 0.00016406109613656785,
+      "loss": 0.942286491394043,
+      "step": 914
+    },
+    {
+      "epoch": 0.3952535059331176,
+      "grad_norm": 0.5564161539077759,
+      "learning_rate": 0.00016442048517520215,
+      "loss": 0.825290858745575,
+      "step": 916
+    },
+    {
+      "epoch": 0.39611650485436894,
+      "grad_norm": 0.5572530031204224,
+      "learning_rate": 0.00016477987421383648,
+      "loss": 0.876898467540741,
+      "step": 918
+    },
+    {
+      "epoch": 0.3969795037756203,
+      "grad_norm": 0.7294673323631287,
+      "learning_rate": 0.0001651392632524708,
+      "loss": 0.8949798941612244,
+      "step": 920
+    },
+    {
+      "epoch": 0.3978425026968716,
+      "grad_norm": 0.5234251022338867,
+      "learning_rate": 0.00016549865229110512,
+      "loss": 0.8457819223403931,
+      "step": 922
+    },
+    {
+      "epoch": 0.39870550161812296,
+      "grad_norm": 0.5273709893226624,
+      "learning_rate": 0.00016585804132973945,
+      "loss": 0.9080174565315247,
+      "step": 924
+    },
+    {
+      "epoch": 0.3995685005393743,
+      "grad_norm": 0.5795063376426697,
+      "learning_rate": 0.00016621743036837378,
+      "loss": 1.0304023027420044,
+      "step": 926
+    },
+    {
+      "epoch": 0.4004314994606257,
+      "grad_norm": 0.6153313517570496,
+      "learning_rate": 0.00016657681940700808,
+      "loss": 0.8900477886199951,
+      "step": 928
+    },
+    {
+      "epoch": 0.40129449838187703,
+      "grad_norm": 0.6293173432350159,
+      "learning_rate": 0.0001669362084456424,
+      "loss": 1.0130009651184082,
+      "step": 930
+    },
+    {
+      "epoch": 0.4021574973031284,
+      "grad_norm": 0.5455223321914673,
+      "learning_rate": 0.00016729559748427674,
+      "loss": 0.9339282512664795,
+      "step": 932
+    },
+    {
+      "epoch": 0.40302049622437974,
+      "grad_norm": 0.5349094271659851,
+      "learning_rate": 0.00016765498652291104,
+      "loss": 0.9628980755805969,
+      "step": 934
+    },
+    {
+      "epoch": 0.40388349514563104,
+      "grad_norm": 0.491227924823761,
+      "learning_rate": 0.00016801437556154537,
+      "loss": 0.8922860026359558,
+      "step": 936
+    },
+    {
+      "epoch": 0.4047464940668824,
+      "grad_norm": 0.6331246495246887,
+      "learning_rate": 0.0001683737646001797,
+      "loss": 1.0470497608184814,
+      "step": 938
+    },
+    {
+      "epoch": 0.40560949298813376,
+      "grad_norm": 0.6079246401786804,
+      "learning_rate": 0.00016873315363881403,
+      "loss": 0.8868283629417419,
+      "step": 940
+    },
+    {
+      "epoch": 0.4064724919093851,
+      "grad_norm": 0.5326972603797913,
+      "learning_rate": 0.00016909254267744833,
+      "loss": 0.9938711524009705,
+      "step": 942
+    },
+    {
+      "epoch": 0.40733549083063647,
+      "grad_norm": 0.47754305601119995,
+      "learning_rate": 0.00016945193171608266,
+      "loss": 0.8280484676361084,
+      "step": 944
+    },
+    {
+      "epoch": 0.4081984897518878,
+      "grad_norm": 0.6683310270309448,
+      "learning_rate": 0.000169811320754717,
+      "loss": 1.089701533317566,
+      "step": 946
+    },
+    {
+      "epoch": 0.4090614886731392,
+      "grad_norm": 0.42798754572868347,
+      "learning_rate": 0.0001701707097933513,
+      "loss": 0.8535542488098145,
+      "step": 948
+    },
+    {
+      "epoch": 0.4099244875943905,
+      "grad_norm": 0.5999574065208435,
+      "learning_rate": 0.00017053009883198563,
+      "loss": 0.9039298295974731,
+      "step": 950
+    },
+    {
+      "epoch": 0.41078748651564184,
+      "grad_norm": 0.5752781629562378,
+      "learning_rate": 0.00017088948787061996,
+      "loss": 0.8786448240280151,
+      "step": 952
+    },
+    {
+      "epoch": 0.4116504854368932,
+      "grad_norm": 0.5121532678604126,
+      "learning_rate": 0.0001712488769092543,
+      "loss": 0.9206072688102722,
+      "step": 954
+    },
+    {
+      "epoch": 0.41251348435814456,
+      "grad_norm": 0.611078143119812,
+      "learning_rate": 0.0001716082659478886,
+      "loss": 0.9246986508369446,
+      "step": 956
+    },
+    {
+      "epoch": 0.4133764832793959,
+      "grad_norm": 0.5101020336151123,
+      "learning_rate": 0.00017196765498652292,
+      "loss": 0.9221894145011902,
+      "step": 958
+    },
+    {
+      "epoch": 0.41423948220064727,
+      "grad_norm": 0.5681450963020325,
+      "learning_rate": 0.00017232704402515725,
+      "loss": 0.9072799682617188,
+      "step": 960
+    },
+    {
+      "epoch": 0.41510248112189857,
+      "grad_norm": 0.47865498065948486,
+      "learning_rate": 0.00017268643306379155,
+      "loss": 0.9460896849632263,
+      "step": 962
+    },
+    {
+      "epoch": 0.4159654800431499,
+      "grad_norm": 0.49861401319503784,
+      "learning_rate": 0.00017304582210242588,
+      "loss": 0.9121519923210144,
+      "step": 964
+    },
+    {
+      "epoch": 0.4168284789644013,
+      "grad_norm": 0.43025892972946167,
+      "learning_rate": 0.0001734052111410602,
+      "loss": 0.8826848864555359,
+      "step": 966
+    },
+    {
+      "epoch": 0.41769147788565264,
+      "grad_norm": 0.4600491225719452,
+      "learning_rate": 0.00017376460017969454,
+      "loss": 0.8756251335144043,
+      "step": 968
+    },
+    {
+      "epoch": 0.418554476806904,
+      "grad_norm": 0.5297656059265137,
+      "learning_rate": 0.00017412398921832885,
+      "loss": 0.9171333312988281,
+      "step": 970
+    },
+    {
+      "epoch": 0.41941747572815535,
+      "grad_norm": 0.4906919002532959,
+      "learning_rate": 0.00017448337825696318,
+      "loss": 0.8887524008750916,
+      "step": 972
+    },
+    {
+      "epoch": 0.4202804746494067,
+      "grad_norm": 0.49263402819633484,
+      "learning_rate": 0.0001748427672955975,
+      "loss": 0.8345810174942017,
+      "step": 974
+    },
+    {
+      "epoch": 0.421143473570658,
+      "grad_norm": 0.5706565380096436,
+      "learning_rate": 0.0001752021563342318,
+      "loss": 0.968651294708252,
+      "step": 976
+    },
+    {
+      "epoch": 0.42200647249190937,
+      "grad_norm": 0.5269908308982849,
+      "learning_rate": 0.00017556154537286614,
+      "loss": 0.9729376435279846,
+      "step": 978
+    },
+    {
+      "epoch": 0.4228694714131607,
+      "grad_norm": 0.47058001160621643,
+      "learning_rate": 0.00017592093441150047,
+      "loss": 0.963884711265564,
+      "step": 980
+    },
+    {
+      "epoch": 0.4237324703344121,
+      "grad_norm": 0.5322962999343872,
+      "learning_rate": 0.0001762803234501348,
+      "loss": 0.8952447175979614,
+      "step": 982
+    },
+    {
+      "epoch": 0.42459546925566344,
+      "grad_norm": 0.5750975012779236,
+      "learning_rate": 0.0001766397124887691,
+      "loss": 0.8932783603668213,
+      "step": 984
+    },
+    {
+      "epoch": 0.4254584681769148,
+      "grad_norm": 0.5539655685424805,
+      "learning_rate": 0.00017699910152740343,
+      "loss": 0.916595458984375,
+      "step": 986
+    },
+    {
+      "epoch": 0.42632146709816615,
+      "grad_norm": 0.568000853061676,
+      "learning_rate": 0.00017735849056603776,
+      "loss": 0.9669626355171204,
+      "step": 988
+    },
+    {
+      "epoch": 0.42718446601941745,
+      "grad_norm": 0.6010684370994568,
+      "learning_rate": 0.00017771787960467206,
+      "loss": 1.0089105367660522,
+      "step": 990
+    },
+    {
+      "epoch": 0.4280474649406688,
+      "grad_norm": 0.6083462238311768,
+      "learning_rate": 0.0001780772686433064,
+      "loss": 0.9810921549797058,
+      "step": 992
+    },
+    {
+      "epoch": 0.42891046386192017,
+      "grad_norm": 0.5076655149459839,
+      "learning_rate": 0.0001784366576819407,
+      "loss": 0.9524372816085815,
+      "step": 994
+    },
+    {
+      "epoch": 0.4297734627831715,
+      "grad_norm": 0.5260922312736511,
+      "learning_rate": 0.00017879604672057503,
+      "loss": 0.881294846534729,
+      "step": 996
+    },
+    {
+      "epoch": 0.4306364617044229,
+      "grad_norm": 0.6130498051643372,
+      "learning_rate": 0.00017915543575920936,
+      "loss": 0.9138327836990356,
+      "step": 998
+    },
+    {
+      "epoch": 0.43149946062567424,
+      "grad_norm": 0.5346242785453796,
+      "learning_rate": 0.00017951482479784366,
+      "loss": 0.8861367106437683,
+      "step": 1000
+    },
+    {
+      "epoch": 0.43149946062567424,
+      "eval_loss": 0.9606748819351196,
+      "eval_runtime": 655.4358,
+      "eval_samples_per_second": 3.143,
+      "eval_steps_per_second": 3.143,
+      "step": 1000
+    },
+    {
+      "epoch": 0.4323624595469256,
+      "grad_norm": 0.5977228879928589,
+      "learning_rate": 0.000179874213836478,
+      "loss": 0.8711628913879395,
+      "step": 1002
+    },
+    {
+      "epoch": 0.4332254584681769,
+      "grad_norm": 0.5547866821289062,
+      "learning_rate": 0.00018023360287511232,
+      "loss": 0.9393253326416016,
+      "step": 1004
+    },
+    {
+      "epoch": 0.43408845738942825,
+      "grad_norm": 0.536856472492218,
+      "learning_rate": 0.00018059299191374662,
+      "loss": 0.9486003518104553,
+      "step": 1006
+    },
+    {
+      "epoch": 0.4349514563106796,
+      "grad_norm": 0.4769814610481262,
+      "learning_rate": 0.00018095238095238095,
+      "loss": 0.9042052030563354,
+      "step": 1008
+    },
+    {
+      "epoch": 0.43581445523193096,
+      "grad_norm": 0.5554604530334473,
+      "learning_rate": 0.00018131176999101528,
+      "loss": 0.978546142578125,
+      "step": 1010
+    },
+    {
+      "epoch": 0.4366774541531823,
+      "grad_norm": 0.5112947225570679,
+      "learning_rate": 0.00018167115902964959,
+      "loss": 0.8382073640823364,
+      "step": 1012
+    },
+    {
+      "epoch": 0.4375404530744337,
+      "grad_norm": 0.45194941759109497,
+      "learning_rate": 0.00018203054806828392,
+      "loss": 0.8577026724815369,
+      "step": 1014
+    },
+    {
+      "epoch": 0.438403451995685,
+      "grad_norm": 0.5115043520927429,
+      "learning_rate": 0.00018238993710691825,
+      "loss": 0.8517863154411316,
+      "step": 1016
+    },
+    {
+      "epoch": 0.43926645091693634,
+      "grad_norm": 0.5485050082206726,
+      "learning_rate": 0.00018274932614555258,
+      "loss": 0.9597266912460327,
+      "step": 1018
+    },
+    {
+      "epoch": 0.4401294498381877,
+      "grad_norm": 0.5742959976196289,
+      "learning_rate": 0.00018310871518418688,
+      "loss": 1.0407187938690186,
+      "step": 1020
+    },
+    {
+      "epoch": 0.44099244875943905,
+      "grad_norm": 0.44870051741600037,
+      "learning_rate": 0.0001834681042228212,
+      "loss": 0.8696310520172119,
+      "step": 1022
+    },
+    {
+      "epoch": 0.4418554476806904,
+      "grad_norm": 0.5179623961448669,
+      "learning_rate": 0.00018382749326145554,
+      "loss": 0.9673634767532349,
+      "step": 1024
+    },
+    {
+      "epoch": 0.44271844660194176,
+      "grad_norm": 0.5404779314994812,
+      "learning_rate": 0.00018418688230008984,
+      "loss": 0.9596615433692932,
+      "step": 1026
+    },
+    {
+      "epoch": 0.4435814455231931,
+      "grad_norm": 0.47766315937042236,
+      "learning_rate": 0.00018454627133872417,
+      "loss": 0.8483878970146179,
+      "step": 1028
+    },
+    {
+      "epoch": 0.4444444444444444,
+      "grad_norm": 0.503380537033081,
+      "learning_rate": 0.0001849056603773585,
+      "loss": 0.9330979585647583,
+      "step": 1030
+    },
+    {
+      "epoch": 0.4453074433656958,
+      "grad_norm": 0.6129396557807922,
+      "learning_rate": 0.00018526504941599283,
+      "loss": 0.9341012239456177,
+      "step": 1032
+    },
+    {
+      "epoch": 0.44617044228694713,
+      "grad_norm": 0.4497876465320587,
+      "learning_rate": 0.00018562443845462713,
+      "loss": 0.9139068126678467,
+      "step": 1034
+    },
+    {
+      "epoch": 0.4470334412081985,
+      "grad_norm": 0.5369747281074524,
+      "learning_rate": 0.00018598382749326146,
+      "loss": 0.8874827027320862,
+      "step": 1036
+    },
+    {
+      "epoch": 0.44789644012944985,
+      "grad_norm": 0.5947322845458984,
+      "learning_rate": 0.0001863432165318958,
+      "loss": 0.9653725028038025,
+      "step": 1038
+    },
+    {
+      "epoch": 0.4487594390507012,
+      "grad_norm": 0.6649987101554871,
+      "learning_rate": 0.0001867026055705301,
+      "loss": 0.9553119540214539,
+      "step": 1040
+    },
+    {
+      "epoch": 0.44962243797195256,
+      "grad_norm": 0.5369387269020081,
+      "learning_rate": 0.00018706199460916443,
+      "loss": 0.904233992099762,
+      "step": 1042
+    },
+    {
+      "epoch": 0.45048543689320386,
+      "grad_norm": 0.4956842362880707,
+      "learning_rate": 0.00018742138364779876,
+      "loss": 0.8837952017784119,
+      "step": 1044
+    },
+    {
+      "epoch": 0.4513484358144552,
+      "grad_norm": 0.48045051097869873,
+      "learning_rate": 0.0001877807726864331,
+      "loss": 0.8964687585830688,
+      "step": 1046
+    },
+    {
+      "epoch": 0.4522114347357066,
+      "grad_norm": 0.4925530254840851,
+      "learning_rate": 0.0001881401617250674,
+      "loss": 0.9105878472328186,
+      "step": 1048
+    },
+    {
+      "epoch": 0.45307443365695793,
+      "grad_norm": 0.5131362080574036,
+      "learning_rate": 0.00018849955076370172,
+      "loss": 0.96272873878479,
+      "step": 1050
+    },
+    {
+      "epoch": 0.4539374325782093,
+      "grad_norm": 0.435739129781723,
+      "learning_rate": 0.00018885893980233605,
+      "loss": 0.8796783685684204,
+      "step": 1052
+    },
+    {
+      "epoch": 0.45480043149946064,
+      "grad_norm": 0.500938892364502,
+      "learning_rate": 0.00018921832884097035,
+      "loss": 0.9463814496994019,
+      "step": 1054
+    },
+    {
+      "epoch": 0.455663430420712,
+      "grad_norm": 0.4769900143146515,
+      "learning_rate": 0.00018957771787960468,
+      "loss": 0.9030335545539856,
+      "step": 1056
+    },
+    {
+      "epoch": 0.4565264293419633,
+      "grad_norm": 0.49585285782814026,
+      "learning_rate": 0.00018993710691823901,
+      "loss": 0.986995279788971,
+      "step": 1058
+    },
+    {
+      "epoch": 0.45738942826321466,
+      "grad_norm": 0.5875195264816284,
+      "learning_rate": 0.00019029649595687334,
+      "loss": 0.9297246932983398,
+      "step": 1060
+    },
+    {
+      "epoch": 0.458252427184466,
+      "grad_norm": 0.5552583932876587,
+      "learning_rate": 0.00019065588499550765,
+      "loss": 1.005869746208191,
+      "step": 1062
+    },
+    {
+      "epoch": 0.4591154261057174,
+      "grad_norm": 0.49282076954841614,
+      "learning_rate": 0.00019101527403414198,
+      "loss": 0.8949927091598511,
+      "step": 1064
+    },
+    {
+      "epoch": 0.45997842502696873,
+      "grad_norm": 0.4951777160167694,
+      "learning_rate": 0.0001913746630727763,
+      "loss": 0.9997886419296265,
+      "step": 1066
+    },
+    {
+      "epoch": 0.4608414239482201,
+      "grad_norm": 0.5154827237129211,
+      "learning_rate": 0.0001917340521114106,
+      "loss": 0.9532123804092407,
+      "step": 1068
+    },
+    {
+      "epoch": 0.4617044228694714,
+      "grad_norm": 0.5547500252723694,
+      "learning_rate": 0.00019209344115004494,
+      "loss": 0.8959843516349792,
+      "step": 1070
+    },
+    {
+      "epoch": 0.46256742179072274,
+      "grad_norm": 0.500188946723938,
+      "learning_rate": 0.00019245283018867927,
+      "loss": 0.8201484680175781,
+      "step": 1072
+    },
+    {
+      "epoch": 0.4634304207119741,
+      "grad_norm": 0.4181794822216034,
+      "learning_rate": 0.00019281221922731357,
+      "loss": 0.8255136609077454,
+      "step": 1074
+    },
+    {
+      "epoch": 0.46429341963322546,
+      "grad_norm": 0.5613874197006226,
+      "learning_rate": 0.0001931716082659479,
+      "loss": 0.896024763584137,
+      "step": 1076
+    },
+    {
+      "epoch": 0.4651564185544768,
+      "grad_norm": 0.5000972151756287,
+      "learning_rate": 0.0001935309973045822,
+      "loss": 0.8831873536109924,
+      "step": 1078
+    },
+    {
+      "epoch": 0.46601941747572817,
+      "grad_norm": 0.6321820616722107,
+      "learning_rate": 0.00019389038634321654,
+      "loss": 0.9787988662719727,
+      "step": 1080
+    },
+    {
+      "epoch": 0.4668824163969795,
+      "grad_norm": 0.4843652546405792,
+      "learning_rate": 0.00019424977538185087,
+      "loss": 0.933361828327179,
+      "step": 1082
+    },
+    {
+      "epoch": 0.46774541531823083,
+      "grad_norm": 0.537330150604248,
+      "learning_rate": 0.00019460916442048517,
+      "loss": 0.9046981334686279,
+      "step": 1084
+    },
+    {
+      "epoch": 0.4686084142394822,
+      "grad_norm": 0.5761371850967407,
+      "learning_rate": 0.0001949685534591195,
+      "loss": 0.9625781178474426,
+      "step": 1086
+    },
+    {
+      "epoch": 0.46947141316073354,
+      "grad_norm": 0.5209522843360901,
+      "learning_rate": 0.00019532794249775383,
+      "loss": 0.9280619025230408,
+      "step": 1088
+    },
+    {
+      "epoch": 0.4703344120819849,
+      "grad_norm": 0.5383933186531067,
+      "learning_rate": 0.00019568733153638813,
+      "loss": 0.8236247301101685,
+      "step": 1090
+    },
+    {
+      "epoch": 0.47119741100323626,
+      "grad_norm": 0.4994274377822876,
+      "learning_rate": 0.00019604672057502246,
+      "loss": 0.9404071569442749,
+      "step": 1092
+    },
+    {
+      "epoch": 0.4720604099244876,
+      "grad_norm": 0.5177807211875916,
+      "learning_rate": 0.0001964061096136568,
+      "loss": 0.8517536520957947,
+      "step": 1094
+    },
+    {
+      "epoch": 0.47292340884573897,
+      "grad_norm": 0.5374870896339417,
+      "learning_rate": 0.00019676549865229112,
+      "loss": 0.8214367032051086,
+      "step": 1096
+    },
+    {
+      "epoch": 0.47378640776699027,
+      "grad_norm": 0.5544074177742004,
+      "learning_rate": 0.00019712488769092542,
+      "loss": 1.016176700592041,
+      "step": 1098
+    },
+    {
+      "epoch": 0.4746494066882416,
+      "grad_norm": 0.5125867128372192,
+      "learning_rate": 0.00019748427672955975,
+      "loss": 0.8425421118736267,
+      "step": 1100
+    },
+    {
+      "epoch": 0.4746494066882416,
+      "eval_loss": 0.944629430770874,
+      "eval_runtime": 649.9107,
+      "eval_samples_per_second": 3.17,
+      "eval_steps_per_second": 3.17,
+      "step": 1100
+    },
+    {
+      "epoch": 0.475512405609493,
+      "grad_norm": 0.5204418897628784,
+      "learning_rate": 0.00019784366576819408,
+      "loss": 0.9444556832313538,
+      "step": 1102
+    },
+    {
+      "epoch": 0.47637540453074434,
+      "grad_norm": 0.46085885167121887,
+      "learning_rate": 0.0001982030548068284,
+      "loss": 0.877082109451294,
+      "step": 1104
+    },
+    {
+      "epoch": 0.4772384034519957,
+      "grad_norm": 0.5269598960876465,
+      "learning_rate": 0.00019856244384546272,
+      "loss": 0.9214640259742737,
+      "step": 1106
+    },
+    {
+      "epoch": 0.47810140237324705,
+      "grad_norm": 0.4894753694534302,
+      "learning_rate": 0.00019892183288409705,
+      "loss": 0.8867175579071045,
+      "step": 1108
+    },
+    {
+      "epoch": 0.47896440129449835,
+      "grad_norm": 0.5204115509986877,
+      "learning_rate": 0.00019928122192273138,
+      "loss": 0.9641162753105164,
+      "step": 1110
+    },
+    {
+      "epoch": 0.4798274002157497,
+      "grad_norm": 0.6399031281471252,
+      "learning_rate": 0.00019964061096136568,
+      "loss": 1.0219199657440186,
+      "step": 1112
+    },
+    {
+      "epoch": 0.48069039913700107,
+      "grad_norm": 0.3979159891605377,
+      "learning_rate": 0.0002,
+      "loss": 0.8189998269081116,
+      "step": 1114
+    },
+    {
+      "epoch": 0.4815533980582524,
+      "grad_norm": 0.4782681465148926,
+      "learning_rate": 0.0001999999879427254,
+      "loss": 0.83241868019104,
+      "step": 1116
+    },
+    {
+      "epoch": 0.4824163969795038,
+      "grad_norm": 0.5235620141029358,
+      "learning_rate": 0.00019999995177090454,
+      "loss": 0.9371466636657715,
+      "step": 1118
+    },
+    {
+      "epoch": 0.48327939590075514,
+      "grad_norm": 0.4543023407459259,
+      "learning_rate": 0.00019999989148454606,
+      "loss": 0.7767758369445801,
+      "step": 1120
+    },
+    {
+      "epoch": 0.4841423948220065,
+      "grad_norm": 0.6191229820251465,
+      "learning_rate": 0.00019999980708366457,
+      "loss": 1.0103063583374023,
+      "step": 1122
+    },
+    {
+      "epoch": 0.4850053937432578,
+      "grad_norm": 0.45392486453056335,
+      "learning_rate": 0.00019999969856828042,
+      "loss": 0.8720875382423401,
+      "step": 1124
+    },
+    {
+      "epoch": 0.48586839266450915,
+      "grad_norm": 0.42748701572418213,
+      "learning_rate": 0.00019999956593841974,
+      "loss": 0.7859150171279907,
+      "step": 1126
+    },
+    {
+      "epoch": 0.4867313915857605,
+      "grad_norm": 0.4952569603919983,
+      "learning_rate": 0.00019999940919411454,
+      "loss": 0.9154419898986816,
+      "step": 1128
+    },
+    {
+      "epoch": 0.48759439050701187,
+      "grad_norm": 0.5522173047065735,
+      "learning_rate": 0.00019999922833540264,
+      "loss": 0.9076330065727234,
+      "step": 1130
+    },
+    {
+      "epoch": 0.4884573894282632,
+      "grad_norm": 0.5355855226516724,
+      "learning_rate": 0.00019999902336232758,
+      "loss": 0.8933543562889099,
+      "step": 1132
+    },
+    {
+      "epoch": 0.4893203883495146,
+      "grad_norm": 0.4613489508628845,
+      "learning_rate": 0.00019999879427493885,
+      "loss": 0.9160735607147217,
+      "step": 1134
+    },
+    {
+      "epoch": 0.49018338727076594,
+      "grad_norm": 0.4758962094783783,
+      "learning_rate": 0.0001999985410732917,
+      "loss": 0.8552446961402893,
+      "step": 1136
+    },
+    {
+      "epoch": 0.49104638619201724,
+      "grad_norm": 0.4549376964569092,
+      "learning_rate": 0.00019999826375744715,
+      "loss": 0.9979530572891235,
+      "step": 1138
+    },
+    {
+      "epoch": 0.4919093851132686,
+      "grad_norm": 0.4363284409046173,
+      "learning_rate": 0.0001999979623274721,
+      "loss": 0.9295380115509033,
+      "step": 1140
+    },
+    {
+      "epoch": 0.49277238403451995,
+      "grad_norm": 0.5090877413749695,
+      "learning_rate": 0.0001999976367834392,
+      "loss": 0.8737252950668335,
+      "step": 1142
+    },
+    {
+      "epoch": 0.4936353829557713,
+      "grad_norm": 0.45340779423713684,
+      "learning_rate": 0.000199997287125427,
+      "loss": 0.8312779068946838,
+      "step": 1144
+    },
+    {
+      "epoch": 0.49449838187702266,
+      "grad_norm": 0.4771472215652466,
+      "learning_rate": 0.0001999969133535198,
+      "loss": 0.9105207324028015,
+      "step": 1146
+    },
+    {
+      "epoch": 0.495361380798274,
+      "grad_norm": 0.5251384377479553,
+      "learning_rate": 0.00019999651546780773,
+      "loss": 0.8578172922134399,
+      "step": 1148
+    },
+    {
+      "epoch": 0.4962243797195254,
+      "grad_norm": 0.49128198623657227,
+      "learning_rate": 0.00019999609346838676,
+      "loss": 0.9193941950798035,
+      "step": 1150
+    },
+    {
+      "epoch": 0.4970873786407767,
+      "grad_norm": 0.5558596849441528,
+      "learning_rate": 0.0001999956473553586,
+      "loss": 0.9141314625740051,
+      "step": 1152
+    },
+    {
+      "epoch": 0.49795037756202803,
+      "grad_norm": 0.45872750878334045,
+      "learning_rate": 0.00019999517712883087,
+      "loss": 0.9058388471603394,
+      "step": 1154
+    },
+    {
+      "epoch": 0.4988133764832794,
+      "grad_norm": 0.4710173010826111,
+      "learning_rate": 0.00019999468278891698,
+      "loss": 0.8462487459182739,
+      "step": 1156
+    },
+    {
+      "epoch": 0.49967637540453075,
+      "grad_norm": 0.4805637001991272,
+      "learning_rate": 0.0001999941643357361,
+      "loss": 0.8960906863212585,
+      "step": 1158
+    },
+    {
+      "epoch": 0.5005393743257821,
+      "grad_norm": 0.5108052492141724,
+      "learning_rate": 0.00019999362176941325,
+      "loss": 0.9074980020523071,
+      "step": 1160
+    },
+    {
+      "epoch": 0.5014023732470334,
+      "grad_norm": 0.49572333693504333,
+      "learning_rate": 0.00019999305509007932,
+      "loss": 0.9710080623626709,
+      "step": 1162
+    },
+    {
+      "epoch": 0.5022653721682848,
+      "grad_norm": 0.5304561257362366,
+      "learning_rate": 0.0001999924642978709,
+      "loss": 0.8877825140953064,
+      "step": 1164
+    },
+    {
+      "epoch": 0.5031283710895361,
+      "grad_norm": 0.5007328391075134,
+      "learning_rate": 0.0001999918493929305,
+      "loss": 0.8955381512641907,
+      "step": 1166
+    },
+    {
+      "epoch": 0.5039913700107875,
+      "grad_norm": 0.569549024105072,
+      "learning_rate": 0.0001999912103754064,
+      "loss": 0.9478562474250793,
+      "step": 1168
+    },
+    {
+      "epoch": 0.5048543689320388,
+      "grad_norm": 0.5354957580566406,
+      "learning_rate": 0.00019999054724545264,
+      "loss": 0.9685383439064026,
+      "step": 1170
+    },
+    {
+      "epoch": 0.5057173678532901,
+      "grad_norm": 0.547788143157959,
+      "learning_rate": 0.00019998986000322917,
+      "loss": 0.9221975207328796,
+      "step": 1172
+    },
+    {
+      "epoch": 0.5065803667745415,
+      "grad_norm": 0.4919529855251312,
+      "learning_rate": 0.00019998914864890175,
+      "loss": 0.9104788303375244,
+      "step": 1174
+    },
+    {
+      "epoch": 0.5074433656957928,
+      "grad_norm": 0.5274141430854797,
+      "learning_rate": 0.00019998841318264187,
+      "loss": 0.9176050424575806,
+      "step": 1176
+    },
+    {
+      "epoch": 0.5083063646170443,
+      "grad_norm": 0.4257420301437378,
+      "learning_rate": 0.00019998765360462688,
+      "loss": 0.8389710187911987,
+      "step": 1178
+    },
+    {
+      "epoch": 0.5091693635382956,
+      "grad_norm": 0.4947778880596161,
+      "learning_rate": 0.00019998686991504002,
+      "loss": 1.0164397954940796,
+      "step": 1180
+    },
+    {
+      "epoch": 0.510032362459547,
+      "grad_norm": 0.5540821552276611,
+      "learning_rate": 0.00019998606211407016,
+      "loss": 0.9900994300842285,
+      "step": 1182
+    },
+    {
+      "epoch": 0.5108953613807983,
+      "grad_norm": 0.4793289601802826,
+      "learning_rate": 0.0001999852302019122,
+      "loss": 0.8797636032104492,
+      "step": 1184
+    },
+    {
+      "epoch": 0.5117583603020496,
+      "grad_norm": 0.47429659962654114,
+      "learning_rate": 0.00019998437417876672,
+      "loss": 0.96225905418396,
+      "step": 1186
+    },
+    {
+      "epoch": 0.512621359223301,
+      "grad_norm": 0.44918450713157654,
+      "learning_rate": 0.00019998349404484013,
+      "loss": 0.8604235649108887,
+      "step": 1188
+    },
+    {
+      "epoch": 0.5134843581445523,
+      "grad_norm": 0.566977858543396,
+      "learning_rate": 0.00019998258980034468,
+      "loss": 1.0325366258621216,
+      "step": 1190
+    },
+    {
+      "epoch": 0.5143473570658037,
+      "grad_norm": 0.4671999514102936,
+      "learning_rate": 0.00019998166144549843,
+      "loss": 0.7658900022506714,
+      "step": 1192
+    },
+    {
+      "epoch": 0.515210355987055,
+      "grad_norm": 0.42312702536582947,
+      "learning_rate": 0.00019998070898052521,
+      "loss": 0.9365432858467102,
+      "step": 1194
+    },
+    {
+      "epoch": 0.5160733549083064,
+      "grad_norm": 0.4652721881866455,
+      "learning_rate": 0.00019997973240565476,
+      "loss": 0.771016538143158,
+      "step": 1196
+    },
+    {
+      "epoch": 0.5169363538295577,
+      "grad_norm": 0.5048499703407288,
+      "learning_rate": 0.00019997873172112254,
+      "loss": 0.9123705625534058,
+      "step": 1198
+    },
+    {
+      "epoch": 0.517799352750809,
+      "grad_norm": 0.5446439981460571,
+      "learning_rate": 0.0001999777069271699,
+      "loss": 0.8975751399993896,
+      "step": 1200
+    },
+    {
+      "epoch": 0.517799352750809,
+      "eval_loss": 0.9344067573547363,
+      "eval_runtime": 658.6934,
+      "eval_samples_per_second": 3.127,
+      "eval_steps_per_second": 3.127,
+      "step": 1200
+    },
+    {
+      "epoch": 0.5186623516720604,
+      "grad_norm": 0.5044088959693909,
+      "learning_rate": 0.0001999766580240439,
+      "loss": 0.89097660779953,
+      "step": 1202
+    },
+    {
+      "epoch": 0.5195253505933117,
+      "grad_norm": 0.4672294855117798,
+      "learning_rate": 0.00019997558501199753,
+      "loss": 0.9138525128364563,
+      "step": 1204
+    },
+    {
+      "epoch": 0.5203883495145631,
+      "grad_norm": 0.45749008655548096,
+      "learning_rate": 0.00019997448789128952,
+      "loss": 0.8946340680122375,
+      "step": 1206
+    },
+    {
+      "epoch": 0.5212513484358144,
+      "grad_norm": 0.4828707277774811,
+      "learning_rate": 0.00019997336666218447,
+      "loss": 0.8661436438560486,
+      "step": 1208
+    },
+    {
+      "epoch": 0.5221143473570659,
+      "grad_norm": 0.3975147008895874,
+      "learning_rate": 0.0001999722213249527,
+      "loss": 0.7684835195541382,
+      "step": 1210
+    },
+    {
+      "epoch": 0.5229773462783172,
+      "grad_norm": 0.5642077326774597,
+      "learning_rate": 0.00019997105187987045,
+      "loss": 0.9667536616325378,
+      "step": 1212
+    },
+    {
+      "epoch": 0.5238403451995685,
+      "grad_norm": 0.4907105565071106,
+      "learning_rate": 0.00019996985832721972,
+      "loss": 0.8679366707801819,
+      "step": 1214
+    },
+    {
+      "epoch": 0.5247033441208199,
+      "grad_norm": 0.46214789152145386,
+      "learning_rate": 0.0001999686406672883,
+      "loss": 0.8802784085273743,
+      "step": 1216
+    },
+    {
+      "epoch": 0.5255663430420712,
+      "grad_norm": 0.4355131685733795,
+      "learning_rate": 0.00019996739890036985,
+      "loss": 0.8493598103523254,
+      "step": 1218
+    },
+    {
+      "epoch": 0.5264293419633226,
+      "grad_norm": 0.4293915331363678,
+      "learning_rate": 0.0001999661330267638,
+      "loss": 0.8949980735778809,
+      "step": 1220
+    },
+    {
+      "epoch": 0.5272923408845739,
+      "grad_norm": 0.5452485680580139,
+      "learning_rate": 0.00019996484304677544,
+      "loss": 0.9497376680374146,
+      "step": 1222
+    },
+    {
+      "epoch": 0.5281553398058253,
+      "grad_norm": 0.45874500274658203,
+      "learning_rate": 0.00019996352896071583,
+      "loss": 0.9170818328857422,
+      "step": 1224
+    },
+    {
+      "epoch": 0.5290183387270766,
+      "grad_norm": 0.4414025843143463,
+      "learning_rate": 0.00019996219076890182,
+      "loss": 0.7557252645492554,
+      "step": 1226
+    },
+    {
+      "epoch": 0.5298813376483279,
+      "grad_norm": 0.4891829192638397,
+      "learning_rate": 0.0001999608284716562,
+      "loss": 0.848960816860199,
+      "step": 1228
+    },
+    {
+      "epoch": 0.5307443365695793,
+      "grad_norm": 0.5048345327377319,
+      "learning_rate": 0.00019995944206930734,
+      "loss": 0.9555954933166504,
+      "step": 1230
+    },
+    {
+      "epoch": 0.5316073354908306,
+      "grad_norm": 0.5006756782531738,
+      "learning_rate": 0.00019995803156218968,
+      "loss": 0.8080939054489136,
+      "step": 1232
+    },
+    {
+      "epoch": 0.532470334412082,
+      "grad_norm": 0.5422173738479614,
+      "learning_rate": 0.00019995659695064332,
+      "loss": 0.8638371825218201,
+      "step": 1234
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.4535163640975952,
+      "learning_rate": 0.0001999551382350142,
+      "loss": 0.889068067073822,
+      "step": 1236
+    },
+    {
+      "epoch": 0.5341963322545846,
+      "grad_norm": 0.4588642418384552,
+      "learning_rate": 0.00019995365541565412,
+      "loss": 0.8803121447563171,
+      "step": 1238
+    },
+    {
+      "epoch": 0.535059331175836,
+      "grad_norm": 0.49366191029548645,
+      "learning_rate": 0.00019995214849292064,
+      "loss": 0.9694926738739014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.5359223300970873,
+      "grad_norm": 0.41988301277160645,
+      "learning_rate": 0.0001999506174671771,
+      "loss": 0.8367960453033447,
+      "step": 1242
+    },
+    {
+      "epoch": 0.5367853290183388,
+      "grad_norm": 0.5242130160331726,
+      "learning_rate": 0.00019994906233879273,
+      "loss": 0.942340612411499,
+      "step": 1244
+    },
+    {
+      "epoch": 0.53764832793959,
+      "grad_norm": 0.4899834096431732,
+      "learning_rate": 0.00019994748310814256,
+      "loss": 0.8926790356636047,
+      "step": 1246
+    },
+    {
+      "epoch": 0.5385113268608415,
+      "grad_norm": 0.5496823787689209,
+      "learning_rate": 0.00019994587977560744,
+      "loss": 0.99891597032547,
+      "step": 1248
+    },
+    {
+      "epoch": 0.5393743257820928,
+      "grad_norm": 0.4497414231300354,
+      "learning_rate": 0.00019994425234157396,
+      "loss": 0.8873116970062256,
+      "step": 1250
+    },
+    {
+      "epoch": 0.5402373247033441,
+      "grad_norm": 0.4256928563117981,
+      "learning_rate": 0.00019994260080643454,
+      "loss": 0.9041396975517273,
+      "step": 1252
+    },
+    {
+      "epoch": 0.5411003236245955,
+      "grad_norm": 0.36910608410835266,
+      "learning_rate": 0.00019994092517058753,
+      "loss": 0.7647561430931091,
+      "step": 1254
+    },
+    {
+      "epoch": 0.5419633225458468,
+      "grad_norm": 0.537584125995636,
+      "learning_rate": 0.0001999392254344369,
+      "loss": 0.8892287015914917,
+      "step": 1256
+    },
+    {
+      "epoch": 0.5428263214670982,
+      "grad_norm": 0.49463894963264465,
+      "learning_rate": 0.00019993750159839264,
+      "loss": 0.8638571500778198,
+      "step": 1258
+    },
+    {
+      "epoch": 0.5436893203883495,
+      "grad_norm": 0.5052056908607483,
+      "learning_rate": 0.00019993575366287036,
+      "loss": 0.8165372014045715,
+      "step": 1260
+    },
+    {
+      "epoch": 0.5445523193096009,
+      "grad_norm": 0.47367510199546814,
+      "learning_rate": 0.0001999339816282916,
+      "loss": 0.9099977016448975,
+      "step": 1262
+    },
+    {
+      "epoch": 0.5454153182308522,
+      "grad_norm": 0.4600350558757782,
+      "learning_rate": 0.00019993218549508364,
+      "loss": 0.8557311296463013,
+      "step": 1264
+    },
+    {
+      "epoch": 0.5462783171521035,
+      "grad_norm": 0.5684534311294556,
+      "learning_rate": 0.0001999303652636797,
+      "loss": 0.9136497974395752,
+      "step": 1266
+    },
+    {
+      "epoch": 0.5471413160733549,
+      "grad_norm": 0.5151359438896179,
+      "learning_rate": 0.00019992852093451865,
+      "loss": 0.7906932830810547,
+      "step": 1268
+    },
+    {
+      "epoch": 0.5480043149946062,
+      "grad_norm": 0.48577409982681274,
+      "learning_rate": 0.00019992665250804525,
+      "loss": 0.9326766133308411,
+      "step": 1270
+    },
+    {
+      "epoch": 0.5488673139158576,
+      "grad_norm": 0.490531325340271,
+      "learning_rate": 0.00019992475998471004,
+      "loss": 0.9734495878219604,
+      "step": 1272
+    },
+    {
+      "epoch": 0.5497303128371089,
+      "grad_norm": 0.5092435479164124,
+      "learning_rate": 0.00019992284336496947,
+      "loss": 0.8728410005569458,
+      "step": 1274
+    },
+    {
+      "epoch": 0.5505933117583603,
+      "grad_norm": 0.3843296766281128,
+      "learning_rate": 0.00019992090264928566,
+      "loss": 0.7572637796401978,
+      "step": 1276
+    },
+    {
+      "epoch": 0.5514563106796116,
+      "grad_norm": 0.534304678440094,
+      "learning_rate": 0.00019991893783812662,
+      "loss": 0.8895323872566223,
+      "step": 1278
+    },
+    {
+      "epoch": 0.552319309600863,
+      "grad_norm": 0.4567227363586426,
+      "learning_rate": 0.00019991694893196614,
+      "loss": 0.9318088293075562,
+      "step": 1280
+    },
+    {
+      "epoch": 0.5531823085221144,
+      "grad_norm": 0.48464900255203247,
+      "learning_rate": 0.0001999149359312839,
+      "loss": 0.8541979789733887,
+      "step": 1282
+    },
+    {
+      "epoch": 0.5540453074433657,
+      "grad_norm": 0.5569567084312439,
+      "learning_rate": 0.00019991289883656524,
+      "loss": 0.977894127368927,
+      "step": 1284
+    },
+    {
+      "epoch": 0.5549083063646171,
+      "grad_norm": 0.4637227952480316,
+      "learning_rate": 0.00019991083764830145,
+      "loss": 0.8860608339309692,
+      "step": 1286
+    },
+    {
+      "epoch": 0.5557713052858684,
+      "grad_norm": 0.4096687436103821,
+      "learning_rate": 0.00019990875236698956,
+      "loss": 0.8429648876190186,
+      "step": 1288
+    },
+    {
+      "epoch": 0.5566343042071198,
+      "grad_norm": 0.5221695303916931,
+      "learning_rate": 0.00019990664299313242,
+      "loss": 0.8510909080505371,
+      "step": 1290
+    },
+    {
+      "epoch": 0.5574973031283711,
+      "grad_norm": 0.5155899524688721,
+      "learning_rate": 0.00019990450952723872,
+      "loss": 0.8971074223518372,
+      "step": 1292
+    },
+    {
+      "epoch": 0.5583603020496224,
+      "grad_norm": 0.5064809322357178,
+      "learning_rate": 0.0001999023519698229,
+      "loss": 0.9030373096466064,
+      "step": 1294
+    },
+    {
+      "epoch": 0.5592233009708738,
+      "grad_norm": 0.40551698207855225,
+      "learning_rate": 0.00019990017032140526,
+      "loss": 0.7866057753562927,
+      "step": 1296
+    },
+    {
+      "epoch": 0.5600862998921251,
+      "grad_norm": 0.5009430050849915,
+      "learning_rate": 0.00019989796458251194,
+      "loss": 0.9477730989456177,
+      "step": 1298
+    },
+    {
+      "epoch": 0.5609492988133765,
+      "grad_norm": 0.5192028880119324,
+      "learning_rate": 0.00019989573475367477,
+      "loss": 0.9206778407096863,
+      "step": 1300
+    },
+    {
+      "epoch": 0.5609492988133765,
+      "eval_loss": 0.9177446365356445,
+      "eval_runtime": 665.3245,
+      "eval_samples_per_second": 3.096,
+      "eval_steps_per_second": 3.096,
+      "step": 1300
+    },
+    {
+      "epoch": 0.5618122977346278,
+      "grad_norm": 0.5580230951309204,
+      "learning_rate": 0.00019989348083543148,
+      "loss": 0.9516512155532837,
+      "step": 1302
+    },
+    {
+      "epoch": 0.5626752966558792,
+      "grad_norm": 0.4151005744934082,
+      "learning_rate": 0.00019989120282832564,
+      "loss": 0.7725991606712341,
+      "step": 1304
+    },
+    {
+      "epoch": 0.5635382955771305,
+      "grad_norm": 0.56330406665802,
+      "learning_rate": 0.00019988890073290656,
+      "loss": 0.9241501688957214,
+      "step": 1306
+    },
+    {
+      "epoch": 0.5644012944983818,
+      "grad_norm": 0.44836440682411194,
+      "learning_rate": 0.00019988657454972936,
+      "loss": 0.8351686000823975,
+      "step": 1308
+    },
+    {
+      "epoch": 0.5652642934196332,
+      "grad_norm": 0.5414754152297974,
+      "learning_rate": 0.00019988422427935496,
+      "loss": 0.9033217430114746,
+      "step": 1310
+    },
+    {
+      "epoch": 0.5661272923408845,
+      "grad_norm": 0.5283750891685486,
+      "learning_rate": 0.0001998818499223502,
+      "loss": 0.8885331153869629,
+      "step": 1312
+    },
+    {
+      "epoch": 0.566990291262136,
+      "grad_norm": 0.45846256613731384,
+      "learning_rate": 0.00019987945147928758,
+      "loss": 0.8359912037849426,
+      "step": 1314
+    },
+    {
+      "epoch": 0.5678532901833873,
+      "grad_norm": 0.44439879059791565,
+      "learning_rate": 0.0001998770289507455,
+      "loss": 0.8327895402908325,
+      "step": 1316
+    },
+    {
+      "epoch": 0.5687162891046387,
+      "grad_norm": 0.5491341948509216,
+      "learning_rate": 0.00019987458233730813,
+      "loss": 0.9354757070541382,
+      "step": 1318
+    },
+    {
+      "epoch": 0.56957928802589,
+      "grad_norm": 0.5502263307571411,
+      "learning_rate": 0.00019987211163956548,
+      "loss": 0.944054901599884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.5704422869471413,
+      "grad_norm": 0.47165682911872864,
+      "learning_rate": 0.00019986961685811334,
+      "loss": 0.9515072107315063,
+      "step": 1322
+    },
+    {
+      "epoch": 0.5713052858683927,
+      "grad_norm": 0.5136987566947937,
+      "learning_rate": 0.0001998670979935533,
+      "loss": 0.9455493688583374,
+      "step": 1324
+    },
+    {
+      "epoch": 0.572168284789644,
+      "grad_norm": 0.4822693169116974,
+      "learning_rate": 0.00019986455504649277,
+      "loss": 0.8626728057861328,
+      "step": 1326
+    },
+    {
+      "epoch": 0.5730312837108954,
+      "grad_norm": 0.4639468193054199,
+      "learning_rate": 0.000199861988017545,
+      "loss": 0.8857194781303406,
+      "step": 1328
+    },
+    {
+      "epoch": 0.5738942826321467,
+      "grad_norm": 0.5224950313568115,
+      "learning_rate": 0.00019985939690732898,
+      "loss": 0.9198446273803711,
+      "step": 1330
+    },
+    {
+      "epoch": 0.574757281553398,
+      "grad_norm": 0.47504499554634094,
+      "learning_rate": 0.00019985678171646954,
+      "loss": 0.8785439133644104,
+      "step": 1332
+    },
+    {
+      "epoch": 0.5756202804746494,
+      "grad_norm": 0.5022051930427551,
+      "learning_rate": 0.0001998541424455974,
+      "loss": 0.9593754410743713,
+      "step": 1334
+    },
+    {
+      "epoch": 0.5764832793959007,
+      "grad_norm": 0.4572875201702118,
+      "learning_rate": 0.00019985147909534897,
+      "loss": 0.986197292804718,
+      "step": 1336
+    },
+    {
+      "epoch": 0.5773462783171521,
+      "grad_norm": 0.5153827667236328,
+      "learning_rate": 0.00019984879166636644,
+      "loss": 0.9163863658905029,
+      "step": 1338
+    },
+    {
+      "epoch": 0.5782092772384034,
+      "grad_norm": 0.4813650846481323,
+      "learning_rate": 0.00019984608015929792,
+      "loss": 0.8815995454788208,
+      "step": 1340
+    },
+    {
+      "epoch": 0.5790722761596548,
+      "grad_norm": 0.4319819211959839,
+      "learning_rate": 0.0001998433445747973,
+      "loss": 0.857044517993927,
+      "step": 1342
+    },
+    {
+      "epoch": 0.5799352750809061,
+      "grad_norm": 0.5128870010375977,
+      "learning_rate": 0.00019984058491352423,
+      "loss": 0.8939256072044373,
+      "step": 1344
+    },
+    {
+      "epoch": 0.5807982740021574,
+      "grad_norm": 0.41450315713882446,
+      "learning_rate": 0.0001998378011761442,
+      "loss": 0.9267327785491943,
+      "step": 1346
+    },
+    {
+      "epoch": 0.5816612729234089,
+      "grad_norm": 0.42916348576545715,
+      "learning_rate": 0.00019983499336332844,
+      "loss": 0.8494639992713928,
+      "step": 1348
+    },
+    {
+      "epoch": 0.5825242718446602,
+      "grad_norm": 0.43995746970176697,
+      "learning_rate": 0.00019983216147575412,
+      "loss": 0.9720427989959717,
+      "step": 1350
+    },
+    {
+      "epoch": 0.5833872707659116,
+      "grad_norm": 0.5063773989677429,
+      "learning_rate": 0.00019982930551410411,
+      "loss": 0.9400854706764221,
+      "step": 1352
+    },
+    {
+      "epoch": 0.5842502696871629,
+      "grad_norm": 0.5029586553573608,
+      "learning_rate": 0.0001998264254790671,
+      "loss": 0.8657845258712769,
+      "step": 1354
+    },
+    {
+      "epoch": 0.5851132686084143,
+      "grad_norm": 0.45519232749938965,
+      "learning_rate": 0.00019982352137133764,
+      "loss": 0.8593506813049316,
+      "step": 1356
+    },
+    {
+      "epoch": 0.5859762675296656,
+      "grad_norm": 0.49116215109825134,
+      "learning_rate": 0.000199820593191616,
+      "loss": 0.8658114671707153,
+      "step": 1358
+    },
+    {
+      "epoch": 0.5868392664509169,
+      "grad_norm": 0.45347318053245544,
+      "learning_rate": 0.00019981764094060826,
+      "loss": 0.8044605255126953,
+      "step": 1360
+    },
+    {
+      "epoch": 0.5877022653721683,
+      "grad_norm": 0.5191754698753357,
+      "learning_rate": 0.00019981466461902643,
+      "loss": 0.9176861047744751,
+      "step": 1362
+    },
+    {
+      "epoch": 0.5885652642934196,
+      "grad_norm": 0.4791528284549713,
+      "learning_rate": 0.00019981166422758818,
+      "loss": 0.8453370928764343,
+      "step": 1364
+    },
+    {
+      "epoch": 0.589428263214671,
+      "grad_norm": 0.5105116367340088,
+      "learning_rate": 0.00019980863976701705,
+      "loss": 0.9343777298927307,
+      "step": 1366
+    },
+    {
+      "epoch": 0.5902912621359223,
+      "grad_norm": 0.44593656063079834,
+      "learning_rate": 0.00019980559123804236,
+      "loss": 0.8950760960578918,
+      "step": 1368
+    },
+    {
+      "epoch": 0.5911542610571737,
+      "grad_norm": 0.4375658631324768,
+      "learning_rate": 0.0001998025186413993,
+      "loss": 0.8613521456718445,
+      "step": 1370
+    },
+    {
+      "epoch": 0.592017259978425,
+      "grad_norm": 0.5138815641403198,
+      "learning_rate": 0.00019979942197782878,
+      "loss": 0.8982083201408386,
+      "step": 1372
+    },
+    {
+      "epoch": 0.5928802588996763,
+      "grad_norm": 0.45473602414131165,
+      "learning_rate": 0.00019979630124807753,
+      "loss": 0.9372450709342957,
+      "step": 1374
+    },
+    {
+      "epoch": 0.5937432578209277,
+      "grad_norm": 0.4961191713809967,
+      "learning_rate": 0.00019979315645289814,
+      "loss": 0.8758652806282043,
+      "step": 1376
+    },
+    {
+      "epoch": 0.594606256742179,
+      "grad_norm": 0.5672827363014221,
+      "learning_rate": 0.00019978998759304895,
+      "loss": 0.9852207899093628,
+      "step": 1378
+    },
+    {
+      "epoch": 0.5954692556634305,
+      "grad_norm": 0.44907906651496887,
+      "learning_rate": 0.00019978679466929407,
+      "loss": 0.8451287746429443,
+      "step": 1380
+    },
+    {
+      "epoch": 0.5963322545846818,
+      "grad_norm": 0.4759652316570282,
+      "learning_rate": 0.00019978357768240352,
+      "loss": 0.8381558656692505,
+      "step": 1382
+    },
+    {
+      "epoch": 0.5971952535059332,
+      "grad_norm": 0.4936048686504364,
+      "learning_rate": 0.00019978033663315304,
+      "loss": 0.8820816874504089,
+      "step": 1384
+    },
+    {
+      "epoch": 0.5980582524271845,
+      "grad_norm": 0.44983741641044617,
+      "learning_rate": 0.00019977707152232416,
+      "loss": 0.9156787991523743,
+      "step": 1386
+    },
+    {
+      "epoch": 0.5989212513484358,
+      "grad_norm": 0.45198705792427063,
+      "learning_rate": 0.0001997737823507043,
+      "loss": 0.8285194039344788,
+      "step": 1388
+    },
+    {
+      "epoch": 0.5997842502696872,
+      "grad_norm": 0.45788463950157166,
+      "learning_rate": 0.00019977046911908664,
+      "loss": 0.8109505772590637,
+      "step": 1390
+    },
+    {
+      "epoch": 0.6006472491909385,
+      "grad_norm": 0.4795142412185669,
+      "learning_rate": 0.0001997671318282701,
+      "loss": 0.8285947442054749,
+      "step": 1392
+    },
+    {
+      "epoch": 0.6015102481121899,
+      "grad_norm": 0.5026728510856628,
+      "learning_rate": 0.00019976377047905945,
+      "loss": 0.9497535228729248,
+      "step": 1394
+    },
+    {
+      "epoch": 0.6023732470334412,
+      "grad_norm": 0.4994853734970093,
+      "learning_rate": 0.0001997603850722653,
+      "loss": 0.9171916246414185,
+      "step": 1396
+    },
+    {
+      "epoch": 0.6032362459546926,
+      "grad_norm": 0.4789866507053375,
+      "learning_rate": 0.00019975697560870403,
+      "loss": 0.7894434928894043,
+      "step": 1398
+    },
+    {
+      "epoch": 0.6040992448759439,
+      "grad_norm": 0.42282742261886597,
+      "learning_rate": 0.0001997535420891978,
+      "loss": 0.8942429423332214,
+      "step": 1400
+    },
+    {
+      "epoch": 0.6040992448759439,
+      "eval_loss": 0.9080492854118347,
+      "eval_runtime": 661.4597,
+      "eval_samples_per_second": 3.114,
+      "eval_steps_per_second": 3.114,
+      "step": 1400
+    },
+    {
+      "epoch": 0.6049622437971952,
+      "grad_norm": 0.5789905190467834,
+      "learning_rate": 0.00019975008451457454,
+      "loss": 0.8938372731208801,
+      "step": 1402
+    },
+    {
+      "epoch": 0.6058252427184466,
+      "grad_norm": 0.46121683716773987,
+      "learning_rate": 0.00019974660288566814,
+      "loss": 0.8746235370635986,
+      "step": 1404
+    },
+    {
+      "epoch": 0.6066882416396979,
+      "grad_norm": 0.5195551514625549,
+      "learning_rate": 0.00019974309720331807,
+      "loss": 0.8650617003440857,
+      "step": 1406
+    },
+    {
+      "epoch": 0.6075512405609493,
+      "grad_norm": 0.46930259466171265,
+      "learning_rate": 0.00019973956746836976,
+      "loss": 0.8853039145469666,
+      "step": 1408
+    },
+    {
+      "epoch": 0.6084142394822006,
+      "grad_norm": 0.40869632363319397,
+      "learning_rate": 0.0001997360136816744,
+      "loss": 0.7865594029426575,
+      "step": 1410
+    },
+    {
+      "epoch": 0.609277238403452,
+      "grad_norm": 0.5398361086845398,
+      "learning_rate": 0.00019973243584408895,
+      "loss": 0.907535970211029,
+      "step": 1412
+    },
+    {
+      "epoch": 0.6101402373247033,
+      "grad_norm": 0.5110154747962952,
+      "learning_rate": 0.00019972883395647615,
+      "loss": 0.8682730197906494,
+      "step": 1414
+    },
+    {
+      "epoch": 0.6110032362459547,
+      "grad_norm": 0.44484639167785645,
+      "learning_rate": 0.00019972520801970467,
+      "loss": 0.8786011338233948,
+      "step": 1416
+    },
+    {
+      "epoch": 0.6118662351672061,
+      "grad_norm": 0.4768071472644806,
+      "learning_rate": 0.0001997215580346488,
+      "loss": 0.9021878242492676,
+      "step": 1418
+    },
+    {
+      "epoch": 0.6127292340884574,
+      "grad_norm": 0.43265241384506226,
+      "learning_rate": 0.0001997178840021888,
+      "loss": 0.7737482786178589,
+      "step": 1420
+    },
+    {
+      "epoch": 0.6135922330097088,
+      "grad_norm": 0.525692343711853,
+      "learning_rate": 0.0001997141859232106,
+      "loss": 0.876280665397644,
+      "step": 1422
+    },
+    {
+      "epoch": 0.6144552319309601,
+      "grad_norm": 0.48206865787506104,
+      "learning_rate": 0.00019971046379860594,
+      "loss": 0.8503577709197998,
+      "step": 1424
+    },
+    {
+      "epoch": 0.6153182308522115,
+      "grad_norm": 0.6032769680023193,
+      "learning_rate": 0.00019970671762927246,
+      "loss": 0.9459730982780457,
+      "step": 1426
+    },
+    {
+      "epoch": 0.6161812297734628,
+      "grad_norm": 0.4491981863975525,
+      "learning_rate": 0.0001997029474161135,
+      "loss": 0.8836647868156433,
+      "step": 1428
+    },
+    {
+      "epoch": 0.6170442286947141,
+      "grad_norm": 0.47503358125686646,
+      "learning_rate": 0.00019969915316003824,
+      "loss": 0.8614388108253479,
+      "step": 1430
+    },
+    {
+      "epoch": 0.6179072276159655,
+      "grad_norm": 0.44801047444343567,
+      "learning_rate": 0.00019969533486196162,
+      "loss": 0.8420360684394836,
+      "step": 1432
+    },
+    {
+      "epoch": 0.6187702265372168,
+      "grad_norm": 0.45057111978530884,
+      "learning_rate": 0.00019969149252280446,
+      "loss": 0.8256269693374634,
+      "step": 1434
+    },
+    {
+      "epoch": 0.6196332254584682,
+      "grad_norm": 0.4589645266532898,
+      "learning_rate": 0.00019968762614349327,
+      "loss": 0.9130199551582336,
+      "step": 1436
+    },
+    {
+      "epoch": 0.6204962243797195,
+      "grad_norm": 0.48914027214050293,
+      "learning_rate": 0.00019968373572496045,
+      "loss": 0.74083012342453,
+      "step": 1438
+    },
+    {
+      "epoch": 0.6213592233009708,
+      "grad_norm": 0.4582098424434662,
+      "learning_rate": 0.00019967982126814412,
+      "loss": 0.8538379669189453,
+      "step": 1440
+    },
+    {
+      "epoch": 0.6222222222222222,
+      "grad_norm": 0.48722779750823975,
+      "learning_rate": 0.00019967588277398823,
+      "loss": 0.8780114054679871,
+      "step": 1442
+    },
+    {
+      "epoch": 0.6230852211434735,
+      "grad_norm": 0.4291327893733978,
+      "learning_rate": 0.00019967192024344254,
+      "loss": 0.8341028690338135,
+      "step": 1444
+    },
+    {
+      "epoch": 0.623948220064725,
+      "grad_norm": 0.4773139953613281,
+      "learning_rate": 0.00019966793367746265,
+      "loss": 0.8651667237281799,
+      "step": 1446
+    },
+    {
+      "epoch": 0.6248112189859762,
+      "grad_norm": 0.45556166768074036,
+      "learning_rate": 0.00019966392307700986,
+      "loss": 0.8339929580688477,
+      "step": 1448
+    },
+    {
+      "epoch": 0.6256742179072277,
+      "grad_norm": 0.5126671195030212,
+      "learning_rate": 0.00019965988844305129,
+      "loss": 0.9129340052604675,
+      "step": 1450
+    },
+    {
+      "epoch": 0.626537216828479,
+      "grad_norm": 0.6067109704017639,
+      "learning_rate": 0.00019965582977655988,
+      "loss": 0.9057610034942627,
+      "step": 1452
+    },
+    {
+      "epoch": 0.6274002157497303,
+      "grad_norm": 0.46425968408584595,
+      "learning_rate": 0.00019965174707851438,
+      "loss": 0.874100387096405,
+      "step": 1454
+    },
+    {
+      "epoch": 0.6282632146709817,
+      "grad_norm": 0.481077641248703,
+      "learning_rate": 0.0001996476403498993,
+      "loss": 0.915635347366333,
+      "step": 1456
+    },
+    {
+      "epoch": 0.629126213592233,
+      "grad_norm": 0.47299909591674805,
+      "learning_rate": 0.000199643509591705,
+      "loss": 0.9059650301933289,
+      "step": 1458
+    },
+    {
+      "epoch": 0.6299892125134844,
+      "grad_norm": 0.48924630880355835,
+      "learning_rate": 0.00019963935480492753,
+      "loss": 0.9775188565254211,
+      "step": 1460
+    },
+    {
+      "epoch": 0.6308522114347357,
+      "grad_norm": 0.4407665431499481,
+      "learning_rate": 0.0001996351759905688,
+      "loss": 0.8950685858726501,
+      "step": 1462
+    },
+    {
+      "epoch": 0.6317152103559871,
+      "grad_norm": 0.5018318295478821,
+      "learning_rate": 0.00019963097314963657,
+      "loss": 0.8532119989395142,
+      "step": 1464
+    },
+    {
+      "epoch": 0.6325782092772384,
+      "grad_norm": 0.43245720863342285,
+      "learning_rate": 0.0001996267462831443,
+      "loss": 0.7775963544845581,
+      "step": 1466
+    },
+    {
+      "epoch": 0.6334412081984897,
+      "grad_norm": 0.5028865337371826,
+      "learning_rate": 0.00019962249539211125,
+      "loss": 0.8315839767456055,
+      "step": 1468
+    },
+    {
+      "epoch": 0.6343042071197411,
+      "grad_norm": 0.4697185754776001,
+      "learning_rate": 0.0001996182204775626,
+      "loss": 0.849076509475708,
+      "step": 1470
+    },
+    {
+      "epoch": 0.6351672060409924,
+      "grad_norm": 0.46725034713745117,
+      "learning_rate": 0.00019961392154052912,
+      "loss": 0.8828577995300293,
+      "step": 1472
+    },
+    {
+      "epoch": 0.6360302049622438,
+      "grad_norm": 0.4301203489303589,
+      "learning_rate": 0.00019960959858204754,
+      "loss": 0.850115954875946,
+      "step": 1474
+    },
+    {
+      "epoch": 0.6368932038834951,
+      "grad_norm": 0.46635881066322327,
+      "learning_rate": 0.0001996052516031603,
+      "loss": 0.7912618517875671,
+      "step": 1476
+    },
+    {
+      "epoch": 0.6377562028047465,
+      "grad_norm": 0.44143620133399963,
+      "learning_rate": 0.00019960088060491565,
+      "loss": 0.9072504639625549,
+      "step": 1478
+    },
+    {
+      "epoch": 0.6386192017259978,
+      "grad_norm": 0.47458893060684204,
+      "learning_rate": 0.00019959648558836763,
+      "loss": 0.8976638317108154,
+      "step": 1480
+    },
+    {
+      "epoch": 0.6394822006472491,
+      "grad_norm": 0.4596816897392273,
+      "learning_rate": 0.00019959206655457612,
+      "loss": 0.8142043352127075,
+      "step": 1482
+    },
+    {
+      "epoch": 0.6403451995685006,
+      "grad_norm": 0.4839977025985718,
+      "learning_rate": 0.0001995876235046067,
+      "loss": 0.860643744468689,
+      "step": 1484
+    },
+    {
+      "epoch": 0.6412081984897519,
+      "grad_norm": 0.3542814552783966,
+      "learning_rate": 0.00019958315643953085,
+      "loss": 0.7586524486541748,
+      "step": 1486
+    },
+    {
+      "epoch": 0.6420711974110033,
+      "grad_norm": 0.5423269271850586,
+      "learning_rate": 0.00019957866536042572,
+      "loss": 0.9353570342063904,
+      "step": 1488
+    },
+    {
+      "epoch": 0.6429341963322546,
+      "grad_norm": 0.4580909013748169,
+      "learning_rate": 0.00019957415026837437,
+      "loss": 0.9919291138648987,
+      "step": 1490
+    },
+    {
+      "epoch": 0.643797195253506,
+      "grad_norm": 0.4211732745170593,
+      "learning_rate": 0.00019956961116446555,
+      "loss": 0.8720914125442505,
+      "step": 1492
+    },
+    {
+      "epoch": 0.6446601941747573,
+      "grad_norm": 0.4583161175251007,
+      "learning_rate": 0.00019956504804979384,
+      "loss": 0.8661212921142578,
+      "step": 1494
+    },
+    {
+      "epoch": 0.6455231930960086,
+      "grad_norm": 0.4359884262084961,
+      "learning_rate": 0.00019956046092545966,
+      "loss": 0.8170996308326721,
+      "step": 1496
+    },
+    {
+      "epoch": 0.64638619201726,
+      "grad_norm": 0.4642556607723236,
+      "learning_rate": 0.00019955584979256913,
+      "loss": 0.8607422113418579,
+      "step": 1498
+    },
+    {
+      "epoch": 0.6472491909385113,
+      "grad_norm": 0.4496007561683655,
+      "learning_rate": 0.00019955121465223426,
+      "loss": 0.837529182434082,
+      "step": 1500
+    },
+    {
+      "epoch": 0.6472491909385113,
+      "eval_loss": 0.8972997665405273,
+      "eval_runtime": 649.796,
+      "eval_samples_per_second": 3.17,
+      "eval_steps_per_second": 3.17,
+      "step": 1500
+    },
+    {
+      "epoch": 0.6481121898597627,
+      "grad_norm": 0.48363205790519714,
+      "learning_rate": 0.0001995465555055727,
+      "loss": 0.9355250000953674,
+      "step": 1502
+    },
+    {
+      "epoch": 0.648975188781014,
+      "grad_norm": 0.44681695103645325,
+      "learning_rate": 0.0001995418723537081,
+      "loss": 0.8286949396133423,
+      "step": 1504
+    },
+    {
+      "epoch": 0.6498381877022654,
+      "grad_norm": 0.5110394358634949,
+      "learning_rate": 0.00019953716519776967,
+      "loss": 0.890398383140564,
+      "step": 1506
+    },
+    {
+      "epoch": 0.6507011866235167,
+      "grad_norm": 0.4353160262107849,
+      "learning_rate": 0.00019953243403889257,
+      "loss": 0.8117311000823975,
+      "step": 1508
+    },
+    {
+      "epoch": 0.651564185544768,
+      "grad_norm": 0.4731789231300354,
+      "learning_rate": 0.0001995276788782177,
+      "loss": 0.8255904316902161,
+      "step": 1510
+    },
+    {
+      "epoch": 0.6524271844660194,
+      "grad_norm": 0.5447185039520264,
+      "learning_rate": 0.00019952289971689177,
+      "loss": 0.9371263384819031,
+      "step": 1512
+    },
+    {
+      "epoch": 0.6532901833872707,
+      "grad_norm": 0.47616517543792725,
+      "learning_rate": 0.0001995180965560672,
+      "loss": 0.8532910943031311,
+      "step": 1514
+    },
+    {
+      "epoch": 0.6541531823085222,
+      "grad_norm": 0.4412213861942291,
+      "learning_rate": 0.0001995132693969023,
+      "loss": 0.8799141645431519,
+      "step": 1516
+    },
+    {
+      "epoch": 0.6550161812297735,
+      "grad_norm": 0.469911128282547,
+      "learning_rate": 0.00019950841824056107,
+      "loss": 0.8395764827728271,
+      "step": 1518
+    },
+    {
+      "epoch": 0.6558791801510249,
+      "grad_norm": 0.5236243009567261,
+      "learning_rate": 0.00019950354308821336,
+      "loss": 0.9556697010993958,
+      "step": 1520
+    },
+    {
+      "epoch": 0.6567421790722762,
+      "grad_norm": 0.4441990256309509,
+      "learning_rate": 0.00019949864394103482,
+      "loss": 0.8524283170700073,
+      "step": 1522
+    },
+    {
+      "epoch": 0.6576051779935275,
+      "grad_norm": 0.4173077940940857,
+      "learning_rate": 0.00019949372080020682,
+      "loss": 0.9213772416114807,
+      "step": 1524
+    },
+    {
+      "epoch": 0.6584681769147789,
+      "grad_norm": 0.4404120147228241,
+      "learning_rate": 0.00019948877366691658,
+      "loss": 0.9236897230148315,
+      "step": 1526
+    },
+    {
+      "epoch": 0.6593311758360302,
+      "grad_norm": 0.4586230516433716,
+      "learning_rate": 0.00019948380254235706,
+      "loss": 0.8373230695724487,
+      "step": 1528
+    },
+    {
+      "epoch": 0.6601941747572816,
+      "grad_norm": 0.4136028587818146,
+      "learning_rate": 0.00019947880742772703,
+      "loss": 0.8256528377532959,
+      "step": 1530
+    },
+    {
+      "epoch": 0.6610571736785329,
+      "grad_norm": 0.5007129907608032,
+      "learning_rate": 0.00019947378832423107,
+      "loss": 0.8580789566040039,
+      "step": 1532
+    },
+    {
+      "epoch": 0.6619201725997842,
+      "grad_norm": 0.39050203561782837,
+      "learning_rate": 0.00019946874523307947,
+      "loss": 0.7647744417190552,
+      "step": 1534
+    },
+    {
+      "epoch": 0.6627831715210356,
+      "grad_norm": 0.4640588164329529,
+      "learning_rate": 0.00019946367815548835,
+      "loss": 0.9042545557022095,
+      "step": 1536
+    },
+    {
+      "epoch": 0.6636461704422869,
+      "grad_norm": 0.5304957032203674,
+      "learning_rate": 0.00019945858709267963,
+      "loss": 0.9114110469818115,
+      "step": 1538
+    },
+    {
+      "epoch": 0.6645091693635383,
+      "grad_norm": 0.5426004528999329,
+      "learning_rate": 0.000199453472045881,
+      "loss": 0.8239460587501526,
+      "step": 1540
+    },
+    {
+      "epoch": 0.6653721682847896,
+      "grad_norm": 0.44893568754196167,
+      "learning_rate": 0.00019944833301632593,
+      "loss": 0.8091367483139038,
+      "step": 1542
+    },
+    {
+      "epoch": 0.666235167206041,
+      "grad_norm": 0.4294016361236572,
+      "learning_rate": 0.00019944317000525366,
+      "loss": 0.9202280640602112,
+      "step": 1544
+    },
+    {
+      "epoch": 0.6670981661272923,
+      "grad_norm": 0.449633926153183,
+      "learning_rate": 0.00019943798301390927,
+      "loss": 0.8884767889976501,
+      "step": 1546
+    },
+    {
+      "epoch": 0.6679611650485436,
+      "grad_norm": 0.4516827166080475,
+      "learning_rate": 0.0001994327720435435,
+      "loss": 0.8390879034996033,
+      "step": 1548
+    },
+    {
+      "epoch": 0.668824163969795,
+      "grad_norm": 0.422270268201828,
+      "learning_rate": 0.000199427537095413,
+      "loss": 0.7388033270835876,
+      "step": 1550
+    },
+    {
+      "epoch": 0.6696871628910464,
+      "grad_norm": 0.580563485622406,
+      "learning_rate": 0.00019942227817078015,
+      "loss": 0.9268350601196289,
+      "step": 1552
+    },
+    {
+      "epoch": 0.6705501618122978,
+      "grad_norm": 0.4436347782611847,
+      "learning_rate": 0.00019941699527091316,
+      "loss": 0.7978561520576477,
+      "step": 1554
+    },
+    {
+      "epoch": 0.6714131607335491,
+      "grad_norm": 0.4149787127971649,
+      "learning_rate": 0.0001994116883970859,
+      "loss": 0.8229286670684814,
+      "step": 1556
+    },
+    {
+      "epoch": 0.6722761596548005,
+      "grad_norm": 0.49915504455566406,
+      "learning_rate": 0.00019940635755057813,
+      "loss": 0.8554545640945435,
+      "step": 1558
+    },
+    {
+      "epoch": 0.6731391585760518,
+      "grad_norm": 0.45326656103134155,
+      "learning_rate": 0.00019940100273267537,
+      "loss": 0.9135572910308838,
+      "step": 1560
+    },
+    {
+      "epoch": 0.6740021574973031,
+      "grad_norm": 0.48639237880706787,
+      "learning_rate": 0.0001993956239446689,
+      "loss": 0.7769742012023926,
+      "step": 1562
+    },
+    {
+      "epoch": 0.6748651564185545,
+      "grad_norm": 0.5072791576385498,
+      "learning_rate": 0.0001993902211878558,
+      "loss": 0.9626237154006958,
+      "step": 1564
+    },
+    {
+      "epoch": 0.6757281553398058,
+      "grad_norm": 0.4646652638912201,
+      "learning_rate": 0.00019938479446353892,
+      "loss": 0.8506941199302673,
+      "step": 1566
+    },
+    {
+      "epoch": 0.6765911542610572,
+      "grad_norm": 0.4343051314353943,
+      "learning_rate": 0.00019937934377302688,
+      "loss": 0.8172947764396667,
+      "step": 1568
+    },
+    {
+      "epoch": 0.6774541531823085,
+      "grad_norm": 0.46890193223953247,
+      "learning_rate": 0.00019937386911763407,
+      "loss": 0.9215856790542603,
+      "step": 1570
+    },
+    {
+      "epoch": 0.6783171521035599,
+      "grad_norm": 0.5121113061904907,
+      "learning_rate": 0.0001993683704986807,
+      "loss": 0.8099892139434814,
+      "step": 1572
+    },
+    {
+      "epoch": 0.6791801510248112,
+      "grad_norm": 0.4652405083179474,
+      "learning_rate": 0.0001993628479174928,
+      "loss": 0.8675104975700378,
+      "step": 1574
+    },
+    {
+      "epoch": 0.6800431499460625,
+      "grad_norm": 0.4599422812461853,
+      "learning_rate": 0.00019935730137540198,
+      "loss": 0.7938929200172424,
+      "step": 1576
+    },
+    {
+      "epoch": 0.6809061488673139,
+      "grad_norm": 0.4738059937953949,
+      "learning_rate": 0.0001993517308737459,
+      "loss": 0.8610570430755615,
+      "step": 1578
+    },
+    {
+      "epoch": 0.6817691477885652,
+      "grad_norm": 0.5161214470863342,
+      "learning_rate": 0.00019934613641386776,
+      "loss": 0.9199413657188416,
+      "step": 1580
+    },
+    {
+      "epoch": 0.6826321467098166,
+      "grad_norm": 0.4284999370574951,
+      "learning_rate": 0.00019934051799711672,
+      "loss": 0.771649181842804,
+      "step": 1582
+    },
+    {
+      "epoch": 0.683495145631068,
+      "grad_norm": 0.5117548704147339,
+      "learning_rate": 0.00019933487562484757,
+      "loss": 0.8861327767372131,
+      "step": 1584
+    },
+    {
+      "epoch": 0.6843581445523194,
+      "grad_norm": 0.4964369833469391,
+      "learning_rate": 0.00019932920929842095,
+      "loss": 0.806983232498169,
+      "step": 1586
+    },
+    {
+      "epoch": 0.6852211434735707,
+      "grad_norm": 0.4699532091617584,
+      "learning_rate": 0.00019932351901920327,
+      "loss": 0.7963525652885437,
+      "step": 1588
+    },
+    {
+      "epoch": 0.686084142394822,
+      "grad_norm": 0.5329220294952393,
+      "learning_rate": 0.00019931780478856678,
+      "loss": 0.9406430721282959,
+      "step": 1590
+    },
+    {
+      "epoch": 0.6869471413160734,
+      "grad_norm": 0.49823835492134094,
+      "learning_rate": 0.00019931206660788936,
+      "loss": 0.8517770171165466,
+      "step": 1592
+    },
+    {
+      "epoch": 0.6878101402373247,
+      "grad_norm": 0.45245134830474854,
+      "learning_rate": 0.00019930630447855482,
+      "loss": 0.8703644275665283,
+      "step": 1594
+    },
+    {
+      "epoch": 0.6886731391585761,
+      "grad_norm": 0.47524577379226685,
+      "learning_rate": 0.0001993005184019526,
+      "loss": 0.9035283327102661,
+      "step": 1596
+    },
+    {
+      "epoch": 0.6895361380798274,
+      "grad_norm": 0.4537610411643982,
+      "learning_rate": 0.00019929470837947802,
+      "loss": 0.9173959493637085,
+      "step": 1598
+    },
+    {
+      "epoch": 0.6903991370010788,
+      "grad_norm": 0.42469722032546997,
+      "learning_rate": 0.00019928887441253212,
+      "loss": 0.8573579788208008,
+      "step": 1600
+    },
+    {
+      "epoch": 0.6903991370010788,
+      "eval_loss": 0.8880587220191956,
+      "eval_runtime": 653.9515,
+      "eval_samples_per_second": 3.15,
+      "eval_steps_per_second": 3.15,
+      "step": 1600
+    },
+    {
+      "epoch": 0.6912621359223301,
+      "grad_norm": 0.4388251304626465,
+      "learning_rate": 0.00019928301650252176,
+      "loss": 0.849348247051239,
+      "step": 1602
+    },
+    {
+      "epoch": 0.6921251348435814,
+      "grad_norm": 0.46086886525154114,
+      "learning_rate": 0.00019927713465085956,
+      "loss": 0.8298451900482178,
+      "step": 1604
+    },
+    {
+      "epoch": 0.6929881337648328,
+      "grad_norm": 0.42972785234451294,
+      "learning_rate": 0.00019927122885896387,
+      "loss": 0.8860712647438049,
+      "step": 1606
+    },
+    {
+      "epoch": 0.6938511326860841,
+      "grad_norm": 0.43009471893310547,
+      "learning_rate": 0.00019926529912825888,
+      "loss": 0.7972728610038757,
+      "step": 1608
+    },
+    {
+      "epoch": 0.6947141316073355,
+      "grad_norm": 0.3705308437347412,
+      "learning_rate": 0.00019925934546017446,
+      "loss": 0.8661653995513916,
+      "step": 1610
+    },
+    {
+      "epoch": 0.6955771305285868,
+      "grad_norm": 0.405208945274353,
+      "learning_rate": 0.00019925336785614635,
+      "loss": 0.8350111246109009,
+      "step": 1612
+    },
+    {
+      "epoch": 0.6964401294498382,
+      "grad_norm": 0.4773033857345581,
+      "learning_rate": 0.00019924736631761602,
+      "loss": 0.7920925617218018,
+      "step": 1614
+    },
+    {
+      "epoch": 0.6973031283710895,
+      "grad_norm": 0.4682428240776062,
+      "learning_rate": 0.00019924134084603075,
+      "loss": 0.8644304871559143,
+      "step": 1616
+    },
+    {
+      "epoch": 0.6981661272923408,
+      "grad_norm": 0.5694834589958191,
+      "learning_rate": 0.00019923529144284346,
+      "loss": 0.9897904992103577,
+      "step": 1618
+    },
+    {
+      "epoch": 0.6990291262135923,
+      "grad_norm": 0.40137484669685364,
+      "learning_rate": 0.00019922921810951302,
+      "loss": 0.6910083293914795,
+      "step": 1620
+    },
+    {
+      "epoch": 0.6998921251348436,
+      "grad_norm": 0.42076537013053894,
+      "learning_rate": 0.000199223120847504,
+      "loss": 0.8295826315879822,
+      "step": 1622
+    },
+    {
+      "epoch": 0.700755124056095,
+      "grad_norm": 0.4473017752170563,
+      "learning_rate": 0.00019921699965828662,
+      "loss": 0.820871889591217,
+      "step": 1624
+    },
+    {
+      "epoch": 0.7016181229773463,
+      "grad_norm": 0.43914029002189636,
+      "learning_rate": 0.00019921085454333706,
+      "loss": 0.8319019079208374,
+      "step": 1626
+    },
+    {
+      "epoch": 0.7024811218985976,
+      "grad_norm": 0.4758487939834595,
+      "learning_rate": 0.0001992046855041372,
+      "loss": 0.8589251041412354,
+      "step": 1628
+    },
+    {
+      "epoch": 0.703344120819849,
+      "grad_norm": 0.506401538848877,
+      "learning_rate": 0.00019919849254217465,
+      "loss": 0.9219205975532532,
+      "step": 1630
+    },
+    {
+      "epoch": 0.7042071197411003,
+      "grad_norm": 0.4397984445095062,
+      "learning_rate": 0.00019919227565894277,
+      "loss": 0.7824978232383728,
+      "step": 1632
+    },
+    {
+      "epoch": 0.7050701186623517,
+      "grad_norm": 0.3879252076148987,
+      "learning_rate": 0.0001991860348559408,
+      "loss": 0.8472069501876831,
+      "step": 1634
+    },
+    {
+      "epoch": 0.705933117583603,
+      "grad_norm": 0.42238810658454895,
+      "learning_rate": 0.00019917977013467368,
+      "loss": 0.824957013130188,
+      "step": 1636
+    },
+    {
+      "epoch": 0.7067961165048544,
+      "grad_norm": 0.5235037207603455,
+      "learning_rate": 0.00019917348149665206,
+      "loss": 0.9490993022918701,
+      "step": 1638
+    },
+    {
+      "epoch": 0.7076591154261057,
+      "grad_norm": 0.5195287466049194,
+      "learning_rate": 0.0001991671689433925,
+      "loss": 0.960905909538269,
+      "step": 1640
+    },
+    {
+      "epoch": 0.708522114347357,
+      "grad_norm": 0.5016481876373291,
+      "learning_rate": 0.00019916083247641716,
+      "loss": 0.8961218595504761,
+      "step": 1642
+    },
+    {
+      "epoch": 0.7093851132686084,
+      "grad_norm": 0.5510191321372986,
+      "learning_rate": 0.00019915447209725408,
+      "loss": 0.8883417844772339,
+      "step": 1644
+    },
+    {
+      "epoch": 0.7102481121898597,
+      "grad_norm": 0.4492250084877014,
+      "learning_rate": 0.0001991480878074371,
+      "loss": 0.7968636751174927,
+      "step": 1646
+    },
+    {
+      "epoch": 0.7111111111111111,
+      "grad_norm": 0.4189683496952057,
+      "learning_rate": 0.00019914167960850563,
+      "loss": 0.7869221568107605,
+      "step": 1648
+    },
+    {
+      "epoch": 0.7119741100323624,
+      "grad_norm": 0.4590536653995514,
+      "learning_rate": 0.0001991352475020051,
+      "loss": 0.8423646092414856,
+      "step": 1650
+    },
+    {
+      "epoch": 0.7128371089536139,
+      "grad_norm": 0.5058966875076294,
+      "learning_rate": 0.00019912879148948652,
+      "loss": 0.895459771156311,
+      "step": 1652
+    },
+    {
+      "epoch": 0.7137001078748652,
+      "grad_norm": 0.4904550313949585,
+      "learning_rate": 0.00019912231157250676,
+      "loss": 0.8737894296646118,
+      "step": 1654
+    },
+    {
+      "epoch": 0.7145631067961165,
+      "grad_norm": 0.4670710861682892,
+      "learning_rate": 0.0001991158077526284,
+      "loss": 0.8019732236862183,
+      "step": 1656
+    },
+    {
+      "epoch": 0.7154261057173679,
+      "grad_norm": 0.530343770980835,
+      "learning_rate": 0.00019910928003141984,
+      "loss": 0.9474499821662903,
+      "step": 1658
+    },
+    {
+      "epoch": 0.7162891046386192,
+      "grad_norm": 0.4250960052013397,
+      "learning_rate": 0.00019910272841045518,
+      "loss": 0.8738601803779602,
+      "step": 1660
+    },
+    {
+      "epoch": 0.7171521035598706,
+      "grad_norm": 0.4896513819694519,
+      "learning_rate": 0.0001990961528913143,
+      "loss": 0.9578261971473694,
+      "step": 1662
+    },
+    {
+      "epoch": 0.7180151024811219,
+      "grad_norm": 0.4999626576900482,
+      "learning_rate": 0.00019908955347558291,
+      "loss": 0.8116445541381836,
+      "step": 1664
+    },
+    {
+      "epoch": 0.7188781014023733,
+      "grad_norm": 0.4335242807865143,
+      "learning_rate": 0.00019908293016485237,
+      "loss": 0.8783043622970581,
+      "step": 1666
+    },
+    {
+      "epoch": 0.7197411003236246,
+      "grad_norm": 0.43542763590812683,
+      "learning_rate": 0.00019907628296071992,
+      "loss": 0.8223029375076294,
+      "step": 1668
+    },
+    {
+      "epoch": 0.7206040992448759,
+      "grad_norm": 0.4907461702823639,
+      "learning_rate": 0.00019906961186478842,
+      "loss": 1.0053197145462036,
+      "step": 1670
+    },
+    {
+      "epoch": 0.7214670981661273,
+      "grad_norm": 0.4054848253726959,
+      "learning_rate": 0.00019906291687866667,
+      "loss": 0.8107786178588867,
+      "step": 1672
+    },
+    {
+      "epoch": 0.7223300970873786,
+      "grad_norm": 0.3818599283695221,
+      "learning_rate": 0.0001990561980039691,
+      "loss": 0.780781626701355,
+      "step": 1674
+    },
+    {
+      "epoch": 0.72319309600863,
+      "grad_norm": 0.4128594994544983,
+      "learning_rate": 0.00019904945524231587,
+      "loss": 0.8189221620559692,
+      "step": 1676
+    },
+    {
+      "epoch": 0.7240560949298813,
+      "grad_norm": 0.46902593970298767,
+      "learning_rate": 0.0001990426885953331,
+      "loss": 0.83652263879776,
+      "step": 1678
+    },
+    {
+      "epoch": 0.7249190938511327,
+      "grad_norm": 0.49305564165115356,
+      "learning_rate": 0.00019903589806465242,
+      "loss": 0.8057956695556641,
+      "step": 1680
+    },
+    {
+      "epoch": 0.725782092772384,
+      "grad_norm": 0.44408300518989563,
+      "learning_rate": 0.0001990290836519114,
+      "loss": 0.8523716926574707,
+      "step": 1682
+    },
+    {
+      "epoch": 0.7266450916936353,
+      "grad_norm": 0.5211108922958374,
+      "learning_rate": 0.00019902224535875326,
+      "loss": 0.9179236888885498,
+      "step": 1684
+    },
+    {
+      "epoch": 0.7275080906148867,
+      "grad_norm": 0.4736526608467102,
+      "learning_rate": 0.00019901538318682705,
+      "loss": 0.8229476809501648,
+      "step": 1686
+    },
+    {
+      "epoch": 0.728371089536138,
+      "grad_norm": 0.541716992855072,
+      "learning_rate": 0.00019900849713778756,
+      "loss": 0.924200713634491,
+      "step": 1688
+    },
+    {
+      "epoch": 0.7292340884573895,
+      "grad_norm": 0.4524400532245636,
+      "learning_rate": 0.00019900158721329532,
+      "loss": 0.88961261510849,
+      "step": 1690
+    },
+    {
+      "epoch": 0.7300970873786408,
+      "grad_norm": 0.45256128907203674,
+      "learning_rate": 0.00019899465341501662,
+      "loss": 0.8491015434265137,
+      "step": 1692
+    },
+    {
+      "epoch": 0.7309600862998922,
+      "grad_norm": 0.5346773266792297,
+      "learning_rate": 0.0001989876957446235,
+      "loss": 0.8833339810371399,
+      "step": 1694
+    },
+    {
+      "epoch": 0.7318230852211435,
+      "grad_norm": 0.4696357846260071,
+      "learning_rate": 0.0001989807142037938,
+      "loss": 0.8535294532775879,
+      "step": 1696
+    },
+    {
+      "epoch": 0.7326860841423948,
+      "grad_norm": 0.4304637908935547,
+      "learning_rate": 0.0001989737087942111,
+      "loss": 0.8273076415061951,
+      "step": 1698
+    },
+    {
+      "epoch": 0.7335490830636462,
+      "grad_norm": 0.5085629224777222,
+      "learning_rate": 0.00019896667951756466,
+      "loss": 0.8759240508079529,
+      "step": 1700
+    },
+    {
+      "epoch": 0.7335490830636462,
+      "eval_loss": 0.879119336605072,
+      "eval_runtime": 663.0553,
+      "eval_samples_per_second": 3.107,
+      "eval_steps_per_second": 3.107,
+      "step": 1700
+    },
+    {
+      "epoch": 0.7344120819848975,
+      "grad_norm": 0.484223872423172,
+      "learning_rate": 0.00019895962637554964,
+      "loss": 0.852645218372345,
+      "step": 1702
+    },
+    {
+      "epoch": 0.7352750809061489,
+      "grad_norm": 0.4819294214248657,
+      "learning_rate": 0.0001989525493698668,
+      "loss": 0.878247857093811,
+      "step": 1704
+    },
+    {
+      "epoch": 0.7361380798274002,
+      "grad_norm": 0.4170311689376831,
+      "learning_rate": 0.00019894544850222276,
+      "loss": 0.8964285254478455,
+      "step": 1706
+    },
+    {
+      "epoch": 0.7370010787486516,
+      "grad_norm": 0.42712801694869995,
+      "learning_rate": 0.0001989383237743299,
+      "loss": 0.7479548454284668,
+      "step": 1708
+    },
+    {
+      "epoch": 0.7378640776699029,
+      "grad_norm": 0.5051686763763428,
+      "learning_rate": 0.00019893117518790624,
+      "loss": 0.8190052509307861,
+      "step": 1710
+    },
+    {
+      "epoch": 0.7387270765911542,
+      "grad_norm": 0.44053253531455994,
+      "learning_rate": 0.0001989240027446757,
+      "loss": 0.8646742105484009,
+      "step": 1712
+    },
+    {
+      "epoch": 0.7395900755124056,
+      "grad_norm": 0.4451025724411011,
+      "learning_rate": 0.00019891680644636782,
+      "loss": 0.874261736869812,
+      "step": 1714
+    },
+    {
+      "epoch": 0.7404530744336569,
+      "grad_norm": 0.4590521454811096,
+      "learning_rate": 0.00019890958629471798,
+      "loss": 0.8892465233802795,
+      "step": 1716
+    },
+    {
+      "epoch": 0.7413160733549083,
+      "grad_norm": 0.39169448614120483,
+      "learning_rate": 0.00019890234229146732,
+      "loss": 0.7031586766242981,
+      "step": 1718
+    },
+    {
+      "epoch": 0.7421790722761596,
+      "grad_norm": 0.46946024894714355,
+      "learning_rate": 0.00019889507443836266,
+      "loss": 0.8548433184623718,
+      "step": 1720
+    },
+    {
+      "epoch": 0.7430420711974111,
+      "grad_norm": 0.42404699325561523,
+      "learning_rate": 0.0001988877827371566,
+      "loss": 0.8231223821640015,
+      "step": 1722
+    },
+    {
+      "epoch": 0.7439050701186624,
+      "grad_norm": 0.40419483184814453,
+      "learning_rate": 0.00019888046718960755,
+      "loss": 0.8443762063980103,
+      "step": 1724
+    },
+    {
+      "epoch": 0.7447680690399137,
+      "grad_norm": 0.4550437927246094,
+      "learning_rate": 0.0001988731277974796,
+      "loss": 0.8787111639976501,
+      "step": 1726
+    },
+    {
+      "epoch": 0.7456310679611651,
+      "grad_norm": 0.42264053225517273,
+      "learning_rate": 0.0001988657645625426,
+      "loss": 0.8440850377082825,
+      "step": 1728
+    },
+    {
+      "epoch": 0.7464940668824164,
+      "grad_norm": 0.4638359844684601,
+      "learning_rate": 0.0001988583774865721,
+      "loss": 0.839216947555542,
+      "step": 1730
+    },
+    {
+      "epoch": 0.7473570658036678,
+      "grad_norm": 0.42644429206848145,
+      "learning_rate": 0.0001988509665713496,
+      "loss": 0.8011161684989929,
+      "step": 1732
+    },
+    {
+      "epoch": 0.7482200647249191,
+      "grad_norm": 0.36932024359703064,
+      "learning_rate": 0.0001988435318186621,
+      "loss": 0.850246787071228,
+      "step": 1734
+    },
+    {
+      "epoch": 0.7490830636461704,
+      "grad_norm": 0.4771935045719147,
+      "learning_rate": 0.00019883607323030252,
+      "loss": 0.7782483100891113,
+      "step": 1736
+    },
+    {
+      "epoch": 0.7499460625674218,
+      "grad_norm": 0.40007370710372925,
+      "learning_rate": 0.00019882859080806942,
+      "loss": 0.8337594866752625,
+      "step": 1738
+    },
+    {
+      "epoch": 0.7508090614886731,
+      "grad_norm": 0.5007418394088745,
+      "learning_rate": 0.00019882108455376716,
+      "loss": 0.8287386894226074,
+      "step": 1740
+    },
+    {
+      "epoch": 0.7516720604099245,
+      "grad_norm": 0.43999138474464417,
+      "learning_rate": 0.00019881355446920584,
+      "loss": 0.8655616044998169,
+      "step": 1742
+    },
+    {
+      "epoch": 0.7525350593311758,
+      "grad_norm": 0.5115824937820435,
+      "learning_rate": 0.00019880600055620135,
+      "loss": 0.8695262670516968,
+      "step": 1744
+    },
+    {
+      "epoch": 0.7533980582524272,
+      "grad_norm": 0.5035707950592041,
+      "learning_rate": 0.0001987984228165752,
+      "loss": 0.9207013845443726,
+      "step": 1746
+    },
+    {
+      "epoch": 0.7542610571736785,
+      "grad_norm": 0.4689575731754303,
+      "learning_rate": 0.0001987908212521548,
+      "loss": 0.8798729777336121,
+      "step": 1748
+    },
+    {
+      "epoch": 0.7551240560949298,
+      "grad_norm": 0.4730616509914398,
+      "learning_rate": 0.00019878319586477322,
+      "loss": 0.7737767696380615,
+      "step": 1750
+    },
+    {
+      "epoch": 0.7559870550161812,
+      "grad_norm": 0.49012845754623413,
+      "learning_rate": 0.00019877554665626926,
+      "loss": 0.929466187953949,
+      "step": 1752
+    },
+    {
+      "epoch": 0.7568500539374325,
+      "grad_norm": 0.43468761444091797,
+      "learning_rate": 0.0001987678736284875,
+      "loss": 0.8155670166015625,
+      "step": 1754
+    },
+    {
+      "epoch": 0.757713052858684,
+      "grad_norm": 0.507399320602417,
+      "learning_rate": 0.00019876017678327826,
+      "loss": 0.8082395195960999,
+      "step": 1756
+    },
+    {
+      "epoch": 0.7585760517799353,
+      "grad_norm": 0.4733552634716034,
+      "learning_rate": 0.0001987524561224976,
+      "loss": 0.8905934691429138,
+      "step": 1758
+    },
+    {
+      "epoch": 0.7594390507011867,
+      "grad_norm": 0.4670012891292572,
+      "learning_rate": 0.00019874471164800733,
+      "loss": 0.8794633746147156,
+      "step": 1760
+    },
+    {
+      "epoch": 0.760302049622438,
+      "grad_norm": 0.4951624572277069,
+      "learning_rate": 0.000198736943361675,
+      "loss": 0.8413973450660706,
+      "step": 1762
+    },
+    {
+      "epoch": 0.7611650485436893,
+      "grad_norm": 0.5478648543357849,
+      "learning_rate": 0.00019872915126537387,
+      "loss": 0.9067897200584412,
+      "step": 1764
+    },
+    {
+      "epoch": 0.7620280474649407,
+      "grad_norm": 0.48215776681900024,
+      "learning_rate": 0.000198721335360983,
+      "loss": 0.8932394981384277,
+      "step": 1766
+    },
+    {
+      "epoch": 0.762891046386192,
+      "grad_norm": 0.4688864052295685,
+      "learning_rate": 0.00019871349565038715,
+      "loss": 0.8496726751327515,
+      "step": 1768
+    },
+    {
+      "epoch": 0.7637540453074434,
+      "grad_norm": 0.4728260636329651,
+      "learning_rate": 0.0001987056321354768,
+      "loss": 0.9232800602912903,
+      "step": 1770
+    },
+    {
+      "epoch": 0.7646170442286947,
+      "grad_norm": 0.44501692056655884,
+      "learning_rate": 0.00019869774481814828,
+      "loss": 0.849755585193634,
+      "step": 1772
+    },
+    {
+      "epoch": 0.7654800431499461,
+      "grad_norm": 0.4189201593399048,
+      "learning_rate": 0.00019868983370030348,
+      "loss": 0.8258485794067383,
+      "step": 1774
+    },
+    {
+      "epoch": 0.7663430420711974,
+      "grad_norm": 0.5144591927528381,
+      "learning_rate": 0.00019868189878385016,
+      "loss": 0.8762873411178589,
+      "step": 1776
+    },
+    {
+      "epoch": 0.7672060409924487,
+      "grad_norm": 0.5048011541366577,
+      "learning_rate": 0.00019867394007070188,
+      "loss": 0.8732464909553528,
+      "step": 1778
+    },
+    {
+      "epoch": 0.7680690399137001,
+      "grad_norm": 0.41639819741249084,
+      "learning_rate": 0.00019866595756277774,
+      "loss": 0.8732751607894897,
+      "step": 1780
+    },
+    {
+      "epoch": 0.7689320388349514,
+      "grad_norm": 0.526757538318634,
+      "learning_rate": 0.00019865795126200271,
+      "loss": 0.8453729748725891,
+      "step": 1782
+    },
+    {
+      "epoch": 0.7697950377562028,
+      "grad_norm": 0.47041091322898865,
+      "learning_rate": 0.0001986499211703075,
+      "loss": 0.8780192732810974,
+      "step": 1784
+    },
+    {
+      "epoch": 0.7706580366774541,
+      "grad_norm": 0.4535890221595764,
+      "learning_rate": 0.0001986418672896285,
+      "loss": 0.8508450388908386,
+      "step": 1786
+    },
+    {
+      "epoch": 0.7715210355987056,
+      "grad_norm": 0.4608050286769867,
+      "learning_rate": 0.00019863378962190788,
+      "loss": 0.822467565536499,
+      "step": 1788
+    },
+    {
+      "epoch": 0.7723840345199569,
+      "grad_norm": 0.5190523862838745,
+      "learning_rate": 0.00019862568816909356,
+      "loss": 0.844614565372467,
+      "step": 1790
+    },
+    {
+      "epoch": 0.7732470334412082,
+      "grad_norm": 0.42502254247665405,
+      "learning_rate": 0.00019861756293313912,
+      "loss": 0.8144394755363464,
+      "step": 1792
+    },
+    {
+      "epoch": 0.7741100323624596,
+      "grad_norm": 0.47112616896629333,
+      "learning_rate": 0.000198609413916004,
+      "loss": 0.8836341500282288,
+      "step": 1794
+    },
+    {
+      "epoch": 0.7749730312837109,
+      "grad_norm": 0.48414838314056396,
+      "learning_rate": 0.0001986012411196532,
+      "loss": 0.8846262693405151,
+      "step": 1796
+    },
+    {
+      "epoch": 0.7758360302049623,
+      "grad_norm": 0.4670039415359497,
+      "learning_rate": 0.00019859304454605763,
+      "loss": 0.7993118762969971,
+      "step": 1798
+    },
+    {
+      "epoch": 0.7766990291262136,
+      "grad_norm": 0.41939061880111694,
+      "learning_rate": 0.0001985848241971938,
+      "loss": 0.8389407396316528,
+      "step": 1800
+    },
+    {
+      "epoch": 0.7766990291262136,
+      "eval_loss": 0.8727664947509766,
+      "eval_runtime": 668.2062,
+      "eval_samples_per_second": 3.083,
+      "eval_steps_per_second": 3.083,
+      "step": 1800
+    },
+    {
+      "epoch": 0.777562028047465,
+      "grad_norm": 0.4566517174243927,
+      "learning_rate": 0.00019857658007504405,
+      "loss": 0.8824291825294495,
+      "step": 1802
+    },
+    {
+      "epoch": 0.7784250269687163,
+      "grad_norm": 0.4798925518989563,
+      "learning_rate": 0.0001985683121815964,
+      "loss": 0.808982789516449,
+      "step": 1804
+    },
+    {
+      "epoch": 0.7792880258899676,
+      "grad_norm": 0.4659746587276459,
+      "learning_rate": 0.00019856002051884462,
+      "loss": 0.955269992351532,
+      "step": 1806
+    },
+    {
+      "epoch": 0.780151024811219,
+      "grad_norm": 0.48573991656303406,
+      "learning_rate": 0.00019855170508878818,
+      "loss": 0.8142994046211243,
+      "step": 1808
+    },
+    {
+      "epoch": 0.7810140237324703,
+      "grad_norm": 0.4700213670730591,
+      "learning_rate": 0.00019854336589343236,
+      "loss": 0.8755695819854736,
+      "step": 1810
+    },
+    {
+      "epoch": 0.7818770226537217,
+      "grad_norm": 0.42113450169563293,
+      "learning_rate": 0.00019853500293478806,
+      "loss": 0.8098483681678772,
+      "step": 1812
+    },
+    {
+      "epoch": 0.782740021574973,
+      "grad_norm": 0.4218153655529022,
+      "learning_rate": 0.00019852661621487205,
+      "loss": 0.8219783306121826,
+      "step": 1814
+    },
+    {
+      "epoch": 0.7836030204962244,
+      "grad_norm": 0.499052494764328,
+      "learning_rate": 0.00019851820573570664,
+      "loss": 0.8314159512519836,
+      "step": 1816
+    },
+    {
+      "epoch": 0.7844660194174757,
+      "grad_norm": 0.42420193552970886,
+      "learning_rate": 0.00019850977149932008,
+      "loss": 0.7985323071479797,
+      "step": 1818
+    },
+    {
+      "epoch": 0.785329018338727,
+      "grad_norm": 0.46345841884613037,
+      "learning_rate": 0.0001985013135077462,
+      "loss": 0.8528217077255249,
+      "step": 1820
+    },
+    {
+      "epoch": 0.7861920172599784,
+      "grad_norm": 0.4433307945728302,
+      "learning_rate": 0.00019849283176302462,
+      "loss": 0.8659319877624512,
+      "step": 1822
+    },
+    {
+      "epoch": 0.7870550161812297,
+      "grad_norm": 0.48279091715812683,
+      "learning_rate": 0.00019848432626720067,
+      "loss": 0.8675655126571655,
+      "step": 1824
+    },
+    {
+      "epoch": 0.7879180151024812,
+      "grad_norm": 0.5439180731773376,
+      "learning_rate": 0.0001984757970223254,
+      "loss": 0.8550227284431458,
+      "step": 1826
+    },
+    {
+      "epoch": 0.7887810140237325,
+      "grad_norm": 0.45749521255493164,
+      "learning_rate": 0.0001984672440304556,
+      "loss": 0.7290607690811157,
+      "step": 1828
+    },
+    {
+      "epoch": 0.7896440129449838,
+      "grad_norm": 0.4654783606529236,
+      "learning_rate": 0.00019845866729365378,
+      "loss": 0.8619251251220703,
+      "step": 1830
+    },
+    {
+      "epoch": 0.7905070118662352,
+      "grad_norm": 0.42632243037223816,
+      "learning_rate": 0.00019845006681398823,
+      "loss": 0.8249601125717163,
+      "step": 1832
+    },
+    {
+      "epoch": 0.7913700107874865,
+      "grad_norm": 0.4747186005115509,
+      "learning_rate": 0.0001984414425935329,
+      "loss": 0.7138552069664001,
+      "step": 1834
+    },
+    {
+      "epoch": 0.7922330097087379,
+      "grad_norm": 0.4462338387966156,
+      "learning_rate": 0.0001984327946343674,
+      "loss": 0.903292715549469,
+      "step": 1836
+    },
+    {
+      "epoch": 0.7930960086299892,
+      "grad_norm": 0.4581359922885895,
+      "learning_rate": 0.00019842412293857726,
+      "loss": 0.7569618225097656,
+      "step": 1838
+    },
+    {
+      "epoch": 0.7939590075512406,
+      "grad_norm": 0.4183015525341034,
+      "learning_rate": 0.00019841542750825356,
+      "loss": 0.8063036203384399,
+      "step": 1840
+    },
+    {
+      "epoch": 0.7948220064724919,
+      "grad_norm": 0.3954181373119354,
+      "learning_rate": 0.0001984067083454932,
+      "loss": 0.81150221824646,
+      "step": 1842
+    },
+    {
+      "epoch": 0.7956850053937432,
+      "grad_norm": 0.46220019459724426,
+      "learning_rate": 0.0001983979654523987,
+      "loss": 0.841649055480957,
+      "step": 1844
+    },
+    {
+      "epoch": 0.7965480043149946,
+      "grad_norm": 0.47807541489601135,
+      "learning_rate": 0.00019838919883107843,
+      "loss": 0.8019483685493469,
+      "step": 1846
+    },
+    {
+      "epoch": 0.7974110032362459,
+      "grad_norm": 0.48015692830085754,
+      "learning_rate": 0.0001983804084836464,
+      "loss": 0.9343363046646118,
+      "step": 1848
+    },
+    {
+      "epoch": 0.7982740021574973,
+      "grad_norm": 0.4906708896160126,
+      "learning_rate": 0.00019837159441222238,
+      "loss": 0.9163194894790649,
+      "step": 1850
+    },
+    {
+      "epoch": 0.7991370010787486,
+      "grad_norm": 0.4856911599636078,
+      "learning_rate": 0.0001983627566189318,
+      "loss": 0.8017736077308655,
+      "step": 1852
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.49403145909309387,
+      "learning_rate": 0.0001983538951059059,
+      "loss": 0.8375223875045776,
+      "step": 1854
+    },
+    {
+      "epoch": 0.8008629989212513,
+      "grad_norm": 0.4237985908985138,
+      "learning_rate": 0.00019834500987528158,
+      "loss": 0.8213951587677002,
+      "step": 1856
+    },
+    {
+      "epoch": 0.8017259978425026,
+      "grad_norm": 0.3977980315685272,
+      "learning_rate": 0.00019833610092920149,
+      "loss": 0.8086028099060059,
+      "step": 1858
+    },
+    {
+      "epoch": 0.8025889967637541,
+      "grad_norm": 0.435253381729126,
+      "learning_rate": 0.00019832716826981392,
+      "loss": 0.9402202367782593,
+      "step": 1860
+    },
+    {
+      "epoch": 0.8034519956850054,
+      "grad_norm": 0.4074764847755432,
+      "learning_rate": 0.000198318211899273,
+      "loss": 0.7730796933174133,
+      "step": 1862
+    },
+    {
+      "epoch": 0.8043149946062568,
+      "grad_norm": 0.48804348707199097,
+      "learning_rate": 0.0001983092318197385,
+      "loss": 0.9185802936553955,
+      "step": 1864
+    },
+    {
+      "epoch": 0.8051779935275081,
+      "grad_norm": 0.44363343715667725,
+      "learning_rate": 0.00019830022803337592,
+      "loss": 0.8578243851661682,
+      "step": 1866
+    },
+    {
+      "epoch": 0.8060409924487595,
+      "grad_norm": 0.46645957231521606,
+      "learning_rate": 0.00019829120054235653,
+      "loss": 0.8682060241699219,
+      "step": 1868
+    },
+    {
+      "epoch": 0.8069039913700108,
+      "grad_norm": 0.4527221620082855,
+      "learning_rate": 0.00019828214934885718,
+      "loss": 0.7845040559768677,
+      "step": 1870
+    },
+    {
+      "epoch": 0.8077669902912621,
+      "grad_norm": 0.4103536307811737,
+      "learning_rate": 0.0001982730744550606,
+      "loss": 0.8768247961997986,
+      "step": 1872
+    },
+    {
+      "epoch": 0.8086299892125135,
+      "grad_norm": 0.5257515907287598,
+      "learning_rate": 0.00019826397586315513,
+      "loss": 0.850267231464386,
+      "step": 1874
+    },
+    {
+      "epoch": 0.8094929881337648,
+      "grad_norm": 0.46675893664360046,
+      "learning_rate": 0.00019825485357533485,
+      "loss": 0.8234293460845947,
+      "step": 1876
+    },
+    {
+      "epoch": 0.8103559870550162,
+      "grad_norm": 0.46315401792526245,
+      "learning_rate": 0.00019824570759379958,
+      "loss": 0.8131387829780579,
+      "step": 1878
+    },
+    {
+      "epoch": 0.8112189859762675,
+      "grad_norm": 0.4766870439052582,
+      "learning_rate": 0.00019823653792075478,
+      "loss": 0.7680494785308838,
+      "step": 1880
+    },
+    {
+      "epoch": 0.8120819848975189,
+      "grad_norm": 0.39025625586509705,
+      "learning_rate": 0.00019822734455841173,
+      "loss": 0.7891425490379333,
+      "step": 1882
+    },
+    {
+      "epoch": 0.8129449838187702,
+      "grad_norm": 0.4672441780567169,
+      "learning_rate": 0.0001982181275089874,
+      "loss": 0.7990400791168213,
+      "step": 1884
+    },
+    {
+      "epoch": 0.8138079827400215,
+      "grad_norm": 0.44310975074768066,
+      "learning_rate": 0.00019820888677470432,
+      "loss": 0.7828341722488403,
+      "step": 1886
+    },
+    {
+      "epoch": 0.8146709816612729,
+      "grad_norm": 0.45098716020584106,
+      "learning_rate": 0.00019819962235779096,
+      "loss": 0.897715151309967,
+      "step": 1888
+    },
+    {
+      "epoch": 0.8155339805825242,
+      "grad_norm": 0.466805100440979,
+      "learning_rate": 0.00019819033426048135,
+      "loss": 0.7987668514251709,
+      "step": 1890
+    },
+    {
+      "epoch": 0.8163969795037757,
+      "grad_norm": 0.4438319206237793,
+      "learning_rate": 0.00019818102248501528,
+      "loss": 0.7950236201286316,
+      "step": 1892
+    },
+    {
+      "epoch": 0.817259978425027,
+      "grad_norm": 0.42012497782707214,
+      "learning_rate": 0.00019817168703363823,
+      "loss": 0.8789975643157959,
+      "step": 1894
+    },
+    {
+      "epoch": 0.8181229773462784,
+      "grad_norm": 0.4359394311904907,
+      "learning_rate": 0.0001981623279086014,
+      "loss": 0.8159777522087097,
+      "step": 1896
+    },
+    {
+      "epoch": 0.8189859762675297,
+      "grad_norm": 0.44709593057632446,
+      "learning_rate": 0.00019815294511216173,
+      "loss": 0.84877610206604,
+      "step": 1898
+    },
+    {
+      "epoch": 0.819848975188781,
+      "grad_norm": 0.4315306544303894,
+      "learning_rate": 0.00019814353864658184,
+      "loss": 0.8467556834220886,
+      "step": 1900
+    },
+    {
+      "epoch": 0.819848975188781,
+      "eval_loss": 0.8643407821655273,
+      "eval_runtime": 658.6942,
+      "eval_samples_per_second": 3.127,
+      "eval_steps_per_second": 3.127,
+      "step": 1900
+    },
+    {
+      "epoch": 0.8207119741100324,
+      "grad_norm": 0.35530397295951843,
+      "learning_rate": 0.00019813410851412998,
+      "loss": 0.7398589849472046,
+      "step": 1902
+    },
+    {
+      "epoch": 0.8215749730312837,
+      "grad_norm": 0.46949300169944763,
+      "learning_rate": 0.00019812465471708032,
+      "loss": 0.8544237613677979,
+      "step": 1904
+    },
+    {
+      "epoch": 0.8224379719525351,
+      "grad_norm": 0.4961565434932709,
+      "learning_rate": 0.00019811517725771248,
+      "loss": 0.8242526054382324,
+      "step": 1906
+    },
+    {
+      "epoch": 0.8233009708737864,
+      "grad_norm": 0.45835059881210327,
+      "learning_rate": 0.00019810567613831194,
+      "loss": 0.7856690287590027,
+      "step": 1908
+    },
+    {
+      "epoch": 0.8241639697950378,
+      "grad_norm": 0.4446084797382355,
+      "learning_rate": 0.0001980961513611699,
+      "loss": 0.8361829519271851,
+      "step": 1910
+    },
+    {
+      "epoch": 0.8250269687162891,
+      "grad_norm": 0.4470907747745514,
+      "learning_rate": 0.00019808660292858313,
+      "loss": 0.8993050456047058,
+      "step": 1912
+    },
+    {
+      "epoch": 0.8258899676375404,
+      "grad_norm": 0.44883644580841064,
+      "learning_rate": 0.0001980770308428543,
+      "loss": 0.8702824711799622,
+      "step": 1914
+    },
+    {
+      "epoch": 0.8267529665587918,
+      "grad_norm": 0.43215686082839966,
+      "learning_rate": 0.00019806743510629159,
+      "loss": 0.8454389572143555,
+      "step": 1916
+    },
+    {
+      "epoch": 0.8276159654800431,
+      "grad_norm": 0.4525185823440552,
+      "learning_rate": 0.00019805781572120897,
+      "loss": 0.8621824383735657,
+      "step": 1918
+    },
+    {
+      "epoch": 0.8284789644012945,
+      "grad_norm": 0.4616840183734894,
+      "learning_rate": 0.00019804817268992615,
+      "loss": 0.8661681413650513,
+      "step": 1920
+    },
+    {
+      "epoch": 0.8293419633225458,
+      "grad_norm": 0.4252975583076477,
+      "learning_rate": 0.0001980385060147685,
+      "loss": 0.8376660346984863,
+      "step": 1922
+    },
+    {
+      "epoch": 0.8302049622437971,
+      "grad_norm": 0.44600266218185425,
+      "learning_rate": 0.00019802881569806706,
+      "loss": 0.9258401393890381,
+      "step": 1924
+    },
+    {
+      "epoch": 0.8310679611650486,
+      "grad_norm": 0.48872479796409607,
+      "learning_rate": 0.00019801910174215866,
+      "loss": 0.8804965615272522,
+      "step": 1926
+    },
+    {
+      "epoch": 0.8319309600862999,
+      "grad_norm": 0.5357037782669067,
+      "learning_rate": 0.00019800936414938574,
+      "loss": 0.8561494946479797,
+      "step": 1928
+    },
+    {
+      "epoch": 0.8327939590075513,
+      "grad_norm": 0.39637291431427,
+      "learning_rate": 0.00019799960292209647,
+      "loss": 0.782166063785553,
+      "step": 1930
+    },
+    {
+      "epoch": 0.8336569579288026,
+      "grad_norm": 0.521138072013855,
+      "learning_rate": 0.00019798981806264476,
+      "loss": 0.9048293232917786,
+      "step": 1932
+    },
+    {
+      "epoch": 0.834519956850054,
+      "grad_norm": 0.4723529815673828,
+      "learning_rate": 0.00019798000957339015,
+      "loss": 0.9269952774047852,
+      "step": 1934
+    },
+    {
+      "epoch": 0.8353829557713053,
+      "grad_norm": 0.42827340960502625,
+      "learning_rate": 0.0001979701774566979,
+      "loss": 0.8620670437812805,
+      "step": 1936
+    },
+    {
+      "epoch": 0.8362459546925566,
+      "grad_norm": 0.4305116534233093,
+      "learning_rate": 0.00019796032171493907,
+      "loss": 0.8016669750213623,
+      "step": 1938
+    },
+    {
+      "epoch": 0.837108953613808,
+      "grad_norm": 0.4995502233505249,
+      "learning_rate": 0.00019795044235049024,
+      "loss": 0.879247784614563,
+      "step": 1940
+    },
+    {
+      "epoch": 0.8379719525350593,
+      "grad_norm": 0.49229878187179565,
+      "learning_rate": 0.0001979405393657338,
+      "loss": 0.9476580023765564,
+      "step": 1942
+    },
+    {
+      "epoch": 0.8388349514563107,
+      "grad_norm": 0.45756596326828003,
+      "learning_rate": 0.0001979306127630578,
+      "loss": 0.8654064536094666,
+      "step": 1944
+    },
+    {
+      "epoch": 0.839697950377562,
+      "grad_norm": 0.4855344891548157,
+      "learning_rate": 0.00019792066254485603,
+      "loss": 0.7792956829071045,
+      "step": 1946
+    },
+    {
+      "epoch": 0.8405609492988134,
+      "grad_norm": 0.4358632266521454,
+      "learning_rate": 0.00019791068871352787,
+      "loss": 0.8000320792198181,
+      "step": 1948
+    },
+    {
+      "epoch": 0.8414239482200647,
+      "grad_norm": 0.4225342273712158,
+      "learning_rate": 0.00019790069127147852,
+      "loss": 0.818372368812561,
+      "step": 1950
+    },
+    {
+      "epoch": 0.842286947141316,
+      "grad_norm": 0.3894529938697815,
+      "learning_rate": 0.00019789067022111886,
+      "loss": 0.727220892906189,
+      "step": 1952
+    },
+    {
+      "epoch": 0.8431499460625674,
+      "grad_norm": 0.5060731768608093,
+      "learning_rate": 0.0001978806255648653,
+      "loss": 0.894101083278656,
+      "step": 1954
+    },
+    {
+      "epoch": 0.8440129449838187,
+      "grad_norm": 0.4165003001689911,
+      "learning_rate": 0.0001978705573051402,
+      "loss": 0.878365695476532,
+      "step": 1956
+    },
+    {
+      "epoch": 0.8448759439050701,
+      "grad_norm": 0.48767927289009094,
+      "learning_rate": 0.0001978604654443714,
+      "loss": 0.8390909433364868,
+      "step": 1958
+    },
+    {
+      "epoch": 0.8457389428263214,
+      "grad_norm": 0.43019410967826843,
+      "learning_rate": 0.00019785034998499247,
+      "loss": 0.8807769417762756,
+      "step": 1960
+    },
+    {
+      "epoch": 0.8466019417475729,
+      "grad_norm": 0.4430403709411621,
+      "learning_rate": 0.0001978402109294428,
+      "loss": 0.8037779331207275,
+      "step": 1962
+    },
+    {
+      "epoch": 0.8474649406688242,
+      "grad_norm": 0.41642463207244873,
+      "learning_rate": 0.0001978300482801673,
+      "loss": 0.8341337442398071,
+      "step": 1964
+    },
+    {
+      "epoch": 0.8483279395900755,
+      "grad_norm": 0.45358774065971375,
+      "learning_rate": 0.00019781986203961668,
+      "loss": 0.854821503162384,
+      "step": 1966
+    },
+    {
+      "epoch": 0.8491909385113269,
+      "grad_norm": 0.4316342771053314,
+      "learning_rate": 0.00019780965221024728,
+      "loss": 0.8527678847312927,
+      "step": 1968
+    },
+    {
+      "epoch": 0.8500539374325782,
+      "grad_norm": 0.4581106901168823,
+      "learning_rate": 0.00019779941879452122,
+      "loss": 0.7461717128753662,
+      "step": 1970
+    },
+    {
+      "epoch": 0.8509169363538296,
+      "grad_norm": 0.49578142166137695,
+      "learning_rate": 0.0001977891617949062,
+      "loss": 0.884441077709198,
+      "step": 1972
+    },
+    {
+      "epoch": 0.8517799352750809,
+      "grad_norm": 0.4366011917591095,
+      "learning_rate": 0.00019777888121387562,
+      "loss": 0.855915904045105,
+      "step": 1974
+    },
+    {
+      "epoch": 0.8526429341963323,
+      "grad_norm": 0.486162930727005,
+      "learning_rate": 0.00019776857705390864,
+      "loss": 0.7563765645027161,
+      "step": 1976
+    },
+    {
+      "epoch": 0.8535059331175836,
+      "grad_norm": 0.5162674784660339,
+      "learning_rate": 0.00019775824931749005,
+      "loss": 0.8346326947212219,
+      "step": 1978
+    },
+    {
+      "epoch": 0.8543689320388349,
+      "grad_norm": 0.5824693441390991,
+      "learning_rate": 0.0001977478980071103,
+      "loss": 0.8701820969581604,
+      "step": 1980
+    },
+    {
+      "epoch": 0.8552319309600863,
+      "grad_norm": 0.4297148883342743,
+      "learning_rate": 0.00019773752312526565,
+      "loss": 0.893528938293457,
+      "step": 1982
+    },
+    {
+      "epoch": 0.8560949298813376,
+      "grad_norm": 0.42978280782699585,
+      "learning_rate": 0.00019772712467445788,
+      "loss": 0.8201018571853638,
+      "step": 1984
+    },
+    {
+      "epoch": 0.856957928802589,
+      "grad_norm": 0.5192655324935913,
+      "learning_rate": 0.00019771670265719454,
+      "loss": 0.9080212116241455,
+      "step": 1986
+    },
+    {
+      "epoch": 0.8578209277238403,
+      "grad_norm": 0.452690452337265,
+      "learning_rate": 0.00019770625707598885,
+      "loss": 0.8518272638320923,
+      "step": 1988
+    },
+    {
+      "epoch": 0.8586839266450917,
+      "grad_norm": 0.4371768832206726,
+      "learning_rate": 0.00019769578793335976,
+      "loss": 0.9426717758178711,
+      "step": 1990
+    },
+    {
+      "epoch": 0.859546925566343,
+      "grad_norm": 0.44595038890838623,
+      "learning_rate": 0.0001976852952318318,
+      "loss": 0.8065400123596191,
+      "step": 1992
+    },
+    {
+      "epoch": 0.8604099244875943,
+      "grad_norm": 0.4355090260505676,
+      "learning_rate": 0.0001976747789739353,
+      "loss": 0.7674415707588196,
+      "step": 1994
+    },
+    {
+      "epoch": 0.8612729234088458,
+      "grad_norm": 0.43745186924934387,
+      "learning_rate": 0.00019766423916220616,
+      "loss": 0.813849925994873,
+      "step": 1996
+    },
+    {
+      "epoch": 0.8621359223300971,
+      "grad_norm": 0.4588927924633026,
+      "learning_rate": 0.00019765367579918598,
+      "loss": 0.7870585322380066,
+      "step": 1998
+    },
+    {
+      "epoch": 0.8629989212513485,
+      "grad_norm": 0.4170977473258972,
+      "learning_rate": 0.00019764308888742214,
+      "loss": 0.8383269309997559,
+      "step": 2000
+    },
+    {
+      "epoch": 0.8629989212513485,
+      "eval_loss": 0.8567262887954712,
+      "eval_runtime": 646.6443,
+      "eval_samples_per_second": 3.186,
+      "eval_steps_per_second": 3.186,
+      "step": 2000
+    }
+  ],
+  "logging_steps": 2,
+  "max_steps": 13908,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 6,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "EarlyStoppingCallback": {
+      "args": {
+        "early_stopping_patience": 3,
+        "early_stopping_threshold": 0.001
+      },
+      "attributes": {
+        "early_stopping_patience_counter": 0
+      }
+    },
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.1995266703367885e+18,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}