diff --git "a/Qwen2.5-Coder-7B-Instruct-MathQA/checkpoint-2000/trainer_state.json" "b/Qwen2.5-Coder-7B-Instruct-MathQA/checkpoint-2000/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/Qwen2.5-Coder-7B-Instruct-MathQA/checkpoint-2000/trainer_state.json"
@@ -0,0 +1,14033 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.42003570303475796,
+  "eval_steps": 500,
+  "global_step": 2000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00021001785151737897,
+      "grad_norm": 0.5090809464454651,
+      "learning_rate": 4.19287211740042e-07,
+      "loss": 1.1155,
+      "step": 1
+    },
+    {
+      "epoch": 0.00042003570303475793,
+      "grad_norm": 0.6277585625648499,
+      "learning_rate": 8.38574423480084e-07,
+      "loss": 1.3855,
+      "step": 2
+    },
+    {
+      "epoch": 0.000630053554552137,
+      "grad_norm": 0.7032777070999146,
+      "learning_rate": 1.257861635220126e-06,
+      "loss": 1.3905,
+      "step": 3
+    },
+    {
+      "epoch": 0.0008400714060695159,
+      "grad_norm": 0.745212197303772,
+      "learning_rate": 1.677148846960168e-06,
+      "loss": 1.4668,
+      "step": 4
+    },
+    {
+      "epoch": 0.0010500892575868949,
+      "grad_norm": 0.7577304840087891,
+      "learning_rate": 2.09643605870021e-06,
+      "loss": 1.473,
+      "step": 5
+    },
+    {
+      "epoch": 0.001260107109104274,
+      "grad_norm": 0.7788395881652832,
+      "learning_rate": 2.515723270440252e-06,
+      "loss": 1.485,
+      "step": 6
+    },
+    {
+      "epoch": 0.001470124960621653,
+      "grad_norm": 0.7430889010429382,
+      "learning_rate": 2.935010482180294e-06,
+      "loss": 1.4338,
+      "step": 7
+    },
+    {
+      "epoch": 0.0016801428121390317,
+      "grad_norm": 0.8291558623313904,
+      "learning_rate": 3.354297693920336e-06,
+      "loss": 1.4768,
+      "step": 8
+    },
+    {
+      "epoch": 0.0018901606636564107,
+      "grad_norm": 0.7731107473373413,
+      "learning_rate": 3.7735849056603773e-06,
+      "loss": 1.5853,
+      "step": 9
+    },
+    {
+      "epoch": 0.0021001785151737898,
+      "grad_norm": 0.8241227269172668,
+      "learning_rate": 4.19287211740042e-06,
+      "loss": 1.7314,
+      "step": 10
+    },
+    {
+      "epoch": 0.002310196366691169,
+      "grad_norm": 0.8158630728721619,
+      "learning_rate": 4.612159329140462e-06,
+      "loss": 1.5916,
+      "step": 11
+    },
+    {
+      "epoch": 0.002520214218208548,
+      "grad_norm": 0.8860861659049988,
+      "learning_rate": 5.031446540880504e-06,
+      "loss": 1.5115,
+      "step": 12
+    },
+    {
+      "epoch": 0.002730232069725927,
+      "grad_norm": 0.868651270866394,
+      "learning_rate": 5.4507337526205454e-06,
+      "loss": 1.7514,
+      "step": 13
+    },
+    {
+      "epoch": 0.002940249921243306,
+      "grad_norm": 0.9116117358207703,
+      "learning_rate": 5.870020964360588e-06,
+      "loss": 1.5645,
+      "step": 14
+    },
+    {
+      "epoch": 0.003150267772760685,
+      "grad_norm": 0.8694919347763062,
+      "learning_rate": 6.289308176100629e-06,
+      "loss": 1.6326,
+      "step": 15
+    },
+    {
+      "epoch": 0.0033602856242780635,
+      "grad_norm": 0.8614499568939209,
+      "learning_rate": 6.708595387840672e-06,
+      "loss": 1.6224,
+      "step": 16
+    },
+    {
+      "epoch": 0.0035703034757954425,
+      "grad_norm": 0.8713967800140381,
+      "learning_rate": 7.127882599580712e-06,
+      "loss": 1.5923,
+      "step": 17
+    },
+    {
+      "epoch": 0.0037803213273128215,
+      "grad_norm": 0.8446964025497437,
+      "learning_rate": 7.547169811320755e-06,
+      "loss": 1.5843,
+      "step": 18
+    },
+    {
+      "epoch": 0.0039903391788302005,
+      "grad_norm": 0.8920742869377136,
+      "learning_rate": 7.966457023060797e-06,
+      "loss": 1.5485,
+      "step": 19
+    },
+    {
+      "epoch": 0.0042003570303475795,
+      "grad_norm": 0.9501891136169434,
+      "learning_rate": 8.38574423480084e-06,
+      "loss": 1.663,
+      "step": 20
+    },
+    {
+      "epoch": 0.0044103748818649586,
+      "grad_norm": 0.9179856181144714,
+      "learning_rate": 8.80503144654088e-06,
+      "loss": 1.6163,
+      "step": 21
+    },
+    {
+      "epoch": 0.004620392733382338,
+      "grad_norm": 0.8716169595718384,
+      "learning_rate": 9.224318658280923e-06,
+      "loss": 1.5311,
+      "step": 22
+    },
+    {
+      "epoch": 0.004830410584899717,
+      "grad_norm": 0.9034018516540527,
+      "learning_rate": 9.643605870020965e-06,
+      "loss": 1.5315,
+      "step": 23
+    },
+    {
+      "epoch": 0.005040428436417096,
+      "grad_norm": 0.9811834692955017,
+      "learning_rate": 1.0062893081761008e-05,
+      "loss": 1.5356,
+      "step": 24
+    },
+    {
+      "epoch": 0.005250446287934475,
+      "grad_norm": 0.8846603035926819,
+      "learning_rate": 1.0482180293501048e-05,
+      "loss": 1.5815,
+      "step": 25
+    },
+    {
+      "epoch": 0.005460464139451854,
+      "grad_norm": 0.8842517137527466,
+      "learning_rate": 1.0901467505241091e-05,
+      "loss": 1.5628,
+      "step": 26
+    },
+    {
+      "epoch": 0.005670481990969233,
+      "grad_norm": 0.9207525253295898,
+      "learning_rate": 1.1320754716981132e-05,
+      "loss": 1.5623,
+      "step": 27
+    },
+    {
+      "epoch": 0.005880499842486612,
+      "grad_norm": 0.9082942605018616,
+      "learning_rate": 1.1740041928721176e-05,
+      "loss": 1.4599,
+      "step": 28
+    },
+    {
+      "epoch": 0.006090517694003991,
+      "grad_norm": 0.8724138736724854,
+      "learning_rate": 1.2159329140461215e-05,
+      "loss": 1.5282,
+      "step": 29
+    },
+    {
+      "epoch": 0.00630053554552137,
+      "grad_norm": 0.8738006353378296,
+      "learning_rate": 1.2578616352201259e-05,
+      "loss": 1.4782,
+      "step": 30
+    },
+    {
+      "epoch": 0.006510553397038748,
+      "grad_norm": 0.9410291910171509,
+      "learning_rate": 1.29979035639413e-05,
+      "loss": 1.3856,
+      "step": 31
+    },
+    {
+      "epoch": 0.006720571248556127,
+      "grad_norm": 0.9309423565864563,
+      "learning_rate": 1.3417190775681343e-05,
+      "loss": 1.4267,
+      "step": 32
+    },
+    {
+      "epoch": 0.006930589100073506,
+      "grad_norm": 0.9442999362945557,
+      "learning_rate": 1.3836477987421385e-05,
+      "loss": 1.3706,
+      "step": 33
+    },
+    {
+      "epoch": 0.007140606951590885,
+      "grad_norm": 0.9511269927024841,
+      "learning_rate": 1.4255765199161425e-05,
+      "loss": 1.26,
+      "step": 34
+    },
+    {
+      "epoch": 0.007350624803108264,
+      "grad_norm": 1.0389297008514404,
+      "learning_rate": 1.467505241090147e-05,
+      "loss": 1.235,
+      "step": 35
+    },
+    {
+      "epoch": 0.007560642654625643,
+      "grad_norm": 1.0033001899719238,
+      "learning_rate": 1.509433962264151e-05,
+      "loss": 1.2687,
+      "step": 36
+    },
+    {
+      "epoch": 0.007770660506143022,
+      "grad_norm": 1.075852632522583,
+      "learning_rate": 1.5513626834381552e-05,
+      "loss": 1.2762,
+      "step": 37
+    },
+    {
+      "epoch": 0.007980678357660401,
+      "grad_norm": 1.0721476078033447,
+      "learning_rate": 1.5932914046121594e-05,
+      "loss": 1.1935,
+      "step": 38
+    },
+    {
+      "epoch": 0.00819069620917778,
+      "grad_norm": 1.0784581899642944,
+      "learning_rate": 1.6352201257861635e-05,
+      "loss": 1.1119,
+      "step": 39
+    },
+    {
+      "epoch": 0.008400714060695159,
+      "grad_norm": 1.1390137672424316,
+      "learning_rate": 1.677148846960168e-05,
+      "loss": 1.0373,
+      "step": 40
+    },
+    {
+      "epoch": 0.008610731912212538,
+      "grad_norm": 1.3073922395706177,
+      "learning_rate": 1.719077568134172e-05,
+      "loss": 1.1536,
+      "step": 41
+    },
+    {
+      "epoch": 0.008820749763729917,
+      "grad_norm": 1.3248019218444824,
+      "learning_rate": 1.761006289308176e-05,
+      "loss": 0.9316,
+      "step": 42
+    },
+    {
+      "epoch": 0.009030767615247296,
+      "grad_norm": 1.3569798469543457,
+      "learning_rate": 1.8029350104821805e-05,
+      "loss": 0.8881,
+      "step": 43
+    },
+    {
+      "epoch": 0.009240785466764675,
+      "grad_norm": 1.3192838430404663,
+      "learning_rate": 1.8448637316561846e-05,
+      "loss": 0.8825,
+      "step": 44
+    },
+    {
+      "epoch": 0.009450803318282054,
+      "grad_norm": 1.1947859525680542,
+      "learning_rate": 1.8867924528301888e-05,
+      "loss": 0.9415,
+      "step": 45
+    },
+    {
+      "epoch": 0.009660821169799433,
+      "grad_norm": 1.1684753894805908,
+      "learning_rate": 1.928721174004193e-05,
+      "loss": 0.83,
+      "step": 46
+    },
+    {
+      "epoch": 0.009870839021316812,
+      "grad_norm": 1.1097474098205566,
+      "learning_rate": 1.970649895178197e-05,
+      "loss": 0.7248,
+      "step": 47
+    },
+    {
+      "epoch": 0.010080856872834191,
+      "grad_norm": 1.0564842224121094,
+      "learning_rate": 2.0125786163522016e-05,
+      "loss": 0.7427,
+      "step": 48
+    },
+    {
+      "epoch": 0.01029087472435157,
+      "grad_norm": 0.9865881204605103,
+      "learning_rate": 2.0545073375262054e-05,
+      "loss": 0.877,
+      "step": 49
+    },
+    {
+      "epoch": 0.01050089257586895,
+      "grad_norm": 1.072039246559143,
+      "learning_rate": 2.0964360587002095e-05,
+      "loss": 0.8473,
+      "step": 50
+    },
+    {
+      "epoch": 0.010710910427386328,
+      "grad_norm": 0.5596430897712708,
+      "learning_rate": 2.138364779874214e-05,
+      "loss": 0.4599,
+      "step": 51
+    },
+    {
+      "epoch": 0.010920928278903707,
+      "grad_norm": 0.6180581450462341,
+      "learning_rate": 2.1802935010482182e-05,
+      "loss": 0.5215,
+      "step": 52
+    },
+    {
+      "epoch": 0.011130946130421086,
+      "grad_norm": 0.6805194616317749,
+      "learning_rate": 2.2222222222222223e-05,
+      "loss": 0.6148,
+      "step": 53
+    },
+    {
+      "epoch": 0.011340963981938465,
+      "grad_norm": 0.7125585079193115,
+      "learning_rate": 2.2641509433962265e-05,
+      "loss": 0.5615,
+      "step": 54
+    },
+    {
+      "epoch": 0.011550981833455844,
+      "grad_norm": 0.6816964745521545,
+      "learning_rate": 2.3060796645702306e-05,
+      "loss": 0.478,
+      "step": 55
+    },
+    {
+      "epoch": 0.011760999684973223,
+      "grad_norm": 0.5821985602378845,
+      "learning_rate": 2.348008385744235e-05,
+      "loss": 0.5604,
+      "step": 56
+    },
+    {
+      "epoch": 0.011971017536490602,
+      "grad_norm": 0.642721951007843,
+      "learning_rate": 2.3899371069182393e-05,
+      "loss": 0.4068,
+      "step": 57
+    },
+    {
+      "epoch": 0.012181035388007981,
+      "grad_norm": 0.5806999206542969,
+      "learning_rate": 2.431865828092243e-05,
+      "loss": 0.4711,
+      "step": 58
+    },
+    {
+      "epoch": 0.01239105323952536,
+      "grad_norm": 0.6702911257743835,
+      "learning_rate": 2.4737945492662476e-05,
+      "loss": 0.4601,
+      "step": 59
+    },
+    {
+      "epoch": 0.01260107109104274,
+      "grad_norm": 0.6345894932746887,
+      "learning_rate": 2.5157232704402517e-05,
+      "loss": 0.4172,
+      "step": 60
+    },
+    {
+      "epoch": 0.012811088942560117,
+      "grad_norm": 0.6444422602653503,
+      "learning_rate": 2.5576519916142562e-05,
+      "loss": 0.4635,
+      "step": 61
+    },
+    {
+      "epoch": 0.013021106794077496,
+      "grad_norm": 0.6568206548690796,
+      "learning_rate": 2.59958071278826e-05,
+      "loss": 0.4327,
+      "step": 62
+    },
+    {
+      "epoch": 0.013231124645594875,
+      "grad_norm": 0.6627638936042786,
+      "learning_rate": 2.641509433962264e-05,
+      "loss": 0.4935,
+      "step": 63
+    },
+    {
+      "epoch": 0.013441142497112254,
+      "grad_norm": 0.6746403574943542,
+      "learning_rate": 2.6834381551362687e-05,
+      "loss": 0.4013,
+      "step": 64
+    },
+    {
+      "epoch": 0.013651160348629633,
+      "grad_norm": 0.7141286134719849,
+      "learning_rate": 2.7253668763102725e-05,
+      "loss": 0.5162,
+      "step": 65
+    },
+    {
+      "epoch": 0.013861178200147012,
+      "grad_norm": 0.779960572719574,
+      "learning_rate": 2.767295597484277e-05,
+      "loss": 0.4126,
+      "step": 66
+    },
+    {
+      "epoch": 0.014071196051664391,
+      "grad_norm": 0.6626395583152771,
+      "learning_rate": 2.809224318658281e-05,
+      "loss": 0.3925,
+      "step": 67
+    },
+    {
+      "epoch": 0.01428121390318177,
+      "grad_norm": 0.6545393466949463,
+      "learning_rate": 2.851153039832285e-05,
+      "loss": 0.3152,
+      "step": 68
+    },
+    {
+      "epoch": 0.014491231754699149,
+      "grad_norm": 0.7004114389419556,
+      "learning_rate": 2.8930817610062894e-05,
+      "loss": 0.4223,
+      "step": 69
+    },
+    {
+      "epoch": 0.014701249606216528,
+      "grad_norm": 0.6912452578544617,
+      "learning_rate": 2.935010482180294e-05,
+      "loss": 0.3089,
+      "step": 70
+    },
+    {
+      "epoch": 0.014911267457733907,
+      "grad_norm": 0.7729060649871826,
+      "learning_rate": 2.976939203354298e-05,
+      "loss": 0.4045,
+      "step": 71
+    },
+    {
+      "epoch": 0.015121285309251286,
+      "grad_norm": 0.7606898546218872,
+      "learning_rate": 3.018867924528302e-05,
+      "loss": 0.3079,
+      "step": 72
+    },
+    {
+      "epoch": 0.015331303160768665,
+      "grad_norm": 0.6202028393745422,
+      "learning_rate": 3.060796645702306e-05,
+      "loss": 0.3833,
+      "step": 73
+    },
+    {
+      "epoch": 0.015541321012286044,
+      "grad_norm": 0.6014758348464966,
+      "learning_rate": 3.1027253668763105e-05,
+      "loss": 0.3815,
+      "step": 74
+    },
+    {
+      "epoch": 0.015751338863803425,
+      "grad_norm": 0.6792122721672058,
+      "learning_rate": 3.144654088050314e-05,
+      "loss": 0.3383,
+      "step": 75
+    },
+    {
+      "epoch": 0.015961356715320802,
+      "grad_norm": 0.7135879993438721,
+      "learning_rate": 3.186582809224319e-05,
+      "loss": 0.3744,
+      "step": 76
+    },
+    {
+      "epoch": 0.016171374566838183,
+      "grad_norm": 0.6972818374633789,
+      "learning_rate": 3.228511530398323e-05,
+      "loss": 0.3256,
+      "step": 77
+    },
+    {
+      "epoch": 0.01638139241835556,
+      "grad_norm": 0.5925168395042419,
+      "learning_rate": 3.270440251572327e-05,
+      "loss": 0.3309,
+      "step": 78
+    },
+    {
+      "epoch": 0.01659141026987294,
+      "grad_norm": 0.7750416994094849,
+      "learning_rate": 3.3123689727463316e-05,
+      "loss": 0.4142,
+      "step": 79
+    },
+    {
+      "epoch": 0.016801428121390318,
+      "grad_norm": 0.7466484904289246,
+      "learning_rate": 3.354297693920336e-05,
+      "loss": 0.2527,
+      "step": 80
+    },
+    {
+      "epoch": 0.0170114459729077,
+      "grad_norm": 0.7709718942642212,
+      "learning_rate": 3.39622641509434e-05,
+      "loss": 0.3717,
+      "step": 81
+    },
+    {
+      "epoch": 0.017221463824425076,
+      "grad_norm": 0.6134454607963562,
+      "learning_rate": 3.438155136268344e-05,
+      "loss": 0.2969,
+      "step": 82
+    },
+    {
+      "epoch": 0.017431481675942453,
+      "grad_norm": 0.6442283391952515,
+      "learning_rate": 3.480083857442348e-05,
+      "loss": 0.3009,
+      "step": 83
+    },
+    {
+      "epoch": 0.017641499527459834,
+      "grad_norm": 0.6788150072097778,
+      "learning_rate": 3.522012578616352e-05,
+      "loss": 0.309,
+      "step": 84
+    },
+    {
+      "epoch": 0.01785151737897721,
+      "grad_norm": 0.7172322869300842,
+      "learning_rate": 3.5639412997903565e-05,
+      "loss": 0.3602,
+      "step": 85
+    },
+    {
+      "epoch": 0.018061535230494592,
+      "grad_norm": 0.7475742697715759,
+      "learning_rate": 3.605870020964361e-05,
+      "loss": 0.1889,
+      "step": 86
+    },
+    {
+      "epoch": 0.01827155308201197,
+      "grad_norm": 0.7164073586463928,
+      "learning_rate": 3.647798742138365e-05,
+      "loss": 0.2062,
+      "step": 87
+    },
+    {
+      "epoch": 0.01848157093352935,
+      "grad_norm": 0.7514247298240662,
+      "learning_rate": 3.689727463312369e-05,
+      "loss": 0.2426,
+      "step": 88
+    },
+    {
+      "epoch": 0.018691588785046728,
+      "grad_norm": 0.8898234963417053,
+      "learning_rate": 3.731656184486374e-05,
+      "loss": 0.3759,
+      "step": 89
+    },
+    {
+      "epoch": 0.01890160663656411,
+      "grad_norm": 0.8034729361534119,
+      "learning_rate": 3.7735849056603776e-05,
+      "loss": 0.2547,
+      "step": 90
+    },
+    {
+      "epoch": 0.019111624488081486,
+      "grad_norm": 0.771716296672821,
+      "learning_rate": 3.8155136268343814e-05,
+      "loss": 0.2125,
+      "step": 91
+    },
+    {
+      "epoch": 0.019321642339598866,
+      "grad_norm": 0.811174213886261,
+      "learning_rate": 3.857442348008386e-05,
+      "loss": 0.3535,
+      "step": 92
+    },
+    {
+      "epoch": 0.019531660191116244,
+      "grad_norm": 1.0474952459335327,
+      "learning_rate": 3.8993710691823904e-05,
+      "loss": 0.3278,
+      "step": 93
+    },
+    {
+      "epoch": 0.019741678042633624,
+      "grad_norm": 0.752088725566864,
+      "learning_rate": 3.941299790356394e-05,
+      "loss": 0.2574,
+      "step": 94
+    },
+    {
+      "epoch": 0.019951695894151,
+      "grad_norm": 0.9202740788459778,
+      "learning_rate": 3.983228511530399e-05,
+      "loss": 0.2618,
+      "step": 95
+    },
+    {
+      "epoch": 0.020161713745668382,
+      "grad_norm": 0.663686990737915,
+      "learning_rate": 4.025157232704403e-05,
+      "loss": 0.1981,
+      "step": 96
+    },
+    {
+      "epoch": 0.02037173159718576,
+      "grad_norm": 0.7075244784355164,
+      "learning_rate": 4.067085953878407e-05,
+      "loss": 0.195,
+      "step": 97
+    },
+    {
+      "epoch": 0.02058174944870314,
+      "grad_norm": 0.8226995468139648,
+      "learning_rate": 4.109014675052411e-05,
+      "loss": 0.3464,
+      "step": 98
+    },
+    {
+      "epoch": 0.020791767300220518,
+      "grad_norm": 0.826926589012146,
+      "learning_rate": 4.150943396226415e-05,
+      "loss": 0.241,
+      "step": 99
+    },
+    {
+      "epoch": 0.0210017851517379,
+      "grad_norm": 0.8767513632774353,
+      "learning_rate": 4.192872117400419e-05,
+      "loss": 0.33,
+      "step": 100
+    },
+    {
+      "epoch": 0.021211803003255276,
+      "grad_norm": 0.9166819453239441,
+      "learning_rate": 4.2348008385744236e-05,
+      "loss": 0.3528,
+      "step": 101
+    },
+    {
+      "epoch": 0.021421820854772657,
+      "grad_norm": 0.6607112288475037,
+      "learning_rate": 4.276729559748428e-05,
+      "loss": 0.3294,
+      "step": 102
+    },
+    {
+      "epoch": 0.021631838706290034,
+      "grad_norm": 0.5891725420951843,
+      "learning_rate": 4.318658280922432e-05,
+      "loss": 0.2523,
+      "step": 103
+    },
+    {
+      "epoch": 0.021841856557807415,
+      "grad_norm": 0.5484351515769958,
+      "learning_rate": 4.3605870020964364e-05,
+      "loss": 0.3563,
+      "step": 104
+    },
+    {
+      "epoch": 0.022051874409324792,
+      "grad_norm": 0.6384206414222717,
+      "learning_rate": 4.402515723270441e-05,
+      "loss": 0.5014,
+      "step": 105
+    },
+    {
+      "epoch": 0.022261892260842173,
+      "grad_norm": 0.6228074431419373,
+      "learning_rate": 4.4444444444444447e-05,
+      "loss": 0.297,
+      "step": 106
+    },
+    {
+      "epoch": 0.02247191011235955,
+      "grad_norm": 0.6993734240531921,
+      "learning_rate": 4.4863731656184485e-05,
+      "loss": 0.3416,
+      "step": 107
+    },
+    {
+      "epoch": 0.02268192796387693,
+      "grad_norm": 0.5191211104393005,
+      "learning_rate": 4.528301886792453e-05,
+      "loss": 0.2403,
+      "step": 108
+    },
+    {
+      "epoch": 0.022891945815394308,
+      "grad_norm": 0.5719013214111328,
+      "learning_rate": 4.570230607966457e-05,
+      "loss": 0.2226,
+      "step": 109
+    },
+    {
+      "epoch": 0.02310196366691169,
+      "grad_norm": 0.5222904682159424,
+      "learning_rate": 4.612159329140461e-05,
+      "loss": 0.2119,
+      "step": 110
+    },
+    {
+      "epoch": 0.023311981518429066,
+      "grad_norm": 0.4741697609424591,
+      "learning_rate": 4.654088050314466e-05,
+      "loss": 0.2076,
+      "step": 111
+    },
+    {
+      "epoch": 0.023521999369946447,
+      "grad_norm": 0.5350250005722046,
+      "learning_rate": 4.69601677148847e-05,
+      "loss": 0.2342,
+      "step": 112
+    },
+    {
+      "epoch": 0.023732017221463824,
+      "grad_norm": 0.6532084345817566,
+      "learning_rate": 4.737945492662474e-05,
+      "loss": 0.3541,
+      "step": 113
+    },
+    {
+      "epoch": 0.023942035072981205,
+      "grad_norm": 0.6158542633056641,
+      "learning_rate": 4.7798742138364785e-05,
+      "loss": 0.2586,
+      "step": 114
+    },
+    {
+      "epoch": 0.024152052924498582,
+      "grad_norm": 0.7820281982421875,
+      "learning_rate": 4.8218029350104823e-05,
+      "loss": 0.472,
+      "step": 115
+    },
+    {
+      "epoch": 0.024362070776015963,
+      "grad_norm": 0.6096176505088806,
+      "learning_rate": 4.863731656184486e-05,
+      "loss": 0.3014,
+      "step": 116
+    },
+    {
+      "epoch": 0.02457208862753334,
+      "grad_norm": 0.5152641534805298,
+      "learning_rate": 4.9056603773584906e-05,
+      "loss": 0.2032,
+      "step": 117
+    },
+    {
+      "epoch": 0.02478210647905072,
+      "grad_norm": 0.6049755215644836,
+      "learning_rate": 4.947589098532495e-05,
+      "loss": 0.2932,
+      "step": 118
+    },
+    {
+      "epoch": 0.024992124330568098,
+      "grad_norm": 0.5203216075897217,
+      "learning_rate": 4.989517819706499e-05,
+      "loss": 0.1978,
+      "step": 119
+    },
+    {
+      "epoch": 0.02520214218208548,
+      "grad_norm": 0.6881438493728638,
+      "learning_rate": 5.0314465408805034e-05,
+      "loss": 0.3873,
+      "step": 120
+    },
+    {
+      "epoch": 0.025412160033602856,
+      "grad_norm": 0.602206289768219,
+      "learning_rate": 5.073375262054507e-05,
+      "loss": 0.2508,
+      "step": 121
+    },
+    {
+      "epoch": 0.025622177885120234,
+      "grad_norm": 0.7059246897697449,
+      "learning_rate": 5.1153039832285124e-05,
+      "loss": 0.2661,
+      "step": 122
+    },
+    {
+      "epoch": 0.025832195736637614,
+      "grad_norm": 0.44054412841796875,
+      "learning_rate": 5.157232704402516e-05,
+      "loss": 0.177,
+      "step": 123
+    },
+    {
+      "epoch": 0.02604221358815499,
+      "grad_norm": 0.6321287155151367,
+      "learning_rate": 5.19916142557652e-05,
+      "loss": 0.2849,
+      "step": 124
+    },
+    {
+      "epoch": 0.026252231439672372,
+      "grad_norm": 0.7430282235145569,
+      "learning_rate": 5.2410901467505245e-05,
+      "loss": 0.3353,
+      "step": 125
+    },
+    {
+      "epoch": 0.02646224929118975,
+      "grad_norm": 0.6884610056877136,
+      "learning_rate": 5.283018867924528e-05,
+      "loss": 0.2935,
+      "step": 126
+    },
+    {
+      "epoch": 0.02667226714270713,
+      "grad_norm": 0.8021960854530334,
+      "learning_rate": 5.324947589098532e-05,
+      "loss": 0.2609,
+      "step": 127
+    },
+    {
+      "epoch": 0.026882284994224508,
+      "grad_norm": 0.5545848608016968,
+      "learning_rate": 5.366876310272537e-05,
+      "loss": 0.1972,
+      "step": 128
+    },
+    {
+      "epoch": 0.02709230284574189,
+      "grad_norm": 0.6628248691558838,
+      "learning_rate": 5.408805031446541e-05,
+      "loss": 0.2184,
+      "step": 129
+    },
+    {
+      "epoch": 0.027302320697259266,
+      "grad_norm": 0.5908805131912231,
+      "learning_rate": 5.450733752620545e-05,
+      "loss": 0.2029,
+      "step": 130
+    },
+    {
+      "epoch": 0.027512338548776646,
+      "grad_norm": 0.6377450823783875,
+      "learning_rate": 5.49266247379455e-05,
+      "loss": 0.2382,
+      "step": 131
+    },
+    {
+      "epoch": 0.027722356400294024,
+      "grad_norm": 0.7006211876869202,
+      "learning_rate": 5.534591194968554e-05,
+      "loss": 0.1936,
+      "step": 132
+    },
+    {
+      "epoch": 0.027932374251811404,
+      "grad_norm": 0.5962005257606506,
+      "learning_rate": 5.576519916142558e-05,
+      "loss": 0.2922,
+      "step": 133
+    },
+    {
+      "epoch": 0.028142392103328782,
+      "grad_norm": 0.6030206084251404,
+      "learning_rate": 5.618448637316562e-05,
+      "loss": 0.1629,
+      "step": 134
+    },
+    {
+      "epoch": 0.028352409954846162,
+      "grad_norm": 0.7888013124465942,
+      "learning_rate": 5.660377358490566e-05,
+      "loss": 0.2866,
+      "step": 135
+    },
+    {
+      "epoch": 0.02856242780636354,
+      "grad_norm": 0.5116386413574219,
+      "learning_rate": 5.70230607966457e-05,
+      "loss": 0.1963,
+      "step": 136
+    },
+    {
+      "epoch": 0.02877244565788092,
+      "grad_norm": 0.6759427785873413,
+      "learning_rate": 5.744234800838575e-05,
+      "loss": 0.2412,
+      "step": 137
+    },
+    {
+      "epoch": 0.028982463509398298,
+      "grad_norm": 0.8643584847450256,
+      "learning_rate": 5.786163522012579e-05,
+      "loss": 0.2277,
+      "step": 138
+    },
+    {
+      "epoch": 0.02919248136091568,
+      "grad_norm": 0.639639139175415,
+      "learning_rate": 5.8280922431865826e-05,
+      "loss": 0.2286,
+      "step": 139
+    },
+    {
+      "epoch": 0.029402499212433056,
+      "grad_norm": 0.6094908714294434,
+      "learning_rate": 5.870020964360588e-05,
+      "loss": 0.1656,
+      "step": 140
+    },
+    {
+      "epoch": 0.029612517063950437,
+      "grad_norm": 0.7927185297012329,
+      "learning_rate": 5.9119496855345916e-05,
+      "loss": 0.2436,
+      "step": 141
+    },
+    {
+      "epoch": 0.029822534915467814,
+      "grad_norm": 0.8780869841575623,
+      "learning_rate": 5.953878406708596e-05,
+      "loss": 0.2614,
+      "step": 142
+    },
+    {
+      "epoch": 0.030032552766985195,
+      "grad_norm": 0.5985304117202759,
+      "learning_rate": 5.9958071278826e-05,
+      "loss": 0.2268,
+      "step": 143
+    },
+    {
+      "epoch": 0.030242570618502572,
+      "grad_norm": 0.6452706456184387,
+      "learning_rate": 6.037735849056604e-05,
+      "loss": 0.211,
+      "step": 144
+    },
+    {
+      "epoch": 0.030452588470019953,
+      "grad_norm": 0.8015931844711304,
+      "learning_rate": 6.079664570230609e-05,
+      "loss": 0.3532,
+      "step": 145
+    },
+    {
+      "epoch": 0.03066260632153733,
+      "grad_norm": 0.667226254940033,
+      "learning_rate": 6.121593291404612e-05,
+      "loss": 0.2051,
+      "step": 146
+    },
+    {
+      "epoch": 0.03087262417305471,
+      "grad_norm": 0.6942270398139954,
+      "learning_rate": 6.163522012578616e-05,
+      "loss": 0.2516,
+      "step": 147
+    },
+    {
+      "epoch": 0.031082642024572088,
+      "grad_norm": 0.845588743686676,
+      "learning_rate": 6.205450733752621e-05,
+      "loss": 0.257,
+      "step": 148
+    },
+    {
+      "epoch": 0.031292659876089465,
+      "grad_norm": 0.6104562878608704,
+      "learning_rate": 6.247379454926625e-05,
+      "loss": 0.246,
+      "step": 149
+    },
+    {
+      "epoch": 0.03150267772760685,
+      "grad_norm": 0.7243993282318115,
+      "learning_rate": 6.289308176100629e-05,
+      "loss": 0.2623,
+      "step": 150
+    },
+    {
+      "epoch": 0.03171269557912423,
+      "grad_norm": 0.6479102373123169,
+      "learning_rate": 6.331236897274634e-05,
+      "loss": 0.3154,
+      "step": 151
+    },
+    {
+      "epoch": 0.031922713430641604,
+      "grad_norm": 0.6088507175445557,
+      "learning_rate": 6.373165618448638e-05,
+      "loss": 0.3639,
+      "step": 152
+    },
+    {
+      "epoch": 0.03213273128215898,
+      "grad_norm": 0.5590083599090576,
+      "learning_rate": 6.415094339622641e-05,
+      "loss": 0.2753,
+      "step": 153
+    },
+    {
+      "epoch": 0.032342749133676366,
+      "grad_norm": 0.6644802093505859,
+      "learning_rate": 6.457023060796647e-05,
+      "loss": 0.2722,
+      "step": 154
+    },
+    {
+      "epoch": 0.03255276698519374,
+      "grad_norm": 0.6034846901893616,
+      "learning_rate": 6.49895178197065e-05,
+      "loss": 0.3084,
+      "step": 155
+    },
+    {
+      "epoch": 0.03276278483671112,
+      "grad_norm": 0.897366464138031,
+      "learning_rate": 6.540880503144654e-05,
+      "loss": 0.2834,
+      "step": 156
+    },
+    {
+      "epoch": 0.0329728026882285,
+      "grad_norm": 0.7516223788261414,
+      "learning_rate": 6.58280922431866e-05,
+      "loss": 0.2515,
+      "step": 157
+    },
+    {
+      "epoch": 0.03318282053974588,
+      "grad_norm": 0.712957501411438,
+      "learning_rate": 6.624737945492663e-05,
+      "loss": 0.217,
+      "step": 158
+    },
+    {
+      "epoch": 0.03339283839126326,
+      "grad_norm": 0.6373322010040283,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 0.3091,
+      "step": 159
+    },
+    {
+      "epoch": 0.033602856242780636,
+      "grad_norm": 0.6305301189422607,
+      "learning_rate": 6.708595387840672e-05,
+      "loss": 0.1965,
+      "step": 160
+    },
+    {
+      "epoch": 0.033812874094298014,
+      "grad_norm": 0.6340491771697998,
+      "learning_rate": 6.750524109014676e-05,
+      "loss": 0.2316,
+      "step": 161
+    },
+    {
+      "epoch": 0.0340228919458154,
+      "grad_norm": 0.6992335915565491,
+      "learning_rate": 6.79245283018868e-05,
+      "loss": 0.321,
+      "step": 162
+    },
+    {
+      "epoch": 0.034232909797332775,
+      "grad_norm": 0.723899245262146,
+      "learning_rate": 6.834381551362684e-05,
+      "loss": 0.2057,
+      "step": 163
+    },
+    {
+      "epoch": 0.03444292764885015,
+      "grad_norm": 0.6245738863945007,
+      "learning_rate": 6.876310272536687e-05,
+      "loss": 0.2367,
+      "step": 164
+    },
+    {
+      "epoch": 0.03465294550036753,
+      "grad_norm": 0.716299295425415,
+      "learning_rate": 6.918238993710691e-05,
+      "loss": 0.3107,
+      "step": 165
+    },
+    {
+      "epoch": 0.03486296335188491,
+      "grad_norm": 0.8374738097190857,
+      "learning_rate": 6.960167714884696e-05,
+      "loss": 0.4097,
+      "step": 166
+    },
+    {
+      "epoch": 0.03507298120340229,
+      "grad_norm": 0.7812545299530029,
+      "learning_rate": 7.0020964360587e-05,
+      "loss": 0.389,
+      "step": 167
+    },
+    {
+      "epoch": 0.03528299905491967,
+      "grad_norm": 0.516504168510437,
+      "learning_rate": 7.044025157232704e-05,
+      "loss": 0.2321,
+      "step": 168
+    },
+    {
+      "epoch": 0.035493016906437046,
+      "grad_norm": 0.5948511958122253,
+      "learning_rate": 7.085953878406709e-05,
+      "loss": 0.2075,
+      "step": 169
+    },
+    {
+      "epoch": 0.03570303475795442,
+      "grad_norm": 0.5658239126205444,
+      "learning_rate": 7.127882599580713e-05,
+      "loss": 0.2366,
+      "step": 170
+    },
+    {
+      "epoch": 0.03591305260947181,
+      "grad_norm": 0.44888898730278015,
+      "learning_rate": 7.169811320754717e-05,
+      "loss": 0.226,
+      "step": 171
+    },
+    {
+      "epoch": 0.036123070460989185,
+      "grad_norm": 0.5403774380683899,
+      "learning_rate": 7.211740041928722e-05,
+      "loss": 0.2887,
+      "step": 172
+    },
+    {
+      "epoch": 0.03633308831250656,
+      "grad_norm": 0.5742720365524292,
+      "learning_rate": 7.253668763102726e-05,
+      "loss": 0.1841,
+      "step": 173
+    },
+    {
+      "epoch": 0.03654310616402394,
+      "grad_norm": 0.7217287421226501,
+      "learning_rate": 7.29559748427673e-05,
+      "loss": 0.283,
+      "step": 174
+    },
+    {
+      "epoch": 0.03675312401554132,
+      "grad_norm": 0.6517660021781921,
+      "learning_rate": 7.337526205450735e-05,
+      "loss": 0.277,
+      "step": 175
+    },
+    {
+      "epoch": 0.0369631418670587,
+      "grad_norm": 0.5237565040588379,
+      "learning_rate": 7.379454926624739e-05,
+      "loss": 0.2764,
+      "step": 176
+    },
+    {
+      "epoch": 0.03717315971857608,
+      "grad_norm": 0.5715314745903015,
+      "learning_rate": 7.421383647798742e-05,
+      "loss": 0.2947,
+      "step": 177
+    },
+    {
+      "epoch": 0.037383177570093455,
+      "grad_norm": 0.39689743518829346,
+      "learning_rate": 7.463312368972748e-05,
+      "loss": 0.1478,
+      "step": 178
+    },
+    {
+      "epoch": 0.03759319542161084,
+      "grad_norm": 0.62773197889328,
+      "learning_rate": 7.505241090146751e-05,
+      "loss": 0.2688,
+      "step": 179
+    },
+    {
+      "epoch": 0.03780321327312822,
+      "grad_norm": 0.5422549247741699,
+      "learning_rate": 7.547169811320755e-05,
+      "loss": 0.2852,
+      "step": 180
+    },
+    {
+      "epoch": 0.038013231124645594,
+      "grad_norm": 0.7973243594169617,
+      "learning_rate": 7.589098532494759e-05,
+      "loss": 0.2414,
+      "step": 181
+    },
+    {
+      "epoch": 0.03822324897616297,
+      "grad_norm": 0.596788227558136,
+      "learning_rate": 7.631027253668763e-05,
+      "loss": 0.2979,
+      "step": 182
+    },
+    {
+      "epoch": 0.038433266827680355,
+      "grad_norm": 0.7164194583892822,
+      "learning_rate": 7.672955974842768e-05,
+      "loss": 0.3195,
+      "step": 183
+    },
+    {
+      "epoch": 0.03864328467919773,
+      "grad_norm": 0.6374505758285522,
+      "learning_rate": 7.714884696016772e-05,
+      "loss": 0.2244,
+      "step": 184
+    },
+    {
+      "epoch": 0.03885330253071511,
+      "grad_norm": 0.7066443562507629,
+      "learning_rate": 7.756813417190776e-05,
+      "loss": 0.3328,
+      "step": 185
+    },
+    {
+      "epoch": 0.03906332038223249,
+      "grad_norm": 0.5930470824241638,
+      "learning_rate": 7.798742138364781e-05,
+      "loss": 0.208,
+      "step": 186
+    },
+    {
+      "epoch": 0.03927333823374987,
+      "grad_norm": 0.7578011155128479,
+      "learning_rate": 7.840670859538785e-05,
+      "loss": 0.3468,
+      "step": 187
+    },
+    {
+      "epoch": 0.03948335608526725,
+      "grad_norm": 0.5424745678901672,
+      "learning_rate": 7.882599580712788e-05,
+      "loss": 0.159,
+      "step": 188
+    },
+    {
+      "epoch": 0.039693373936784626,
+      "grad_norm": 0.6554968953132629,
+      "learning_rate": 7.924528301886794e-05,
+      "loss": 0.1718,
+      "step": 189
+    },
+    {
+      "epoch": 0.039903391788302,
+      "grad_norm": 0.596862256526947,
+      "learning_rate": 7.966457023060797e-05,
+      "loss": 0.2238,
+      "step": 190
+    },
+    {
+      "epoch": 0.04011340963981939,
+      "grad_norm": 0.7238299250602722,
+      "learning_rate": 8.008385744234801e-05,
+      "loss": 0.2322,
+      "step": 191
+    },
+    {
+      "epoch": 0.040323427491336765,
+      "grad_norm": 0.6230559349060059,
+      "learning_rate": 8.050314465408806e-05,
+      "loss": 0.166,
+      "step": 192
+    },
+    {
+      "epoch": 0.04053344534285414,
+      "grad_norm": 0.63409823179245,
+      "learning_rate": 8.09224318658281e-05,
+      "loss": 0.2431,
+      "step": 193
+    },
+    {
+      "epoch": 0.04074346319437152,
+      "grad_norm": 0.43581536412239075,
+      "learning_rate": 8.134171907756814e-05,
+      "loss": 0.1758,
+      "step": 194
+    },
+    {
+      "epoch": 0.040953481045888904,
+      "grad_norm": 0.5425090789794922,
+      "learning_rate": 8.176100628930818e-05,
+      "loss": 0.1214,
+      "step": 195
+    },
+    {
+      "epoch": 0.04116349889740628,
+      "grad_norm": 0.44201138615608215,
+      "learning_rate": 8.218029350104822e-05,
+      "loss": 0.148,
+      "step": 196
+    },
+    {
+      "epoch": 0.04137351674892366,
+      "grad_norm": 0.8185025453567505,
+      "learning_rate": 8.259958071278825e-05,
+      "loss": 0.2788,
+      "step": 197
+    },
+    {
+      "epoch": 0.041583534600441036,
+      "grad_norm": 0.5838762521743774,
+      "learning_rate": 8.30188679245283e-05,
+      "loss": 0.2547,
+      "step": 198
+    },
+    {
+      "epoch": 0.04179355245195842,
+      "grad_norm": 0.6128750443458557,
+      "learning_rate": 8.343815513626834e-05,
+      "loss": 0.2107,
+      "step": 199
+    },
+    {
+      "epoch": 0.0420035703034758,
+      "grad_norm": 0.6906862854957581,
+      "learning_rate": 8.385744234800838e-05,
+      "loss": 0.2066,
+      "step": 200
+    },
+    {
+      "epoch": 0.042213588154993174,
+      "grad_norm": 0.46028971672058105,
+      "learning_rate": 8.427672955974843e-05,
+      "loss": 0.3119,
+      "step": 201
+    },
+    {
+      "epoch": 0.04242360600651055,
+      "grad_norm": 0.45515576004981995,
+      "learning_rate": 8.469601677148847e-05,
+      "loss": 0.3191,
+      "step": 202
+    },
+    {
+      "epoch": 0.04263362385802793,
+      "grad_norm": 0.5129204392433167,
+      "learning_rate": 8.511530398322851e-05,
+      "loss": 0.2978,
+      "step": 203
+    },
+    {
+      "epoch": 0.04284364170954531,
+      "grad_norm": 0.5413036942481995,
+      "learning_rate": 8.553459119496856e-05,
+      "loss": 0.2489,
+      "step": 204
+    },
+    {
+      "epoch": 0.04305365956106269,
+      "grad_norm": 0.5111532211303711,
+      "learning_rate": 8.59538784067086e-05,
+      "loss": 0.2322,
+      "step": 205
+    },
+    {
+      "epoch": 0.04326367741258007,
+      "grad_norm": 0.50229412317276,
+      "learning_rate": 8.637316561844864e-05,
+      "loss": 0.2858,
+      "step": 206
+    },
+    {
+      "epoch": 0.043473695264097445,
+      "grad_norm": 0.5821024179458618,
+      "learning_rate": 8.679245283018869e-05,
+      "loss": 0.3118,
+      "step": 207
+    },
+    {
+      "epoch": 0.04368371311561483,
+      "grad_norm": 0.5523480176925659,
+      "learning_rate": 8.721174004192873e-05,
+      "loss": 0.3757,
+      "step": 208
+    },
+    {
+      "epoch": 0.043893730967132207,
+      "grad_norm": 0.721248209476471,
+      "learning_rate": 8.763102725366877e-05,
+      "loss": 0.3196,
+      "step": 209
+    },
+    {
+      "epoch": 0.044103748818649584,
+      "grad_norm": 0.47591283917427063,
+      "learning_rate": 8.805031446540882e-05,
+      "loss": 0.233,
+      "step": 210
+    },
+    {
+      "epoch": 0.04431376667016696,
+      "grad_norm": 0.5817727446556091,
+      "learning_rate": 8.846960167714886e-05,
+      "loss": 0.2924,
+      "step": 211
+    },
+    {
+      "epoch": 0.044523784521684345,
+      "grad_norm": 0.5370981097221375,
+      "learning_rate": 8.888888888888889e-05,
+      "loss": 0.2487,
+      "step": 212
+    },
+    {
+      "epoch": 0.04473380237320172,
+      "grad_norm": 0.47605571150779724,
+      "learning_rate": 8.930817610062893e-05,
+      "loss": 0.2249,
+      "step": 213
+    },
+    {
+      "epoch": 0.0449438202247191,
+      "grad_norm": 0.6315035223960876,
+      "learning_rate": 8.972746331236897e-05,
+      "loss": 0.2805,
+      "step": 214
+    },
+    {
+      "epoch": 0.04515383807623648,
+      "grad_norm": 0.7511045932769775,
+      "learning_rate": 9.014675052410901e-05,
+      "loss": 0.2235,
+      "step": 215
+    },
+    {
+      "epoch": 0.04536385592775386,
+      "grad_norm": 0.6990196704864502,
+      "learning_rate": 9.056603773584906e-05,
+      "loss": 0.2548,
+      "step": 216
+    },
+    {
+      "epoch": 0.04557387377927124,
+      "grad_norm": 0.6954676508903503,
+      "learning_rate": 9.09853249475891e-05,
+      "loss": 0.2232,
+      "step": 217
+    },
+    {
+      "epoch": 0.045783891630788616,
+      "grad_norm": 0.6898313164710999,
+      "learning_rate": 9.140461215932914e-05,
+      "loss": 0.2241,
+      "step": 218
+    },
+    {
+      "epoch": 0.04599390948230599,
+      "grad_norm": 0.649360716342926,
+      "learning_rate": 9.182389937106919e-05,
+      "loss": 0.3236,
+      "step": 219
+    },
+    {
+      "epoch": 0.04620392733382338,
+      "grad_norm": 0.6722248792648315,
+      "learning_rate": 9.224318658280923e-05,
+      "loss": 0.4025,
+      "step": 220
+    },
+    {
+      "epoch": 0.046413945185340755,
+      "grad_norm": 0.8752652406692505,
+      "learning_rate": 9.266247379454928e-05,
+      "loss": 0.2937,
+      "step": 221
+    },
+    {
+      "epoch": 0.04662396303685813,
+      "grad_norm": 0.6925809979438782,
+      "learning_rate": 9.308176100628931e-05,
+      "loss": 0.2719,
+      "step": 222
+    },
+    {
+      "epoch": 0.04683398088837551,
+      "grad_norm": 0.6006962656974792,
+      "learning_rate": 9.350104821802935e-05,
+      "loss": 0.3025,
+      "step": 223
+    },
+    {
+      "epoch": 0.047043998739892894,
+      "grad_norm": 0.7841734290122986,
+      "learning_rate": 9.39203354297694e-05,
+      "loss": 0.267,
+      "step": 224
+    },
+    {
+      "epoch": 0.04725401659141027,
+      "grad_norm": 0.6895375847816467,
+      "learning_rate": 9.433962264150944e-05,
+      "loss": 0.185,
+      "step": 225
+    },
+    {
+      "epoch": 0.04746403444292765,
+      "grad_norm": 0.4874520003795624,
+      "learning_rate": 9.475890985324948e-05,
+      "loss": 0.1967,
+      "step": 226
+    },
+    {
+      "epoch": 0.047674052294445025,
+      "grad_norm": 0.5807353258132935,
+      "learning_rate": 9.517819706498953e-05,
+      "loss": 0.2649,
+      "step": 227
+    },
+    {
+      "epoch": 0.04788407014596241,
+      "grad_norm": 0.5833479166030884,
+      "learning_rate": 9.559748427672957e-05,
+      "loss": 0.2506,
+      "step": 228
+    },
+    {
+      "epoch": 0.04809408799747979,
+      "grad_norm": 0.5458546280860901,
+      "learning_rate": 9.601677148846961e-05,
+      "loss": 0.2338,
+      "step": 229
+    },
+    {
+      "epoch": 0.048304105848997164,
+      "grad_norm": 0.543692409992218,
+      "learning_rate": 9.643605870020965e-05,
+      "loss": 0.2058,
+      "step": 230
+    },
+    {
+      "epoch": 0.04851412370051454,
+      "grad_norm": 0.8071588277816772,
+      "learning_rate": 9.685534591194969e-05,
+      "loss": 0.2658,
+      "step": 231
+    },
+    {
+      "epoch": 0.048724141552031926,
+      "grad_norm": 0.6677277088165283,
+      "learning_rate": 9.727463312368972e-05,
+      "loss": 0.3603,
+      "step": 232
+    },
+    {
+      "epoch": 0.0489341594035493,
+      "grad_norm": 0.6699347496032715,
+      "learning_rate": 9.769392033542977e-05,
+      "loss": 0.2474,
+      "step": 233
+    },
+    {
+      "epoch": 0.04914417725506668,
+      "grad_norm": 0.47732773423194885,
+      "learning_rate": 9.811320754716981e-05,
+      "loss": 0.1856,
+      "step": 234
+    },
+    {
+      "epoch": 0.04935419510658406,
+      "grad_norm": 0.738476037979126,
+      "learning_rate": 9.853249475890985e-05,
+      "loss": 0.2443,
+      "step": 235
+    },
+    {
+      "epoch": 0.04956421295810144,
+      "grad_norm": 0.6604114174842834,
+      "learning_rate": 9.89517819706499e-05,
+      "loss": 0.2656,
+      "step": 236
+    },
+    {
+      "epoch": 0.04977423080961882,
+      "grad_norm": 0.6403035521507263,
+      "learning_rate": 9.937106918238994e-05,
+      "loss": 0.2248,
+      "step": 237
+    },
+    {
+      "epoch": 0.049984248661136196,
+      "grad_norm": 0.7960561513900757,
+      "learning_rate": 9.979035639412998e-05,
+      "loss": 0.5721,
+      "step": 238
+    },
+    {
+      "epoch": 0.050194266512653574,
+      "grad_norm": 0.7372507452964783,
+      "learning_rate": 0.00010020964360587002,
+      "loss": 0.263,
+      "step": 239
+    },
+    {
+      "epoch": 0.05040428436417096,
+      "grad_norm": 0.5040899515151978,
+      "learning_rate": 0.00010062893081761007,
+      "loss": 0.2005,
+      "step": 240
+    },
+    {
+      "epoch": 0.050614302215688335,
+      "grad_norm": 0.5214698314666748,
+      "learning_rate": 0.00010104821802935012,
+      "loss": 0.1686,
+      "step": 241
+    },
+    {
+      "epoch": 0.05082432006720571,
+      "grad_norm": 0.5759347677230835,
+      "learning_rate": 0.00010146750524109014,
+      "loss": 0.213,
+      "step": 242
+    },
+    {
+      "epoch": 0.05103433791872309,
+      "grad_norm": 0.5980076789855957,
+      "learning_rate": 0.0001018867924528302,
+      "loss": 0.2073,
+      "step": 243
+    },
+    {
+      "epoch": 0.05124435577024047,
+      "grad_norm": 0.4733896851539612,
+      "learning_rate": 0.00010230607966457025,
+      "loss": 0.2275,
+      "step": 244
+    },
+    {
+      "epoch": 0.05145437362175785,
+      "grad_norm": 0.597186803817749,
+      "learning_rate": 0.00010272536687631027,
+      "loss": 0.2205,
+      "step": 245
+    },
+    {
+      "epoch": 0.05166439147327523,
+      "grad_norm": 0.7414665818214417,
+      "learning_rate": 0.00010314465408805032,
+      "loss": 0.2796,
+      "step": 246
+    },
+    {
+      "epoch": 0.051874409324792606,
+      "grad_norm": 0.5712472200393677,
+      "learning_rate": 0.00010356394129979036,
+      "loss": 0.2343,
+      "step": 247
+    },
+    {
+      "epoch": 0.05208442717630998,
+      "grad_norm": 0.7146592140197754,
+      "learning_rate": 0.0001039832285115304,
+      "loss": 0.2339,
+      "step": 248
+    },
+    {
+      "epoch": 0.05229444502782737,
+      "grad_norm": 0.5384387373924255,
+      "learning_rate": 0.00010440251572327044,
+      "loss": 0.3013,
+      "step": 249
+    },
+    {
+      "epoch": 0.052504462879344745,
+      "grad_norm": 0.6523765921592712,
+      "learning_rate": 0.00010482180293501049,
+      "loss": 0.2295,
+      "step": 250
+    },
+    {
+      "epoch": 0.05271448073086212,
+      "grad_norm": 0.5877966284751892,
+      "learning_rate": 0.00010524109014675052,
+      "loss": 0.3045,
+      "step": 251
+    },
+    {
+      "epoch": 0.0529244985823795,
+      "grad_norm": 0.5458659529685974,
+      "learning_rate": 0.00010566037735849057,
+      "loss": 0.3458,
+      "step": 252
+    },
+    {
+      "epoch": 0.05313451643389688,
+      "grad_norm": 0.6964541673660278,
+      "learning_rate": 0.00010607966457023062,
+      "loss": 0.3715,
+      "step": 253
+    },
+    {
+      "epoch": 0.05334453428541426,
+      "grad_norm": 0.4040902853012085,
+      "learning_rate": 0.00010649895178197064,
+      "loss": 0.1554,
+      "step": 254
+    },
+    {
+      "epoch": 0.05355455213693164,
+      "grad_norm": 0.7013939023017883,
+      "learning_rate": 0.0001069182389937107,
+      "loss": 0.4111,
+      "step": 255
+    },
+    {
+      "epoch": 0.053764569988449015,
+      "grad_norm": 0.622279942035675,
+      "learning_rate": 0.00010733752620545075,
+      "loss": 0.2715,
+      "step": 256
+    },
+    {
+      "epoch": 0.0539745878399664,
+      "grad_norm": 0.6869280934333801,
+      "learning_rate": 0.00010775681341719077,
+      "loss": 0.3444,
+      "step": 257
+    },
+    {
+      "epoch": 0.05418460569148378,
+      "grad_norm": 0.6003340482711792,
+      "learning_rate": 0.00010817610062893082,
+      "loss": 0.261,
+      "step": 258
+    },
+    {
+      "epoch": 0.054394623543001154,
+      "grad_norm": 0.4238702356815338,
+      "learning_rate": 0.00010859538784067087,
+      "loss": 0.2615,
+      "step": 259
+    },
+    {
+      "epoch": 0.05460464139451853,
+      "grad_norm": 0.5799490809440613,
+      "learning_rate": 0.0001090146750524109,
+      "loss": 0.3293,
+      "step": 260
+    },
+    {
+      "epoch": 0.054814659246035916,
+      "grad_norm": 0.6378821134567261,
+      "learning_rate": 0.00010943396226415095,
+      "loss": 0.3184,
+      "step": 261
+    },
+    {
+      "epoch": 0.05502467709755329,
+      "grad_norm": 0.540050745010376,
+      "learning_rate": 0.000109853249475891,
+      "loss": 0.262,
+      "step": 262
+    },
+    {
+      "epoch": 0.05523469494907067,
+      "grad_norm": 0.551968514919281,
+      "learning_rate": 0.00011027253668763103,
+      "loss": 0.3183,
+      "step": 263
+    },
+    {
+      "epoch": 0.05544471280058805,
+      "grad_norm": 0.6024414896965027,
+      "learning_rate": 0.00011069182389937108,
+      "loss": 0.2883,
+      "step": 264
+    },
+    {
+      "epoch": 0.05565473065210543,
+      "grad_norm": 0.6821377277374268,
+      "learning_rate": 0.00011111111111111112,
+      "loss": 0.3178,
+      "step": 265
+    },
+    {
+      "epoch": 0.05586474850362281,
+      "grad_norm": 0.6535030603408813,
+      "learning_rate": 0.00011153039832285115,
+      "loss": 0.2091,
+      "step": 266
+    },
+    {
+      "epoch": 0.056074766355140186,
+      "grad_norm": 0.5255767703056335,
+      "learning_rate": 0.00011194968553459119,
+      "loss": 0.1802,
+      "step": 267
+    },
+    {
+      "epoch": 0.056284784206657563,
+      "grad_norm": 0.59894859790802,
+      "learning_rate": 0.00011236897274633124,
+      "loss": 0.2526,
+      "step": 268
+    },
+    {
+      "epoch": 0.05649480205817495,
+      "grad_norm": 0.6401522159576416,
+      "learning_rate": 0.00011278825995807127,
+      "loss": 0.2462,
+      "step": 269
+    },
+    {
+      "epoch": 0.056704819909692325,
+      "grad_norm": 0.6755663752555847,
+      "learning_rate": 0.00011320754716981132,
+      "loss": 0.1973,
+      "step": 270
+    },
+    {
+      "epoch": 0.0569148377612097,
+      "grad_norm": 0.5885902643203735,
+      "learning_rate": 0.00011362683438155137,
+      "loss": 0.2111,
+      "step": 271
+    },
+    {
+      "epoch": 0.05712485561272708,
+      "grad_norm": 0.48771098256111145,
+      "learning_rate": 0.0001140461215932914,
+      "loss": 0.2409,
+      "step": 272
+    },
+    {
+      "epoch": 0.057334873464244464,
+      "grad_norm": 0.5513054132461548,
+      "learning_rate": 0.00011446540880503145,
+      "loss": 0.1883,
+      "step": 273
+    },
+    {
+      "epoch": 0.05754489131576184,
+      "grad_norm": 0.43761372566223145,
+      "learning_rate": 0.0001148846960167715,
+      "loss": 0.228,
+      "step": 274
+    },
+    {
+      "epoch": 0.05775490916727922,
+      "grad_norm": 0.5232881307601929,
+      "learning_rate": 0.00011530398322851152,
+      "loss": 0.1592,
+      "step": 275
+    },
+    {
+      "epoch": 0.057964927018796596,
+      "grad_norm": 0.5873312950134277,
+      "learning_rate": 0.00011572327044025158,
+      "loss": 0.1972,
+      "step": 276
+    },
+    {
+      "epoch": 0.05817494487031398,
+      "grad_norm": 0.5464483499526978,
+      "learning_rate": 0.00011614255765199163,
+      "loss": 0.1502,
+      "step": 277
+    },
+    {
+      "epoch": 0.05838496272183136,
+      "grad_norm": 0.6480989456176758,
+      "learning_rate": 0.00011656184486373165,
+      "loss": 0.2349,
+      "step": 278
+    },
+    {
+      "epoch": 0.058594980573348734,
+      "grad_norm": 0.41417571902275085,
+      "learning_rate": 0.0001169811320754717,
+      "loss": 0.2097,
+      "step": 279
+    },
+    {
+      "epoch": 0.05880499842486611,
+      "grad_norm": 0.8272523880004883,
+      "learning_rate": 0.00011740041928721176,
+      "loss": 0.2676,
+      "step": 280
+    },
+    {
+      "epoch": 0.059015016276383496,
+      "grad_norm": 0.6363915205001831,
+      "learning_rate": 0.0001178197064989518,
+      "loss": 0.3084,
+      "step": 281
+    },
+    {
+      "epoch": 0.05922503412790087,
+      "grad_norm": 0.6411394476890564,
+      "learning_rate": 0.00011823899371069183,
+      "loss": 0.244,
+      "step": 282
+    },
+    {
+      "epoch": 0.05943505197941825,
+      "grad_norm": 0.9145995378494263,
+      "learning_rate": 0.00011865828092243187,
+      "loss": 0.298,
+      "step": 283
+    },
+    {
+      "epoch": 0.05964506983093563,
+      "grad_norm": 0.7248232960700989,
+      "learning_rate": 0.00011907756813417192,
+      "loss": 0.2438,
+      "step": 284
+    },
+    {
+      "epoch": 0.059855087682453005,
+      "grad_norm": 0.4901827573776245,
+      "learning_rate": 0.00011949685534591195,
+      "loss": 0.1903,
+      "step": 285
+    },
+    {
+      "epoch": 0.06006510553397039,
+      "grad_norm": 0.5104687809944153,
+      "learning_rate": 0.000119916142557652,
+      "loss": 0.2014,
+      "step": 286
+    },
+    {
+      "epoch": 0.06027512338548777,
+      "grad_norm": 0.5063393712043762,
+      "learning_rate": 0.00012033542976939205,
+      "loss": 0.212,
+      "step": 287
+    },
+    {
+      "epoch": 0.060485141237005144,
+      "grad_norm": 0.6044209599494934,
+      "learning_rate": 0.00012075471698113207,
+      "loss": 0.3138,
+      "step": 288
+    },
+    {
+      "epoch": 0.06069515908852252,
+      "grad_norm": 0.5843082666397095,
+      "learning_rate": 0.00012117400419287213,
+      "loss": 0.2199,
+      "step": 289
+    },
+    {
+      "epoch": 0.060905176940039905,
+      "grad_norm": 0.4589983820915222,
+      "learning_rate": 0.00012159329140461218,
+      "loss": 0.2222,
+      "step": 290
+    },
+    {
+      "epoch": 0.06111519479155728,
+      "grad_norm": 0.4094448983669281,
+      "learning_rate": 0.0001220125786163522,
+      "loss": 0.1286,
+      "step": 291
+    },
+    {
+      "epoch": 0.06132521264307466,
+      "grad_norm": 0.42624855041503906,
+      "learning_rate": 0.00012243186582809224,
+      "loss": 0.2719,
+      "step": 292
+    },
+    {
+      "epoch": 0.06153523049459204,
+      "grad_norm": 0.5488569736480713,
+      "learning_rate": 0.0001228511530398323,
+      "loss": 0.2588,
+      "step": 293
+    },
+    {
+      "epoch": 0.06174524834610942,
+      "grad_norm": 0.6029438972473145,
+      "learning_rate": 0.00012327044025157232,
+      "loss": 0.3182,
+      "step": 294
+    },
+    {
+      "epoch": 0.0619552661976268,
+      "grad_norm": 0.49090123176574707,
+      "learning_rate": 0.00012368972746331237,
+      "loss": 0.2192,
+      "step": 295
+    },
+    {
+      "epoch": 0.062165284049144176,
+      "grad_norm": 0.7553131580352783,
+      "learning_rate": 0.00012410901467505242,
+      "loss": 0.2932,
+      "step": 296
+    },
+    {
+      "epoch": 0.06237530190066155,
+      "grad_norm": 0.6839373707771301,
+      "learning_rate": 0.00012452830188679244,
+      "loss": 0.1896,
+      "step": 297
+    },
+    {
+      "epoch": 0.06258531975217893,
+      "grad_norm": 0.5805861949920654,
+      "learning_rate": 0.0001249475890985325,
+      "loss": 0.2613,
+      "step": 298
+    },
+    {
+      "epoch": 0.06279533760369631,
+      "grad_norm": 0.4247298836708069,
+      "learning_rate": 0.00012536687631027255,
+      "loss": 0.1701,
+      "step": 299
+    },
+    {
+      "epoch": 0.0630053554552137,
+      "grad_norm": 0.6167422533035278,
+      "learning_rate": 0.00012578616352201257,
+      "loss": 0.2919,
+      "step": 300
+    },
+    {
+      "epoch": 0.06321537330673108,
+      "grad_norm": 0.5140472054481506,
+      "learning_rate": 0.00012620545073375262,
+      "loss": 0.2204,
+      "step": 301
+    },
+    {
+      "epoch": 0.06342539115824845,
+      "grad_norm": 0.48360675573349,
+      "learning_rate": 0.00012662473794549268,
+      "loss": 0.2625,
+      "step": 302
+    },
+    {
+      "epoch": 0.06363540900976583,
+      "grad_norm": 0.5805841684341431,
+      "learning_rate": 0.0001270440251572327,
+      "loss": 0.2659,
+      "step": 303
+    },
+    {
+      "epoch": 0.06384542686128321,
+      "grad_norm": 0.4108704924583435,
+      "learning_rate": 0.00012746331236897275,
+      "loss": 0.1757,
+      "step": 304
+    },
+    {
+      "epoch": 0.06405544471280059,
+      "grad_norm": 0.4739980697631836,
+      "learning_rate": 0.0001278825995807128,
+      "loss": 0.2413,
+      "step": 305
+    },
+    {
+      "epoch": 0.06426546256431796,
+      "grad_norm": 0.6421864032745361,
+      "learning_rate": 0.00012830188679245283,
+      "loss": 0.3373,
+      "step": 306
+    },
+    {
+      "epoch": 0.06447548041583534,
+      "grad_norm": 0.6035056114196777,
+      "learning_rate": 0.00012872117400419288,
+      "loss": 0.1632,
+      "step": 307
+    },
+    {
+      "epoch": 0.06468549826735273,
+      "grad_norm": 0.5946957468986511,
+      "learning_rate": 0.00012914046121593293,
+      "loss": 0.2797,
+      "step": 308
+    },
+    {
+      "epoch": 0.06489551611887011,
+      "grad_norm": 0.5636250972747803,
+      "learning_rate": 0.00012955974842767296,
+      "loss": 0.3484,
+      "step": 309
+    },
+    {
+      "epoch": 0.06510553397038749,
+      "grad_norm": 0.5175902843475342,
+      "learning_rate": 0.000129979035639413,
+      "loss": 0.2306,
+      "step": 310
+    },
+    {
+      "epoch": 0.06531555182190486,
+      "grad_norm": 0.39933711290359497,
+      "learning_rate": 0.00013039832285115306,
+      "loss": 0.2018,
+      "step": 311
+    },
+    {
+      "epoch": 0.06552556967342224,
+      "grad_norm": 0.6203914284706116,
+      "learning_rate": 0.00013081761006289308,
+      "loss": 0.2519,
+      "step": 312
+    },
+    {
+      "epoch": 0.06573558752493962,
+      "grad_norm": 0.6847423911094666,
+      "learning_rate": 0.00013123689727463314,
+      "loss": 0.2125,
+      "step": 313
+    },
+    {
+      "epoch": 0.065945605376457,
+      "grad_norm": 0.5958030223846436,
+      "learning_rate": 0.0001316561844863732,
+      "loss": 0.2019,
+      "step": 314
+    },
+    {
+      "epoch": 0.06615562322797437,
+      "grad_norm": 0.4878827631473541,
+      "learning_rate": 0.0001320754716981132,
+      "loss": 0.2286,
+      "step": 315
+    },
+    {
+      "epoch": 0.06636564107949176,
+      "grad_norm": 0.5386853814125061,
+      "learning_rate": 0.00013249475890985326,
+      "loss": 0.2349,
+      "step": 316
+    },
+    {
+      "epoch": 0.06657565893100914,
+      "grad_norm": 0.5583687424659729,
+      "learning_rate": 0.00013291404612159332,
+      "loss": 0.273,
+      "step": 317
+    },
+    {
+      "epoch": 0.06678567678252652,
+      "grad_norm": 0.503384530544281,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 0.2718,
+      "step": 318
+    },
+    {
+      "epoch": 0.0669956946340439,
+      "grad_norm": 0.6256868839263916,
+      "learning_rate": 0.0001337526205450734,
+      "loss": 0.2176,
+      "step": 319
+    },
+    {
+      "epoch": 0.06720571248556127,
+      "grad_norm": 0.4585525095462799,
+      "learning_rate": 0.00013417190775681344,
+      "loss": 0.1671,
+      "step": 320
+    },
+    {
+      "epoch": 0.06741573033707865,
+      "grad_norm": 0.52493816614151,
+      "learning_rate": 0.00013459119496855347,
+      "loss": 0.2129,
+      "step": 321
+    },
+    {
+      "epoch": 0.06762574818859603,
+      "grad_norm": 0.7206648588180542,
+      "learning_rate": 0.00013501048218029352,
+      "loss": 0.1872,
+      "step": 322
+    },
+    {
+      "epoch": 0.0678357660401134,
+      "grad_norm": 0.5732535123825073,
+      "learning_rate": 0.00013542976939203354,
+      "loss": 0.2131,
+      "step": 323
+    },
+    {
+      "epoch": 0.0680457838916308,
+      "grad_norm": 0.5404482483863831,
+      "learning_rate": 0.0001358490566037736,
+      "loss": 0.1472,
+      "step": 324
+    },
+    {
+      "epoch": 0.06825580174314817,
+      "grad_norm": 0.7235817313194275,
+      "learning_rate": 0.00013626834381551362,
+      "loss": 0.2404,
+      "step": 325
+    },
+    {
+      "epoch": 0.06846581959466555,
+      "grad_norm": 0.4254133999347687,
+      "learning_rate": 0.00013668763102725367,
+      "loss": 0.2133,
+      "step": 326
+    },
+    {
+      "epoch": 0.06867583744618293,
+      "grad_norm": 0.4804741144180298,
+      "learning_rate": 0.0001371069182389937,
+      "loss": 0.1776,
+      "step": 327
+    },
+    {
+      "epoch": 0.0688858552977003,
+      "grad_norm": 0.4900747537612915,
+      "learning_rate": 0.00013752620545073375,
+      "loss": 0.1958,
+      "step": 328
+    },
+    {
+      "epoch": 0.06909587314921768,
+      "grad_norm": 0.576337456703186,
+      "learning_rate": 0.0001379454926624738,
+      "loss": 0.2318,
+      "step": 329
+    },
+    {
+      "epoch": 0.06930589100073506,
+      "grad_norm": 0.5610971450805664,
+      "learning_rate": 0.00013836477987421382,
+      "loss": 0.2631,
+      "step": 330
+    },
+    {
+      "epoch": 0.06951590885225244,
+      "grad_norm": 0.6010019779205322,
+      "learning_rate": 0.00013878406708595388,
+      "loss": 0.2201,
+      "step": 331
+    },
+    {
+      "epoch": 0.06972592670376981,
+      "grad_norm": 0.4658229947090149,
+      "learning_rate": 0.00013920335429769393,
+      "loss": 0.1602,
+      "step": 332
+    },
+    {
+      "epoch": 0.0699359445552872,
+      "grad_norm": 0.5411532521247864,
+      "learning_rate": 0.00013962264150943395,
+      "loss": 0.1942,
+      "step": 333
+    },
+    {
+      "epoch": 0.07014596240680458,
+      "grad_norm": 0.875629186630249,
+      "learning_rate": 0.000140041928721174,
+      "loss": 0.2337,
+      "step": 334
+    },
+    {
+      "epoch": 0.07035598025832196,
+      "grad_norm": 0.5620985627174377,
+      "learning_rate": 0.00014046121593291406,
+      "loss": 0.2641,
+      "step": 335
+    },
+    {
+      "epoch": 0.07056599810983934,
+      "grad_norm": 0.8389297723770142,
+      "learning_rate": 0.00014088050314465408,
+      "loss": 0.3069,
+      "step": 336
+    },
+    {
+      "epoch": 0.07077601596135671,
+      "grad_norm": 0.4745865762233734,
+      "learning_rate": 0.00014129979035639413,
+      "loss": 0.1626,
+      "step": 337
+    },
+    {
+      "epoch": 0.07098603381287409,
+      "grad_norm": 0.4688374996185303,
+      "learning_rate": 0.00014171907756813418,
+      "loss": 0.1327,
+      "step": 338
+    },
+    {
+      "epoch": 0.07119605166439147,
+      "grad_norm": 0.4219890832901001,
+      "learning_rate": 0.0001421383647798742,
+      "loss": 0.142,
+      "step": 339
+    },
+    {
+      "epoch": 0.07140606951590885,
+      "grad_norm": 0.700579047203064,
+      "learning_rate": 0.00014255765199161426,
+      "loss": 0.179,
+      "step": 340
+    },
+    {
+      "epoch": 0.07161608736742624,
+      "grad_norm": 0.36132583022117615,
+      "learning_rate": 0.0001429769392033543,
+      "loss": 0.1283,
+      "step": 341
+    },
+    {
+      "epoch": 0.07182610521894361,
+      "grad_norm": 0.9342030882835388,
+      "learning_rate": 0.00014339622641509434,
+      "loss": 0.3873,
+      "step": 342
+    },
+    {
+      "epoch": 0.07203612307046099,
+      "grad_norm": 0.6389639973640442,
+      "learning_rate": 0.0001438155136268344,
+      "loss": 0.2631,
+      "step": 343
+    },
+    {
+      "epoch": 0.07224614092197837,
+      "grad_norm": 0.7687662243843079,
+      "learning_rate": 0.00014423480083857444,
+      "loss": 0.2002,
+      "step": 344
+    },
+    {
+      "epoch": 0.07245615877349575,
+      "grad_norm": 0.6517148613929749,
+      "learning_rate": 0.00014465408805031446,
+      "loss": 0.2454,
+      "step": 345
+    },
+    {
+      "epoch": 0.07266617662501312,
+      "grad_norm": 0.5010355710983276,
+      "learning_rate": 0.00014507337526205452,
+      "loss": 0.1541,
+      "step": 346
+    },
+    {
+      "epoch": 0.0728761944765305,
+      "grad_norm": 0.49431943893432617,
+      "learning_rate": 0.00014549266247379457,
+      "loss": 0.213,
+      "step": 347
+    },
+    {
+      "epoch": 0.07308621232804788,
+      "grad_norm": 0.6462149024009705,
+      "learning_rate": 0.0001459119496855346,
+      "loss": 0.2642,
+      "step": 348
+    },
+    {
+      "epoch": 0.07329623017956527,
+      "grad_norm": 0.5412748456001282,
+      "learning_rate": 0.00014633123689727464,
+      "loss": 0.187,
+      "step": 349
+    },
+    {
+      "epoch": 0.07350624803108265,
+      "grad_norm": 0.6458069682121277,
+      "learning_rate": 0.0001467505241090147,
+      "loss": 0.3224,
+      "step": 350
+    },
+    {
+      "epoch": 0.07371626588260002,
+      "grad_norm": 0.4398702383041382,
+      "learning_rate": 0.00014716981132075472,
+      "loss": 0.2419,
+      "step": 351
+    },
+    {
+      "epoch": 0.0739262837341174,
+      "grad_norm": 0.47583240270614624,
+      "learning_rate": 0.00014758909853249477,
+      "loss": 0.2259,
+      "step": 352
+    },
+    {
+      "epoch": 0.07413630158563478,
+      "grad_norm": 0.5058132410049438,
+      "learning_rate": 0.00014800838574423482,
+      "loss": 0.3733,
+      "step": 353
+    },
+    {
+      "epoch": 0.07434631943715216,
+      "grad_norm": 0.4765789210796356,
+      "learning_rate": 0.00014842767295597485,
+      "loss": 0.3204,
+      "step": 354
+    },
+    {
+      "epoch": 0.07455633728866953,
+      "grad_norm": 0.4549868106842041,
+      "learning_rate": 0.0001488469601677149,
+      "loss": 0.2788,
+      "step": 355
+    },
+    {
+      "epoch": 0.07476635514018691,
+      "grad_norm": 0.44640183448791504,
+      "learning_rate": 0.00014926624737945495,
+      "loss": 0.225,
+      "step": 356
+    },
+    {
+      "epoch": 0.0749763729917043,
+      "grad_norm": 0.5040209293365479,
+      "learning_rate": 0.00014968553459119498,
+      "loss": 0.288,
+      "step": 357
+    },
+    {
+      "epoch": 0.07518639084322168,
+      "grad_norm": 0.7681525349617004,
+      "learning_rate": 0.00015010482180293503,
+      "loss": 0.2677,
+      "step": 358
+    },
+    {
+      "epoch": 0.07539640869473906,
+      "grad_norm": 0.3658473491668701,
+      "learning_rate": 0.00015052410901467505,
+      "loss": 0.1944,
+      "step": 359
+    },
+    {
+      "epoch": 0.07560642654625643,
+      "grad_norm": 0.5071917772293091,
+      "learning_rate": 0.0001509433962264151,
+      "loss": 0.1937,
+      "step": 360
+    },
+    {
+      "epoch": 0.07581644439777381,
+      "grad_norm": 0.5669259428977966,
+      "learning_rate": 0.00015136268343815513,
+      "loss": 0.2755,
+      "step": 361
+    },
+    {
+      "epoch": 0.07602646224929119,
+      "grad_norm": 0.5721021294593811,
+      "learning_rate": 0.00015178197064989518,
+      "loss": 0.2233,
+      "step": 362
+    },
+    {
+      "epoch": 0.07623648010080857,
+      "grad_norm": 0.5776953101158142,
+      "learning_rate": 0.00015220125786163523,
+      "loss": 0.2918,
+      "step": 363
+    },
+    {
+      "epoch": 0.07644649795232594,
+      "grad_norm": 0.7863572239875793,
+      "learning_rate": 0.00015262054507337526,
+      "loss": 0.3225,
+      "step": 364
+    },
+    {
+      "epoch": 0.07665651580384332,
+      "grad_norm": 0.7403888702392578,
+      "learning_rate": 0.0001530398322851153,
+      "loss": 0.1836,
+      "step": 365
+    },
+    {
+      "epoch": 0.07686653365536071,
+      "grad_norm": 0.7344810962677002,
+      "learning_rate": 0.00015345911949685536,
+      "loss": 0.342,
+      "step": 366
+    },
+    {
+      "epoch": 0.07707655150687809,
+      "grad_norm": 0.6341666579246521,
+      "learning_rate": 0.00015387840670859538,
+      "loss": 0.2222,
+      "step": 367
+    },
+    {
+      "epoch": 0.07728656935839547,
+      "grad_norm": 0.7821016907691956,
+      "learning_rate": 0.00015429769392033544,
+      "loss": 0.3106,
+      "step": 368
+    },
+    {
+      "epoch": 0.07749658720991284,
+      "grad_norm": 0.5648399591445923,
+      "learning_rate": 0.0001547169811320755,
+      "loss": 0.1907,
+      "step": 369
+    },
+    {
+      "epoch": 0.07770660506143022,
+      "grad_norm": 0.5853981971740723,
+      "learning_rate": 0.0001551362683438155,
+      "loss": 0.1873,
+      "step": 370
+    },
+    {
+      "epoch": 0.0779166229129476,
+      "grad_norm": 0.6429926753044128,
+      "learning_rate": 0.00015555555555555556,
+      "loss": 0.177,
+      "step": 371
+    },
+    {
+      "epoch": 0.07812664076446497,
+      "grad_norm": 0.5365523099899292,
+      "learning_rate": 0.00015597484276729561,
+      "loss": 0.2283,
+      "step": 372
+    },
+    {
+      "epoch": 0.07833665861598235,
+      "grad_norm": 0.4820340871810913,
+      "learning_rate": 0.00015639412997903564,
+      "loss": 0.2179,
+      "step": 373
+    },
+    {
+      "epoch": 0.07854667646749974,
+      "grad_norm": 0.5231903195381165,
+      "learning_rate": 0.0001568134171907757,
+      "loss": 0.2165,
+      "step": 374
+    },
+    {
+      "epoch": 0.07875669431901712,
+      "grad_norm": 0.6309874057769775,
+      "learning_rate": 0.00015723270440251574,
+      "loss": 0.2511,
+      "step": 375
+    },
+    {
+      "epoch": 0.0789667121705345,
+      "grad_norm": 0.6248964667320251,
+      "learning_rate": 0.00015765199161425577,
+      "loss": 0.192,
+      "step": 376
+    },
+    {
+      "epoch": 0.07917673002205187,
+      "grad_norm": 0.4089469611644745,
+      "learning_rate": 0.00015807127882599582,
+      "loss": 0.1674,
+      "step": 377
+    },
+    {
+      "epoch": 0.07938674787356925,
+      "grad_norm": 0.5720129609107971,
+      "learning_rate": 0.00015849056603773587,
+      "loss": 0.2571,
+      "step": 378
+    },
+    {
+      "epoch": 0.07959676572508663,
+      "grad_norm": 0.505424976348877,
+      "learning_rate": 0.0001589098532494759,
+      "loss": 0.2189,
+      "step": 379
+    },
+    {
+      "epoch": 0.079806783576604,
+      "grad_norm": 0.4483712315559387,
+      "learning_rate": 0.00015932914046121595,
+      "loss": 0.2959,
+      "step": 380
+    },
+    {
+      "epoch": 0.08001680142812138,
+      "grad_norm": 0.6313521862030029,
+      "learning_rate": 0.000159748427672956,
+      "loss": 0.237,
+      "step": 381
+    },
+    {
+      "epoch": 0.08022681927963878,
+      "grad_norm": 0.530503511428833,
+      "learning_rate": 0.00016016771488469602,
+      "loss": 0.1922,
+      "step": 382
+    },
+    {
+      "epoch": 0.08043683713115615,
+      "grad_norm": 0.65278160572052,
+      "learning_rate": 0.00016058700209643607,
+      "loss": 0.2714,
+      "step": 383
+    },
+    {
+      "epoch": 0.08064685498267353,
+      "grad_norm": 0.6226363182067871,
+      "learning_rate": 0.00016100628930817613,
+      "loss": 0.1913,
+      "step": 384
+    },
+    {
+      "epoch": 0.08085687283419091,
+      "grad_norm": 0.6313908696174622,
+      "learning_rate": 0.00016142557651991615,
+      "loss": 0.2457,
+      "step": 385
+    },
+    {
+      "epoch": 0.08106689068570828,
+      "grad_norm": 0.5335121750831604,
+      "learning_rate": 0.0001618448637316562,
+      "loss": 0.2537,
+      "step": 386
+    },
+    {
+      "epoch": 0.08127690853722566,
+      "grad_norm": 0.7243566513061523,
+      "learning_rate": 0.00016226415094339625,
+      "loss": 0.2125,
+      "step": 387
+    },
+    {
+      "epoch": 0.08148692638874304,
+      "grad_norm": 0.5874237418174744,
+      "learning_rate": 0.00016268343815513628,
+      "loss": 0.2104,
+      "step": 388
+    },
+    {
+      "epoch": 0.08169694424026042,
+      "grad_norm": 0.5792878866195679,
+      "learning_rate": 0.00016310272536687633,
+      "loss": 0.198,
+      "step": 389
+    },
+    {
+      "epoch": 0.08190696209177781,
+      "grad_norm": 0.5439760088920593,
+      "learning_rate": 0.00016352201257861635,
+      "loss": 0.1895,
+      "step": 390
+    },
+    {
+      "epoch": 0.08211697994329518,
+      "grad_norm": 0.6903837323188782,
+      "learning_rate": 0.0001639412997903564,
+      "loss": 0.2587,
+      "step": 391
+    },
+    {
+      "epoch": 0.08232699779481256,
+      "grad_norm": 0.6126405596733093,
+      "learning_rate": 0.00016436058700209643,
+      "loss": 0.2232,
+      "step": 392
+    },
+    {
+      "epoch": 0.08253701564632994,
+      "grad_norm": 0.9248547554016113,
+      "learning_rate": 0.00016477987421383648,
+      "loss": 0.267,
+      "step": 393
+    },
+    {
+      "epoch": 0.08274703349784732,
+      "grad_norm": 0.6509301066398621,
+      "learning_rate": 0.0001651991614255765,
+      "loss": 0.2109,
+      "step": 394
+    },
+    {
+      "epoch": 0.0829570513493647,
+      "grad_norm": 0.5985137820243835,
+      "learning_rate": 0.00016561844863731656,
+      "loss": 0.1783,
+      "step": 395
+    },
+    {
+      "epoch": 0.08316706920088207,
+      "grad_norm": 0.6711693406105042,
+      "learning_rate": 0.0001660377358490566,
+      "loss": 0.2115,
+      "step": 396
+    },
+    {
+      "epoch": 0.08337708705239945,
+      "grad_norm": 0.4494445025920868,
+      "learning_rate": 0.00016645702306079664,
+      "loss": 0.1486,
+      "step": 397
+    },
+    {
+      "epoch": 0.08358710490391684,
+      "grad_norm": 0.5083547830581665,
+      "learning_rate": 0.0001668763102725367,
+      "loss": 0.2317,
+      "step": 398
+    },
+    {
+      "epoch": 0.08379712275543422,
+      "grad_norm": 0.7552763819694519,
+      "learning_rate": 0.00016729559748427674,
+      "loss": 0.236,
+      "step": 399
+    },
+    {
+      "epoch": 0.0840071406069516,
+      "grad_norm": 0.7656201124191284,
+      "learning_rate": 0.00016771488469601676,
+      "loss": 0.2732,
+      "step": 400
+    },
+    {
+      "epoch": 0.08421715845846897,
+      "grad_norm": 0.3850518465042114,
+      "learning_rate": 0.00016813417190775681,
+      "loss": 0.3322,
+      "step": 401
+    },
+    {
+      "epoch": 0.08442717630998635,
+      "grad_norm": 0.5610989928245544,
+      "learning_rate": 0.00016855345911949687,
+      "loss": 0.2844,
+      "step": 402
+    },
+    {
+      "epoch": 0.08463719416150373,
+      "grad_norm": 0.7500874400138855,
+      "learning_rate": 0.0001689727463312369,
+      "loss": 0.3651,
+      "step": 403
+    },
+    {
+      "epoch": 0.0848472120130211,
+      "grad_norm": 0.45343145728111267,
+      "learning_rate": 0.00016939203354297694,
+      "loss": 0.2174,
+      "step": 404
+    },
+    {
+      "epoch": 0.08505722986453848,
+      "grad_norm": 0.6427581310272217,
+      "learning_rate": 0.000169811320754717,
+      "loss": 0.2748,
+      "step": 405
+    },
+    {
+      "epoch": 0.08526724771605586,
+      "grad_norm": 0.64598149061203,
+      "learning_rate": 0.00017023060796645702,
+      "loss": 0.2912,
+      "step": 406
+    },
+    {
+      "epoch": 0.08547726556757325,
+      "grad_norm": 0.49100759625434875,
+      "learning_rate": 0.00017064989517819707,
+      "loss": 0.2582,
+      "step": 407
+    },
+    {
+      "epoch": 0.08568728341909063,
+      "grad_norm": 0.5637136101722717,
+      "learning_rate": 0.00017106918238993712,
+      "loss": 0.254,
+      "step": 408
+    },
+    {
+      "epoch": 0.085897301270608,
+      "grad_norm": 0.5617924928665161,
+      "learning_rate": 0.00017148846960167715,
+      "loss": 0.2043,
+      "step": 409
+    },
+    {
+      "epoch": 0.08610731912212538,
+      "grad_norm": 0.5467379093170166,
+      "learning_rate": 0.0001719077568134172,
+      "loss": 0.2363,
+      "step": 410
+    },
+    {
+      "epoch": 0.08631733697364276,
+      "grad_norm": 0.6882631778717041,
+      "learning_rate": 0.00017232704402515725,
+      "loss": 0.2341,
+      "step": 411
+    },
+    {
+      "epoch": 0.08652735482516014,
+      "grad_norm": 0.40710386633872986,
+      "learning_rate": 0.00017274633123689727,
+      "loss": 0.1952,
+      "step": 412
+    },
+    {
+      "epoch": 0.08673737267667751,
+      "grad_norm": 0.688685953617096,
+      "learning_rate": 0.00017316561844863733,
+      "loss": 0.3588,
+      "step": 413
+    },
+    {
+      "epoch": 0.08694739052819489,
+      "grad_norm": 0.7739083170890808,
+      "learning_rate": 0.00017358490566037738,
+      "loss": 0.1897,
+      "step": 414
+    },
+    {
+      "epoch": 0.08715740837971228,
+      "grad_norm": 0.45127734541893005,
+      "learning_rate": 0.0001740041928721174,
+      "loss": 0.2282,
+      "step": 415
+    },
+    {
+      "epoch": 0.08736742623122966,
+      "grad_norm": 0.6713837385177612,
+      "learning_rate": 0.00017442348008385745,
+      "loss": 0.3395,
+      "step": 416
+    },
+    {
+      "epoch": 0.08757744408274704,
+      "grad_norm": 0.5886412858963013,
+      "learning_rate": 0.0001748427672955975,
+      "loss": 0.1673,
+      "step": 417
+    },
+    {
+      "epoch": 0.08778746193426441,
+      "grad_norm": 0.6254634261131287,
+      "learning_rate": 0.00017526205450733753,
+      "loss": 0.2392,
+      "step": 418
+    },
+    {
+      "epoch": 0.08799747978578179,
+      "grad_norm": 0.5936654806137085,
+      "learning_rate": 0.00017568134171907758,
+      "loss": 0.1817,
+      "step": 419
+    },
+    {
+      "epoch": 0.08820749763729917,
+      "grad_norm": 0.6107873320579529,
+      "learning_rate": 0.00017610062893081763,
+      "loss": 0.2877,
+      "step": 420
+    },
+    {
+      "epoch": 0.08841751548881654,
+      "grad_norm": 0.583984911441803,
+      "learning_rate": 0.00017651991614255766,
+      "loss": 0.2382,
+      "step": 421
+    },
+    {
+      "epoch": 0.08862753334033392,
+      "grad_norm": 0.6411318778991699,
+      "learning_rate": 0.0001769392033542977,
+      "loss": 0.2528,
+      "step": 422
+    },
+    {
+      "epoch": 0.08883755119185131,
+      "grad_norm": 0.5407703518867493,
+      "learning_rate": 0.00017735849056603776,
+      "loss": 0.217,
+      "step": 423
+    },
+    {
+      "epoch": 0.08904756904336869,
+      "grad_norm": 0.5086292028427124,
+      "learning_rate": 0.00017777777777777779,
+      "loss": 0.1703,
+      "step": 424
+    },
+    {
+      "epoch": 0.08925758689488607,
+      "grad_norm": 0.534488320350647,
+      "learning_rate": 0.00017819706498951784,
+      "loss": 0.4212,
+      "step": 425
+    },
+    {
+      "epoch": 0.08946760474640345,
+      "grad_norm": 0.5869336724281311,
+      "learning_rate": 0.00017861635220125786,
+      "loss": 0.3634,
+      "step": 426
+    },
+    {
+      "epoch": 0.08967762259792082,
+      "grad_norm": 0.5784481763839722,
+      "learning_rate": 0.00017903563941299791,
+      "loss": 0.2076,
+      "step": 427
+    },
+    {
+      "epoch": 0.0898876404494382,
+      "grad_norm": 0.467438668012619,
+      "learning_rate": 0.00017945492662473794,
+      "loss": 0.1948,
+      "step": 428
+    },
+    {
+      "epoch": 0.09009765830095558,
+      "grad_norm": 0.8514359593391418,
+      "learning_rate": 0.000179874213836478,
+      "loss": 0.2695,
+      "step": 429
+    },
+    {
+      "epoch": 0.09030767615247295,
+      "grad_norm": 0.630066990852356,
+      "learning_rate": 0.00018029350104821801,
+      "loss": 0.2624,
+      "step": 430
+    },
+    {
+      "epoch": 0.09051769400399035,
+      "grad_norm": 0.6442775130271912,
+      "learning_rate": 0.00018071278825995807,
+      "loss": 0.2555,
+      "step": 431
+    },
+    {
+      "epoch": 0.09072771185550772,
+      "grad_norm": 0.6193580031394958,
+      "learning_rate": 0.00018113207547169812,
+      "loss": 0.2388,
+      "step": 432
+    },
+    {
+      "epoch": 0.0909377297070251,
+      "grad_norm": 1.108219027519226,
+      "learning_rate": 0.00018155136268343814,
+      "loss": 0.2135,
+      "step": 433
+    },
+    {
+      "epoch": 0.09114774755854248,
+      "grad_norm": 0.666748046875,
+      "learning_rate": 0.0001819706498951782,
+      "loss": 0.2402,
+      "step": 434
+    },
+    {
+      "epoch": 0.09135776541005985,
+      "grad_norm": 0.516096293926239,
+      "learning_rate": 0.00018238993710691825,
+      "loss": 0.2022,
+      "step": 435
+    },
+    {
+      "epoch": 0.09156778326157723,
+      "grad_norm": 0.4976787269115448,
+      "learning_rate": 0.00018280922431865827,
+      "loss": 0.1492,
+      "step": 436
+    },
+    {
+      "epoch": 0.09177780111309461,
+      "grad_norm": 0.596254289150238,
+      "learning_rate": 0.00018322851153039832,
+      "loss": 0.1926,
+      "step": 437
+    },
+    {
+      "epoch": 0.09198781896461199,
+      "grad_norm": 0.4079163670539856,
+      "learning_rate": 0.00018364779874213837,
+      "loss": 0.219,
+      "step": 438
+    },
+    {
+      "epoch": 0.09219783681612938,
+      "grad_norm": 0.4968511164188385,
+      "learning_rate": 0.00018406708595387843,
+      "loss": 0.2203,
+      "step": 439
+    },
+    {
+      "epoch": 0.09240785466764675,
+      "grad_norm": 0.5749839544296265,
+      "learning_rate": 0.00018448637316561845,
+      "loss": 0.2561,
+      "step": 440
+    },
+    {
+      "epoch": 0.09261787251916413,
+      "grad_norm": 0.46315014362335205,
+      "learning_rate": 0.0001849056603773585,
+      "loss": 0.1608,
+      "step": 441
+    },
+    {
+      "epoch": 0.09282789037068151,
+      "grad_norm": 0.4630315601825714,
+      "learning_rate": 0.00018532494758909855,
+      "loss": 0.1564,
+      "step": 442
+    },
+    {
+      "epoch": 0.09303790822219889,
+      "grad_norm": 0.5688292384147644,
+      "learning_rate": 0.00018574423480083858,
+      "loss": 0.1483,
+      "step": 443
+    },
+    {
+      "epoch": 0.09324792607371626,
+      "grad_norm": 0.9025551676750183,
+      "learning_rate": 0.00018616352201257863,
+      "loss": 0.216,
+      "step": 444
+    },
+    {
+      "epoch": 0.09345794392523364,
+      "grad_norm": 0.6165971755981445,
+      "learning_rate": 0.00018658280922431868,
+      "loss": 0.1852,
+      "step": 445
+    },
+    {
+      "epoch": 0.09366796177675102,
+      "grad_norm": 0.5040764808654785,
+      "learning_rate": 0.0001870020964360587,
+      "loss": 0.1534,
+      "step": 446
+    },
+    {
+      "epoch": 0.0938779796282684,
+      "grad_norm": 0.6921994686126709,
+      "learning_rate": 0.00018742138364779876,
+      "loss": 0.2739,
+      "step": 447
+    },
+    {
+      "epoch": 0.09408799747978579,
+      "grad_norm": 0.9911003708839417,
+      "learning_rate": 0.0001878406708595388,
+      "loss": 0.2148,
+      "step": 448
+    },
+    {
+      "epoch": 0.09429801533130316,
+      "grad_norm": 0.4098629951477051,
+      "learning_rate": 0.00018825995807127883,
+      "loss": 0.1627,
+      "step": 449
+    },
+    {
+      "epoch": 0.09450803318282054,
+      "grad_norm": 0.5267736315727234,
+      "learning_rate": 0.00018867924528301889,
+      "loss": 0.1714,
+      "step": 450
+    },
+    {
+      "epoch": 0.09471805103433792,
+      "grad_norm": 0.826693058013916,
+      "learning_rate": 0.00018909853249475894,
+      "loss": 0.3614,
+      "step": 451
+    },
+    {
+      "epoch": 0.0949280688858553,
+      "grad_norm": 0.7960173487663269,
+      "learning_rate": 0.00018951781970649896,
+      "loss": 0.2831,
+      "step": 452
+    },
+    {
+      "epoch": 0.09513808673737267,
+      "grad_norm": 0.5408324003219604,
+      "learning_rate": 0.00018993710691823901,
+      "loss": 0.2396,
+      "step": 453
+    },
+    {
+      "epoch": 0.09534810458889005,
+      "grad_norm": 0.5551522374153137,
+      "learning_rate": 0.00019035639412997907,
+      "loss": 0.2399,
+      "step": 454
+    },
+    {
+      "epoch": 0.09555812244040743,
+      "grad_norm": 0.5053918361663818,
+      "learning_rate": 0.0001907756813417191,
+      "loss": 0.2054,
+      "step": 455
+    },
+    {
+      "epoch": 0.09576814029192482,
+      "grad_norm": 0.6408351063728333,
+      "learning_rate": 0.00019119496855345914,
+      "loss": 0.2319,
+      "step": 456
+    },
+    {
+      "epoch": 0.0959781581434422,
+      "grad_norm": 0.6061432361602783,
+      "learning_rate": 0.0001916142557651992,
+      "loss": 0.2289,
+      "step": 457
+    },
+    {
+      "epoch": 0.09618817599495957,
+      "grad_norm": 0.6452487111091614,
+      "learning_rate": 0.00019203354297693922,
+      "loss": 0.2787,
+      "step": 458
+    },
+    {
+      "epoch": 0.09639819384647695,
+      "grad_norm": 0.5427165627479553,
+      "learning_rate": 0.00019245283018867927,
+      "loss": 0.3181,
+      "step": 459
+    },
+    {
+      "epoch": 0.09660821169799433,
+      "grad_norm": 0.5678632259368896,
+      "learning_rate": 0.0001928721174004193,
+      "loss": 0.3166,
+      "step": 460
+    },
+    {
+      "epoch": 0.0968182295495117,
+      "grad_norm": 0.554288923740387,
+      "learning_rate": 0.00019329140461215935,
+      "loss": 0.214,
+      "step": 461
+    },
+    {
+      "epoch": 0.09702824740102908,
+      "grad_norm": 0.7040925621986389,
+      "learning_rate": 0.00019371069182389937,
+      "loss": 0.473,
+      "step": 462
+    },
+    {
+      "epoch": 0.09723826525254646,
+      "grad_norm": 0.6425243020057678,
+      "learning_rate": 0.00019412997903563942,
+      "loss": 0.3542,
+      "step": 463
+    },
+    {
+      "epoch": 0.09744828310406385,
+      "grad_norm": 0.6984371542930603,
+      "learning_rate": 0.00019454926624737945,
+      "loss": 0.2165,
+      "step": 464
+    },
+    {
+      "epoch": 0.09765830095558123,
+      "grad_norm": 0.5204288959503174,
+      "learning_rate": 0.0001949685534591195,
+      "loss": 0.232,
+      "step": 465
+    },
+    {
+      "epoch": 0.0978683188070986,
+      "grad_norm": 0.5688004493713379,
+      "learning_rate": 0.00019538784067085955,
+      "loss": 0.3099,
+      "step": 466
+    },
+    {
+      "epoch": 0.09807833665861598,
+      "grad_norm": 0.4850284159183502,
+      "learning_rate": 0.00019580712788259957,
+      "loss": 0.202,
+      "step": 467
+    },
+    {
+      "epoch": 0.09828835451013336,
+      "grad_norm": 0.5034931302070618,
+      "learning_rate": 0.00019622641509433963,
+      "loss": 0.2077,
+      "step": 468
+    },
+    {
+      "epoch": 0.09849837236165074,
+      "grad_norm": 0.6193839311599731,
+      "learning_rate": 0.00019664570230607968,
+      "loss": 0.318,
+      "step": 469
+    },
+    {
+      "epoch": 0.09870839021316812,
+      "grad_norm": 0.6226887702941895,
+      "learning_rate": 0.0001970649895178197,
+      "loss": 0.3538,
+      "step": 470
+    },
+    {
+      "epoch": 0.09891840806468549,
+      "grad_norm": 0.6102244257926941,
+      "learning_rate": 0.00019748427672955975,
+      "loss": 0.2865,
+      "step": 471
+    },
+    {
+      "epoch": 0.09912842591620288,
+      "grad_norm": 0.6731789112091064,
+      "learning_rate": 0.0001979035639412998,
+      "loss": 0.3135,
+      "step": 472
+    },
+    {
+      "epoch": 0.09933844376772026,
+      "grad_norm": 0.661486029624939,
+      "learning_rate": 0.00019832285115303983,
+      "loss": 0.2198,
+      "step": 473
+    },
+    {
+      "epoch": 0.09954846161923764,
+      "grad_norm": 0.7321748733520508,
+      "learning_rate": 0.00019874213836477988,
+      "loss": 0.2557,
+      "step": 474
+    },
+    {
+      "epoch": 0.09975847947075502,
+      "grad_norm": 0.5708514451980591,
+      "learning_rate": 0.00019916142557651993,
+      "loss": 0.2385,
+      "step": 475
+    },
+    {
+      "epoch": 0.09996849732227239,
+      "grad_norm": 0.8140966892242432,
+      "learning_rate": 0.00019958071278825996,
+      "loss": 0.1724,
+      "step": 476
+    },
+    {
+      "epoch": 0.10017851517378977,
+      "grad_norm": 0.5185543298721313,
+      "learning_rate": 0.0002,
+      "loss": 0.192,
+      "step": 477
+    },
+    {
+      "epoch": 0.10038853302530715,
+      "grad_norm": 0.7630559802055359,
+      "learning_rate": 0.00019999999396812126,
+      "loss": 0.3107,
+      "step": 478
+    },
+    {
+      "epoch": 0.10059855087682452,
+      "grad_norm": 0.5256696939468384,
+      "learning_rate": 0.00019999997587248573,
+      "loss": 0.2057,
+      "step": 479
+    },
+    {
+      "epoch": 0.10080856872834192,
+      "grad_norm": 0.5820131301879883,
+      "learning_rate": 0.0001999999457130956,
+      "loss": 0.2444,
+      "step": 480
+    },
+    {
+      "epoch": 0.10101858657985929,
+      "grad_norm": 0.6161417365074158,
+      "learning_rate": 0.00019999990348995456,
+      "loss": 0.1882,
+      "step": 481
+    },
+    {
+      "epoch": 0.10122860443137667,
+      "grad_norm": 0.5549945831298828,
+      "learning_rate": 0.00019999984920306764,
+      "loss": 0.3101,
+      "step": 482
+    },
+    {
+      "epoch": 0.10143862228289405,
+      "grad_norm": 0.8708590269088745,
+      "learning_rate": 0.00019999978285244142,
+      "loss": 0.2377,
+      "step": 483
+    },
+    {
+      "epoch": 0.10164864013441142,
+      "grad_norm": 0.5110476016998291,
+      "learning_rate": 0.00019999970443808387,
+      "loss": 0.1476,
+      "step": 484
+    },
+    {
+      "epoch": 0.1018586579859288,
+      "grad_norm": 1.1280276775360107,
+      "learning_rate": 0.0001999996139600045,
+      "loss": 0.1595,
+      "step": 485
+    },
+    {
+      "epoch": 0.10206867583744618,
+      "grad_norm": 0.7876203656196594,
+      "learning_rate": 0.0001999995114182142,
+      "loss": 0.229,
+      "step": 486
+    },
+    {
+      "epoch": 0.10227869368896356,
+      "grad_norm": 0.7196666598320007,
+      "learning_rate": 0.00019999939681272536,
+      "loss": 0.1838,
+      "step": 487
+    },
+    {
+      "epoch": 0.10248871154048093,
+      "grad_norm": 0.6737300157546997,
+      "learning_rate": 0.00019999927014355175,
+      "loss": 0.1786,
+      "step": 488
+    },
+    {
+      "epoch": 0.10269872939199833,
+      "grad_norm": 0.7758048176765442,
+      "learning_rate": 0.0001999991314107087,
+      "loss": 0.1847,
+      "step": 489
+    },
+    {
+      "epoch": 0.1029087472435157,
+      "grad_norm": 0.8189213871955872,
+      "learning_rate": 0.00019999898061421294,
+      "loss": 0.2842,
+      "step": 490
+    },
+    {
+      "epoch": 0.10311876509503308,
+      "grad_norm": 0.5789510011672974,
+      "learning_rate": 0.00019999881775408263,
+      "loss": 0.2353,
+      "step": 491
+    },
+    {
+      "epoch": 0.10332878294655046,
+      "grad_norm": 0.808729350566864,
+      "learning_rate": 0.00019999864283033747,
+      "loss": 0.2481,
+      "step": 492
+    },
+    {
+      "epoch": 0.10353880079806783,
+      "grad_norm": 0.587478518486023,
+      "learning_rate": 0.00019999845584299855,
+      "loss": 0.1976,
+      "step": 493
+    },
+    {
+      "epoch": 0.10374881864958521,
+      "grad_norm": 0.7419194579124451,
+      "learning_rate": 0.00019999825679208839,
+      "loss": 0.2444,
+      "step": 494
+    },
+    {
+      "epoch": 0.10395883650110259,
+      "grad_norm": 0.6678702235221863,
+      "learning_rate": 0.000199998045677631,
+      "loss": 0.1752,
+      "step": 495
+    },
+    {
+      "epoch": 0.10416885435261997,
+      "grad_norm": 0.5477135181427002,
+      "learning_rate": 0.00019999782249965193,
+      "loss": 0.1176,
+      "step": 496
+    },
+    {
+      "epoch": 0.10437887220413736,
+      "grad_norm": 0.47613173723220825,
+      "learning_rate": 0.000199997587258178,
+      "loss": 0.1734,
+      "step": 497
+    },
+    {
+      "epoch": 0.10458889005565473,
+      "grad_norm": 0.8437466025352478,
+      "learning_rate": 0.0001999973399532377,
+      "loss": 0.2279,
+      "step": 498
+    },
+    {
+      "epoch": 0.10479890790717211,
+      "grad_norm": 0.7599924206733704,
+      "learning_rate": 0.00019999708058486074,
+      "loss": 0.209,
+      "step": 499
+    },
+    {
+      "epoch": 0.10500892575868949,
+      "grad_norm": 0.5578658580780029,
+      "learning_rate": 0.00019999680915307847,
+      "loss": 0.243,
+      "step": 500
+    },
+    {
+      "epoch": 0.10521894361020687,
+      "grad_norm": 0.5664511322975159,
+      "learning_rate": 0.00019999652565792368,
+      "loss": 0.2176,
+      "step": 501
+    },
+    {
+      "epoch": 0.10542896146172424,
+      "grad_norm": 0.5591540336608887,
+      "learning_rate": 0.0001999962300994305,
+      "loss": 0.3467,
+      "step": 502
+    },
+    {
+      "epoch": 0.10563897931324162,
+      "grad_norm": 0.5022396445274353,
+      "learning_rate": 0.0001999959224776346,
+      "loss": 0.2149,
+      "step": 503
+    },
+    {
+      "epoch": 0.105848997164759,
+      "grad_norm": 0.5846520662307739,
+      "learning_rate": 0.00019999560279257314,
+      "loss": 0.3388,
+      "step": 504
+    },
+    {
+      "epoch": 0.10605901501627639,
+      "grad_norm": 0.4137157201766968,
+      "learning_rate": 0.00019999527104428463,
+      "loss": 0.2223,
+      "step": 505
+    },
+    {
+      "epoch": 0.10626903286779377,
+      "grad_norm": 0.49332931637763977,
+      "learning_rate": 0.0001999949272328091,
+      "loss": 0.2679,
+      "step": 506
+    },
+    {
+      "epoch": 0.10647905071931114,
+      "grad_norm": 0.7095859050750732,
+      "learning_rate": 0.00019999457135818805,
+      "loss": 0.2681,
+      "step": 507
+    },
+    {
+      "epoch": 0.10668906857082852,
+      "grad_norm": 0.5563727021217346,
+      "learning_rate": 0.0001999942034204644,
+      "loss": 0.2695,
+      "step": 508
+    },
+    {
+      "epoch": 0.1068990864223459,
+      "grad_norm": 0.5464118719100952,
+      "learning_rate": 0.00019999382341968252,
+      "loss": 0.4308,
+      "step": 509
+    },
+    {
+      "epoch": 0.10710910427386328,
+      "grad_norm": 0.7822732329368591,
+      "learning_rate": 0.00019999343135588827,
+      "loss": 0.2458,
+      "step": 510
+    },
+    {
+      "epoch": 0.10731912212538065,
+      "grad_norm": 0.6268991231918335,
+      "learning_rate": 0.00019999302722912895,
+      "loss": 0.2877,
+      "step": 511
+    },
+    {
+      "epoch": 0.10752913997689803,
+      "grad_norm": 0.7860679626464844,
+      "learning_rate": 0.0001999926110394533,
+      "loss": 0.275,
+      "step": 512
+    },
+    {
+      "epoch": 0.10773915782841542,
+      "grad_norm": 0.5817549228668213,
+      "learning_rate": 0.00019999218278691158,
+      "loss": 0.2005,
+      "step": 513
+    },
+    {
+      "epoch": 0.1079491756799328,
+      "grad_norm": 0.8145076036453247,
+      "learning_rate": 0.00019999174247155535,
+      "loss": 0.2032,
+      "step": 514
+    },
+    {
+      "epoch": 0.10815919353145018,
+      "grad_norm": 0.7561895847320557,
+      "learning_rate": 0.0001999912900934378,
+      "loss": 0.2452,
+      "step": 515
+    },
+    {
+      "epoch": 0.10836921138296755,
+      "grad_norm": 0.8003164529800415,
+      "learning_rate": 0.0001999908256526135,
+      "loss": 0.2318,
+      "step": 516
+    },
+    {
+      "epoch": 0.10857922923448493,
+      "grad_norm": 0.6318978667259216,
+      "learning_rate": 0.0001999903491491385,
+      "loss": 0.2534,
+      "step": 517
+    },
+    {
+      "epoch": 0.10878924708600231,
+      "grad_norm": 0.5220886468887329,
+      "learning_rate": 0.00019998986058307022,
+      "loss": 0.2011,
+      "step": 518
+    },
+    {
+      "epoch": 0.10899926493751969,
+      "grad_norm": 0.5928252935409546,
+      "learning_rate": 0.00019998935995446763,
+      "loss": 0.2175,
+      "step": 519
+    },
+    {
+      "epoch": 0.10920928278903706,
+      "grad_norm": 0.7763411998748779,
+      "learning_rate": 0.00019998884726339116,
+      "loss": 0.2852,
+      "step": 520
+    },
+    {
+      "epoch": 0.10941930064055445,
+      "grad_norm": 0.7260156273841858,
+      "learning_rate": 0.00019998832250990264,
+      "loss": 0.2313,
+      "step": 521
+    },
+    {
+      "epoch": 0.10962931849207183,
+      "grad_norm": 0.7486017942428589,
+      "learning_rate": 0.0001999877856940653,
+      "loss": 0.2789,
+      "step": 522
+    },
+    {
+      "epoch": 0.10983933634358921,
+      "grad_norm": 0.6006895303726196,
+      "learning_rate": 0.00019998723681594402,
+      "loss": 0.1674,
+      "step": 523
+    },
+    {
+      "epoch": 0.11004935419510659,
+      "grad_norm": 0.7286220192909241,
+      "learning_rate": 0.00019998667587560495,
+      "loss": 0.2521,
+      "step": 524
+    },
+    {
+      "epoch": 0.11025937204662396,
+      "grad_norm": 0.7167160511016846,
+      "learning_rate": 0.00019998610287311574,
+      "loss": 0.2308,
+      "step": 525
+    },
+    {
+      "epoch": 0.11046938989814134,
+      "grad_norm": 0.6270986795425415,
+      "learning_rate": 0.00019998551780854557,
+      "loss": 0.2048,
+      "step": 526
+    },
+    {
+      "epoch": 0.11067940774965872,
+      "grad_norm": 0.7366195917129517,
+      "learning_rate": 0.000199984920681965,
+      "loss": 0.2239,
+      "step": 527
+    },
+    {
+      "epoch": 0.1108894256011761,
+      "grad_norm": 0.5709097981452942,
+      "learning_rate": 0.00019998431149344606,
+      "loss": 0.2074,
+      "step": 528
+    },
+    {
+      "epoch": 0.11109944345269347,
+      "grad_norm": 0.6718131899833679,
+      "learning_rate": 0.00019998369024306224,
+      "loss": 0.2902,
+      "step": 529
+    },
+    {
+      "epoch": 0.11130946130421086,
+      "grad_norm": 0.6741777658462524,
+      "learning_rate": 0.00019998305693088848,
+      "loss": 0.2638,
+      "step": 530
+    },
+    {
+      "epoch": 0.11151947915572824,
+      "grad_norm": 0.5218673944473267,
+      "learning_rate": 0.0001999824115570012,
+      "loss": 0.201,
+      "step": 531
+    },
+    {
+      "epoch": 0.11172949700724562,
+      "grad_norm": 0.6867272257804871,
+      "learning_rate": 0.00019998175412147824,
+      "loss": 0.2864,
+      "step": 532
+    },
+    {
+      "epoch": 0.111939514858763,
+      "grad_norm": 0.6319578289985657,
+      "learning_rate": 0.00019998108462439894,
+      "loss": 0.2277,
+      "step": 533
+    },
+    {
+      "epoch": 0.11214953271028037,
+      "grad_norm": 0.5601973533630371,
+      "learning_rate": 0.000199980403065844,
+      "loss": 0.2678,
+      "step": 534
+    },
+    {
+      "epoch": 0.11235955056179775,
+      "grad_norm": 0.5189068913459778,
+      "learning_rate": 0.00019997970944589572,
+      "loss": 0.2036,
+      "step": 535
+    },
+    {
+      "epoch": 0.11256956841331513,
+      "grad_norm": 0.7217200994491577,
+      "learning_rate": 0.00019997900376463778,
+      "loss": 0.2299,
+      "step": 536
+    },
+    {
+      "epoch": 0.1127795862648325,
+      "grad_norm": 0.6617181301116943,
+      "learning_rate": 0.0001999782860221552,
+      "loss": 0.2259,
+      "step": 537
+    },
+    {
+      "epoch": 0.1129896041163499,
+      "grad_norm": 0.6987117528915405,
+      "learning_rate": 0.0001999775562185347,
+      "loss": 0.1882,
+      "step": 538
+    },
+    {
+      "epoch": 0.11319962196786727,
+      "grad_norm": 0.4491863548755646,
+      "learning_rate": 0.00019997681435386422,
+      "loss": 0.1937,
+      "step": 539
+    },
+    {
+      "epoch": 0.11340963981938465,
+      "grad_norm": 0.5842171311378479,
+      "learning_rate": 0.00019997606042823334,
+      "loss": 0.2808,
+      "step": 540
+    },
+    {
+      "epoch": 0.11361965767090203,
+      "grad_norm": 0.7743870615959167,
+      "learning_rate": 0.00019997529444173293,
+      "loss": 0.2329,
+      "step": 541
+    },
+    {
+      "epoch": 0.1138296755224194,
+      "grad_norm": 0.5326593518257141,
+      "learning_rate": 0.00019997451639445547,
+      "loss": 0.188,
+      "step": 542
+    },
+    {
+      "epoch": 0.11403969337393678,
+      "grad_norm": 0.5364864468574524,
+      "learning_rate": 0.00019997372628649478,
+      "loss": 0.2294,
+      "step": 543
+    },
+    {
+      "epoch": 0.11424971122545416,
+      "grad_norm": 0.5609897375106812,
+      "learning_rate": 0.00019997292411794618,
+      "loss": 0.2108,
+      "step": 544
+    },
+    {
+      "epoch": 0.11445972907697154,
+      "grad_norm": 0.5446069836616516,
+      "learning_rate": 0.00019997210988890646,
+      "loss": 0.2577,
+      "step": 545
+    },
+    {
+      "epoch": 0.11466974692848893,
+      "grad_norm": 0.6916573643684387,
+      "learning_rate": 0.0001999712835994738,
+      "loss": 0.1976,
+      "step": 546
+    },
+    {
+      "epoch": 0.1148797647800063,
+      "grad_norm": 0.7029738426208496,
+      "learning_rate": 0.00019997044524974799,
+      "loss": 0.2076,
+      "step": 547
+    },
+    {
+      "epoch": 0.11508978263152368,
+      "grad_norm": 0.8003794550895691,
+      "learning_rate": 0.00019996959483983004,
+      "loss": 0.3284,
+      "step": 548
+    },
+    {
+      "epoch": 0.11529980048304106,
+      "grad_norm": 0.6394858360290527,
+      "learning_rate": 0.00019996873236982258,
+      "loss": 0.2397,
+      "step": 549
+    },
+    {
+      "epoch": 0.11550981833455844,
+      "grad_norm": 0.7164601683616638,
+      "learning_rate": 0.00019996785783982972,
+      "loss": 0.2097,
+      "step": 550
+    },
+    {
+      "epoch": 0.11571983618607581,
+      "grad_norm": 0.5346024036407471,
+      "learning_rate": 0.0001999669712499569,
+      "loss": 0.2559,
+      "step": 551
+    },
+    {
+      "epoch": 0.11592985403759319,
+      "grad_norm": 0.77498859167099,
+      "learning_rate": 0.00019996607260031106,
+      "loss": 0.3734,
+      "step": 552
+    },
+    {
+      "epoch": 0.11613987188911057,
+      "grad_norm": 0.6465743780136108,
+      "learning_rate": 0.00019996516189100066,
+      "loss": 0.2642,
+      "step": 553
+    },
+    {
+      "epoch": 0.11634988974062796,
+      "grad_norm": 0.4624630808830261,
+      "learning_rate": 0.00019996423912213554,
+      "loss": 0.1649,
+      "step": 554
+    },
+    {
+      "epoch": 0.11655990759214534,
+      "grad_norm": 0.6889922618865967,
+      "learning_rate": 0.00019996330429382703,
+      "loss": 0.2902,
+      "step": 555
+    },
+    {
+      "epoch": 0.11676992544366271,
+      "grad_norm": 0.7413411140441895,
+      "learning_rate": 0.0001999623574061879,
+      "loss": 0.2428,
+      "step": 556
+    },
+    {
+      "epoch": 0.11697994329518009,
+      "grad_norm": 0.9009401798248291,
+      "learning_rate": 0.0001999613984593324,
+      "loss": 0.3112,
+      "step": 557
+    },
+    {
+      "epoch": 0.11718996114669747,
+      "grad_norm": 0.6533844470977783,
+      "learning_rate": 0.00019996042745337617,
+      "loss": 0.2118,
+      "step": 558
+    },
+    {
+      "epoch": 0.11739997899821485,
+      "grad_norm": 0.6814008355140686,
+      "learning_rate": 0.00019995944438843636,
+      "loss": 0.2755,
+      "step": 559
+    },
+    {
+      "epoch": 0.11760999684973222,
+      "grad_norm": 0.6098254323005676,
+      "learning_rate": 0.0001999584492646316,
+      "loss": 0.2385,
+      "step": 560
+    },
+    {
+      "epoch": 0.1178200147012496,
+      "grad_norm": 0.7361414432525635,
+      "learning_rate": 0.00019995744208208194,
+      "loss": 0.326,
+      "step": 561
+    },
+    {
+      "epoch": 0.11803003255276699,
+      "grad_norm": 0.6809893250465393,
+      "learning_rate": 0.00019995642284090885,
+      "loss": 0.2727,
+      "step": 562
+    },
+    {
+      "epoch": 0.11824005040428437,
+      "grad_norm": 0.6401339769363403,
+      "learning_rate": 0.00019995539154123529,
+      "loss": 0.2143,
+      "step": 563
+    },
+    {
+      "epoch": 0.11845006825580175,
+      "grad_norm": 0.715313732624054,
+      "learning_rate": 0.00019995434818318567,
+      "loss": 0.2269,
+      "step": 564
+    },
+    {
+      "epoch": 0.11866008610731912,
+      "grad_norm": 0.6071058511734009,
+      "learning_rate": 0.00019995329276688593,
+      "loss": 0.4811,
+      "step": 565
+    },
+    {
+      "epoch": 0.1188701039588365,
+      "grad_norm": 0.882318377494812,
+      "learning_rate": 0.0001999522252924633,
+      "loss": 0.2607,
+      "step": 566
+    },
+    {
+      "epoch": 0.11908012181035388,
+      "grad_norm": 0.4758372902870178,
+      "learning_rate": 0.0001999511457600466,
+      "loss": 0.2013,
+      "step": 567
+    },
+    {
+      "epoch": 0.11929013966187126,
+      "grad_norm": 0.6482694149017334,
+      "learning_rate": 0.00019995005416976604,
+      "loss": 0.2567,
+      "step": 568
+    },
+    {
+      "epoch": 0.11950015751338863,
+      "grad_norm": 0.6689179539680481,
+      "learning_rate": 0.00019994895052175338,
+      "loss": 0.2498,
+      "step": 569
+    },
+    {
+      "epoch": 0.11971017536490601,
+      "grad_norm": 0.5817541480064392,
+      "learning_rate": 0.00019994783481614166,
+      "loss": 0.2013,
+      "step": 570
+    },
+    {
+      "epoch": 0.1199201932164234,
+      "grad_norm": 0.4717533588409424,
+      "learning_rate": 0.00019994670705306554,
+      "loss": 0.2647,
+      "step": 571
+    },
+    {
+      "epoch": 0.12013021106794078,
+      "grad_norm": 0.574079692363739,
+      "learning_rate": 0.00019994556723266103,
+      "loss": 0.1704,
+      "step": 572
+    },
+    {
+      "epoch": 0.12034022891945816,
+      "grad_norm": 0.759425938129425,
+      "learning_rate": 0.00019994441535506569,
+      "loss": 0.208,
+      "step": 573
+    },
+    {
+      "epoch": 0.12055024677097553,
+      "grad_norm": 0.5335227847099304,
+      "learning_rate": 0.0001999432514204184,
+      "loss": 0.2018,
+      "step": 574
+    },
+    {
+      "epoch": 0.12076026462249291,
+      "grad_norm": 0.5595372915267944,
+      "learning_rate": 0.00019994207542885963,
+      "loss": 0.2667,
+      "step": 575
+    },
+    {
+      "epoch": 0.12097028247401029,
+      "grad_norm": 0.6673279404640198,
+      "learning_rate": 0.00019994088738053124,
+      "loss": 0.3175,
+      "step": 576
+    },
+    {
+      "epoch": 0.12118030032552767,
+      "grad_norm": 0.5622591376304626,
+      "learning_rate": 0.0001999396872755766,
+      "loss": 0.1543,
+      "step": 577
+    },
+    {
+      "epoch": 0.12139031817704504,
+      "grad_norm": 0.5784288048744202,
+      "learning_rate": 0.0001999384751141404,
+      "loss": 0.2624,
+      "step": 578
+    },
+    {
+      "epoch": 0.12160033602856243,
+      "grad_norm": 0.6946631669998169,
+      "learning_rate": 0.00019993725089636891,
+      "loss": 0.2469,
+      "step": 579
+    },
+    {
+      "epoch": 0.12181035388007981,
+      "grad_norm": 0.6951069235801697,
+      "learning_rate": 0.00019993601462240984,
+      "loss": 0.2636,
+      "step": 580
+    },
+    {
+      "epoch": 0.12202037173159719,
+      "grad_norm": 0.5134934186935425,
+      "learning_rate": 0.0001999347662924123,
+      "loss": 0.1432,
+      "step": 581
+    },
+    {
+      "epoch": 0.12223038958311457,
+      "grad_norm": 0.5719296932220459,
+      "learning_rate": 0.00019993350590652691,
+      "loss": 0.2388,
+      "step": 582
+    },
+    {
+      "epoch": 0.12244040743463194,
+      "grad_norm": 0.7625638246536255,
+      "learning_rate": 0.0001999322334649057,
+      "loss": 0.2507,
+      "step": 583
+    },
+    {
+      "epoch": 0.12265042528614932,
+      "grad_norm": 0.6974209547042847,
+      "learning_rate": 0.00019993094896770218,
+      "loss": 0.2431,
+      "step": 584
+    },
+    {
+      "epoch": 0.1228604431376667,
+      "grad_norm": 0.7072513699531555,
+      "learning_rate": 0.0001999296524150713,
+      "loss": 0.1745,
+      "step": 585
+    },
+    {
+      "epoch": 0.12307046098918407,
+      "grad_norm": 0.7435344457626343,
+      "learning_rate": 0.00019992834380716946,
+      "loss": 0.216,
+      "step": 586
+    },
+    {
+      "epoch": 0.12328047884070147,
+      "grad_norm": 0.5491403937339783,
+      "learning_rate": 0.00019992702314415461,
+      "loss": 0.1853,
+      "step": 587
+    },
+    {
+      "epoch": 0.12349049669221884,
+      "grad_norm": 0.5487938523292542,
+      "learning_rate": 0.00019992569042618597,
+      "loss": 0.2361,
+      "step": 588
+    },
+    {
+      "epoch": 0.12370051454373622,
+      "grad_norm": 0.4346216320991516,
+      "learning_rate": 0.00019992434565342437,
+      "loss": 0.1812,
+      "step": 589
+    },
+    {
+      "epoch": 0.1239105323952536,
+      "grad_norm": 0.5448020696640015,
+      "learning_rate": 0.00019992298882603202,
+      "loss": 0.2017,
+      "step": 590
+    },
+    {
+      "epoch": 0.12412055024677097,
+      "grad_norm": 0.6867210268974304,
+      "learning_rate": 0.0001999216199441726,
+      "loss": 0.1788,
+      "step": 591
+    },
+    {
+      "epoch": 0.12433056809828835,
+      "grad_norm": 0.6821328401565552,
+      "learning_rate": 0.00019992023900801127,
+      "loss": 0.2159,
+      "step": 592
+    },
+    {
+      "epoch": 0.12454058594980573,
+      "grad_norm": 0.6648369431495667,
+      "learning_rate": 0.0001999188460177146,
+      "loss": 0.223,
+      "step": 593
+    },
+    {
+      "epoch": 0.1247506038013231,
+      "grad_norm": 0.6275060772895813,
+      "learning_rate": 0.00019991744097345068,
+      "loss": 0.2174,
+      "step": 594
+    },
+    {
+      "epoch": 0.1249606216528405,
+      "grad_norm": 0.43622860312461853,
+      "learning_rate": 0.00019991602387538896,
+      "loss": 0.1709,
+      "step": 595
+    },
+    {
+      "epoch": 0.12517063950435786,
+      "grad_norm": 0.41494739055633545,
+      "learning_rate": 0.00019991459472370042,
+      "loss": 0.1615,
+      "step": 596
+    },
+    {
+      "epoch": 0.12538065735587525,
+      "grad_norm": 0.4159907400608063,
+      "learning_rate": 0.00019991315351855748,
+      "loss": 0.1457,
+      "step": 597
+    },
+    {
+      "epoch": 0.12559067520739262,
+      "grad_norm": 0.8123224377632141,
+      "learning_rate": 0.00019991170026013397,
+      "loss": 0.202,
+      "step": 598
+    },
+    {
+      "epoch": 0.12580069305891,
+      "grad_norm": 0.9315401911735535,
+      "learning_rate": 0.00019991023494860522,
+      "loss": 0.2496,
+      "step": 599
+    },
+    {
+      "epoch": 0.1260107109104274,
+      "grad_norm": 0.7999815344810486,
+      "learning_rate": 0.00019990875758414803,
+      "loss": 0.2782,
+      "step": 600
+    },
+    {
+      "epoch": 0.12622072876194476,
+      "grad_norm": 0.5633922815322876,
+      "learning_rate": 0.0001999072681669406,
+      "loss": 0.2276,
+      "step": 601
+    },
+    {
+      "epoch": 0.12643074661346215,
+      "grad_norm": 0.6719483733177185,
+      "learning_rate": 0.00019990576669716258,
+      "loss": 0.3169,
+      "step": 602
+    },
+    {
+      "epoch": 0.12664076446497952,
+      "grad_norm": 0.7311053276062012,
+      "learning_rate": 0.0001999042531749952,
+      "loss": 0.2723,
+      "step": 603
+    },
+    {
+      "epoch": 0.1268507823164969,
+      "grad_norm": 0.5853881239891052,
+      "learning_rate": 0.00019990272760062093,
+      "loss": 0.2869,
+      "step": 604
+    },
+    {
+      "epoch": 0.12706080016801427,
+      "grad_norm": 0.7300302982330322,
+      "learning_rate": 0.0001999011899742239,
+      "loss": 0.2405,
+      "step": 605
+    },
+    {
+      "epoch": 0.12727081801953166,
+      "grad_norm": 0.704954206943512,
+      "learning_rate": 0.00019989964029598953,
+      "loss": 0.3195,
+      "step": 606
+    },
+    {
+      "epoch": 0.12748083587104905,
+      "grad_norm": 0.6305354833602905,
+      "learning_rate": 0.00019989807856610482,
+      "loss": 0.2442,
+      "step": 607
+    },
+    {
+      "epoch": 0.12769085372256642,
+      "grad_norm": 0.5027151107788086,
+      "learning_rate": 0.0001998965047847582,
+      "loss": 0.3006,
+      "step": 608
+    },
+    {
+      "epoch": 0.1279008715740838,
+      "grad_norm": 0.6237658262252808,
+      "learning_rate": 0.00019989491895213948,
+      "loss": 0.2019,
+      "step": 609
+    },
+    {
+      "epoch": 0.12811088942560117,
+      "grad_norm": 0.6959155797958374,
+      "learning_rate": 0.00019989332106844,
+      "loss": 0.2142,
+      "step": 610
+    },
+    {
+      "epoch": 0.12832090727711856,
+      "grad_norm": 0.7905144095420837,
+      "learning_rate": 0.0001998917111338525,
+      "loss": 0.2599,
+      "step": 611
+    },
+    {
+      "epoch": 0.12853092512863593,
+      "grad_norm": 0.7247504591941833,
+      "learning_rate": 0.00019989008914857116,
+      "loss": 0.2679,
+      "step": 612
+    },
+    {
+      "epoch": 0.12874094298015332,
+      "grad_norm": 0.5282559990882874,
+      "learning_rate": 0.0001998884551127917,
+      "loss": 0.2125,
+      "step": 613
+    },
+    {
+      "epoch": 0.12895096083167068,
+      "grad_norm": 0.6418580412864685,
+      "learning_rate": 0.0001998868090267113,
+      "loss": 0.2185,
+      "step": 614
+    },
+    {
+      "epoch": 0.12916097868318807,
+      "grad_norm": 0.48245900869369507,
+      "learning_rate": 0.00019988515089052844,
+      "loss": 0.2175,
+      "step": 615
+    },
+    {
+      "epoch": 0.12937099653470546,
+      "grad_norm": 0.4887724220752716,
+      "learning_rate": 0.00019988348070444322,
+      "loss": 0.1777,
+      "step": 616
+    },
+    {
+      "epoch": 0.12958101438622283,
+      "grad_norm": 0.5296192169189453,
+      "learning_rate": 0.0001998817984686571,
+      "loss": 0.2344,
+      "step": 617
+    },
+    {
+      "epoch": 0.12979103223774022,
+      "grad_norm": 0.6658729314804077,
+      "learning_rate": 0.00019988010418337305,
+      "loss": 0.2322,
+      "step": 618
+    },
+    {
+      "epoch": 0.13000105008925758,
+      "grad_norm": 0.5744292736053467,
+      "learning_rate": 0.0001998783978487954,
+      "loss": 0.2156,
+      "step": 619
+    },
+    {
+      "epoch": 0.13021106794077497,
+      "grad_norm": 0.5000370144844055,
+      "learning_rate": 0.00019987667946513006,
+      "loss": 0.2319,
+      "step": 620
+    },
+    {
+      "epoch": 0.13042108579229234,
+      "grad_norm": 0.8539411425590515,
+      "learning_rate": 0.00019987494903258432,
+      "loss": 0.3729,
+      "step": 621
+    },
+    {
+      "epoch": 0.13063110364380973,
+      "grad_norm": 0.6094825267791748,
+      "learning_rate": 0.00019987320655136693,
+      "loss": 0.2171,
+      "step": 622
+    },
+    {
+      "epoch": 0.1308411214953271,
+      "grad_norm": 0.6408823728561401,
+      "learning_rate": 0.00019987145202168805,
+      "loss": 0.2658,
+      "step": 623
+    },
+    {
+      "epoch": 0.13105113934684448,
+      "grad_norm": 0.5738769769668579,
+      "learning_rate": 0.0001998696854437594,
+      "loss": 0.2127,
+      "step": 624
+    },
+    {
+      "epoch": 0.13126115719836187,
+      "grad_norm": 0.6330286860466003,
+      "learning_rate": 0.00019986790681779412,
+      "loss": 0.1503,
+      "step": 625
+    },
+    {
+      "epoch": 0.13147117504987924,
+      "grad_norm": 0.8125373125076294,
+      "learning_rate": 0.0001998661161440067,
+      "loss": 0.2741,
+      "step": 626
+    },
+    {
+      "epoch": 0.13168119290139663,
+      "grad_norm": 0.710121750831604,
+      "learning_rate": 0.00019986431342261323,
+      "loss": 0.2672,
+      "step": 627
+    },
+    {
+      "epoch": 0.131891210752914,
+      "grad_norm": 0.8024762868881226,
+      "learning_rate": 0.00019986249865383115,
+      "loss": 0.2818,
+      "step": 628
+    },
+    {
+      "epoch": 0.13210122860443138,
+      "grad_norm": 1.0455816984176636,
+      "learning_rate": 0.0001998606718378794,
+      "loss": 0.3204,
+      "step": 629
+    },
+    {
+      "epoch": 0.13231124645594874,
+      "grad_norm": 0.7923910617828369,
+      "learning_rate": 0.00019985883297497835,
+      "loss": 0.213,
+      "step": 630
+    },
+    {
+      "epoch": 0.13252126430746614,
+      "grad_norm": 0.7458345890045166,
+      "learning_rate": 0.00019985698206534985,
+      "loss": 0.2066,
+      "step": 631
+    },
+    {
+      "epoch": 0.13273128215898353,
+      "grad_norm": 0.6166645884513855,
+      "learning_rate": 0.0001998551191092172,
+      "loss": 0.2239,
+      "step": 632
+    },
+    {
+      "epoch": 0.1329413000105009,
+      "grad_norm": 0.7050312161445618,
+      "learning_rate": 0.00019985324410680514,
+      "loss": 0.2692,
+      "step": 633
+    },
+    {
+      "epoch": 0.13315131786201828,
+      "grad_norm": 0.6465736627578735,
+      "learning_rate": 0.00019985135705833984,
+      "loss": 0.235,
+      "step": 634
+    },
+    {
+      "epoch": 0.13336133571353564,
+      "grad_norm": 0.6108490228652954,
+      "learning_rate": 0.00019984945796404894,
+      "loss": 0.2472,
+      "step": 635
+    },
+    {
+      "epoch": 0.13357135356505304,
+      "grad_norm": 0.725173830986023,
+      "learning_rate": 0.00019984754682416157,
+      "loss": 0.2521,
+      "step": 636
+    },
+    {
+      "epoch": 0.1337813714165704,
+      "grad_norm": 0.5391446352005005,
+      "learning_rate": 0.00019984562363890832,
+      "loss": 0.2151,
+      "step": 637
+    },
+    {
+      "epoch": 0.1339913892680878,
+      "grad_norm": 0.44177114963531494,
+      "learning_rate": 0.00019984368840852114,
+      "loss": 0.179,
+      "step": 638
+    },
+    {
+      "epoch": 0.13420140711960515,
+      "grad_norm": 0.48038744926452637,
+      "learning_rate": 0.00019984174113323353,
+      "loss": 0.2296,
+      "step": 639
+    },
+    {
+      "epoch": 0.13441142497112255,
+      "grad_norm": 0.5720350742340088,
+      "learning_rate": 0.00019983978181328037,
+      "loss": 0.1843,
+      "step": 640
+    },
+    {
+      "epoch": 0.13462144282263994,
+      "grad_norm": 0.4996393322944641,
+      "learning_rate": 0.00019983781044889803,
+      "loss": 0.225,
+      "step": 641
+    },
+    {
+      "epoch": 0.1348314606741573,
+      "grad_norm": 0.5970807671546936,
+      "learning_rate": 0.00019983582704032434,
+      "loss": 0.179,
+      "step": 642
+    },
+    {
+      "epoch": 0.1350414785256747,
+      "grad_norm": 0.858808159828186,
+      "learning_rate": 0.0001998338315877986,
+      "loss": 0.246,
+      "step": 643
+    },
+    {
+      "epoch": 0.13525149637719205,
+      "grad_norm": 0.6708926558494568,
+      "learning_rate": 0.0001998318240915615,
+      "loss": 0.197,
+      "step": 644
+    },
+    {
+      "epoch": 0.13546151422870945,
+      "grad_norm": 0.8443548083305359,
+      "learning_rate": 0.00019982980455185526,
+      "loss": 0.1889,
+      "step": 645
+    },
+    {
+      "epoch": 0.1356715320802268,
+      "grad_norm": 0.6451512575149536,
+      "learning_rate": 0.00019982777296892346,
+      "loss": 0.2103,
+      "step": 646
+    },
+    {
+      "epoch": 0.1358815499317442,
+      "grad_norm": 0.7601468563079834,
+      "learning_rate": 0.00019982572934301122,
+      "loss": 0.2338,
+      "step": 647
+    },
+    {
+      "epoch": 0.1360915677832616,
+      "grad_norm": 0.5944762229919434,
+      "learning_rate": 0.00019982367367436506,
+      "loss": 0.1814,
+      "step": 648
+    },
+    {
+      "epoch": 0.13630158563477895,
+      "grad_norm": 0.7542382478713989,
+      "learning_rate": 0.00019982160596323297,
+      "loss": 0.2062,
+      "step": 649
+    },
+    {
+      "epoch": 0.13651160348629635,
+      "grad_norm": 0.560296893119812,
+      "learning_rate": 0.00019981952620986442,
+      "loss": 0.199,
+      "step": 650
+    },
+    {
+      "epoch": 0.1367216213378137,
+      "grad_norm": 0.5254395604133606,
+      "learning_rate": 0.0001998174344145103,
+      "loss": 0.2943,
+      "step": 651
+    },
+    {
+      "epoch": 0.1369316391893311,
+      "grad_norm": 0.6042603254318237,
+      "learning_rate": 0.00019981533057742294,
+      "loss": 0.2355,
+      "step": 652
+    },
+    {
+      "epoch": 0.13714165704084846,
+      "grad_norm": 0.6384417414665222,
+      "learning_rate": 0.00019981321469885615,
+      "loss": 0.202,
+      "step": 653
+    },
+    {
+      "epoch": 0.13735167489236585,
+      "grad_norm": 0.7300348877906799,
+      "learning_rate": 0.0001998110867790652,
+      "loss": 0.2422,
+      "step": 654
+    },
+    {
+      "epoch": 0.13756169274388322,
+      "grad_norm": 0.5238686800003052,
+      "learning_rate": 0.00019980894681830678,
+      "loss": 0.2491,
+      "step": 655
+    },
+    {
+      "epoch": 0.1377717105954006,
+      "grad_norm": 0.7352842092514038,
+      "learning_rate": 0.00019980679481683904,
+      "loss": 0.3193,
+      "step": 656
+    },
+    {
+      "epoch": 0.137981728446918,
+      "grad_norm": 0.6651904582977295,
+      "learning_rate": 0.0001998046307749216,
+      "loss": 0.2618,
+      "step": 657
+    },
+    {
+      "epoch": 0.13819174629843536,
+      "grad_norm": 0.6976970434188843,
+      "learning_rate": 0.00019980245469281553,
+      "loss": 0.2622,
+      "step": 658
+    },
+    {
+      "epoch": 0.13840176414995276,
+      "grad_norm": 0.6078370809555054,
+      "learning_rate": 0.00019980026657078336,
+      "loss": 0.2532,
+      "step": 659
+    },
+    {
+      "epoch": 0.13861178200147012,
+      "grad_norm": 0.7155233025550842,
+      "learning_rate": 0.00019979806640908906,
+      "loss": 0.3283,
+      "step": 660
+    },
+    {
+      "epoch": 0.1388217998529875,
+      "grad_norm": 0.519636869430542,
+      "learning_rate": 0.00019979585420799802,
+      "loss": 0.187,
+      "step": 661
+    },
+    {
+      "epoch": 0.13903181770450487,
+      "grad_norm": 0.7007790803909302,
+      "learning_rate": 0.00019979362996777714,
+      "loss": 0.2554,
+      "step": 662
+    },
+    {
+      "epoch": 0.13924183555602226,
+      "grad_norm": 0.6281614303588867,
+      "learning_rate": 0.00019979139368869473,
+      "loss": 0.2153,
+      "step": 663
+    },
+    {
+      "epoch": 0.13945185340753963,
+      "grad_norm": 0.5729889869689941,
+      "learning_rate": 0.00019978914537102055,
+      "loss": 0.2432,
+      "step": 664
+    },
+    {
+      "epoch": 0.13966187125905702,
+      "grad_norm": 0.4995453357696533,
+      "learning_rate": 0.00019978688501502592,
+      "loss": 0.1931,
+      "step": 665
+    },
+    {
+      "epoch": 0.1398718891105744,
+      "grad_norm": 0.48151615262031555,
+      "learning_rate": 0.00019978461262098343,
+      "loss": 0.1664,
+      "step": 666
+    },
+    {
+      "epoch": 0.14008190696209177,
+      "grad_norm": 0.6951011419296265,
+      "learning_rate": 0.00019978232818916727,
+      "loss": 0.2229,
+      "step": 667
+    },
+    {
+      "epoch": 0.14029192481360916,
+      "grad_norm": 0.5914542078971863,
+      "learning_rate": 0.000199780031719853,
+      "loss": 0.2126,
+      "step": 668
+    },
+    {
+      "epoch": 0.14050194266512653,
+      "grad_norm": 0.7551674246788025,
+      "learning_rate": 0.00019977772321331765,
+      "loss": 0.2806,
+      "step": 669
+    },
+    {
+      "epoch": 0.14071196051664392,
+      "grad_norm": 0.7960730195045471,
+      "learning_rate": 0.00019977540266983976,
+      "loss": 0.2653,
+      "step": 670
+    },
+    {
+      "epoch": 0.14092197836816128,
+      "grad_norm": 0.5545317530632019,
+      "learning_rate": 0.00019977307008969922,
+      "loss": 0.2141,
+      "step": 671
+    },
+    {
+      "epoch": 0.14113199621967867,
+      "grad_norm": 0.7467978596687317,
+      "learning_rate": 0.0001997707254731775,
+      "loss": 0.1961,
+      "step": 672
+    },
+    {
+      "epoch": 0.14134201407119606,
+      "grad_norm": 0.6775459051132202,
+      "learning_rate": 0.00019976836882055736,
+      "loss": 0.2304,
+      "step": 673
+    },
+    {
+      "epoch": 0.14155203192271343,
+      "grad_norm": 0.793547511100769,
+      "learning_rate": 0.00019976600013212317,
+      "loss": 0.2266,
+      "step": 674
+    },
+    {
+      "epoch": 0.14176204977423082,
+      "grad_norm": 0.6920728087425232,
+      "learning_rate": 0.00019976361940816063,
+      "loss": 0.3469,
+      "step": 675
+    },
+    {
+      "epoch": 0.14197206762574818,
+      "grad_norm": 0.840145468711853,
+      "learning_rate": 0.000199761226648957,
+      "loss": 0.235,
+      "step": 676
+    },
+    {
+      "epoch": 0.14218208547726557,
+      "grad_norm": 0.8047716617584229,
+      "learning_rate": 0.0001997588218548009,
+      "loss": 0.3034,
+      "step": 677
+    },
+    {
+      "epoch": 0.14239210332878294,
+      "grad_norm": 0.649042010307312,
+      "learning_rate": 0.00019975640502598244,
+      "loss": 0.2822,
+      "step": 678
+    },
+    {
+      "epoch": 0.14260212118030033,
+      "grad_norm": 0.6780881881713867,
+      "learning_rate": 0.0001997539761627932,
+      "loss": 0.1593,
+      "step": 679
+    },
+    {
+      "epoch": 0.1428121390318177,
+      "grad_norm": 0.6812571883201599,
+      "learning_rate": 0.00019975153526552615,
+      "loss": 0.1898,
+      "step": 680
+    },
+    {
+      "epoch": 0.14302215688333508,
+      "grad_norm": 0.5687631368637085,
+      "learning_rate": 0.0001997490823344758,
+      "loss": 0.2193,
+      "step": 681
+    },
+    {
+      "epoch": 0.14323217473485247,
+      "grad_norm": 0.8981772065162659,
+      "learning_rate": 0.00019974661736993804,
+      "loss": 0.2785,
+      "step": 682
+    },
+    {
+      "epoch": 0.14344219258636984,
+      "grad_norm": 0.6966889500617981,
+      "learning_rate": 0.00019974414037221027,
+      "loss": 0.2678,
+      "step": 683
+    },
+    {
+      "epoch": 0.14365221043788723,
+      "grad_norm": 0.5631129145622253,
+      "learning_rate": 0.00019974165134159126,
+      "loss": 0.2836,
+      "step": 684
+    },
+    {
+      "epoch": 0.1438622282894046,
+      "grad_norm": 0.7686763405799866,
+      "learning_rate": 0.00019973915027838134,
+      "loss": 0.2372,
+      "step": 685
+    },
+    {
+      "epoch": 0.14407224614092198,
+      "grad_norm": 0.881515383720398,
+      "learning_rate": 0.00019973663718288217,
+      "loss": 0.2901,
+      "step": 686
+    },
+    {
+      "epoch": 0.14428226399243935,
+      "grad_norm": 0.6973896026611328,
+      "learning_rate": 0.00019973411205539694,
+      "loss": 0.2577,
+      "step": 687
+    },
+    {
+      "epoch": 0.14449228184395674,
+      "grad_norm": 0.3398377299308777,
+      "learning_rate": 0.0001997315748962303,
+      "loss": 0.1305,
+      "step": 688
+    },
+    {
+      "epoch": 0.1447022996954741,
+      "grad_norm": 0.6775567531585693,
+      "learning_rate": 0.0001997290257056883,
+      "loss": 0.3746,
+      "step": 689
+    },
+    {
+      "epoch": 0.1449123175469915,
+      "grad_norm": 0.3776891827583313,
+      "learning_rate": 0.0001997264644840785,
+      "loss": 0.1731,
+      "step": 690
+    },
+    {
+      "epoch": 0.14512233539850888,
+      "grad_norm": 0.6515337824821472,
+      "learning_rate": 0.00019972389123170986,
+      "loss": 0.2596,
+      "step": 691
+    },
+    {
+      "epoch": 0.14533235325002625,
+      "grad_norm": 0.7165318131446838,
+      "learning_rate": 0.00019972130594889286,
+      "loss": 0.2673,
+      "step": 692
+    },
+    {
+      "epoch": 0.14554237110154364,
+      "grad_norm": 0.5702444314956665,
+      "learning_rate": 0.00019971870863593925,
+      "loss": 0.1928,
+      "step": 693
+    },
+    {
+      "epoch": 0.145752388953061,
+      "grad_norm": 0.3542981743812561,
+      "learning_rate": 0.0001997160992931625,
+      "loss": 0.1277,
+      "step": 694
+    },
+    {
+      "epoch": 0.1459624068045784,
+      "grad_norm": 0.6520780920982361,
+      "learning_rate": 0.00019971347792087732,
+      "loss": 0.2623,
+      "step": 695
+    },
+    {
+      "epoch": 0.14617242465609576,
+      "grad_norm": 0.4505969285964966,
+      "learning_rate": 0.00019971084451939997,
+      "loss": 0.2026,
+      "step": 696
+    },
+    {
+      "epoch": 0.14638244250761315,
+      "grad_norm": 0.623036801815033,
+      "learning_rate": 0.00019970819908904814,
+      "loss": 0.2371,
+      "step": 697
+    },
+    {
+      "epoch": 0.14659246035913054,
+      "grad_norm": 0.60871422290802,
+      "learning_rate": 0.00019970554163014097,
+      "loss": 0.3128,
+      "step": 698
+    },
+    {
+      "epoch": 0.1468024782106479,
+      "grad_norm": 0.6321155428886414,
+      "learning_rate": 0.00019970287214299902,
+      "loss": 0.2183,
+      "step": 699
+    },
+    {
+      "epoch": 0.1470124960621653,
+      "grad_norm": 0.7513316869735718,
+      "learning_rate": 0.0001997001906279444,
+      "loss": 0.2753,
+      "step": 700
+    },
+    {
+      "epoch": 0.14722251391368266,
+      "grad_norm": 0.4192676842212677,
+      "learning_rate": 0.0001996974970853005,
+      "loss": 0.3071,
+      "step": 701
+    },
+    {
+      "epoch": 0.14743253176520005,
+      "grad_norm": 0.5773706436157227,
+      "learning_rate": 0.00019969479151539236,
+      "loss": 0.2883,
+      "step": 702
+    },
+    {
+      "epoch": 0.1476425496167174,
+      "grad_norm": 0.4587963819503784,
+      "learning_rate": 0.00019969207391854632,
+      "loss": 0.2997,
+      "step": 703
+    },
+    {
+      "epoch": 0.1478525674682348,
+      "grad_norm": 0.6077782511711121,
+      "learning_rate": 0.00019968934429509023,
+      "loss": 0.182,
+      "step": 704
+    },
+    {
+      "epoch": 0.14806258531975217,
+      "grad_norm": 0.6285839676856995,
+      "learning_rate": 0.0001996866026453534,
+      "loss": 0.3573,
+      "step": 705
+    },
+    {
+      "epoch": 0.14827260317126956,
+      "grad_norm": 0.7416669726371765,
+      "learning_rate": 0.00019968384896966657,
+      "loss": 0.2424,
+      "step": 706
+    },
+    {
+      "epoch": 0.14848262102278695,
+      "grad_norm": 0.5475688576698303,
+      "learning_rate": 0.0001996810832683619,
+      "loss": 0.1766,
+      "step": 707
+    },
+    {
+      "epoch": 0.1486926388743043,
+      "grad_norm": 0.5601086020469666,
+      "learning_rate": 0.00019967830554177312,
+      "loss": 0.2725,
+      "step": 708
+    },
+    {
+      "epoch": 0.1489026567258217,
+      "grad_norm": 0.7686034440994263,
+      "learning_rate": 0.00019967551579023524,
+      "loss": 0.3008,
+      "step": 709
+    },
+    {
+      "epoch": 0.14911267457733907,
+      "grad_norm": 0.8172418475151062,
+      "learning_rate": 0.00019967271401408486,
+      "loss": 0.3042,
+      "step": 710
+    },
+    {
+      "epoch": 0.14932269242885646,
+      "grad_norm": 0.8726032972335815,
+      "learning_rate": 0.00019966990021366,
+      "loss": 0.224,
+      "step": 711
+    },
+    {
+      "epoch": 0.14953271028037382,
+      "grad_norm": 0.6053635478019714,
+      "learning_rate": 0.00019966707438930003,
+      "loss": 0.2325,
+      "step": 712
+    },
+    {
+      "epoch": 0.1497427281318912,
+      "grad_norm": 0.7157438397407532,
+      "learning_rate": 0.00019966423654134592,
+      "loss": 0.2656,
+      "step": 713
+    },
+    {
+      "epoch": 0.1499527459834086,
+      "grad_norm": 0.6943267583847046,
+      "learning_rate": 0.00019966138667014,
+      "loss": 0.2625,
+      "step": 714
+    },
+    {
+      "epoch": 0.15016276383492597,
+      "grad_norm": 0.7070578336715698,
+      "learning_rate": 0.00019965852477602604,
+      "loss": 0.2795,
+      "step": 715
+    },
+    {
+      "epoch": 0.15037278168644336,
+      "grad_norm": 0.654684841632843,
+      "learning_rate": 0.00019965565085934935,
+      "loss": 0.2168,
+      "step": 716
+    },
+    {
+      "epoch": 0.15058279953796072,
+      "grad_norm": 0.5972804427146912,
+      "learning_rate": 0.00019965276492045662,
+      "loss": 0.2337,
+      "step": 717
+    },
+    {
+      "epoch": 0.1507928173894781,
+      "grad_norm": 0.4990095794200897,
+      "learning_rate": 0.000199649866959696,
+      "loss": 0.3187,
+      "step": 718
+    },
+    {
+      "epoch": 0.15100283524099548,
+      "grad_norm": 0.6247003078460693,
+      "learning_rate": 0.00019964695697741703,
+      "loss": 0.2139,
+      "step": 719
+    },
+    {
+      "epoch": 0.15121285309251287,
+      "grad_norm": 0.6358337998390198,
+      "learning_rate": 0.00019964403497397084,
+      "loss": 0.244,
+      "step": 720
+    },
+    {
+      "epoch": 0.15142287094403023,
+      "grad_norm": 0.5211917161941528,
+      "learning_rate": 0.0001996411009497099,
+      "loss": 0.1784,
+      "step": 721
+    },
+    {
+      "epoch": 0.15163288879554762,
+      "grad_norm": 0.464606374502182,
+      "learning_rate": 0.00019963815490498817,
+      "loss": 0.2137,
+      "step": 722
+    },
+    {
+      "epoch": 0.151842906647065,
+      "grad_norm": 0.7099301815032959,
+      "learning_rate": 0.00019963519684016107,
+      "loss": 0.2927,
+      "step": 723
+    },
+    {
+      "epoch": 0.15205292449858238,
+      "grad_norm": 0.7805564999580383,
+      "learning_rate": 0.00019963222675558543,
+      "loss": 0.2374,
+      "step": 724
+    },
+    {
+      "epoch": 0.15226294235009977,
+      "grad_norm": 0.6172361373901367,
+      "learning_rate": 0.00019962924465161957,
+      "loss": 0.201,
+      "step": 725
+    },
+    {
+      "epoch": 0.15247296020161713,
+      "grad_norm": 0.6261605620384216,
+      "learning_rate": 0.0001996262505286232,
+      "loss": 0.1709,
+      "step": 726
+    },
+    {
+      "epoch": 0.15268297805313452,
+      "grad_norm": 0.6561511158943176,
+      "learning_rate": 0.00019962324438695762,
+      "loss": 0.2283,
+      "step": 727
+    },
+    {
+      "epoch": 0.15289299590465188,
+      "grad_norm": 0.5386349558830261,
+      "learning_rate": 0.0001996202262269854,
+      "loss": 0.231,
+      "step": 728
+    },
+    {
+      "epoch": 0.15310301375616928,
+      "grad_norm": 0.644136369228363,
+      "learning_rate": 0.00019961719604907066,
+      "loss": 0.1875,
+      "step": 729
+    },
+    {
+      "epoch": 0.15331303160768664,
+      "grad_norm": 0.6452980041503906,
+      "learning_rate": 0.00019961415385357897,
+      "loss": 0.2294,
+      "step": 730
+    },
+    {
+      "epoch": 0.15352304945920403,
+      "grad_norm": 0.5558809041976929,
+      "learning_rate": 0.0001996110996408773,
+      "loss": 0.1988,
+      "step": 731
+    },
+    {
+      "epoch": 0.15373306731072142,
+      "grad_norm": 0.6049979329109192,
+      "learning_rate": 0.00019960803341133413,
+      "loss": 0.2368,
+      "step": 732
+    },
+    {
+      "epoch": 0.15394308516223879,
+      "grad_norm": 0.6450143456459045,
+      "learning_rate": 0.00019960495516531935,
+      "loss": 0.2217,
+      "step": 733
+    },
+    {
+      "epoch": 0.15415310301375618,
+      "grad_norm": 0.6582781672477722,
+      "learning_rate": 0.00019960186490320436,
+      "loss": 0.1942,
+      "step": 734
+    },
+    {
+      "epoch": 0.15436312086527354,
+      "grad_norm": 0.5160269141197205,
+      "learning_rate": 0.0001995987626253619,
+      "loss": 0.1723,
+      "step": 735
+    },
+    {
+      "epoch": 0.15457313871679093,
+      "grad_norm": 0.6058139801025391,
+      "learning_rate": 0.00019959564833216625,
+      "loss": 0.2089,
+      "step": 736
+    },
+    {
+      "epoch": 0.1547831565683083,
+      "grad_norm": 0.540282666683197,
+      "learning_rate": 0.0001995925220239931,
+      "loss": 0.2089,
+      "step": 737
+    },
+    {
+      "epoch": 0.15499317441982569,
+      "grad_norm": 0.7635892033576965,
+      "learning_rate": 0.0001995893837012196,
+      "loss": 0.2825,
+      "step": 738
+    },
+    {
+      "epoch": 0.15520319227134308,
+      "grad_norm": 0.5233755111694336,
+      "learning_rate": 0.00019958623336422434,
+      "loss": 0.1514,
+      "step": 739
+    },
+    {
+      "epoch": 0.15541321012286044,
+      "grad_norm": 0.44758716225624084,
+      "learning_rate": 0.00019958307101338742,
+      "loss": 0.132,
+      "step": 740
+    },
+    {
+      "epoch": 0.15562322797437783,
+      "grad_norm": 0.7145951390266418,
+      "learning_rate": 0.00019957989664909026,
+      "loss": 0.2395,
+      "step": 741
+    },
+    {
+      "epoch": 0.1558332458258952,
+      "grad_norm": 0.6241814494132996,
+      "learning_rate": 0.0001995767102717159,
+      "loss": 0.2255,
+      "step": 742
+    },
+    {
+      "epoch": 0.15604326367741259,
+      "grad_norm": 0.502863883972168,
+      "learning_rate": 0.00019957351188164865,
+      "loss": 0.1941,
+      "step": 743
+    },
+    {
+      "epoch": 0.15625328152892995,
+      "grad_norm": 0.5572714805603027,
+      "learning_rate": 0.00019957030147927442,
+      "loss": 0.1664,
+      "step": 744
+    },
+    {
+      "epoch": 0.15646329938044734,
+      "grad_norm": 1.0500191450119019,
+      "learning_rate": 0.00019956707906498044,
+      "loss": 0.3229,
+      "step": 745
+    },
+    {
+      "epoch": 0.1566733172319647,
+      "grad_norm": 0.595522403717041,
+      "learning_rate": 0.0001995638446391555,
+      "loss": 0.1932,
+      "step": 746
+    },
+    {
+      "epoch": 0.1568833350834821,
+      "grad_norm": 0.38818204402923584,
+      "learning_rate": 0.00019956059820218982,
+      "loss": 0.1324,
+      "step": 747
+    },
+    {
+      "epoch": 0.1570933529349995,
+      "grad_norm": 0.6705027222633362,
+      "learning_rate": 0.000199557339754475,
+      "loss": 0.194,
+      "step": 748
+    },
+    {
+      "epoch": 0.15730337078651685,
+      "grad_norm": 0.4935189485549927,
+      "learning_rate": 0.0001995540692964041,
+      "loss": 0.2492,
+      "step": 749
+    },
+    {
+      "epoch": 0.15751338863803424,
+      "grad_norm": 0.3950806260108948,
+      "learning_rate": 0.00019955078682837174,
+      "loss": 0.1331,
+      "step": 750
+    },
+    {
+      "epoch": 0.1577234064895516,
+      "grad_norm": 0.6625058650970459,
+      "learning_rate": 0.00019954749235077384,
+      "loss": 0.297,
+      "step": 751
+    },
+    {
+      "epoch": 0.157933424341069,
+      "grad_norm": 0.5862818956375122,
+      "learning_rate": 0.00019954418586400787,
+      "loss": 0.2628,
+      "step": 752
+    },
+    {
+      "epoch": 0.15814344219258636,
+      "grad_norm": 0.6951611042022705,
+      "learning_rate": 0.0001995408673684727,
+      "loss": 0.2573,
+      "step": 753
+    },
+    {
+      "epoch": 0.15835346004410375,
+      "grad_norm": 0.8030470013618469,
+      "learning_rate": 0.0001995375368645687,
+      "loss": 0.2671,
+      "step": 754
+    },
+    {
+      "epoch": 0.15856347789562114,
+      "grad_norm": 0.4509555995464325,
+      "learning_rate": 0.00019953419435269764,
+      "loss": 0.1808,
+      "step": 755
+    },
+    {
+      "epoch": 0.1587734957471385,
+      "grad_norm": 0.7687417268753052,
+      "learning_rate": 0.0001995308398332627,
+      "loss": 0.2906,
+      "step": 756
+    },
+    {
+      "epoch": 0.1589835135986559,
+      "grad_norm": 0.7642715573310852,
+      "learning_rate": 0.00019952747330666867,
+      "loss": 0.3541,
+      "step": 757
+    },
+    {
+      "epoch": 0.15919353145017326,
+      "grad_norm": 0.6699778437614441,
+      "learning_rate": 0.00019952409477332156,
+      "loss": 0.2494,
+      "step": 758
+    },
+    {
+      "epoch": 0.15940354930169065,
+      "grad_norm": 0.7119278907775879,
+      "learning_rate": 0.00019952070423362903,
+      "loss": 0.1994,
+      "step": 759
+    },
+    {
+      "epoch": 0.159613567153208,
+      "grad_norm": 0.6130563616752625,
+      "learning_rate": 0.00019951730168800004,
+      "loss": 0.3433,
+      "step": 760
+    },
+    {
+      "epoch": 0.1598235850047254,
+      "grad_norm": 0.692933201789856,
+      "learning_rate": 0.00019951388713684514,
+      "loss": 0.1762,
+      "step": 761
+    },
+    {
+      "epoch": 0.16003360285624277,
+      "grad_norm": 0.5561717748641968,
+      "learning_rate": 0.00019951046058057622,
+      "loss": 0.2266,
+      "step": 762
+    },
+    {
+      "epoch": 0.16024362070776016,
+      "grad_norm": 0.8559679388999939,
+      "learning_rate": 0.00019950702201960665,
+      "loss": 0.3145,
+      "step": 763
+    },
+    {
+      "epoch": 0.16045363855927755,
+      "grad_norm": 0.7173314094543457,
+      "learning_rate": 0.00019950357145435122,
+      "loss": 0.2079,
+      "step": 764
+    },
+    {
+      "epoch": 0.16066365641079491,
+      "grad_norm": 0.4696892201900482,
+      "learning_rate": 0.00019950010888522625,
+      "loss": 0.2374,
+      "step": 765
+    },
+    {
+      "epoch": 0.1608736742623123,
+      "grad_norm": 0.5349077582359314,
+      "learning_rate": 0.00019949663431264943,
+      "loss": 0.2221,
+      "step": 766
+    },
+    {
+      "epoch": 0.16108369211382967,
+      "grad_norm": 0.49449819326400757,
+      "learning_rate": 0.0001994931477370399,
+      "loss": 0.1432,
+      "step": 767
+    },
+    {
+      "epoch": 0.16129370996534706,
+      "grad_norm": 0.652260422706604,
+      "learning_rate": 0.00019948964915881835,
+      "loss": 0.2122,
+      "step": 768
+    },
+    {
+      "epoch": 0.16150372781686442,
+      "grad_norm": 0.6549475789070129,
+      "learning_rate": 0.00019948613857840672,
+      "loss": 0.3484,
+      "step": 769
+    },
+    {
+      "epoch": 0.16171374566838181,
+      "grad_norm": 0.6772179007530212,
+      "learning_rate": 0.00019948261599622865,
+      "loss": 0.2784,
+      "step": 770
+    },
+    {
+      "epoch": 0.16192376351989918,
+      "grad_norm": 0.788960337638855,
+      "learning_rate": 0.00019947908141270898,
+      "loss": 0.1939,
+      "step": 771
+    },
+    {
+      "epoch": 0.16213378137141657,
+      "grad_norm": 0.6915500164031982,
+      "learning_rate": 0.00019947553482827418,
+      "loss": 0.1541,
+      "step": 772
+    },
+    {
+      "epoch": 0.16234379922293396,
+      "grad_norm": 0.604015052318573,
+      "learning_rate": 0.0001994719762433521,
+      "loss": 0.2148,
+      "step": 773
+    },
+    {
+      "epoch": 0.16255381707445132,
+      "grad_norm": 0.8275285959243774,
+      "learning_rate": 0.00019946840565837203,
+      "loss": 0.2808,
+      "step": 774
+    },
+    {
+      "epoch": 0.16276383492596871,
+      "grad_norm": 0.6737775802612305,
+      "learning_rate": 0.00019946482307376472,
+      "loss": 0.1813,
+      "step": 775
+    },
+    {
+      "epoch": 0.16297385277748608,
+      "grad_norm": 0.8311626315116882,
+      "learning_rate": 0.0001994612284899623,
+      "loss": 0.2819,
+      "step": 776
+    },
+    {
+      "epoch": 0.16318387062900347,
+      "grad_norm": 0.7368951439857483,
+      "learning_rate": 0.00019945762190739852,
+      "loss": 0.2619,
+      "step": 777
+    },
+    {
+      "epoch": 0.16339388848052083,
+      "grad_norm": 0.6095349788665771,
+      "learning_rate": 0.0001994540033265084,
+      "loss": 0.2449,
+      "step": 778
+    },
+    {
+      "epoch": 0.16360390633203822,
+      "grad_norm": 0.6738486886024475,
+      "learning_rate": 0.0001994503727477285,
+      "loss": 0.1493,
+      "step": 779
+    },
+    {
+      "epoch": 0.16381392418355561,
+      "grad_norm": 0.5636208653450012,
+      "learning_rate": 0.0001994467301714968,
+      "loss": 0.1949,
+      "step": 780
+    },
+    {
+      "epoch": 0.16402394203507298,
+      "grad_norm": 0.9404299259185791,
+      "learning_rate": 0.00019944307559825272,
+      "loss": 0.2503,
+      "step": 781
+    },
+    {
+      "epoch": 0.16423395988659037,
+      "grad_norm": 0.6188719868659973,
+      "learning_rate": 0.0001994394090284372,
+      "loss": 0.1658,
+      "step": 782
+    },
+    {
+      "epoch": 0.16444397773810773,
+      "grad_norm": 0.9498090147972107,
+      "learning_rate": 0.00019943573046249244,
+      "loss": 0.3425,
+      "step": 783
+    },
+    {
+      "epoch": 0.16465399558962512,
+      "grad_norm": 0.6508981585502625,
+      "learning_rate": 0.00019943203990086233,
+      "loss": 0.1384,
+      "step": 784
+    },
+    {
+      "epoch": 0.1648640134411425,
+      "grad_norm": 1.0658531188964844,
+      "learning_rate": 0.00019942833734399202,
+      "loss": 0.2609,
+      "step": 785
+    },
+    {
+      "epoch": 0.16507403129265988,
+      "grad_norm": 0.7281699776649475,
+      "learning_rate": 0.00019942462279232825,
+      "loss": 0.1985,
+      "step": 786
+    },
+    {
+      "epoch": 0.16528404914417724,
+      "grad_norm": 0.7734364867210388,
+      "learning_rate": 0.00019942089624631906,
+      "loss": 0.2617,
+      "step": 787
+    },
+    {
+      "epoch": 0.16549406699569463,
+      "grad_norm": 0.977069616317749,
+      "learning_rate": 0.00019941715770641408,
+      "loss": 0.2928,
+      "step": 788
+    },
+    {
+      "epoch": 0.16570408484721202,
+      "grad_norm": 0.7139049768447876,
+      "learning_rate": 0.00019941340717306424,
+      "loss": 0.3369,
+      "step": 789
+    },
+    {
+      "epoch": 0.1659141026987294,
+      "grad_norm": 0.5771147012710571,
+      "learning_rate": 0.00019940964464672205,
+      "loss": 0.2304,
+      "step": 790
+    },
+    {
+      "epoch": 0.16612412055024678,
+      "grad_norm": 0.5506160855293274,
+      "learning_rate": 0.00019940587012784138,
+      "loss": 0.2084,
+      "step": 791
+    },
+    {
+      "epoch": 0.16633413840176414,
+      "grad_norm": 0.48316794633865356,
+      "learning_rate": 0.0001994020836168776,
+      "loss": 0.1835,
+      "step": 792
+    },
+    {
+      "epoch": 0.16654415625328153,
+      "grad_norm": 0.5649861693382263,
+      "learning_rate": 0.00019939828511428753,
+      "loss": 0.2426,
+      "step": 793
+    },
+    {
+      "epoch": 0.1667541741047989,
+      "grad_norm": 0.5224729180335999,
+      "learning_rate": 0.00019939447462052936,
+      "loss": 0.1862,
+      "step": 794
+    },
+    {
+      "epoch": 0.1669641919563163,
+      "grad_norm": 0.5801841616630554,
+      "learning_rate": 0.00019939065213606282,
+      "loss": 0.2081,
+      "step": 795
+    },
+    {
+      "epoch": 0.16717420980783368,
+      "grad_norm": 0.4274038076400757,
+      "learning_rate": 0.00019938681766134902,
+      "loss": 0.14,
+      "step": 796
+    },
+    {
+      "epoch": 0.16738422765935104,
+      "grad_norm": 0.5294644236564636,
+      "learning_rate": 0.00019938297119685054,
+      "loss": 0.1851,
+      "step": 797
+    },
+    {
+      "epoch": 0.16759424551086843,
+      "grad_norm": 0.5110440850257874,
+      "learning_rate": 0.00019937911274303145,
+      "loss": 0.2346,
+      "step": 798
+    },
+    {
+      "epoch": 0.1678042633623858,
+      "grad_norm": 0.5785256028175354,
+      "learning_rate": 0.00019937524230035717,
+      "loss": 0.1554,
+      "step": 799
+    },
+    {
+      "epoch": 0.1680142812139032,
+      "grad_norm": 0.586320161819458,
+      "learning_rate": 0.00019937135986929465,
+      "loss": 0.2672,
+      "step": 800
+    },
+    {
+      "epoch": 0.16822429906542055,
+      "grad_norm": 0.502890408039093,
+      "learning_rate": 0.00019936746545031223,
+      "loss": 0.3023,
+      "step": 801
+    },
+    {
+      "epoch": 0.16843431691693794,
+      "grad_norm": 0.5421012043952942,
+      "learning_rate": 0.00019936355904387977,
+      "loss": 0.2331,
+      "step": 802
+    },
+    {
+      "epoch": 0.1686443347684553,
+      "grad_norm": 0.5681023001670837,
+      "learning_rate": 0.0001993596406504685,
+      "loss": 0.2064,
+      "step": 803
+    },
+    {
+      "epoch": 0.1688543526199727,
+      "grad_norm": 0.4179142713546753,
+      "learning_rate": 0.00019935571027055113,
+      "loss": 0.2302,
+      "step": 804
+    },
+    {
+      "epoch": 0.1690643704714901,
+      "grad_norm": 0.7016621232032776,
+      "learning_rate": 0.00019935176790460179,
+      "loss": 0.2442,
+      "step": 805
+    },
+    {
+      "epoch": 0.16927438832300745,
+      "grad_norm": 0.5401879549026489,
+      "learning_rate": 0.00019934781355309612,
+      "loss": 0.2798,
+      "step": 806
+    },
+    {
+      "epoch": 0.16948440617452484,
+      "grad_norm": 0.5687265396118164,
+      "learning_rate": 0.00019934384721651113,
+      "loss": 0.2097,
+      "step": 807
+    },
+    {
+      "epoch": 0.1696944240260422,
+      "grad_norm": 0.659520149230957,
+      "learning_rate": 0.00019933986889532533,
+      "loss": 0.1938,
+      "step": 808
+    },
+    {
+      "epoch": 0.1699044418775596,
+      "grad_norm": 0.8230718970298767,
+      "learning_rate": 0.00019933587859001866,
+      "loss": 0.4148,
+      "step": 809
+    },
+    {
+      "epoch": 0.17011445972907696,
+      "grad_norm": 0.7954551577568054,
+      "learning_rate": 0.00019933187630107244,
+      "loss": 0.4564,
+      "step": 810
+    },
+    {
+      "epoch": 0.17032447758059435,
+      "grad_norm": 0.6618001461029053,
+      "learning_rate": 0.0001993278620289696,
+      "loss": 0.2819,
+      "step": 811
+    },
+    {
+      "epoch": 0.17053449543211172,
+      "grad_norm": 0.9731025099754333,
+      "learning_rate": 0.00019932383577419432,
+      "loss": 0.3954,
+      "step": 812
+    },
+    {
+      "epoch": 0.1707445132836291,
+      "grad_norm": 0.7344256639480591,
+      "learning_rate": 0.00019931979753723232,
+      "loss": 0.2502,
+      "step": 813
+    },
+    {
+      "epoch": 0.1709545311351465,
+      "grad_norm": 0.6986575722694397,
+      "learning_rate": 0.00019931574731857086,
+      "loss": 0.2499,
+      "step": 814
+    },
+    {
+      "epoch": 0.17116454898666386,
+      "grad_norm": 0.5757253170013428,
+      "learning_rate": 0.00019931168511869846,
+      "loss": 0.2445,
+      "step": 815
+    },
+    {
+      "epoch": 0.17137456683818125,
+      "grad_norm": 0.5453664064407349,
+      "learning_rate": 0.0001993076109381052,
+      "loss": 0.2494,
+      "step": 816
+    },
+    {
+      "epoch": 0.17158458468969862,
+      "grad_norm": 0.7031118869781494,
+      "learning_rate": 0.00019930352477728257,
+      "loss": 0.2777,
+      "step": 817
+    },
+    {
+      "epoch": 0.171794602541216,
+      "grad_norm": 0.6201139092445374,
+      "learning_rate": 0.0001992994266367235,
+      "loss": 0.2145,
+      "step": 818
+    },
+    {
+      "epoch": 0.17200462039273337,
+      "grad_norm": 0.6421683430671692,
+      "learning_rate": 0.00019929531651692245,
+      "loss": 0.1951,
+      "step": 819
+    },
+    {
+      "epoch": 0.17221463824425076,
+      "grad_norm": 0.6390677094459534,
+      "learning_rate": 0.00019929119441837518,
+      "loss": 0.2011,
+      "step": 820
+    },
+    {
+      "epoch": 0.17242465609576815,
+      "grad_norm": 0.5171882510185242,
+      "learning_rate": 0.00019928706034157901,
+      "loss": 0.1459,
+      "step": 821
+    },
+    {
+      "epoch": 0.17263467394728552,
+      "grad_norm": 0.6737155914306641,
+      "learning_rate": 0.00019928291428703262,
+      "loss": 0.1507,
+      "step": 822
+    },
+    {
+      "epoch": 0.1728446917988029,
+      "grad_norm": 0.526128351688385,
+      "learning_rate": 0.00019927875625523625,
+      "loss": 0.1565,
+      "step": 823
+    },
+    {
+      "epoch": 0.17305470965032027,
+      "grad_norm": 0.7430817484855652,
+      "learning_rate": 0.00019927458624669145,
+      "loss": 0.2575,
+      "step": 824
+    },
+    {
+      "epoch": 0.17326472750183766,
+      "grad_norm": 0.4702281355857849,
+      "learning_rate": 0.0001992704042619013,
+      "loss": 0.1796,
+      "step": 825
+    },
+    {
+      "epoch": 0.17347474535335503,
+      "grad_norm": 0.5295049548149109,
+      "learning_rate": 0.00019926621030137034,
+      "loss": 0.1974,
+      "step": 826
+    },
+    {
+      "epoch": 0.17368476320487242,
+      "grad_norm": 0.667036771774292,
+      "learning_rate": 0.00019926200436560447,
+      "loss": 0.2125,
+      "step": 827
+    },
+    {
+      "epoch": 0.17389478105638978,
+      "grad_norm": 0.7351561188697815,
+      "learning_rate": 0.0001992577864551111,
+      "loss": 0.2271,
+      "step": 828
+    },
+    {
+      "epoch": 0.17410479890790717,
+      "grad_norm": 0.8084509372711182,
+      "learning_rate": 0.0001992535565703991,
+      "loss": 0.2301,
+      "step": 829
+    },
+    {
+      "epoch": 0.17431481675942456,
+      "grad_norm": 0.7022576928138733,
+      "learning_rate": 0.0001992493147119787,
+      "loss": 0.2662,
+      "step": 830
+    },
+    {
+      "epoch": 0.17452483461094193,
+      "grad_norm": 0.7098193168640137,
+      "learning_rate": 0.00019924506088036165,
+      "loss": 0.1979,
+      "step": 831
+    },
+    {
+      "epoch": 0.17473485246245932,
+      "grad_norm": 0.590630292892456,
+      "learning_rate": 0.00019924079507606114,
+      "loss": 0.1872,
+      "step": 832
+    },
+    {
+      "epoch": 0.17494487031397668,
+      "grad_norm": 0.7556937336921692,
+      "learning_rate": 0.00019923651729959177,
+      "loss": 0.1651,
+      "step": 833
+    },
+    {
+      "epoch": 0.17515488816549407,
+      "grad_norm": 0.6680110096931458,
+      "learning_rate": 0.00019923222755146956,
+      "loss": 0.1837,
+      "step": 834
+    },
+    {
+      "epoch": 0.17536490601701143,
+      "grad_norm": 0.7310810685157776,
+      "learning_rate": 0.0001992279258322121,
+      "loss": 0.3201,
+      "step": 835
+    },
+    {
+      "epoch": 0.17557492386852883,
+      "grad_norm": 0.5796787142753601,
+      "learning_rate": 0.0001992236121423383,
+      "loss": 0.178,
+      "step": 836
+    },
+    {
+      "epoch": 0.17578494172004622,
+      "grad_norm": 0.45521265268325806,
+      "learning_rate": 0.00019921928648236853,
+      "loss": 0.1723,
+      "step": 837
+    },
+    {
+      "epoch": 0.17599495957156358,
+      "grad_norm": 0.43274396657943726,
+      "learning_rate": 0.00019921494885282467,
+      "loss": 0.1597,
+      "step": 838
+    },
+    {
+      "epoch": 0.17620497742308097,
+      "grad_norm": 0.40754616260528564,
+      "learning_rate": 0.00019921059925422996,
+      "loss": 0.1299,
+      "step": 839
+    },
+    {
+      "epoch": 0.17641499527459834,
+      "grad_norm": 0.6628978252410889,
+      "learning_rate": 0.00019920623768710912,
+      "loss": 0.1931,
+      "step": 840
+    },
+    {
+      "epoch": 0.17662501312611573,
+      "grad_norm": 0.644637405872345,
+      "learning_rate": 0.0001992018641519884,
+      "loss": 0.199,
+      "step": 841
+    },
+    {
+      "epoch": 0.1768350309776331,
+      "grad_norm": 0.5001009106636047,
+      "learning_rate": 0.0001991974786493953,
+      "loss": 0.2109,
+      "step": 842
+    },
+    {
+      "epoch": 0.17704504882915048,
+      "grad_norm": 0.49435755610466003,
+      "learning_rate": 0.00019919308117985894,
+      "loss": 0.1832,
+      "step": 843
+    },
+    {
+      "epoch": 0.17725506668066784,
+      "grad_norm": 0.7176212668418884,
+      "learning_rate": 0.0001991886717439098,
+      "loss": 0.2491,
+      "step": 844
+    },
+    {
+      "epoch": 0.17746508453218524,
+      "grad_norm": 0.5122328996658325,
+      "learning_rate": 0.00019918425034207984,
+      "loss": 0.2618,
+      "step": 845
+    },
+    {
+      "epoch": 0.17767510238370263,
+      "grad_norm": 0.6069608926773071,
+      "learning_rate": 0.00019917981697490245,
+      "loss": 0.2119,
+      "step": 846
+    },
+    {
+      "epoch": 0.17788512023522,
+      "grad_norm": 0.8389537334442139,
+      "learning_rate": 0.00019917537164291244,
+      "loss": 0.2619,
+      "step": 847
+    },
+    {
+      "epoch": 0.17809513808673738,
+      "grad_norm": 0.5856572389602661,
+      "learning_rate": 0.00019917091434664612,
+      "loss": 0.1928,
+      "step": 848
+    },
+    {
+      "epoch": 0.17830515593825474,
+      "grad_norm": 0.5682632327079773,
+      "learning_rate": 0.00019916644508664115,
+      "loss": 0.2963,
+      "step": 849
+    },
+    {
+      "epoch": 0.17851517378977214,
+      "grad_norm": 0.45547807216644287,
+      "learning_rate": 0.00019916196386343674,
+      "loss": 0.1277,
+      "step": 850
+    },
+    {
+      "epoch": 0.1787251916412895,
+      "grad_norm": 0.648499071598053,
+      "learning_rate": 0.00019915747067757349,
+      "loss": 0.3407,
+      "step": 851
+    },
+    {
+      "epoch": 0.1789352094928069,
+      "grad_norm": 0.48874902725219727,
+      "learning_rate": 0.0001991529655295934,
+      "loss": 0.185,
+      "step": 852
+    },
+    {
+      "epoch": 0.17914522734432425,
+      "grad_norm": 0.7059923410415649,
+      "learning_rate": 0.00019914844842004002,
+      "loss": 0.2352,
+      "step": 853
+    },
+    {
+      "epoch": 0.17935524519584164,
+      "grad_norm": 0.6532195210456848,
+      "learning_rate": 0.00019914391934945823,
+      "loss": 0.292,
+      "step": 854
+    },
+    {
+      "epoch": 0.17956526304735904,
+      "grad_norm": 0.6922776103019714,
+      "learning_rate": 0.0001991393783183945,
+      "loss": 0.4635,
+      "step": 855
+    },
+    {
+      "epoch": 0.1797752808988764,
+      "grad_norm": 0.6560776829719543,
+      "learning_rate": 0.00019913482532739652,
+      "loss": 0.2684,
+      "step": 856
+    },
+    {
+      "epoch": 0.1799852987503938,
+      "grad_norm": 0.5644369125366211,
+      "learning_rate": 0.00019913026037701362,
+      "loss": 0.2018,
+      "step": 857
+    },
+    {
+      "epoch": 0.18019531660191115,
+      "grad_norm": 0.6108200550079346,
+      "learning_rate": 0.00019912568346779652,
+      "loss": 0.1746,
+      "step": 858
+    },
+    {
+      "epoch": 0.18040533445342855,
+      "grad_norm": 0.6762723326683044,
+      "learning_rate": 0.00019912109460029734,
+      "loss": 0.4662,
+      "step": 859
+    },
+    {
+      "epoch": 0.1806153523049459,
+      "grad_norm": 0.5877822041511536,
+      "learning_rate": 0.00019911649377506966,
+      "loss": 0.2546,
+      "step": 860
+    },
+    {
+      "epoch": 0.1808253701564633,
+      "grad_norm": 0.5038641095161438,
+      "learning_rate": 0.00019911188099266855,
+      "loss": 0.3073,
+      "step": 861
+    },
+    {
+      "epoch": 0.1810353880079807,
+      "grad_norm": 0.6587141156196594,
+      "learning_rate": 0.00019910725625365045,
+      "loss": 0.2991,
+      "step": 862
+    },
+    {
+      "epoch": 0.18124540585949805,
+      "grad_norm": 0.9041693210601807,
+      "learning_rate": 0.0001991026195585733,
+      "loss": 0.3111,
+      "step": 863
+    },
+    {
+      "epoch": 0.18145542371101545,
+      "grad_norm": 0.6296244263648987,
+      "learning_rate": 0.00019909797090799644,
+      "loss": 0.2272,
+      "step": 864
+    },
+    {
+      "epoch": 0.1816654415625328,
+      "grad_norm": 0.6931461691856384,
+      "learning_rate": 0.00019909331030248072,
+      "loss": 0.3503,
+      "step": 865
+    },
+    {
+      "epoch": 0.1818754594140502,
+      "grad_norm": 0.7656722664833069,
+      "learning_rate": 0.00019908863774258827,
+      "loss": 0.3773,
+      "step": 866
+    },
+    {
+      "epoch": 0.18208547726556756,
+      "grad_norm": 0.6011465787887573,
+      "learning_rate": 0.00019908395322888294,
+      "loss": 0.2101,
+      "step": 867
+    },
+    {
+      "epoch": 0.18229549511708495,
+      "grad_norm": 0.6926429867744446,
+      "learning_rate": 0.0001990792567619297,
+      "loss": 0.2027,
+      "step": 868
+    },
+    {
+      "epoch": 0.18250551296860232,
+      "grad_norm": 0.5799981355667114,
+      "learning_rate": 0.00019907454834229525,
+      "loss": 0.2129,
+      "step": 869
+    },
+    {
+      "epoch": 0.1827155308201197,
+      "grad_norm": 0.5605289936065674,
+      "learning_rate": 0.0001990698279705475,
+      "loss": 0.2104,
+      "step": 870
+    },
+    {
+      "epoch": 0.1829255486716371,
+      "grad_norm": 0.9048646092414856,
+      "learning_rate": 0.00019906509564725596,
+      "loss": 0.4131,
+      "step": 871
+    },
+    {
+      "epoch": 0.18313556652315446,
+      "grad_norm": 0.6802535057067871,
+      "learning_rate": 0.0001990603513729915,
+      "loss": 0.2715,
+      "step": 872
+    },
+    {
+      "epoch": 0.18334558437467185,
+      "grad_norm": 0.4949076771736145,
+      "learning_rate": 0.0001990555951483265,
+      "loss": 0.1725,
+      "step": 873
+    },
+    {
+      "epoch": 0.18355560222618922,
+      "grad_norm": 0.6589632034301758,
+      "learning_rate": 0.0001990508269738347,
+      "loss": 0.1424,
+      "step": 874
+    },
+    {
+      "epoch": 0.1837656200777066,
+      "grad_norm": 0.5366025567054749,
+      "learning_rate": 0.00019904604685009133,
+      "loss": 0.1578,
+      "step": 875
+    },
+    {
+      "epoch": 0.18397563792922397,
+      "grad_norm": 0.584173858165741,
+      "learning_rate": 0.00019904125477767303,
+      "loss": 0.2381,
+      "step": 876
+    },
+    {
+      "epoch": 0.18418565578074136,
+      "grad_norm": 0.6884530186653137,
+      "learning_rate": 0.00019903645075715798,
+      "loss": 0.2043,
+      "step": 877
+    },
+    {
+      "epoch": 0.18439567363225876,
+      "grad_norm": 0.6070178747177124,
+      "learning_rate": 0.00019903163478912563,
+      "loss": 0.1792,
+      "step": 878
+    },
+    {
+      "epoch": 0.18460569148377612,
+      "grad_norm": 0.6375721096992493,
+      "learning_rate": 0.00019902680687415705,
+      "loss": 0.218,
+      "step": 879
+    },
+    {
+      "epoch": 0.1848157093352935,
+      "grad_norm": 0.564017653465271,
+      "learning_rate": 0.0001990219670128346,
+      "loss": 0.1885,
+      "step": 880
+    },
+    {
+      "epoch": 0.18502572718681087,
+      "grad_norm": 0.6779912710189819,
+      "learning_rate": 0.0001990171152057422,
+      "loss": 0.1949,
+      "step": 881
+    },
+    {
+      "epoch": 0.18523574503832826,
+      "grad_norm": 0.6086128950119019,
+      "learning_rate": 0.0001990122514534651,
+      "loss": 0.1818,
+      "step": 882
+    },
+    {
+      "epoch": 0.18544576288984563,
+      "grad_norm": 0.4768702983856201,
+      "learning_rate": 0.0001990073757565901,
+      "loss": 0.1459,
+      "step": 883
+    },
+    {
+      "epoch": 0.18565578074136302,
+      "grad_norm": 0.5171164870262146,
+      "learning_rate": 0.0001990024881157054,
+      "loss": 0.1624,
+      "step": 884
+    },
+    {
+      "epoch": 0.18586579859288038,
+      "grad_norm": 0.6542419195175171,
+      "learning_rate": 0.00019899758853140064,
+      "loss": 0.2035,
+      "step": 885
+    },
+    {
+      "epoch": 0.18607581644439777,
+      "grad_norm": 0.7479321956634521,
+      "learning_rate": 0.0001989926770042668,
+      "loss": 0.3654,
+      "step": 886
+    },
+    {
+      "epoch": 0.18628583429591516,
+      "grad_norm": 0.7446826696395874,
+      "learning_rate": 0.0001989877535348965,
+      "loss": 0.236,
+      "step": 887
+    },
+    {
+      "epoch": 0.18649585214743253,
+      "grad_norm": 0.5898016691207886,
+      "learning_rate": 0.00019898281812388366,
+      "loss": 0.2013,
+      "step": 888
+    },
+    {
+      "epoch": 0.18670586999894992,
+      "grad_norm": 0.6942265629768372,
+      "learning_rate": 0.00019897787077182368,
+      "loss": 0.1912,
+      "step": 889
+    },
+    {
+      "epoch": 0.18691588785046728,
+      "grad_norm": 0.7095215320587158,
+      "learning_rate": 0.0001989729114793134,
+      "loss": 0.2031,
+      "step": 890
+    },
+    {
+      "epoch": 0.18712590570198467,
+      "grad_norm": 0.49590814113616943,
+      "learning_rate": 0.00019896794024695108,
+      "loss": 0.1848,
+      "step": 891
+    },
+    {
+      "epoch": 0.18733592355350204,
+      "grad_norm": 0.3615363836288452,
+      "learning_rate": 0.00019896295707533642,
+      "loss": 0.1357,
+      "step": 892
+    },
+    {
+      "epoch": 0.18754594140501943,
+      "grad_norm": 0.540952205657959,
+      "learning_rate": 0.00019895796196507063,
+      "loss": 0.1622,
+      "step": 893
+    },
+    {
+      "epoch": 0.1877559592565368,
+      "grad_norm": 0.6152564883232117,
+      "learning_rate": 0.00019895295491675628,
+      "loss": 0.2229,
+      "step": 894
+    },
+    {
+      "epoch": 0.18796597710805418,
+      "grad_norm": 0.6287555694580078,
+      "learning_rate": 0.0001989479359309974,
+      "loss": 0.1855,
+      "step": 895
+    },
+    {
+      "epoch": 0.18817599495957157,
+      "grad_norm": 0.6615211963653564,
+      "learning_rate": 0.00019894290500839946,
+      "loss": 0.2001,
+      "step": 896
+    },
+    {
+      "epoch": 0.18838601281108894,
+      "grad_norm": 0.6587905883789062,
+      "learning_rate": 0.00019893786214956945,
+      "loss": 0.2368,
+      "step": 897
+    },
+    {
+      "epoch": 0.18859603066260633,
+      "grad_norm": 0.3502175807952881,
+      "learning_rate": 0.00019893280735511565,
+      "loss": 0.1203,
+      "step": 898
+    },
+    {
+      "epoch": 0.1888060485141237,
+      "grad_norm": 0.6989165544509888,
+      "learning_rate": 0.00019892774062564786,
+      "loss": 0.2108,
+      "step": 899
+    },
+    {
+      "epoch": 0.18901606636564108,
+      "grad_norm": 0.5993213057518005,
+      "learning_rate": 0.00019892266196177736,
+      "loss": 0.2667,
+      "step": 900
+    },
+    {
+      "epoch": 0.18922608421715845,
+      "grad_norm": 0.6625016331672668,
+      "learning_rate": 0.0001989175713641168,
+      "loss": 0.3081,
+      "step": 901
+    },
+    {
+      "epoch": 0.18943610206867584,
+      "grad_norm": 0.6831103563308716,
+      "learning_rate": 0.0001989124688332803,
+      "loss": 0.2826,
+      "step": 902
+    },
+    {
+      "epoch": 0.18964611992019323,
+      "grad_norm": 0.6341603994369507,
+      "learning_rate": 0.00019890735436988347,
+      "loss": 0.2738,
+      "step": 903
+    },
+    {
+      "epoch": 0.1898561377717106,
+      "grad_norm": 0.6546643376350403,
+      "learning_rate": 0.0001989022279745432,
+      "loss": 0.3065,
+      "step": 904
+    },
+    {
+      "epoch": 0.19006615562322798,
+      "grad_norm": 0.7356497645378113,
+      "learning_rate": 0.000198897089647878,
+      "loss": 0.2955,
+      "step": 905
+    },
+    {
+      "epoch": 0.19027617347474535,
+      "grad_norm": 0.71455317735672,
+      "learning_rate": 0.00019889193939050777,
+      "loss": 0.2069,
+      "step": 906
+    },
+    {
+      "epoch": 0.19048619132626274,
+      "grad_norm": 0.7142229676246643,
+      "learning_rate": 0.00019888677720305374,
+      "loss": 0.3386,
+      "step": 907
+    },
+    {
+      "epoch": 0.1906962091777801,
+      "grad_norm": 0.6420140862464905,
+      "learning_rate": 0.00019888160308613874,
+      "loss": 0.2952,
+      "step": 908
+    },
+    {
+      "epoch": 0.1909062270292975,
+      "grad_norm": 0.757895290851593,
+      "learning_rate": 0.00019887641704038688,
+      "loss": 0.299,
+      "step": 909
+    },
+    {
+      "epoch": 0.19111624488081486,
+      "grad_norm": 0.5329816937446594,
+      "learning_rate": 0.00019887121906642387,
+      "loss": 0.2005,
+      "step": 910
+    },
+    {
+      "epoch": 0.19132626273233225,
+      "grad_norm": 0.5069072842597961,
+      "learning_rate": 0.00019886600916487677,
+      "loss": 0.1971,
+      "step": 911
+    },
+    {
+      "epoch": 0.19153628058384964,
+      "grad_norm": 0.7712031602859497,
+      "learning_rate": 0.00019886078733637408,
+      "loss": 0.2952,
+      "step": 912
+    },
+    {
+      "epoch": 0.191746298435367,
+      "grad_norm": 0.6340819001197815,
+      "learning_rate": 0.00019885555358154574,
+      "loss": 0.2403,
+      "step": 913
+    },
+    {
+      "epoch": 0.1919563162868844,
+      "grad_norm": 0.707127034664154,
+      "learning_rate": 0.0001988503079010231,
+      "loss": 0.262,
+      "step": 914
+    },
+    {
+      "epoch": 0.19216633413840176,
+      "grad_norm": 0.5502609014511108,
+      "learning_rate": 0.00019884505029543908,
+      "loss": 0.1767,
+      "step": 915
+    },
+    {
+      "epoch": 0.19237635198991915,
+      "grad_norm": 0.6637031435966492,
+      "learning_rate": 0.00019883978076542787,
+      "loss": 0.317,
+      "step": 916
+    },
+    {
+      "epoch": 0.1925863698414365,
+      "grad_norm": 0.5921664237976074,
+      "learning_rate": 0.00019883449931162517,
+      "loss": 0.2848,
+      "step": 917
+    },
+    {
+      "epoch": 0.1927963876929539,
+      "grad_norm": 0.8460182547569275,
+      "learning_rate": 0.0001988292059346682,
+      "loss": 0.2741,
+      "step": 918
+    },
+    {
+      "epoch": 0.1930064055444713,
+      "grad_norm": 0.7577118277549744,
+      "learning_rate": 0.00019882390063519543,
+      "loss": 0.2589,
+      "step": 919
+    },
+    {
+      "epoch": 0.19321642339598866,
+      "grad_norm": 0.5957863330841064,
+      "learning_rate": 0.00019881858341384696,
+      "loss": 0.1834,
+      "step": 920
+    },
+    {
+      "epoch": 0.19342644124750605,
+      "grad_norm": 0.6584639549255371,
+      "learning_rate": 0.00019881325427126422,
+      "loss": 0.232,
+      "step": 921
+    },
+    {
+      "epoch": 0.1936364590990234,
+      "grad_norm": 0.6941714882850647,
+      "learning_rate": 0.0001988079132080901,
+      "loss": 0.2514,
+      "step": 922
+    },
+    {
+      "epoch": 0.1938464769505408,
+      "grad_norm": 0.829231321811676,
+      "learning_rate": 0.00019880256022496897,
+      "loss": 0.2023,
+      "step": 923
+    },
+    {
+      "epoch": 0.19405649480205817,
+      "grad_norm": 0.6720934510231018,
+      "learning_rate": 0.00019879719532254654,
+      "loss": 0.2535,
+      "step": 924
+    },
+    {
+      "epoch": 0.19426651265357556,
+      "grad_norm": 0.7159935832023621,
+      "learning_rate": 0.00019879181850147005,
+      "loss": 0.3129,
+      "step": 925
+    },
+    {
+      "epoch": 0.19447653050509292,
+      "grad_norm": 0.6411039233207703,
+      "learning_rate": 0.00019878642976238817,
+      "loss": 0.1729,
+      "step": 926
+    },
+    {
+      "epoch": 0.1946865483566103,
+      "grad_norm": 0.7253606915473938,
+      "learning_rate": 0.00019878102910595095,
+      "loss": 0.2599,
+      "step": 927
+    },
+    {
+      "epoch": 0.1948965662081277,
+      "grad_norm": 0.6732550859451294,
+      "learning_rate": 0.0001987756165328099,
+      "loss": 0.1881,
+      "step": 928
+    },
+    {
+      "epoch": 0.19510658405964507,
+      "grad_norm": 0.6675817966461182,
+      "learning_rate": 0.00019877019204361804,
+      "loss": 0.2417,
+      "step": 929
+    },
+    {
+      "epoch": 0.19531660191116246,
+      "grad_norm": 0.5525332093238831,
+      "learning_rate": 0.0001987647556390297,
+      "loss": 0.2445,
+      "step": 930
+    },
+    {
+      "epoch": 0.19552661976267982,
+      "grad_norm": 0.7800937294960022,
+      "learning_rate": 0.00019875930731970076,
+      "loss": 0.2401,
+      "step": 931
+    },
+    {
+      "epoch": 0.1957366376141972,
+      "grad_norm": 0.5669112205505371,
+      "learning_rate": 0.00019875384708628848,
+      "loss": 0.1925,
+      "step": 932
+    },
+    {
+      "epoch": 0.19594665546571458,
+      "grad_norm": 0.6367275714874268,
+      "learning_rate": 0.00019874837493945156,
+      "loss": 0.205,
+      "step": 933
+    },
+    {
+      "epoch": 0.19615667331723197,
+      "grad_norm": 0.6173298954963684,
+      "learning_rate": 0.00019874289087985013,
+      "loss": 0.2426,
+      "step": 934
+    },
+    {
+      "epoch": 0.19636669116874933,
+      "grad_norm": 0.7045214176177979,
+      "learning_rate": 0.00019873739490814583,
+      "loss": 0.1647,
+      "step": 935
+    },
+    {
+      "epoch": 0.19657670902026672,
+      "grad_norm": 0.5824179649353027,
+      "learning_rate": 0.00019873188702500163,
+      "loss": 0.1527,
+      "step": 936
+    },
+    {
+      "epoch": 0.1967867268717841,
+      "grad_norm": 0.585749626159668,
+      "learning_rate": 0.000198726367231082,
+      "loss": 0.2119,
+      "step": 937
+    },
+    {
+      "epoch": 0.19699674472330148,
+      "grad_norm": 0.679140031337738,
+      "learning_rate": 0.00019872083552705284,
+      "loss": 0.2037,
+      "step": 938
+    },
+    {
+      "epoch": 0.19720676257481887,
+      "grad_norm": 0.3865984380245209,
+      "learning_rate": 0.0001987152919135815,
+      "loss": 0.1508,
+      "step": 939
+    },
+    {
+      "epoch": 0.19741678042633623,
+      "grad_norm": 0.5994648933410645,
+      "learning_rate": 0.0001987097363913367,
+      "loss": 0.1536,
+      "step": 940
+    },
+    {
+      "epoch": 0.19762679827785362,
+      "grad_norm": 0.8374373912811279,
+      "learning_rate": 0.0001987041689609887,
+      "loss": 0.3113,
+      "step": 941
+    },
+    {
+      "epoch": 0.19783681612937098,
+      "grad_norm": 0.4448517858982086,
+      "learning_rate": 0.0001986985896232091,
+      "loss": 0.1523,
+      "step": 942
+    },
+    {
+      "epoch": 0.19804683398088838,
+      "grad_norm": 0.5031003952026367,
+      "learning_rate": 0.00019869299837867098,
+      "loss": 0.2351,
+      "step": 943
+    },
+    {
+      "epoch": 0.19825685183240577,
+      "grad_norm": 0.8319448232650757,
+      "learning_rate": 0.0001986873952280489,
+      "loss": 0.2768,
+      "step": 944
+    },
+    {
+      "epoch": 0.19846686968392313,
+      "grad_norm": 0.4768364429473877,
+      "learning_rate": 0.00019868178017201874,
+      "loss": 0.2041,
+      "step": 945
+    },
+    {
+      "epoch": 0.19867688753544052,
+      "grad_norm": 0.36797624826431274,
+      "learning_rate": 0.00019867615321125795,
+      "loss": 0.1703,
+      "step": 946
+    },
+    {
+      "epoch": 0.19888690538695789,
+      "grad_norm": 0.629489541053772,
+      "learning_rate": 0.0001986705143464453,
+      "loss": 0.1989,
+      "step": 947
+    },
+    {
+      "epoch": 0.19909692323847528,
+      "grad_norm": 0.757764458656311,
+      "learning_rate": 0.00019866486357826107,
+      "loss": 0.1972,
+      "step": 948
+    },
+    {
+      "epoch": 0.19930694108999264,
+      "grad_norm": 0.884556770324707,
+      "learning_rate": 0.00019865920090738698,
+      "loss": 0.2592,
+      "step": 949
+    },
+    {
+      "epoch": 0.19951695894151003,
+      "grad_norm": 0.5489534139633179,
+      "learning_rate": 0.00019865352633450614,
+      "loss": 0.2075,
+      "step": 950
+    },
+    {
+      "epoch": 0.1997269767930274,
+      "grad_norm": 0.6485860347747803,
+      "learning_rate": 0.00019864783986030314,
+      "loss": 0.3648,
+      "step": 951
+    },
+    {
+      "epoch": 0.19993699464454479,
+      "grad_norm": 0.8612170219421387,
+      "learning_rate": 0.00019864214148546393,
+      "loss": 0.2175,
+      "step": 952
+    },
+    {
+      "epoch": 0.20014701249606218,
+      "grad_norm": 0.6336376070976257,
+      "learning_rate": 0.00019863643121067597,
+      "loss": 0.2935,
+      "step": 953
+    },
+    {
+      "epoch": 0.20035703034757954,
+      "grad_norm": 0.7330135703086853,
+      "learning_rate": 0.00019863070903662817,
+      "loss": 0.4322,
+      "step": 954
+    },
+    {
+      "epoch": 0.20056704819909693,
+      "grad_norm": 0.6464625000953674,
+      "learning_rate": 0.0001986249749640108,
+      "loss": 0.242,
+      "step": 955
+    },
+    {
+      "epoch": 0.2007770660506143,
+      "grad_norm": 0.6884174942970276,
+      "learning_rate": 0.00019861922899351561,
+      "loss": 0.3043,
+      "step": 956
+    },
+    {
+      "epoch": 0.20098708390213169,
+      "grad_norm": 0.4948609471321106,
+      "learning_rate": 0.0001986134711258358,
+      "loss": 0.1735,
+      "step": 957
+    },
+    {
+      "epoch": 0.20119710175364905,
+      "grad_norm": 0.9207262396812439,
+      "learning_rate": 0.00019860770136166596,
+      "loss": 0.2473,
+      "step": 958
+    },
+    {
+      "epoch": 0.20140711960516644,
+      "grad_norm": 0.6444927453994751,
+      "learning_rate": 0.00019860191970170216,
+      "loss": 0.2995,
+      "step": 959
+    },
+    {
+      "epoch": 0.20161713745668383,
+      "grad_norm": 0.8041002750396729,
+      "learning_rate": 0.00019859612614664185,
+      "loss": 0.3079,
+      "step": 960
+    },
+    {
+      "epoch": 0.2018271553082012,
+      "grad_norm": 0.520293653011322,
+      "learning_rate": 0.000198590320697184,
+      "loss": 0.2038,
+      "step": 961
+    },
+    {
+      "epoch": 0.20203717315971859,
+      "grad_norm": 0.6968462467193604,
+      "learning_rate": 0.00019858450335402897,
+      "loss": 0.2791,
+      "step": 962
+    },
+    {
+      "epoch": 0.20224719101123595,
+      "grad_norm": 0.5260444283485413,
+      "learning_rate": 0.00019857867411787847,
+      "loss": 0.2164,
+      "step": 963
+    },
+    {
+      "epoch": 0.20245720886275334,
+      "grad_norm": 0.7742235660552979,
+      "learning_rate": 0.0001985728329894358,
+      "loss": 0.3005,
+      "step": 964
+    },
+    {
+      "epoch": 0.2026672267142707,
+      "grad_norm": 0.4388875961303711,
+      "learning_rate": 0.0001985669799694056,
+      "loss": 0.2046,
+      "step": 965
+    },
+    {
+      "epoch": 0.2028772445657881,
+      "grad_norm": 0.8159006237983704,
+      "learning_rate": 0.00019856111505849394,
+      "loss": 0.2219,
+      "step": 966
+    },
+    {
+      "epoch": 0.20308726241730546,
+      "grad_norm": 0.5616422295570374,
+      "learning_rate": 0.0001985552382574084,
+      "loss": 0.3792,
+      "step": 967
+    },
+    {
+      "epoch": 0.20329728026882285,
+      "grad_norm": 0.5863935351371765,
+      "learning_rate": 0.00019854934956685792,
+      "loss": 0.2077,
+      "step": 968
+    },
+    {
+      "epoch": 0.20350729812034024,
+      "grad_norm": 0.5828328728675842,
+      "learning_rate": 0.00019854344898755287,
+      "loss": 0.273,
+      "step": 969
+    },
+    {
+      "epoch": 0.2037173159718576,
+      "grad_norm": 0.5963171124458313,
+      "learning_rate": 0.00019853753652020507,
+      "loss": 0.2407,
+      "step": 970
+    },
+    {
+      "epoch": 0.203927333823375,
+      "grad_norm": 0.5114577412605286,
+      "learning_rate": 0.00019853161216552788,
+      "loss": 0.1663,
+      "step": 971
+    },
+    {
+      "epoch": 0.20413735167489236,
+      "grad_norm": 0.5106688737869263,
+      "learning_rate": 0.0001985256759242359,
+      "loss": 0.1823,
+      "step": 972
+    },
+    {
+      "epoch": 0.20434736952640975,
+      "grad_norm": 0.5732094645500183,
+      "learning_rate": 0.00019851972779704534,
+      "loss": 0.2206,
+      "step": 973
+    },
+    {
+      "epoch": 0.2045573873779271,
+      "grad_norm": 0.5627723932266235,
+      "learning_rate": 0.00019851376778467366,
+      "loss": 0.1715,
+      "step": 974
+    },
+    {
+      "epoch": 0.2047674052294445,
+      "grad_norm": 0.7939655184745789,
+      "learning_rate": 0.00019850779588783998,
+      "loss": 0.1669,
+      "step": 975
+    },
+    {
+      "epoch": 0.20497742308096187,
+      "grad_norm": 0.5675683617591858,
+      "learning_rate": 0.00019850181210726467,
+      "loss": 0.1696,
+      "step": 976
+    },
+    {
+      "epoch": 0.20518744093247926,
+      "grad_norm": 0.9706809520721436,
+      "learning_rate": 0.00019849581644366965,
+      "loss": 0.376,
+      "step": 977
+    },
+    {
+      "epoch": 0.20539745878399665,
+      "grad_norm": 0.6787039041519165,
+      "learning_rate": 0.00019848980889777815,
+      "loss": 0.1528,
+      "step": 978
+    },
+    {
+      "epoch": 0.205607476635514,
+      "grad_norm": 0.8098447918891907,
+      "learning_rate": 0.00019848378947031492,
+      "loss": 0.1659,
+      "step": 979
+    },
+    {
+      "epoch": 0.2058174944870314,
+      "grad_norm": 0.46255457401275635,
+      "learning_rate": 0.0001984777581620062,
+      "loss": 0.1445,
+      "step": 980
+    },
+    {
+      "epoch": 0.20602751233854877,
+      "grad_norm": 0.7909157872200012,
+      "learning_rate": 0.00019847171497357953,
+      "loss": 0.2294,
+      "step": 981
+    },
+    {
+      "epoch": 0.20623753019006616,
+      "grad_norm": 0.6739736795425415,
+      "learning_rate": 0.00019846565990576392,
+      "loss": 0.2624,
+      "step": 982
+    },
+    {
+      "epoch": 0.20644754804158352,
+      "grad_norm": 0.7638704180717468,
+      "learning_rate": 0.00019845959295928994,
+      "loss": 0.2863,
+      "step": 983
+    },
+    {
+      "epoch": 0.20665756589310091,
+      "grad_norm": 0.48239898681640625,
+      "learning_rate": 0.00019845351413488939,
+      "loss": 0.1564,
+      "step": 984
+    },
+    {
+      "epoch": 0.2068675837446183,
+      "grad_norm": 0.6511039137840271,
+      "learning_rate": 0.00019844742343329568,
+      "loss": 0.1856,
+      "step": 985
+    },
+    {
+      "epoch": 0.20707760159613567,
+      "grad_norm": 0.48949161171913147,
+      "learning_rate": 0.0001984413208552435,
+      "loss": 0.1862,
+      "step": 986
+    },
+    {
+      "epoch": 0.20728761944765306,
+      "grad_norm": 0.6529719829559326,
+      "learning_rate": 0.0001984352064014691,
+      "loss": 0.2008,
+      "step": 987
+    },
+    {
+      "epoch": 0.20749763729917042,
+      "grad_norm": 0.5295738577842712,
+      "learning_rate": 0.00019842908007271012,
+      "loss": 0.2141,
+      "step": 988
+    },
+    {
+      "epoch": 0.20770765515068781,
+      "grad_norm": 0.5440765023231506,
+      "learning_rate": 0.00019842294186970562,
+      "loss": 0.264,
+      "step": 989
+    },
+    {
+      "epoch": 0.20791767300220518,
+      "grad_norm": 0.5315092206001282,
+      "learning_rate": 0.00019841679179319606,
+      "loss": 0.2116,
+      "step": 990
+    },
+    {
+      "epoch": 0.20812769085372257,
+      "grad_norm": 0.4537929892539978,
+      "learning_rate": 0.0001984106298439234,
+      "loss": 0.1269,
+      "step": 991
+    },
+    {
+      "epoch": 0.20833770870523993,
+      "grad_norm": 0.5806244015693665,
+      "learning_rate": 0.000198404456022631,
+      "loss": 0.1808,
+      "step": 992
+    },
+    {
+      "epoch": 0.20854772655675732,
+      "grad_norm": 0.5772647261619568,
+      "learning_rate": 0.00019839827033006372,
+      "loss": 0.1637,
+      "step": 993
+    },
+    {
+      "epoch": 0.20875774440827471,
+      "grad_norm": 0.4130006432533264,
+      "learning_rate": 0.00019839207276696764,
+      "loss": 0.1398,
+      "step": 994
+    },
+    {
+      "epoch": 0.20896776225979208,
+      "grad_norm": 0.47043028473854065,
+      "learning_rate": 0.00019838586333409056,
+      "loss": 0.1209,
+      "step": 995
+    },
+    {
+      "epoch": 0.20917778011130947,
+      "grad_norm": 0.713445782661438,
+      "learning_rate": 0.00019837964203218148,
+      "loss": 0.2175,
+      "step": 996
+    },
+    {
+      "epoch": 0.20938779796282683,
+      "grad_norm": 0.7947505116462708,
+      "learning_rate": 0.00019837340886199096,
+      "loss": 0.3172,
+      "step": 997
+    },
+    {
+      "epoch": 0.20959781581434422,
+      "grad_norm": 0.7544185519218445,
+      "learning_rate": 0.00019836716382427096,
+      "loss": 0.2506,
+      "step": 998
+    },
+    {
+      "epoch": 0.2098078336658616,
+      "grad_norm": 0.8411846160888672,
+      "learning_rate": 0.00019836090691977484,
+      "loss": 0.2619,
+      "step": 999
+    },
+    {
+      "epoch": 0.21001785151737898,
+      "grad_norm": 0.7358798384666443,
+      "learning_rate": 0.00019835463814925745,
+      "loss": 0.264,
+      "step": 1000
+    },
+    {
+      "epoch": 0.21022786936889637,
+      "grad_norm": 0.623121440410614,
+      "learning_rate": 0.00019834835751347503,
+      "loss": 0.4566,
+      "step": 1001
+    },
+    {
+      "epoch": 0.21043788722041373,
+      "grad_norm": 0.6662508845329285,
+      "learning_rate": 0.00019834206501318524,
+      "loss": 0.232,
+      "step": 1002
+    },
+    {
+      "epoch": 0.21064790507193112,
+      "grad_norm": 0.7510089874267578,
+      "learning_rate": 0.00019833576064914722,
+      "loss": 0.2207,
+      "step": 1003
+    },
+    {
+      "epoch": 0.2108579229234485,
+      "grad_norm": 0.6487518548965454,
+      "learning_rate": 0.0001983294444221215,
+      "loss": 0.2665,
+      "step": 1004
+    },
+    {
+      "epoch": 0.21106794077496588,
+      "grad_norm": 0.4078707695007324,
+      "learning_rate": 0.00019832311633287002,
+      "loss": 0.2028,
+      "step": 1005
+    },
+    {
+      "epoch": 0.21127795862648324,
+      "grad_norm": 0.7619323134422302,
+      "learning_rate": 0.00019831677638215624,
+      "loss": 0.29,
+      "step": 1006
+    },
+    {
+      "epoch": 0.21148797647800063,
+      "grad_norm": 0.6697717308998108,
+      "learning_rate": 0.00019831042457074498,
+      "loss": 0.2623,
+      "step": 1007
+    },
+    {
+      "epoch": 0.211697994329518,
+      "grad_norm": 0.6049818396568298,
+      "learning_rate": 0.0001983040608994025,
+      "loss": 0.2045,
+      "step": 1008
+    },
+    {
+      "epoch": 0.2119080121810354,
+      "grad_norm": 0.7906011343002319,
+      "learning_rate": 0.0001982976853688965,
+      "loss": 0.244,
+      "step": 1009
+    },
+    {
+      "epoch": 0.21211803003255278,
+      "grad_norm": 0.44965484738349915,
+      "learning_rate": 0.0001982912979799961,
+      "loss": 0.2391,
+      "step": 1010
+    },
+    {
+      "epoch": 0.21232804788407014,
+      "grad_norm": 0.9092258214950562,
+      "learning_rate": 0.00019828489873347188,
+      "loss": 0.3971,
+      "step": 1011
+    },
+    {
+      "epoch": 0.21253806573558753,
+      "grad_norm": 0.6709781289100647,
+      "learning_rate": 0.0001982784876300958,
+      "loss": 0.1851,
+      "step": 1012
+    },
+    {
+      "epoch": 0.2127480835871049,
+      "grad_norm": 0.608507513999939,
+      "learning_rate": 0.00019827206467064133,
+      "loss": 0.2602,
+      "step": 1013
+    },
+    {
+      "epoch": 0.2129581014386223,
+      "grad_norm": 0.7793399095535278,
+      "learning_rate": 0.00019826562985588328,
+      "loss": 0.288,
+      "step": 1014
+    },
+    {
+      "epoch": 0.21316811929013965,
+      "grad_norm": 0.8137920498847961,
+      "learning_rate": 0.00019825918318659792,
+      "loss": 0.2724,
+      "step": 1015
+    },
+    {
+      "epoch": 0.21337813714165704,
+      "grad_norm": 0.7229858636856079,
+      "learning_rate": 0.000198252724663563,
+      "loss": 0.1762,
+      "step": 1016
+    },
+    {
+      "epoch": 0.2135881549931744,
+      "grad_norm": 0.7144889831542969,
+      "learning_rate": 0.0001982462542875576,
+      "loss": 0.2199,
+      "step": 1017
+    },
+    {
+      "epoch": 0.2137981728446918,
+      "grad_norm": 0.5533698797225952,
+      "learning_rate": 0.00019823977205936236,
+      "loss": 0.1532,
+      "step": 1018
+    },
+    {
+      "epoch": 0.2140081906962092,
+      "grad_norm": 0.6681041717529297,
+      "learning_rate": 0.00019823327797975927,
+      "loss": 0.2221,
+      "step": 1019
+    },
+    {
+      "epoch": 0.21421820854772655,
+      "grad_norm": 0.7258886098861694,
+      "learning_rate": 0.0001982267720495317,
+      "loss": 0.2249,
+      "step": 1020
+    },
+    {
+      "epoch": 0.21442822639924394,
+      "grad_norm": 0.7201298475265503,
+      "learning_rate": 0.00019822025426946457,
+      "loss": 0.3041,
+      "step": 1021
+    },
+    {
+      "epoch": 0.2146382442507613,
+      "grad_norm": 0.5295907855033875,
+      "learning_rate": 0.00019821372464034416,
+      "loss": 0.1514,
+      "step": 1022
+    },
+    {
+      "epoch": 0.2148482621022787,
+      "grad_norm": 0.6460062861442566,
+      "learning_rate": 0.00019820718316295816,
+      "loss": 0.2194,
+      "step": 1023
+    },
+    {
+      "epoch": 0.21505827995379606,
+      "grad_norm": 0.6456478834152222,
+      "learning_rate": 0.00019820062983809576,
+      "loss": 0.1645,
+      "step": 1024
+    },
+    {
+      "epoch": 0.21526829780531345,
+      "grad_norm": 0.5274572372436523,
+      "learning_rate": 0.0001981940646665475,
+      "loss": 0.1434,
+      "step": 1025
+    },
+    {
+      "epoch": 0.21547831565683084,
+      "grad_norm": 0.7552756667137146,
+      "learning_rate": 0.00019818748764910537,
+      "loss": 0.3032,
+      "step": 1026
+    },
+    {
+      "epoch": 0.2156883335083482,
+      "grad_norm": 0.4831709861755371,
+      "learning_rate": 0.00019818089878656287,
+      "loss": 0.2132,
+      "step": 1027
+    },
+    {
+      "epoch": 0.2158983513598656,
+      "grad_norm": 0.8715883493423462,
+      "learning_rate": 0.00019817429807971482,
+      "loss": 0.2465,
+      "step": 1028
+    },
+    {
+      "epoch": 0.21610836921138296,
+      "grad_norm": 0.7408828735351562,
+      "learning_rate": 0.0001981676855293575,
+      "loss": 0.2652,
+      "step": 1029
+    },
+    {
+      "epoch": 0.21631838706290035,
+      "grad_norm": 0.4847085177898407,
+      "learning_rate": 0.00019816106113628866,
+      "loss": 0.239,
+      "step": 1030
+    },
+    {
+      "epoch": 0.21652840491441772,
+      "grad_norm": 0.6377763152122498,
+      "learning_rate": 0.00019815442490130747,
+      "loss": 0.2536,
+      "step": 1031
+    },
+    {
+      "epoch": 0.2167384227659351,
+      "grad_norm": 0.6141781806945801,
+      "learning_rate": 0.00019814777682521445,
+      "loss": 0.1938,
+      "step": 1032
+    },
+    {
+      "epoch": 0.21694844061745247,
+      "grad_norm": 0.8136757016181946,
+      "learning_rate": 0.0001981411169088117,
+      "loss": 0.2595,
+      "step": 1033
+    },
+    {
+      "epoch": 0.21715845846896986,
+      "grad_norm": 0.645139217376709,
+      "learning_rate": 0.00019813444515290253,
+      "loss": 0.2553,
+      "step": 1034
+    },
+    {
+      "epoch": 0.21736847632048725,
+      "grad_norm": 0.4579085409641266,
+      "learning_rate": 0.00019812776155829194,
+      "loss": 0.1846,
+      "step": 1035
+    },
+    {
+      "epoch": 0.21757849417200462,
+      "grad_norm": 0.6354373097419739,
+      "learning_rate": 0.0001981210661257861,
+      "loss": 0.1565,
+      "step": 1036
+    },
+    {
+      "epoch": 0.217788512023522,
+      "grad_norm": 0.7238495945930481,
+      "learning_rate": 0.0001981143588561928,
+      "loss": 0.1602,
+      "step": 1037
+    },
+    {
+      "epoch": 0.21799852987503937,
+      "grad_norm": 0.4990311563014984,
+      "learning_rate": 0.00019810763975032118,
+      "loss": 0.1736,
+      "step": 1038
+    },
+    {
+      "epoch": 0.21820854772655676,
+      "grad_norm": 0.6193257570266724,
+      "learning_rate": 0.0001981009088089818,
+      "loss": 0.2047,
+      "step": 1039
+    },
+    {
+      "epoch": 0.21841856557807413,
+      "grad_norm": 0.6221904158592224,
+      "learning_rate": 0.0001980941660329867,
+      "loss": 0.18,
+      "step": 1040
+    },
+    {
+      "epoch": 0.21862858342959152,
+      "grad_norm": 0.5321659445762634,
+      "learning_rate": 0.0001980874114231493,
+      "loss": 0.1644,
+      "step": 1041
+    },
+    {
+      "epoch": 0.2188386012811089,
+      "grad_norm": 0.6771279573440552,
+      "learning_rate": 0.00019808064498028443,
+      "loss": 0.2196,
+      "step": 1042
+    },
+    {
+      "epoch": 0.21904861913262627,
+      "grad_norm": 0.5956505537033081,
+      "learning_rate": 0.00019807386670520836,
+      "loss": 0.2051,
+      "step": 1043
+    },
+    {
+      "epoch": 0.21925863698414366,
+      "grad_norm": 0.7111203670501709,
+      "learning_rate": 0.00019806707659873887,
+      "loss": 0.1721,
+      "step": 1044
+    },
+    {
+      "epoch": 0.21946865483566103,
+      "grad_norm": 0.5506051182746887,
+      "learning_rate": 0.00019806027466169506,
+      "loss": 0.1351,
+      "step": 1045
+    },
+    {
+      "epoch": 0.21967867268717842,
+      "grad_norm": 0.5250877737998962,
+      "learning_rate": 0.00019805346089489753,
+      "loss": 0.2592,
+      "step": 1046
+    },
+    {
+      "epoch": 0.21988869053869578,
+      "grad_norm": 0.7672072649002075,
+      "learning_rate": 0.00019804663529916826,
+      "loss": 0.2551,
+      "step": 1047
+    },
+    {
+      "epoch": 0.22009870839021317,
+      "grad_norm": 0.3871646821498871,
+      "learning_rate": 0.00019803979787533064,
+      "loss": 0.1114,
+      "step": 1048
+    },
+    {
+      "epoch": 0.22030872624173053,
+      "grad_norm": 0.49425801634788513,
+      "learning_rate": 0.00019803294862420957,
+      "loss": 0.1579,
+      "step": 1049
+    },
+    {
+      "epoch": 0.22051874409324793,
+      "grad_norm": 0.8091879487037659,
+      "learning_rate": 0.0001980260875466313,
+      "loss": 0.3894,
+      "step": 1050
+    },
+    {
+      "epoch": 0.22072876194476532,
+      "grad_norm": 0.5933718085289001,
+      "learning_rate": 0.00019801921464342358,
+      "loss": 0.2827,
+      "step": 1051
+    },
+    {
+      "epoch": 0.22093877979628268,
+      "grad_norm": 0.8242902159690857,
+      "learning_rate": 0.00019801232991541548,
+      "loss": 0.4183,
+      "step": 1052
+    },
+    {
+      "epoch": 0.22114879764780007,
+      "grad_norm": 0.5886990427970886,
+      "learning_rate": 0.00019800543336343757,
+      "loss": 0.1928,
+      "step": 1053
+    },
+    {
+      "epoch": 0.22135881549931744,
+      "grad_norm": 0.559283435344696,
+      "learning_rate": 0.00019799852498832184,
+      "loss": 0.2042,
+      "step": 1054
+    },
+    {
+      "epoch": 0.22156883335083483,
+      "grad_norm": 0.4517320990562439,
+      "learning_rate": 0.0001979916047909017,
+      "loss": 0.1895,
+      "step": 1055
+    },
+    {
+      "epoch": 0.2217788512023522,
+      "grad_norm": 0.6435453295707703,
+      "learning_rate": 0.000197984672772012,
+      "loss": 0.2571,
+      "step": 1056
+    },
+    {
+      "epoch": 0.22198886905386958,
+      "grad_norm": 0.6028056740760803,
+      "learning_rate": 0.00019797772893248897,
+      "loss": 0.2509,
+      "step": 1057
+    },
+    {
+      "epoch": 0.22219888690538694,
+      "grad_norm": 0.5414546132087708,
+      "learning_rate": 0.00019797077327317033,
+      "loss": 0.2198,
+      "step": 1058
+    },
+    {
+      "epoch": 0.22240890475690434,
+      "grad_norm": 0.42511531710624695,
+      "learning_rate": 0.0001979638057948952,
+      "loss": 0.1837,
+      "step": 1059
+    },
+    {
+      "epoch": 0.22261892260842173,
+      "grad_norm": 0.7643983960151672,
+      "learning_rate": 0.00019795682649850408,
+      "loss": 0.2282,
+      "step": 1060
+    },
+    {
+      "epoch": 0.2228289404599391,
+      "grad_norm": 0.713973343372345,
+      "learning_rate": 0.00019794983538483894,
+      "loss": 0.2568,
+      "step": 1061
+    },
+    {
+      "epoch": 0.22303895831145648,
+      "grad_norm": 0.4897744655609131,
+      "learning_rate": 0.0001979428324547432,
+      "loss": 0.176,
+      "step": 1062
+    },
+    {
+      "epoch": 0.22324897616297384,
+      "grad_norm": 0.4883119463920593,
+      "learning_rate": 0.0001979358177090617,
+      "loss": 0.2891,
+      "step": 1063
+    },
+    {
+      "epoch": 0.22345899401449124,
+      "grad_norm": 0.7395027875900269,
+      "learning_rate": 0.0001979287911486406,
+      "loss": 0.1988,
+      "step": 1064
+    },
+    {
+      "epoch": 0.2236690118660086,
+      "grad_norm": 0.8084394931793213,
+      "learning_rate": 0.00019792175277432762,
+      "loss": 0.2925,
+      "step": 1065
+    },
+    {
+      "epoch": 0.223879029717526,
+      "grad_norm": 0.452434778213501,
+      "learning_rate": 0.00019791470258697188,
+      "loss": 0.1406,
+      "step": 1066
+    },
+    {
+      "epoch": 0.22408904756904338,
+      "grad_norm": 0.6661890745162964,
+      "learning_rate": 0.00019790764058742383,
+      "loss": 0.2275,
+      "step": 1067
+    },
+    {
+      "epoch": 0.22429906542056074,
+      "grad_norm": 0.6966044902801514,
+      "learning_rate": 0.00019790056677653547,
+      "loss": 0.2291,
+      "step": 1068
+    },
+    {
+      "epoch": 0.22450908327207814,
+      "grad_norm": 0.6009211540222168,
+      "learning_rate": 0.00019789348115516008,
+      "loss": 0.1446,
+      "step": 1069
+    },
+    {
+      "epoch": 0.2247191011235955,
+      "grad_norm": 0.6025848388671875,
+      "learning_rate": 0.0001978863837241526,
+      "loss": 0.1969,
+      "step": 1070
+    },
+    {
+      "epoch": 0.2249291189751129,
+      "grad_norm": 0.6915929913520813,
+      "learning_rate": 0.0001978792744843691,
+      "loss": 0.1574,
+      "step": 1071
+    },
+    {
+      "epoch": 0.22513913682663025,
+      "grad_norm": 0.6229645609855652,
+      "learning_rate": 0.00019787215343666732,
+      "loss": 0.3064,
+      "step": 1072
+    },
+    {
+      "epoch": 0.22534915467814765,
+      "grad_norm": 0.5794253945350647,
+      "learning_rate": 0.00019786502058190627,
+      "loss": 0.1497,
+      "step": 1073
+    },
+    {
+      "epoch": 0.225559172529665,
+      "grad_norm": 0.8496554493904114,
+      "learning_rate": 0.00019785787592094647,
+      "loss": 0.2076,
+      "step": 1074
+    },
+    {
+      "epoch": 0.2257691903811824,
+      "grad_norm": 0.6304386258125305,
+      "learning_rate": 0.0001978507194546498,
+      "loss": 0.2767,
+      "step": 1075
+    },
+    {
+      "epoch": 0.2259792082326998,
+      "grad_norm": 0.928598940372467,
+      "learning_rate": 0.00019784355118387966,
+      "loss": 0.2917,
+      "step": 1076
+    },
+    {
+      "epoch": 0.22618922608421715,
+      "grad_norm": 0.6034556031227112,
+      "learning_rate": 0.00019783637110950075,
+      "loss": 0.1979,
+      "step": 1077
+    },
+    {
+      "epoch": 0.22639924393573455,
+      "grad_norm": 0.6918156743049622,
+      "learning_rate": 0.0001978291792323793,
+      "loss": 0.2555,
+      "step": 1078
+    },
+    {
+      "epoch": 0.2266092617872519,
+      "grad_norm": 0.8922538757324219,
+      "learning_rate": 0.00019782197555338288,
+      "loss": 0.1926,
+      "step": 1079
+    },
+    {
+      "epoch": 0.2268192796387693,
+      "grad_norm": 1.1521648168563843,
+      "learning_rate": 0.00019781476007338058,
+      "loss": 0.3391,
+      "step": 1080
+    },
+    {
+      "epoch": 0.22702929749028666,
+      "grad_norm": 0.760995626449585,
+      "learning_rate": 0.0001978075327932428,
+      "loss": 0.2672,
+      "step": 1081
+    },
+    {
+      "epoch": 0.22723931534180405,
+      "grad_norm": 0.6859043836593628,
+      "learning_rate": 0.00019780029371384145,
+      "loss": 0.2383,
+      "step": 1082
+    },
+    {
+      "epoch": 0.22744933319332145,
+      "grad_norm": 0.8544055223464966,
+      "learning_rate": 0.00019779304283604985,
+      "loss": 0.2957,
+      "step": 1083
+    },
+    {
+      "epoch": 0.2276593510448388,
+      "grad_norm": 0.8154661655426025,
+      "learning_rate": 0.0001977857801607427,
+      "loss": 0.2017,
+      "step": 1084
+    },
+    {
+      "epoch": 0.2278693688963562,
+      "grad_norm": 0.5382252931594849,
+      "learning_rate": 0.00019777850568879614,
+      "loss": 0.179,
+      "step": 1085
+    },
+    {
+      "epoch": 0.22807938674787356,
+      "grad_norm": 0.6206135749816895,
+      "learning_rate": 0.0001977712194210878,
+      "loss": 0.2559,
+      "step": 1086
+    },
+    {
+      "epoch": 0.22828940459939095,
+      "grad_norm": 0.6016466021537781,
+      "learning_rate": 0.00019776392135849663,
+      "loss": 0.1619,
+      "step": 1087
+    },
+    {
+      "epoch": 0.22849942245090832,
+      "grad_norm": 0.3538142442703247,
+      "learning_rate": 0.00019775661150190306,
+      "loss": 0.1291,
+      "step": 1088
+    },
+    {
+      "epoch": 0.2287094403024257,
+      "grad_norm": 0.5222025513648987,
+      "learning_rate": 0.00019774928985218893,
+      "loss": 0.1718,
+      "step": 1089
+    },
+    {
+      "epoch": 0.22891945815394307,
+      "grad_norm": 0.7469238638877869,
+      "learning_rate": 0.00019774195641023755,
+      "loss": 0.2725,
+      "step": 1090
+    },
+    {
+      "epoch": 0.22912947600546046,
+      "grad_norm": 0.5208479166030884,
+      "learning_rate": 0.00019773461117693355,
+      "loss": 0.2309,
+      "step": 1091
+    },
+    {
+      "epoch": 0.22933949385697786,
+      "grad_norm": 0.6402025818824768,
+      "learning_rate": 0.00019772725415316303,
+      "loss": 0.2116,
+      "step": 1092
+    },
+    {
+      "epoch": 0.22954951170849522,
+      "grad_norm": 0.7494028806686401,
+      "learning_rate": 0.0001977198853398136,
+      "loss": 0.2141,
+      "step": 1093
+    },
+    {
+      "epoch": 0.2297595295600126,
+      "grad_norm": 0.6795624494552612,
+      "learning_rate": 0.00019771250473777418,
+      "loss": 0.3321,
+      "step": 1094
+    },
+    {
+      "epoch": 0.22996954741152997,
+      "grad_norm": 0.6777510046958923,
+      "learning_rate": 0.0001977051123479351,
+      "loss": 0.1976,
+      "step": 1095
+    },
+    {
+      "epoch": 0.23017956526304736,
+      "grad_norm": 0.8358330130577087,
+      "learning_rate": 0.00019769770817118824,
+      "loss": 0.2433,
+      "step": 1096
+    },
+    {
+      "epoch": 0.23038958311456473,
+      "grad_norm": 0.6221851110458374,
+      "learning_rate": 0.00019769029220842677,
+      "loss": 0.15,
+      "step": 1097
+    },
+    {
+      "epoch": 0.23059960096608212,
+      "grad_norm": 0.9023957252502441,
+      "learning_rate": 0.00019768286446054532,
+      "loss": 0.25,
+      "step": 1098
+    },
+    {
+      "epoch": 0.23080961881759948,
+      "grad_norm": 0.4097208082675934,
+      "learning_rate": 0.00019767542492844006,
+      "loss": 0.1478,
+      "step": 1099
+    },
+    {
+      "epoch": 0.23101963666911687,
+      "grad_norm": 0.5772308707237244,
+      "learning_rate": 0.00019766797361300833,
+      "loss": 0.1754,
+      "step": 1100
+    },
+    {
+      "epoch": 0.23122965452063426,
+      "grad_norm": 0.5117380619049072,
+      "learning_rate": 0.00019766051051514914,
+      "loss": 0.2119,
+      "step": 1101
+    },
+    {
+      "epoch": 0.23143967237215163,
+      "grad_norm": 0.7716235518455505,
+      "learning_rate": 0.00019765303563576276,
+      "loss": 0.3488,
+      "step": 1102
+    },
+    {
+      "epoch": 0.23164969022366902,
+      "grad_norm": 0.7328181862831116,
+      "learning_rate": 0.000197645548975751,
+      "loss": 0.2983,
+      "step": 1103
+    },
+    {
+      "epoch": 0.23185970807518638,
+      "grad_norm": 0.5517509579658508,
+      "learning_rate": 0.00019763805053601695,
+      "loss": 0.3993,
+      "step": 1104
+    },
+    {
+      "epoch": 0.23206972592670377,
+      "grad_norm": 0.639631450176239,
+      "learning_rate": 0.00019763054031746532,
+      "loss": 0.3138,
+      "step": 1105
+    },
+    {
+      "epoch": 0.23227974377822114,
+      "grad_norm": 0.6691192388534546,
+      "learning_rate": 0.00019762301832100204,
+      "loss": 0.2601,
+      "step": 1106
+    },
+    {
+      "epoch": 0.23248976162973853,
+      "grad_norm": 0.6119483709335327,
+      "learning_rate": 0.00019761548454753453,
+      "loss": 0.2382,
+      "step": 1107
+    },
+    {
+      "epoch": 0.23269977948125592,
+      "grad_norm": 0.7129586935043335,
+      "learning_rate": 0.00019760793899797172,
+      "loss": 0.2975,
+      "step": 1108
+    },
+    {
+      "epoch": 0.23290979733277328,
+      "grad_norm": 0.5922974944114685,
+      "learning_rate": 0.00019760038167322382,
+      "loss": 0.2665,
+      "step": 1109
+    },
+    {
+      "epoch": 0.23311981518429067,
+      "grad_norm": 0.438453733921051,
+      "learning_rate": 0.0001975928125742026,
+      "loss": 0.217,
+      "step": 1110
+    },
+    {
+      "epoch": 0.23332983303580804,
+      "grad_norm": 0.5052367448806763,
+      "learning_rate": 0.0001975852317018211,
+      "loss": 0.1992,
+      "step": 1111
+    },
+    {
+      "epoch": 0.23353985088732543,
+      "grad_norm": 0.7396247386932373,
+      "learning_rate": 0.0001975776390569939,
+      "loss": 0.2561,
+      "step": 1112
+    },
+    {
+      "epoch": 0.2337498687388428,
+      "grad_norm": 0.502675473690033,
+      "learning_rate": 0.00019757003464063695,
+      "loss": 0.1859,
+      "step": 1113
+    },
+    {
+      "epoch": 0.23395988659036018,
+      "grad_norm": 0.5194231271743774,
+      "learning_rate": 0.0001975624184536676,
+      "loss": 0.19,
+      "step": 1114
+    },
+    {
+      "epoch": 0.23416990444187755,
+      "grad_norm": 0.6101930737495422,
+      "learning_rate": 0.00019755479049700473,
+      "loss": 0.2109,
+      "step": 1115
+    },
+    {
+      "epoch": 0.23437992229339494,
+      "grad_norm": 0.5229907631874084,
+      "learning_rate": 0.0001975471507715685,
+      "loss": 0.2359,
+      "step": 1116
+    },
+    {
+      "epoch": 0.23458994014491233,
+      "grad_norm": 0.7651471495628357,
+      "learning_rate": 0.0001975394992782805,
+      "loss": 0.3001,
+      "step": 1117
+    },
+    {
+      "epoch": 0.2347999579964297,
+      "grad_norm": 0.7726680636405945,
+      "learning_rate": 0.0001975318360180639,
+      "loss": 0.2422,
+      "step": 1118
+    },
+    {
+      "epoch": 0.23500997584794708,
+      "grad_norm": 0.8400713801383972,
+      "learning_rate": 0.00019752416099184307,
+      "loss": 0.305,
+      "step": 1119
+    },
+    {
+      "epoch": 0.23521999369946445,
+      "grad_norm": 0.7534275650978088,
+      "learning_rate": 0.00019751647420054397,
+      "loss": 0.2488,
+      "step": 1120
+    },
+    {
+      "epoch": 0.23543001155098184,
+      "grad_norm": 0.5462124347686768,
+      "learning_rate": 0.0001975087756450939,
+      "loss": 0.1503,
+      "step": 1121
+    },
+    {
+      "epoch": 0.2356400294024992,
+      "grad_norm": 0.5736708641052246,
+      "learning_rate": 0.0001975010653264216,
+      "loss": 0.239,
+      "step": 1122
+    },
+    {
+      "epoch": 0.2358500472540166,
+      "grad_norm": 0.6881362795829773,
+      "learning_rate": 0.00019749334324545723,
+      "loss": 0.29,
+      "step": 1123
+    },
+    {
+      "epoch": 0.23606006510553398,
+      "grad_norm": 0.6573613286018372,
+      "learning_rate": 0.00019748560940313232,
+      "loss": 0.1698,
+      "step": 1124
+    },
+    {
+      "epoch": 0.23627008295705135,
+      "grad_norm": 0.5784547924995422,
+      "learning_rate": 0.0001974778638003799,
+      "loss": 0.1637,
+      "step": 1125
+    },
+    {
+      "epoch": 0.23648010080856874,
+      "grad_norm": 0.5579979419708252,
+      "learning_rate": 0.0001974701064381344,
+      "loss": 0.289,
+      "step": 1126
+    },
+    {
+      "epoch": 0.2366901186600861,
+      "grad_norm": 0.6849252581596375,
+      "learning_rate": 0.00019746233731733162,
+      "loss": 0.2424,
+      "step": 1127
+    },
+    {
+      "epoch": 0.2369001365116035,
+      "grad_norm": 0.735053300857544,
+      "learning_rate": 0.0001974545564389088,
+      "loss": 0.2899,
+      "step": 1128
+    },
+    {
+      "epoch": 0.23711015436312086,
+      "grad_norm": 0.5866673588752747,
+      "learning_rate": 0.00019744676380380462,
+      "loss": 0.1714,
+      "step": 1129
+    },
+    {
+      "epoch": 0.23732017221463825,
+      "grad_norm": 0.863044023513794,
+      "learning_rate": 0.00019743895941295918,
+      "loss": 0.2236,
+      "step": 1130
+    },
+    {
+      "epoch": 0.2375301900661556,
+      "grad_norm": 0.5845204591751099,
+      "learning_rate": 0.00019743114326731395,
+      "loss": 0.1645,
+      "step": 1131
+    },
+    {
+      "epoch": 0.237740207917673,
+      "grad_norm": 0.5500687956809998,
+      "learning_rate": 0.00019742331536781187,
+      "loss": 0.1583,
+      "step": 1132
+    },
+    {
+      "epoch": 0.2379502257691904,
+      "grad_norm": 0.4377477169036865,
+      "learning_rate": 0.00019741547571539727,
+      "loss": 0.177,
+      "step": 1133
+    },
+    {
+      "epoch": 0.23816024362070776,
+      "grad_norm": 0.6978058218955994,
+      "learning_rate": 0.0001974076243110159,
+      "loss": 0.2614,
+      "step": 1134
+    },
+    {
+      "epoch": 0.23837026147222515,
+      "grad_norm": 0.5615448355674744,
+      "learning_rate": 0.00019739976115561495,
+      "loss": 0.2396,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2385802793237425,
+      "grad_norm": 0.674802839756012,
+      "learning_rate": 0.00019739188625014304,
+      "loss": 0.1782,
+      "step": 1136
+    },
+    {
+      "epoch": 0.2387902971752599,
+      "grad_norm": 0.5170165300369263,
+      "learning_rate": 0.0001973839995955501,
+      "loss": 0.1917,
+      "step": 1137
+    },
+    {
+      "epoch": 0.23900031502677727,
+      "grad_norm": 0.34832054376602173,
+      "learning_rate": 0.00019737610119278766,
+      "loss": 0.0986,
+      "step": 1138
+    },
+    {
+      "epoch": 0.23921033287829466,
+      "grad_norm": 0.5801171064376831,
+      "learning_rate": 0.0001973681910428085,
+      "loss": 0.1948,
+      "step": 1139
+    },
+    {
+      "epoch": 0.23942035072981202,
+      "grad_norm": 0.6232351660728455,
+      "learning_rate": 0.00019736026914656687,
+      "loss": 0.172,
+      "step": 1140
+    },
+    {
+      "epoch": 0.2396303685813294,
+      "grad_norm": 0.4935660660266876,
+      "learning_rate": 0.00019735233550501847,
+      "loss": 0.1967,
+      "step": 1141
+    },
+    {
+      "epoch": 0.2398403864328468,
+      "grad_norm": 0.5983342528343201,
+      "learning_rate": 0.0001973443901191204,
+      "loss": 0.1474,
+      "step": 1142
+    },
+    {
+      "epoch": 0.24005040428436417,
+      "grad_norm": 0.4784546196460724,
+      "learning_rate": 0.00019733643298983116,
+      "loss": 0.1675,
+      "step": 1143
+    },
+    {
+      "epoch": 0.24026042213588156,
+      "grad_norm": 0.4301772713661194,
+      "learning_rate": 0.0001973284641181107,
+      "loss": 0.135,
+      "step": 1144
+    },
+    {
+      "epoch": 0.24047043998739892,
+      "grad_norm": 0.7016049027442932,
+      "learning_rate": 0.0001973204835049203,
+      "loss": 0.1911,
+      "step": 1145
+    },
+    {
+      "epoch": 0.2406804578389163,
+      "grad_norm": 0.6154897809028625,
+      "learning_rate": 0.00019731249115122283,
+      "loss": 0.1488,
+      "step": 1146
+    },
+    {
+      "epoch": 0.24089047569043368,
+      "grad_norm": 1.0888360738754272,
+      "learning_rate": 0.00019730448705798239,
+      "loss": 0.2994,
+      "step": 1147
+    },
+    {
+      "epoch": 0.24110049354195107,
+      "grad_norm": 0.5985755920410156,
+      "learning_rate": 0.0001972964712261646,
+      "loss": 0.188,
+      "step": 1148
+    },
+    {
+      "epoch": 0.24131051139346846,
+      "grad_norm": 0.40716445446014404,
+      "learning_rate": 0.00019728844365673646,
+      "loss": 0.1215,
+      "step": 1149
+    },
+    {
+      "epoch": 0.24152052924498582,
+      "grad_norm": 0.5432882905006409,
+      "learning_rate": 0.0001972804043506664,
+      "loss": 0.2196,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2417305470965032,
+      "grad_norm": 0.564270555973053,
+      "learning_rate": 0.00019727235330892426,
+      "loss": 0.2982,
+      "step": 1151
+    },
+    {
+      "epoch": 0.24194056494802058,
+      "grad_norm": 0.6324896216392517,
+      "learning_rate": 0.0001972642905324813,
+      "loss": 0.3008,
+      "step": 1152
+    },
+    {
+      "epoch": 0.24215058279953797,
+      "grad_norm": 0.6564551591873169,
+      "learning_rate": 0.0001972562160223102,
+      "loss": 0.2006,
+      "step": 1153
+    },
+    {
+      "epoch": 0.24236060065105533,
+      "grad_norm": 0.5932807326316833,
+      "learning_rate": 0.00019724812977938507,
+      "loss": 0.32,
+      "step": 1154
+    },
+    {
+      "epoch": 0.24257061850257272,
+      "grad_norm": 0.6603784561157227,
+      "learning_rate": 0.00019724003180468137,
+      "loss": 0.2916,
+      "step": 1155
+    },
+    {
+      "epoch": 0.24278063635409008,
+      "grad_norm": 0.540715754032135,
+      "learning_rate": 0.00019723192209917604,
+      "loss": 0.2362,
+      "step": 1156
+    },
+    {
+      "epoch": 0.24299065420560748,
+      "grad_norm": 0.5223392844200134,
+      "learning_rate": 0.00019722380066384743,
+      "loss": 0.2172,
+      "step": 1157
+    },
+    {
+      "epoch": 0.24320067205712487,
+      "grad_norm": 0.6323894262313843,
+      "learning_rate": 0.00019721566749967523,
+      "loss": 0.1791,
+      "step": 1158
+    },
+    {
+      "epoch": 0.24341068990864223,
+      "grad_norm": 0.6830046772956848,
+      "learning_rate": 0.00019720752260764067,
+      "loss": 0.2777,
+      "step": 1159
+    },
+    {
+      "epoch": 0.24362070776015962,
+      "grad_norm": 0.6310597062110901,
+      "learning_rate": 0.00019719936598872634,
+      "loss": 0.2385,
+      "step": 1160
+    },
+    {
+      "epoch": 0.24383072561167698,
+      "grad_norm": 0.5483822822570801,
+      "learning_rate": 0.0001971911976439162,
+      "loss": 0.2548,
+      "step": 1161
+    },
+    {
+      "epoch": 0.24404074346319438,
+      "grad_norm": 0.6507066488265991,
+      "learning_rate": 0.00019718301757419565,
+      "loss": 0.1978,
+      "step": 1162
+    },
+    {
+      "epoch": 0.24425076131471174,
+      "grad_norm": 0.9127165079116821,
+      "learning_rate": 0.00019717482578055154,
+      "loss": 0.1521,
+      "step": 1163
+    },
+    {
+      "epoch": 0.24446077916622913,
+      "grad_norm": 0.49684974551200867,
+      "learning_rate": 0.0001971666222639721,
+      "loss": 0.2183,
+      "step": 1164
+    },
+    {
+      "epoch": 0.24467079701774652,
+      "grad_norm": 0.5121373534202576,
+      "learning_rate": 0.00019715840702544694,
+      "loss": 0.1734,
+      "step": 1165
+    },
+    {
+      "epoch": 0.24488081486926389,
+      "grad_norm": 0.5407108068466187,
+      "learning_rate": 0.0001971501800659672,
+      "loss": 0.2193,
+      "step": 1166
+    },
+    {
+      "epoch": 0.24509083272078128,
+      "grad_norm": 0.9084086418151855,
+      "learning_rate": 0.00019714194138652533,
+      "loss": 0.2936,
+      "step": 1167
+    },
+    {
+      "epoch": 0.24530085057229864,
+      "grad_norm": 0.5651358366012573,
+      "learning_rate": 0.0001971336909881152,
+      "loss": 0.1931,
+      "step": 1168
+    },
+    {
+      "epoch": 0.24551086842381603,
+      "grad_norm": 0.7590070366859436,
+      "learning_rate": 0.00019712542887173213,
+      "loss": 0.2404,
+      "step": 1169
+    },
+    {
+      "epoch": 0.2457208862753334,
+      "grad_norm": 0.7864325642585754,
+      "learning_rate": 0.0001971171550383729,
+      "loss": 0.2917,
+      "step": 1170
+    },
+    {
+      "epoch": 0.24593090412685079,
+      "grad_norm": 0.6653593182563782,
+      "learning_rate": 0.00019710886948903555,
+      "loss": 0.2011,
+      "step": 1171
+    },
+    {
+      "epoch": 0.24614092197836815,
+      "grad_norm": 0.49746832251548767,
+      "learning_rate": 0.00019710057222471967,
+      "loss": 0.1644,
+      "step": 1172
+    },
+    {
+      "epoch": 0.24635093982988554,
+      "grad_norm": 0.6505274176597595,
+      "learning_rate": 0.00019709226324642626,
+      "loss": 0.2415,
+      "step": 1173
+    },
+    {
+      "epoch": 0.24656095768140293,
+      "grad_norm": 0.9204030632972717,
+      "learning_rate": 0.00019708394255515765,
+      "loss": 0.3014,
+      "step": 1174
+    },
+    {
+      "epoch": 0.2467709755329203,
+      "grad_norm": 0.7631303071975708,
+      "learning_rate": 0.00019707561015191763,
+      "loss": 0.1956,
+      "step": 1175
+    },
+    {
+      "epoch": 0.24698099338443769,
+      "grad_norm": 0.4676910936832428,
+      "learning_rate": 0.0001970672660377114,
+      "loss": 0.1778,
+      "step": 1176
+    },
+    {
+      "epoch": 0.24719101123595505,
+      "grad_norm": 0.5645480751991272,
+      "learning_rate": 0.0001970589102135456,
+      "loss": 0.2386,
+      "step": 1177
+    },
+    {
+      "epoch": 0.24740102908747244,
+      "grad_norm": 0.6220672130584717,
+      "learning_rate": 0.00019705054268042823,
+      "loss": 0.2715,
+      "step": 1178
+    },
+    {
+      "epoch": 0.2476110469389898,
+      "grad_norm": 0.6659072041511536,
+      "learning_rate": 0.00019704216343936873,
+      "loss": 0.176,
+      "step": 1179
+    },
+    {
+      "epoch": 0.2478210647905072,
+      "grad_norm": 0.5557059049606323,
+      "learning_rate": 0.000197033772491378,
+      "loss": 0.2246,
+      "step": 1180
+    },
+    {
+      "epoch": 0.24803108264202456,
+      "grad_norm": 0.8239718675613403,
+      "learning_rate": 0.00019702536983746822,
+      "loss": 0.2168,
+      "step": 1181
+    },
+    {
+      "epoch": 0.24824110049354195,
+      "grad_norm": 0.7284471988677979,
+      "learning_rate": 0.00019701695547865312,
+      "loss": 0.234,
+      "step": 1182
+    },
+    {
+      "epoch": 0.24845111834505934,
+      "grad_norm": 0.540712296962738,
+      "learning_rate": 0.00019700852941594778,
+      "loss": 0.2099,
+      "step": 1183
+    },
+    {
+      "epoch": 0.2486611361965767,
+      "grad_norm": 0.4334312379360199,
+      "learning_rate": 0.0001970000916503687,
+      "loss": 0.1851,
+      "step": 1184
+    },
+    {
+      "epoch": 0.2488711540480941,
+      "grad_norm": 0.5734902620315552,
+      "learning_rate": 0.0001969916421829338,
+      "loss": 0.1798,
+      "step": 1185
+    },
+    {
+      "epoch": 0.24908117189961146,
+      "grad_norm": 0.8426918983459473,
+      "learning_rate": 0.00019698318101466237,
+      "loss": 0.2933,
+      "step": 1186
+    },
+    {
+      "epoch": 0.24929118975112885,
+      "grad_norm": 0.6708935499191284,
+      "learning_rate": 0.0001969747081465752,
+      "loss": 0.1688,
+      "step": 1187
+    },
+    {
+      "epoch": 0.2495012076026462,
+      "grad_norm": 0.564127504825592,
+      "learning_rate": 0.00019696622357969436,
+      "loss": 0.1875,
+      "step": 1188
+    },
+    {
+      "epoch": 0.2497112254541636,
+      "grad_norm": 0.44726812839508057,
+      "learning_rate": 0.00019695772731504347,
+      "loss": 0.1629,
+      "step": 1189
+    },
+    {
+      "epoch": 0.249921243305681,
+      "grad_norm": 0.4793647229671478,
+      "learning_rate": 0.00019694921935364747,
+      "loss": 0.1696,
+      "step": 1190
+    },
+    {
+      "epoch": 0.25013126115719836,
+      "grad_norm": 0.5669199228286743,
+      "learning_rate": 0.00019694069969653278,
+      "loss": 0.241,
+      "step": 1191
+    },
+    {
+      "epoch": 0.2503412790087157,
+      "grad_norm": 0.48771825432777405,
+      "learning_rate": 0.0001969321683447271,
+      "loss": 0.1922,
+      "step": 1192
+    },
+    {
+      "epoch": 0.25055129686023314,
+      "grad_norm": 0.5992656350135803,
+      "learning_rate": 0.00019692362529925977,
+      "loss": 0.2127,
+      "step": 1193
+    },
+    {
+      "epoch": 0.2507613147117505,
+      "grad_norm": 0.6230032444000244,
+      "learning_rate": 0.00019691507056116128,
+      "loss": 0.2035,
+      "step": 1194
+    },
+    {
+      "epoch": 0.25097133256326787,
+      "grad_norm": 0.49502989649772644,
+      "learning_rate": 0.00019690650413146368,
+      "loss": 0.1737,
+      "step": 1195
+    },
+    {
+      "epoch": 0.25118135041478523,
+      "grad_norm": 0.5283368825912476,
+      "learning_rate": 0.00019689792601120044,
+      "loss": 0.1145,
+      "step": 1196
+    },
+    {
+      "epoch": 0.25139136826630265,
+      "grad_norm": 0.4804970324039459,
+      "learning_rate": 0.00019688933620140637,
+      "loss": 0.159,
+      "step": 1197
+    },
+    {
+      "epoch": 0.25160138611782,
+      "grad_norm": 0.48868709802627563,
+      "learning_rate": 0.00019688073470311776,
+      "loss": 0.1543,
+      "step": 1198
+    },
+    {
+      "epoch": 0.2518114039693374,
+      "grad_norm": 0.638172447681427,
+      "learning_rate": 0.00019687212151737224,
+      "loss": 0.2184,
+      "step": 1199
+    },
+    {
+      "epoch": 0.2520214218208548,
+      "grad_norm": 0.5951210856437683,
+      "learning_rate": 0.0001968634966452089,
+      "loss": 0.203,
+      "step": 1200
+    },
+    {
+      "epoch": 0.25223143967237216,
+      "grad_norm": 0.687088668346405,
+      "learning_rate": 0.0001968548600876682,
+      "loss": 0.2776,
+      "step": 1201
+    },
+    {
+      "epoch": 0.2524414575238895,
+      "grad_norm": 0.572437584400177,
+      "learning_rate": 0.00019684621184579208,
+      "loss": 0.2266,
+      "step": 1202
+    },
+    {
+      "epoch": 0.2526514753754069,
+      "grad_norm": 0.47711312770843506,
+      "learning_rate": 0.0001968375519206238,
+      "loss": 0.2029,
+      "step": 1203
+    },
+    {
+      "epoch": 0.2528614932269243,
+      "grad_norm": 0.624614953994751,
+      "learning_rate": 0.0001968288803132081,
+      "loss": 0.19,
+      "step": 1204
+    },
+    {
+      "epoch": 0.25307151107844167,
+      "grad_norm": 0.544484555721283,
+      "learning_rate": 0.00019682019702459106,
+      "loss": 0.2015,
+      "step": 1205
+    },
+    {
+      "epoch": 0.25328152892995903,
+      "grad_norm": 0.8100757598876953,
+      "learning_rate": 0.00019681150205582025,
+      "loss": 0.3052,
+      "step": 1206
+    },
+    {
+      "epoch": 0.25349154678147645,
+      "grad_norm": 0.4385722279548645,
+      "learning_rate": 0.00019680279540794463,
+      "loss": 0.2004,
+      "step": 1207
+    },
+    {
+      "epoch": 0.2537015646329938,
+      "grad_norm": 0.5414237976074219,
+      "learning_rate": 0.0001967940770820145,
+      "loss": 0.2526,
+      "step": 1208
+    },
+    {
+      "epoch": 0.2539115824845112,
+      "grad_norm": 0.8186227083206177,
+      "learning_rate": 0.00019678534707908161,
+      "loss": 0.4667,
+      "step": 1209
+    },
+    {
+      "epoch": 0.25412160033602854,
+      "grad_norm": 1.0033893585205078,
+      "learning_rate": 0.0001967766054001992,
+      "loss": 0.2813,
+      "step": 1210
+    },
+    {
+      "epoch": 0.25433161818754596,
+      "grad_norm": 0.7333622574806213,
+      "learning_rate": 0.00019676785204642176,
+      "loss": 0.3368,
+      "step": 1211
+    },
+    {
+      "epoch": 0.2545416360390633,
+      "grad_norm": 0.7219803333282471,
+      "learning_rate": 0.00019675908701880532,
+      "loss": 0.3029,
+      "step": 1212
+    },
+    {
+      "epoch": 0.2547516538905807,
+      "grad_norm": 0.651919960975647,
+      "learning_rate": 0.00019675031031840727,
+      "loss": 0.2637,
+      "step": 1213
+    },
+    {
+      "epoch": 0.2549616717420981,
+      "grad_norm": 0.6580129265785217,
+      "learning_rate": 0.00019674152194628638,
+      "loss": 0.1855,
+      "step": 1214
+    },
+    {
+      "epoch": 0.25517168959361547,
+      "grad_norm": 0.6222065687179565,
+      "learning_rate": 0.00019673272190350293,
+      "loss": 0.2179,
+      "step": 1215
+    },
+    {
+      "epoch": 0.25538170744513283,
+      "grad_norm": 0.8877820372581482,
+      "learning_rate": 0.00019672391019111846,
+      "loss": 0.3499,
+      "step": 1216
+    },
+    {
+      "epoch": 0.2555917252966502,
+      "grad_norm": 0.5138952732086182,
+      "learning_rate": 0.000196715086810196,
+      "loss": 0.3148,
+      "step": 1217
+    },
+    {
+      "epoch": 0.2558017431481676,
+      "grad_norm": 0.7864513993263245,
+      "learning_rate": 0.00019670625176180002,
+      "loss": 0.1968,
+      "step": 1218
+    },
+    {
+      "epoch": 0.256011760999685,
+      "grad_norm": 0.5596680641174316,
+      "learning_rate": 0.00019669740504699634,
+      "loss": 0.2496,
+      "step": 1219
+    },
+    {
+      "epoch": 0.25622177885120234,
+      "grad_norm": 0.5104958415031433,
+      "learning_rate": 0.0001966885466668522,
+      "loss": 0.1978,
+      "step": 1220
+    },
+    {
+      "epoch": 0.2564317967027197,
+      "grad_norm": 0.5994324684143066,
+      "learning_rate": 0.00019667967662243628,
+      "loss": 0.1641,
+      "step": 1221
+    },
+    {
+      "epoch": 0.2566418145542371,
+      "grad_norm": 0.5470710396766663,
+      "learning_rate": 0.0001966707949148186,
+      "loss": 0.1936,
+      "step": 1222
+    },
+    {
+      "epoch": 0.2568518324057545,
+      "grad_norm": 0.5853798389434814,
+      "learning_rate": 0.00019666190154507066,
+      "loss": 0.3196,
+      "step": 1223
+    },
+    {
+      "epoch": 0.25706185025727185,
+      "grad_norm": 0.5121351480484009,
+      "learning_rate": 0.0001966529965142653,
+      "loss": 0.2273,
+      "step": 1224
+    },
+    {
+      "epoch": 0.25727186810878927,
+      "grad_norm": 0.5247400999069214,
+      "learning_rate": 0.00019664407982347684,
+      "loss": 0.2338,
+      "step": 1225
+    },
+    {
+      "epoch": 0.25748188596030663,
+      "grad_norm": 0.49028703570365906,
+      "learning_rate": 0.00019663515147378096,
+      "loss": 0.1937,
+      "step": 1226
+    },
+    {
+      "epoch": 0.257691903811824,
+      "grad_norm": 0.6964942216873169,
+      "learning_rate": 0.00019662621146625473,
+      "loss": 0.2488,
+      "step": 1227
+    },
+    {
+      "epoch": 0.25790192166334136,
+      "grad_norm": 0.5908783674240112,
+      "learning_rate": 0.00019661725980197668,
+      "loss": 0.2333,
+      "step": 1228
+    },
+    {
+      "epoch": 0.2581119395148588,
+      "grad_norm": 0.5808953046798706,
+      "learning_rate": 0.0001966082964820267,
+      "loss": 0.2952,
+      "step": 1229
+    },
+    {
+      "epoch": 0.25832195736637614,
+      "grad_norm": 0.7375782132148743,
+      "learning_rate": 0.0001965993215074861,
+      "loss": 0.212,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2585319752178935,
+      "grad_norm": 0.5336644649505615,
+      "learning_rate": 0.00019659033487943762,
+      "loss": 0.1298,
+      "step": 1231
+    },
+    {
+      "epoch": 0.2587419930694109,
+      "grad_norm": 0.6842905879020691,
+      "learning_rate": 0.00019658133659896537,
+      "loss": 0.1563,
+      "step": 1232
+    },
+    {
+      "epoch": 0.2589520109209283,
+      "grad_norm": 0.6709338426589966,
+      "learning_rate": 0.00019657232666715486,
+      "loss": 0.238,
+      "step": 1233
+    },
+    {
+      "epoch": 0.25916202877244565,
+      "grad_norm": 0.6698547005653381,
+      "learning_rate": 0.00019656330508509306,
+      "loss": 0.2658,
+      "step": 1234
+    },
+    {
+      "epoch": 0.259372046623963,
+      "grad_norm": 0.5134257078170776,
+      "learning_rate": 0.00019655427185386832,
+      "loss": 0.1681,
+      "step": 1235
+    },
+    {
+      "epoch": 0.25958206447548043,
+      "grad_norm": 0.6031914353370667,
+      "learning_rate": 0.00019654522697457036,
+      "loss": 0.2422,
+      "step": 1236
+    },
+    {
+      "epoch": 0.2597920823269978,
+      "grad_norm": 0.7376905083656311,
+      "learning_rate": 0.00019653617044829033,
+      "loss": 0.2232,
+      "step": 1237
+    },
+    {
+      "epoch": 0.26000210017851516,
+      "grad_norm": 0.661484956741333,
+      "learning_rate": 0.0001965271022761208,
+      "loss": 0.2423,
+      "step": 1238
+    },
+    {
+      "epoch": 0.2602121180300326,
+      "grad_norm": 0.47797203063964844,
+      "learning_rate": 0.00019651802245915573,
+      "loss": 0.1129,
+      "step": 1239
+    },
+    {
+      "epoch": 0.26042213588154994,
+      "grad_norm": 0.5190190672874451,
+      "learning_rate": 0.00019650893099849048,
+      "loss": 0.1518,
+      "step": 1240
+    },
+    {
+      "epoch": 0.2606321537330673,
+      "grad_norm": 0.786258339881897,
+      "learning_rate": 0.00019649982789522182,
+      "loss": 0.2716,
+      "step": 1241
+    },
+    {
+      "epoch": 0.26084217158458467,
+      "grad_norm": 0.528610110282898,
+      "learning_rate": 0.00019649071315044797,
+      "loss": 0.2129,
+      "step": 1242
+    },
+    {
+      "epoch": 0.2610521894361021,
+      "grad_norm": 0.4367738664150238,
+      "learning_rate": 0.00019648158676526846,
+      "loss": 0.1212,
+      "step": 1243
+    },
+    {
+      "epoch": 0.26126220728761945,
+      "grad_norm": 0.8334630131721497,
+      "learning_rate": 0.0001964724487407843,
+      "loss": 0.2661,
+      "step": 1244
+    },
+    {
+      "epoch": 0.2614722251391368,
+      "grad_norm": 0.42555752396583557,
+      "learning_rate": 0.00019646329907809786,
+      "loss": 0.1394,
+      "step": 1245
+    },
+    {
+      "epoch": 0.2616822429906542,
+      "grad_norm": 0.7967664003372192,
+      "learning_rate": 0.00019645413777831294,
+      "loss": 0.2523,
+      "step": 1246
+    },
+    {
+      "epoch": 0.2618922608421716,
+      "grad_norm": 0.606065571308136,
+      "learning_rate": 0.00019644496484253474,
+      "loss": 0.2318,
+      "step": 1247
+    },
+    {
+      "epoch": 0.26210227869368896,
+      "grad_norm": 0.7196457982063293,
+      "learning_rate": 0.00019643578027186983,
+      "loss": 0.1763,
+      "step": 1248
+    },
+    {
+      "epoch": 0.2623122965452063,
+      "grad_norm": 0.6310532093048096,
+      "learning_rate": 0.0001964265840674263,
+      "loss": 0.1654,
+      "step": 1249
+    },
+    {
+      "epoch": 0.26252231439672374,
+      "grad_norm": 0.48370879888534546,
+      "learning_rate": 0.00019641737623031348,
+      "loss": 0.1486,
+      "step": 1250
+    },
+    {
+      "epoch": 0.2627323322482411,
+      "grad_norm": 0.5700078010559082,
+      "learning_rate": 0.00019640815676164218,
+      "loss": 0.1662,
+      "step": 1251
+    },
+    {
+      "epoch": 0.26294235009975847,
+      "grad_norm": 0.5395934581756592,
+      "learning_rate": 0.00019639892566252466,
+      "loss": 0.2383,
+      "step": 1252
+    },
+    {
+      "epoch": 0.26315236795127583,
+      "grad_norm": 0.8851696252822876,
+      "learning_rate": 0.00019638968293407452,
+      "loss": 0.3027,
+      "step": 1253
+    },
+    {
+      "epoch": 0.26336238580279325,
+      "grad_norm": 0.968064546585083,
+      "learning_rate": 0.00019638042857740676,
+      "loss": 0.216,
+      "step": 1254
+    },
+    {
+      "epoch": 0.2635724036543106,
+      "grad_norm": 0.7724452018737793,
+      "learning_rate": 0.00019637116259363783,
+      "loss": 0.2397,
+      "step": 1255
+    },
+    {
+      "epoch": 0.263782421505828,
+      "grad_norm": 0.5674394369125366,
+      "learning_rate": 0.00019636188498388556,
+      "loss": 0.2246,
+      "step": 1256
+    },
+    {
+      "epoch": 0.2639924393573454,
+      "grad_norm": 0.9327764511108398,
+      "learning_rate": 0.00019635259574926912,
+      "loss": 0.2233,
+      "step": 1257
+    },
+    {
+      "epoch": 0.26420245720886276,
+      "grad_norm": 0.7048301100730896,
+      "learning_rate": 0.00019634329489090925,
+      "loss": 0.2003,
+      "step": 1258
+    },
+    {
+      "epoch": 0.2644124750603801,
+      "grad_norm": 0.6150112152099609,
+      "learning_rate": 0.00019633398240992785,
+      "loss": 0.1781,
+      "step": 1259
+    },
+    {
+      "epoch": 0.2646224929118975,
+      "grad_norm": 0.7421455979347229,
+      "learning_rate": 0.00019632465830744846,
+      "loss": 0.1789,
+      "step": 1260
+    },
+    {
+      "epoch": 0.2648325107634149,
+      "grad_norm": 0.8235256671905518,
+      "learning_rate": 0.00019631532258459586,
+      "loss": 0.2139,
+      "step": 1261
+    },
+    {
+      "epoch": 0.26504252861493227,
+      "grad_norm": 0.605702817440033,
+      "learning_rate": 0.00019630597524249632,
+      "loss": 0.1901,
+      "step": 1262
+    },
+    {
+      "epoch": 0.26525254646644963,
+      "grad_norm": 0.605636715888977,
+      "learning_rate": 0.00019629661628227748,
+      "loss": 0.2281,
+      "step": 1263
+    },
+    {
+      "epoch": 0.26546256431796705,
+      "grad_norm": 0.4805593490600586,
+      "learning_rate": 0.00019628724570506834,
+      "loss": 0.1407,
+      "step": 1264
+    },
+    {
+      "epoch": 0.2656725821694844,
+      "grad_norm": 0.7765632271766663,
+      "learning_rate": 0.00019627786351199936,
+      "loss": 0.3228,
+      "step": 1265
+    },
+    {
+      "epoch": 0.2658826000210018,
+      "grad_norm": 0.9582871794700623,
+      "learning_rate": 0.00019626846970420244,
+      "loss": 0.2351,
+      "step": 1266
+    },
+    {
+      "epoch": 0.26609261787251914,
+      "grad_norm": 0.545551061630249,
+      "learning_rate": 0.00019625906428281077,
+      "loss": 0.2201,
+      "step": 1267
+    },
+    {
+      "epoch": 0.26630263572403656,
+      "grad_norm": 0.6563419103622437,
+      "learning_rate": 0.00019624964724895906,
+      "loss": 0.2605,
+      "step": 1268
+    },
+    {
+      "epoch": 0.2665126535755539,
+      "grad_norm": 0.685642659664154,
+      "learning_rate": 0.00019624021860378325,
+      "loss": 0.2217,
+      "step": 1269
+    },
+    {
+      "epoch": 0.2667226714270713,
+      "grad_norm": 0.5008343458175659,
+      "learning_rate": 0.00019623077834842088,
+      "loss": 0.1678,
+      "step": 1270
+    },
+    {
+      "epoch": 0.26693268927858865,
+      "grad_norm": 0.7975322008132935,
+      "learning_rate": 0.00019622132648401076,
+      "loss": 0.2541,
+      "step": 1271
+    },
+    {
+      "epoch": 0.26714270713010607,
+      "grad_norm": 0.7770018577575684,
+      "learning_rate": 0.00019621186301169315,
+      "loss": 0.2721,
+      "step": 1272
+    },
+    {
+      "epoch": 0.26735272498162344,
+      "grad_norm": 0.7773646712303162,
+      "learning_rate": 0.00019620238793260968,
+      "loss": 0.2694,
+      "step": 1273
+    },
+    {
+      "epoch": 0.2675627428331408,
+      "grad_norm": 0.5535711050033569,
+      "learning_rate": 0.00019619290124790344,
+      "loss": 0.1834,
+      "step": 1274
+    },
+    {
+      "epoch": 0.2677727606846582,
+      "grad_norm": 0.6472839713096619,
+      "learning_rate": 0.00019618340295871888,
+      "loss": 0.2094,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2679827785361756,
+      "grad_norm": 0.7514201998710632,
+      "learning_rate": 0.0001961738930662018,
+      "loss": 0.316,
+      "step": 1276
+    },
+    {
+      "epoch": 0.26819279638769294,
+      "grad_norm": 0.6256974935531616,
+      "learning_rate": 0.00019616437157149948,
+      "loss": 0.1446,
+      "step": 1277
+    },
+    {
+      "epoch": 0.2684028142392103,
+      "grad_norm": 0.618664026260376,
+      "learning_rate": 0.0001961548384757606,
+      "loss": 0.217,
+      "step": 1278
+    },
+    {
+      "epoch": 0.2686128320907277,
+      "grad_norm": 0.7553743124008179,
+      "learning_rate": 0.00019614529378013517,
+      "loss": 0.2514,
+      "step": 1279
+    },
+    {
+      "epoch": 0.2688228499422451,
+      "grad_norm": 0.5204927921295166,
+      "learning_rate": 0.00019613573748577468,
+      "loss": 0.2412,
+      "step": 1280
+    },
+    {
+      "epoch": 0.26903286779376245,
+      "grad_norm": 0.725058376789093,
+      "learning_rate": 0.0001961261695938319,
+      "loss": 0.2307,
+      "step": 1281
+    },
+    {
+      "epoch": 0.2692428856452799,
+      "grad_norm": 0.3503430485725403,
+      "learning_rate": 0.00019611659010546114,
+      "loss": 0.101,
+      "step": 1282
+    },
+    {
+      "epoch": 0.26945290349679724,
+      "grad_norm": 0.5544008016586304,
+      "learning_rate": 0.00019610699902181803,
+      "loss": 0.2656,
+      "step": 1283
+    },
+    {
+      "epoch": 0.2696629213483146,
+      "grad_norm": 0.6133410334587097,
+      "learning_rate": 0.00019609739634405963,
+      "loss": 0.2247,
+      "step": 1284
+    },
+    {
+      "epoch": 0.26987293919983196,
+      "grad_norm": 0.4621675908565521,
+      "learning_rate": 0.00019608778207334438,
+      "loss": 0.1388,
+      "step": 1285
+    },
+    {
+      "epoch": 0.2700829570513494,
+      "grad_norm": 0.450885146856308,
+      "learning_rate": 0.00019607815621083209,
+      "loss": 0.1387,
+      "step": 1286
+    },
+    {
+      "epoch": 0.27029297490286674,
+      "grad_norm": 0.4488065838813782,
+      "learning_rate": 0.000196068518757684,
+      "loss": 0.1359,
+      "step": 1287
+    },
+    {
+      "epoch": 0.2705029927543841,
+      "grad_norm": 0.3958616554737091,
+      "learning_rate": 0.00019605886971506284,
+      "loss": 0.1378,
+      "step": 1288
+    },
+    {
+      "epoch": 0.2707130106059015,
+      "grad_norm": 0.6814693212509155,
+      "learning_rate": 0.00019604920908413255,
+      "loss": 0.1745,
+      "step": 1289
+    },
+    {
+      "epoch": 0.2709230284574189,
+      "grad_norm": 0.5673383474349976,
+      "learning_rate": 0.0001960395368660586,
+      "loss": 0.2067,
+      "step": 1290
+    },
+    {
+      "epoch": 0.27113304630893625,
+      "grad_norm": 0.468888521194458,
+      "learning_rate": 0.0001960298530620078,
+      "loss": 0.1938,
+      "step": 1291
+    },
+    {
+      "epoch": 0.2713430641604536,
+      "grad_norm": 0.5092310309410095,
+      "learning_rate": 0.00019602015767314842,
+      "loss": 0.1773,
+      "step": 1292
+    },
+    {
+      "epoch": 0.27155308201197104,
+      "grad_norm": 0.6761277318000793,
+      "learning_rate": 0.00019601045070065005,
+      "loss": 0.2399,
+      "step": 1293
+    },
+    {
+      "epoch": 0.2717630998634884,
+      "grad_norm": 0.682894229888916,
+      "learning_rate": 0.00019600073214568373,
+      "loss": 0.173,
+      "step": 1294
+    },
+    {
+      "epoch": 0.27197311771500576,
+      "grad_norm": 0.9188515543937683,
+      "learning_rate": 0.0001959910020094219,
+      "loss": 0.2221,
+      "step": 1295
+    },
+    {
+      "epoch": 0.2721831355665232,
+      "grad_norm": 0.5459915995597839,
+      "learning_rate": 0.00019598126029303836,
+      "loss": 0.1978,
+      "step": 1296
+    },
+    {
+      "epoch": 0.27239315341804055,
+      "grad_norm": 0.7605480551719666,
+      "learning_rate": 0.00019597150699770835,
+      "loss": 0.2157,
+      "step": 1297
+    },
+    {
+      "epoch": 0.2726031712695579,
+      "grad_norm": 0.6047395467758179,
+      "learning_rate": 0.00019596174212460846,
+      "loss": 0.1816,
+      "step": 1298
+    },
+    {
+      "epoch": 0.2728131891210753,
+      "grad_norm": 0.9177975058555603,
+      "learning_rate": 0.00019595196567491667,
+      "loss": 0.4008,
+      "step": 1299
+    },
+    {
+      "epoch": 0.2730232069725927,
+      "grad_norm": 0.492226779460907,
+      "learning_rate": 0.00019594217764981245,
+      "loss": 0.1476,
+      "step": 1300
+    },
+    {
+      "epoch": 0.27323322482411005,
+      "grad_norm": 0.700484573841095,
+      "learning_rate": 0.00019593237805047656,
+      "loss": 0.466,
+      "step": 1301
+    },
+    {
+      "epoch": 0.2734432426756274,
+      "grad_norm": 0.6979876756668091,
+      "learning_rate": 0.00019592256687809125,
+      "loss": 0.3519,
+      "step": 1302
+    },
+    {
+      "epoch": 0.2736532605271448,
+      "grad_norm": 0.7417042255401611,
+      "learning_rate": 0.0001959127441338401,
+      "loss": 0.2779,
+      "step": 1303
+    },
+    {
+      "epoch": 0.2738632783786622,
+      "grad_norm": 0.6014847755432129,
+      "learning_rate": 0.00019590290981890803,
+      "loss": 0.3175,
+      "step": 1304
+    },
+    {
+      "epoch": 0.27407329623017956,
+      "grad_norm": 0.6579805016517639,
+      "learning_rate": 0.00019589306393448153,
+      "loss": 0.3319,
+      "step": 1305
+    },
+    {
+      "epoch": 0.2742833140816969,
+      "grad_norm": 0.44514089822769165,
+      "learning_rate": 0.0001958832064817483,
+      "loss": 0.172,
+      "step": 1306
+    },
+    {
+      "epoch": 0.27449333193321435,
+      "grad_norm": 0.4212436079978943,
+      "learning_rate": 0.0001958733374618976,
+      "loss": 0.1436,
+      "step": 1307
+    },
+    {
+      "epoch": 0.2747033497847317,
+      "grad_norm": 0.659275233745575,
+      "learning_rate": 0.00019586345687611992,
+      "loss": 0.1561,
+      "step": 1308
+    },
+    {
+      "epoch": 0.2749133676362491,
+      "grad_norm": 0.6682752370834351,
+      "learning_rate": 0.00019585356472560732,
+      "loss": 0.2189,
+      "step": 1309
+    },
+    {
+      "epoch": 0.27512338548776644,
+      "grad_norm": 0.5080957412719727,
+      "learning_rate": 0.00019584366101155307,
+      "loss": 0.1666,
+      "step": 1310
+    },
+    {
+      "epoch": 0.27533340333928386,
+      "grad_norm": 0.651543378829956,
+      "learning_rate": 0.00019583374573515198,
+      "loss": 0.2687,
+      "step": 1311
+    },
+    {
+      "epoch": 0.2755434211908012,
+      "grad_norm": 0.8879132270812988,
+      "learning_rate": 0.00019582381889760023,
+      "loss": 0.268,
+      "step": 1312
+    },
+    {
+      "epoch": 0.2757534390423186,
+      "grad_norm": 0.8491726517677307,
+      "learning_rate": 0.0001958138805000953,
+      "loss": 0.3921,
+      "step": 1313
+    },
+    {
+      "epoch": 0.275963456893836,
+      "grad_norm": 0.5724290013313293,
+      "learning_rate": 0.00019580393054383622,
+      "loss": 0.1724,
+      "step": 1314
+    },
+    {
+      "epoch": 0.27617347474535336,
+      "grad_norm": 0.6225562691688538,
+      "learning_rate": 0.00019579396903002328,
+      "loss": 0.2008,
+      "step": 1315
+    },
+    {
+      "epoch": 0.27638349259687073,
+      "grad_norm": 0.601390540599823,
+      "learning_rate": 0.0001957839959598582,
+      "loss": 0.2018,
+      "step": 1316
+    },
+    {
+      "epoch": 0.2765935104483881,
+      "grad_norm": 0.7996687889099121,
+      "learning_rate": 0.0001957740113345441,
+      "loss": 0.192,
+      "step": 1317
+    },
+    {
+      "epoch": 0.2768035282999055,
+      "grad_norm": 0.696975827217102,
+      "learning_rate": 0.00019576401515528555,
+      "loss": 0.249,
+      "step": 1318
+    },
+    {
+      "epoch": 0.2770135461514229,
+      "grad_norm": 0.5437553524971008,
+      "learning_rate": 0.00019575400742328843,
+      "loss": 0.1665,
+      "step": 1319
+    },
+    {
+      "epoch": 0.27722356400294024,
+      "grad_norm": 0.6747444868087769,
+      "learning_rate": 0.00019574398813976008,
+      "loss": 0.2043,
+      "step": 1320
+    },
+    {
+      "epoch": 0.27743358185445766,
+      "grad_norm": 0.7587124109268188,
+      "learning_rate": 0.00019573395730590915,
+      "loss": 0.2238,
+      "step": 1321
+    },
+    {
+      "epoch": 0.277643599705975,
+      "grad_norm": 0.8004297614097595,
+      "learning_rate": 0.0001957239149229458,
+      "loss": 0.2887,
+      "step": 1322
+    },
+    {
+      "epoch": 0.2778536175574924,
+      "grad_norm": 0.7234312295913696,
+      "learning_rate": 0.00019571386099208145,
+      "loss": 0.2323,
+      "step": 1323
+    },
+    {
+      "epoch": 0.27806363540900975,
+      "grad_norm": 0.7471593022346497,
+      "learning_rate": 0.000195703795514529,
+      "loss": 0.2379,
+      "step": 1324
+    },
+    {
+      "epoch": 0.27827365326052717,
+      "grad_norm": 0.5377416610717773,
+      "learning_rate": 0.00019569371849150282,
+      "loss": 0.2095,
+      "step": 1325
+    },
+    {
+      "epoch": 0.27848367111204453,
+      "grad_norm": 0.5749472975730896,
+      "learning_rate": 0.00019568362992421844,
+      "loss": 0.1778,
+      "step": 1326
+    },
+    {
+      "epoch": 0.2786936889635619,
+      "grad_norm": 0.8375914692878723,
+      "learning_rate": 0.00019567352981389298,
+      "loss": 0.2322,
+      "step": 1327
+    },
+    {
+      "epoch": 0.27890370681507926,
+      "grad_norm": 0.5606945157051086,
+      "learning_rate": 0.0001956634181617449,
+      "loss": 0.206,
+      "step": 1328
+    },
+    {
+      "epoch": 0.2791137246665967,
+      "grad_norm": 0.399161159992218,
+      "learning_rate": 0.00019565329496899406,
+      "loss": 0.124,
+      "step": 1329
+    },
+    {
+      "epoch": 0.27932374251811404,
+      "grad_norm": 0.6910118460655212,
+      "learning_rate": 0.00019564316023686163,
+      "loss": 0.2299,
+      "step": 1330
+    },
+    {
+      "epoch": 0.2795337603696314,
+      "grad_norm": 0.43964332342147827,
+      "learning_rate": 0.0001956330139665703,
+      "loss": 0.1785,
+      "step": 1331
+    },
+    {
+      "epoch": 0.2797437782211488,
+      "grad_norm": 0.6819096207618713,
+      "learning_rate": 0.0001956228561593441,
+      "loss": 0.1957,
+      "step": 1332
+    },
+    {
+      "epoch": 0.2799537960726662,
+      "grad_norm": 0.5263475775718689,
+      "learning_rate": 0.0001956126868164084,
+      "loss": 0.1348,
+      "step": 1333
+    },
+    {
+      "epoch": 0.28016381392418355,
+      "grad_norm": 0.5277931690216064,
+      "learning_rate": 0.00019560250593899002,
+      "loss": 0.1553,
+      "step": 1334
+    },
+    {
+      "epoch": 0.2803738317757009,
+      "grad_norm": 0.6066485643386841,
+      "learning_rate": 0.00019559231352831715,
+      "loss": 0.2063,
+      "step": 1335
+    },
+    {
+      "epoch": 0.28058384962721833,
+      "grad_norm": 0.668892502784729,
+      "learning_rate": 0.00019558210958561939,
+      "loss": 0.1936,
+      "step": 1336
+    },
+    {
+      "epoch": 0.2807938674787357,
+      "grad_norm": 0.7482750415802002,
+      "learning_rate": 0.00019557189411212772,
+      "loss": 0.2001,
+      "step": 1337
+    },
+    {
+      "epoch": 0.28100388533025306,
+      "grad_norm": 0.8485071659088135,
+      "learning_rate": 0.00019556166710907452,
+      "loss": 0.2095,
+      "step": 1338
+    },
+    {
+      "epoch": 0.2812139031817705,
+      "grad_norm": 0.6988540887832642,
+      "learning_rate": 0.00019555142857769354,
+      "loss": 0.1574,
+      "step": 1339
+    },
+    {
+      "epoch": 0.28142392103328784,
+      "grad_norm": 0.7636557817459106,
+      "learning_rate": 0.00019554117851921992,
+      "loss": 0.2992,
+      "step": 1340
+    },
+    {
+      "epoch": 0.2816339388848052,
+      "grad_norm": 0.47566983103752136,
+      "learning_rate": 0.00019553091693489018,
+      "loss": 0.1391,
+      "step": 1341
+    },
+    {
+      "epoch": 0.28184395673632257,
+      "grad_norm": 0.7958580255508423,
+      "learning_rate": 0.00019552064382594232,
+      "loss": 0.2805,
+      "step": 1342
+    },
+    {
+      "epoch": 0.28205397458784,
+      "grad_norm": 0.6596616506576538,
+      "learning_rate": 0.0001955103591936156,
+      "loss": 0.196,
+      "step": 1343
+    },
+    {
+      "epoch": 0.28226399243935735,
+      "grad_norm": 0.42053359746932983,
+      "learning_rate": 0.0001955000630391508,
+      "loss": 0.159,
+      "step": 1344
+    },
+    {
+      "epoch": 0.2824740102908747,
+      "grad_norm": 0.5475199818611145,
+      "learning_rate": 0.00019548975536378996,
+      "loss": 0.155,
+      "step": 1345
+    },
+    {
+      "epoch": 0.28268402814239213,
+      "grad_norm": 0.6333903074264526,
+      "learning_rate": 0.00019547943616877658,
+      "loss": 0.1959,
+      "step": 1346
+    },
+    {
+      "epoch": 0.2828940459939095,
+      "grad_norm": 0.891545832157135,
+      "learning_rate": 0.00019546910545535558,
+      "loss": 0.2339,
+      "step": 1347
+    },
+    {
+      "epoch": 0.28310406384542686,
+      "grad_norm": 0.5310159921646118,
+      "learning_rate": 0.0001954587632247732,
+      "loss": 0.1912,
+      "step": 1348
+    },
+    {
+      "epoch": 0.2833140816969442,
+      "grad_norm": 0.6109257340431213,
+      "learning_rate": 0.0001954484094782771,
+      "loss": 0.1883,
+      "step": 1349
+    },
+    {
+      "epoch": 0.28352409954846164,
+      "grad_norm": 0.5322039723396301,
+      "learning_rate": 0.00019543804421711639,
+      "loss": 0.2055,
+      "step": 1350
+    },
+    {
+      "epoch": 0.283734117399979,
+      "grad_norm": 0.4660230576992035,
+      "learning_rate": 0.00019542766744254142,
+      "loss": 0.2831,
+      "step": 1351
+    },
+    {
+      "epoch": 0.28394413525149637,
+      "grad_norm": 0.6315797567367554,
+      "learning_rate": 0.00019541727915580408,
+      "loss": 0.2592,
+      "step": 1352
+    },
+    {
+      "epoch": 0.28415415310301373,
+      "grad_norm": 0.488646000623703,
+      "learning_rate": 0.00019540687935815754,
+      "loss": 0.1616,
+      "step": 1353
+    },
+    {
+      "epoch": 0.28436417095453115,
+      "grad_norm": 0.6158355474472046,
+      "learning_rate": 0.00019539646805085648,
+      "loss": 0.291,
+      "step": 1354
+    },
+    {
+      "epoch": 0.2845741888060485,
+      "grad_norm": 0.9165632724761963,
+      "learning_rate": 0.00019538604523515682,
+      "loss": 0.3223,
+      "step": 1355
+    },
+    {
+      "epoch": 0.2847842066575659,
+      "grad_norm": 0.5712313055992126,
+      "learning_rate": 0.00019537561091231598,
+      "loss": 0.2613,
+      "step": 1356
+    },
+    {
+      "epoch": 0.2849942245090833,
+      "grad_norm": 0.7423316240310669,
+      "learning_rate": 0.00019536516508359273,
+      "loss": 0.26,
+      "step": 1357
+    },
+    {
+      "epoch": 0.28520424236060066,
+      "grad_norm": 0.5036366581916809,
+      "learning_rate": 0.00019535470775024723,
+      "loss": 0.1588,
+      "step": 1358
+    },
+    {
+      "epoch": 0.285414260212118,
+      "grad_norm": 0.9148122668266296,
+      "learning_rate": 0.00019534423891354102,
+      "loss": 0.3,
+      "step": 1359
+    },
+    {
+      "epoch": 0.2856242780636354,
+      "grad_norm": 0.6107314825057983,
+      "learning_rate": 0.00019533375857473702,
+      "loss": 0.2617,
+      "step": 1360
+    },
+    {
+      "epoch": 0.2858342959151528,
+      "grad_norm": 0.658970832824707,
+      "learning_rate": 0.00019532326673509957,
+      "loss": 0.1758,
+      "step": 1361
+    },
+    {
+      "epoch": 0.28604431376667017,
+      "grad_norm": 0.47163254022598267,
+      "learning_rate": 0.00019531276339589438,
+      "loss": 0.1711,
+      "step": 1362
+    },
+    {
+      "epoch": 0.28625433161818753,
+      "grad_norm": 0.4909091293811798,
+      "learning_rate": 0.0001953022485583886,
+      "loss": 0.1717,
+      "step": 1363
+    },
+    {
+      "epoch": 0.28646434946970495,
+      "grad_norm": 0.7579825520515442,
+      "learning_rate": 0.00019529172222385063,
+      "loss": 0.2174,
+      "step": 1364
+    },
+    {
+      "epoch": 0.2866743673212223,
+      "grad_norm": 0.5118494033813477,
+      "learning_rate": 0.00019528118439355034,
+      "loss": 0.306,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2868843851727397,
+      "grad_norm": 0.7731876969337463,
+      "learning_rate": 0.00019527063506875905,
+      "loss": 0.3721,
+      "step": 1366
+    },
+    {
+      "epoch": 0.28709440302425704,
+      "grad_norm": 0.7073164582252502,
+      "learning_rate": 0.00019526007425074937,
+      "loss": 0.1929,
+      "step": 1367
+    },
+    {
+      "epoch": 0.28730442087577446,
+      "grad_norm": 0.5947062969207764,
+      "learning_rate": 0.00019524950194079534,
+      "loss": 0.1888,
+      "step": 1368
+    },
+    {
+      "epoch": 0.2875144387272918,
+      "grad_norm": 0.6583393812179565,
+      "learning_rate": 0.00019523891814017237,
+      "loss": 0.1993,
+      "step": 1369
+    },
+    {
+      "epoch": 0.2877244565788092,
+      "grad_norm": 0.5850254893302917,
+      "learning_rate": 0.0001952283228501573,
+      "loss": 0.2447,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2879344744303266,
+      "grad_norm": 0.6584855914115906,
+      "learning_rate": 0.00019521771607202822,
+      "loss": 0.2517,
+      "step": 1371
+    },
+    {
+      "epoch": 0.28814449228184397,
+      "grad_norm": 0.6195645332336426,
+      "learning_rate": 0.00019520709780706486,
+      "loss": 0.183,
+      "step": 1372
+    },
+    {
+      "epoch": 0.28835451013336133,
+      "grad_norm": 0.6261805295944214,
+      "learning_rate": 0.00019519646805654802,
+      "loss": 0.194,
+      "step": 1373
+    },
+    {
+      "epoch": 0.2885645279848787,
+      "grad_norm": 0.9010648727416992,
+      "learning_rate": 0.00019518582682176018,
+      "loss": 0.3978,
+      "step": 1374
+    },
+    {
+      "epoch": 0.2887745458363961,
+      "grad_norm": 0.7001438736915588,
+      "learning_rate": 0.00019517517410398501,
+      "loss": 0.1781,
+      "step": 1375
+    },
+    {
+      "epoch": 0.2889845636879135,
+      "grad_norm": 0.5351212024688721,
+      "learning_rate": 0.00019516450990450762,
+      "loss": 0.1933,
+      "step": 1376
+    },
+    {
+      "epoch": 0.28919458153943084,
+      "grad_norm": 0.5710474252700806,
+      "learning_rate": 0.00019515383422461454,
+      "loss": 0.2288,
+      "step": 1377
+    },
+    {
+      "epoch": 0.2894045993909482,
+      "grad_norm": 0.6244832873344421,
+      "learning_rate": 0.00019514314706559364,
+      "loss": 0.1796,
+      "step": 1378
+    },
+    {
+      "epoch": 0.2896146172424656,
+      "grad_norm": 0.8345138430595398,
+      "learning_rate": 0.0001951324484287342,
+      "loss": 0.2747,
+      "step": 1379
+    },
+    {
+      "epoch": 0.289824635093983,
+      "grad_norm": 0.9983725547790527,
+      "learning_rate": 0.0001951217383153269,
+      "loss": 0.2806,
+      "step": 1380
+    },
+    {
+      "epoch": 0.29003465294550035,
+      "grad_norm": 0.6748480200767517,
+      "learning_rate": 0.00019511101672666374,
+      "loss": 0.1839,
+      "step": 1381
+    },
+    {
+      "epoch": 0.29024467079701777,
+      "grad_norm": 0.5453517436981201,
+      "learning_rate": 0.0001951002836640382,
+      "loss": 0.143,
+      "step": 1382
+    },
+    {
+      "epoch": 0.29045468864853513,
+      "grad_norm": 0.4747224748134613,
+      "learning_rate": 0.00019508953912874503,
+      "loss": 0.1445,
+      "step": 1383
+    },
+    {
+      "epoch": 0.2906647065000525,
+      "grad_norm": 0.8038759231567383,
+      "learning_rate": 0.0001950787831220804,
+      "loss": 0.2043,
+      "step": 1384
+    },
+    {
+      "epoch": 0.29087472435156986,
+      "grad_norm": 0.7946329116821289,
+      "learning_rate": 0.000195068015645342,
+      "loss": 0.1471,
+      "step": 1385
+    },
+    {
+      "epoch": 0.2910847422030873,
+      "grad_norm": 0.7635067105293274,
+      "learning_rate": 0.0001950572366998287,
+      "loss": 0.193,
+      "step": 1386
+    },
+    {
+      "epoch": 0.29129476005460464,
+      "grad_norm": 0.8812301754951477,
+      "learning_rate": 0.0001950464462868409,
+      "loss": 0.3098,
+      "step": 1387
+    },
+    {
+      "epoch": 0.291504777906122,
+      "grad_norm": 0.6174623370170593,
+      "learning_rate": 0.00019503564440768033,
+      "loss": 0.22,
+      "step": 1388
+    },
+    {
+      "epoch": 0.2917147957576394,
+      "grad_norm": 0.6797242164611816,
+      "learning_rate": 0.00019502483106365005,
+      "loss": 0.1588,
+      "step": 1389
+    },
+    {
+      "epoch": 0.2919248136091568,
+      "grad_norm": 0.7031803727149963,
+      "learning_rate": 0.0001950140062560546,
+      "loss": 0.179,
+      "step": 1390
+    },
+    {
+      "epoch": 0.29213483146067415,
+      "grad_norm": 0.6158111095428467,
+      "learning_rate": 0.00019500316998619983,
+      "loss": 0.1637,
+      "step": 1391
+    },
+    {
+      "epoch": 0.2923448493121915,
+      "grad_norm": 0.5845907926559448,
+      "learning_rate": 0.000194992322255393,
+      "loss": 0.1435,
+      "step": 1392
+    },
+    {
+      "epoch": 0.29255486716370893,
+      "grad_norm": 0.5280815958976746,
+      "learning_rate": 0.00019498146306494283,
+      "loss": 0.1887,
+      "step": 1393
+    },
+    {
+      "epoch": 0.2927648850152263,
+      "grad_norm": 0.5925720930099487,
+      "learning_rate": 0.00019497059241615922,
+      "loss": 0.2006,
+      "step": 1394
+    },
+    {
+      "epoch": 0.29297490286674366,
+      "grad_norm": 0.6027230620384216,
+      "learning_rate": 0.00019495971031035367,
+      "loss": 0.2644,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2931849207182611,
+      "grad_norm": 0.36043769121170044,
+      "learning_rate": 0.00019494881674883896,
+      "loss": 0.1382,
+      "step": 1396
+    },
+    {
+      "epoch": 0.29339493856977844,
+      "grad_norm": 0.7824574708938599,
+      "learning_rate": 0.00019493791173292923,
+      "loss": 0.3109,
+      "step": 1397
+    },
+    {
+      "epoch": 0.2936049564212958,
+      "grad_norm": 0.590056300163269,
+      "learning_rate": 0.00019492699526394005,
+      "loss": 0.1493,
+      "step": 1398
+    },
+    {
+      "epoch": 0.29381497427281317,
+      "grad_norm": 0.4534437954425812,
+      "learning_rate": 0.00019491606734318837,
+      "loss": 0.1195,
+      "step": 1399
+    },
+    {
+      "epoch": 0.2940249921243306,
+      "grad_norm": 0.6726170778274536,
+      "learning_rate": 0.0001949051279719925,
+      "loss": 0.2224,
+      "step": 1400
+    },
+    {
+      "epoch": 0.29423500997584795,
+      "grad_norm": 0.6782220602035522,
+      "learning_rate": 0.00019489417715167214,
+      "loss": 0.2842,
+      "step": 1401
+    },
+    {
+      "epoch": 0.2944450278273653,
+      "grad_norm": 0.6347521543502808,
+      "learning_rate": 0.00019488321488354834,
+      "loss": 0.3274,
+      "step": 1402
+    },
+    {
+      "epoch": 0.29465504567888273,
+      "grad_norm": 0.5405476689338684,
+      "learning_rate": 0.0001948722411689436,
+      "loss": 0.2027,
+      "step": 1403
+    },
+    {
+      "epoch": 0.2948650635304001,
+      "grad_norm": 0.7271124124526978,
+      "learning_rate": 0.00019486125600918177,
+      "loss": 0.3642,
+      "step": 1404
+    },
+    {
+      "epoch": 0.29507508138191746,
+      "grad_norm": 0.6853488683700562,
+      "learning_rate": 0.00019485025940558804,
+      "loss": 0.2335,
+      "step": 1405
+    },
+    {
+      "epoch": 0.2952850992334348,
+      "grad_norm": 0.9300238490104675,
+      "learning_rate": 0.00019483925135948903,
+      "loss": 0.3032,
+      "step": 1406
+    },
+    {
+      "epoch": 0.29549511708495224,
+      "grad_norm": 0.4893563985824585,
+      "learning_rate": 0.0001948282318722127,
+      "loss": 0.1537,
+      "step": 1407
+    },
+    {
+      "epoch": 0.2957051349364696,
+      "grad_norm": 0.4719460606575012,
+      "learning_rate": 0.00019481720094508847,
+      "loss": 0.208,
+      "step": 1408
+    },
+    {
+      "epoch": 0.29591515278798697,
+      "grad_norm": 0.5899675488471985,
+      "learning_rate": 0.00019480615857944705,
+      "loss": 0.339,
+      "step": 1409
+    },
+    {
+      "epoch": 0.29612517063950433,
+      "grad_norm": 0.7662889361381531,
+      "learning_rate": 0.00019479510477662053,
+      "loss": 0.2625,
+      "step": 1410
+    },
+    {
+      "epoch": 0.29633518849102175,
+      "grad_norm": 0.7491856813430786,
+      "learning_rate": 0.00019478403953794246,
+      "loss": 0.272,
+      "step": 1411
+    },
+    {
+      "epoch": 0.2965452063425391,
+      "grad_norm": 0.5673351287841797,
+      "learning_rate": 0.00019477296286474772,
+      "loss": 0.2619,
+      "step": 1412
+    },
+    {
+      "epoch": 0.2967552241940565,
+      "grad_norm": 0.5551652908325195,
+      "learning_rate": 0.00019476187475837256,
+      "loss": 0.1971,
+      "step": 1413
+    },
+    {
+      "epoch": 0.2969652420455739,
+      "grad_norm": 0.6211804747581482,
+      "learning_rate": 0.00019475077522015463,
+      "loss": 0.2488,
+      "step": 1414
+    },
+    {
+      "epoch": 0.29717525989709126,
+      "grad_norm": 0.5500017404556274,
+      "learning_rate": 0.00019473966425143292,
+      "loss": 0.1745,
+      "step": 1415
+    },
+    {
+      "epoch": 0.2973852777486086,
+      "grad_norm": 0.8525585532188416,
+      "learning_rate": 0.00019472854185354792,
+      "loss": 0.2419,
+      "step": 1416
+    },
+    {
+      "epoch": 0.297595295600126,
+      "grad_norm": 0.8762658834457397,
+      "learning_rate": 0.0001947174080278413,
+      "loss": 0.2314,
+      "step": 1417
+    },
+    {
+      "epoch": 0.2978053134516434,
+      "grad_norm": 0.6880355477333069,
+      "learning_rate": 0.00019470626277565627,
+      "loss": 0.1981,
+      "step": 1418
+    },
+    {
+      "epoch": 0.29801533130316077,
+      "grad_norm": 0.4452259838581085,
+      "learning_rate": 0.00019469510609833736,
+      "loss": 0.1546,
+      "step": 1419
+    },
+    {
+      "epoch": 0.29822534915467813,
+      "grad_norm": 0.6769298911094666,
+      "learning_rate": 0.0001946839379972305,
+      "loss": 0.2219,
+      "step": 1420
+    },
+    {
+      "epoch": 0.29843536700619555,
+      "grad_norm": 0.7594902515411377,
+      "learning_rate": 0.00019467275847368296,
+      "loss": 0.2559,
+      "step": 1421
+    },
+    {
+      "epoch": 0.2986453848577129,
+      "grad_norm": 0.7277812957763672,
+      "learning_rate": 0.00019466156752904343,
+      "loss": 0.1504,
+      "step": 1422
+    },
+    {
+      "epoch": 0.2988554027092303,
+      "grad_norm": 0.47229236364364624,
+      "learning_rate": 0.00019465036516466192,
+      "loss": 0.215,
+      "step": 1423
+    },
+    {
+      "epoch": 0.29906542056074764,
+      "grad_norm": 0.7107577919960022,
+      "learning_rate": 0.00019463915138188994,
+      "loss": 0.323,
+      "step": 1424
+    },
+    {
+      "epoch": 0.29927543841226506,
+      "grad_norm": 0.7239084243774414,
+      "learning_rate": 0.00019462792618208017,
+      "loss": 0.2366,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2994854562637824,
+      "grad_norm": 0.5515862107276917,
+      "learning_rate": 0.0001946166895665869,
+      "loss": 0.1774,
+      "step": 1426
+    },
+    {
+      "epoch": 0.2996954741152998,
+      "grad_norm": 0.8459638357162476,
+      "learning_rate": 0.00019460544153676563,
+      "loss": 0.3038,
+      "step": 1427
+    },
+    {
+      "epoch": 0.2999054919668172,
+      "grad_norm": 0.7625037431716919,
+      "learning_rate": 0.0001945941820939733,
+      "loss": 0.226,
+      "step": 1428
+    },
+    {
+      "epoch": 0.30011550981833457,
+      "grad_norm": 0.6378094553947449,
+      "learning_rate": 0.00019458291123956823,
+      "loss": 0.2211,
+      "step": 1429
+    },
+    {
+      "epoch": 0.30032552766985193,
+      "grad_norm": 0.7584728002548218,
+      "learning_rate": 0.00019457162897491018,
+      "loss": 0.1805,
+      "step": 1430
+    },
+    {
+      "epoch": 0.3005355455213693,
+      "grad_norm": 0.7867018580436707,
+      "learning_rate": 0.0001945603353013601,
+      "loss": 0.2666,
+      "step": 1431
+    },
+    {
+      "epoch": 0.3007455633728867,
+      "grad_norm": 0.8175768852233887,
+      "learning_rate": 0.00019454903022028046,
+      "loss": 0.2373,
+      "step": 1432
+    },
+    {
+      "epoch": 0.3009555812244041,
+      "grad_norm": 0.4827120006084442,
+      "learning_rate": 0.0001945377137330351,
+      "loss": 0.1528,
+      "step": 1433
+    },
+    {
+      "epoch": 0.30116559907592144,
+      "grad_norm": 0.7759237885475159,
+      "learning_rate": 0.00019452638584098925,
+      "loss": 0.2113,
+      "step": 1434
+    },
+    {
+      "epoch": 0.3013756169274388,
+      "grad_norm": 0.4536662995815277,
+      "learning_rate": 0.00019451504654550937,
+      "loss": 0.1506,
+      "step": 1435
+    },
+    {
+      "epoch": 0.3015856347789562,
+      "grad_norm": 0.7473581433296204,
+      "learning_rate": 0.00019450369584796354,
+      "loss": 0.2763,
+      "step": 1436
+    },
+    {
+      "epoch": 0.3017956526304736,
+      "grad_norm": 0.4402840733528137,
+      "learning_rate": 0.00019449233374972097,
+      "loss": 0.1507,
+      "step": 1437
+    },
+    {
+      "epoch": 0.30200567048199095,
+      "grad_norm": 0.39426618814468384,
+      "learning_rate": 0.00019448096025215242,
+      "loss": 0.1298,
+      "step": 1438
+    },
+    {
+      "epoch": 0.30221568833350837,
+      "grad_norm": 0.5444751977920532,
+      "learning_rate": 0.00019446957535662992,
+      "loss": 0.2057,
+      "step": 1439
+    },
+    {
+      "epoch": 0.30242570618502573,
+      "grad_norm": 1.0035810470581055,
+      "learning_rate": 0.00019445817906452696,
+      "loss": 0.2256,
+      "step": 1440
+    },
+    {
+      "epoch": 0.3026357240365431,
+      "grad_norm": 0.6751492023468018,
+      "learning_rate": 0.00019444677137721834,
+      "loss": 0.2483,
+      "step": 1441
+    },
+    {
+      "epoch": 0.30284574188806046,
+      "grad_norm": 0.6625372767448425,
+      "learning_rate": 0.00019443535229608024,
+      "loss": 0.2556,
+      "step": 1442
+    },
+    {
+      "epoch": 0.3030557597395779,
+      "grad_norm": 0.5708040595054626,
+      "learning_rate": 0.00019442392182249024,
+      "loss": 0.2081,
+      "step": 1443
+    },
+    {
+      "epoch": 0.30326577759109524,
+      "grad_norm": 0.46071097254753113,
+      "learning_rate": 0.00019441247995782731,
+      "loss": 0.1115,
+      "step": 1444
+    },
+    {
+      "epoch": 0.3034757954426126,
+      "grad_norm": 0.7548226714134216,
+      "learning_rate": 0.00019440102670347176,
+      "loss": 0.2926,
+      "step": 1445
+    },
+    {
+      "epoch": 0.30368581329413,
+      "grad_norm": 0.4947049021720886,
+      "learning_rate": 0.00019438956206080526,
+      "loss": 0.1596,
+      "step": 1446
+    },
+    {
+      "epoch": 0.3038958311456474,
+      "grad_norm": 0.5163564085960388,
+      "learning_rate": 0.00019437808603121087,
+      "loss": 0.1452,
+      "step": 1447
+    },
+    {
+      "epoch": 0.30410584899716475,
+      "grad_norm": 0.7467600107192993,
+      "learning_rate": 0.00019436659861607304,
+      "loss": 0.2338,
+      "step": 1448
+    },
+    {
+      "epoch": 0.3043158668486821,
+      "grad_norm": 0.6491103172302246,
+      "learning_rate": 0.00019435509981677762,
+      "loss": 0.2451,
+      "step": 1449
+    },
+    {
+      "epoch": 0.30452588470019953,
+      "grad_norm": 0.5478000640869141,
+      "learning_rate": 0.00019434358963471175,
+      "loss": 0.206,
+      "step": 1450
+    },
+    {
+      "epoch": 0.3047359025517169,
+      "grad_norm": 0.6403598189353943,
+      "learning_rate": 0.000194332068071264,
+      "loss": 0.3738,
+      "step": 1451
+    },
+    {
+      "epoch": 0.30494592040323426,
+      "grad_norm": 0.7929242253303528,
+      "learning_rate": 0.00019432053512782435,
+      "loss": 0.3969,
+      "step": 1452
+    },
+    {
+      "epoch": 0.3051559382547517,
+      "grad_norm": 0.5820061564445496,
+      "learning_rate": 0.00019430899080578407,
+      "loss": 0.2117,
+      "step": 1453
+    },
+    {
+      "epoch": 0.30536595610626904,
+      "grad_norm": 0.5793547034263611,
+      "learning_rate": 0.0001942974351065358,
+      "loss": 0.2539,
+      "step": 1454
+    },
+    {
+      "epoch": 0.3055759739577864,
+      "grad_norm": 0.5798085331916809,
+      "learning_rate": 0.00019428586803147365,
+      "loss": 0.2523,
+      "step": 1455
+    },
+    {
+      "epoch": 0.30578599180930377,
+      "grad_norm": 0.5463792085647583,
+      "learning_rate": 0.00019427428958199302,
+      "loss": 0.2095,
+      "step": 1456
+    },
+    {
+      "epoch": 0.3059960096608212,
+      "grad_norm": 0.7590929865837097,
+      "learning_rate": 0.00019426269975949073,
+      "loss": 0.2232,
+      "step": 1457
+    },
+    {
+      "epoch": 0.30620602751233855,
+      "grad_norm": 0.6503840088844299,
+      "learning_rate": 0.0001942510985653649,
+      "loss": 0.2576,
+      "step": 1458
+    },
+    {
+      "epoch": 0.3064160453638559,
+      "grad_norm": 0.7579488158226013,
+      "learning_rate": 0.0001942394860010151,
+      "loss": 0.2439,
+      "step": 1459
+    },
+    {
+      "epoch": 0.3066260632153733,
+      "grad_norm": 0.6022104620933533,
+      "learning_rate": 0.00019422786206784224,
+      "loss": 0.2076,
+      "step": 1460
+    },
+    {
+      "epoch": 0.3068360810668907,
+      "grad_norm": 0.5977317094802856,
+      "learning_rate": 0.00019421622676724863,
+      "loss": 0.2094,
+      "step": 1461
+    },
+    {
+      "epoch": 0.30704609891840806,
+      "grad_norm": 0.6613571047782898,
+      "learning_rate": 0.00019420458010063787,
+      "loss": 0.3956,
+      "step": 1462
+    },
+    {
+      "epoch": 0.3072561167699254,
+      "grad_norm": 0.5474089980125427,
+      "learning_rate": 0.00019419292206941503,
+      "loss": 0.2094,
+      "step": 1463
+    },
+    {
+      "epoch": 0.30746613462144284,
+      "grad_norm": 0.6770557761192322,
+      "learning_rate": 0.0001941812526749865,
+      "loss": 0.2626,
+      "step": 1464
+    },
+    {
+      "epoch": 0.3076761524729602,
+      "grad_norm": 0.7406523823738098,
+      "learning_rate": 0.00019416957191876,
+      "loss": 0.1788,
+      "step": 1465
+    },
+    {
+      "epoch": 0.30788617032447757,
+      "grad_norm": 0.6642553210258484,
+      "learning_rate": 0.00019415787980214472,
+      "loss": 0.2695,
+      "step": 1466
+    },
+    {
+      "epoch": 0.30809618817599493,
+      "grad_norm": 0.5584815740585327,
+      "learning_rate": 0.00019414617632655115,
+      "loss": 0.1823,
+      "step": 1467
+    },
+    {
+      "epoch": 0.30830620602751235,
+      "grad_norm": 0.9342876076698303,
+      "learning_rate": 0.00019413446149339119,
+      "loss": 0.2435,
+      "step": 1468
+    },
+    {
+      "epoch": 0.3085162238790297,
+      "grad_norm": 0.9271596670150757,
+      "learning_rate": 0.00019412273530407804,
+      "loss": 0.2211,
+      "step": 1469
+    },
+    {
+      "epoch": 0.3087262417305471,
+      "grad_norm": 0.7659008502960205,
+      "learning_rate": 0.00019411099776002637,
+      "loss": 0.231,
+      "step": 1470
+    },
+    {
+      "epoch": 0.3089362595820645,
+      "grad_norm": 0.5303452014923096,
+      "learning_rate": 0.00019409924886265215,
+      "loss": 0.1846,
+      "step": 1471
+    },
+    {
+      "epoch": 0.30914627743358186,
+      "grad_norm": 0.6797159314155579,
+      "learning_rate": 0.00019408748861337273,
+      "loss": 0.1999,
+      "step": 1472
+    },
+    {
+      "epoch": 0.3093562952850992,
+      "grad_norm": 0.8978555798530579,
+      "learning_rate": 0.00019407571701360684,
+      "loss": 0.2137,
+      "step": 1473
+    },
+    {
+      "epoch": 0.3095663131366166,
+      "grad_norm": 0.7389368414878845,
+      "learning_rate": 0.0001940639340647746,
+      "loss": 0.21,
+      "step": 1474
+    },
+    {
+      "epoch": 0.309776330988134,
+      "grad_norm": 0.9239479899406433,
+      "learning_rate": 0.00019405213976829745,
+      "loss": 0.2784,
+      "step": 1475
+    },
+    {
+      "epoch": 0.30998634883965137,
+      "grad_norm": 0.5426455140113831,
+      "learning_rate": 0.00019404033412559826,
+      "loss": 0.1512,
+      "step": 1476
+    },
+    {
+      "epoch": 0.31019636669116873,
+      "grad_norm": 0.7385450005531311,
+      "learning_rate": 0.0001940285171381012,
+      "loss": 0.3286,
+      "step": 1477
+    },
+    {
+      "epoch": 0.31040638454268615,
+      "grad_norm": 1.2557915449142456,
+      "learning_rate": 0.00019401668880723183,
+      "loss": 0.2918,
+      "step": 1478
+    },
+    {
+      "epoch": 0.3106164023942035,
+      "grad_norm": 0.503313422203064,
+      "learning_rate": 0.0001940048491344171,
+      "loss": 0.1517,
+      "step": 1479
+    },
+    {
+      "epoch": 0.3108264202457209,
+      "grad_norm": 0.7646273374557495,
+      "learning_rate": 0.00019399299812108538,
+      "loss": 0.1937,
+      "step": 1480
+    },
+    {
+      "epoch": 0.31103643809723824,
+      "grad_norm": 0.7608271837234497,
+      "learning_rate": 0.00019398113576866627,
+      "loss": 0.2097,
+      "step": 1481
+    },
+    {
+      "epoch": 0.31124645594875566,
+      "grad_norm": 0.4755302667617798,
+      "learning_rate": 0.00019396926207859084,
+      "loss": 0.1295,
+      "step": 1482
+    },
+    {
+      "epoch": 0.311456473800273,
+      "grad_norm": 0.6126772165298462,
+      "learning_rate": 0.00019395737705229152,
+      "loss": 0.2683,
+      "step": 1483
+    },
+    {
+      "epoch": 0.3116664916517904,
+      "grad_norm": 0.5116230249404907,
+      "learning_rate": 0.0001939454806912021,
+      "loss": 0.1898,
+      "step": 1484
+    },
+    {
+      "epoch": 0.3118765095033078,
+      "grad_norm": 0.5085896849632263,
+      "learning_rate": 0.00019393357299675765,
+      "loss": 0.1396,
+      "step": 1485
+    },
+    {
+      "epoch": 0.31208652735482517,
+      "grad_norm": 0.6756225228309631,
+      "learning_rate": 0.0001939216539703948,
+      "loss": 0.246,
+      "step": 1486
+    },
+    {
+      "epoch": 0.31229654520634254,
+      "grad_norm": 0.7151501178741455,
+      "learning_rate": 0.00019390972361355132,
+      "loss": 0.2391,
+      "step": 1487
+    },
+    {
+      "epoch": 0.3125065630578599,
+      "grad_norm": 0.5057124495506287,
+      "learning_rate": 0.00019389778192766655,
+      "loss": 0.1646,
+      "step": 1488
+    },
+    {
+      "epoch": 0.3127165809093773,
+      "grad_norm": 0.4521079957485199,
+      "learning_rate": 0.0001938858289141811,
+      "loss": 0.1356,
+      "step": 1489
+    },
+    {
+      "epoch": 0.3129265987608947,
+      "grad_norm": 0.5103457570075989,
+      "learning_rate": 0.00019387386457453686,
+      "loss": 0.1519,
+      "step": 1490
+    },
+    {
+      "epoch": 0.31313661661241204,
+      "grad_norm": 0.6395041346549988,
+      "learning_rate": 0.0001938618889101773,
+      "loss": 0.2737,
+      "step": 1491
+    },
+    {
+      "epoch": 0.3133466344639294,
+      "grad_norm": 0.6011154055595398,
+      "learning_rate": 0.00019384990192254704,
+      "loss": 0.2148,
+      "step": 1492
+    },
+    {
+      "epoch": 0.3135566523154468,
+      "grad_norm": 0.554009735584259,
+      "learning_rate": 0.0001938379036130922,
+      "loss": 0.1939,
+      "step": 1493
+    },
+    {
+      "epoch": 0.3137666701669642,
+      "grad_norm": 0.6606349349021912,
+      "learning_rate": 0.00019382589398326023,
+      "loss": 0.2804,
+      "step": 1494
+    },
+    {
+      "epoch": 0.31397668801848155,
+      "grad_norm": 0.5309305787086487,
+      "learning_rate": 0.00019381387303449995,
+      "loss": 0.2004,
+      "step": 1495
+    },
+    {
+      "epoch": 0.314186705869999,
+      "grad_norm": 0.6982274055480957,
+      "learning_rate": 0.0001938018407682615,
+      "loss": 0.2301,
+      "step": 1496
+    },
+    {
+      "epoch": 0.31439672372151634,
+      "grad_norm": 0.6347817778587341,
+      "learning_rate": 0.00019378979718599645,
+      "loss": 0.2073,
+      "step": 1497
+    },
+    {
+      "epoch": 0.3146067415730337,
+      "grad_norm": 0.5214099884033203,
+      "learning_rate": 0.00019377774228915775,
+      "loss": 0.1831,
+      "step": 1498
+    },
+    {
+      "epoch": 0.31481675942455106,
+      "grad_norm": 0.5580305457115173,
+      "learning_rate": 0.0001937656760791996,
+      "loss": 0.2311,
+      "step": 1499
+    },
+    {
+      "epoch": 0.3150267772760685,
+      "grad_norm": 0.7240638732910156,
+      "learning_rate": 0.00019375359855757767,
+      "loss": 0.2134,
+      "step": 1500
+    },
+    {
+      "epoch": 0.31523679512758584,
+      "grad_norm": 0.681068480014801,
+      "learning_rate": 0.00019374150972574896,
+      "loss": 0.2452,
+      "step": 1501
+    },
+    {
+      "epoch": 0.3154468129791032,
+      "grad_norm": 0.723272442817688,
+      "learning_rate": 0.00019372940958517184,
+      "loss": 0.2387,
+      "step": 1502
+    },
+    {
+      "epoch": 0.3156568308306206,
+      "grad_norm": 0.5136252045631409,
+      "learning_rate": 0.00019371729813730606,
+      "loss": 0.161,
+      "step": 1503
+    },
+    {
+      "epoch": 0.315866848682138,
+      "grad_norm": 0.5376479625701904,
+      "learning_rate": 0.00019370517538361268,
+      "loss": 0.2566,
+      "step": 1504
+    },
+    {
+      "epoch": 0.31607686653365535,
+      "grad_norm": 0.4473995566368103,
+      "learning_rate": 0.00019369304132555416,
+      "loss": 0.2606,
+      "step": 1505
+    },
+    {
+      "epoch": 0.3162868843851727,
+      "grad_norm": 0.6067336797714233,
+      "learning_rate": 0.00019368089596459438,
+      "loss": 0.1801,
+      "step": 1506
+    },
+    {
+      "epoch": 0.31649690223669014,
+      "grad_norm": 0.654007077217102,
+      "learning_rate": 0.00019366873930219846,
+      "loss": 0.1709,
+      "step": 1507
+    },
+    {
+      "epoch": 0.3167069200882075,
+      "grad_norm": 0.803130567073822,
+      "learning_rate": 0.00019365657133983298,
+      "loss": 0.2791,
+      "step": 1508
+    },
+    {
+      "epoch": 0.31691693793972486,
+      "grad_norm": 0.6500551104545593,
+      "learning_rate": 0.00019364439207896584,
+      "loss": 0.2356,
+      "step": 1509
+    },
+    {
+      "epoch": 0.3171269557912423,
+      "grad_norm": 0.644919753074646,
+      "learning_rate": 0.00019363220152106636,
+      "loss": 0.3375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.31733697364275965,
+      "grad_norm": 0.6162266135215759,
+      "learning_rate": 0.00019361999966760514,
+      "loss": 0.2503,
+      "step": 1511
+    },
+    {
+      "epoch": 0.317546991494277,
+      "grad_norm": 0.7240517139434814,
+      "learning_rate": 0.00019360778652005416,
+      "loss": 0.2265,
+      "step": 1512
+    },
+    {
+      "epoch": 0.3177570093457944,
+      "grad_norm": 0.6839559674263,
+      "learning_rate": 0.00019359556207988683,
+      "loss": 0.2455,
+      "step": 1513
+    },
+    {
+      "epoch": 0.3179670271973118,
+      "grad_norm": 0.6277573108673096,
+      "learning_rate": 0.00019358332634857787,
+      "loss": 0.1721,
+      "step": 1514
+    },
+    {
+      "epoch": 0.31817704504882915,
+      "grad_norm": 0.7677130103111267,
+      "learning_rate": 0.00019357107932760334,
+      "loss": 0.3157,
+      "step": 1515
+    },
+    {
+      "epoch": 0.3183870629003465,
+      "grad_norm": 0.8040608167648315,
+      "learning_rate": 0.00019355882101844074,
+      "loss": 0.2176,
+      "step": 1516
+    },
+    {
+      "epoch": 0.3185970807518639,
+      "grad_norm": 0.8311493396759033,
+      "learning_rate": 0.0001935465514225688,
+      "loss": 0.2872,
+      "step": 1517
+    },
+    {
+      "epoch": 0.3188070986033813,
+      "grad_norm": 0.5430156588554382,
+      "learning_rate": 0.0001935342705414678,
+      "loss": 0.14,
+      "step": 1518
+    },
+    {
+      "epoch": 0.31901711645489866,
+      "grad_norm": 0.6012011766433716,
+      "learning_rate": 0.00019352197837661922,
+      "loss": 0.2076,
+      "step": 1519
+    },
+    {
+      "epoch": 0.319227134306416,
+      "grad_norm": 0.6674286127090454,
+      "learning_rate": 0.0001935096749295059,
+      "loss": 0.2552,
+      "step": 1520
+    },
+    {
+      "epoch": 0.31943715215793345,
+      "grad_norm": 0.8329024910926819,
+      "learning_rate": 0.0001934973602016122,
+      "loss": 0.369,
+      "step": 1521
+    },
+    {
+      "epoch": 0.3196471700094508,
+      "grad_norm": 0.5887104868888855,
+      "learning_rate": 0.0001934850341944237,
+      "loss": 0.2622,
+      "step": 1522
+    },
+    {
+      "epoch": 0.3198571878609682,
+      "grad_norm": 0.5798863172531128,
+      "learning_rate": 0.00019347269690942736,
+      "loss": 0.1691,
+      "step": 1523
+    },
+    {
+      "epoch": 0.32006720571248554,
+      "grad_norm": 0.5610338449478149,
+      "learning_rate": 0.00019346034834811154,
+      "loss": 0.236,
+      "step": 1524
+    },
+    {
+      "epoch": 0.32027722356400296,
+      "grad_norm": 0.6147192120552063,
+      "learning_rate": 0.00019344798851196596,
+      "loss": 0.3255,
+      "step": 1525
+    },
+    {
+      "epoch": 0.3204872414155203,
+      "grad_norm": 0.5580636262893677,
+      "learning_rate": 0.00019343561740248165,
+      "loss": 0.2891,
+      "step": 1526
+    },
+    {
+      "epoch": 0.3206972592670377,
+      "grad_norm": 0.6528330445289612,
+      "learning_rate": 0.00019342323502115103,
+      "loss": 0.2725,
+      "step": 1527
+    },
+    {
+      "epoch": 0.3209072771185551,
+      "grad_norm": 0.641440212726593,
+      "learning_rate": 0.00019341084136946786,
+      "loss": 0.2535,
+      "step": 1528
+    },
+    {
+      "epoch": 0.32111729497007246,
+      "grad_norm": 0.5117334127426147,
+      "learning_rate": 0.00019339843644892735,
+      "loss": 0.2805,
+      "step": 1529
+    },
+    {
+      "epoch": 0.32132731282158983,
+      "grad_norm": 0.6867427825927734,
+      "learning_rate": 0.00019338602026102594,
+      "loss": 0.2848,
+      "step": 1530
+    },
+    {
+      "epoch": 0.3215373306731072,
+      "grad_norm": 0.635741651058197,
+      "learning_rate": 0.0001933735928072615,
+      "loss": 0.2608,
+      "step": 1531
+    },
+    {
+      "epoch": 0.3217473485246246,
+      "grad_norm": 0.745174765586853,
+      "learning_rate": 0.00019336115408913327,
+      "loss": 0.1958,
+      "step": 1532
+    },
+    {
+      "epoch": 0.321957366376142,
+      "grad_norm": 0.5438898205757141,
+      "learning_rate": 0.0001933487041081418,
+      "loss": 0.1649,
+      "step": 1533
+    },
+    {
+      "epoch": 0.32216738422765934,
+      "grad_norm": 0.6228841543197632,
+      "learning_rate": 0.00019333624286578904,
+      "loss": 0.2266,
+      "step": 1534
+    },
+    {
+      "epoch": 0.32237740207917676,
+      "grad_norm": 0.9263353943824768,
+      "learning_rate": 0.00019332377036357826,
+      "loss": 0.2374,
+      "step": 1535
+    },
+    {
+      "epoch": 0.3225874199306941,
+      "grad_norm": 0.5164491534233093,
+      "learning_rate": 0.00019331128660301418,
+      "loss": 0.1553,
+      "step": 1536
+    },
+    {
+      "epoch": 0.3227974377822115,
+      "grad_norm": 0.6456015110015869,
+      "learning_rate": 0.00019329879158560274,
+      "loss": 0.2299,
+      "step": 1537
+    },
+    {
+      "epoch": 0.32300745563372885,
+      "grad_norm": 0.735962450504303,
+      "learning_rate": 0.00019328628531285134,
+      "loss": 0.1944,
+      "step": 1538
+    },
+    {
+      "epoch": 0.32321747348524626,
+      "grad_norm": 0.5499973893165588,
+      "learning_rate": 0.0001932737677862687,
+      "loss": 0.2006,
+      "step": 1539
+    },
+    {
+      "epoch": 0.32342749133676363,
+      "grad_norm": 0.5221630334854126,
+      "learning_rate": 0.0001932612390073649,
+      "loss": 0.1536,
+      "step": 1540
+    },
+    {
+      "epoch": 0.323637509188281,
+      "grad_norm": 0.6822124123573303,
+      "learning_rate": 0.00019324869897765137,
+      "loss": 0.2061,
+      "step": 1541
+    },
+    {
+      "epoch": 0.32384752703979836,
+      "grad_norm": 0.6536619663238525,
+      "learning_rate": 0.00019323614769864095,
+      "loss": 0.1727,
+      "step": 1542
+    },
+    {
+      "epoch": 0.3240575448913158,
+      "grad_norm": 0.5224418640136719,
+      "learning_rate": 0.00019322358517184774,
+      "loss": 0.1519,
+      "step": 1543
+    },
+    {
+      "epoch": 0.32426756274283314,
+      "grad_norm": 0.6163767576217651,
+      "learning_rate": 0.00019321101139878729,
+      "loss": 0.1736,
+      "step": 1544
+    },
+    {
+      "epoch": 0.3244775805943505,
+      "grad_norm": 0.7845442295074463,
+      "learning_rate": 0.00019319842638097648,
+      "loss": 0.3157,
+      "step": 1545
+    },
+    {
+      "epoch": 0.3246875984458679,
+      "grad_norm": 0.5813778638839722,
+      "learning_rate": 0.0001931858301199335,
+      "loss": 0.1722,
+      "step": 1546
+    },
+    {
+      "epoch": 0.3248976162973853,
+      "grad_norm": 0.6266569495201111,
+      "learning_rate": 0.00019317322261717794,
+      "loss": 0.1653,
+      "step": 1547
+    },
+    {
+      "epoch": 0.32510763414890265,
+      "grad_norm": 0.6247707009315491,
+      "learning_rate": 0.00019316060387423076,
+      "loss": 0.2562,
+      "step": 1548
+    },
+    {
+      "epoch": 0.32531765200042,
+      "grad_norm": 0.777686357498169,
+      "learning_rate": 0.00019314797389261424,
+      "loss": 0.1978,
+      "step": 1549
+    },
+    {
+      "epoch": 0.32552766985193743,
+      "grad_norm": 0.5679663419723511,
+      "learning_rate": 0.00019313533267385205,
+      "loss": 0.134,
+      "step": 1550
+    },
+    {
+      "epoch": 0.3257376877034548,
+      "grad_norm": 0.4159435033798218,
+      "learning_rate": 0.00019312268021946918,
+      "loss": 0.2976,
+      "step": 1551
+    },
+    {
+      "epoch": 0.32594770555497216,
+      "grad_norm": 0.737212061882019,
+      "learning_rate": 0.00019311001653099193,
+      "loss": 0.2798,
+      "step": 1552
+    },
+    {
+      "epoch": 0.3261577234064896,
+      "grad_norm": 0.7335833311080933,
+      "learning_rate": 0.00019309734160994816,
+      "loss": 0.2681,
+      "step": 1553
+    },
+    {
+      "epoch": 0.32636774125800694,
+      "grad_norm": 0.5905042290687561,
+      "learning_rate": 0.00019308465545786683,
+      "loss": 0.2378,
+      "step": 1554
+    },
+    {
+      "epoch": 0.3265777591095243,
+      "grad_norm": 0.5820741057395935,
+      "learning_rate": 0.00019307195807627837,
+      "loss": 0.2429,
+      "step": 1555
+    },
+    {
+      "epoch": 0.32678777696104166,
+      "grad_norm": 0.6299948692321777,
+      "learning_rate": 0.00019305924946671463,
+      "loss": 0.1792,
+      "step": 1556
+    },
+    {
+      "epoch": 0.3269977948125591,
+      "grad_norm": 0.6246170997619629,
+      "learning_rate": 0.0001930465296307087,
+      "loss": 0.203,
+      "step": 1557
+    },
+    {
+      "epoch": 0.32720781266407645,
+      "grad_norm": 0.5834625363349915,
+      "learning_rate": 0.00019303379856979501,
+      "loss": 0.2687,
+      "step": 1558
+    },
+    {
+      "epoch": 0.3274178305155938,
+      "grad_norm": 0.648408830165863,
+      "learning_rate": 0.00019302105628550952,
+      "loss": 0.2195,
+      "step": 1559
+    },
+    {
+      "epoch": 0.32762784836711123,
+      "grad_norm": 0.7197003960609436,
+      "learning_rate": 0.00019300830277938936,
+      "loss": 0.2717,
+      "step": 1560
+    },
+    {
+      "epoch": 0.3278378662186286,
+      "grad_norm": 0.6736263036727905,
+      "learning_rate": 0.0001929955380529731,
+      "loss": 0.4104,
+      "step": 1561
+    },
+    {
+      "epoch": 0.32804788407014596,
+      "grad_norm": 0.5383338332176208,
+      "learning_rate": 0.00019298276210780068,
+      "loss": 0.2272,
+      "step": 1562
+    },
+    {
+      "epoch": 0.3282579019216633,
+      "grad_norm": 0.5980930328369141,
+      "learning_rate": 0.0001929699749454133,
+      "loss": 0.237,
+      "step": 1563
+    },
+    {
+      "epoch": 0.32846791977318074,
+      "grad_norm": 0.5906263589859009,
+      "learning_rate": 0.00019295717656735357,
+      "loss": 0.2028,
+      "step": 1564
+    },
+    {
+      "epoch": 0.3286779376246981,
+      "grad_norm": 0.5565531849861145,
+      "learning_rate": 0.0001929443669751655,
+      "loss": 0.1398,
+      "step": 1565
+    },
+    {
+      "epoch": 0.32888795547621547,
+      "grad_norm": 0.5388147830963135,
+      "learning_rate": 0.00019293154617039437,
+      "loss": 0.1712,
+      "step": 1566
+    },
+    {
+      "epoch": 0.3290979733277329,
+      "grad_norm": 0.5639932751655579,
+      "learning_rate": 0.00019291871415458688,
+      "loss": 0.1461,
+      "step": 1567
+    },
+    {
+      "epoch": 0.32930799117925025,
+      "grad_norm": 0.8730794787406921,
+      "learning_rate": 0.00019290587092929106,
+      "loss": 0.3195,
+      "step": 1568
+    },
+    {
+      "epoch": 0.3295180090307676,
+      "grad_norm": 0.8476155400276184,
+      "learning_rate": 0.00019289301649605625,
+      "loss": 0.2727,
+      "step": 1569
+    },
+    {
+      "epoch": 0.329728026882285,
+      "grad_norm": 0.8710721135139465,
+      "learning_rate": 0.0001928801508564332,
+      "loss": 0.2103,
+      "step": 1570
+    },
+    {
+      "epoch": 0.3299380447338024,
+      "grad_norm": 0.8265402913093567,
+      "learning_rate": 0.000192867274011974,
+      "loss": 0.3057,
+      "step": 1571
+    },
+    {
+      "epoch": 0.33014806258531976,
+      "grad_norm": 0.7343376278877258,
+      "learning_rate": 0.00019285438596423204,
+      "loss": 0.3153,
+      "step": 1572
+    },
+    {
+      "epoch": 0.3303580804368371,
+      "grad_norm": 0.6627643704414368,
+      "learning_rate": 0.00019284148671476215,
+      "loss": 0.2521,
+      "step": 1573
+    },
+    {
+      "epoch": 0.3305680982883545,
+      "grad_norm": 0.6346861124038696,
+      "learning_rate": 0.0001928285762651204,
+      "loss": 0.1335,
+      "step": 1574
+    },
+    {
+      "epoch": 0.3307781161398719,
+      "grad_norm": 0.7667282223701477,
+      "learning_rate": 0.00019281565461686437,
+      "loss": 0.3119,
+      "step": 1575
+    },
+    {
+      "epoch": 0.33098813399138927,
+      "grad_norm": 0.6494827270507812,
+      "learning_rate": 0.00019280272177155282,
+      "loss": 0.2105,
+      "step": 1576
+    },
+    {
+      "epoch": 0.33119815184290663,
+      "grad_norm": 0.7444926500320435,
+      "learning_rate": 0.00019278977773074595,
+      "loss": 0.2244,
+      "step": 1577
+    },
+    {
+      "epoch": 0.33140816969442405,
+      "grad_norm": 0.5099835991859436,
+      "learning_rate": 0.00019277682249600536,
+      "loss": 0.1396,
+      "step": 1578
+    },
+    {
+      "epoch": 0.3316181875459414,
+      "grad_norm": 0.7752482295036316,
+      "learning_rate": 0.00019276385606889384,
+      "loss": 0.2423,
+      "step": 1579
+    },
+    {
+      "epoch": 0.3318282053974588,
+      "grad_norm": 0.8107450008392334,
+      "learning_rate": 0.0001927508784509757,
+      "loss": 0.348,
+      "step": 1580
+    },
+    {
+      "epoch": 0.33203822324897614,
+      "grad_norm": 0.7489921450614929,
+      "learning_rate": 0.00019273788964381647,
+      "loss": 0.2613,
+      "step": 1581
+    },
+    {
+      "epoch": 0.33224824110049356,
+      "grad_norm": 0.7400534152984619,
+      "learning_rate": 0.00019272488964898316,
+      "loss": 0.2607,
+      "step": 1582
+    },
+    {
+      "epoch": 0.3324582589520109,
+      "grad_norm": 0.5672757029533386,
+      "learning_rate": 0.00019271187846804403,
+      "loss": 0.2251,
+      "step": 1583
+    },
+    {
+      "epoch": 0.3326682768035283,
+      "grad_norm": 0.7830666303634644,
+      "learning_rate": 0.00019269885610256865,
+      "loss": 0.2471,
+      "step": 1584
+    },
+    {
+      "epoch": 0.3328782946550457,
+      "grad_norm": 0.753364622592926,
+      "learning_rate": 0.00019268582255412814,
+      "loss": 0.2339,
+      "step": 1585
+    },
+    {
+      "epoch": 0.33308831250656307,
+      "grad_norm": 0.6402462124824524,
+      "learning_rate": 0.0001926727778242947,
+      "loss": 0.1535,
+      "step": 1586
+    },
+    {
+      "epoch": 0.33329833035808043,
+      "grad_norm": 0.9237805008888245,
+      "learning_rate": 0.00019265972191464213,
+      "loss": 0.1999,
+      "step": 1587
+    },
+    {
+      "epoch": 0.3335083482095978,
+      "grad_norm": 0.8045313954353333,
+      "learning_rate": 0.00019264665482674536,
+      "loss": 0.2769,
+      "step": 1588
+    },
+    {
+      "epoch": 0.3337183660611152,
+      "grad_norm": 0.6476468443870544,
+      "learning_rate": 0.0001926335765621808,
+      "loss": 0.2729,
+      "step": 1589
+    },
+    {
+      "epoch": 0.3339283839126326,
+      "grad_norm": 0.5111613869667053,
+      "learning_rate": 0.00019262048712252624,
+      "loss": 0.1721,
+      "step": 1590
+    },
+    {
+      "epoch": 0.33413840176414994,
+      "grad_norm": 0.6063307523727417,
+      "learning_rate": 0.00019260738650936073,
+      "loss": 0.1926,
+      "step": 1591
+    },
+    {
+      "epoch": 0.33434841961566736,
+      "grad_norm": 0.5078137516975403,
+      "learning_rate": 0.00019259427472426467,
+      "loss": 0.2166,
+      "step": 1592
+    },
+    {
+      "epoch": 0.3345584374671847,
+      "grad_norm": 0.5363006591796875,
+      "learning_rate": 0.00019258115176881986,
+      "loss": 0.233,
+      "step": 1593
+    },
+    {
+      "epoch": 0.3347684553187021,
+      "grad_norm": 0.4900319576263428,
+      "learning_rate": 0.00019256801764460936,
+      "loss": 0.1666,
+      "step": 1594
+    },
+    {
+      "epoch": 0.33497847317021945,
+      "grad_norm": 0.5940386056900024,
+      "learning_rate": 0.00019255487235321774,
+      "loss": 0.1563,
+      "step": 1595
+    },
+    {
+      "epoch": 0.33518849102173687,
+      "grad_norm": 0.6830379366874695,
+      "learning_rate": 0.00019254171589623076,
+      "loss": 0.2101,
+      "step": 1596
+    },
+    {
+      "epoch": 0.33539850887325423,
+      "grad_norm": 0.7788860201835632,
+      "learning_rate": 0.00019252854827523557,
+      "loss": 0.2819,
+      "step": 1597
+    },
+    {
+      "epoch": 0.3356085267247716,
+      "grad_norm": 0.4878160357475281,
+      "learning_rate": 0.0001925153694918207,
+      "loss": 0.173,
+      "step": 1598
+    },
+    {
+      "epoch": 0.33581854457628896,
+      "grad_norm": 0.643084704875946,
+      "learning_rate": 0.00019250217954757602,
+      "loss": 0.1881,
+      "step": 1599
+    },
+    {
+      "epoch": 0.3360285624278064,
+      "grad_norm": 0.5981395244598389,
+      "learning_rate": 0.0001924889784440927,
+      "loss": 0.2883,
+      "step": 1600
+    },
+    {
+      "epoch": 0.33623858027932374,
+      "grad_norm": 0.4534986615180969,
+      "learning_rate": 0.0001924757661829633,
+      "loss": 0.3139,
+      "step": 1601
+    },
+    {
+      "epoch": 0.3364485981308411,
+      "grad_norm": 0.5547178387641907,
+      "learning_rate": 0.00019246254276578174,
+      "loss": 0.3448,
+      "step": 1602
+    },
+    {
+      "epoch": 0.3366586159823585,
+      "grad_norm": 0.5542036890983582,
+      "learning_rate": 0.00019244930819414325,
+      "loss": 0.3081,
+      "step": 1603
+    },
+    {
+      "epoch": 0.3368686338338759,
+      "grad_norm": 0.5334186553955078,
+      "learning_rate": 0.00019243606246964438,
+      "loss": 0.2045,
+      "step": 1604
+    },
+    {
+      "epoch": 0.33707865168539325,
+      "grad_norm": 0.4146350026130676,
+      "learning_rate": 0.00019242280559388311,
+      "loss": 0.2225,
+      "step": 1605
+    },
+    {
+      "epoch": 0.3372886695369106,
+      "grad_norm": 0.726662814617157,
+      "learning_rate": 0.0001924095375684587,
+      "loss": 0.2042,
+      "step": 1606
+    },
+    {
+      "epoch": 0.33749868738842803,
+      "grad_norm": 0.6827337741851807,
+      "learning_rate": 0.00019239625839497174,
+      "loss": 0.2285,
+      "step": 1607
+    },
+    {
+      "epoch": 0.3377087052399454,
+      "grad_norm": 0.4500672519207001,
+      "learning_rate": 0.00019238296807502428,
+      "loss": 0.1484,
+      "step": 1608
+    },
+    {
+      "epoch": 0.33791872309146276,
+      "grad_norm": 0.6254449486732483,
+      "learning_rate": 0.00019236966661021954,
+      "loss": 0.1773,
+      "step": 1609
+    },
+    {
+      "epoch": 0.3381287409429802,
+      "grad_norm": 0.5149936676025391,
+      "learning_rate": 0.00019235635400216222,
+      "loss": 0.1525,
+      "step": 1610
+    },
+    {
+      "epoch": 0.33833875879449754,
+      "grad_norm": 0.4187399744987488,
+      "learning_rate": 0.00019234303025245835,
+      "loss": 0.1808,
+      "step": 1611
+    },
+    {
+      "epoch": 0.3385487766460149,
+      "grad_norm": 0.6186479330062866,
+      "learning_rate": 0.00019232969536271522,
+      "loss": 0.3883,
+      "step": 1612
+    },
+    {
+      "epoch": 0.33875879449753227,
+      "grad_norm": 0.5121021270751953,
+      "learning_rate": 0.00019231634933454154,
+      "loss": 0.1468,
+      "step": 1613
+    },
+    {
+      "epoch": 0.3389688123490497,
+      "grad_norm": 0.5343567132949829,
+      "learning_rate": 0.00019230299216954736,
+      "loss": 0.1794,
+      "step": 1614
+    },
+    {
+      "epoch": 0.33917883020056705,
+      "grad_norm": 0.933382511138916,
+      "learning_rate": 0.000192289623869344,
+      "loss": 0.3237,
+      "step": 1615
+    },
+    {
+      "epoch": 0.3393888480520844,
+      "grad_norm": 1.0121095180511475,
+      "learning_rate": 0.00019227624443554425,
+      "loss": 0.2953,
+      "step": 1616
+    },
+    {
+      "epoch": 0.33959886590360183,
+      "grad_norm": 0.6155586242675781,
+      "learning_rate": 0.00019226285386976212,
+      "loss": 0.1835,
+      "step": 1617
+    },
+    {
+      "epoch": 0.3398088837551192,
+      "grad_norm": 0.5891084671020508,
+      "learning_rate": 0.00019224945217361306,
+      "loss": 0.2457,
+      "step": 1618
+    },
+    {
+      "epoch": 0.34001890160663656,
+      "grad_norm": 1.027543067932129,
+      "learning_rate": 0.0001922360393487138,
+      "loss": 0.2847,
+      "step": 1619
+    },
+    {
+      "epoch": 0.3402289194581539,
+      "grad_norm": 0.5866810083389282,
+      "learning_rate": 0.0001922226153966824,
+      "loss": 0.2284,
+      "step": 1620
+    },
+    {
+      "epoch": 0.34043893730967134,
+      "grad_norm": 0.7704328298568726,
+      "learning_rate": 0.00019220918031913833,
+      "loss": 0.2465,
+      "step": 1621
+    },
+    {
+      "epoch": 0.3406489551611887,
+      "grad_norm": 0.7415633797645569,
+      "learning_rate": 0.00019219573411770235,
+      "loss": 0.1966,
+      "step": 1622
+    },
+    {
+      "epoch": 0.34085897301270607,
+      "grad_norm": 0.8156745433807373,
+      "learning_rate": 0.00019218227679399657,
+      "loss": 0.2097,
+      "step": 1623
+    },
+    {
+      "epoch": 0.34106899086422343,
+      "grad_norm": 0.7245871424674988,
+      "learning_rate": 0.00019216880834964448,
+      "loss": 0.2093,
+      "step": 1624
+    },
+    {
+      "epoch": 0.34127900871574085,
+      "grad_norm": 0.6942924857139587,
+      "learning_rate": 0.00019215532878627084,
+      "loss": 0.2203,
+      "step": 1625
+    },
+    {
+      "epoch": 0.3414890265672582,
+      "grad_norm": 0.7590915560722351,
+      "learning_rate": 0.00019214183810550183,
+      "loss": 0.209,
+      "step": 1626
+    },
+    {
+      "epoch": 0.3416990444187756,
+      "grad_norm": 0.6332980990409851,
+      "learning_rate": 0.0001921283363089649,
+      "loss": 0.2052,
+      "step": 1627
+    },
+    {
+      "epoch": 0.341909062270293,
+      "grad_norm": 0.7253382802009583,
+      "learning_rate": 0.00019211482339828893,
+      "loss": 0.2334,
+      "step": 1628
+    },
+    {
+      "epoch": 0.34211908012181036,
+      "grad_norm": 0.6251415014266968,
+      "learning_rate": 0.000192101299375104,
+      "loss": 0.1544,
+      "step": 1629
+    },
+    {
+      "epoch": 0.3423290979733277,
+      "grad_norm": 0.5384700894355774,
+      "learning_rate": 0.00019208776424104165,
+      "loss": 0.1295,
+      "step": 1630
+    },
+    {
+      "epoch": 0.3425391158248451,
+      "grad_norm": 0.788681149482727,
+      "learning_rate": 0.00019207421799773475,
+      "loss": 0.2974,
+      "step": 1631
+    },
+    {
+      "epoch": 0.3427491336763625,
+      "grad_norm": 0.48648181557655334,
+      "learning_rate": 0.0001920606606468175,
+      "loss": 0.1941,
+      "step": 1632
+    },
+    {
+      "epoch": 0.34295915152787987,
+      "grad_norm": 1.0241416692733765,
+      "learning_rate": 0.00019204709218992536,
+      "loss": 0.195,
+      "step": 1633
+    },
+    {
+      "epoch": 0.34316916937939723,
+      "grad_norm": 0.45205456018447876,
+      "learning_rate": 0.00019203351262869525,
+      "loss": 0.1465,
+      "step": 1634
+    },
+    {
+      "epoch": 0.34337918723091465,
+      "grad_norm": 0.5081611275672913,
+      "learning_rate": 0.00019201992196476533,
+      "loss": 0.1738,
+      "step": 1635
+    },
+    {
+      "epoch": 0.343589205082432,
+      "grad_norm": 0.8194214701652527,
+      "learning_rate": 0.00019200632019977521,
+      "loss": 0.2898,
+      "step": 1636
+    },
+    {
+      "epoch": 0.3437992229339494,
+      "grad_norm": 0.28592121601104736,
+      "learning_rate": 0.00019199270733536572,
+      "loss": 0.1141,
+      "step": 1637
+    },
+    {
+      "epoch": 0.34400924078546674,
+      "grad_norm": 0.7359632849693298,
+      "learning_rate": 0.0001919790833731791,
+      "loss": 0.3353,
+      "step": 1638
+    },
+    {
+      "epoch": 0.34421925863698416,
+      "grad_norm": 1.307955026626587,
+      "learning_rate": 0.00019196544831485892,
+      "loss": 0.2808,
+      "step": 1639
+    },
+    {
+      "epoch": 0.3444292764885015,
+      "grad_norm": 0.7657724618911743,
+      "learning_rate": 0.00019195180216205007,
+      "loss": 0.2771,
+      "step": 1640
+    },
+    {
+      "epoch": 0.3446392943400189,
+      "grad_norm": 0.4949481785297394,
+      "learning_rate": 0.0001919381449163988,
+      "loss": 0.1887,
+      "step": 1641
+    },
+    {
+      "epoch": 0.3448493121915363,
+      "grad_norm": 0.5673692226409912,
+      "learning_rate": 0.00019192447657955262,
+      "loss": 0.1795,
+      "step": 1642
+    },
+    {
+      "epoch": 0.34505933004305367,
+      "grad_norm": 0.6279990673065186,
+      "learning_rate": 0.00019191079715316056,
+      "loss": 0.1995,
+      "step": 1643
+    },
+    {
+      "epoch": 0.34526934789457103,
+      "grad_norm": 0.7078985571861267,
+      "learning_rate": 0.0001918971066388728,
+      "loss": 0.2012,
+      "step": 1644
+    },
+    {
+      "epoch": 0.3454793657460884,
+      "grad_norm": 0.47900810837745667,
+      "learning_rate": 0.00019188340503834095,
+      "loss": 0.2149,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3456893835976058,
+      "grad_norm": 0.9682589769363403,
+      "learning_rate": 0.0001918696923532179,
+      "loss": 0.2038,
+      "step": 1646
+    },
+    {
+      "epoch": 0.3458994014491232,
+      "grad_norm": 0.6839099526405334,
+      "learning_rate": 0.000191855968585158,
+      "loss": 0.192,
+      "step": 1647
+    },
+    {
+      "epoch": 0.34610941930064054,
+      "grad_norm": 0.44559231400489807,
+      "learning_rate": 0.0001918422337358168,
+      "loss": 0.1752,
+      "step": 1648
+    },
+    {
+      "epoch": 0.34631943715215796,
+      "grad_norm": 0.48147323727607727,
+      "learning_rate": 0.00019182848780685115,
+      "loss": 0.1978,
+      "step": 1649
+    },
+    {
+      "epoch": 0.3465294550036753,
+      "grad_norm": 0.4760332405567169,
+      "learning_rate": 0.0001918147307999195,
+      "loss": 0.197,
+      "step": 1650
+    },
+    {
+      "epoch": 0.3467394728551927,
+      "grad_norm": 0.46297112107276917,
+      "learning_rate": 0.00019180096271668138,
+      "loss": 0.1567,
+      "step": 1651
+    },
+    {
+      "epoch": 0.34694949070671005,
+      "grad_norm": 0.5012452602386475,
+      "learning_rate": 0.0001917871835587977,
+      "loss": 0.218,
+      "step": 1652
+    },
+    {
+      "epoch": 0.34715950855822747,
+      "grad_norm": 0.5509222149848938,
+      "learning_rate": 0.00019177339332793078,
+      "loss": 0.2306,
+      "step": 1653
+    },
+    {
+      "epoch": 0.34736952640974483,
+      "grad_norm": 0.6989178657531738,
+      "learning_rate": 0.00019175959202574427,
+      "loss": 0.3619,
+      "step": 1654
+    },
+    {
+      "epoch": 0.3475795442612622,
+      "grad_norm": 0.6114615201950073,
+      "learning_rate": 0.00019174577965390304,
+      "loss": 0.2084,
+      "step": 1655
+    },
+    {
+      "epoch": 0.34778956211277956,
+      "grad_norm": 0.600771963596344,
+      "learning_rate": 0.0001917319562140735,
+      "loss": 0.1736,
+      "step": 1656
+    },
+    {
+      "epoch": 0.347999579964297,
+      "grad_norm": 0.8332004547119141,
+      "learning_rate": 0.00019171812170792318,
+      "loss": 0.2071,
+      "step": 1657
+    },
+    {
+      "epoch": 0.34820959781581434,
+      "grad_norm": 0.4657239317893982,
+      "learning_rate": 0.0001917042761371211,
+      "loss": 0.1708,
+      "step": 1658
+    },
+    {
+      "epoch": 0.3484196156673317,
+      "grad_norm": 0.6832460761070251,
+      "learning_rate": 0.0001916904195033375,
+      "loss": 0.2495,
+      "step": 1659
+    },
+    {
+      "epoch": 0.3486296335188491,
+      "grad_norm": 0.7702248692512512,
+      "learning_rate": 0.00019167655180824404,
+      "loss": 0.1839,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3488396513703665,
+      "grad_norm": 0.6671183705329895,
+      "learning_rate": 0.0001916626730535137,
+      "loss": 0.3184,
+      "step": 1661
+    },
+    {
+      "epoch": 0.34904966922188385,
+      "grad_norm": 0.6315720081329346,
+      "learning_rate": 0.00019164878324082074,
+      "loss": 0.204,
+      "step": 1662
+    },
+    {
+      "epoch": 0.3492596870734012,
+      "grad_norm": 0.7643944621086121,
+      "learning_rate": 0.00019163488237184084,
+      "loss": 0.1952,
+      "step": 1663
+    },
+    {
+      "epoch": 0.34946970492491863,
+      "grad_norm": 0.6874391436576843,
+      "learning_rate": 0.00019162097044825096,
+      "loss": 0.157,
+      "step": 1664
+    },
+    {
+      "epoch": 0.349679722776436,
+      "grad_norm": 0.7709307074546814,
+      "learning_rate": 0.00019160704747172934,
+      "loss": 0.3383,
+      "step": 1665
+    },
+    {
+      "epoch": 0.34988974062795336,
+      "grad_norm": 0.4958166182041168,
+      "learning_rate": 0.0001915931134439557,
+      "loss": 0.1265,
+      "step": 1666
+    },
+    {
+      "epoch": 0.3500997584794708,
+      "grad_norm": 0.7482470273971558,
+      "learning_rate": 0.00019157916836661095,
+      "loss": 0.2388,
+      "step": 1667
+    },
+    {
+      "epoch": 0.35030977633098814,
+      "grad_norm": 0.7006984353065491,
+      "learning_rate": 0.00019156521224137743,
+      "loss": 0.1407,
+      "step": 1668
+    },
+    {
+      "epoch": 0.3505197941825055,
+      "grad_norm": 0.9646068811416626,
+      "learning_rate": 0.00019155124506993874,
+      "loss": 0.2785,
+      "step": 1669
+    },
+    {
+      "epoch": 0.35072981203402287,
+      "grad_norm": 0.5841155052185059,
+      "learning_rate": 0.00019153726685397984,
+      "loss": 0.2303,
+      "step": 1670
+    },
+    {
+      "epoch": 0.3509398298855403,
+      "grad_norm": 0.8856375813484192,
+      "learning_rate": 0.00019152327759518705,
+      "loss": 0.2264,
+      "step": 1671
+    },
+    {
+      "epoch": 0.35114984773705765,
+      "grad_norm": 0.7846825122833252,
+      "learning_rate": 0.000191509277295248,
+      "loss": 0.2239,
+      "step": 1672
+    },
+    {
+      "epoch": 0.351359865588575,
+      "grad_norm": 1.0474859476089478,
+      "learning_rate": 0.00019149526595585163,
+      "loss": 0.3247,
+      "step": 1673
+    },
+    {
+      "epoch": 0.35156988344009243,
+      "grad_norm": 0.8523626923561096,
+      "learning_rate": 0.0001914812435786883,
+      "loss": 0.2623,
+      "step": 1674
+    },
+    {
+      "epoch": 0.3517799012916098,
+      "grad_norm": 0.8021546006202698,
+      "learning_rate": 0.00019146721016544954,
+      "loss": 0.1953,
+      "step": 1675
+    },
+    {
+      "epoch": 0.35198991914312716,
+      "grad_norm": 0.9674374461174011,
+      "learning_rate": 0.00019145316571782836,
+      "loss": 0.2123,
+      "step": 1676
+    },
+    {
+      "epoch": 0.3521999369946445,
+      "grad_norm": 0.7059686183929443,
+      "learning_rate": 0.00019143911023751907,
+      "loss": 0.2011,
+      "step": 1677
+    },
+    {
+      "epoch": 0.35240995484616194,
+      "grad_norm": 0.5109952092170715,
+      "learning_rate": 0.00019142504372621723,
+      "loss": 0.1854,
+      "step": 1678
+    },
+    {
+      "epoch": 0.3526199726976793,
+      "grad_norm": 0.6696085333824158,
+      "learning_rate": 0.00019141096618561983,
+      "loss": 0.3242,
+      "step": 1679
+    },
+    {
+      "epoch": 0.35282999054919667,
+      "grad_norm": 0.6039811968803406,
+      "learning_rate": 0.00019139687761742514,
+      "loss": 0.1479,
+      "step": 1680
+    },
+    {
+      "epoch": 0.35304000840071403,
+      "grad_norm": 0.7796909213066101,
+      "learning_rate": 0.00019138277802333278,
+      "loss": 0.2724,
+      "step": 1681
+    },
+    {
+      "epoch": 0.35325002625223145,
+      "grad_norm": 0.740599513053894,
+      "learning_rate": 0.00019136866740504367,
+      "loss": 0.2449,
+      "step": 1682
+    },
+    {
+      "epoch": 0.3534600441037488,
+      "grad_norm": 0.5968409180641174,
+      "learning_rate": 0.0001913545457642601,
+      "loss": 0.145,
+      "step": 1683
+    },
+    {
+      "epoch": 0.3536700619552662,
+      "grad_norm": 0.5513572096824646,
+      "learning_rate": 0.00019134041310268568,
+      "loss": 0.166,
+      "step": 1684
+    },
+    {
+      "epoch": 0.3538800798067836,
+      "grad_norm": 0.6873581409454346,
+      "learning_rate": 0.0001913262694220253,
+      "loss": 0.3052,
+      "step": 1685
+    },
+    {
+      "epoch": 0.35409009765830096,
+      "grad_norm": 0.7691472768783569,
+      "learning_rate": 0.00019131211472398524,
+      "loss": 0.1708,
+      "step": 1686
+    },
+    {
+      "epoch": 0.3543001155098183,
+      "grad_norm": 0.4396502673625946,
+      "learning_rate": 0.00019129794901027308,
+      "loss": 0.1501,
+      "step": 1687
+    },
+    {
+      "epoch": 0.3545101333613357,
+      "grad_norm": 0.5293841361999512,
+      "learning_rate": 0.0001912837722825978,
+      "loss": 0.2255,
+      "step": 1688
+    },
+    {
+      "epoch": 0.3547201512128531,
+      "grad_norm": 0.43543753027915955,
+      "learning_rate": 0.00019126958454266957,
+      "loss": 0.1803,
+      "step": 1689
+    },
+    {
+      "epoch": 0.35493016906437047,
+      "grad_norm": 0.6785498261451721,
+      "learning_rate": 0.00019125538579219998,
+      "loss": 0.2687,
+      "step": 1690
+    },
+    {
+      "epoch": 0.35514018691588783,
+      "grad_norm": 0.42633846402168274,
+      "learning_rate": 0.00019124117603290194,
+      "loss": 0.1214,
+      "step": 1691
+    },
+    {
+      "epoch": 0.35535020476740525,
+      "grad_norm": 0.5481752157211304,
+      "learning_rate": 0.00019122695526648968,
+      "loss": 0.2199,
+      "step": 1692
+    },
+    {
+      "epoch": 0.3555602226189226,
+      "grad_norm": 0.8807294964790344,
+      "learning_rate": 0.00019121272349467878,
+      "loss": 0.1997,
+      "step": 1693
+    },
+    {
+      "epoch": 0.35577024047044,
+      "grad_norm": 0.8869441151618958,
+      "learning_rate": 0.0001911984807191861,
+      "loss": 0.1732,
+      "step": 1694
+    },
+    {
+      "epoch": 0.35598025832195734,
+      "grad_norm": 0.5254248976707458,
+      "learning_rate": 0.00019118422694172987,
+      "loss": 0.1422,
+      "step": 1695
+    },
+    {
+      "epoch": 0.35619027617347476,
+      "grad_norm": 0.5750237107276917,
+      "learning_rate": 0.00019116996216402961,
+      "loss": 0.1809,
+      "step": 1696
+    },
+    {
+      "epoch": 0.3564002940249921,
+      "grad_norm": 0.5350000262260437,
+      "learning_rate": 0.00019115568638780622,
+      "loss": 0.2012,
+      "step": 1697
+    },
+    {
+      "epoch": 0.3566103118765095,
+      "grad_norm": 0.7510976195335388,
+      "learning_rate": 0.00019114139961478186,
+      "loss": 0.3177,
+      "step": 1698
+    },
+    {
+      "epoch": 0.3568203297280269,
+      "grad_norm": 0.8712655901908875,
+      "learning_rate": 0.00019112710184668007,
+      "loss": 0.3399,
+      "step": 1699
+    },
+    {
+      "epoch": 0.35703034757954427,
+      "grad_norm": 0.4693390130996704,
+      "learning_rate": 0.0001911127930852257,
+      "loss": 0.1943,
+      "step": 1700
+    },
+    {
+      "epoch": 0.35724036543106163,
+      "grad_norm": 0.4523710012435913,
+      "learning_rate": 0.0001910984733321449,
+      "loss": 0.2226,
+      "step": 1701
+    },
+    {
+      "epoch": 0.357450383282579,
+      "grad_norm": 0.5822139978408813,
+      "learning_rate": 0.00019108414258916522,
+      "loss": 0.2993,
+      "step": 1702
+    },
+    {
+      "epoch": 0.3576604011340964,
+      "grad_norm": 0.5016921758651733,
+      "learning_rate": 0.00019106980085801544,
+      "loss": 0.1991,
+      "step": 1703
+    },
+    {
+      "epoch": 0.3578704189856138,
+      "grad_norm": 0.45039233565330505,
+      "learning_rate": 0.00019105544814042576,
+      "loss": 0.1539,
+      "step": 1704
+    },
+    {
+      "epoch": 0.35808043683713114,
+      "grad_norm": 0.5696535110473633,
+      "learning_rate": 0.00019104108443812758,
+      "loss": 0.2083,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3582904546886485,
+      "grad_norm": 0.46891167759895325,
+      "learning_rate": 0.0001910267097528538,
+      "loss": 0.2401,
+      "step": 1706
+    },
+    {
+      "epoch": 0.3585004725401659,
+      "grad_norm": 0.5347768068313599,
+      "learning_rate": 0.00019101232408633845,
+      "loss": 0.2143,
+      "step": 1707
+    },
+    {
+      "epoch": 0.3587104903916833,
+      "grad_norm": 0.49566882848739624,
+      "learning_rate": 0.00019099792744031705,
+      "loss": 0.2299,
+      "step": 1708
+    },
+    {
+      "epoch": 0.35892050824320065,
+      "grad_norm": 0.39414703845977783,
+      "learning_rate": 0.00019098351981652634,
+      "loss": 0.2156,
+      "step": 1709
+    },
+    {
+      "epoch": 0.35913052609471807,
+      "grad_norm": 0.5206068754196167,
+      "learning_rate": 0.00019096910121670443,
+      "loss": 0.1837,
+      "step": 1710
+    },
+    {
+      "epoch": 0.35934054394623544,
+      "grad_norm": 0.527384340763092,
+      "learning_rate": 0.0001909546716425908,
+      "loss": 0.2735,
+      "step": 1711
+    },
+    {
+      "epoch": 0.3595505617977528,
+      "grad_norm": 0.5682224631309509,
+      "learning_rate": 0.00019094023109592607,
+      "loss": 0.1809,
+      "step": 1712
+    },
+    {
+      "epoch": 0.35976057964927016,
+      "grad_norm": 0.7277165055274963,
+      "learning_rate": 0.00019092577957845243,
+      "loss": 0.2018,
+      "step": 1713
+    },
+    {
+      "epoch": 0.3599705975007876,
+      "grad_norm": 0.5524745583534241,
+      "learning_rate": 0.00019091131709191324,
+      "loss": 0.222,
+      "step": 1714
+    },
+    {
+      "epoch": 0.36018061535230494,
+      "grad_norm": 0.7268317341804504,
+      "learning_rate": 0.0001908968436380532,
+      "loss": 0.2371,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3603906332038223,
+      "grad_norm": 0.8419438600540161,
+      "learning_rate": 0.00019088235921861839,
+      "loss": 0.3045,
+      "step": 1716
+    },
+    {
+      "epoch": 0.3606006510553397,
+      "grad_norm": 0.6292913556098938,
+      "learning_rate": 0.00019086786383535614,
+      "loss": 0.1484,
+      "step": 1717
+    },
+    {
+      "epoch": 0.3608106689068571,
+      "grad_norm": 0.6693284511566162,
+      "learning_rate": 0.00019085335749001515,
+      "loss": 0.2263,
+      "step": 1718
+    },
+    {
+      "epoch": 0.36102068675837445,
+      "grad_norm": 1.0132428407669067,
+      "learning_rate": 0.00019083884018434547,
+      "loss": 0.2318,
+      "step": 1719
+    },
+    {
+      "epoch": 0.3612307046098918,
+      "grad_norm": 0.4503970146179199,
+      "learning_rate": 0.00019082431192009834,
+      "loss": 0.2717,
+      "step": 1720
+    },
+    {
+      "epoch": 0.36144072246140924,
+      "grad_norm": 0.5765817165374756,
+      "learning_rate": 0.0001908097726990265,
+      "loss": 0.1652,
+      "step": 1721
+    },
+    {
+      "epoch": 0.3616507403129266,
+      "grad_norm": 0.6819046139717102,
+      "learning_rate": 0.00019079522252288386,
+      "loss": 0.2853,
+      "step": 1722
+    },
+    {
+      "epoch": 0.36186075816444396,
+      "grad_norm": 0.7787238359451294,
+      "learning_rate": 0.0001907806613934258,
+      "loss": 0.2279,
+      "step": 1723
+    },
+    {
+      "epoch": 0.3620707760159614,
+      "grad_norm": 0.6114417314529419,
+      "learning_rate": 0.00019076608931240885,
+      "loss": 0.1959,
+      "step": 1724
+    },
+    {
+      "epoch": 0.36228079386747875,
+      "grad_norm": 0.7302160263061523,
+      "learning_rate": 0.00019075150628159105,
+      "loss": 0.2769,
+      "step": 1725
+    },
+    {
+      "epoch": 0.3624908117189961,
+      "grad_norm": 0.5485323667526245,
+      "learning_rate": 0.00019073691230273154,
+      "loss": 0.1907,
+      "step": 1726
+    },
+    {
+      "epoch": 0.36270082957051347,
+      "grad_norm": 0.4471105635166168,
+      "learning_rate": 0.000190722307377591,
+      "loss": 0.1456,
+      "step": 1727
+    },
+    {
+      "epoch": 0.3629108474220309,
+      "grad_norm": 0.5556613206863403,
+      "learning_rate": 0.0001907076915079313,
+      "loss": 0.1888,
+      "step": 1728
+    },
+    {
+      "epoch": 0.36312086527354825,
+      "grad_norm": 0.4751920998096466,
+      "learning_rate": 0.00019069306469551565,
+      "loss": 0.1423,
+      "step": 1729
+    },
+    {
+      "epoch": 0.3633308831250656,
+      "grad_norm": 0.6774847507476807,
+      "learning_rate": 0.0001906784269421086,
+      "loss": 0.2371,
+      "step": 1730
+    },
+    {
+      "epoch": 0.36354090097658304,
+      "grad_norm": 0.5875648260116577,
+      "learning_rate": 0.00019066377824947605,
+      "loss": 0.1678,
+      "step": 1731
+    },
+    {
+      "epoch": 0.3637509188281004,
+      "grad_norm": 0.864040732383728,
+      "learning_rate": 0.00019064911861938513,
+      "loss": 0.247,
+      "step": 1732
+    },
+    {
+      "epoch": 0.36396093667961776,
+      "grad_norm": 0.528397798538208,
+      "learning_rate": 0.00019063444805360438,
+      "loss": 0.2097,
+      "step": 1733
+    },
+    {
+      "epoch": 0.3641709545311351,
+      "grad_norm": 0.5226781368255615,
+      "learning_rate": 0.00019061976655390358,
+      "loss": 0.187,
+      "step": 1734
+    },
+    {
+      "epoch": 0.36438097238265255,
+      "grad_norm": 0.6590418219566345,
+      "learning_rate": 0.0001906050741220539,
+      "loss": 0.2919,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3645909902341699,
+      "grad_norm": 0.6287481784820557,
+      "learning_rate": 0.00019059037075982782,
+      "loss": 0.1835,
+      "step": 1736
+    },
+    {
+      "epoch": 0.3648010080856873,
+      "grad_norm": 0.44010448455810547,
+      "learning_rate": 0.00019057565646899907,
+      "loss": 0.1346,
+      "step": 1737
+    },
+    {
+      "epoch": 0.36501102593720464,
+      "grad_norm": 0.44869789481163025,
+      "learning_rate": 0.00019056093125134277,
+      "loss": 0.117,
+      "step": 1738
+    },
+    {
+      "epoch": 0.36522104378872206,
+      "grad_norm": 0.5611539483070374,
+      "learning_rate": 0.00019054619510863534,
+      "loss": 0.1521,
+      "step": 1739
+    },
+    {
+      "epoch": 0.3654310616402394,
+      "grad_norm": 0.7520772218704224,
+      "learning_rate": 0.00019053144804265451,
+      "loss": 0.2429,
+      "step": 1740
+    },
+    {
+      "epoch": 0.3656410794917568,
+      "grad_norm": 0.6511863470077515,
+      "learning_rate": 0.00019051669005517932,
+      "loss": 0.1681,
+      "step": 1741
+    },
+    {
+      "epoch": 0.3658510973432742,
+      "grad_norm": 0.5798068046569824,
+      "learning_rate": 0.00019050192114799014,
+      "loss": 0.1843,
+      "step": 1742
+    },
+    {
+      "epoch": 0.36606111519479156,
+      "grad_norm": 1.0162572860717773,
+      "learning_rate": 0.0001904871413228687,
+      "loss": 0.1762,
+      "step": 1743
+    },
+    {
+      "epoch": 0.3662711330463089,
+      "grad_norm": 0.6126002073287964,
+      "learning_rate": 0.00019047235058159792,
+      "loss": 0.2216,
+      "step": 1744
+    },
+    {
+      "epoch": 0.3664811508978263,
+      "grad_norm": 0.593628466129303,
+      "learning_rate": 0.00019045754892596216,
+      "loss": 0.1877,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3666911687493437,
+      "grad_norm": 0.8319538831710815,
+      "learning_rate": 0.00019044273635774705,
+      "loss": 0.2641,
+      "step": 1746
+    },
+    {
+      "epoch": 0.3669011866008611,
+      "grad_norm": 0.42954209446907043,
+      "learning_rate": 0.00019042791287873957,
+      "loss": 0.139,
+      "step": 1747
+    },
+    {
+      "epoch": 0.36711120445237844,
+      "grad_norm": 0.5830192565917969,
+      "learning_rate": 0.00019041307849072798,
+      "loss": 0.1926,
+      "step": 1748
+    },
+    {
+      "epoch": 0.36732122230389586,
+      "grad_norm": 0.7822489142417908,
+      "learning_rate": 0.00019039823319550182,
+      "loss": 0.2662,
+      "step": 1749
+    },
+    {
+      "epoch": 0.3675312401554132,
+      "grad_norm": 0.6861689686775208,
+      "learning_rate": 0.00019038337699485208,
+      "loss": 0.1516,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3677412580069306,
+      "grad_norm": 0.5381961464881897,
+      "learning_rate": 0.00019036850989057088,
+      "loss": 0.2698,
+      "step": 1751
+    },
+    {
+      "epoch": 0.36795127585844795,
+      "grad_norm": 0.5581590533256531,
+      "learning_rate": 0.00019035363188445178,
+      "loss": 0.2409,
+      "step": 1752
+    },
+    {
+      "epoch": 0.36816129370996536,
+      "grad_norm": 0.5967221260070801,
+      "learning_rate": 0.00019033874297828964,
+      "loss": 0.2265,
+      "step": 1753
+    },
+    {
+      "epoch": 0.36837131156148273,
+      "grad_norm": 0.5475670695304871,
+      "learning_rate": 0.00019032384317388062,
+      "loss": 0.2678,
+      "step": 1754
+    },
+    {
+      "epoch": 0.3685813294130001,
+      "grad_norm": 0.8465176224708557,
+      "learning_rate": 0.0001903089324730222,
+      "loss": 0.3007,
+      "step": 1755
+    },
+    {
+      "epoch": 0.3687913472645175,
+      "grad_norm": 0.5701761841773987,
+      "learning_rate": 0.00019029401087751317,
+      "loss": 0.3469,
+      "step": 1756
+    },
+    {
+      "epoch": 0.3690013651160349,
+      "grad_norm": 0.47910815477371216,
+      "learning_rate": 0.00019027907838915363,
+      "loss": 0.1803,
+      "step": 1757
+    },
+    {
+      "epoch": 0.36921138296755224,
+      "grad_norm": 0.5992498397827148,
+      "learning_rate": 0.000190264135009745,
+      "loss": 0.303,
+      "step": 1758
+    },
+    {
+      "epoch": 0.3694214008190696,
+      "grad_norm": 0.662324845790863,
+      "learning_rate": 0.00019024918074109,
+      "loss": 0.3612,
+      "step": 1759
+    },
+    {
+      "epoch": 0.369631418670587,
+      "grad_norm": 0.6369922161102295,
+      "learning_rate": 0.00019023421558499274,
+      "loss": 0.174,
+      "step": 1760
+    },
+    {
+      "epoch": 0.3698414365221044,
+      "grad_norm": 0.5556310415267944,
+      "learning_rate": 0.00019021923954325845,
+      "loss": 0.1904,
+      "step": 1761
+    },
+    {
+      "epoch": 0.37005145437362175,
+      "grad_norm": 0.6958837509155273,
+      "learning_rate": 0.00019020425261769393,
+      "loss": 0.3308,
+      "step": 1762
+    },
+    {
+      "epoch": 0.3702614722251391,
+      "grad_norm": 0.6934865117073059,
+      "learning_rate": 0.00019018925481010713,
+      "loss": 0.2886,
+      "step": 1763
+    },
+    {
+      "epoch": 0.37047149007665653,
+      "grad_norm": 0.6797325015068054,
+      "learning_rate": 0.0001901742461223073,
+      "loss": 0.2314,
+      "step": 1764
+    },
+    {
+      "epoch": 0.3706815079281739,
+      "grad_norm": 0.680874764919281,
+      "learning_rate": 0.00019015922655610511,
+      "loss": 0.252,
+      "step": 1765
+    },
+    {
+      "epoch": 0.37089152577969126,
+      "grad_norm": 0.6662421822547913,
+      "learning_rate": 0.0001901441961133125,
+      "loss": 0.2375,
+      "step": 1766
+    },
+    {
+      "epoch": 0.3711015436312087,
+      "grad_norm": 0.6213268637657166,
+      "learning_rate": 0.00019012915479574264,
+      "loss": 0.2062,
+      "step": 1767
+    },
+    {
+      "epoch": 0.37131156148272604,
+      "grad_norm": 0.523102343082428,
+      "learning_rate": 0.00019011410260521007,
+      "loss": 0.1552,
+      "step": 1768
+    },
+    {
+      "epoch": 0.3715215793342434,
+      "grad_norm": 0.6789844036102295,
+      "learning_rate": 0.00019009903954353075,
+      "loss": 0.2338,
+      "step": 1769
+    },
+    {
+      "epoch": 0.37173159718576076,
+      "grad_norm": 0.5860502123832703,
+      "learning_rate": 0.00019008396561252173,
+      "loss": 0.2464,
+      "step": 1770
+    },
+    {
+      "epoch": 0.3719416150372782,
+      "grad_norm": 0.6294893622398376,
+      "learning_rate": 0.0001900688808140016,
+      "loss": 0.2327,
+      "step": 1771
+    },
+    {
+      "epoch": 0.37215163288879555,
+      "grad_norm": 0.6527291536331177,
+      "learning_rate": 0.00019005378514979008,
+      "loss": 0.1679,
+      "step": 1772
+    },
+    {
+      "epoch": 0.3723616507403129,
+      "grad_norm": 0.6352584958076477,
+      "learning_rate": 0.00019003867862170832,
+      "loss": 0.2133,
+      "step": 1773
+    },
+    {
+      "epoch": 0.37257166859183033,
+      "grad_norm": 0.3638416528701782,
+      "learning_rate": 0.0001900235612315787,
+      "loss": 0.1247,
+      "step": 1774
+    },
+    {
+      "epoch": 0.3727816864433477,
+      "grad_norm": 0.5455829501152039,
+      "learning_rate": 0.00019000843298122496,
+      "loss": 0.1433,
+      "step": 1775
+    },
+    {
+      "epoch": 0.37299170429486506,
+      "grad_norm": 0.7970212697982788,
+      "learning_rate": 0.00018999329387247216,
+      "loss": 0.2801,
+      "step": 1776
+    },
+    {
+      "epoch": 0.3732017221463824,
+      "grad_norm": 0.6363832354545593,
+      "learning_rate": 0.00018997814390714658,
+      "loss": 0.2533,
+      "step": 1777
+    },
+    {
+      "epoch": 0.37341173999789984,
+      "grad_norm": 0.6059479713439941,
+      "learning_rate": 0.00018996298308707595,
+      "loss": 0.1884,
+      "step": 1778
+    },
+    {
+      "epoch": 0.3736217578494172,
+      "grad_norm": 0.6873182654380798,
+      "learning_rate": 0.0001899478114140892,
+      "loss": 0.1553,
+      "step": 1779
+    },
+    {
+      "epoch": 0.37383177570093457,
+      "grad_norm": 0.6038809418678284,
+      "learning_rate": 0.00018993262889001658,
+      "loss": 0.152,
+      "step": 1780
+    },
+    {
+      "epoch": 0.374041793552452,
+      "grad_norm": 0.685796320438385,
+      "learning_rate": 0.00018991743551668972,
+      "loss": 0.1444,
+      "step": 1781
+    },
+    {
+      "epoch": 0.37425181140396935,
+      "grad_norm": 0.7347593903541565,
+      "learning_rate": 0.00018990223129594148,
+      "loss": 0.2191,
+      "step": 1782
+    },
+    {
+      "epoch": 0.3744618292554867,
+      "grad_norm": 0.6126787066459656,
+      "learning_rate": 0.00018988701622960606,
+      "loss": 0.1944,
+      "step": 1783
+    },
+    {
+      "epoch": 0.3746718471070041,
+      "grad_norm": 0.5741068124771118,
+      "learning_rate": 0.000189871790319519,
+      "loss": 0.1662,
+      "step": 1784
+    },
+    {
+      "epoch": 0.3748818649585215,
+      "grad_norm": 0.7055535912513733,
+      "learning_rate": 0.00018985655356751707,
+      "loss": 0.1707,
+      "step": 1785
+    },
+    {
+      "epoch": 0.37509188281003886,
+      "grad_norm": 0.7289325594902039,
+      "learning_rate": 0.00018984130597543843,
+      "loss": 0.2198,
+      "step": 1786
+    },
+    {
+      "epoch": 0.3753019006615562,
+      "grad_norm": 0.3809838891029358,
+      "learning_rate": 0.0001898260475451225,
+      "loss": 0.124,
+      "step": 1787
+    },
+    {
+      "epoch": 0.3755119185130736,
+      "grad_norm": 0.6106488108634949,
+      "learning_rate": 0.00018981077827841003,
+      "loss": 0.177,
+      "step": 1788
+    },
+    {
+      "epoch": 0.375721936364591,
+      "grad_norm": 0.48331376910209656,
+      "learning_rate": 0.00018979549817714305,
+      "loss": 0.1342,
+      "step": 1789
+    },
+    {
+      "epoch": 0.37593195421610837,
+      "grad_norm": 0.7540026307106018,
+      "learning_rate": 0.00018978020724316492,
+      "loss": 0.2452,
+      "step": 1790
+    },
+    {
+      "epoch": 0.37614197206762573,
+      "grad_norm": 0.6960353851318359,
+      "learning_rate": 0.00018976490547832034,
+      "loss": 0.3285,
+      "step": 1791
+    },
+    {
+      "epoch": 0.37635198991914315,
+      "grad_norm": 0.5434857606887817,
+      "learning_rate": 0.00018974959288445522,
+      "loss": 0.1576,
+      "step": 1792
+    },
+    {
+      "epoch": 0.3765620077706605,
+      "grad_norm": 0.5403981804847717,
+      "learning_rate": 0.00018973426946341683,
+      "loss": 0.128,
+      "step": 1793
+    },
+    {
+      "epoch": 0.3767720256221779,
+      "grad_norm": 0.5323323011398315,
+      "learning_rate": 0.00018971893521705383,
+      "loss": 0.2257,
+      "step": 1794
+    },
+    {
+      "epoch": 0.37698204347369524,
+      "grad_norm": 0.8335232138633728,
+      "learning_rate": 0.000189703590147216,
+      "loss": 0.1963,
+      "step": 1795
+    },
+    {
+      "epoch": 0.37719206132521266,
+      "grad_norm": 0.655535101890564,
+      "learning_rate": 0.0001896882342557546,
+      "loss": 0.1997,
+      "step": 1796
+    },
+    {
+      "epoch": 0.37740207917673,
+      "grad_norm": 0.5367481112480164,
+      "learning_rate": 0.00018967286754452214,
+      "loss": 0.2294,
+      "step": 1797
+    },
+    {
+      "epoch": 0.3776120970282474,
+      "grad_norm": 0.9579022526741028,
+      "learning_rate": 0.00018965749001537238,
+      "loss": 0.2051,
+      "step": 1798
+    },
+    {
+      "epoch": 0.3778221148797648,
+      "grad_norm": 0.6223967671394348,
+      "learning_rate": 0.00018964210167016045,
+      "loss": 0.3153,
+      "step": 1799
+    },
+    {
+      "epoch": 0.37803213273128217,
+      "grad_norm": 0.7840321660041809,
+      "learning_rate": 0.00018962670251074275,
+      "loss": 0.2971,
+      "step": 1800
+    },
+    {
+      "epoch": 0.37824215058279953,
+      "grad_norm": 0.531063437461853,
+      "learning_rate": 0.000189611292538977,
+      "loss": 0.3475,
+      "step": 1801
+    },
+    {
+      "epoch": 0.3784521684343169,
+      "grad_norm": 0.5457683801651001,
+      "learning_rate": 0.00018959587175672223,
+      "loss": 0.2136,
+      "step": 1802
+    },
+    {
+      "epoch": 0.3786621862858343,
+      "grad_norm": 0.513944685459137,
+      "learning_rate": 0.00018958044016583877,
+      "loss": 0.1691,
+      "step": 1803
+    },
+    {
+      "epoch": 0.3788722041373517,
+      "grad_norm": 0.5060524344444275,
+      "learning_rate": 0.00018956499776818822,
+      "loss": 0.1869,
+      "step": 1804
+    },
+    {
+      "epoch": 0.37908222198886904,
+      "grad_norm": 0.6220847964286804,
+      "learning_rate": 0.00018954954456563356,
+      "loss": 0.2979,
+      "step": 1805
+    },
+    {
+      "epoch": 0.37929223984038646,
+      "grad_norm": 0.515264630317688,
+      "learning_rate": 0.000189534080560039,
+      "loss": 0.1507,
+      "step": 1806
+    },
+    {
+      "epoch": 0.3795022576919038,
+      "grad_norm": 0.4674825668334961,
+      "learning_rate": 0.00018951860575327006,
+      "loss": 0.1454,
+      "step": 1807
+    },
+    {
+      "epoch": 0.3797122755434212,
+      "grad_norm": 0.5598218441009521,
+      "learning_rate": 0.0001895031201471936,
+      "loss": 0.2971,
+      "step": 1808
+    },
+    {
+      "epoch": 0.37992229339493855,
+      "grad_norm": 0.42762941122055054,
+      "learning_rate": 0.00018948762374367778,
+      "loss": 0.163,
+      "step": 1809
+    },
+    {
+      "epoch": 0.38013231124645597,
+      "grad_norm": 0.5744110345840454,
+      "learning_rate": 0.00018947211654459208,
+      "loss": 0.2604,
+      "step": 1810
+    },
+    {
+      "epoch": 0.38034232909797333,
+      "grad_norm": 0.5260441303253174,
+      "learning_rate": 0.00018945659855180714,
+      "loss": 0.1798,
+      "step": 1811
+    },
+    {
+      "epoch": 0.3805523469494907,
+      "grad_norm": 0.5875288844108582,
+      "learning_rate": 0.00018944106976719513,
+      "loss": 0.231,
+      "step": 1812
+    },
+    {
+      "epoch": 0.3807623648010081,
+      "grad_norm": 0.8182070851325989,
+      "learning_rate": 0.00018942553019262937,
+      "loss": 0.2722,
+      "step": 1813
+    },
+    {
+      "epoch": 0.3809723826525255,
+      "grad_norm": 0.5589549541473389,
+      "learning_rate": 0.00018940997982998446,
+      "loss": 0.2102,
+      "step": 1814
+    },
+    {
+      "epoch": 0.38118240050404284,
+      "grad_norm": 0.7185777425765991,
+      "learning_rate": 0.0001893944186811364,
+      "loss": 0.1846,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3813924183555602,
+      "grad_norm": 0.9046144485473633,
+      "learning_rate": 0.0001893788467479625,
+      "loss": 0.2614,
+      "step": 1816
+    },
+    {
+      "epoch": 0.3816024362070776,
+      "grad_norm": 0.5875299572944641,
+      "learning_rate": 0.00018936326403234125,
+      "loss": 0.3383,
+      "step": 1817
+    },
+    {
+      "epoch": 0.381812454058595,
+      "grad_norm": 0.7503211498260498,
+      "learning_rate": 0.0001893476705361525,
+      "loss": 0.1901,
+      "step": 1818
+    },
+    {
+      "epoch": 0.38202247191011235,
+      "grad_norm": 0.6497657895088196,
+      "learning_rate": 0.00018933206626127748,
+      "loss": 0.2323,
+      "step": 1819
+    },
+    {
+      "epoch": 0.3822324897616297,
+      "grad_norm": 0.6288959383964539,
+      "learning_rate": 0.00018931645120959863,
+      "loss": 0.2134,
+      "step": 1820
+    },
+    {
+      "epoch": 0.38244250761314713,
+      "grad_norm": 0.6646990776062012,
+      "learning_rate": 0.00018930082538299968,
+      "loss": 0.1965,
+      "step": 1821
+    },
+    {
+      "epoch": 0.3826525254646645,
+      "grad_norm": 0.741087794303894,
+      "learning_rate": 0.0001892851887833657,
+      "loss": 0.2197,
+      "step": 1822
+    },
+    {
+      "epoch": 0.38286254331618186,
+      "grad_norm": 0.9484357833862305,
+      "learning_rate": 0.00018926954141258305,
+      "loss": 0.2848,
+      "step": 1823
+    },
+    {
+      "epoch": 0.3830725611676993,
+      "grad_norm": 0.4895227551460266,
+      "learning_rate": 0.00018925388327253943,
+      "loss": 0.1414,
+      "step": 1824
+    },
+    {
+      "epoch": 0.38328257901921664,
+      "grad_norm": 0.41706594824790955,
+      "learning_rate": 0.00018923821436512376,
+      "loss": 0.1251,
+      "step": 1825
+    },
+    {
+      "epoch": 0.383492596870734,
+      "grad_norm": 0.7507045269012451,
+      "learning_rate": 0.00018922253469222633,
+      "loss": 0.2518,
+      "step": 1826
+    },
+    {
+      "epoch": 0.38370261472225137,
+      "grad_norm": 0.521273672580719,
+      "learning_rate": 0.00018920684425573865,
+      "loss": 0.2102,
+      "step": 1827
+    },
+    {
+      "epoch": 0.3839126325737688,
+      "grad_norm": 0.7842849493026733,
+      "learning_rate": 0.00018919114305755364,
+      "loss": 0.208,
+      "step": 1828
+    },
+    {
+      "epoch": 0.38412265042528615,
+      "grad_norm": 0.39324215054512024,
+      "learning_rate": 0.0001891754310995654,
+      "loss": 0.1627,
+      "step": 1829
+    },
+    {
+      "epoch": 0.3843326682768035,
+      "grad_norm": 0.4770127534866333,
+      "learning_rate": 0.0001891597083836694,
+      "loss": 0.1703,
+      "step": 1830
+    },
+    {
+      "epoch": 0.38454268612832093,
+      "grad_norm": 0.6963898539543152,
+      "learning_rate": 0.00018914397491176242,
+      "loss": 0.2574,
+      "step": 1831
+    },
+    {
+      "epoch": 0.3847527039798383,
+      "grad_norm": 0.42212000489234924,
+      "learning_rate": 0.00018912823068574242,
+      "loss": 0.1968,
+      "step": 1832
+    },
+    {
+      "epoch": 0.38496272183135566,
+      "grad_norm": 0.695845365524292,
+      "learning_rate": 0.00018911247570750885,
+      "loss": 0.1894,
+      "step": 1833
+    },
+    {
+      "epoch": 0.385172739682873,
+      "grad_norm": 0.7652266621589661,
+      "learning_rate": 0.00018909670997896232,
+      "loss": 0.273,
+      "step": 1834
+    },
+    {
+      "epoch": 0.38538275753439044,
+      "grad_norm": 0.4933221638202667,
+      "learning_rate": 0.00018908093350200473,
+      "loss": 0.1563,
+      "step": 1835
+    },
+    {
+      "epoch": 0.3855927753859078,
+      "grad_norm": 0.6543354392051697,
+      "learning_rate": 0.00018906514627853936,
+      "loss": 0.1746,
+      "step": 1836
+    },
+    {
+      "epoch": 0.38580279323742517,
+      "grad_norm": 0.7568399310112,
+      "learning_rate": 0.00018904934831047072,
+      "loss": 0.2761,
+      "step": 1837
+    },
+    {
+      "epoch": 0.3860128110889426,
+      "grad_norm": 0.7224242091178894,
+      "learning_rate": 0.00018903353959970462,
+      "loss": 0.2429,
+      "step": 1838
+    },
+    {
+      "epoch": 0.38622282894045995,
+      "grad_norm": 0.9286659359931946,
+      "learning_rate": 0.00018901772014814824,
+      "loss": 0.2213,
+      "step": 1839
+    },
+    {
+      "epoch": 0.3864328467919773,
+      "grad_norm": 0.35347288846969604,
+      "learning_rate": 0.00018900188995770996,
+      "loss": 0.1102,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3866428646434947,
+      "grad_norm": 0.5769889950752258,
+      "learning_rate": 0.00018898604903029952,
+      "loss": 0.2866,
+      "step": 1841
+    },
+    {
+      "epoch": 0.3868528824950121,
+      "grad_norm": 0.8280730843544006,
+      "learning_rate": 0.0001889701973678279,
+      "loss": 0.135,
+      "step": 1842
+    },
+    {
+      "epoch": 0.38706290034652946,
+      "grad_norm": 0.6738591194152832,
+      "learning_rate": 0.00018895433497220744,
+      "loss": 0.2106,
+      "step": 1843
+    },
+    {
+      "epoch": 0.3872729181980468,
+      "grad_norm": 0.5238274335861206,
+      "learning_rate": 0.0001889384618453517,
+      "loss": 0.1742,
+      "step": 1844
+    },
+    {
+      "epoch": 0.3874829360495642,
+      "grad_norm": 0.4777994155883789,
+      "learning_rate": 0.0001889225779891756,
+      "loss": 0.2191,
+      "step": 1845
+    },
+    {
+      "epoch": 0.3876929539010816,
+      "grad_norm": 0.7966942191123962,
+      "learning_rate": 0.00018890668340559535,
+      "loss": 0.2583,
+      "step": 1846
+    },
+    {
+      "epoch": 0.38790297175259897,
+      "grad_norm": 0.7737426161766052,
+      "learning_rate": 0.0001888907780965284,
+      "loss": 0.225,
+      "step": 1847
+    },
+    {
+      "epoch": 0.38811298960411633,
+      "grad_norm": 0.712505578994751,
+      "learning_rate": 0.0001888748620638935,
+      "loss": 0.2436,
+      "step": 1848
+    },
+    {
+      "epoch": 0.38832300745563375,
+      "grad_norm": 0.5878315567970276,
+      "learning_rate": 0.00018885893530961082,
+      "loss": 0.1689,
+      "step": 1849
+    },
+    {
+      "epoch": 0.3885330253071511,
+      "grad_norm": 0.8706987500190735,
+      "learning_rate": 0.00018884299783560165,
+      "loss": 0.3054,
+      "step": 1850
+    },
+    {
+      "epoch": 0.3887430431586685,
+      "grad_norm": 0.5303587913513184,
+      "learning_rate": 0.00018882704964378867,
+      "loss": 0.2341,
+      "step": 1851
+    },
+    {
+      "epoch": 0.38895306101018584,
+      "grad_norm": 0.5766577124595642,
+      "learning_rate": 0.00018881109073609582,
+      "loss": 0.1952,
+      "step": 1852
+    },
+    {
+      "epoch": 0.38916307886170326,
+      "grad_norm": 0.5528069138526917,
+      "learning_rate": 0.00018879512111444834,
+      "loss": 0.2278,
+      "step": 1853
+    },
+    {
+      "epoch": 0.3893730967132206,
+      "grad_norm": 0.7019896507263184,
+      "learning_rate": 0.0001887791407807728,
+      "loss": 0.2488,
+      "step": 1854
+    },
+    {
+      "epoch": 0.389583114564738,
+      "grad_norm": 0.6992369294166565,
+      "learning_rate": 0.000188763149736997,
+      "loss": 0.3551,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3897931324162554,
+      "grad_norm": 0.5621406435966492,
+      "learning_rate": 0.00018874714798505004,
+      "loss": 0.2692,
+      "step": 1856
+    },
+    {
+      "epoch": 0.39000315026777277,
+      "grad_norm": 0.7229904532432556,
+      "learning_rate": 0.0001887311355268624,
+      "loss": 0.3149,
+      "step": 1857
+    },
+    {
+      "epoch": 0.39021316811929013,
+      "grad_norm": 0.7131726741790771,
+      "learning_rate": 0.0001887151123643657,
+      "loss": 0.2333,
+      "step": 1858
+    },
+    {
+      "epoch": 0.3904231859708075,
+      "grad_norm": 0.8763433694839478,
+      "learning_rate": 0.000188699078499493,
+      "loss": 0.3252,
+      "step": 1859
+    },
+    {
+      "epoch": 0.3906332038223249,
+      "grad_norm": 0.7169297337532043,
+      "learning_rate": 0.00018868303393417856,
+      "loss": 0.3808,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3908432216738423,
+      "grad_norm": 0.47738391160964966,
+      "learning_rate": 0.00018866697867035796,
+      "loss": 0.2403,
+      "step": 1861
+    },
+    {
+      "epoch": 0.39105323952535964,
+      "grad_norm": 0.6296969056129456,
+      "learning_rate": 0.00018865091270996807,
+      "loss": 0.2048,
+      "step": 1862
+    },
+    {
+      "epoch": 0.39126325737687706,
+      "grad_norm": 0.5875237584114075,
+      "learning_rate": 0.00018863483605494709,
+      "loss": 0.175,
+      "step": 1863
+    },
+    {
+      "epoch": 0.3914732752283944,
+      "grad_norm": 0.5673640370368958,
+      "learning_rate": 0.00018861874870723438,
+      "loss": 0.1633,
+      "step": 1864
+    },
+    {
+      "epoch": 0.3916832930799118,
+      "grad_norm": 0.6922441720962524,
+      "learning_rate": 0.00018860265066877074,
+      "loss": 0.263,
+      "step": 1865
+    },
+    {
+      "epoch": 0.39189331093142915,
+      "grad_norm": 0.5726366639137268,
+      "learning_rate": 0.0001885865419414982,
+      "loss": 0.2623,
+      "step": 1866
+    },
+    {
+      "epoch": 0.39210332878294657,
+      "grad_norm": 0.5620572566986084,
+      "learning_rate": 0.00018857042252736004,
+      "loss": 0.2096,
+      "step": 1867
+    },
+    {
+      "epoch": 0.39231334663446393,
+      "grad_norm": 0.5184766054153442,
+      "learning_rate": 0.0001885542924283009,
+      "loss": 0.1827,
+      "step": 1868
+    },
+    {
+      "epoch": 0.3925233644859813,
+      "grad_norm": 0.41842037439346313,
+      "learning_rate": 0.00018853815164626667,
+      "loss": 0.1194,
+      "step": 1869
+    },
+    {
+      "epoch": 0.39273338233749866,
+      "grad_norm": 0.7123748064041138,
+      "learning_rate": 0.00018852200018320452,
+      "loss": 0.2417,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3929434001890161,
+      "grad_norm": 0.6199332475662231,
+      "learning_rate": 0.00018850583804106292,
+      "loss": 0.1418,
+      "step": 1871
+    },
+    {
+      "epoch": 0.39315341804053344,
+      "grad_norm": 0.6634207963943481,
+      "learning_rate": 0.00018848966522179168,
+      "loss": 0.3139,
+      "step": 1872
+    },
+    {
+      "epoch": 0.3933634358920508,
+      "grad_norm": 0.5441175699234009,
+      "learning_rate": 0.00018847348172734178,
+      "loss": 0.1824,
+      "step": 1873
+    },
+    {
+      "epoch": 0.3935734537435682,
+      "grad_norm": 0.794145405292511,
+      "learning_rate": 0.00018845728755966564,
+      "loss": 0.2092,
+      "step": 1874
+    },
+    {
+      "epoch": 0.3937834715950856,
+      "grad_norm": 0.43816208839416504,
+      "learning_rate": 0.0001884410827207168,
+      "loss": 0.1461,
+      "step": 1875
+    },
+    {
+      "epoch": 0.39399348944660295,
+      "grad_norm": 0.5776886343955994,
+      "learning_rate": 0.00018842486721245023,
+      "loss": 0.1782,
+      "step": 1876
+    },
+    {
+      "epoch": 0.3942035072981203,
+      "grad_norm": 0.5448632836341858,
+      "learning_rate": 0.00018840864103682208,
+      "loss": 0.1693,
+      "step": 1877
+    },
+    {
+      "epoch": 0.39441352514963773,
+      "grad_norm": 0.9205145239830017,
+      "learning_rate": 0.00018839240419578988,
+      "loss": 0.2517,
+      "step": 1878
+    },
+    {
+      "epoch": 0.3946235430011551,
+      "grad_norm": 0.6252496838569641,
+      "learning_rate": 0.00018837615669131238,
+      "loss": 0.2044,
+      "step": 1879
+    },
+    {
+      "epoch": 0.39483356085267246,
+      "grad_norm": 0.7013902068138123,
+      "learning_rate": 0.00018835989852534967,
+      "loss": 0.2082,
+      "step": 1880
+    },
+    {
+      "epoch": 0.3950435787041899,
+      "grad_norm": 0.7002107501029968,
+      "learning_rate": 0.00018834362969986308,
+      "loss": 0.2845,
+      "step": 1881
+    },
+    {
+      "epoch": 0.39525359655570724,
+      "grad_norm": 0.5892364382743835,
+      "learning_rate": 0.00018832735021681523,
+      "loss": 0.2595,
+      "step": 1882
+    },
+    {
+      "epoch": 0.3954636144072246,
+      "grad_norm": 0.6822951436042786,
+      "learning_rate": 0.00018831106007817004,
+      "loss": 0.2269,
+      "step": 1883
+    },
+    {
+      "epoch": 0.39567363225874197,
+      "grad_norm": 0.5758324265480042,
+      "learning_rate": 0.00018829475928589271,
+      "loss": 0.1774,
+      "step": 1884
+    },
+    {
+      "epoch": 0.3958836501102594,
+      "grad_norm": 0.6938642859458923,
+      "learning_rate": 0.00018827844784194975,
+      "loss": 0.2571,
+      "step": 1885
+    },
+    {
+      "epoch": 0.39609366796177675,
+      "grad_norm": 0.8935677409172058,
+      "learning_rate": 0.0001882621257483089,
+      "loss": 0.221,
+      "step": 1886
+    },
+    {
+      "epoch": 0.3963036858132941,
+      "grad_norm": 0.6208258271217346,
+      "learning_rate": 0.00018824579300693922,
+      "loss": 0.3372,
+      "step": 1887
+    },
+    {
+      "epoch": 0.39651370366481153,
+      "grad_norm": 0.6998998522758484,
+      "learning_rate": 0.0001882294496198111,
+      "loss": 0.2326,
+      "step": 1888
+    },
+    {
+      "epoch": 0.3967237215163289,
+      "grad_norm": 1.004351019859314,
+      "learning_rate": 0.0001882130955888961,
+      "loss": 0.2532,
+      "step": 1889
+    },
+    {
+      "epoch": 0.39693373936784626,
+      "grad_norm": 0.8813514113426208,
+      "learning_rate": 0.0001881967309161672,
+      "loss": 0.2859,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3971437572193636,
+      "grad_norm": 0.48337891697883606,
+      "learning_rate": 0.00018818035560359855,
+      "loss": 0.1619,
+      "step": 1891
+    },
+    {
+      "epoch": 0.39735377507088104,
+      "grad_norm": 0.5266842246055603,
+      "learning_rate": 0.00018816396965316563,
+      "loss": 0.1676,
+      "step": 1892
+    },
+    {
+      "epoch": 0.3975637929223984,
+      "grad_norm": 0.48444923758506775,
+      "learning_rate": 0.00018814757306684522,
+      "loss": 0.149,
+      "step": 1893
+    },
+    {
+      "epoch": 0.39777381077391577,
+      "grad_norm": 0.651096522808075,
+      "learning_rate": 0.00018813116584661535,
+      "loss": 0.1735,
+      "step": 1894
+    },
+    {
+      "epoch": 0.3979838286254332,
+      "grad_norm": 0.5188713669776917,
+      "learning_rate": 0.00018811474799445535,
+      "loss": 0.2133,
+      "step": 1895
+    },
+    {
+      "epoch": 0.39819384647695055,
+      "grad_norm": 0.376615971326828,
+      "learning_rate": 0.00018809831951234582,
+      "loss": 0.1414,
+      "step": 1896
+    },
+    {
+      "epoch": 0.3984038643284679,
+      "grad_norm": 0.38462603092193604,
+      "learning_rate": 0.00018808188040226868,
+      "loss": 0.1363,
+      "step": 1897
+    },
+    {
+      "epoch": 0.3986138821799853,
+      "grad_norm": 0.6370077133178711,
+      "learning_rate": 0.00018806543066620708,
+      "loss": 0.1403,
+      "step": 1898
+    },
+    {
+      "epoch": 0.3988239000315027,
+      "grad_norm": 0.6808513402938843,
+      "learning_rate": 0.00018804897030614548,
+      "loss": 0.2621,
+      "step": 1899
+    },
+    {
+      "epoch": 0.39903391788302006,
+      "grad_norm": 0.4564354121685028,
+      "learning_rate": 0.00018803249932406962,
+      "loss": 0.1761,
+      "step": 1900
+    },
+    {
+      "epoch": 0.3992439357345374,
+      "grad_norm": 0.46451982855796814,
+      "learning_rate": 0.0001880160177219665,
+      "loss": 0.1802,
+      "step": 1901
+    },
+    {
+      "epoch": 0.3994539535860548,
+      "grad_norm": 0.46525609493255615,
+      "learning_rate": 0.00018799952550182446,
+      "loss": 0.2133,
+      "step": 1902
+    },
+    {
+      "epoch": 0.3996639714375722,
+      "grad_norm": 0.49694451689720154,
+      "learning_rate": 0.0001879830226656331,
+      "loss": 0.1612,
+      "step": 1903
+    },
+    {
+      "epoch": 0.39987398928908957,
+      "grad_norm": 0.7554435133934021,
+      "learning_rate": 0.00018796650921538318,
+      "loss": 0.4251,
+      "step": 1904
+    },
+    {
+      "epoch": 0.40008400714060693,
+      "grad_norm": 0.5206797122955322,
+      "learning_rate": 0.00018794998515306692,
+      "loss": 0.1765,
+      "step": 1905
+    },
+    {
+      "epoch": 0.40029402499212435,
+      "grad_norm": 0.5828625559806824,
+      "learning_rate": 0.00018793345048067774,
+      "loss": 0.278,
+      "step": 1906
+    },
+    {
+      "epoch": 0.4005040428436417,
+      "grad_norm": 0.48372307419776917,
+      "learning_rate": 0.00018791690520021036,
+      "loss": 0.1755,
+      "step": 1907
+    },
+    {
+      "epoch": 0.4007140606951591,
+      "grad_norm": 0.70604008436203,
+      "learning_rate": 0.00018790034931366072,
+      "loss": 0.2469,
+      "step": 1908
+    },
+    {
+      "epoch": 0.40092407854667644,
+      "grad_norm": 0.6227346658706665,
+      "learning_rate": 0.00018788378282302606,
+      "loss": 0.242,
+      "step": 1909
+    },
+    {
+      "epoch": 0.40113409639819386,
+      "grad_norm": 0.6886958479881287,
+      "learning_rate": 0.000187867205730305,
+      "loss": 0.1898,
+      "step": 1910
+    },
+    {
+      "epoch": 0.4013441142497112,
+      "grad_norm": 0.6775689125061035,
+      "learning_rate": 0.0001878506180374973,
+      "loss": 0.182,
+      "step": 1911
+    },
+    {
+      "epoch": 0.4015541321012286,
+      "grad_norm": 0.7775040864944458,
+      "learning_rate": 0.0001878340197466041,
+      "loss": 0.2411,
+      "step": 1912
+    },
+    {
+      "epoch": 0.401764149952746,
+      "grad_norm": 0.7616844177246094,
+      "learning_rate": 0.00018781741085962774,
+      "loss": 0.3289,
+      "step": 1913
+    },
+    {
+      "epoch": 0.40197416780426337,
+      "grad_norm": 0.7016698718070984,
+      "learning_rate": 0.0001878007913785719,
+      "loss": 0.1935,
+      "step": 1914
+    },
+    {
+      "epoch": 0.40218418565578073,
+      "grad_norm": 0.5730949640274048,
+      "learning_rate": 0.0001877841613054415,
+      "loss": 0.1898,
+      "step": 1915
+    },
+    {
+      "epoch": 0.4023942035072981,
+      "grad_norm": 0.7912655472755432,
+      "learning_rate": 0.00018776752064224273,
+      "loss": 0.2357,
+      "step": 1916
+    },
+    {
+      "epoch": 0.4026042213588155,
+      "grad_norm": 0.6684666872024536,
+      "learning_rate": 0.00018775086939098312,
+      "loss": 0.1697,
+      "step": 1917
+    },
+    {
+      "epoch": 0.4028142392103329,
+      "grad_norm": 0.7139754891395569,
+      "learning_rate": 0.00018773420755367144,
+      "loss": 0.3489,
+      "step": 1918
+    },
+    {
+      "epoch": 0.40302425706185024,
+      "grad_norm": 0.8503203392028809,
+      "learning_rate": 0.00018771753513231772,
+      "loss": 0.2465,
+      "step": 1919
+    },
+    {
+      "epoch": 0.40323427491336766,
+      "grad_norm": 0.7805736660957336,
+      "learning_rate": 0.00018770085212893326,
+      "loss": 0.1745,
+      "step": 1920
+    },
+    {
+      "epoch": 0.403444292764885,
+      "grad_norm": 0.996834397315979,
+      "learning_rate": 0.00018768415854553068,
+      "loss": 0.293,
+      "step": 1921
+    },
+    {
+      "epoch": 0.4036543106164024,
+      "grad_norm": 0.9461717009544373,
+      "learning_rate": 0.00018766745438412384,
+      "loss": 0.1942,
+      "step": 1922
+    },
+    {
+      "epoch": 0.40386432846791975,
+      "grad_norm": 0.4621502757072449,
+      "learning_rate": 0.0001876507396467279,
+      "loss": 0.1086,
+      "step": 1923
+    },
+    {
+      "epoch": 0.40407434631943717,
+      "grad_norm": 0.7377074360847473,
+      "learning_rate": 0.0001876340143353593,
+      "loss": 0.2052,
+      "step": 1924
+    },
+    {
+      "epoch": 0.40428436417095454,
+      "grad_norm": 0.6381657123565674,
+      "learning_rate": 0.00018761727845203568,
+      "loss": 0.2239,
+      "step": 1925
+    },
+    {
+      "epoch": 0.4044943820224719,
+      "grad_norm": 0.6710989475250244,
+      "learning_rate": 0.0001876005319987761,
+      "loss": 0.241,
+      "step": 1926
+    },
+    {
+      "epoch": 0.40470439987398926,
+      "grad_norm": 0.6272912621498108,
+      "learning_rate": 0.00018758377497760074,
+      "loss": 0.1398,
+      "step": 1927
+    },
+    {
+      "epoch": 0.4049144177255067,
+      "grad_norm": 0.7110686302185059,
+      "learning_rate": 0.00018756700739053117,
+      "loss": 0.1825,
+      "step": 1928
+    },
+    {
+      "epoch": 0.40512443557702404,
+      "grad_norm": 0.38758787512779236,
+      "learning_rate": 0.00018755022923959018,
+      "loss": 0.1484,
+      "step": 1929
+    },
+    {
+      "epoch": 0.4053344534285414,
+      "grad_norm": 0.6288896799087524,
+      "learning_rate": 0.00018753344052680184,
+      "loss": 0.2315,
+      "step": 1930
+    },
+    {
+      "epoch": 0.4055444712800588,
+      "grad_norm": 0.5400965809822083,
+      "learning_rate": 0.00018751664125419152,
+      "loss": 0.1389,
+      "step": 1931
+    },
+    {
+      "epoch": 0.4057544891315762,
+      "grad_norm": 0.754919707775116,
+      "learning_rate": 0.00018749983142378582,
+      "loss": 0.238,
+      "step": 1932
+    },
+    {
+      "epoch": 0.40596450698309355,
+      "grad_norm": 0.5651653409004211,
+      "learning_rate": 0.00018748301103761264,
+      "loss": 0.1729,
+      "step": 1933
+    },
+    {
+      "epoch": 0.4061745248346109,
+      "grad_norm": 0.6878918409347534,
+      "learning_rate": 0.00018746618009770117,
+      "loss": 0.1804,
+      "step": 1934
+    },
+    {
+      "epoch": 0.40638454268612834,
+      "grad_norm": 0.46758532524108887,
+      "learning_rate": 0.00018744933860608183,
+      "loss": 0.1225,
+      "step": 1935
+    },
+    {
+      "epoch": 0.4065945605376457,
+      "grad_norm": 0.5796334147453308,
+      "learning_rate": 0.00018743248656478634,
+      "loss": 0.1805,
+      "step": 1936
+    },
+    {
+      "epoch": 0.40680457838916306,
+      "grad_norm": 0.6088680028915405,
+      "learning_rate": 0.00018741562397584768,
+      "loss": 0.1575,
+      "step": 1937
+    },
+    {
+      "epoch": 0.4070145962406805,
+      "grad_norm": 0.5850851535797119,
+      "learning_rate": 0.00018739875084130013,
+      "loss": 0.1373,
+      "step": 1938
+    },
+    {
+      "epoch": 0.40722461409219785,
+      "grad_norm": 0.5626951456069946,
+      "learning_rate": 0.00018738186716317924,
+      "loss": 0.1653,
+      "step": 1939
+    },
+    {
+      "epoch": 0.4074346319437152,
+      "grad_norm": 0.5403121113777161,
+      "learning_rate": 0.0001873649729435218,
+      "loss": 0.253,
+      "step": 1940
+    },
+    {
+      "epoch": 0.40764464979523257,
+      "grad_norm": 0.5512644052505493,
+      "learning_rate": 0.00018734806818436584,
+      "loss": 0.1745,
+      "step": 1941
+    },
+    {
+      "epoch": 0.40785466764675,
+      "grad_norm": 0.4115297794342041,
+      "learning_rate": 0.00018733115288775077,
+      "loss": 0.1222,
+      "step": 1942
+    },
+    {
+      "epoch": 0.40806468549826735,
+      "grad_norm": 0.43221354484558105,
+      "learning_rate": 0.00018731422705571725,
+      "loss": 0.1172,
+      "step": 1943
+    },
+    {
+      "epoch": 0.4082747033497847,
+      "grad_norm": 0.5241899490356445,
+      "learning_rate": 0.00018729729069030704,
+      "loss": 0.1785,
+      "step": 1944
+    },
+    {
+      "epoch": 0.40848472120130214,
+      "grad_norm": 0.8017915487289429,
+      "learning_rate": 0.00018728034379356342,
+      "loss": 0.1877,
+      "step": 1945
+    },
+    {
+      "epoch": 0.4086947390528195,
+      "grad_norm": 0.5832831263542175,
+      "learning_rate": 0.00018726338636753078,
+      "loss": 0.1465,
+      "step": 1946
+    },
+    {
+      "epoch": 0.40890475690433686,
+      "grad_norm": 0.6607633829116821,
+      "learning_rate": 0.00018724641841425478,
+      "loss": 0.1821,
+      "step": 1947
+    },
+    {
+      "epoch": 0.4091147747558542,
+      "grad_norm": 0.8016804456710815,
+      "learning_rate": 0.00018722943993578248,
+      "loss": 0.2567,
+      "step": 1948
+    },
+    {
+      "epoch": 0.40932479260737165,
+      "grad_norm": 0.5984741449356079,
+      "learning_rate": 0.00018721245093416208,
+      "loss": 0.2095,
+      "step": 1949
+    },
+    {
+      "epoch": 0.409534810458889,
+      "grad_norm": 0.7901354432106018,
+      "learning_rate": 0.00018719545141144308,
+      "loss": 0.1916,
+      "step": 1950
+    },
+    {
+      "epoch": 0.4097448283104064,
+      "grad_norm": 0.5149995684623718,
+      "learning_rate": 0.00018717844136967624,
+      "loss": 0.227,
+      "step": 1951
+    },
+    {
+      "epoch": 0.40995484616192374,
+      "grad_norm": 0.6096331477165222,
+      "learning_rate": 0.00018716142081091368,
+      "loss": 0.2901,
+      "step": 1952
+    },
+    {
+      "epoch": 0.41016486401344115,
+      "grad_norm": 0.5911889672279358,
+      "learning_rate": 0.00018714438973720866,
+      "loss": 0.3882,
+      "step": 1953
+    },
+    {
+      "epoch": 0.4103748818649585,
+      "grad_norm": 0.5551436543464661,
+      "learning_rate": 0.0001871273481506158,
+      "loss": 0.1997,
+      "step": 1954
+    },
+    {
+      "epoch": 0.4105848997164759,
+      "grad_norm": 0.5714811682701111,
+      "learning_rate": 0.00018711029605319093,
+      "loss": 0.2507,
+      "step": 1955
+    },
+    {
+      "epoch": 0.4107949175679933,
+      "grad_norm": 0.6794240474700928,
+      "learning_rate": 0.00018709323344699117,
+      "loss": 0.2722,
+      "step": 1956
+    },
+    {
+      "epoch": 0.41100493541951066,
+      "grad_norm": 0.6706187725067139,
+      "learning_rate": 0.00018707616033407498,
+      "loss": 0.2413,
+      "step": 1957
+    },
+    {
+      "epoch": 0.411214953271028,
+      "grad_norm": 0.7831113934516907,
+      "learning_rate": 0.0001870590767165019,
+      "loss": 0.2789,
+      "step": 1958
+    },
+    {
+      "epoch": 0.4114249711225454,
+      "grad_norm": 0.8539621829986572,
+      "learning_rate": 0.00018704198259633298,
+      "loss": 0.3216,
+      "step": 1959
+    },
+    {
+      "epoch": 0.4116349889740628,
+      "grad_norm": 0.7270777821540833,
+      "learning_rate": 0.00018702487797563034,
+      "loss": 0.1851,
+      "step": 1960
+    },
+    {
+      "epoch": 0.4118450068255802,
+      "grad_norm": 0.49483752250671387,
+      "learning_rate": 0.00018700776285645744,
+      "loss": 0.2089,
+      "step": 1961
+    },
+    {
+      "epoch": 0.41205502467709754,
+      "grad_norm": 0.5273351073265076,
+      "learning_rate": 0.00018699063724087904,
+      "loss": 0.1737,
+      "step": 1962
+    },
+    {
+      "epoch": 0.41226504252861496,
+      "grad_norm": 0.5160996317863464,
+      "learning_rate": 0.0001869735011309611,
+      "loss": 0.142,
+      "step": 1963
+    },
+    {
+      "epoch": 0.4124750603801323,
+      "grad_norm": 0.5540682077407837,
+      "learning_rate": 0.00018695635452877093,
+      "loss": 0.3374,
+      "step": 1964
+    },
+    {
+      "epoch": 0.4126850782316497,
+      "grad_norm": 0.6244357228279114,
+      "learning_rate": 0.000186939197436377,
+      "loss": 0.1763,
+      "step": 1965
+    },
+    {
+      "epoch": 0.41289509608316705,
+      "grad_norm": 0.6462785005569458,
+      "learning_rate": 0.00018692202985584909,
+      "loss": 0.1546,
+      "step": 1966
+    },
+    {
+      "epoch": 0.41310511393468446,
+      "grad_norm": 0.6570327877998352,
+      "learning_rate": 0.0001869048517892583,
+      "loss": 0.1957,
+      "step": 1967
+    },
+    {
+      "epoch": 0.41331513178620183,
+      "grad_norm": 0.45998433232307434,
+      "learning_rate": 0.00018688766323867695,
+      "loss": 0.1414,
+      "step": 1968
+    },
+    {
+      "epoch": 0.4135251496377192,
+      "grad_norm": 0.6960569024085999,
+      "learning_rate": 0.0001868704642061786,
+      "loss": 0.1752,
+      "step": 1969
+    },
+    {
+      "epoch": 0.4137351674892366,
+      "grad_norm": 0.5571821928024292,
+      "learning_rate": 0.0001868532546938381,
+      "loss": 0.1502,
+      "step": 1970
+    },
+    {
+      "epoch": 0.413945185340754,
+      "grad_norm": 0.3070127069950104,
+      "learning_rate": 0.0001868360347037316,
+      "loss": 0.0917,
+      "step": 1971
+    },
+    {
+      "epoch": 0.41415520319227134,
+      "grad_norm": 0.5906655192375183,
+      "learning_rate": 0.00018681880423793642,
+      "loss": 0.1616,
+      "step": 1972
+    },
+    {
+      "epoch": 0.4143652210437887,
+      "grad_norm": 0.49494755268096924,
+      "learning_rate": 0.00018680156329853125,
+      "loss": 0.1839,
+      "step": 1973
+    },
+    {
+      "epoch": 0.4145752388953061,
+      "grad_norm": 0.48421719670295715,
+      "learning_rate": 0.00018678431188759594,
+      "loss": 0.1576,
+      "step": 1974
+    },
+    {
+      "epoch": 0.4147852567468235,
+      "grad_norm": 0.5349069833755493,
+      "learning_rate": 0.00018676705000721176,
+      "loss": 0.2045,
+      "step": 1975
+    },
+    {
+      "epoch": 0.41499527459834085,
+      "grad_norm": 0.49466922879219055,
+      "learning_rate": 0.00018674977765946105,
+      "loss": 0.1873,
+      "step": 1976
+    },
+    {
+      "epoch": 0.41520529244985827,
+      "grad_norm": 0.8801465034484863,
+      "learning_rate": 0.0001867324948464275,
+      "loss": 0.2954,
+      "step": 1977
+    },
+    {
+      "epoch": 0.41541531030137563,
+      "grad_norm": 0.7728480100631714,
+      "learning_rate": 0.00018671520157019614,
+      "loss": 0.1919,
+      "step": 1978
+    },
+    {
+      "epoch": 0.415625328152893,
+      "grad_norm": 0.4381806254386902,
+      "learning_rate": 0.0001866978978328531,
+      "loss": 0.1868,
+      "step": 1979
+    },
+    {
+      "epoch": 0.41583534600441036,
+      "grad_norm": 0.7297528386116028,
+      "learning_rate": 0.00018668058363648598,
+      "loss": 0.2358,
+      "step": 1980
+    },
+    {
+      "epoch": 0.4160453638559278,
+      "grad_norm": 0.9243682622909546,
+      "learning_rate": 0.00018666325898318342,
+      "loss": 0.3351,
+      "step": 1981
+    },
+    {
+      "epoch": 0.41625538170744514,
+      "grad_norm": 0.6876251101493835,
+      "learning_rate": 0.00018664592387503543,
+      "loss": 0.1835,
+      "step": 1982
+    },
+    {
+      "epoch": 0.4164653995589625,
+      "grad_norm": 0.6412570476531982,
+      "learning_rate": 0.0001866285783141333,
+      "loss": 0.1726,
+      "step": 1983
+    },
+    {
+      "epoch": 0.41667541741047986,
+      "grad_norm": 0.7733229994773865,
+      "learning_rate": 0.0001866112223025696,
+      "loss": 0.2133,
+      "step": 1984
+    },
+    {
+      "epoch": 0.4168854352619973,
+      "grad_norm": 0.6993643045425415,
+      "learning_rate": 0.00018659385584243805,
+      "loss": 0.1651,
+      "step": 1985
+    },
+    {
+      "epoch": 0.41709545311351465,
+      "grad_norm": 0.438504695892334,
+      "learning_rate": 0.0001865764789358337,
+      "loss": 0.1346,
+      "step": 1986
+    },
+    {
+      "epoch": 0.417305470965032,
+      "grad_norm": 0.8238644599914551,
+      "learning_rate": 0.0001865590915848529,
+      "loss": 0.2273,
+      "step": 1987
+    },
+    {
+      "epoch": 0.41751548881654943,
+      "grad_norm": 0.5988185405731201,
+      "learning_rate": 0.0001865416937915932,
+      "loss": 0.2296,
+      "step": 1988
+    },
+    {
+      "epoch": 0.4177255066680668,
+      "grad_norm": 0.3924911618232727,
+      "learning_rate": 0.0001865242855581534,
+      "loss": 0.1437,
+      "step": 1989
+    },
+    {
+      "epoch": 0.41793552451958416,
+      "grad_norm": 0.3869493901729584,
+      "learning_rate": 0.00018650686688663362,
+      "loss": 0.1312,
+      "step": 1990
+    },
+    {
+      "epoch": 0.4181455423711015,
+      "grad_norm": 0.708566427230835,
+      "learning_rate": 0.0001864894377791352,
+      "loss": 0.1798,
+      "step": 1991
+    },
+    {
+      "epoch": 0.41835556022261894,
+      "grad_norm": 0.4604390859603882,
+      "learning_rate": 0.00018647199823776075,
+      "loss": 0.1269,
+      "step": 1992
+    },
+    {
+      "epoch": 0.4185655780741363,
+      "grad_norm": 0.6290651559829712,
+      "learning_rate": 0.00018645454826461414,
+      "loss": 0.306,
+      "step": 1993
+    },
+    {
+      "epoch": 0.41877559592565367,
+      "grad_norm": 0.7372380495071411,
+      "learning_rate": 0.00018643708786180048,
+      "loss": 0.1536,
+      "step": 1994
+    },
+    {
+      "epoch": 0.4189856137771711,
+      "grad_norm": 0.699030339717865,
+      "learning_rate": 0.00018641961703142612,
+      "loss": 0.2426,
+      "step": 1995
+    },
+    {
+      "epoch": 0.41919563162868845,
+      "grad_norm": 0.6631273627281189,
+      "learning_rate": 0.00018640213577559873,
+      "loss": 0.1994,
+      "step": 1996
+    },
+    {
+      "epoch": 0.4194056494802058,
+      "grad_norm": 0.6958602070808411,
+      "learning_rate": 0.00018638464409642723,
+      "loss": 0.2799,
+      "step": 1997
+    },
+    {
+      "epoch": 0.4196156673317232,
+      "grad_norm": 0.8655551075935364,
+      "learning_rate": 0.00018636714199602174,
+      "loss": 0.2509,
+      "step": 1998
+    },
+    {
+      "epoch": 0.4198256851832406,
+      "grad_norm": 0.6360480189323425,
+      "learning_rate": 0.00018634962947649368,
+      "loss": 0.1622,
+      "step": 1999
+    },
+    {
+      "epoch": 0.42003570303475796,
+      "grad_norm": 0.6162030100822449,
+      "learning_rate": 0.00018633210653995572,
+      "loss": 0.2662,
+      "step": 2000
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 9522,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 7.958982538786406e+16,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}