diff --git "a/Qwen2.5-Coder-7B-Instruct-MathQA/checkpoint-1000/trainer_state.json" "b/Qwen2.5-Coder-7B-Instruct-MathQA/checkpoint-1000/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/Qwen2.5-Coder-7B-Instruct-MathQA/checkpoint-1000/trainer_state.json"
@@ -0,0 +1,7033 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.21001785151737898,
+  "eval_steps": 500,
+  "global_step": 1000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00021001785151737897,
+      "grad_norm": 0.5090809464454651,
+      "learning_rate": 4.19287211740042e-07,
+      "loss": 1.1155,
+      "step": 1
+    },
+    {
+      "epoch": 0.00042003570303475793,
+      "grad_norm": 0.6277585625648499,
+      "learning_rate": 8.38574423480084e-07,
+      "loss": 1.3855,
+      "step": 2
+    },
+    {
+      "epoch": 0.000630053554552137,
+      "grad_norm": 0.7032777070999146,
+      "learning_rate": 1.257861635220126e-06,
+      "loss": 1.3905,
+      "step": 3
+    },
+    {
+      "epoch": 0.0008400714060695159,
+      "grad_norm": 0.745212197303772,
+      "learning_rate": 1.677148846960168e-06,
+      "loss": 1.4668,
+      "step": 4
+    },
+    {
+      "epoch": 0.0010500892575868949,
+      "grad_norm": 0.7577304840087891,
+      "learning_rate": 2.09643605870021e-06,
+      "loss": 1.473,
+      "step": 5
+    },
+    {
+      "epoch": 0.001260107109104274,
+      "grad_norm": 0.7788395881652832,
+      "learning_rate": 2.515723270440252e-06,
+      "loss": 1.485,
+      "step": 6
+    },
+    {
+      "epoch": 0.001470124960621653,
+      "grad_norm": 0.7430889010429382,
+      "learning_rate": 2.935010482180294e-06,
+      "loss": 1.4338,
+      "step": 7
+    },
+    {
+      "epoch": 0.0016801428121390317,
+      "grad_norm": 0.8291558623313904,
+      "learning_rate": 3.354297693920336e-06,
+      "loss": 1.4768,
+      "step": 8
+    },
+    {
+      "epoch": 0.0018901606636564107,
+      "grad_norm": 0.7731107473373413,
+      "learning_rate": 3.7735849056603773e-06,
+      "loss": 1.5853,
+      "step": 9
+    },
+    {
+      "epoch": 0.0021001785151737898,
+      "grad_norm": 0.8241227269172668,
+      "learning_rate": 4.19287211740042e-06,
+      "loss": 1.7314,
+      "step": 10
+    },
+    {
+      "epoch": 0.002310196366691169,
+      "grad_norm": 0.8158630728721619,
+      "learning_rate": 4.612159329140462e-06,
+      "loss": 1.5916,
+      "step": 11
+    },
+    {
+      "epoch": 0.002520214218208548,
+      "grad_norm": 0.8860861659049988,
+      "learning_rate": 5.031446540880504e-06,
+      "loss": 1.5115,
+      "step": 12
+    },
+    {
+      "epoch": 0.002730232069725927,
+      "grad_norm": 0.868651270866394,
+      "learning_rate": 5.4507337526205454e-06,
+      "loss": 1.7514,
+      "step": 13
+    },
+    {
+      "epoch": 0.002940249921243306,
+      "grad_norm": 0.9116117358207703,
+      "learning_rate": 5.870020964360588e-06,
+      "loss": 1.5645,
+      "step": 14
+    },
+    {
+      "epoch": 0.003150267772760685,
+      "grad_norm": 0.8694919347763062,
+      "learning_rate": 6.289308176100629e-06,
+      "loss": 1.6326,
+      "step": 15
+    },
+    {
+      "epoch": 0.0033602856242780635,
+      "grad_norm": 0.8614499568939209,
+      "learning_rate": 6.708595387840672e-06,
+      "loss": 1.6224,
+      "step": 16
+    },
+    {
+      "epoch": 0.0035703034757954425,
+      "grad_norm": 0.8713967800140381,
+      "learning_rate": 7.127882599580712e-06,
+      "loss": 1.5923,
+      "step": 17
+    },
+    {
+      "epoch": 0.0037803213273128215,
+      "grad_norm": 0.8446964025497437,
+      "learning_rate": 7.547169811320755e-06,
+      "loss": 1.5843,
+      "step": 18
+    },
+    {
+      "epoch": 0.0039903391788302005,
+      "grad_norm": 0.8920742869377136,
+      "learning_rate": 7.966457023060797e-06,
+      "loss": 1.5485,
+      "step": 19
+    },
+    {
+      "epoch": 0.0042003570303475795,
+      "grad_norm": 0.9501891136169434,
+      "learning_rate": 8.38574423480084e-06,
+      "loss": 1.663,
+      "step": 20
+    },
+    {
+      "epoch": 0.0044103748818649586,
+      "grad_norm": 0.9179856181144714,
+      "learning_rate": 8.80503144654088e-06,
+      "loss": 1.6163,
+      "step": 21
+    },
+    {
+      "epoch": 0.004620392733382338,
+      "grad_norm": 0.8716169595718384,
+      "learning_rate": 9.224318658280923e-06,
+      "loss": 1.5311,
+      "step": 22
+    },
+    {
+      "epoch": 0.004830410584899717,
+      "grad_norm": 0.9034018516540527,
+      "learning_rate": 9.643605870020965e-06,
+      "loss": 1.5315,
+      "step": 23
+    },
+    {
+      "epoch": 0.005040428436417096,
+      "grad_norm": 0.9811834692955017,
+      "learning_rate": 1.0062893081761008e-05,
+      "loss": 1.5356,
+      "step": 24
+    },
+    {
+      "epoch": 0.005250446287934475,
+      "grad_norm": 0.8846603035926819,
+      "learning_rate": 1.0482180293501048e-05,
+      "loss": 1.5815,
+      "step": 25
+    },
+    {
+      "epoch": 0.005460464139451854,
+      "grad_norm": 0.8842517137527466,
+      "learning_rate": 1.0901467505241091e-05,
+      "loss": 1.5628,
+      "step": 26
+    },
+    {
+      "epoch": 0.005670481990969233,
+      "grad_norm": 0.9207525253295898,
+      "learning_rate": 1.1320754716981132e-05,
+      "loss": 1.5623,
+      "step": 27
+    },
+    {
+      "epoch": 0.005880499842486612,
+      "grad_norm": 0.9082942605018616,
+      "learning_rate": 1.1740041928721176e-05,
+      "loss": 1.4599,
+      "step": 28
+    },
+    {
+      "epoch": 0.006090517694003991,
+      "grad_norm": 0.8724138736724854,
+      "learning_rate": 1.2159329140461215e-05,
+      "loss": 1.5282,
+      "step": 29
+    },
+    {
+      "epoch": 0.00630053554552137,
+      "grad_norm": 0.8738006353378296,
+      "learning_rate": 1.2578616352201259e-05,
+      "loss": 1.4782,
+      "step": 30
+    },
+    {
+      "epoch": 0.006510553397038748,
+      "grad_norm": 0.9410291910171509,
+      "learning_rate": 1.29979035639413e-05,
+      "loss": 1.3856,
+      "step": 31
+    },
+    {
+      "epoch": 0.006720571248556127,
+      "grad_norm": 0.9309423565864563,
+      "learning_rate": 1.3417190775681343e-05,
+      "loss": 1.4267,
+      "step": 32
+    },
+    {
+      "epoch": 0.006930589100073506,
+      "grad_norm": 0.9442999362945557,
+      "learning_rate": 1.3836477987421385e-05,
+      "loss": 1.3706,
+      "step": 33
+    },
+    {
+      "epoch": 0.007140606951590885,
+      "grad_norm": 0.9511269927024841,
+      "learning_rate": 1.4255765199161425e-05,
+      "loss": 1.26,
+      "step": 34
+    },
+    {
+      "epoch": 0.007350624803108264,
+      "grad_norm": 1.0389297008514404,
+      "learning_rate": 1.467505241090147e-05,
+      "loss": 1.235,
+      "step": 35
+    },
+    {
+      "epoch": 0.007560642654625643,
+      "grad_norm": 1.0033001899719238,
+      "learning_rate": 1.509433962264151e-05,
+      "loss": 1.2687,
+      "step": 36
+    },
+    {
+      "epoch": 0.007770660506143022,
+      "grad_norm": 1.075852632522583,
+      "learning_rate": 1.5513626834381552e-05,
+      "loss": 1.2762,
+      "step": 37
+    },
+    {
+      "epoch": 0.007980678357660401,
+      "grad_norm": 1.0721476078033447,
+      "learning_rate": 1.5932914046121594e-05,
+      "loss": 1.1935,
+      "step": 38
+    },
+    {
+      "epoch": 0.00819069620917778,
+      "grad_norm": 1.0784581899642944,
+      "learning_rate": 1.6352201257861635e-05,
+      "loss": 1.1119,
+      "step": 39
+    },
+    {
+      "epoch": 0.008400714060695159,
+      "grad_norm": 1.1390137672424316,
+      "learning_rate": 1.677148846960168e-05,
+      "loss": 1.0373,
+      "step": 40
+    },
+    {
+      "epoch": 0.008610731912212538,
+      "grad_norm": 1.3073922395706177,
+      "learning_rate": 1.719077568134172e-05,
+      "loss": 1.1536,
+      "step": 41
+    },
+    {
+      "epoch": 0.008820749763729917,
+      "grad_norm": 1.3248019218444824,
+      "learning_rate": 1.761006289308176e-05,
+      "loss": 0.9316,
+      "step": 42
+    },
+    {
+      "epoch": 0.009030767615247296,
+      "grad_norm": 1.3569798469543457,
+      "learning_rate": 1.8029350104821805e-05,
+      "loss": 0.8881,
+      "step": 43
+    },
+    {
+      "epoch": 0.009240785466764675,
+      "grad_norm": 1.3192838430404663,
+      "learning_rate": 1.8448637316561846e-05,
+      "loss": 0.8825,
+      "step": 44
+    },
+    {
+      "epoch": 0.009450803318282054,
+      "grad_norm": 1.1947859525680542,
+      "learning_rate": 1.8867924528301888e-05,
+      "loss": 0.9415,
+      "step": 45
+    },
+    {
+      "epoch": 0.009660821169799433,
+      "grad_norm": 1.1684753894805908,
+      "learning_rate": 1.928721174004193e-05,
+      "loss": 0.83,
+      "step": 46
+    },
+    {
+      "epoch": 0.009870839021316812,
+      "grad_norm": 1.1097474098205566,
+      "learning_rate": 1.970649895178197e-05,
+      "loss": 0.7248,
+      "step": 47
+    },
+    {
+      "epoch": 0.010080856872834191,
+      "grad_norm": 1.0564842224121094,
+      "learning_rate": 2.0125786163522016e-05,
+      "loss": 0.7427,
+      "step": 48
+    },
+    {
+      "epoch": 0.01029087472435157,
+      "grad_norm": 0.9865881204605103,
+      "learning_rate": 2.0545073375262054e-05,
+      "loss": 0.877,
+      "step": 49
+    },
+    {
+      "epoch": 0.01050089257586895,
+      "grad_norm": 1.072039246559143,
+      "learning_rate": 2.0964360587002095e-05,
+      "loss": 0.8473,
+      "step": 50
+    },
+    {
+      "epoch": 0.010710910427386328,
+      "grad_norm": 0.5596430897712708,
+      "learning_rate": 2.138364779874214e-05,
+      "loss": 0.4599,
+      "step": 51
+    },
+    {
+      "epoch": 0.010920928278903707,
+      "grad_norm": 0.6180581450462341,
+      "learning_rate": 2.1802935010482182e-05,
+      "loss": 0.5215,
+      "step": 52
+    },
+    {
+      "epoch": 0.011130946130421086,
+      "grad_norm": 0.6805194616317749,
+      "learning_rate": 2.2222222222222223e-05,
+      "loss": 0.6148,
+      "step": 53
+    },
+    {
+      "epoch": 0.011340963981938465,
+      "grad_norm": 0.7125585079193115,
+      "learning_rate": 2.2641509433962265e-05,
+      "loss": 0.5615,
+      "step": 54
+    },
+    {
+      "epoch": 0.011550981833455844,
+      "grad_norm": 0.6816964745521545,
+      "learning_rate": 2.3060796645702306e-05,
+      "loss": 0.478,
+      "step": 55
+    },
+    {
+      "epoch": 0.011760999684973223,
+      "grad_norm": 0.5821985602378845,
+      "learning_rate": 2.348008385744235e-05,
+      "loss": 0.5604,
+      "step": 56
+    },
+    {
+      "epoch": 0.011971017536490602,
+      "grad_norm": 0.642721951007843,
+      "learning_rate": 2.3899371069182393e-05,
+      "loss": 0.4068,
+      "step": 57
+    },
+    {
+      "epoch": 0.012181035388007981,
+      "grad_norm": 0.5806999206542969,
+      "learning_rate": 2.431865828092243e-05,
+      "loss": 0.4711,
+      "step": 58
+    },
+    {
+      "epoch": 0.01239105323952536,
+      "grad_norm": 0.6702911257743835,
+      "learning_rate": 2.4737945492662476e-05,
+      "loss": 0.4601,
+      "step": 59
+    },
+    {
+      "epoch": 0.01260107109104274,
+      "grad_norm": 0.6345894932746887,
+      "learning_rate": 2.5157232704402517e-05,
+      "loss": 0.4172,
+      "step": 60
+    },
+    {
+      "epoch": 0.012811088942560117,
+      "grad_norm": 0.6444422602653503,
+      "learning_rate": 2.5576519916142562e-05,
+      "loss": 0.4635,
+      "step": 61
+    },
+    {
+      "epoch": 0.013021106794077496,
+      "grad_norm": 0.6568206548690796,
+      "learning_rate": 2.59958071278826e-05,
+      "loss": 0.4327,
+      "step": 62
+    },
+    {
+      "epoch": 0.013231124645594875,
+      "grad_norm": 0.6627638936042786,
+      "learning_rate": 2.641509433962264e-05,
+      "loss": 0.4935,
+      "step": 63
+    },
+    {
+      "epoch": 0.013441142497112254,
+      "grad_norm": 0.6746403574943542,
+      "learning_rate": 2.6834381551362687e-05,
+      "loss": 0.4013,
+      "step": 64
+    },
+    {
+      "epoch": 0.013651160348629633,
+      "grad_norm": 0.7141286134719849,
+      "learning_rate": 2.7253668763102725e-05,
+      "loss": 0.5162,
+      "step": 65
+    },
+    {
+      "epoch": 0.013861178200147012,
+      "grad_norm": 0.779960572719574,
+      "learning_rate": 2.767295597484277e-05,
+      "loss": 0.4126,
+      "step": 66
+    },
+    {
+      "epoch": 0.014071196051664391,
+      "grad_norm": 0.6626395583152771,
+      "learning_rate": 2.809224318658281e-05,
+      "loss": 0.3925,
+      "step": 67
+    },
+    {
+      "epoch": 0.01428121390318177,
+      "grad_norm": 0.6545393466949463,
+      "learning_rate": 2.851153039832285e-05,
+      "loss": 0.3152,
+      "step": 68
+    },
+    {
+      "epoch": 0.014491231754699149,
+      "grad_norm": 0.7004114389419556,
+      "learning_rate": 2.8930817610062894e-05,
+      "loss": 0.4223,
+      "step": 69
+    },
+    {
+      "epoch": 0.014701249606216528,
+      "grad_norm": 0.6912452578544617,
+      "learning_rate": 2.935010482180294e-05,
+      "loss": 0.3089,
+      "step": 70
+    },
+    {
+      "epoch": 0.014911267457733907,
+      "grad_norm": 0.7729060649871826,
+      "learning_rate": 2.976939203354298e-05,
+      "loss": 0.4045,
+      "step": 71
+    },
+    {
+      "epoch": 0.015121285309251286,
+      "grad_norm": 0.7606898546218872,
+      "learning_rate": 3.018867924528302e-05,
+      "loss": 0.3079,
+      "step": 72
+    },
+    {
+      "epoch": 0.015331303160768665,
+      "grad_norm": 0.6202028393745422,
+      "learning_rate": 3.060796645702306e-05,
+      "loss": 0.3833,
+      "step": 73
+    },
+    {
+      "epoch": 0.015541321012286044,
+      "grad_norm": 0.6014758348464966,
+      "learning_rate": 3.1027253668763105e-05,
+      "loss": 0.3815,
+      "step": 74
+    },
+    {
+      "epoch": 0.015751338863803425,
+      "grad_norm": 0.6792122721672058,
+      "learning_rate": 3.144654088050314e-05,
+      "loss": 0.3383,
+      "step": 75
+    },
+    {
+      "epoch": 0.015961356715320802,
+      "grad_norm": 0.7135879993438721,
+      "learning_rate": 3.186582809224319e-05,
+      "loss": 0.3744,
+      "step": 76
+    },
+    {
+      "epoch": 0.016171374566838183,
+      "grad_norm": 0.6972818374633789,
+      "learning_rate": 3.228511530398323e-05,
+      "loss": 0.3256,
+      "step": 77
+    },
+    {
+      "epoch": 0.01638139241835556,
+      "grad_norm": 0.5925168395042419,
+      "learning_rate": 3.270440251572327e-05,
+      "loss": 0.3309,
+      "step": 78
+    },
+    {
+      "epoch": 0.01659141026987294,
+      "grad_norm": 0.7750416994094849,
+      "learning_rate": 3.3123689727463316e-05,
+      "loss": 0.4142,
+      "step": 79
+    },
+    {
+      "epoch": 0.016801428121390318,
+      "grad_norm": 0.7466484904289246,
+      "learning_rate": 3.354297693920336e-05,
+      "loss": 0.2527,
+      "step": 80
+    },
+    {
+      "epoch": 0.0170114459729077,
+      "grad_norm": 0.7709718942642212,
+      "learning_rate": 3.39622641509434e-05,
+      "loss": 0.3717,
+      "step": 81
+    },
+    {
+      "epoch": 0.017221463824425076,
+      "grad_norm": 0.6134454607963562,
+      "learning_rate": 3.438155136268344e-05,
+      "loss": 0.2969,
+      "step": 82
+    },
+    {
+      "epoch": 0.017431481675942453,
+      "grad_norm": 0.6442283391952515,
+      "learning_rate": 3.480083857442348e-05,
+      "loss": 0.3009,
+      "step": 83
+    },
+    {
+      "epoch": 0.017641499527459834,
+      "grad_norm": 0.6788150072097778,
+      "learning_rate": 3.522012578616352e-05,
+      "loss": 0.309,
+      "step": 84
+    },
+    {
+      "epoch": 0.01785151737897721,
+      "grad_norm": 0.7172322869300842,
+      "learning_rate": 3.5639412997903565e-05,
+      "loss": 0.3602,
+      "step": 85
+    },
+    {
+      "epoch": 0.018061535230494592,
+      "grad_norm": 0.7475742697715759,
+      "learning_rate": 3.605870020964361e-05,
+      "loss": 0.1889,
+      "step": 86
+    },
+    {
+      "epoch": 0.01827155308201197,
+      "grad_norm": 0.7164073586463928,
+      "learning_rate": 3.647798742138365e-05,
+      "loss": 0.2062,
+      "step": 87
+    },
+    {
+      "epoch": 0.01848157093352935,
+      "grad_norm": 0.7514247298240662,
+      "learning_rate": 3.689727463312369e-05,
+      "loss": 0.2426,
+      "step": 88
+    },
+    {
+      "epoch": 0.018691588785046728,
+      "grad_norm": 0.8898234963417053,
+      "learning_rate": 3.731656184486374e-05,
+      "loss": 0.3759,
+      "step": 89
+    },
+    {
+      "epoch": 0.01890160663656411,
+      "grad_norm": 0.8034729361534119,
+      "learning_rate": 3.7735849056603776e-05,
+      "loss": 0.2547,
+      "step": 90
+    },
+    {
+      "epoch": 0.019111624488081486,
+      "grad_norm": 0.771716296672821,
+      "learning_rate": 3.8155136268343814e-05,
+      "loss": 0.2125,
+      "step": 91
+    },
+    {
+      "epoch": 0.019321642339598866,
+      "grad_norm": 0.811174213886261,
+      "learning_rate": 3.857442348008386e-05,
+      "loss": 0.3535,
+      "step": 92
+    },
+    {
+      "epoch": 0.019531660191116244,
+      "grad_norm": 1.0474952459335327,
+      "learning_rate": 3.8993710691823904e-05,
+      "loss": 0.3278,
+      "step": 93
+    },
+    {
+      "epoch": 0.019741678042633624,
+      "grad_norm": 0.752088725566864,
+      "learning_rate": 3.941299790356394e-05,
+      "loss": 0.2574,
+      "step": 94
+    },
+    {
+      "epoch": 0.019951695894151,
+      "grad_norm": 0.9202740788459778,
+      "learning_rate": 3.983228511530399e-05,
+      "loss": 0.2618,
+      "step": 95
+    },
+    {
+      "epoch": 0.020161713745668382,
+      "grad_norm": 0.663686990737915,
+      "learning_rate": 4.025157232704403e-05,
+      "loss": 0.1981,
+      "step": 96
+    },
+    {
+      "epoch": 0.02037173159718576,
+      "grad_norm": 0.7075244784355164,
+      "learning_rate": 4.067085953878407e-05,
+      "loss": 0.195,
+      "step": 97
+    },
+    {
+      "epoch": 0.02058174944870314,
+      "grad_norm": 0.8226995468139648,
+      "learning_rate": 4.109014675052411e-05,
+      "loss": 0.3464,
+      "step": 98
+    },
+    {
+      "epoch": 0.020791767300220518,
+      "grad_norm": 0.826926589012146,
+      "learning_rate": 4.150943396226415e-05,
+      "loss": 0.241,
+      "step": 99
+    },
+    {
+      "epoch": 0.0210017851517379,
+      "grad_norm": 0.8767513632774353,
+      "learning_rate": 4.192872117400419e-05,
+      "loss": 0.33,
+      "step": 100
+    },
+    {
+      "epoch": 0.021211803003255276,
+      "grad_norm": 0.9166819453239441,
+      "learning_rate": 4.2348008385744236e-05,
+      "loss": 0.3528,
+      "step": 101
+    },
+    {
+      "epoch": 0.021421820854772657,
+      "grad_norm": 0.6607112288475037,
+      "learning_rate": 4.276729559748428e-05,
+      "loss": 0.3294,
+      "step": 102
+    },
+    {
+      "epoch": 0.021631838706290034,
+      "grad_norm": 0.5891725420951843,
+      "learning_rate": 4.318658280922432e-05,
+      "loss": 0.2523,
+      "step": 103
+    },
+    {
+      "epoch": 0.021841856557807415,
+      "grad_norm": 0.5484351515769958,
+      "learning_rate": 4.3605870020964364e-05,
+      "loss": 0.3563,
+      "step": 104
+    },
+    {
+      "epoch": 0.022051874409324792,
+      "grad_norm": 0.6384206414222717,
+      "learning_rate": 4.402515723270441e-05,
+      "loss": 0.5014,
+      "step": 105
+    },
+    {
+      "epoch": 0.022261892260842173,
+      "grad_norm": 0.6228074431419373,
+      "learning_rate": 4.4444444444444447e-05,
+      "loss": 0.297,
+      "step": 106
+    },
+    {
+      "epoch": 0.02247191011235955,
+      "grad_norm": 0.6993734240531921,
+      "learning_rate": 4.4863731656184485e-05,
+      "loss": 0.3416,
+      "step": 107
+    },
+    {
+      "epoch": 0.02268192796387693,
+      "grad_norm": 0.5191211104393005,
+      "learning_rate": 4.528301886792453e-05,
+      "loss": 0.2403,
+      "step": 108
+    },
+    {
+      "epoch": 0.022891945815394308,
+      "grad_norm": 0.5719013214111328,
+      "learning_rate": 4.570230607966457e-05,
+      "loss": 0.2226,
+      "step": 109
+    },
+    {
+      "epoch": 0.02310196366691169,
+      "grad_norm": 0.5222904682159424,
+      "learning_rate": 4.612159329140461e-05,
+      "loss": 0.2119,
+      "step": 110
+    },
+    {
+      "epoch": 0.023311981518429066,
+      "grad_norm": 0.4741697609424591,
+      "learning_rate": 4.654088050314466e-05,
+      "loss": 0.2076,
+      "step": 111
+    },
+    {
+      "epoch": 0.023521999369946447,
+      "grad_norm": 0.5350250005722046,
+      "learning_rate": 4.69601677148847e-05,
+      "loss": 0.2342,
+      "step": 112
+    },
+    {
+      "epoch": 0.023732017221463824,
+      "grad_norm": 0.6532084345817566,
+      "learning_rate": 4.737945492662474e-05,
+      "loss": 0.3541,
+      "step": 113
+    },
+    {
+      "epoch": 0.023942035072981205,
+      "grad_norm": 0.6158542633056641,
+      "learning_rate": 4.7798742138364785e-05,
+      "loss": 0.2586,
+      "step": 114
+    },
+    {
+      "epoch": 0.024152052924498582,
+      "grad_norm": 0.7820281982421875,
+      "learning_rate": 4.8218029350104823e-05,
+      "loss": 0.472,
+      "step": 115
+    },
+    {
+      "epoch": 0.024362070776015963,
+      "grad_norm": 0.6096176505088806,
+      "learning_rate": 4.863731656184486e-05,
+      "loss": 0.3014,
+      "step": 116
+    },
+    {
+      "epoch": 0.02457208862753334,
+      "grad_norm": 0.5152641534805298,
+      "learning_rate": 4.9056603773584906e-05,
+      "loss": 0.2032,
+      "step": 117
+    },
+    {
+      "epoch": 0.02478210647905072,
+      "grad_norm": 0.6049755215644836,
+      "learning_rate": 4.947589098532495e-05,
+      "loss": 0.2932,
+      "step": 118
+    },
+    {
+      "epoch": 0.024992124330568098,
+      "grad_norm": 0.5203216075897217,
+      "learning_rate": 4.989517819706499e-05,
+      "loss": 0.1978,
+      "step": 119
+    },
+    {
+      "epoch": 0.02520214218208548,
+      "grad_norm": 0.6881438493728638,
+      "learning_rate": 5.0314465408805034e-05,
+      "loss": 0.3873,
+      "step": 120
+    },
+    {
+      "epoch": 0.025412160033602856,
+      "grad_norm": 0.602206289768219,
+      "learning_rate": 5.073375262054507e-05,
+      "loss": 0.2508,
+      "step": 121
+    },
+    {
+      "epoch": 0.025622177885120234,
+      "grad_norm": 0.7059246897697449,
+      "learning_rate": 5.1153039832285124e-05,
+      "loss": 0.2661,
+      "step": 122
+    },
+    {
+      "epoch": 0.025832195736637614,
+      "grad_norm": 0.44054412841796875,
+      "learning_rate": 5.157232704402516e-05,
+      "loss": 0.177,
+      "step": 123
+    },
+    {
+      "epoch": 0.02604221358815499,
+      "grad_norm": 0.6321287155151367,
+      "learning_rate": 5.19916142557652e-05,
+      "loss": 0.2849,
+      "step": 124
+    },
+    {
+      "epoch": 0.026252231439672372,
+      "grad_norm": 0.7430282235145569,
+      "learning_rate": 5.2410901467505245e-05,
+      "loss": 0.3353,
+      "step": 125
+    },
+    {
+      "epoch": 0.02646224929118975,
+      "grad_norm": 0.6884610056877136,
+      "learning_rate": 5.283018867924528e-05,
+      "loss": 0.2935,
+      "step": 126
+    },
+    {
+      "epoch": 0.02667226714270713,
+      "grad_norm": 0.8021960854530334,
+      "learning_rate": 5.324947589098532e-05,
+      "loss": 0.2609,
+      "step": 127
+    },
+    {
+      "epoch": 0.026882284994224508,
+      "grad_norm": 0.5545848608016968,
+      "learning_rate": 5.366876310272537e-05,
+      "loss": 0.1972,
+      "step": 128
+    },
+    {
+      "epoch": 0.02709230284574189,
+      "grad_norm": 0.6628248691558838,
+      "learning_rate": 5.408805031446541e-05,
+      "loss": 0.2184,
+      "step": 129
+    },
+    {
+      "epoch": 0.027302320697259266,
+      "grad_norm": 0.5908805131912231,
+      "learning_rate": 5.450733752620545e-05,
+      "loss": 0.2029,
+      "step": 130
+    },
+    {
+      "epoch": 0.027512338548776646,
+      "grad_norm": 0.6377450823783875,
+      "learning_rate": 5.49266247379455e-05,
+      "loss": 0.2382,
+      "step": 131
+    },
+    {
+      "epoch": 0.027722356400294024,
+      "grad_norm": 0.7006211876869202,
+      "learning_rate": 5.534591194968554e-05,
+      "loss": 0.1936,
+      "step": 132
+    },
+    {
+      "epoch": 0.027932374251811404,
+      "grad_norm": 0.5962005257606506,
+      "learning_rate": 5.576519916142558e-05,
+      "loss": 0.2922,
+      "step": 133
+    },
+    {
+      "epoch": 0.028142392103328782,
+      "grad_norm": 0.6030206084251404,
+      "learning_rate": 5.618448637316562e-05,
+      "loss": 0.1629,
+      "step": 134
+    },
+    {
+      "epoch": 0.028352409954846162,
+      "grad_norm": 0.7888013124465942,
+      "learning_rate": 5.660377358490566e-05,
+      "loss": 0.2866,
+      "step": 135
+    },
+    {
+      "epoch": 0.02856242780636354,
+      "grad_norm": 0.5116386413574219,
+      "learning_rate": 5.70230607966457e-05,
+      "loss": 0.1963,
+      "step": 136
+    },
+    {
+      "epoch": 0.02877244565788092,
+      "grad_norm": 0.6759427785873413,
+      "learning_rate": 5.744234800838575e-05,
+      "loss": 0.2412,
+      "step": 137
+    },
+    {
+      "epoch": 0.028982463509398298,
+      "grad_norm": 0.8643584847450256,
+      "learning_rate": 5.786163522012579e-05,
+      "loss": 0.2277,
+      "step": 138
+    },
+    {
+      "epoch": 0.02919248136091568,
+      "grad_norm": 0.639639139175415,
+      "learning_rate": 5.8280922431865826e-05,
+      "loss": 0.2286,
+      "step": 139
+    },
+    {
+      "epoch": 0.029402499212433056,
+      "grad_norm": 0.6094908714294434,
+      "learning_rate": 5.870020964360588e-05,
+      "loss": 0.1656,
+      "step": 140
+    },
+    {
+      "epoch": 0.029612517063950437,
+      "grad_norm": 0.7927185297012329,
+      "learning_rate": 5.9119496855345916e-05,
+      "loss": 0.2436,
+      "step": 141
+    },
+    {
+      "epoch": 0.029822534915467814,
+      "grad_norm": 0.8780869841575623,
+      "learning_rate": 5.953878406708596e-05,
+      "loss": 0.2614,
+      "step": 142
+    },
+    {
+      "epoch": 0.030032552766985195,
+      "grad_norm": 0.5985304117202759,
+      "learning_rate": 5.9958071278826e-05,
+      "loss": 0.2268,
+      "step": 143
+    },
+    {
+      "epoch": 0.030242570618502572,
+      "grad_norm": 0.6452706456184387,
+      "learning_rate": 6.037735849056604e-05,
+      "loss": 0.211,
+      "step": 144
+    },
+    {
+      "epoch": 0.030452588470019953,
+      "grad_norm": 0.8015931844711304,
+      "learning_rate": 6.079664570230609e-05,
+      "loss": 0.3532,
+      "step": 145
+    },
+    {
+      "epoch": 0.03066260632153733,
+      "grad_norm": 0.667226254940033,
+      "learning_rate": 6.121593291404612e-05,
+      "loss": 0.2051,
+      "step": 146
+    },
+    {
+      "epoch": 0.03087262417305471,
+      "grad_norm": 0.6942270398139954,
+      "learning_rate": 6.163522012578616e-05,
+      "loss": 0.2516,
+      "step": 147
+    },
+    {
+      "epoch": 0.031082642024572088,
+      "grad_norm": 0.845588743686676,
+      "learning_rate": 6.205450733752621e-05,
+      "loss": 0.257,
+      "step": 148
+    },
+    {
+      "epoch": 0.031292659876089465,
+      "grad_norm": 0.6104562878608704,
+      "learning_rate": 6.247379454926625e-05,
+      "loss": 0.246,
+      "step": 149
+    },
+    {
+      "epoch": 0.03150267772760685,
+      "grad_norm": 0.7243993282318115,
+      "learning_rate": 6.289308176100629e-05,
+      "loss": 0.2623,
+      "step": 150
+    },
+    {
+      "epoch": 0.03171269557912423,
+      "grad_norm": 0.6479102373123169,
+      "learning_rate": 6.331236897274634e-05,
+      "loss": 0.3154,
+      "step": 151
+    },
+    {
+      "epoch": 0.031922713430641604,
+      "grad_norm": 0.6088507175445557,
+      "learning_rate": 6.373165618448638e-05,
+      "loss": 0.3639,
+      "step": 152
+    },
+    {
+      "epoch": 0.03213273128215898,
+      "grad_norm": 0.5590083599090576,
+      "learning_rate": 6.415094339622641e-05,
+      "loss": 0.2753,
+      "step": 153
+    },
+    {
+      "epoch": 0.032342749133676366,
+      "grad_norm": 0.6644802093505859,
+      "learning_rate": 6.457023060796647e-05,
+      "loss": 0.2722,
+      "step": 154
+    },
+    {
+      "epoch": 0.03255276698519374,
+      "grad_norm": 0.6034846901893616,
+      "learning_rate": 6.49895178197065e-05,
+      "loss": 0.3084,
+      "step": 155
+    },
+    {
+      "epoch": 0.03276278483671112,
+      "grad_norm": 0.897366464138031,
+      "learning_rate": 6.540880503144654e-05,
+      "loss": 0.2834,
+      "step": 156
+    },
+    {
+      "epoch": 0.0329728026882285,
+      "grad_norm": 0.7516223788261414,
+      "learning_rate": 6.58280922431866e-05,
+      "loss": 0.2515,
+      "step": 157
+    },
+    {
+      "epoch": 0.03318282053974588,
+      "grad_norm": 0.712957501411438,
+      "learning_rate": 6.624737945492663e-05,
+      "loss": 0.217,
+      "step": 158
+    },
+    {
+      "epoch": 0.03339283839126326,
+      "grad_norm": 0.6373322010040283,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 0.3091,
+      "step": 159
+    },
+    {
+      "epoch": 0.033602856242780636,
+      "grad_norm": 0.6305301189422607,
+      "learning_rate": 6.708595387840672e-05,
+      "loss": 0.1965,
+      "step": 160
+    },
+    {
+      "epoch": 0.033812874094298014,
+      "grad_norm": 0.6340491771697998,
+      "learning_rate": 6.750524109014676e-05,
+      "loss": 0.2316,
+      "step": 161
+    },
+    {
+      "epoch": 0.0340228919458154,
+      "grad_norm": 0.6992335915565491,
+      "learning_rate": 6.79245283018868e-05,
+      "loss": 0.321,
+      "step": 162
+    },
+    {
+      "epoch": 0.034232909797332775,
+      "grad_norm": 0.723899245262146,
+      "learning_rate": 6.834381551362684e-05,
+      "loss": 0.2057,
+      "step": 163
+    },
+    {
+      "epoch": 0.03444292764885015,
+      "grad_norm": 0.6245738863945007,
+      "learning_rate": 6.876310272536687e-05,
+      "loss": 0.2367,
+      "step": 164
+    },
+    {
+      "epoch": 0.03465294550036753,
+      "grad_norm": 0.716299295425415,
+      "learning_rate": 6.918238993710691e-05,
+      "loss": 0.3107,
+      "step": 165
+    },
+    {
+      "epoch": 0.03486296335188491,
+      "grad_norm": 0.8374738097190857,
+      "learning_rate": 6.960167714884696e-05,
+      "loss": 0.4097,
+      "step": 166
+    },
+    {
+      "epoch": 0.03507298120340229,
+      "grad_norm": 0.7812545299530029,
+      "learning_rate": 7.0020964360587e-05,
+      "loss": 0.389,
+      "step": 167
+    },
+    {
+      "epoch": 0.03528299905491967,
+      "grad_norm": 0.516504168510437,
+      "learning_rate": 7.044025157232704e-05,
+      "loss": 0.2321,
+      "step": 168
+    },
+    {
+      "epoch": 0.035493016906437046,
+      "grad_norm": 0.5948511958122253,
+      "learning_rate": 7.085953878406709e-05,
+      "loss": 0.2075,
+      "step": 169
+    },
+    {
+      "epoch": 0.03570303475795442,
+      "grad_norm": 0.5658239126205444,
+      "learning_rate": 7.127882599580713e-05,
+      "loss": 0.2366,
+      "step": 170
+    },
+    {
+      "epoch": 0.03591305260947181,
+      "grad_norm": 0.44888898730278015,
+      "learning_rate": 7.169811320754717e-05,
+      "loss": 0.226,
+      "step": 171
+    },
+    {
+      "epoch": 0.036123070460989185,
+      "grad_norm": 0.5403774380683899,
+      "learning_rate": 7.211740041928722e-05,
+      "loss": 0.2887,
+      "step": 172
+    },
+    {
+      "epoch": 0.03633308831250656,
+      "grad_norm": 0.5742720365524292,
+      "learning_rate": 7.253668763102726e-05,
+      "loss": 0.1841,
+      "step": 173
+    },
+    {
+      "epoch": 0.03654310616402394,
+      "grad_norm": 0.7217287421226501,
+      "learning_rate": 7.29559748427673e-05,
+      "loss": 0.283,
+      "step": 174
+    },
+    {
+      "epoch": 0.03675312401554132,
+      "grad_norm": 0.6517660021781921,
+      "learning_rate": 7.337526205450735e-05,
+      "loss": 0.277,
+      "step": 175
+    },
+    {
+      "epoch": 0.0369631418670587,
+      "grad_norm": 0.5237565040588379,
+      "learning_rate": 7.379454926624739e-05,
+      "loss": 0.2764,
+      "step": 176
+    },
+    {
+      "epoch": 0.03717315971857608,
+      "grad_norm": 0.5715314745903015,
+      "learning_rate": 7.421383647798742e-05,
+      "loss": 0.2947,
+      "step": 177
+    },
+    {
+      "epoch": 0.037383177570093455,
+      "grad_norm": 0.39689743518829346,
+      "learning_rate": 7.463312368972748e-05,
+      "loss": 0.1478,
+      "step": 178
+    },
+    {
+      "epoch": 0.03759319542161084,
+      "grad_norm": 0.62773197889328,
+      "learning_rate": 7.505241090146751e-05,
+      "loss": 0.2688,
+      "step": 179
+    },
+    {
+      "epoch": 0.03780321327312822,
+      "grad_norm": 0.5422549247741699,
+      "learning_rate": 7.547169811320755e-05,
+      "loss": 0.2852,
+      "step": 180
+    },
+    {
+      "epoch": 0.038013231124645594,
+      "grad_norm": 0.7973243594169617,
+      "learning_rate": 7.589098532494759e-05,
+      "loss": 0.2414,
+      "step": 181
+    },
+    {
+      "epoch": 0.03822324897616297,
+      "grad_norm": 0.596788227558136,
+      "learning_rate": 7.631027253668763e-05,
+      "loss": 0.2979,
+      "step": 182
+    },
+    {
+      "epoch": 0.038433266827680355,
+      "grad_norm": 0.7164194583892822,
+      "learning_rate": 7.672955974842768e-05,
+      "loss": 0.3195,
+      "step": 183
+    },
+    {
+      "epoch": 0.03864328467919773,
+      "grad_norm": 0.6374505758285522,
+      "learning_rate": 7.714884696016772e-05,
+      "loss": 0.2244,
+      "step": 184
+    },
+    {
+      "epoch": 0.03885330253071511,
+      "grad_norm": 0.7066443562507629,
+      "learning_rate": 7.756813417190776e-05,
+      "loss": 0.3328,
+      "step": 185
+    },
+    {
+      "epoch": 0.03906332038223249,
+      "grad_norm": 0.5930470824241638,
+      "learning_rate": 7.798742138364781e-05,
+      "loss": 0.208,
+      "step": 186
+    },
+    {
+      "epoch": 0.03927333823374987,
+      "grad_norm": 0.7578011155128479,
+      "learning_rate": 7.840670859538785e-05,
+      "loss": 0.3468,
+      "step": 187
+    },
+    {
+      "epoch": 0.03948335608526725,
+      "grad_norm": 0.5424745678901672,
+      "learning_rate": 7.882599580712788e-05,
+      "loss": 0.159,
+      "step": 188
+    },
+    {
+      "epoch": 0.039693373936784626,
+      "grad_norm": 0.6554968953132629,
+      "learning_rate": 7.924528301886794e-05,
+      "loss": 0.1718,
+      "step": 189
+    },
+    {
+      "epoch": 0.039903391788302,
+      "grad_norm": 0.596862256526947,
+      "learning_rate": 7.966457023060797e-05,
+      "loss": 0.2238,
+      "step": 190
+    },
+    {
+      "epoch": 0.04011340963981939,
+      "grad_norm": 0.7238299250602722,
+      "learning_rate": 8.008385744234801e-05,
+      "loss": 0.2322,
+      "step": 191
+    },
+    {
+      "epoch": 0.040323427491336765,
+      "grad_norm": 0.6230559349060059,
+      "learning_rate": 8.050314465408806e-05,
+      "loss": 0.166,
+      "step": 192
+    },
+    {
+      "epoch": 0.04053344534285414,
+      "grad_norm": 0.63409823179245,
+      "learning_rate": 8.09224318658281e-05,
+      "loss": 0.2431,
+      "step": 193
+    },
+    {
+      "epoch": 0.04074346319437152,
+      "grad_norm": 0.43581536412239075,
+      "learning_rate": 8.134171907756814e-05,
+      "loss": 0.1758,
+      "step": 194
+    },
+    {
+      "epoch": 0.040953481045888904,
+      "grad_norm": 0.5425090789794922,
+      "learning_rate": 8.176100628930818e-05,
+      "loss": 0.1214,
+      "step": 195
+    },
+    {
+      "epoch": 0.04116349889740628,
+      "grad_norm": 0.44201138615608215,
+      "learning_rate": 8.218029350104822e-05,
+      "loss": 0.148,
+      "step": 196
+    },
+    {
+      "epoch": 0.04137351674892366,
+      "grad_norm": 0.8185025453567505,
+      "learning_rate": 8.259958071278825e-05,
+      "loss": 0.2788,
+      "step": 197
+    },
+    {
+      "epoch": 0.041583534600441036,
+      "grad_norm": 0.5838762521743774,
+      "learning_rate": 8.30188679245283e-05,
+      "loss": 0.2547,
+      "step": 198
+    },
+    {
+      "epoch": 0.04179355245195842,
+      "grad_norm": 0.6128750443458557,
+      "learning_rate": 8.343815513626834e-05,
+      "loss": 0.2107,
+      "step": 199
+    },
+    {
+      "epoch": 0.0420035703034758,
+      "grad_norm": 0.6906862854957581,
+      "learning_rate": 8.385744234800838e-05,
+      "loss": 0.2066,
+      "step": 200
+    },
+    {
+      "epoch": 0.042213588154993174,
+      "grad_norm": 0.46028971672058105,
+      "learning_rate": 8.427672955974843e-05,
+      "loss": 0.3119,
+      "step": 201
+    },
+    {
+      "epoch": 0.04242360600651055,
+      "grad_norm": 0.45515576004981995,
+      "learning_rate": 8.469601677148847e-05,
+      "loss": 0.3191,
+      "step": 202
+    },
+    {
+      "epoch": 0.04263362385802793,
+      "grad_norm": 0.5129204392433167,
+      "learning_rate": 8.511530398322851e-05,
+      "loss": 0.2978,
+      "step": 203
+    },
+    {
+      "epoch": 0.04284364170954531,
+      "grad_norm": 0.5413036942481995,
+      "learning_rate": 8.553459119496856e-05,
+      "loss": 0.2489,
+      "step": 204
+    },
+    {
+      "epoch": 0.04305365956106269,
+      "grad_norm": 0.5111532211303711,
+      "learning_rate": 8.59538784067086e-05,
+      "loss": 0.2322,
+      "step": 205
+    },
+    {
+      "epoch": 0.04326367741258007,
+      "grad_norm": 0.50229412317276,
+      "learning_rate": 8.637316561844864e-05,
+      "loss": 0.2858,
+      "step": 206
+    },
+    {
+      "epoch": 0.043473695264097445,
+      "grad_norm": 0.5821024179458618,
+      "learning_rate": 8.679245283018869e-05,
+      "loss": 0.3118,
+      "step": 207
+    },
+    {
+      "epoch": 0.04368371311561483,
+      "grad_norm": 0.5523480176925659,
+      "learning_rate": 8.721174004192873e-05,
+      "loss": 0.3757,
+      "step": 208
+    },
+    {
+      "epoch": 0.043893730967132207,
+      "grad_norm": 0.721248209476471,
+      "learning_rate": 8.763102725366877e-05,
+      "loss": 0.3196,
+      "step": 209
+    },
+    {
+      "epoch": 0.044103748818649584,
+      "grad_norm": 0.47591283917427063,
+      "learning_rate": 8.805031446540882e-05,
+      "loss": 0.233,
+      "step": 210
+    },
+    {
+      "epoch": 0.04431376667016696,
+      "grad_norm": 0.5817727446556091,
+      "learning_rate": 8.846960167714886e-05,
+      "loss": 0.2924,
+      "step": 211
+    },
+    {
+      "epoch": 0.044523784521684345,
+      "grad_norm": 0.5370981097221375,
+      "learning_rate": 8.888888888888889e-05,
+      "loss": 0.2487,
+      "step": 212
+    },
+    {
+      "epoch": 0.04473380237320172,
+      "grad_norm": 0.47605571150779724,
+      "learning_rate": 8.930817610062893e-05,
+      "loss": 0.2249,
+      "step": 213
+    },
+    {
+      "epoch": 0.0449438202247191,
+      "grad_norm": 0.6315035223960876,
+      "learning_rate": 8.972746331236897e-05,
+      "loss": 0.2805,
+      "step": 214
+    },
+    {
+      "epoch": 0.04515383807623648,
+      "grad_norm": 0.7511045932769775,
+      "learning_rate": 9.014675052410901e-05,
+      "loss": 0.2235,
+      "step": 215
+    },
+    {
+      "epoch": 0.04536385592775386,
+      "grad_norm": 0.6990196704864502,
+      "learning_rate": 9.056603773584906e-05,
+      "loss": 0.2548,
+      "step": 216
+    },
+    {
+      "epoch": 0.04557387377927124,
+      "grad_norm": 0.6954676508903503,
+      "learning_rate": 9.09853249475891e-05,
+      "loss": 0.2232,
+      "step": 217
+    },
+    {
+      "epoch": 0.045783891630788616,
+      "grad_norm": 0.6898313164710999,
+      "learning_rate": 9.140461215932914e-05,
+      "loss": 0.2241,
+      "step": 218
+    },
+    {
+      "epoch": 0.04599390948230599,
+      "grad_norm": 0.649360716342926,
+      "learning_rate": 9.182389937106919e-05,
+      "loss": 0.3236,
+      "step": 219
+    },
+    {
+      "epoch": 0.04620392733382338,
+      "grad_norm": 0.6722248792648315,
+      "learning_rate": 9.224318658280923e-05,
+      "loss": 0.4025,
+      "step": 220
+    },
+    {
+      "epoch": 0.046413945185340755,
+      "grad_norm": 0.8752652406692505,
+      "learning_rate": 9.266247379454928e-05,
+      "loss": 0.2937,
+      "step": 221
+    },
+    {
+      "epoch": 0.04662396303685813,
+      "grad_norm": 0.6925809979438782,
+      "learning_rate": 9.308176100628931e-05,
+      "loss": 0.2719,
+      "step": 222
+    },
+    {
+      "epoch": 0.04683398088837551,
+      "grad_norm": 0.6006962656974792,
+      "learning_rate": 9.350104821802935e-05,
+      "loss": 0.3025,
+      "step": 223
+    },
+    {
+      "epoch": 0.047043998739892894,
+      "grad_norm": 0.7841734290122986,
+      "learning_rate": 9.39203354297694e-05,
+      "loss": 0.267,
+      "step": 224
+    },
+    {
+      "epoch": 0.04725401659141027,
+      "grad_norm": 0.6895375847816467,
+      "learning_rate": 9.433962264150944e-05,
+      "loss": 0.185,
+      "step": 225
+    },
+    {
+      "epoch": 0.04746403444292765,
+      "grad_norm": 0.4874520003795624,
+      "learning_rate": 9.475890985324948e-05,
+      "loss": 0.1967,
+      "step": 226
+    },
+    {
+      "epoch": 0.047674052294445025,
+      "grad_norm": 0.5807353258132935,
+      "learning_rate": 9.517819706498953e-05,
+      "loss": 0.2649,
+      "step": 227
+    },
+    {
+      "epoch": 0.04788407014596241,
+      "grad_norm": 0.5833479166030884,
+      "learning_rate": 9.559748427672957e-05,
+      "loss": 0.2506,
+      "step": 228
+    },
+    {
+      "epoch": 0.04809408799747979,
+      "grad_norm": 0.5458546280860901,
+      "learning_rate": 9.601677148846961e-05,
+      "loss": 0.2338,
+      "step": 229
+    },
+    {
+      "epoch": 0.048304105848997164,
+      "grad_norm": 0.543692409992218,
+      "learning_rate": 9.643605870020965e-05,
+      "loss": 0.2058,
+      "step": 230
+    },
+    {
+      "epoch": 0.04851412370051454,
+      "grad_norm": 0.8071588277816772,
+      "learning_rate": 9.685534591194969e-05,
+      "loss": 0.2658,
+      "step": 231
+    },
+    {
+      "epoch": 0.048724141552031926,
+      "grad_norm": 0.6677277088165283,
+      "learning_rate": 9.727463312368972e-05,
+      "loss": 0.3603,
+      "step": 232
+    },
+    {
+      "epoch": 0.0489341594035493,
+      "grad_norm": 0.6699347496032715,
+      "learning_rate": 9.769392033542977e-05,
+      "loss": 0.2474,
+      "step": 233
+    },
+    {
+      "epoch": 0.04914417725506668,
+      "grad_norm": 0.47732773423194885,
+      "learning_rate": 9.811320754716981e-05,
+      "loss": 0.1856,
+      "step": 234
+    },
+    {
+      "epoch": 0.04935419510658406,
+      "grad_norm": 0.738476037979126,
+      "learning_rate": 9.853249475890985e-05,
+      "loss": 0.2443,
+      "step": 235
+    },
+    {
+      "epoch": 0.04956421295810144,
+      "grad_norm": 0.6604114174842834,
+      "learning_rate": 9.89517819706499e-05,
+      "loss": 0.2656,
+      "step": 236
+    },
+    {
+      "epoch": 0.04977423080961882,
+      "grad_norm": 0.6403035521507263,
+      "learning_rate": 9.937106918238994e-05,
+      "loss": 0.2248,
+      "step": 237
+    },
+    {
+      "epoch": 0.049984248661136196,
+      "grad_norm": 0.7960561513900757,
+      "learning_rate": 9.979035639412998e-05,
+      "loss": 0.5721,
+      "step": 238
+    },
+    {
+      "epoch": 0.050194266512653574,
+      "grad_norm": 0.7372507452964783,
+      "learning_rate": 0.00010020964360587002,
+      "loss": 0.263,
+      "step": 239
+    },
+    {
+      "epoch": 0.05040428436417096,
+      "grad_norm": 0.5040899515151978,
+      "learning_rate": 0.00010062893081761007,
+      "loss": 0.2005,
+      "step": 240
+    },
+    {
+      "epoch": 0.050614302215688335,
+      "grad_norm": 0.5214698314666748,
+      "learning_rate": 0.00010104821802935012,
+      "loss": 0.1686,
+      "step": 241
+    },
+    {
+      "epoch": 0.05082432006720571,
+      "grad_norm": 0.5759347677230835,
+      "learning_rate": 0.00010146750524109014,
+      "loss": 0.213,
+      "step": 242
+    },
+    {
+      "epoch": 0.05103433791872309,
+      "grad_norm": 0.5980076789855957,
+      "learning_rate": 0.0001018867924528302,
+      "loss": 0.2073,
+      "step": 243
+    },
+    {
+      "epoch": 0.05124435577024047,
+      "grad_norm": 0.4733896851539612,
+      "learning_rate": 0.00010230607966457025,
+      "loss": 0.2275,
+      "step": 244
+    },
+    {
+      "epoch": 0.05145437362175785,
+      "grad_norm": 0.597186803817749,
+      "learning_rate": 0.00010272536687631027,
+      "loss": 0.2205,
+      "step": 245
+    },
+    {
+      "epoch": 0.05166439147327523,
+      "grad_norm": 0.7414665818214417,
+      "learning_rate": 0.00010314465408805032,
+      "loss": 0.2796,
+      "step": 246
+    },
+    {
+      "epoch": 0.051874409324792606,
+      "grad_norm": 0.5712472200393677,
+      "learning_rate": 0.00010356394129979036,
+      "loss": 0.2343,
+      "step": 247
+    },
+    {
+      "epoch": 0.05208442717630998,
+      "grad_norm": 0.7146592140197754,
+      "learning_rate": 0.0001039832285115304,
+      "loss": 0.2339,
+      "step": 248
+    },
+    {
+      "epoch": 0.05229444502782737,
+      "grad_norm": 0.5384387373924255,
+      "learning_rate": 0.00010440251572327044,
+      "loss": 0.3013,
+      "step": 249
+    },
+    {
+      "epoch": 0.052504462879344745,
+      "grad_norm": 0.6523765921592712,
+      "learning_rate": 0.00010482180293501049,
+      "loss": 0.2295,
+      "step": 250
+    },
+    {
+      "epoch": 0.05271448073086212,
+      "grad_norm": 0.5877966284751892,
+      "learning_rate": 0.00010524109014675052,
+      "loss": 0.3045,
+      "step": 251
+    },
+    {
+      "epoch": 0.0529244985823795,
+      "grad_norm": 0.5458659529685974,
+      "learning_rate": 0.00010566037735849057,
+      "loss": 0.3458,
+      "step": 252
+    },
+    {
+      "epoch": 0.05313451643389688,
+      "grad_norm": 0.6964541673660278,
+      "learning_rate": 0.00010607966457023062,
+      "loss": 0.3715,
+      "step": 253
+    },
+    {
+      "epoch": 0.05334453428541426,
+      "grad_norm": 0.4040902853012085,
+      "learning_rate": 0.00010649895178197064,
+      "loss": 0.1554,
+      "step": 254
+    },
+    {
+      "epoch": 0.05355455213693164,
+      "grad_norm": 0.7013939023017883,
+      "learning_rate": 0.0001069182389937107,
+      "loss": 0.4111,
+      "step": 255
+    },
+    {
+      "epoch": 0.053764569988449015,
+      "grad_norm": 0.622279942035675,
+      "learning_rate": 0.00010733752620545075,
+      "loss": 0.2715,
+      "step": 256
+    },
+    {
+      "epoch": 0.0539745878399664,
+      "grad_norm": 0.6869280934333801,
+      "learning_rate": 0.00010775681341719077,
+      "loss": 0.3444,
+      "step": 257
+    },
+    {
+      "epoch": 0.05418460569148378,
+      "grad_norm": 0.6003340482711792,
+      "learning_rate": 0.00010817610062893082,
+      "loss": 0.261,
+      "step": 258
+    },
+    {
+      "epoch": 0.054394623543001154,
+      "grad_norm": 0.4238702356815338,
+      "learning_rate": 0.00010859538784067087,
+      "loss": 0.2615,
+      "step": 259
+    },
+    {
+      "epoch": 0.05460464139451853,
+      "grad_norm": 0.5799490809440613,
+      "learning_rate": 0.0001090146750524109,
+      "loss": 0.3293,
+      "step": 260
+    },
+    {
+      "epoch": 0.054814659246035916,
+      "grad_norm": 0.6378821134567261,
+      "learning_rate": 0.00010943396226415095,
+      "loss": 0.3184,
+      "step": 261
+    },
+    {
+      "epoch": 0.05502467709755329,
+      "grad_norm": 0.540050745010376,
+      "learning_rate": 0.000109853249475891,
+      "loss": 0.262,
+      "step": 262
+    },
+    {
+      "epoch": 0.05523469494907067,
+      "grad_norm": 0.551968514919281,
+      "learning_rate": 0.00011027253668763103,
+      "loss": 0.3183,
+      "step": 263
+    },
+    {
+      "epoch": 0.05544471280058805,
+      "grad_norm": 0.6024414896965027,
+      "learning_rate": 0.00011069182389937108,
+      "loss": 0.2883,
+      "step": 264
+    },
+    {
+      "epoch": 0.05565473065210543,
+      "grad_norm": 0.6821377277374268,
+      "learning_rate": 0.00011111111111111112,
+      "loss": 0.3178,
+      "step": 265
+    },
+    {
+      "epoch": 0.05586474850362281,
+      "grad_norm": 0.6535030603408813,
+      "learning_rate": 0.00011153039832285115,
+      "loss": 0.2091,
+      "step": 266
+    },
+    {
+      "epoch": 0.056074766355140186,
+      "grad_norm": 0.5255767703056335,
+      "learning_rate": 0.00011194968553459119,
+      "loss": 0.1802,
+      "step": 267
+    },
+    {
+      "epoch": 0.056284784206657563,
+      "grad_norm": 0.59894859790802,
+      "learning_rate": 0.00011236897274633124,
+      "loss": 0.2526,
+      "step": 268
+    },
+    {
+      "epoch": 0.05649480205817495,
+      "grad_norm": 0.6401522159576416,
+      "learning_rate": 0.00011278825995807127,
+      "loss": 0.2462,
+      "step": 269
+    },
+    {
+      "epoch": 0.056704819909692325,
+      "grad_norm": 0.6755663752555847,
+      "learning_rate": 0.00011320754716981132,
+      "loss": 0.1973,
+      "step": 270
+    },
+    {
+      "epoch": 0.0569148377612097,
+      "grad_norm": 0.5885902643203735,
+      "learning_rate": 0.00011362683438155137,
+      "loss": 0.2111,
+      "step": 271
+    },
+    {
+      "epoch": 0.05712485561272708,
+      "grad_norm": 0.48771098256111145,
+      "learning_rate": 0.0001140461215932914,
+      "loss": 0.2409,
+      "step": 272
+    },
+    {
+      "epoch": 0.057334873464244464,
+      "grad_norm": 0.5513054132461548,
+      "learning_rate": 0.00011446540880503145,
+      "loss": 0.1883,
+      "step": 273
+    },
+    {
+      "epoch": 0.05754489131576184,
+      "grad_norm": 0.43761372566223145,
+      "learning_rate": 0.0001148846960167715,
+      "loss": 0.228,
+      "step": 274
+    },
+    {
+      "epoch": 0.05775490916727922,
+      "grad_norm": 0.5232881307601929,
+      "learning_rate": 0.00011530398322851152,
+      "loss": 0.1592,
+      "step": 275
+    },
+    {
+      "epoch": 0.057964927018796596,
+      "grad_norm": 0.5873312950134277,
+      "learning_rate": 0.00011572327044025158,
+      "loss": 0.1972,
+      "step": 276
+    },
+    {
+      "epoch": 0.05817494487031398,
+      "grad_norm": 0.5464483499526978,
+      "learning_rate": 0.00011614255765199163,
+      "loss": 0.1502,
+      "step": 277
+    },
+    {
+      "epoch": 0.05838496272183136,
+      "grad_norm": 0.6480989456176758,
+      "learning_rate": 0.00011656184486373165,
+      "loss": 0.2349,
+      "step": 278
+    },
+    {
+      "epoch": 0.058594980573348734,
+      "grad_norm": 0.41417571902275085,
+      "learning_rate": 0.0001169811320754717,
+      "loss": 0.2097,
+      "step": 279
+    },
+    {
+      "epoch": 0.05880499842486611,
+      "grad_norm": 0.8272523880004883,
+      "learning_rate": 0.00011740041928721176,
+      "loss": 0.2676,
+      "step": 280
+    },
+    {
+      "epoch": 0.059015016276383496,
+      "grad_norm": 0.6363915205001831,
+      "learning_rate": 0.0001178197064989518,
+      "loss": 0.3084,
+      "step": 281
+    },
+    {
+      "epoch": 0.05922503412790087,
+      "grad_norm": 0.6411394476890564,
+      "learning_rate": 0.00011823899371069183,
+      "loss": 0.244,
+      "step": 282
+    },
+    {
+      "epoch": 0.05943505197941825,
+      "grad_norm": 0.9145995378494263,
+      "learning_rate": 0.00011865828092243187,
+      "loss": 0.298,
+      "step": 283
+    },
+    {
+      "epoch": 0.05964506983093563,
+      "grad_norm": 0.7248232960700989,
+      "learning_rate": 0.00011907756813417192,
+      "loss": 0.2438,
+      "step": 284
+    },
+    {
+      "epoch": 0.059855087682453005,
+      "grad_norm": 0.4901827573776245,
+      "learning_rate": 0.00011949685534591195,
+      "loss": 0.1903,
+      "step": 285
+    },
+    {
+      "epoch": 0.06006510553397039,
+      "grad_norm": 0.5104687809944153,
+      "learning_rate": 0.000119916142557652,
+      "loss": 0.2014,
+      "step": 286
+    },
+    {
+      "epoch": 0.06027512338548777,
+      "grad_norm": 0.5063393712043762,
+      "learning_rate": 0.00012033542976939205,
+      "loss": 0.212,
+      "step": 287
+    },
+    {
+      "epoch": 0.060485141237005144,
+      "grad_norm": 0.6044209599494934,
+      "learning_rate": 0.00012075471698113207,
+      "loss": 0.3138,
+      "step": 288
+    },
+    {
+      "epoch": 0.06069515908852252,
+      "grad_norm": 0.5843082666397095,
+      "learning_rate": 0.00012117400419287213,
+      "loss": 0.2199,
+      "step": 289
+    },
+    {
+      "epoch": 0.060905176940039905,
+      "grad_norm": 0.4589983820915222,
+      "learning_rate": 0.00012159329140461218,
+      "loss": 0.2222,
+      "step": 290
+    },
+    {
+      "epoch": 0.06111519479155728,
+      "grad_norm": 0.4094448983669281,
+      "learning_rate": 0.0001220125786163522,
+      "loss": 0.1286,
+      "step": 291
+    },
+    {
+      "epoch": 0.06132521264307466,
+      "grad_norm": 0.42624855041503906,
+      "learning_rate": 0.00012243186582809224,
+      "loss": 0.2719,
+      "step": 292
+    },
+    {
+      "epoch": 0.06153523049459204,
+      "grad_norm": 0.5488569736480713,
+      "learning_rate": 0.0001228511530398323,
+      "loss": 0.2588,
+      "step": 293
+    },
+    {
+      "epoch": 0.06174524834610942,
+      "grad_norm": 0.6029438972473145,
+      "learning_rate": 0.00012327044025157232,
+      "loss": 0.3182,
+      "step": 294
+    },
+    {
+      "epoch": 0.0619552661976268,
+      "grad_norm": 0.49090123176574707,
+      "learning_rate": 0.00012368972746331237,
+      "loss": 0.2192,
+      "step": 295
+    },
+    {
+      "epoch": 0.062165284049144176,
+      "grad_norm": 0.7553131580352783,
+      "learning_rate": 0.00012410901467505242,
+      "loss": 0.2932,
+      "step": 296
+    },
+    {
+      "epoch": 0.06237530190066155,
+      "grad_norm": 0.6839373707771301,
+      "learning_rate": 0.00012452830188679244,
+      "loss": 0.1896,
+      "step": 297
+    },
+    {
+      "epoch": 0.06258531975217893,
+      "grad_norm": 0.5805861949920654,
+      "learning_rate": 0.0001249475890985325,
+      "loss": 0.2613,
+      "step": 298
+    },
+    {
+      "epoch": 0.06279533760369631,
+      "grad_norm": 0.4247298836708069,
+      "learning_rate": 0.00012536687631027255,
+      "loss": 0.1701,
+      "step": 299
+    },
+    {
+      "epoch": 0.0630053554552137,
+      "grad_norm": 0.6167422533035278,
+      "learning_rate": 0.00012578616352201257,
+      "loss": 0.2919,
+      "step": 300
+    },
+    {
+      "epoch": 0.06321537330673108,
+      "grad_norm": 0.5140472054481506,
+      "learning_rate": 0.00012620545073375262,
+      "loss": 0.2204,
+      "step": 301
+    },
+    {
+      "epoch": 0.06342539115824845,
+      "grad_norm": 0.48360675573349,
+      "learning_rate": 0.00012662473794549268,
+      "loss": 0.2625,
+      "step": 302
+    },
+    {
+      "epoch": 0.06363540900976583,
+      "grad_norm": 0.5805841684341431,
+      "learning_rate": 0.0001270440251572327,
+      "loss": 0.2659,
+      "step": 303
+    },
+    {
+      "epoch": 0.06384542686128321,
+      "grad_norm": 0.4108704924583435,
+      "learning_rate": 0.00012746331236897275,
+      "loss": 0.1757,
+      "step": 304
+    },
+    {
+      "epoch": 0.06405544471280059,
+      "grad_norm": 0.4739980697631836,
+      "learning_rate": 0.0001278825995807128,
+      "loss": 0.2413,
+      "step": 305
+    },
+    {
+      "epoch": 0.06426546256431796,
+      "grad_norm": 0.6421864032745361,
+      "learning_rate": 0.00012830188679245283,
+      "loss": 0.3373,
+      "step": 306
+    },
+    {
+      "epoch": 0.06447548041583534,
+      "grad_norm": 0.6035056114196777,
+      "learning_rate": 0.00012872117400419288,
+      "loss": 0.1632,
+      "step": 307
+    },
+    {
+      "epoch": 0.06468549826735273,
+      "grad_norm": 0.5946957468986511,
+      "learning_rate": 0.00012914046121593293,
+      "loss": 0.2797,
+      "step": 308
+    },
+    {
+      "epoch": 0.06489551611887011,
+      "grad_norm": 0.5636250972747803,
+      "learning_rate": 0.00012955974842767296,
+      "loss": 0.3484,
+      "step": 309
+    },
+    {
+      "epoch": 0.06510553397038749,
+      "grad_norm": 0.5175902843475342,
+      "learning_rate": 0.000129979035639413,
+      "loss": 0.2306,
+      "step": 310
+    },
+    {
+      "epoch": 0.06531555182190486,
+      "grad_norm": 0.39933711290359497,
+      "learning_rate": 0.00013039832285115306,
+      "loss": 0.2018,
+      "step": 311
+    },
+    {
+      "epoch": 0.06552556967342224,
+      "grad_norm": 0.6203914284706116,
+      "learning_rate": 0.00013081761006289308,
+      "loss": 0.2519,
+      "step": 312
+    },
+    {
+      "epoch": 0.06573558752493962,
+      "grad_norm": 0.6847423911094666,
+      "learning_rate": 0.00013123689727463314,
+      "loss": 0.2125,
+      "step": 313
+    },
+    {
+      "epoch": 0.065945605376457,
+      "grad_norm": 0.5958030223846436,
+      "learning_rate": 0.0001316561844863732,
+      "loss": 0.2019,
+      "step": 314
+    },
+    {
+      "epoch": 0.06615562322797437,
+      "grad_norm": 0.4878827631473541,
+      "learning_rate": 0.0001320754716981132,
+      "loss": 0.2286,
+      "step": 315
+    },
+    {
+      "epoch": 0.06636564107949176,
+      "grad_norm": 0.5386853814125061,
+      "learning_rate": 0.00013249475890985326,
+      "loss": 0.2349,
+      "step": 316
+    },
+    {
+      "epoch": 0.06657565893100914,
+      "grad_norm": 0.5583687424659729,
+      "learning_rate": 0.00013291404612159332,
+      "loss": 0.273,
+      "step": 317
+    },
+    {
+      "epoch": 0.06678567678252652,
+      "grad_norm": 0.503384530544281,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 0.2718,
+      "step": 318
+    },
+    {
+      "epoch": 0.0669956946340439,
+      "grad_norm": 0.6256868839263916,
+      "learning_rate": 0.0001337526205450734,
+      "loss": 0.2176,
+      "step": 319
+    },
+    {
+      "epoch": 0.06720571248556127,
+      "grad_norm": 0.4585525095462799,
+      "learning_rate": 0.00013417190775681344,
+      "loss": 0.1671,
+      "step": 320
+    },
+    {
+      "epoch": 0.06741573033707865,
+      "grad_norm": 0.52493816614151,
+      "learning_rate": 0.00013459119496855347,
+      "loss": 0.2129,
+      "step": 321
+    },
+    {
+      "epoch": 0.06762574818859603,
+      "grad_norm": 0.7206648588180542,
+      "learning_rate": 0.00013501048218029352,
+      "loss": 0.1872,
+      "step": 322
+    },
+    {
+      "epoch": 0.0678357660401134,
+      "grad_norm": 0.5732535123825073,
+      "learning_rate": 0.00013542976939203354,
+      "loss": 0.2131,
+      "step": 323
+    },
+    {
+      "epoch": 0.0680457838916308,
+      "grad_norm": 0.5404482483863831,
+      "learning_rate": 0.0001358490566037736,
+      "loss": 0.1472,
+      "step": 324
+    },
+    {
+      "epoch": 0.06825580174314817,
+      "grad_norm": 0.7235817313194275,
+      "learning_rate": 0.00013626834381551362,
+      "loss": 0.2404,
+      "step": 325
+    },
+    {
+      "epoch": 0.06846581959466555,
+      "grad_norm": 0.4254133999347687,
+      "learning_rate": 0.00013668763102725367,
+      "loss": 0.2133,
+      "step": 326
+    },
+    {
+      "epoch": 0.06867583744618293,
+      "grad_norm": 0.4804741144180298,
+      "learning_rate": 0.0001371069182389937,
+      "loss": 0.1776,
+      "step": 327
+    },
+    {
+      "epoch": 0.0688858552977003,
+      "grad_norm": 0.4900747537612915,
+      "learning_rate": 0.00013752620545073375,
+      "loss": 0.1958,
+      "step": 328
+    },
+    {
+      "epoch": 0.06909587314921768,
+      "grad_norm": 0.576337456703186,
+      "learning_rate": 0.0001379454926624738,
+      "loss": 0.2318,
+      "step": 329
+    },
+    {
+      "epoch": 0.06930589100073506,
+      "grad_norm": 0.5610971450805664,
+      "learning_rate": 0.00013836477987421382,
+      "loss": 0.2631,
+      "step": 330
+    },
+    {
+      "epoch": 0.06951590885225244,
+      "grad_norm": 0.6010019779205322,
+      "learning_rate": 0.00013878406708595388,
+      "loss": 0.2201,
+      "step": 331
+    },
+    {
+      "epoch": 0.06972592670376981,
+      "grad_norm": 0.4658229947090149,
+      "learning_rate": 0.00013920335429769393,
+      "loss": 0.1602,
+      "step": 332
+    },
+    {
+      "epoch": 0.0699359445552872,
+      "grad_norm": 0.5411532521247864,
+      "learning_rate": 0.00013962264150943395,
+      "loss": 0.1942,
+      "step": 333
+    },
+    {
+      "epoch": 0.07014596240680458,
+      "grad_norm": 0.875629186630249,
+      "learning_rate": 0.000140041928721174,
+      "loss": 0.2337,
+      "step": 334
+    },
+    {
+      "epoch": 0.07035598025832196,
+      "grad_norm": 0.5620985627174377,
+      "learning_rate": 0.00014046121593291406,
+      "loss": 0.2641,
+      "step": 335
+    },
+    {
+      "epoch": 0.07056599810983934,
+      "grad_norm": 0.8389297723770142,
+      "learning_rate": 0.00014088050314465408,
+      "loss": 0.3069,
+      "step": 336
+    },
+    {
+      "epoch": 0.07077601596135671,
+      "grad_norm": 0.4745865762233734,
+      "learning_rate": 0.00014129979035639413,
+      "loss": 0.1626,
+      "step": 337
+    },
+    {
+      "epoch": 0.07098603381287409,
+      "grad_norm": 0.4688374996185303,
+      "learning_rate": 0.00014171907756813418,
+      "loss": 0.1327,
+      "step": 338
+    },
+    {
+      "epoch": 0.07119605166439147,
+      "grad_norm": 0.4219890832901001,
+      "learning_rate": 0.0001421383647798742,
+      "loss": 0.142,
+      "step": 339
+    },
+    {
+      "epoch": 0.07140606951590885,
+      "grad_norm": 0.700579047203064,
+      "learning_rate": 0.00014255765199161426,
+      "loss": 0.179,
+      "step": 340
+    },
+    {
+      "epoch": 0.07161608736742624,
+      "grad_norm": 0.36132583022117615,
+      "learning_rate": 0.0001429769392033543,
+      "loss": 0.1283,
+      "step": 341
+    },
+    {
+      "epoch": 0.07182610521894361,
+      "grad_norm": 0.9342030882835388,
+      "learning_rate": 0.00014339622641509434,
+      "loss": 0.3873,
+      "step": 342
+    },
+    {
+      "epoch": 0.07203612307046099,
+      "grad_norm": 0.6389639973640442,
+      "learning_rate": 0.0001438155136268344,
+      "loss": 0.2631,
+      "step": 343
+    },
+    {
+      "epoch": 0.07224614092197837,
+      "grad_norm": 0.7687662243843079,
+      "learning_rate": 0.00014423480083857444,
+      "loss": 0.2002,
+      "step": 344
+    },
+    {
+      "epoch": 0.07245615877349575,
+      "grad_norm": 0.6517148613929749,
+      "learning_rate": 0.00014465408805031446,
+      "loss": 0.2454,
+      "step": 345
+    },
+    {
+      "epoch": 0.07266617662501312,
+      "grad_norm": 0.5010355710983276,
+      "learning_rate": 0.00014507337526205452,
+      "loss": 0.1541,
+      "step": 346
+    },
+    {
+      "epoch": 0.0728761944765305,
+      "grad_norm": 0.49431943893432617,
+      "learning_rate": 0.00014549266247379457,
+      "loss": 0.213,
+      "step": 347
+    },
+    {
+      "epoch": 0.07308621232804788,
+      "grad_norm": 0.6462149024009705,
+      "learning_rate": 0.0001459119496855346,
+      "loss": 0.2642,
+      "step": 348
+    },
+    {
+      "epoch": 0.07329623017956527,
+      "grad_norm": 0.5412748456001282,
+      "learning_rate": 0.00014633123689727464,
+      "loss": 0.187,
+      "step": 349
+    },
+    {
+      "epoch": 0.07350624803108265,
+      "grad_norm": 0.6458069682121277,
+      "learning_rate": 0.0001467505241090147,
+      "loss": 0.3224,
+      "step": 350
+    },
+    {
+      "epoch": 0.07371626588260002,
+      "grad_norm": 0.4398702383041382,
+      "learning_rate": 0.00014716981132075472,
+      "loss": 0.2419,
+      "step": 351
+    },
+    {
+      "epoch": 0.0739262837341174,
+      "grad_norm": 0.47583240270614624,
+      "learning_rate": 0.00014758909853249477,
+      "loss": 0.2259,
+      "step": 352
+    },
+    {
+      "epoch": 0.07413630158563478,
+      "grad_norm": 0.5058132410049438,
+      "learning_rate": 0.00014800838574423482,
+      "loss": 0.3733,
+      "step": 353
+    },
+    {
+      "epoch": 0.07434631943715216,
+      "grad_norm": 0.4765789210796356,
+      "learning_rate": 0.00014842767295597485,
+      "loss": 0.3204,
+      "step": 354
+    },
+    {
+      "epoch": 0.07455633728866953,
+      "grad_norm": 0.4549868106842041,
+      "learning_rate": 0.0001488469601677149,
+      "loss": 0.2788,
+      "step": 355
+    },
+    {
+      "epoch": 0.07476635514018691,
+      "grad_norm": 0.44640183448791504,
+      "learning_rate": 0.00014926624737945495,
+      "loss": 0.225,
+      "step": 356
+    },
+    {
+      "epoch": 0.0749763729917043,
+      "grad_norm": 0.5040209293365479,
+      "learning_rate": 0.00014968553459119498,
+      "loss": 0.288,
+      "step": 357
+    },
+    {
+      "epoch": 0.07518639084322168,
+      "grad_norm": 0.7681525349617004,
+      "learning_rate": 0.00015010482180293503,
+      "loss": 0.2677,
+      "step": 358
+    },
+    {
+      "epoch": 0.07539640869473906,
+      "grad_norm": 0.3658473491668701,
+      "learning_rate": 0.00015052410901467505,
+      "loss": 0.1944,
+      "step": 359
+    },
+    {
+      "epoch": 0.07560642654625643,
+      "grad_norm": 0.5071917772293091,
+      "learning_rate": 0.0001509433962264151,
+      "loss": 0.1937,
+      "step": 360
+    },
+    {
+      "epoch": 0.07581644439777381,
+      "grad_norm": 0.5669259428977966,
+      "learning_rate": 0.00015136268343815513,
+      "loss": 0.2755,
+      "step": 361
+    },
+    {
+      "epoch": 0.07602646224929119,
+      "grad_norm": 0.5721021294593811,
+      "learning_rate": 0.00015178197064989518,
+      "loss": 0.2233,
+      "step": 362
+    },
+    {
+      "epoch": 0.07623648010080857,
+      "grad_norm": 0.5776953101158142,
+      "learning_rate": 0.00015220125786163523,
+      "loss": 0.2918,
+      "step": 363
+    },
+    {
+      "epoch": 0.07644649795232594,
+      "grad_norm": 0.7863572239875793,
+      "learning_rate": 0.00015262054507337526,
+      "loss": 0.3225,
+      "step": 364
+    },
+    {
+      "epoch": 0.07665651580384332,
+      "grad_norm": 0.7403888702392578,
+      "learning_rate": 0.0001530398322851153,
+      "loss": 0.1836,
+      "step": 365
+    },
+    {
+      "epoch": 0.07686653365536071,
+      "grad_norm": 0.7344810962677002,
+      "learning_rate": 0.00015345911949685536,
+      "loss": 0.342,
+      "step": 366
+    },
+    {
+      "epoch": 0.07707655150687809,
+      "grad_norm": 0.6341666579246521,
+      "learning_rate": 0.00015387840670859538,
+      "loss": 0.2222,
+      "step": 367
+    },
+    {
+      "epoch": 0.07728656935839547,
+      "grad_norm": 0.7821016907691956,
+      "learning_rate": 0.00015429769392033544,
+      "loss": 0.3106,
+      "step": 368
+    },
+    {
+      "epoch": 0.07749658720991284,
+      "grad_norm": 0.5648399591445923,
+      "learning_rate": 0.0001547169811320755,
+      "loss": 0.1907,
+      "step": 369
+    },
+    {
+      "epoch": 0.07770660506143022,
+      "grad_norm": 0.5853981971740723,
+      "learning_rate": 0.0001551362683438155,
+      "loss": 0.1873,
+      "step": 370
+    },
+    {
+      "epoch": 0.0779166229129476,
+      "grad_norm": 0.6429926753044128,
+      "learning_rate": 0.00015555555555555556,
+      "loss": 0.177,
+      "step": 371
+    },
+    {
+      "epoch": 0.07812664076446497,
+      "grad_norm": 0.5365523099899292,
+      "learning_rate": 0.00015597484276729561,
+      "loss": 0.2283,
+      "step": 372
+    },
+    {
+      "epoch": 0.07833665861598235,
+      "grad_norm": 0.4820340871810913,
+      "learning_rate": 0.00015639412997903564,
+      "loss": 0.2179,
+      "step": 373
+    },
+    {
+      "epoch": 0.07854667646749974,
+      "grad_norm": 0.5231903195381165,
+      "learning_rate": 0.0001568134171907757,
+      "loss": 0.2165,
+      "step": 374
+    },
+    {
+      "epoch": 0.07875669431901712,
+      "grad_norm": 0.6309874057769775,
+      "learning_rate": 0.00015723270440251574,
+      "loss": 0.2511,
+      "step": 375
+    },
+    {
+      "epoch": 0.0789667121705345,
+      "grad_norm": 0.6248964667320251,
+      "learning_rate": 0.00015765199161425577,
+      "loss": 0.192,
+      "step": 376
+    },
+    {
+      "epoch": 0.07917673002205187,
+      "grad_norm": 0.4089469611644745,
+      "learning_rate": 0.00015807127882599582,
+      "loss": 0.1674,
+      "step": 377
+    },
+    {
+      "epoch": 0.07938674787356925,
+      "grad_norm": 0.5720129609107971,
+      "learning_rate": 0.00015849056603773587,
+      "loss": 0.2571,
+      "step": 378
+    },
+    {
+      "epoch": 0.07959676572508663,
+      "grad_norm": 0.505424976348877,
+      "learning_rate": 0.0001589098532494759,
+      "loss": 0.2189,
+      "step": 379
+    },
+    {
+      "epoch": 0.079806783576604,
+      "grad_norm": 0.4483712315559387,
+      "learning_rate": 0.00015932914046121595,
+      "loss": 0.2959,
+      "step": 380
+    },
+    {
+      "epoch": 0.08001680142812138,
+      "grad_norm": 0.6313521862030029,
+      "learning_rate": 0.000159748427672956,
+      "loss": 0.237,
+      "step": 381
+    },
+    {
+      "epoch": 0.08022681927963878,
+      "grad_norm": 0.530503511428833,
+      "learning_rate": 0.00016016771488469602,
+      "loss": 0.1922,
+      "step": 382
+    },
+    {
+      "epoch": 0.08043683713115615,
+      "grad_norm": 0.65278160572052,
+      "learning_rate": 0.00016058700209643607,
+      "loss": 0.2714,
+      "step": 383
+    },
+    {
+      "epoch": 0.08064685498267353,
+      "grad_norm": 0.6226363182067871,
+      "learning_rate": 0.00016100628930817613,
+      "loss": 0.1913,
+      "step": 384
+    },
+    {
+      "epoch": 0.08085687283419091,
+      "grad_norm": 0.6313908696174622,
+      "learning_rate": 0.00016142557651991615,
+      "loss": 0.2457,
+      "step": 385
+    },
+    {
+      "epoch": 0.08106689068570828,
+      "grad_norm": 0.5335121750831604,
+      "learning_rate": 0.0001618448637316562,
+      "loss": 0.2537,
+      "step": 386
+    },
+    {
+      "epoch": 0.08127690853722566,
+      "grad_norm": 0.7243566513061523,
+      "learning_rate": 0.00016226415094339625,
+      "loss": 0.2125,
+      "step": 387
+    },
+    {
+      "epoch": 0.08148692638874304,
+      "grad_norm": 0.5874237418174744,
+      "learning_rate": 0.00016268343815513628,
+      "loss": 0.2104,
+      "step": 388
+    },
+    {
+      "epoch": 0.08169694424026042,
+      "grad_norm": 0.5792878866195679,
+      "learning_rate": 0.00016310272536687633,
+      "loss": 0.198,
+      "step": 389
+    },
+    {
+      "epoch": 0.08190696209177781,
+      "grad_norm": 0.5439760088920593,
+      "learning_rate": 0.00016352201257861635,
+      "loss": 0.1895,
+      "step": 390
+    },
+    {
+      "epoch": 0.08211697994329518,
+      "grad_norm": 0.6903837323188782,
+      "learning_rate": 0.0001639412997903564,
+      "loss": 0.2587,
+      "step": 391
+    },
+    {
+      "epoch": 0.08232699779481256,
+      "grad_norm": 0.6126405596733093,
+      "learning_rate": 0.00016436058700209643,
+      "loss": 0.2232,
+      "step": 392
+    },
+    {
+      "epoch": 0.08253701564632994,
+      "grad_norm": 0.9248547554016113,
+      "learning_rate": 0.00016477987421383648,
+      "loss": 0.267,
+      "step": 393
+    },
+    {
+      "epoch": 0.08274703349784732,
+      "grad_norm": 0.6509301066398621,
+      "learning_rate": 0.0001651991614255765,
+      "loss": 0.2109,
+      "step": 394
+    },
+    {
+      "epoch": 0.0829570513493647,
+      "grad_norm": 0.5985137820243835,
+      "learning_rate": 0.00016561844863731656,
+      "loss": 0.1783,
+      "step": 395
+    },
+    {
+      "epoch": 0.08316706920088207,
+      "grad_norm": 0.6711693406105042,
+      "learning_rate": 0.0001660377358490566,
+      "loss": 0.2115,
+      "step": 396
+    },
+    {
+      "epoch": 0.08337708705239945,
+      "grad_norm": 0.4494445025920868,
+      "learning_rate": 0.00016645702306079664,
+      "loss": 0.1486,
+      "step": 397
+    },
+    {
+      "epoch": 0.08358710490391684,
+      "grad_norm": 0.5083547830581665,
+      "learning_rate": 0.0001668763102725367,
+      "loss": 0.2317,
+      "step": 398
+    },
+    {
+      "epoch": 0.08379712275543422,
+      "grad_norm": 0.7552763819694519,
+      "learning_rate": 0.00016729559748427674,
+      "loss": 0.236,
+      "step": 399
+    },
+    {
+      "epoch": 0.0840071406069516,
+      "grad_norm": 0.7656201124191284,
+      "learning_rate": 0.00016771488469601676,
+      "loss": 0.2732,
+      "step": 400
+    },
+    {
+      "epoch": 0.08421715845846897,
+      "grad_norm": 0.3850518465042114,
+      "learning_rate": 0.00016813417190775681,
+      "loss": 0.3322,
+      "step": 401
+    },
+    {
+      "epoch": 0.08442717630998635,
+      "grad_norm": 0.5610989928245544,
+      "learning_rate": 0.00016855345911949687,
+      "loss": 0.2844,
+      "step": 402
+    },
+    {
+      "epoch": 0.08463719416150373,
+      "grad_norm": 0.7500874400138855,
+      "learning_rate": 0.0001689727463312369,
+      "loss": 0.3651,
+      "step": 403
+    },
+    {
+      "epoch": 0.0848472120130211,
+      "grad_norm": 0.45343145728111267,
+      "learning_rate": 0.00016939203354297694,
+      "loss": 0.2174,
+      "step": 404
+    },
+    {
+      "epoch": 0.08505722986453848,
+      "grad_norm": 0.6427581310272217,
+      "learning_rate": 0.000169811320754717,
+      "loss": 0.2748,
+      "step": 405
+    },
+    {
+      "epoch": 0.08526724771605586,
+      "grad_norm": 0.64598149061203,
+      "learning_rate": 0.00017023060796645702,
+      "loss": 0.2912,
+      "step": 406
+    },
+    {
+      "epoch": 0.08547726556757325,
+      "grad_norm": 0.49100759625434875,
+      "learning_rate": 0.00017064989517819707,
+      "loss": 0.2582,
+      "step": 407
+    },
+    {
+      "epoch": 0.08568728341909063,
+      "grad_norm": 0.5637136101722717,
+      "learning_rate": 0.00017106918238993712,
+      "loss": 0.254,
+      "step": 408
+    },
+    {
+      "epoch": 0.085897301270608,
+      "grad_norm": 0.5617924928665161,
+      "learning_rate": 0.00017148846960167715,
+      "loss": 0.2043,
+      "step": 409
+    },
+    {
+      "epoch": 0.08610731912212538,
+      "grad_norm": 0.5467379093170166,
+      "learning_rate": 0.0001719077568134172,
+      "loss": 0.2363,
+      "step": 410
+    },
+    {
+      "epoch": 0.08631733697364276,
+      "grad_norm": 0.6882631778717041,
+      "learning_rate": 0.00017232704402515725,
+      "loss": 0.2341,
+      "step": 411
+    },
+    {
+      "epoch": 0.08652735482516014,
+      "grad_norm": 0.40710386633872986,
+      "learning_rate": 0.00017274633123689727,
+      "loss": 0.1952,
+      "step": 412
+    },
+    {
+      "epoch": 0.08673737267667751,
+      "grad_norm": 0.688685953617096,
+      "learning_rate": 0.00017316561844863733,
+      "loss": 0.3588,
+      "step": 413
+    },
+    {
+      "epoch": 0.08694739052819489,
+      "grad_norm": 0.7739083170890808,
+      "learning_rate": 0.00017358490566037738,
+      "loss": 0.1897,
+      "step": 414
+    },
+    {
+      "epoch": 0.08715740837971228,
+      "grad_norm": 0.45127734541893005,
+      "learning_rate": 0.0001740041928721174,
+      "loss": 0.2282,
+      "step": 415
+    },
+    {
+      "epoch": 0.08736742623122966,
+      "grad_norm": 0.6713837385177612,
+      "learning_rate": 0.00017442348008385745,
+      "loss": 0.3395,
+      "step": 416
+    },
+    {
+      "epoch": 0.08757744408274704,
+      "grad_norm": 0.5886412858963013,
+      "learning_rate": 0.0001748427672955975,
+      "loss": 0.1673,
+      "step": 417
+    },
+    {
+      "epoch": 0.08778746193426441,
+      "grad_norm": 0.6254634261131287,
+      "learning_rate": 0.00017526205450733753,
+      "loss": 0.2392,
+      "step": 418
+    },
+    {
+      "epoch": 0.08799747978578179,
+      "grad_norm": 0.5936654806137085,
+      "learning_rate": 0.00017568134171907758,
+      "loss": 0.1817,
+      "step": 419
+    },
+    {
+      "epoch": 0.08820749763729917,
+      "grad_norm": 0.6107873320579529,
+      "learning_rate": 0.00017610062893081763,
+      "loss": 0.2877,
+      "step": 420
+    },
+    {
+      "epoch": 0.08841751548881654,
+      "grad_norm": 0.583984911441803,
+      "learning_rate": 0.00017651991614255766,
+      "loss": 0.2382,
+      "step": 421
+    },
+    {
+      "epoch": 0.08862753334033392,
+      "grad_norm": 0.6411318778991699,
+      "learning_rate": 0.0001769392033542977,
+      "loss": 0.2528,
+      "step": 422
+    },
+    {
+      "epoch": 0.08883755119185131,
+      "grad_norm": 0.5407703518867493,
+      "learning_rate": 0.00017735849056603776,
+      "loss": 0.217,
+      "step": 423
+    },
+    {
+      "epoch": 0.08904756904336869,
+      "grad_norm": 0.5086292028427124,
+      "learning_rate": 0.00017777777777777779,
+      "loss": 0.1703,
+      "step": 424
+    },
+    {
+      "epoch": 0.08925758689488607,
+      "grad_norm": 0.534488320350647,
+      "learning_rate": 0.00017819706498951784,
+      "loss": 0.4212,
+      "step": 425
+    },
+    {
+      "epoch": 0.08946760474640345,
+      "grad_norm": 0.5869336724281311,
+      "learning_rate": 0.00017861635220125786,
+      "loss": 0.3634,
+      "step": 426
+    },
+    {
+      "epoch": 0.08967762259792082,
+      "grad_norm": 0.5784481763839722,
+      "learning_rate": 0.00017903563941299791,
+      "loss": 0.2076,
+      "step": 427
+    },
+    {
+      "epoch": 0.0898876404494382,
+      "grad_norm": 0.467438668012619,
+      "learning_rate": 0.00017945492662473794,
+      "loss": 0.1948,
+      "step": 428
+    },
+    {
+      "epoch": 0.09009765830095558,
+      "grad_norm": 0.8514359593391418,
+      "learning_rate": 0.000179874213836478,
+      "loss": 0.2695,
+      "step": 429
+    },
+    {
+      "epoch": 0.09030767615247295,
+      "grad_norm": 0.630066990852356,
+      "learning_rate": 0.00018029350104821801,
+      "loss": 0.2624,
+      "step": 430
+    },
+    {
+      "epoch": 0.09051769400399035,
+      "grad_norm": 0.6442775130271912,
+      "learning_rate": 0.00018071278825995807,
+      "loss": 0.2555,
+      "step": 431
+    },
+    {
+      "epoch": 0.09072771185550772,
+      "grad_norm": 0.6193580031394958,
+      "learning_rate": 0.00018113207547169812,
+      "loss": 0.2388,
+      "step": 432
+    },
+    {
+      "epoch": 0.0909377297070251,
+      "grad_norm": 1.108219027519226,
+      "learning_rate": 0.00018155136268343814,
+      "loss": 0.2135,
+      "step": 433
+    },
+    {
+      "epoch": 0.09114774755854248,
+      "grad_norm": 0.666748046875,
+      "learning_rate": 0.0001819706498951782,
+      "loss": 0.2402,
+      "step": 434
+    },
+    {
+      "epoch": 0.09135776541005985,
+      "grad_norm": 0.516096293926239,
+      "learning_rate": 0.00018238993710691825,
+      "loss": 0.2022,
+      "step": 435
+    },
+    {
+      "epoch": 0.09156778326157723,
+      "grad_norm": 0.4976787269115448,
+      "learning_rate": 0.00018280922431865827,
+      "loss": 0.1492,
+      "step": 436
+    },
+    {
+      "epoch": 0.09177780111309461,
+      "grad_norm": 0.596254289150238,
+      "learning_rate": 0.00018322851153039832,
+      "loss": 0.1926,
+      "step": 437
+    },
+    {
+      "epoch": 0.09198781896461199,
+      "grad_norm": 0.4079163670539856,
+      "learning_rate": 0.00018364779874213837,
+      "loss": 0.219,
+      "step": 438
+    },
+    {
+      "epoch": 0.09219783681612938,
+      "grad_norm": 0.4968511164188385,
+      "learning_rate": 0.00018406708595387843,
+      "loss": 0.2203,
+      "step": 439
+    },
+    {
+      "epoch": 0.09240785466764675,
+      "grad_norm": 0.5749839544296265,
+      "learning_rate": 0.00018448637316561845,
+      "loss": 0.2561,
+      "step": 440
+    },
+    {
+      "epoch": 0.09261787251916413,
+      "grad_norm": 0.46315014362335205,
+      "learning_rate": 0.0001849056603773585,
+      "loss": 0.1608,
+      "step": 441
+    },
+    {
+      "epoch": 0.09282789037068151,
+      "grad_norm": 0.4630315601825714,
+      "learning_rate": 0.00018532494758909855,
+      "loss": 0.1564,
+      "step": 442
+    },
+    {
+      "epoch": 0.09303790822219889,
+      "grad_norm": 0.5688292384147644,
+      "learning_rate": 0.00018574423480083858,
+      "loss": 0.1483,
+      "step": 443
+    },
+    {
+      "epoch": 0.09324792607371626,
+      "grad_norm": 0.9025551676750183,
+      "learning_rate": 0.00018616352201257863,
+      "loss": 0.216,
+      "step": 444
+    },
+    {
+      "epoch": 0.09345794392523364,
+      "grad_norm": 0.6165971755981445,
+      "learning_rate": 0.00018658280922431868,
+      "loss": 0.1852,
+      "step": 445
+    },
+    {
+      "epoch": 0.09366796177675102,
+      "grad_norm": 0.5040764808654785,
+      "learning_rate": 0.0001870020964360587,
+      "loss": 0.1534,
+      "step": 446
+    },
+    {
+      "epoch": 0.0938779796282684,
+      "grad_norm": 0.6921994686126709,
+      "learning_rate": 0.00018742138364779876,
+      "loss": 0.2739,
+      "step": 447
+    },
+    {
+      "epoch": 0.09408799747978579,
+      "grad_norm": 0.9911003708839417,
+      "learning_rate": 0.0001878406708595388,
+      "loss": 0.2148,
+      "step": 448
+    },
+    {
+      "epoch": 0.09429801533130316,
+      "grad_norm": 0.4098629951477051,
+      "learning_rate": 0.00018825995807127883,
+      "loss": 0.1627,
+      "step": 449
+    },
+    {
+      "epoch": 0.09450803318282054,
+      "grad_norm": 0.5267736315727234,
+      "learning_rate": 0.00018867924528301889,
+      "loss": 0.1714,
+      "step": 450
+    },
+    {
+      "epoch": 0.09471805103433792,
+      "grad_norm": 0.826693058013916,
+      "learning_rate": 0.00018909853249475894,
+      "loss": 0.3614,
+      "step": 451
+    },
+    {
+      "epoch": 0.0949280688858553,
+      "grad_norm": 0.7960173487663269,
+      "learning_rate": 0.00018951781970649896,
+      "loss": 0.2831,
+      "step": 452
+    },
+    {
+      "epoch": 0.09513808673737267,
+      "grad_norm": 0.5408324003219604,
+      "learning_rate": 0.00018993710691823901,
+      "loss": 0.2396,
+      "step": 453
+    },
+    {
+      "epoch": 0.09534810458889005,
+      "grad_norm": 0.5551522374153137,
+      "learning_rate": 0.00019035639412997907,
+      "loss": 0.2399,
+      "step": 454
+    },
+    {
+      "epoch": 0.09555812244040743,
+      "grad_norm": 0.5053918361663818,
+      "learning_rate": 0.0001907756813417191,
+      "loss": 0.2054,
+      "step": 455
+    },
+    {
+      "epoch": 0.09576814029192482,
+      "grad_norm": 0.6408351063728333,
+      "learning_rate": 0.00019119496855345914,
+      "loss": 0.2319,
+      "step": 456
+    },
+    {
+      "epoch": 0.0959781581434422,
+      "grad_norm": 0.6061432361602783,
+      "learning_rate": 0.0001916142557651992,
+      "loss": 0.2289,
+      "step": 457
+    },
+    {
+      "epoch": 0.09618817599495957,
+      "grad_norm": 0.6452487111091614,
+      "learning_rate": 0.00019203354297693922,
+      "loss": 0.2787,
+      "step": 458
+    },
+    {
+      "epoch": 0.09639819384647695,
+      "grad_norm": 0.5427165627479553,
+      "learning_rate": 0.00019245283018867927,
+      "loss": 0.3181,
+      "step": 459
+    },
+    {
+      "epoch": 0.09660821169799433,
+      "grad_norm": 0.5678632259368896,
+      "learning_rate": 0.0001928721174004193,
+      "loss": 0.3166,
+      "step": 460
+    },
+    {
+      "epoch": 0.0968182295495117,
+      "grad_norm": 0.554288923740387,
+      "learning_rate": 0.00019329140461215935,
+      "loss": 0.214,
+      "step": 461
+    },
+    {
+      "epoch": 0.09702824740102908,
+      "grad_norm": 0.7040925621986389,
+      "learning_rate": 0.00019371069182389937,
+      "loss": 0.473,
+      "step": 462
+    },
+    {
+      "epoch": 0.09723826525254646,
+      "grad_norm": 0.6425243020057678,
+      "learning_rate": 0.00019412997903563942,
+      "loss": 0.3542,
+      "step": 463
+    },
+    {
+      "epoch": 0.09744828310406385,
+      "grad_norm": 0.6984371542930603,
+      "learning_rate": 0.00019454926624737945,
+      "loss": 0.2165,
+      "step": 464
+    },
+    {
+      "epoch": 0.09765830095558123,
+      "grad_norm": 0.5204288959503174,
+      "learning_rate": 0.0001949685534591195,
+      "loss": 0.232,
+      "step": 465
+    },
+    {
+      "epoch": 0.0978683188070986,
+      "grad_norm": 0.5688004493713379,
+      "learning_rate": 0.00019538784067085955,
+      "loss": 0.3099,
+      "step": 466
+    },
+    {
+      "epoch": 0.09807833665861598,
+      "grad_norm": 0.4850284159183502,
+      "learning_rate": 0.00019580712788259957,
+      "loss": 0.202,
+      "step": 467
+    },
+    {
+      "epoch": 0.09828835451013336,
+      "grad_norm": 0.5034931302070618,
+      "learning_rate": 0.00019622641509433963,
+      "loss": 0.2077,
+      "step": 468
+    },
+    {
+      "epoch": 0.09849837236165074,
+      "grad_norm": 0.6193839311599731,
+      "learning_rate": 0.00019664570230607968,
+      "loss": 0.318,
+      "step": 469
+    },
+    {
+      "epoch": 0.09870839021316812,
+      "grad_norm": 0.6226887702941895,
+      "learning_rate": 0.0001970649895178197,
+      "loss": 0.3538,
+      "step": 470
+    },
+    {
+      "epoch": 0.09891840806468549,
+      "grad_norm": 0.6102244257926941,
+      "learning_rate": 0.00019748427672955975,
+      "loss": 0.2865,
+      "step": 471
+    },
+    {
+      "epoch": 0.09912842591620288,
+      "grad_norm": 0.6731789112091064,
+      "learning_rate": 0.0001979035639412998,
+      "loss": 0.3135,
+      "step": 472
+    },
+    {
+      "epoch": 0.09933844376772026,
+      "grad_norm": 0.661486029624939,
+      "learning_rate": 0.00019832285115303983,
+      "loss": 0.2198,
+      "step": 473
+    },
+    {
+      "epoch": 0.09954846161923764,
+      "grad_norm": 0.7321748733520508,
+      "learning_rate": 0.00019874213836477988,
+      "loss": 0.2557,
+      "step": 474
+    },
+    {
+      "epoch": 0.09975847947075502,
+      "grad_norm": 0.5708514451980591,
+      "learning_rate": 0.00019916142557651993,
+      "loss": 0.2385,
+      "step": 475
+    },
+    {
+      "epoch": 0.09996849732227239,
+      "grad_norm": 0.8140966892242432,
+      "learning_rate": 0.00019958071278825996,
+      "loss": 0.1724,
+      "step": 476
+    },
+    {
+      "epoch": 0.10017851517378977,
+      "grad_norm": 0.5185543298721313,
+      "learning_rate": 0.0002,
+      "loss": 0.192,
+      "step": 477
+    },
+    {
+      "epoch": 0.10038853302530715,
+      "grad_norm": 0.7630559802055359,
+      "learning_rate": 0.00019999999396812126,
+      "loss": 0.3107,
+      "step": 478
+    },
+    {
+      "epoch": 0.10059855087682452,
+      "grad_norm": 0.5256696939468384,
+      "learning_rate": 0.00019999997587248573,
+      "loss": 0.2057,
+      "step": 479
+    },
+    {
+      "epoch": 0.10080856872834192,
+      "grad_norm": 0.5820131301879883,
+      "learning_rate": 0.0001999999457130956,
+      "loss": 0.2444,
+      "step": 480
+    },
+    {
+      "epoch": 0.10101858657985929,
+      "grad_norm": 0.6161417365074158,
+      "learning_rate": 0.00019999990348995456,
+      "loss": 0.1882,
+      "step": 481
+    },
+    {
+      "epoch": 0.10122860443137667,
+      "grad_norm": 0.5549945831298828,
+      "learning_rate": 0.00019999984920306764,
+      "loss": 0.3101,
+      "step": 482
+    },
+    {
+      "epoch": 0.10143862228289405,
+      "grad_norm": 0.8708590269088745,
+      "learning_rate": 0.00019999978285244142,
+      "loss": 0.2377,
+      "step": 483
+    },
+    {
+      "epoch": 0.10164864013441142,
+      "grad_norm": 0.5110476016998291,
+      "learning_rate": 0.00019999970443808387,
+      "loss": 0.1476,
+      "step": 484
+    },
+    {
+      "epoch": 0.1018586579859288,
+      "grad_norm": 1.1280276775360107,
+      "learning_rate": 0.0001999996139600045,
+      "loss": 0.1595,
+      "step": 485
+    },
+    {
+      "epoch": 0.10206867583744618,
+      "grad_norm": 0.7876203656196594,
+      "learning_rate": 0.0001999995114182142,
+      "loss": 0.229,
+      "step": 486
+    },
+    {
+      "epoch": 0.10227869368896356,
+      "grad_norm": 0.7196666598320007,
+      "learning_rate": 0.00019999939681272536,
+      "loss": 0.1838,
+      "step": 487
+    },
+    {
+      "epoch": 0.10248871154048093,
+      "grad_norm": 0.6737300157546997,
+      "learning_rate": 0.00019999927014355175,
+      "loss": 0.1786,
+      "step": 488
+    },
+    {
+      "epoch": 0.10269872939199833,
+      "grad_norm": 0.7758048176765442,
+      "learning_rate": 0.0001999991314107087,
+      "loss": 0.1847,
+      "step": 489
+    },
+    {
+      "epoch": 0.1029087472435157,
+      "grad_norm": 0.8189213871955872,
+      "learning_rate": 0.00019999898061421294,
+      "loss": 0.2842,
+      "step": 490
+    },
+    {
+      "epoch": 0.10311876509503308,
+      "grad_norm": 0.5789510011672974,
+      "learning_rate": 0.00019999881775408263,
+      "loss": 0.2353,
+      "step": 491
+    },
+    {
+      "epoch": 0.10332878294655046,
+      "grad_norm": 0.808729350566864,
+      "learning_rate": 0.00019999864283033747,
+      "loss": 0.2481,
+      "step": 492
+    },
+    {
+      "epoch": 0.10353880079806783,
+      "grad_norm": 0.587478518486023,
+      "learning_rate": 0.00019999845584299855,
+      "loss": 0.1976,
+      "step": 493
+    },
+    {
+      "epoch": 0.10374881864958521,
+      "grad_norm": 0.7419194579124451,
+      "learning_rate": 0.00019999825679208839,
+      "loss": 0.2444,
+      "step": 494
+    },
+    {
+      "epoch": 0.10395883650110259,
+      "grad_norm": 0.6678702235221863,
+      "learning_rate": 0.000199998045677631,
+      "loss": 0.1752,
+      "step": 495
+    },
+    {
+      "epoch": 0.10416885435261997,
+      "grad_norm": 0.5477135181427002,
+      "learning_rate": 0.00019999782249965193,
+      "loss": 0.1176,
+      "step": 496
+    },
+    {
+      "epoch": 0.10437887220413736,
+      "grad_norm": 0.47613173723220825,
+      "learning_rate": 0.000199997587258178,
+      "loss": 0.1734,
+      "step": 497
+    },
+    {
+      "epoch": 0.10458889005565473,
+      "grad_norm": 0.8437466025352478,
+      "learning_rate": 0.0001999973399532377,
+      "loss": 0.2279,
+      "step": 498
+    },
+    {
+      "epoch": 0.10479890790717211,
+      "grad_norm": 0.7599924206733704,
+      "learning_rate": 0.00019999708058486074,
+      "loss": 0.209,
+      "step": 499
+    },
+    {
+      "epoch": 0.10500892575868949,
+      "grad_norm": 0.5578658580780029,
+      "learning_rate": 0.00019999680915307847,
+      "loss": 0.243,
+      "step": 500
+    },
+    {
+      "epoch": 0.10521894361020687,
+      "grad_norm": 0.5664511322975159,
+      "learning_rate": 0.00019999652565792368,
+      "loss": 0.2176,
+      "step": 501
+    },
+    {
+      "epoch": 0.10542896146172424,
+      "grad_norm": 0.5591540336608887,
+      "learning_rate": 0.0001999962300994305,
+      "loss": 0.3467,
+      "step": 502
+    },
+    {
+      "epoch": 0.10563897931324162,
+      "grad_norm": 0.5022396445274353,
+      "learning_rate": 0.0001999959224776346,
+      "loss": 0.2149,
+      "step": 503
+    },
+    {
+      "epoch": 0.105848997164759,
+      "grad_norm": 0.5846520662307739,
+      "learning_rate": 0.00019999560279257314,
+      "loss": 0.3388,
+      "step": 504
+    },
+    {
+      "epoch": 0.10605901501627639,
+      "grad_norm": 0.4137157201766968,
+      "learning_rate": 0.00019999527104428463,
+      "loss": 0.2223,
+      "step": 505
+    },
+    {
+      "epoch": 0.10626903286779377,
+      "grad_norm": 0.49332931637763977,
+      "learning_rate": 0.0001999949272328091,
+      "loss": 0.2679,
+      "step": 506
+    },
+    {
+      "epoch": 0.10647905071931114,
+      "grad_norm": 0.7095859050750732,
+      "learning_rate": 0.00019999457135818805,
+      "loss": 0.2681,
+      "step": 507
+    },
+    {
+      "epoch": 0.10668906857082852,
+      "grad_norm": 0.5563727021217346,
+      "learning_rate": 0.0001999942034204644,
+      "loss": 0.2695,
+      "step": 508
+    },
+    {
+      "epoch": 0.1068990864223459,
+      "grad_norm": 0.5464118719100952,
+      "learning_rate": 0.00019999382341968252,
+      "loss": 0.4308,
+      "step": 509
+    },
+    {
+      "epoch": 0.10710910427386328,
+      "grad_norm": 0.7822732329368591,
+      "learning_rate": 0.00019999343135588827,
+      "loss": 0.2458,
+      "step": 510
+    },
+    {
+      "epoch": 0.10731912212538065,
+      "grad_norm": 0.6268991231918335,
+      "learning_rate": 0.00019999302722912895,
+      "loss": 0.2877,
+      "step": 511
+    },
+    {
+      "epoch": 0.10752913997689803,
+      "grad_norm": 0.7860679626464844,
+      "learning_rate": 0.0001999926110394533,
+      "loss": 0.275,
+      "step": 512
+    },
+    {
+      "epoch": 0.10773915782841542,
+      "grad_norm": 0.5817549228668213,
+      "learning_rate": 0.00019999218278691158,
+      "loss": 0.2005,
+      "step": 513
+    },
+    {
+      "epoch": 0.1079491756799328,
+      "grad_norm": 0.8145076036453247,
+      "learning_rate": 0.00019999174247155535,
+      "loss": 0.2032,
+      "step": 514
+    },
+    {
+      "epoch": 0.10815919353145018,
+      "grad_norm": 0.7561895847320557,
+      "learning_rate": 0.0001999912900934378,
+      "loss": 0.2452,
+      "step": 515
+    },
+    {
+      "epoch": 0.10836921138296755,
+      "grad_norm": 0.8003164529800415,
+      "learning_rate": 0.0001999908256526135,
+      "loss": 0.2318,
+      "step": 516
+    },
+    {
+      "epoch": 0.10857922923448493,
+      "grad_norm": 0.6318978667259216,
+      "learning_rate": 0.0001999903491491385,
+      "loss": 0.2534,
+      "step": 517
+    },
+    {
+      "epoch": 0.10878924708600231,
+      "grad_norm": 0.5220886468887329,
+      "learning_rate": 0.00019998986058307022,
+      "loss": 0.2011,
+      "step": 518
+    },
+    {
+      "epoch": 0.10899926493751969,
+      "grad_norm": 0.5928252935409546,
+      "learning_rate": 0.00019998935995446763,
+      "loss": 0.2175,
+      "step": 519
+    },
+    {
+      "epoch": 0.10920928278903706,
+      "grad_norm": 0.7763411998748779,
+      "learning_rate": 0.00019998884726339116,
+      "loss": 0.2852,
+      "step": 520
+    },
+    {
+      "epoch": 0.10941930064055445,
+      "grad_norm": 0.7260156273841858,
+      "learning_rate": 0.00019998832250990264,
+      "loss": 0.2313,
+      "step": 521
+    },
+    {
+      "epoch": 0.10962931849207183,
+      "grad_norm": 0.7486017942428589,
+      "learning_rate": 0.0001999877856940653,
+      "loss": 0.2789,
+      "step": 522
+    },
+    {
+      "epoch": 0.10983933634358921,
+      "grad_norm": 0.6006895303726196,
+      "learning_rate": 0.00019998723681594402,
+      "loss": 0.1674,
+      "step": 523
+    },
+    {
+      "epoch": 0.11004935419510659,
+      "grad_norm": 0.7286220192909241,
+      "learning_rate": 0.00019998667587560495,
+      "loss": 0.2521,
+      "step": 524
+    },
+    {
+      "epoch": 0.11025937204662396,
+      "grad_norm": 0.7167160511016846,
+      "learning_rate": 0.00019998610287311574,
+      "loss": 0.2308,
+      "step": 525
+    },
+    {
+      "epoch": 0.11046938989814134,
+      "grad_norm": 0.6270986795425415,
+      "learning_rate": 0.00019998551780854557,
+      "loss": 0.2048,
+      "step": 526
+    },
+    {
+      "epoch": 0.11067940774965872,
+      "grad_norm": 0.7366195917129517,
+      "learning_rate": 0.000199984920681965,
+      "loss": 0.2239,
+      "step": 527
+    },
+    {
+      "epoch": 0.1108894256011761,
+      "grad_norm": 0.5709097981452942,
+      "learning_rate": 0.00019998431149344606,
+      "loss": 0.2074,
+      "step": 528
+    },
+    {
+      "epoch": 0.11109944345269347,
+      "grad_norm": 0.6718131899833679,
+      "learning_rate": 0.00019998369024306224,
+      "loss": 0.2902,
+      "step": 529
+    },
+    {
+      "epoch": 0.11130946130421086,
+      "grad_norm": 0.6741777658462524,
+      "learning_rate": 0.00019998305693088848,
+      "loss": 0.2638,
+      "step": 530
+    },
+    {
+      "epoch": 0.11151947915572824,
+      "grad_norm": 0.5218673944473267,
+      "learning_rate": 0.0001999824115570012,
+      "loss": 0.201,
+      "step": 531
+    },
+    {
+      "epoch": 0.11172949700724562,
+      "grad_norm": 0.6867272257804871,
+      "learning_rate": 0.00019998175412147824,
+      "loss": 0.2864,
+      "step": 532
+    },
+    {
+      "epoch": 0.111939514858763,
+      "grad_norm": 0.6319578289985657,
+      "learning_rate": 0.00019998108462439894,
+      "loss": 0.2277,
+      "step": 533
+    },
+    {
+      "epoch": 0.11214953271028037,
+      "grad_norm": 0.5601973533630371,
+      "learning_rate": 0.000199980403065844,
+      "loss": 0.2678,
+      "step": 534
+    },
+    {
+      "epoch": 0.11235955056179775,
+      "grad_norm": 0.5189068913459778,
+      "learning_rate": 0.00019997970944589572,
+      "loss": 0.2036,
+      "step": 535
+    },
+    {
+      "epoch": 0.11256956841331513,
+      "grad_norm": 0.7217200994491577,
+      "learning_rate": 0.00019997900376463778,
+      "loss": 0.2299,
+      "step": 536
+    },
+    {
+      "epoch": 0.1127795862648325,
+      "grad_norm": 0.6617181301116943,
+      "learning_rate": 0.0001999782860221552,
+      "loss": 0.2259,
+      "step": 537
+    },
+    {
+      "epoch": 0.1129896041163499,
+      "grad_norm": 0.6987117528915405,
+      "learning_rate": 0.0001999775562185347,
+      "loss": 0.1882,
+      "step": 538
+    },
+    {
+      "epoch": 0.11319962196786727,
+      "grad_norm": 0.4491863548755646,
+      "learning_rate": 0.00019997681435386422,
+      "loss": 0.1937,
+      "step": 539
+    },
+    {
+      "epoch": 0.11340963981938465,
+      "grad_norm": 0.5842171311378479,
+      "learning_rate": 0.00019997606042823334,
+      "loss": 0.2808,
+      "step": 540
+    },
+    {
+      "epoch": 0.11361965767090203,
+      "grad_norm": 0.7743870615959167,
+      "learning_rate": 0.00019997529444173293,
+      "loss": 0.2329,
+      "step": 541
+    },
+    {
+      "epoch": 0.1138296755224194,
+      "grad_norm": 0.5326593518257141,
+      "learning_rate": 0.00019997451639445547,
+      "loss": 0.188,
+      "step": 542
+    },
+    {
+      "epoch": 0.11403969337393678,
+      "grad_norm": 0.5364864468574524,
+      "learning_rate": 0.00019997372628649478,
+      "loss": 0.2294,
+      "step": 543
+    },
+    {
+      "epoch": 0.11424971122545416,
+      "grad_norm": 0.5609897375106812,
+      "learning_rate": 0.00019997292411794618,
+      "loss": 0.2108,
+      "step": 544
+    },
+    {
+      "epoch": 0.11445972907697154,
+      "grad_norm": 0.5446069836616516,
+      "learning_rate": 0.00019997210988890646,
+      "loss": 0.2577,
+      "step": 545
+    },
+    {
+      "epoch": 0.11466974692848893,
+      "grad_norm": 0.6916573643684387,
+      "learning_rate": 0.0001999712835994738,
+      "loss": 0.1976,
+      "step": 546
+    },
+    {
+      "epoch": 0.1148797647800063,
+      "grad_norm": 0.7029738426208496,
+      "learning_rate": 0.00019997044524974799,
+      "loss": 0.2076,
+      "step": 547
+    },
+    {
+      "epoch": 0.11508978263152368,
+      "grad_norm": 0.8003794550895691,
+      "learning_rate": 0.00019996959483983004,
+      "loss": 0.3284,
+      "step": 548
+    },
+    {
+      "epoch": 0.11529980048304106,
+      "grad_norm": 0.6394858360290527,
+      "learning_rate": 0.00019996873236982258,
+      "loss": 0.2397,
+      "step": 549
+    },
+    {
+      "epoch": 0.11550981833455844,
+      "grad_norm": 0.7164601683616638,
+      "learning_rate": 0.00019996785783982972,
+      "loss": 0.2097,
+      "step": 550
+    },
+    {
+      "epoch": 0.11571983618607581,
+      "grad_norm": 0.5346024036407471,
+      "learning_rate": 0.0001999669712499569,
+      "loss": 0.2559,
+      "step": 551
+    },
+    {
+      "epoch": 0.11592985403759319,
+      "grad_norm": 0.77498859167099,
+      "learning_rate": 0.00019996607260031106,
+      "loss": 0.3734,
+      "step": 552
+    },
+    {
+      "epoch": 0.11613987188911057,
+      "grad_norm": 0.6465743780136108,
+      "learning_rate": 0.00019996516189100066,
+      "loss": 0.2642,
+      "step": 553
+    },
+    {
+      "epoch": 0.11634988974062796,
+      "grad_norm": 0.4624630808830261,
+      "learning_rate": 0.00019996423912213554,
+      "loss": 0.1649,
+      "step": 554
+    },
+    {
+      "epoch": 0.11655990759214534,
+      "grad_norm": 0.6889922618865967,
+      "learning_rate": 0.00019996330429382703,
+      "loss": 0.2902,
+      "step": 555
+    },
+    {
+      "epoch": 0.11676992544366271,
+      "grad_norm": 0.7413411140441895,
+      "learning_rate": 0.0001999623574061879,
+      "loss": 0.2428,
+      "step": 556
+    },
+    {
+      "epoch": 0.11697994329518009,
+      "grad_norm": 0.9009401798248291,
+      "learning_rate": 0.0001999613984593324,
+      "loss": 0.3112,
+      "step": 557
+    },
+    {
+      "epoch": 0.11718996114669747,
+      "grad_norm": 0.6533844470977783,
+      "learning_rate": 0.00019996042745337617,
+      "loss": 0.2118,
+      "step": 558
+    },
+    {
+      "epoch": 0.11739997899821485,
+      "grad_norm": 0.6814008355140686,
+      "learning_rate": 0.00019995944438843636,
+      "loss": 0.2755,
+      "step": 559
+    },
+    {
+      "epoch": 0.11760999684973222,
+      "grad_norm": 0.6098254323005676,
+      "learning_rate": 0.0001999584492646316,
+      "loss": 0.2385,
+      "step": 560
+    },
+    {
+      "epoch": 0.1178200147012496,
+      "grad_norm": 0.7361414432525635,
+      "learning_rate": 0.00019995744208208194,
+      "loss": 0.326,
+      "step": 561
+    },
+    {
+      "epoch": 0.11803003255276699,
+      "grad_norm": 0.6809893250465393,
+      "learning_rate": 0.00019995642284090885,
+      "loss": 0.2727,
+      "step": 562
+    },
+    {
+      "epoch": 0.11824005040428437,
+      "grad_norm": 0.6401339769363403,
+      "learning_rate": 0.00019995539154123529,
+      "loss": 0.2143,
+      "step": 563
+    },
+    {
+      "epoch": 0.11845006825580175,
+      "grad_norm": 0.715313732624054,
+      "learning_rate": 0.00019995434818318567,
+      "loss": 0.2269,
+      "step": 564
+    },
+    {
+      "epoch": 0.11866008610731912,
+      "grad_norm": 0.6071058511734009,
+      "learning_rate": 0.00019995329276688593,
+      "loss": 0.4811,
+      "step": 565
+    },
+    {
+      "epoch": 0.1188701039588365,
+      "grad_norm": 0.882318377494812,
+      "learning_rate": 0.0001999522252924633,
+      "loss": 0.2607,
+      "step": 566
+    },
+    {
+      "epoch": 0.11908012181035388,
+      "grad_norm": 0.4758372902870178,
+      "learning_rate": 0.0001999511457600466,
+      "loss": 0.2013,
+      "step": 567
+    },
+    {
+      "epoch": 0.11929013966187126,
+      "grad_norm": 0.6482694149017334,
+      "learning_rate": 0.00019995005416976604,
+      "loss": 0.2567,
+      "step": 568
+    },
+    {
+      "epoch": 0.11950015751338863,
+      "grad_norm": 0.6689179539680481,
+      "learning_rate": 0.00019994895052175338,
+      "loss": 0.2498,
+      "step": 569
+    },
+    {
+      "epoch": 0.11971017536490601,
+      "grad_norm": 0.5817541480064392,
+      "learning_rate": 0.00019994783481614166,
+      "loss": 0.2013,
+      "step": 570
+    },
+    {
+      "epoch": 0.1199201932164234,
+      "grad_norm": 0.4717533588409424,
+      "learning_rate": 0.00019994670705306554,
+      "loss": 0.2647,
+      "step": 571
+    },
+    {
+      "epoch": 0.12013021106794078,
+      "grad_norm": 0.574079692363739,
+      "learning_rate": 0.00019994556723266103,
+      "loss": 0.1704,
+      "step": 572
+    },
+    {
+      "epoch": 0.12034022891945816,
+      "grad_norm": 0.759425938129425,
+      "learning_rate": 0.00019994441535506569,
+      "loss": 0.208,
+      "step": 573
+    },
+    {
+      "epoch": 0.12055024677097553,
+      "grad_norm": 0.5335227847099304,
+      "learning_rate": 0.0001999432514204184,
+      "loss": 0.2018,
+      "step": 574
+    },
+    {
+      "epoch": 0.12076026462249291,
+      "grad_norm": 0.5595372915267944,
+      "learning_rate": 0.00019994207542885963,
+      "loss": 0.2667,
+      "step": 575
+    },
+    {
+      "epoch": 0.12097028247401029,
+      "grad_norm": 0.6673279404640198,
+      "learning_rate": 0.00019994088738053124,
+      "loss": 0.3175,
+      "step": 576
+    },
+    {
+      "epoch": 0.12118030032552767,
+      "grad_norm": 0.5622591376304626,
+      "learning_rate": 0.0001999396872755766,
+      "loss": 0.1543,
+      "step": 577
+    },
+    {
+      "epoch": 0.12139031817704504,
+      "grad_norm": 0.5784288048744202,
+      "learning_rate": 0.0001999384751141404,
+      "loss": 0.2624,
+      "step": 578
+    },
+    {
+      "epoch": 0.12160033602856243,
+      "grad_norm": 0.6946631669998169,
+      "learning_rate": 0.00019993725089636891,
+      "loss": 0.2469,
+      "step": 579
+    },
+    {
+      "epoch": 0.12181035388007981,
+      "grad_norm": 0.6951069235801697,
+      "learning_rate": 0.00019993601462240984,
+      "loss": 0.2636,
+      "step": 580
+    },
+    {
+      "epoch": 0.12202037173159719,
+      "grad_norm": 0.5134934186935425,
+      "learning_rate": 0.0001999347662924123,
+      "loss": 0.1432,
+      "step": 581
+    },
+    {
+      "epoch": 0.12223038958311457,
+      "grad_norm": 0.5719296932220459,
+      "learning_rate": 0.00019993350590652691,
+      "loss": 0.2388,
+      "step": 582
+    },
+    {
+      "epoch": 0.12244040743463194,
+      "grad_norm": 0.7625638246536255,
+      "learning_rate": 0.0001999322334649057,
+      "loss": 0.2507,
+      "step": 583
+    },
+    {
+      "epoch": 0.12265042528614932,
+      "grad_norm": 0.6974209547042847,
+      "learning_rate": 0.00019993094896770218,
+      "loss": 0.2431,
+      "step": 584
+    },
+    {
+      "epoch": 0.1228604431376667,
+      "grad_norm": 0.7072513699531555,
+      "learning_rate": 0.0001999296524150713,
+      "loss": 0.1745,
+      "step": 585
+    },
+    {
+      "epoch": 0.12307046098918407,
+      "grad_norm": 0.7435344457626343,
+      "learning_rate": 0.00019992834380716946,
+      "loss": 0.216,
+      "step": 586
+    },
+    {
+      "epoch": 0.12328047884070147,
+      "grad_norm": 0.5491403937339783,
+      "learning_rate": 0.00019992702314415461,
+      "loss": 0.1853,
+      "step": 587
+    },
+    {
+      "epoch": 0.12349049669221884,
+      "grad_norm": 0.5487938523292542,
+      "learning_rate": 0.00019992569042618597,
+      "loss": 0.2361,
+      "step": 588
+    },
+    {
+      "epoch": 0.12370051454373622,
+      "grad_norm": 0.4346216320991516,
+      "learning_rate": 0.00019992434565342437,
+      "loss": 0.1812,
+      "step": 589
+    },
+    {
+      "epoch": 0.1239105323952536,
+      "grad_norm": 0.5448020696640015,
+      "learning_rate": 0.00019992298882603202,
+      "loss": 0.2017,
+      "step": 590
+    },
+    {
+      "epoch": 0.12412055024677097,
+      "grad_norm": 0.6867210268974304,
+      "learning_rate": 0.0001999216199441726,
+      "loss": 0.1788,
+      "step": 591
+    },
+    {
+      "epoch": 0.12433056809828835,
+      "grad_norm": 0.6821328401565552,
+      "learning_rate": 0.00019992023900801127,
+      "loss": 0.2159,
+      "step": 592
+    },
+    {
+      "epoch": 0.12454058594980573,
+      "grad_norm": 0.6648369431495667,
+      "learning_rate": 0.0001999188460177146,
+      "loss": 0.223,
+      "step": 593
+    },
+    {
+      "epoch": 0.1247506038013231,
+      "grad_norm": 0.6275060772895813,
+      "learning_rate": 0.00019991744097345068,
+      "loss": 0.2174,
+      "step": 594
+    },
+    {
+      "epoch": 0.1249606216528405,
+      "grad_norm": 0.43622860312461853,
+      "learning_rate": 0.00019991602387538896,
+      "loss": 0.1709,
+      "step": 595
+    },
+    {
+      "epoch": 0.12517063950435786,
+      "grad_norm": 0.41494739055633545,
+      "learning_rate": 0.00019991459472370042,
+      "loss": 0.1615,
+      "step": 596
+    },
+    {
+      "epoch": 0.12538065735587525,
+      "grad_norm": 0.4159907400608063,
+      "learning_rate": 0.00019991315351855748,
+      "loss": 0.1457,
+      "step": 597
+    },
+    {
+      "epoch": 0.12559067520739262,
+      "grad_norm": 0.8123224377632141,
+      "learning_rate": 0.00019991170026013397,
+      "loss": 0.202,
+      "step": 598
+    },
+    {
+      "epoch": 0.12580069305891,
+      "grad_norm": 0.9315401911735535,
+      "learning_rate": 0.00019991023494860522,
+      "loss": 0.2496,
+      "step": 599
+    },
+    {
+      "epoch": 0.1260107109104274,
+      "grad_norm": 0.7999815344810486,
+      "learning_rate": 0.00019990875758414803,
+      "loss": 0.2782,
+      "step": 600
+    },
+    {
+      "epoch": 0.12622072876194476,
+      "grad_norm": 0.5633922815322876,
+      "learning_rate": 0.0001999072681669406,
+      "loss": 0.2276,
+      "step": 601
+    },
+    {
+      "epoch": 0.12643074661346215,
+      "grad_norm": 0.6719483733177185,
+      "learning_rate": 0.00019990576669716258,
+      "loss": 0.3169,
+      "step": 602
+    },
+    {
+      "epoch": 0.12664076446497952,
+      "grad_norm": 0.7311053276062012,
+      "learning_rate": 0.0001999042531749952,
+      "loss": 0.2723,
+      "step": 603
+    },
+    {
+      "epoch": 0.1268507823164969,
+      "grad_norm": 0.5853881239891052,
+      "learning_rate": 0.00019990272760062093,
+      "loss": 0.2869,
+      "step": 604
+    },
+    {
+      "epoch": 0.12706080016801427,
+      "grad_norm": 0.7300302982330322,
+      "learning_rate": 0.0001999011899742239,
+      "loss": 0.2405,
+      "step": 605
+    },
+    {
+      "epoch": 0.12727081801953166,
+      "grad_norm": 0.704954206943512,
+      "learning_rate": 0.00019989964029598953,
+      "loss": 0.3195,
+      "step": 606
+    },
+    {
+      "epoch": 0.12748083587104905,
+      "grad_norm": 0.6305354833602905,
+      "learning_rate": 0.00019989807856610482,
+      "loss": 0.2442,
+      "step": 607
+    },
+    {
+      "epoch": 0.12769085372256642,
+      "grad_norm": 0.5027151107788086,
+      "learning_rate": 0.0001998965047847582,
+      "loss": 0.3006,
+      "step": 608
+    },
+    {
+      "epoch": 0.1279008715740838,
+      "grad_norm": 0.6237658262252808,
+      "learning_rate": 0.00019989491895213948,
+      "loss": 0.2019,
+      "step": 609
+    },
+    {
+      "epoch": 0.12811088942560117,
+      "grad_norm": 0.6959155797958374,
+      "learning_rate": 0.00019989332106844,
+      "loss": 0.2142,
+      "step": 610
+    },
+    {
+      "epoch": 0.12832090727711856,
+      "grad_norm": 0.7905144095420837,
+      "learning_rate": 0.0001998917111338525,
+      "loss": 0.2599,
+      "step": 611
+    },
+    {
+      "epoch": 0.12853092512863593,
+      "grad_norm": 0.7247504591941833,
+      "learning_rate": 0.00019989008914857116,
+      "loss": 0.2679,
+      "step": 612
+    },
+    {
+      "epoch": 0.12874094298015332,
+      "grad_norm": 0.5282559990882874,
+      "learning_rate": 0.0001998884551127917,
+      "loss": 0.2125,
+      "step": 613
+    },
+    {
+      "epoch": 0.12895096083167068,
+      "grad_norm": 0.6418580412864685,
+      "learning_rate": 0.0001998868090267113,
+      "loss": 0.2185,
+      "step": 614
+    },
+    {
+      "epoch": 0.12916097868318807,
+      "grad_norm": 0.48245900869369507,
+      "learning_rate": 0.00019988515089052844,
+      "loss": 0.2175,
+      "step": 615
+    },
+    {
+      "epoch": 0.12937099653470546,
+      "grad_norm": 0.4887724220752716,
+      "learning_rate": 0.00019988348070444322,
+      "loss": 0.1777,
+      "step": 616
+    },
+    {
+      "epoch": 0.12958101438622283,
+      "grad_norm": 0.5296192169189453,
+      "learning_rate": 0.0001998817984686571,
+      "loss": 0.2344,
+      "step": 617
+    },
+    {
+      "epoch": 0.12979103223774022,
+      "grad_norm": 0.6658729314804077,
+      "learning_rate": 0.00019988010418337305,
+      "loss": 0.2322,
+      "step": 618
+    },
+    {
+      "epoch": 0.13000105008925758,
+      "grad_norm": 0.5744292736053467,
+      "learning_rate": 0.0001998783978487954,
+      "loss": 0.2156,
+      "step": 619
+    },
+    {
+      "epoch": 0.13021106794077497,
+      "grad_norm": 0.5000370144844055,
+      "learning_rate": 0.00019987667946513006,
+      "loss": 0.2319,
+      "step": 620
+    },
+    {
+      "epoch": 0.13042108579229234,
+      "grad_norm": 0.8539411425590515,
+      "learning_rate": 0.00019987494903258432,
+      "loss": 0.3729,
+      "step": 621
+    },
+    {
+      "epoch": 0.13063110364380973,
+      "grad_norm": 0.6094825267791748,
+      "learning_rate": 0.00019987320655136693,
+      "loss": 0.2171,
+      "step": 622
+    },
+    {
+      "epoch": 0.1308411214953271,
+      "grad_norm": 0.6408823728561401,
+      "learning_rate": 0.00019987145202168805,
+      "loss": 0.2658,
+      "step": 623
+    },
+    {
+      "epoch": 0.13105113934684448,
+      "grad_norm": 0.5738769769668579,
+      "learning_rate": 0.0001998696854437594,
+      "loss": 0.2127,
+      "step": 624
+    },
+    {
+      "epoch": 0.13126115719836187,
+      "grad_norm": 0.6330286860466003,
+      "learning_rate": 0.00019986790681779412,
+      "loss": 0.1503,
+      "step": 625
+    },
+    {
+      "epoch": 0.13147117504987924,
+      "grad_norm": 0.8125373125076294,
+      "learning_rate": 0.0001998661161440067,
+      "loss": 0.2741,
+      "step": 626
+    },
+    {
+      "epoch": 0.13168119290139663,
+      "grad_norm": 0.710121750831604,
+      "learning_rate": 0.00019986431342261323,
+      "loss": 0.2672,
+      "step": 627
+    },
+    {
+      "epoch": 0.131891210752914,
+      "grad_norm": 0.8024762868881226,
+      "learning_rate": 0.00019986249865383115,
+      "loss": 0.2818,
+      "step": 628
+    },
+    {
+      "epoch": 0.13210122860443138,
+      "grad_norm": 1.0455816984176636,
+      "learning_rate": 0.0001998606718378794,
+      "loss": 0.3204,
+      "step": 629
+    },
+    {
+      "epoch": 0.13231124645594874,
+      "grad_norm": 0.7923910617828369,
+      "learning_rate": 0.00019985883297497835,
+      "loss": 0.213,
+      "step": 630
+    },
+    {
+      "epoch": 0.13252126430746614,
+      "grad_norm": 0.7458345890045166,
+      "learning_rate": 0.00019985698206534985,
+      "loss": 0.2066,
+      "step": 631
+    },
+    {
+      "epoch": 0.13273128215898353,
+      "grad_norm": 0.6166645884513855,
+      "learning_rate": 0.0001998551191092172,
+      "loss": 0.2239,
+      "step": 632
+    },
+    {
+      "epoch": 0.1329413000105009,
+      "grad_norm": 0.7050312161445618,
+      "learning_rate": 0.00019985324410680514,
+      "loss": 0.2692,
+      "step": 633
+    },
+    {
+      "epoch": 0.13315131786201828,
+      "grad_norm": 0.6465736627578735,
+      "learning_rate": 0.00019985135705833984,
+      "loss": 0.235,
+      "step": 634
+    },
+    {
+      "epoch": 0.13336133571353564,
+      "grad_norm": 0.6108490228652954,
+      "learning_rate": 0.00019984945796404894,
+      "loss": 0.2472,
+      "step": 635
+    },
+    {
+      "epoch": 0.13357135356505304,
+      "grad_norm": 0.725173830986023,
+      "learning_rate": 0.00019984754682416157,
+      "loss": 0.2521,
+      "step": 636
+    },
+    {
+      "epoch": 0.1337813714165704,
+      "grad_norm": 0.5391446352005005,
+      "learning_rate": 0.00019984562363890832,
+      "loss": 0.2151,
+      "step": 637
+    },
+    {
+      "epoch": 0.1339913892680878,
+      "grad_norm": 0.44177114963531494,
+      "learning_rate": 0.00019984368840852114,
+      "loss": 0.179,
+      "step": 638
+    },
+    {
+      "epoch": 0.13420140711960515,
+      "grad_norm": 0.48038744926452637,
+      "learning_rate": 0.00019984174113323353,
+      "loss": 0.2296,
+      "step": 639
+    },
+    {
+      "epoch": 0.13441142497112255,
+      "grad_norm": 0.5720350742340088,
+      "learning_rate": 0.00019983978181328037,
+      "loss": 0.1843,
+      "step": 640
+    },
+    {
+      "epoch": 0.13462144282263994,
+      "grad_norm": 0.4996393322944641,
+      "learning_rate": 0.00019983781044889803,
+      "loss": 0.225,
+      "step": 641
+    },
+    {
+      "epoch": 0.1348314606741573,
+      "grad_norm": 0.5970807671546936,
+      "learning_rate": 0.00019983582704032434,
+      "loss": 0.179,
+      "step": 642
+    },
+    {
+      "epoch": 0.1350414785256747,
+      "grad_norm": 0.858808159828186,
+      "learning_rate": 0.0001998338315877986,
+      "loss": 0.246,
+      "step": 643
+    },
+    {
+      "epoch": 0.13525149637719205,
+      "grad_norm": 0.6708926558494568,
+      "learning_rate": 0.0001998318240915615,
+      "loss": 0.197,
+      "step": 644
+    },
+    {
+      "epoch": 0.13546151422870945,
+      "grad_norm": 0.8443548083305359,
+      "learning_rate": 0.00019982980455185526,
+      "loss": 0.1889,
+      "step": 645
+    },
+    {
+      "epoch": 0.1356715320802268,
+      "grad_norm": 0.6451512575149536,
+      "learning_rate": 0.00019982777296892346,
+      "loss": 0.2103,
+      "step": 646
+    },
+    {
+      "epoch": 0.1358815499317442,
+      "grad_norm": 0.7601468563079834,
+      "learning_rate": 0.00019982572934301122,
+      "loss": 0.2338,
+      "step": 647
+    },
+    {
+      "epoch": 0.1360915677832616,
+      "grad_norm": 0.5944762229919434,
+      "learning_rate": 0.00019982367367436506,
+      "loss": 0.1814,
+      "step": 648
+    },
+    {
+      "epoch": 0.13630158563477895,
+      "grad_norm": 0.7542382478713989,
+      "learning_rate": 0.00019982160596323297,
+      "loss": 0.2062,
+      "step": 649
+    },
+    {
+      "epoch": 0.13651160348629635,
+      "grad_norm": 0.560296893119812,
+      "learning_rate": 0.00019981952620986442,
+      "loss": 0.199,
+      "step": 650
+    },
+    {
+      "epoch": 0.1367216213378137,
+      "grad_norm": 0.5254395604133606,
+      "learning_rate": 0.0001998174344145103,
+      "loss": 0.2943,
+      "step": 651
+    },
+    {
+      "epoch": 0.1369316391893311,
+      "grad_norm": 0.6042603254318237,
+      "learning_rate": 0.00019981533057742294,
+      "loss": 0.2355,
+      "step": 652
+    },
+    {
+      "epoch": 0.13714165704084846,
+      "grad_norm": 0.6384417414665222,
+      "learning_rate": 0.00019981321469885615,
+      "loss": 0.202,
+      "step": 653
+    },
+    {
+      "epoch": 0.13735167489236585,
+      "grad_norm": 0.7300348877906799,
+      "learning_rate": 0.0001998110867790652,
+      "loss": 0.2422,
+      "step": 654
+    },
+    {
+      "epoch": 0.13756169274388322,
+      "grad_norm": 0.5238686800003052,
+      "learning_rate": 0.00019980894681830678,
+      "loss": 0.2491,
+      "step": 655
+    },
+    {
+      "epoch": 0.1377717105954006,
+      "grad_norm": 0.7352842092514038,
+      "learning_rate": 0.00019980679481683904,
+      "loss": 0.3193,
+      "step": 656
+    },
+    {
+      "epoch": 0.137981728446918,
+      "grad_norm": 0.6651904582977295,
+      "learning_rate": 0.0001998046307749216,
+      "loss": 0.2618,
+      "step": 657
+    },
+    {
+      "epoch": 0.13819174629843536,
+      "grad_norm": 0.6976970434188843,
+      "learning_rate": 0.00019980245469281553,
+      "loss": 0.2622,
+      "step": 658
+    },
+    {
+      "epoch": 0.13840176414995276,
+      "grad_norm": 0.6078370809555054,
+      "learning_rate": 0.00019980026657078336,
+      "loss": 0.2532,
+      "step": 659
+    },
+    {
+      "epoch": 0.13861178200147012,
+      "grad_norm": 0.7155233025550842,
+      "learning_rate": 0.00019979806640908906,
+      "loss": 0.3283,
+      "step": 660
+    },
+    {
+      "epoch": 0.1388217998529875,
+      "grad_norm": 0.519636869430542,
+      "learning_rate": 0.00019979585420799802,
+      "loss": 0.187,
+      "step": 661
+    },
+    {
+      "epoch": 0.13903181770450487,
+      "grad_norm": 0.7007790803909302,
+      "learning_rate": 0.00019979362996777714,
+      "loss": 0.2554,
+      "step": 662
+    },
+    {
+      "epoch": 0.13924183555602226,
+      "grad_norm": 0.6281614303588867,
+      "learning_rate": 0.00019979139368869473,
+      "loss": 0.2153,
+      "step": 663
+    },
+    {
+      "epoch": 0.13945185340753963,
+      "grad_norm": 0.5729889869689941,
+      "learning_rate": 0.00019978914537102055,
+      "loss": 0.2432,
+      "step": 664
+    },
+    {
+      "epoch": 0.13966187125905702,
+      "grad_norm": 0.4995453357696533,
+      "learning_rate": 0.00019978688501502592,
+      "loss": 0.1931,
+      "step": 665
+    },
+    {
+      "epoch": 0.1398718891105744,
+      "grad_norm": 0.48151615262031555,
+      "learning_rate": 0.00019978461262098343,
+      "loss": 0.1664,
+      "step": 666
+    },
+    {
+      "epoch": 0.14008190696209177,
+      "grad_norm": 0.6951011419296265,
+      "learning_rate": 0.00019978232818916727,
+      "loss": 0.2229,
+      "step": 667
+    },
+    {
+      "epoch": 0.14029192481360916,
+      "grad_norm": 0.5914542078971863,
+      "learning_rate": 0.000199780031719853,
+      "loss": 0.2126,
+      "step": 668
+    },
+    {
+      "epoch": 0.14050194266512653,
+      "grad_norm": 0.7551674246788025,
+      "learning_rate": 0.00019977772321331765,
+      "loss": 0.2806,
+      "step": 669
+    },
+    {
+      "epoch": 0.14071196051664392,
+      "grad_norm": 0.7960730195045471,
+      "learning_rate": 0.00019977540266983976,
+      "loss": 0.2653,
+      "step": 670
+    },
+    {
+      "epoch": 0.14092197836816128,
+      "grad_norm": 0.5545317530632019,
+      "learning_rate": 0.00019977307008969922,
+      "loss": 0.2141,
+      "step": 671
+    },
+    {
+      "epoch": 0.14113199621967867,
+      "grad_norm": 0.7467978596687317,
+      "learning_rate": 0.0001997707254731775,
+      "loss": 0.1961,
+      "step": 672
+    },
+    {
+      "epoch": 0.14134201407119606,
+      "grad_norm": 0.6775459051132202,
+      "learning_rate": 0.00019976836882055736,
+      "loss": 0.2304,
+      "step": 673
+    },
+    {
+      "epoch": 0.14155203192271343,
+      "grad_norm": 0.793547511100769,
+      "learning_rate": 0.00019976600013212317,
+      "loss": 0.2266,
+      "step": 674
+    },
+    {
+      "epoch": 0.14176204977423082,
+      "grad_norm": 0.6920728087425232,
+      "learning_rate": 0.00019976361940816063,
+      "loss": 0.3469,
+      "step": 675
+    },
+    {
+      "epoch": 0.14197206762574818,
+      "grad_norm": 0.840145468711853,
+      "learning_rate": 0.000199761226648957,
+      "loss": 0.235,
+      "step": 676
+    },
+    {
+      "epoch": 0.14218208547726557,
+      "grad_norm": 0.8047716617584229,
+      "learning_rate": 0.0001997588218548009,
+      "loss": 0.3034,
+      "step": 677
+    },
+    {
+      "epoch": 0.14239210332878294,
+      "grad_norm": 0.649042010307312,
+      "learning_rate": 0.00019975640502598244,
+      "loss": 0.2822,
+      "step": 678
+    },
+    {
+      "epoch": 0.14260212118030033,
+      "grad_norm": 0.6780881881713867,
+      "learning_rate": 0.0001997539761627932,
+      "loss": 0.1593,
+      "step": 679
+    },
+    {
+      "epoch": 0.1428121390318177,
+      "grad_norm": 0.6812571883201599,
+      "learning_rate": 0.00019975153526552615,
+      "loss": 0.1898,
+      "step": 680
+    },
+    {
+      "epoch": 0.14302215688333508,
+      "grad_norm": 0.5687631368637085,
+      "learning_rate": 0.0001997490823344758,
+      "loss": 0.2193,
+      "step": 681
+    },
+    {
+      "epoch": 0.14323217473485247,
+      "grad_norm": 0.8981772065162659,
+      "learning_rate": 0.00019974661736993804,
+      "loss": 0.2785,
+      "step": 682
+    },
+    {
+      "epoch": 0.14344219258636984,
+      "grad_norm": 0.6966889500617981,
+      "learning_rate": 0.00019974414037221027,
+      "loss": 0.2678,
+      "step": 683
+    },
+    {
+      "epoch": 0.14365221043788723,
+      "grad_norm": 0.5631129145622253,
+      "learning_rate": 0.00019974165134159126,
+      "loss": 0.2836,
+      "step": 684
+    },
+    {
+      "epoch": 0.1438622282894046,
+      "grad_norm": 0.7686763405799866,
+      "learning_rate": 0.00019973915027838134,
+      "loss": 0.2372,
+      "step": 685
+    },
+    {
+      "epoch": 0.14407224614092198,
+      "grad_norm": 0.881515383720398,
+      "learning_rate": 0.00019973663718288217,
+      "loss": 0.2901,
+      "step": 686
+    },
+    {
+      "epoch": 0.14428226399243935,
+      "grad_norm": 0.6973896026611328,
+      "learning_rate": 0.00019973411205539694,
+      "loss": 0.2577,
+      "step": 687
+    },
+    {
+      "epoch": 0.14449228184395674,
+      "grad_norm": 0.3398377299308777,
+      "learning_rate": 0.0001997315748962303,
+      "loss": 0.1305,
+      "step": 688
+    },
+    {
+      "epoch": 0.1447022996954741,
+      "grad_norm": 0.6775567531585693,
+      "learning_rate": 0.0001997290257056883,
+      "loss": 0.3746,
+      "step": 689
+    },
+    {
+      "epoch": 0.1449123175469915,
+      "grad_norm": 0.3776891827583313,
+      "learning_rate": 0.0001997264644840785,
+      "loss": 0.1731,
+      "step": 690
+    },
+    {
+      "epoch": 0.14512233539850888,
+      "grad_norm": 0.6515337824821472,
+      "learning_rate": 0.00019972389123170986,
+      "loss": 0.2596,
+      "step": 691
+    },
+    {
+      "epoch": 0.14533235325002625,
+      "grad_norm": 0.7165318131446838,
+      "learning_rate": 0.00019972130594889286,
+      "loss": 0.2673,
+      "step": 692
+    },
+    {
+      "epoch": 0.14554237110154364,
+      "grad_norm": 0.5702444314956665,
+      "learning_rate": 0.00019971870863593925,
+      "loss": 0.1928,
+      "step": 693
+    },
+    {
+      "epoch": 0.145752388953061,
+      "grad_norm": 0.3542981743812561,
+      "learning_rate": 0.0001997160992931625,
+      "loss": 0.1277,
+      "step": 694
+    },
+    {
+      "epoch": 0.1459624068045784,
+      "grad_norm": 0.6520780920982361,
+      "learning_rate": 0.00019971347792087732,
+      "loss": 0.2623,
+      "step": 695
+    },
+    {
+      "epoch": 0.14617242465609576,
+      "grad_norm": 0.4505969285964966,
+      "learning_rate": 0.00019971084451939997,
+      "loss": 0.2026,
+      "step": 696
+    },
+    {
+      "epoch": 0.14638244250761315,
+      "grad_norm": 0.623036801815033,
+      "learning_rate": 0.00019970819908904814,
+      "loss": 0.2371,
+      "step": 697
+    },
+    {
+      "epoch": 0.14659246035913054,
+      "grad_norm": 0.60871422290802,
+      "learning_rate": 0.00019970554163014097,
+      "loss": 0.3128,
+      "step": 698
+    },
+    {
+      "epoch": 0.1468024782106479,
+      "grad_norm": 0.6321155428886414,
+      "learning_rate": 0.00019970287214299902,
+      "loss": 0.2183,
+      "step": 699
+    },
+    {
+      "epoch": 0.1470124960621653,
+      "grad_norm": 0.7513316869735718,
+      "learning_rate": 0.0001997001906279444,
+      "loss": 0.2753,
+      "step": 700
+    },
+    {
+      "epoch": 0.14722251391368266,
+      "grad_norm": 0.4192676842212677,
+      "learning_rate": 0.0001996974970853005,
+      "loss": 0.3071,
+      "step": 701
+    },
+    {
+      "epoch": 0.14743253176520005,
+      "grad_norm": 0.5773706436157227,
+      "learning_rate": 0.00019969479151539236,
+      "loss": 0.2883,
+      "step": 702
+    },
+    {
+      "epoch": 0.1476425496167174,
+      "grad_norm": 0.4587963819503784,
+      "learning_rate": 0.00019969207391854632,
+      "loss": 0.2997,
+      "step": 703
+    },
+    {
+      "epoch": 0.1478525674682348,
+      "grad_norm": 0.6077782511711121,
+      "learning_rate": 0.00019968934429509023,
+      "loss": 0.182,
+      "step": 704
+    },
+    {
+      "epoch": 0.14806258531975217,
+      "grad_norm": 0.6285839676856995,
+      "learning_rate": 0.0001996866026453534,
+      "loss": 0.3573,
+      "step": 705
+    },
+    {
+      "epoch": 0.14827260317126956,
+      "grad_norm": 0.7416669726371765,
+      "learning_rate": 0.00019968384896966657,
+      "loss": 0.2424,
+      "step": 706
+    },
+    {
+      "epoch": 0.14848262102278695,
+      "grad_norm": 0.5475688576698303,
+      "learning_rate": 0.0001996810832683619,
+      "loss": 0.1766,
+      "step": 707
+    },
+    {
+      "epoch": 0.1486926388743043,
+      "grad_norm": 0.5601086020469666,
+      "learning_rate": 0.00019967830554177312,
+      "loss": 0.2725,
+      "step": 708
+    },
+    {
+      "epoch": 0.1489026567258217,
+      "grad_norm": 0.7686034440994263,
+      "learning_rate": 0.00019967551579023524,
+      "loss": 0.3008,
+      "step": 709
+    },
+    {
+      "epoch": 0.14911267457733907,
+      "grad_norm": 0.8172418475151062,
+      "learning_rate": 0.00019967271401408486,
+      "loss": 0.3042,
+      "step": 710
+    },
+    {
+      "epoch": 0.14932269242885646,
+      "grad_norm": 0.8726032972335815,
+      "learning_rate": 0.00019966990021366,
+      "loss": 0.224,
+      "step": 711
+    },
+    {
+      "epoch": 0.14953271028037382,
+      "grad_norm": 0.6053635478019714,
+      "learning_rate": 0.00019966707438930003,
+      "loss": 0.2325,
+      "step": 712
+    },
+    {
+      "epoch": 0.1497427281318912,
+      "grad_norm": 0.7157438397407532,
+      "learning_rate": 0.00019966423654134592,
+      "loss": 0.2656,
+      "step": 713
+    },
+    {
+      "epoch": 0.1499527459834086,
+      "grad_norm": 0.6943267583847046,
+      "learning_rate": 0.00019966138667014,
+      "loss": 0.2625,
+      "step": 714
+    },
+    {
+      "epoch": 0.15016276383492597,
+      "grad_norm": 0.7070578336715698,
+      "learning_rate": 0.00019965852477602604,
+      "loss": 0.2795,
+      "step": 715
+    },
+    {
+      "epoch": 0.15037278168644336,
+      "grad_norm": 0.654684841632843,
+      "learning_rate": 0.00019965565085934935,
+      "loss": 0.2168,
+      "step": 716
+    },
+    {
+      "epoch": 0.15058279953796072,
+      "grad_norm": 0.5972804427146912,
+      "learning_rate": 0.00019965276492045662,
+      "loss": 0.2337,
+      "step": 717
+    },
+    {
+      "epoch": 0.1507928173894781,
+      "grad_norm": 0.4990095794200897,
+      "learning_rate": 0.000199649866959696,
+      "loss": 0.3187,
+      "step": 718
+    },
+    {
+      "epoch": 0.15100283524099548,
+      "grad_norm": 0.6247003078460693,
+      "learning_rate": 0.00019964695697741703,
+      "loss": 0.2139,
+      "step": 719
+    },
+    {
+      "epoch": 0.15121285309251287,
+      "grad_norm": 0.6358337998390198,
+      "learning_rate": 0.00019964403497397084,
+      "loss": 0.244,
+      "step": 720
+    },
+    {
+      "epoch": 0.15142287094403023,
+      "grad_norm": 0.5211917161941528,
+      "learning_rate": 0.0001996411009497099,
+      "loss": 0.1784,
+      "step": 721
+    },
+    {
+      "epoch": 0.15163288879554762,
+      "grad_norm": 0.464606374502182,
+      "learning_rate": 0.00019963815490498817,
+      "loss": 0.2137,
+      "step": 722
+    },
+    {
+      "epoch": 0.151842906647065,
+      "grad_norm": 0.7099301815032959,
+      "learning_rate": 0.00019963519684016107,
+      "loss": 0.2927,
+      "step": 723
+    },
+    {
+      "epoch": 0.15205292449858238,
+      "grad_norm": 0.7805564999580383,
+      "learning_rate": 0.00019963222675558543,
+      "loss": 0.2374,
+      "step": 724
+    },
+    {
+      "epoch": 0.15226294235009977,
+      "grad_norm": 0.6172361373901367,
+      "learning_rate": 0.00019962924465161957,
+      "loss": 0.201,
+      "step": 725
+    },
+    {
+      "epoch": 0.15247296020161713,
+      "grad_norm": 0.6261605620384216,
+      "learning_rate": 0.0001996262505286232,
+      "loss": 0.1709,
+      "step": 726
+    },
+    {
+      "epoch": 0.15268297805313452,
+      "grad_norm": 0.6561511158943176,
+      "learning_rate": 0.00019962324438695762,
+      "loss": 0.2283,
+      "step": 727
+    },
+    {
+      "epoch": 0.15289299590465188,
+      "grad_norm": 0.5386349558830261,
+      "learning_rate": 0.0001996202262269854,
+      "loss": 0.231,
+      "step": 728
+    },
+    {
+      "epoch": 0.15310301375616928,
+      "grad_norm": 0.644136369228363,
+      "learning_rate": 0.00019961719604907066,
+      "loss": 0.1875,
+      "step": 729
+    },
+    {
+      "epoch": 0.15331303160768664,
+      "grad_norm": 0.6452980041503906,
+      "learning_rate": 0.00019961415385357897,
+      "loss": 0.2294,
+      "step": 730
+    },
+    {
+      "epoch": 0.15352304945920403,
+      "grad_norm": 0.5558809041976929,
+      "learning_rate": 0.0001996110996408773,
+      "loss": 0.1988,
+      "step": 731
+    },
+    {
+      "epoch": 0.15373306731072142,
+      "grad_norm": 0.6049979329109192,
+      "learning_rate": 0.00019960803341133413,
+      "loss": 0.2368,
+      "step": 732
+    },
+    {
+      "epoch": 0.15394308516223879,
+      "grad_norm": 0.6450143456459045,
+      "learning_rate": 0.00019960495516531935,
+      "loss": 0.2217,
+      "step": 733
+    },
+    {
+      "epoch": 0.15415310301375618,
+      "grad_norm": 0.6582781672477722,
+      "learning_rate": 0.00019960186490320436,
+      "loss": 0.1942,
+      "step": 734
+    },
+    {
+      "epoch": 0.15436312086527354,
+      "grad_norm": 0.5160269141197205,
+      "learning_rate": 0.0001995987626253619,
+      "loss": 0.1723,
+      "step": 735
+    },
+    {
+      "epoch": 0.15457313871679093,
+      "grad_norm": 0.6058139801025391,
+      "learning_rate": 0.00019959564833216625,
+      "loss": 0.2089,
+      "step": 736
+    },
+    {
+      "epoch": 0.1547831565683083,
+      "grad_norm": 0.540282666683197,
+      "learning_rate": 0.0001995925220239931,
+      "loss": 0.2089,
+      "step": 737
+    },
+    {
+      "epoch": 0.15499317441982569,
+      "grad_norm": 0.7635892033576965,
+      "learning_rate": 0.0001995893837012196,
+      "loss": 0.2825,
+      "step": 738
+    },
+    {
+      "epoch": 0.15520319227134308,
+      "grad_norm": 0.5233755111694336,
+      "learning_rate": 0.00019958623336422434,
+      "loss": 0.1514,
+      "step": 739
+    },
+    {
+      "epoch": 0.15541321012286044,
+      "grad_norm": 0.44758716225624084,
+      "learning_rate": 0.00019958307101338742,
+      "loss": 0.132,
+      "step": 740
+    },
+    {
+      "epoch": 0.15562322797437783,
+      "grad_norm": 0.7145951390266418,
+      "learning_rate": 0.00019957989664909026,
+      "loss": 0.2395,
+      "step": 741
+    },
+    {
+      "epoch": 0.1558332458258952,
+      "grad_norm": 0.6241814494132996,
+      "learning_rate": 0.0001995767102717159,
+      "loss": 0.2255,
+      "step": 742
+    },
+    {
+      "epoch": 0.15604326367741259,
+      "grad_norm": 0.502863883972168,
+      "learning_rate": 0.00019957351188164865,
+      "loss": 0.1941,
+      "step": 743
+    },
+    {
+      "epoch": 0.15625328152892995,
+      "grad_norm": 0.5572714805603027,
+      "learning_rate": 0.00019957030147927442,
+      "loss": 0.1664,
+      "step": 744
+    },
+    {
+      "epoch": 0.15646329938044734,
+      "grad_norm": 1.0500191450119019,
+      "learning_rate": 0.00019956707906498044,
+      "loss": 0.3229,
+      "step": 745
+    },
+    {
+      "epoch": 0.1566733172319647,
+      "grad_norm": 0.595522403717041,
+      "learning_rate": 0.0001995638446391555,
+      "loss": 0.1932,
+      "step": 746
+    },
+    {
+      "epoch": 0.1568833350834821,
+      "grad_norm": 0.38818204402923584,
+      "learning_rate": 0.00019956059820218982,
+      "loss": 0.1324,
+      "step": 747
+    },
+    {
+      "epoch": 0.1570933529349995,
+      "grad_norm": 0.6705027222633362,
+      "learning_rate": 0.000199557339754475,
+      "loss": 0.194,
+      "step": 748
+    },
+    {
+      "epoch": 0.15730337078651685,
+      "grad_norm": 0.4935189485549927,
+      "learning_rate": 0.0001995540692964041,
+      "loss": 0.2492,
+      "step": 749
+    },
+    {
+      "epoch": 0.15751338863803424,
+      "grad_norm": 0.3950806260108948,
+      "learning_rate": 0.00019955078682837174,
+      "loss": 0.1331,
+      "step": 750
+    },
+    {
+      "epoch": 0.1577234064895516,
+      "grad_norm": 0.6625058650970459,
+      "learning_rate": 0.00019954749235077384,
+      "loss": 0.297,
+      "step": 751
+    },
+    {
+      "epoch": 0.157933424341069,
+      "grad_norm": 0.5862818956375122,
+      "learning_rate": 0.00019954418586400787,
+      "loss": 0.2628,
+      "step": 752
+    },
+    {
+      "epoch": 0.15814344219258636,
+      "grad_norm": 0.6951611042022705,
+      "learning_rate": 0.0001995408673684727,
+      "loss": 0.2573,
+      "step": 753
+    },
+    {
+      "epoch": 0.15835346004410375,
+      "grad_norm": 0.8030470013618469,
+      "learning_rate": 0.0001995375368645687,
+      "loss": 0.2671,
+      "step": 754
+    },
+    {
+      "epoch": 0.15856347789562114,
+      "grad_norm": 0.4509555995464325,
+      "learning_rate": 0.00019953419435269764,
+      "loss": 0.1808,
+      "step": 755
+    },
+    {
+      "epoch": 0.1587734957471385,
+      "grad_norm": 0.7687417268753052,
+      "learning_rate": 0.0001995308398332627,
+      "loss": 0.2906,
+      "step": 756
+    },
+    {
+      "epoch": 0.1589835135986559,
+      "grad_norm": 0.7642715573310852,
+      "learning_rate": 0.00019952747330666867,
+      "loss": 0.3541,
+      "step": 757
+    },
+    {
+      "epoch": 0.15919353145017326,
+      "grad_norm": 0.6699778437614441,
+      "learning_rate": 0.00019952409477332156,
+      "loss": 0.2494,
+      "step": 758
+    },
+    {
+      "epoch": 0.15940354930169065,
+      "grad_norm": 0.7119278907775879,
+      "learning_rate": 0.00019952070423362903,
+      "loss": 0.1994,
+      "step": 759
+    },
+    {
+      "epoch": 0.159613567153208,
+      "grad_norm": 0.6130563616752625,
+      "learning_rate": 0.00019951730168800004,
+      "loss": 0.3433,
+      "step": 760
+    },
+    {
+      "epoch": 0.1598235850047254,
+      "grad_norm": 0.692933201789856,
+      "learning_rate": 0.00019951388713684514,
+      "loss": 0.1762,
+      "step": 761
+    },
+    {
+      "epoch": 0.16003360285624277,
+      "grad_norm": 0.5561717748641968,
+      "learning_rate": 0.00019951046058057622,
+      "loss": 0.2266,
+      "step": 762
+    },
+    {
+      "epoch": 0.16024362070776016,
+      "grad_norm": 0.8559679388999939,
+      "learning_rate": 0.00019950702201960665,
+      "loss": 0.3145,
+      "step": 763
+    },
+    {
+      "epoch": 0.16045363855927755,
+      "grad_norm": 0.7173314094543457,
+      "learning_rate": 0.00019950357145435122,
+      "loss": 0.2079,
+      "step": 764
+    },
+    {
+      "epoch": 0.16066365641079491,
+      "grad_norm": 0.4696892201900482,
+      "learning_rate": 0.00019950010888522625,
+      "loss": 0.2374,
+      "step": 765
+    },
+    {
+      "epoch": 0.1608736742623123,
+      "grad_norm": 0.5349077582359314,
+      "learning_rate": 0.00019949663431264943,
+      "loss": 0.2221,
+      "step": 766
+    },
+    {
+      "epoch": 0.16108369211382967,
+      "grad_norm": 0.49449819326400757,
+      "learning_rate": 0.0001994931477370399,
+      "loss": 0.1432,
+      "step": 767
+    },
+    {
+      "epoch": 0.16129370996534706,
+      "grad_norm": 0.652260422706604,
+      "learning_rate": 0.00019948964915881835,
+      "loss": 0.2122,
+      "step": 768
+    },
+    {
+      "epoch": 0.16150372781686442,
+      "grad_norm": 0.6549475789070129,
+      "learning_rate": 0.00019948613857840672,
+      "loss": 0.3484,
+      "step": 769
+    },
+    {
+      "epoch": 0.16171374566838181,
+      "grad_norm": 0.6772179007530212,
+      "learning_rate": 0.00019948261599622865,
+      "loss": 0.2784,
+      "step": 770
+    },
+    {
+      "epoch": 0.16192376351989918,
+      "grad_norm": 0.788960337638855,
+      "learning_rate": 0.00019947908141270898,
+      "loss": 0.1939,
+      "step": 771
+    },
+    {
+      "epoch": 0.16213378137141657,
+      "grad_norm": 0.6915500164031982,
+      "learning_rate": 0.00019947553482827418,
+      "loss": 0.1541,
+      "step": 772
+    },
+    {
+      "epoch": 0.16234379922293396,
+      "grad_norm": 0.604015052318573,
+      "learning_rate": 0.0001994719762433521,
+      "loss": 0.2148,
+      "step": 773
+    },
+    {
+      "epoch": 0.16255381707445132,
+      "grad_norm": 0.8275285959243774,
+      "learning_rate": 0.00019946840565837203,
+      "loss": 0.2808,
+      "step": 774
+    },
+    {
+      "epoch": 0.16276383492596871,
+      "grad_norm": 0.6737775802612305,
+      "learning_rate": 0.00019946482307376472,
+      "loss": 0.1813,
+      "step": 775
+    },
+    {
+      "epoch": 0.16297385277748608,
+      "grad_norm": 0.8311626315116882,
+      "learning_rate": 0.0001994612284899623,
+      "loss": 0.2819,
+      "step": 776
+    },
+    {
+      "epoch": 0.16318387062900347,
+      "grad_norm": 0.7368951439857483,
+      "learning_rate": 0.00019945762190739852,
+      "loss": 0.2619,
+      "step": 777
+    },
+    {
+      "epoch": 0.16339388848052083,
+      "grad_norm": 0.6095349788665771,
+      "learning_rate": 0.0001994540033265084,
+      "loss": 0.2449,
+      "step": 778
+    },
+    {
+      "epoch": 0.16360390633203822,
+      "grad_norm": 0.6738486886024475,
+      "learning_rate": 0.0001994503727477285,
+      "loss": 0.1493,
+      "step": 779
+    },
+    {
+      "epoch": 0.16381392418355561,
+      "grad_norm": 0.5636208653450012,
+      "learning_rate": 0.0001994467301714968,
+      "loss": 0.1949,
+      "step": 780
+    },
+    {
+      "epoch": 0.16402394203507298,
+      "grad_norm": 0.9404299259185791,
+      "learning_rate": 0.00019944307559825272,
+      "loss": 0.2503,
+      "step": 781
+    },
+    {
+      "epoch": 0.16423395988659037,
+      "grad_norm": 0.6188719868659973,
+      "learning_rate": 0.0001994394090284372,
+      "loss": 0.1658,
+      "step": 782
+    },
+    {
+      "epoch": 0.16444397773810773,
+      "grad_norm": 0.9498090147972107,
+      "learning_rate": 0.00019943573046249244,
+      "loss": 0.3425,
+      "step": 783
+    },
+    {
+      "epoch": 0.16465399558962512,
+      "grad_norm": 0.6508981585502625,
+      "learning_rate": 0.00019943203990086233,
+      "loss": 0.1384,
+      "step": 784
+    },
+    {
+      "epoch": 0.1648640134411425,
+      "grad_norm": 1.0658531188964844,
+      "learning_rate": 0.00019942833734399202,
+      "loss": 0.2609,
+      "step": 785
+    },
+    {
+      "epoch": 0.16507403129265988,
+      "grad_norm": 0.7281699776649475,
+      "learning_rate": 0.00019942462279232825,
+      "loss": 0.1985,
+      "step": 786
+    },
+    {
+      "epoch": 0.16528404914417724,
+      "grad_norm": 0.7734364867210388,
+      "learning_rate": 0.00019942089624631906,
+      "loss": 0.2617,
+      "step": 787
+    },
+    {
+      "epoch": 0.16549406699569463,
+      "grad_norm": 0.977069616317749,
+      "learning_rate": 0.00019941715770641408,
+      "loss": 0.2928,
+      "step": 788
+    },
+    {
+      "epoch": 0.16570408484721202,
+      "grad_norm": 0.7139049768447876,
+      "learning_rate": 0.00019941340717306424,
+      "loss": 0.3369,
+      "step": 789
+    },
+    {
+      "epoch": 0.1659141026987294,
+      "grad_norm": 0.5771147012710571,
+      "learning_rate": 0.00019940964464672205,
+      "loss": 0.2304,
+      "step": 790
+    },
+    {
+      "epoch": 0.16612412055024678,
+      "grad_norm": 0.5506160855293274,
+      "learning_rate": 0.00019940587012784138,
+      "loss": 0.2084,
+      "step": 791
+    },
+    {
+      "epoch": 0.16633413840176414,
+      "grad_norm": 0.48316794633865356,
+      "learning_rate": 0.0001994020836168776,
+      "loss": 0.1835,
+      "step": 792
+    },
+    {
+      "epoch": 0.16654415625328153,
+      "grad_norm": 0.5649861693382263,
+      "learning_rate": 0.00019939828511428753,
+      "loss": 0.2426,
+      "step": 793
+    },
+    {
+      "epoch": 0.1667541741047989,
+      "grad_norm": 0.5224729180335999,
+      "learning_rate": 0.00019939447462052936,
+      "loss": 0.1862,
+      "step": 794
+    },
+    {
+      "epoch": 0.1669641919563163,
+      "grad_norm": 0.5801841616630554,
+      "learning_rate": 0.00019939065213606282,
+      "loss": 0.2081,
+      "step": 795
+    },
+    {
+      "epoch": 0.16717420980783368,
+      "grad_norm": 0.4274038076400757,
+      "learning_rate": 0.00019938681766134902,
+      "loss": 0.14,
+      "step": 796
+    },
+    {
+      "epoch": 0.16738422765935104,
+      "grad_norm": 0.5294644236564636,
+      "learning_rate": 0.00019938297119685054,
+      "loss": 0.1851,
+      "step": 797
+    },
+    {
+      "epoch": 0.16759424551086843,
+      "grad_norm": 0.5110440850257874,
+      "learning_rate": 0.00019937911274303145,
+      "loss": 0.2346,
+      "step": 798
+    },
+    {
+      "epoch": 0.1678042633623858,
+      "grad_norm": 0.5785256028175354,
+      "learning_rate": 0.00019937524230035717,
+      "loss": 0.1554,
+      "step": 799
+    },
+    {
+      "epoch": 0.1680142812139032,
+      "grad_norm": 0.586320161819458,
+      "learning_rate": 0.00019937135986929465,
+      "loss": 0.2672,
+      "step": 800
+    },
+    {
+      "epoch": 0.16822429906542055,
+      "grad_norm": 0.502890408039093,
+      "learning_rate": 0.00019936746545031223,
+      "loss": 0.3023,
+      "step": 801
+    },
+    {
+      "epoch": 0.16843431691693794,
+      "grad_norm": 0.5421012043952942,
+      "learning_rate": 0.00019936355904387977,
+      "loss": 0.2331,
+      "step": 802
+    },
+    {
+      "epoch": 0.1686443347684553,
+      "grad_norm": 0.5681023001670837,
+      "learning_rate": 0.0001993596406504685,
+      "loss": 0.2064,
+      "step": 803
+    },
+    {
+      "epoch": 0.1688543526199727,
+      "grad_norm": 0.4179142713546753,
+      "learning_rate": 0.00019935571027055113,
+      "loss": 0.2302,
+      "step": 804
+    },
+    {
+      "epoch": 0.1690643704714901,
+      "grad_norm": 0.7016621232032776,
+      "learning_rate": 0.00019935176790460179,
+      "loss": 0.2442,
+      "step": 805
+    },
+    {
+      "epoch": 0.16927438832300745,
+      "grad_norm": 0.5401879549026489,
+      "learning_rate": 0.00019934781355309612,
+      "loss": 0.2798,
+      "step": 806
+    },
+    {
+      "epoch": 0.16948440617452484,
+      "grad_norm": 0.5687265396118164,
+      "learning_rate": 0.00019934384721651113,
+      "loss": 0.2097,
+      "step": 807
+    },
+    {
+      "epoch": 0.1696944240260422,
+      "grad_norm": 0.659520149230957,
+      "learning_rate": 0.00019933986889532533,
+      "loss": 0.1938,
+      "step": 808
+    },
+    {
+      "epoch": 0.1699044418775596,
+      "grad_norm": 0.8230718970298767,
+      "learning_rate": 0.00019933587859001866,
+      "loss": 0.4148,
+      "step": 809
+    },
+    {
+      "epoch": 0.17011445972907696,
+      "grad_norm": 0.7954551577568054,
+      "learning_rate": 0.00019933187630107244,
+      "loss": 0.4564,
+      "step": 810
+    },
+    {
+      "epoch": 0.17032447758059435,
+      "grad_norm": 0.6618001461029053,
+      "learning_rate": 0.0001993278620289696,
+      "loss": 0.2819,
+      "step": 811
+    },
+    {
+      "epoch": 0.17053449543211172,
+      "grad_norm": 0.9731025099754333,
+      "learning_rate": 0.00019932383577419432,
+      "loss": 0.3954,
+      "step": 812
+    },
+    {
+      "epoch": 0.1707445132836291,
+      "grad_norm": 0.7344256639480591,
+      "learning_rate": 0.00019931979753723232,
+      "loss": 0.2502,
+      "step": 813
+    },
+    {
+      "epoch": 0.1709545311351465,
+      "grad_norm": 0.6986575722694397,
+      "learning_rate": 0.00019931574731857086,
+      "loss": 0.2499,
+      "step": 814
+    },
+    {
+      "epoch": 0.17116454898666386,
+      "grad_norm": 0.5757253170013428,
+      "learning_rate": 0.00019931168511869846,
+      "loss": 0.2445,
+      "step": 815
+    },
+    {
+      "epoch": 0.17137456683818125,
+      "grad_norm": 0.5453664064407349,
+      "learning_rate": 0.0001993076109381052,
+      "loss": 0.2494,
+      "step": 816
+    },
+    {
+      "epoch": 0.17158458468969862,
+      "grad_norm": 0.7031118869781494,
+      "learning_rate": 0.00019930352477728257,
+      "loss": 0.2777,
+      "step": 817
+    },
+    {
+      "epoch": 0.171794602541216,
+      "grad_norm": 0.6201139092445374,
+      "learning_rate": 0.0001992994266367235,
+      "loss": 0.2145,
+      "step": 818
+    },
+    {
+      "epoch": 0.17200462039273337,
+      "grad_norm": 0.6421683430671692,
+      "learning_rate": 0.00019929531651692245,
+      "loss": 0.1951,
+      "step": 819
+    },
+    {
+      "epoch": 0.17221463824425076,
+      "grad_norm": 0.6390677094459534,
+      "learning_rate": 0.00019929119441837518,
+      "loss": 0.2011,
+      "step": 820
+    },
+    {
+      "epoch": 0.17242465609576815,
+      "grad_norm": 0.5171882510185242,
+      "learning_rate": 0.00019928706034157901,
+      "loss": 0.1459,
+      "step": 821
+    },
+    {
+      "epoch": 0.17263467394728552,
+      "grad_norm": 0.6737155914306641,
+      "learning_rate": 0.00019928291428703262,
+      "loss": 0.1507,
+      "step": 822
+    },
+    {
+      "epoch": 0.1728446917988029,
+      "grad_norm": 0.526128351688385,
+      "learning_rate": 0.00019927875625523625,
+      "loss": 0.1565,
+      "step": 823
+    },
+    {
+      "epoch": 0.17305470965032027,
+      "grad_norm": 0.7430817484855652,
+      "learning_rate": 0.00019927458624669145,
+      "loss": 0.2575,
+      "step": 824
+    },
+    {
+      "epoch": 0.17326472750183766,
+      "grad_norm": 0.4702281355857849,
+      "learning_rate": 0.0001992704042619013,
+      "loss": 0.1796,
+      "step": 825
+    },
+    {
+      "epoch": 0.17347474535335503,
+      "grad_norm": 0.5295049548149109,
+      "learning_rate": 0.00019926621030137034,
+      "loss": 0.1974,
+      "step": 826
+    },
+    {
+      "epoch": 0.17368476320487242,
+      "grad_norm": 0.667036771774292,
+      "learning_rate": 0.00019926200436560447,
+      "loss": 0.2125,
+      "step": 827
+    },
+    {
+      "epoch": 0.17389478105638978,
+      "grad_norm": 0.7351561188697815,
+      "learning_rate": 0.0001992577864551111,
+      "loss": 0.2271,
+      "step": 828
+    },
+    {
+      "epoch": 0.17410479890790717,
+      "grad_norm": 0.8084509372711182,
+      "learning_rate": 0.0001992535565703991,
+      "loss": 0.2301,
+      "step": 829
+    },
+    {
+      "epoch": 0.17431481675942456,
+      "grad_norm": 0.7022576928138733,
+      "learning_rate": 0.0001992493147119787,
+      "loss": 0.2662,
+      "step": 830
+    },
+    {
+      "epoch": 0.17452483461094193,
+      "grad_norm": 0.7098193168640137,
+      "learning_rate": 0.00019924506088036165,
+      "loss": 0.1979,
+      "step": 831
+    },
+    {
+      "epoch": 0.17473485246245932,
+      "grad_norm": 0.590630292892456,
+      "learning_rate": 0.00019924079507606114,
+      "loss": 0.1872,
+      "step": 832
+    },
+    {
+      "epoch": 0.17494487031397668,
+      "grad_norm": 0.7556937336921692,
+      "learning_rate": 0.00019923651729959177,
+      "loss": 0.1651,
+      "step": 833
+    },
+    {
+      "epoch": 0.17515488816549407,
+      "grad_norm": 0.6680110096931458,
+      "learning_rate": 0.00019923222755146956,
+      "loss": 0.1837,
+      "step": 834
+    },
+    {
+      "epoch": 0.17536490601701143,
+      "grad_norm": 0.7310810685157776,
+      "learning_rate": 0.0001992279258322121,
+      "loss": 0.3201,
+      "step": 835
+    },
+    {
+      "epoch": 0.17557492386852883,
+      "grad_norm": 0.5796787142753601,
+      "learning_rate": 0.0001992236121423383,
+      "loss": 0.178,
+      "step": 836
+    },
+    {
+      "epoch": 0.17578494172004622,
+      "grad_norm": 0.45521265268325806,
+      "learning_rate": 0.00019921928648236853,
+      "loss": 0.1723,
+      "step": 837
+    },
+    {
+      "epoch": 0.17599495957156358,
+      "grad_norm": 0.43274396657943726,
+      "learning_rate": 0.00019921494885282467,
+      "loss": 0.1597,
+      "step": 838
+    },
+    {
+      "epoch": 0.17620497742308097,
+      "grad_norm": 0.40754616260528564,
+      "learning_rate": 0.00019921059925422996,
+      "loss": 0.1299,
+      "step": 839
+    },
+    {
+      "epoch": 0.17641499527459834,
+      "grad_norm": 0.6628978252410889,
+      "learning_rate": 0.00019920623768710912,
+      "loss": 0.1931,
+      "step": 840
+    },
+    {
+      "epoch": 0.17662501312611573,
+      "grad_norm": 0.644637405872345,
+      "learning_rate": 0.0001992018641519884,
+      "loss": 0.199,
+      "step": 841
+    },
+    {
+      "epoch": 0.1768350309776331,
+      "grad_norm": 0.5001009106636047,
+      "learning_rate": 0.0001991974786493953,
+      "loss": 0.2109,
+      "step": 842
+    },
+    {
+      "epoch": 0.17704504882915048,
+      "grad_norm": 0.49435755610466003,
+      "learning_rate": 0.00019919308117985894,
+      "loss": 0.1832,
+      "step": 843
+    },
+    {
+      "epoch": 0.17725506668066784,
+      "grad_norm": 0.7176212668418884,
+      "learning_rate": 0.0001991886717439098,
+      "loss": 0.2491,
+      "step": 844
+    },
+    {
+      "epoch": 0.17746508453218524,
+      "grad_norm": 0.5122328996658325,
+      "learning_rate": 0.00019918425034207984,
+      "loss": 0.2618,
+      "step": 845
+    },
+    {
+      "epoch": 0.17767510238370263,
+      "grad_norm": 0.6069608926773071,
+      "learning_rate": 0.00019917981697490245,
+      "loss": 0.2119,
+      "step": 846
+    },
+    {
+      "epoch": 0.17788512023522,
+      "grad_norm": 0.8389537334442139,
+      "learning_rate": 0.00019917537164291244,
+      "loss": 0.2619,
+      "step": 847
+    },
+    {
+      "epoch": 0.17809513808673738,
+      "grad_norm": 0.5856572389602661,
+      "learning_rate": 0.00019917091434664612,
+      "loss": 0.1928,
+      "step": 848
+    },
+    {
+      "epoch": 0.17830515593825474,
+      "grad_norm": 0.5682632327079773,
+      "learning_rate": 0.00019916644508664115,
+      "loss": 0.2963,
+      "step": 849
+    },
+    {
+      "epoch": 0.17851517378977214,
+      "grad_norm": 0.45547807216644287,
+      "learning_rate": 0.00019916196386343674,
+      "loss": 0.1277,
+      "step": 850
+    },
+    {
+      "epoch": 0.1787251916412895,
+      "grad_norm": 0.648499071598053,
+      "learning_rate": 0.00019915747067757349,
+      "loss": 0.3407,
+      "step": 851
+    },
+    {
+      "epoch": 0.1789352094928069,
+      "grad_norm": 0.48874902725219727,
+      "learning_rate": 0.0001991529655295934,
+      "loss": 0.185,
+      "step": 852
+    },
+    {
+      "epoch": 0.17914522734432425,
+      "grad_norm": 0.7059923410415649,
+      "learning_rate": 0.00019914844842004002,
+      "loss": 0.2352,
+      "step": 853
+    },
+    {
+      "epoch": 0.17935524519584164,
+      "grad_norm": 0.6532195210456848,
+      "learning_rate": 0.00019914391934945823,
+      "loss": 0.292,
+      "step": 854
+    },
+    {
+      "epoch": 0.17956526304735904,
+      "grad_norm": 0.6922776103019714,
+      "learning_rate": 0.0001991393783183945,
+      "loss": 0.4635,
+      "step": 855
+    },
+    {
+      "epoch": 0.1797752808988764,
+      "grad_norm": 0.6560776829719543,
+      "learning_rate": 0.00019913482532739652,
+      "loss": 0.2684,
+      "step": 856
+    },
+    {
+      "epoch": 0.1799852987503938,
+      "grad_norm": 0.5644369125366211,
+      "learning_rate": 0.00019913026037701362,
+      "loss": 0.2018,
+      "step": 857
+    },
+    {
+      "epoch": 0.18019531660191115,
+      "grad_norm": 0.6108200550079346,
+      "learning_rate": 0.00019912568346779652,
+      "loss": 0.1746,
+      "step": 858
+    },
+    {
+      "epoch": 0.18040533445342855,
+      "grad_norm": 0.6762723326683044,
+      "learning_rate": 0.00019912109460029734,
+      "loss": 0.4662,
+      "step": 859
+    },
+    {
+      "epoch": 0.1806153523049459,
+      "grad_norm": 0.5877822041511536,
+      "learning_rate": 0.00019911649377506966,
+      "loss": 0.2546,
+      "step": 860
+    },
+    {
+      "epoch": 0.1808253701564633,
+      "grad_norm": 0.5038641095161438,
+      "learning_rate": 0.00019911188099266855,
+      "loss": 0.3073,
+      "step": 861
+    },
+    {
+      "epoch": 0.1810353880079807,
+      "grad_norm": 0.6587141156196594,
+      "learning_rate": 0.00019910725625365045,
+      "loss": 0.2991,
+      "step": 862
+    },
+    {
+      "epoch": 0.18124540585949805,
+      "grad_norm": 0.9041693210601807,
+      "learning_rate": 0.0001991026195585733,
+      "loss": 0.3111,
+      "step": 863
+    },
+    {
+      "epoch": 0.18145542371101545,
+      "grad_norm": 0.6296244263648987,
+      "learning_rate": 0.00019909797090799644,
+      "loss": 0.2272,
+      "step": 864
+    },
+    {
+      "epoch": 0.1816654415625328,
+      "grad_norm": 0.6931461691856384,
+      "learning_rate": 0.00019909331030248072,
+      "loss": 0.3503,
+      "step": 865
+    },
+    {
+      "epoch": 0.1818754594140502,
+      "grad_norm": 0.7656722664833069,
+      "learning_rate": 0.00019908863774258827,
+      "loss": 0.3773,
+      "step": 866
+    },
+    {
+      "epoch": 0.18208547726556756,
+      "grad_norm": 0.6011465787887573,
+      "learning_rate": 0.00019908395322888294,
+      "loss": 0.2101,
+      "step": 867
+    },
+    {
+      "epoch": 0.18229549511708495,
+      "grad_norm": 0.6926429867744446,
+      "learning_rate": 0.0001990792567619297,
+      "loss": 0.2027,
+      "step": 868
+    },
+    {
+      "epoch": 0.18250551296860232,
+      "grad_norm": 0.5799981355667114,
+      "learning_rate": 0.00019907454834229525,
+      "loss": 0.2129,
+      "step": 869
+    },
+    {
+      "epoch": 0.1827155308201197,
+      "grad_norm": 0.5605289936065674,
+      "learning_rate": 0.0001990698279705475,
+      "loss": 0.2104,
+      "step": 870
+    },
+    {
+      "epoch": 0.1829255486716371,
+      "grad_norm": 0.9048646092414856,
+      "learning_rate": 0.00019906509564725596,
+      "loss": 0.4131,
+      "step": 871
+    },
+    {
+      "epoch": 0.18313556652315446,
+      "grad_norm": 0.6802535057067871,
+      "learning_rate": 0.0001990603513729915,
+      "loss": 0.2715,
+      "step": 872
+    },
+    {
+      "epoch": 0.18334558437467185,
+      "grad_norm": 0.4949076771736145,
+      "learning_rate": 0.0001990555951483265,
+      "loss": 0.1725,
+      "step": 873
+    },
+    {
+      "epoch": 0.18355560222618922,
+      "grad_norm": 0.6589632034301758,
+      "learning_rate": 0.0001990508269738347,
+      "loss": 0.1424,
+      "step": 874
+    },
+    {
+      "epoch": 0.1837656200777066,
+      "grad_norm": 0.5366025567054749,
+      "learning_rate": 0.00019904604685009133,
+      "loss": 0.1578,
+      "step": 875
+    },
+    {
+      "epoch": 0.18397563792922397,
+      "grad_norm": 0.584173858165741,
+      "learning_rate": 0.00019904125477767303,
+      "loss": 0.2381,
+      "step": 876
+    },
+    {
+      "epoch": 0.18418565578074136,
+      "grad_norm": 0.6884530186653137,
+      "learning_rate": 0.00019903645075715798,
+      "loss": 0.2043,
+      "step": 877
+    },
+    {
+      "epoch": 0.18439567363225876,
+      "grad_norm": 0.6070178747177124,
+      "learning_rate": 0.00019903163478912563,
+      "loss": 0.1792,
+      "step": 878
+    },
+    {
+      "epoch": 0.18460569148377612,
+      "grad_norm": 0.6375721096992493,
+      "learning_rate": 0.00019902680687415705,
+      "loss": 0.218,
+      "step": 879
+    },
+    {
+      "epoch": 0.1848157093352935,
+      "grad_norm": 0.564017653465271,
+      "learning_rate": 0.0001990219670128346,
+      "loss": 0.1885,
+      "step": 880
+    },
+    {
+      "epoch": 0.18502572718681087,
+      "grad_norm": 0.6779912710189819,
+      "learning_rate": 0.0001990171152057422,
+      "loss": 0.1949,
+      "step": 881
+    },
+    {
+      "epoch": 0.18523574503832826,
+      "grad_norm": 0.6086128950119019,
+      "learning_rate": 0.0001990122514534651,
+      "loss": 0.1818,
+      "step": 882
+    },
+    {
+      "epoch": 0.18544576288984563,
+      "grad_norm": 0.4768702983856201,
+      "learning_rate": 0.0001990073757565901,
+      "loss": 0.1459,
+      "step": 883
+    },
+    {
+      "epoch": 0.18565578074136302,
+      "grad_norm": 0.5171164870262146,
+      "learning_rate": 0.0001990024881157054,
+      "loss": 0.1624,
+      "step": 884
+    },
+    {
+      "epoch": 0.18586579859288038,
+      "grad_norm": 0.6542419195175171,
+      "learning_rate": 0.00019899758853140064,
+      "loss": 0.2035,
+      "step": 885
+    },
+    {
+      "epoch": 0.18607581644439777,
+      "grad_norm": 0.7479321956634521,
+      "learning_rate": 0.0001989926770042668,
+      "loss": 0.3654,
+      "step": 886
+    },
+    {
+      "epoch": 0.18628583429591516,
+      "grad_norm": 0.7446826696395874,
+      "learning_rate": 0.0001989877535348965,
+      "loss": 0.236,
+      "step": 887
+    },
+    {
+      "epoch": 0.18649585214743253,
+      "grad_norm": 0.5898016691207886,
+      "learning_rate": 0.00019898281812388366,
+      "loss": 0.2013,
+      "step": 888
+    },
+    {
+      "epoch": 0.18670586999894992,
+      "grad_norm": 0.6942265629768372,
+      "learning_rate": 0.00019897787077182368,
+      "loss": 0.1912,
+      "step": 889
+    },
+    {
+      "epoch": 0.18691588785046728,
+      "grad_norm": 0.7095215320587158,
+      "learning_rate": 0.0001989729114793134,
+      "loss": 0.2031,
+      "step": 890
+    },
+    {
+      "epoch": 0.18712590570198467,
+      "grad_norm": 0.49590814113616943,
+      "learning_rate": 0.00019896794024695108,
+      "loss": 0.1848,
+      "step": 891
+    },
+    {
+      "epoch": 0.18733592355350204,
+      "grad_norm": 0.3615363836288452,
+      "learning_rate": 0.00019896295707533642,
+      "loss": 0.1357,
+      "step": 892
+    },
+    {
+      "epoch": 0.18754594140501943,
+      "grad_norm": 0.540952205657959,
+      "learning_rate": 0.00019895796196507063,
+      "loss": 0.1622,
+      "step": 893
+    },
+    {
+      "epoch": 0.1877559592565368,
+      "grad_norm": 0.6152564883232117,
+      "learning_rate": 0.00019895295491675628,
+      "loss": 0.2229,
+      "step": 894
+    },
+    {
+      "epoch": 0.18796597710805418,
+      "grad_norm": 0.6287555694580078,
+      "learning_rate": 0.0001989479359309974,
+      "loss": 0.1855,
+      "step": 895
+    },
+    {
+      "epoch": 0.18817599495957157,
+      "grad_norm": 0.6615211963653564,
+      "learning_rate": 0.00019894290500839946,
+      "loss": 0.2001,
+      "step": 896
+    },
+    {
+      "epoch": 0.18838601281108894,
+      "grad_norm": 0.6587905883789062,
+      "learning_rate": 0.00019893786214956945,
+      "loss": 0.2368,
+      "step": 897
+    },
+    {
+      "epoch": 0.18859603066260633,
+      "grad_norm": 0.3502175807952881,
+      "learning_rate": 0.00019893280735511565,
+      "loss": 0.1203,
+      "step": 898
+    },
+    {
+      "epoch": 0.1888060485141237,
+      "grad_norm": 0.6989165544509888,
+      "learning_rate": 0.00019892774062564786,
+      "loss": 0.2108,
+      "step": 899
+    },
+    {
+      "epoch": 0.18901606636564108,
+      "grad_norm": 0.5993213057518005,
+      "learning_rate": 0.00019892266196177736,
+      "loss": 0.2667,
+      "step": 900
+    },
+    {
+      "epoch": 0.18922608421715845,
+      "grad_norm": 0.6625016331672668,
+      "learning_rate": 0.0001989175713641168,
+      "loss": 0.3081,
+      "step": 901
+    },
+    {
+      "epoch": 0.18943610206867584,
+      "grad_norm": 0.6831103563308716,
+      "learning_rate": 0.0001989124688332803,
+      "loss": 0.2826,
+      "step": 902
+    },
+    {
+      "epoch": 0.18964611992019323,
+      "grad_norm": 0.6341603994369507,
+      "learning_rate": 0.00019890735436988347,
+      "loss": 0.2738,
+      "step": 903
+    },
+    {
+      "epoch": 0.1898561377717106,
+      "grad_norm": 0.6546643376350403,
+      "learning_rate": 0.0001989022279745432,
+      "loss": 0.3065,
+      "step": 904
+    },
+    {
+      "epoch": 0.19006615562322798,
+      "grad_norm": 0.7356497645378113,
+      "learning_rate": 0.000198897089647878,
+      "loss": 0.2955,
+      "step": 905
+    },
+    {
+      "epoch": 0.19027617347474535,
+      "grad_norm": 0.71455317735672,
+      "learning_rate": 0.00019889193939050777,
+      "loss": 0.2069,
+      "step": 906
+    },
+    {
+      "epoch": 0.19048619132626274,
+      "grad_norm": 0.7142229676246643,
+      "learning_rate": 0.00019888677720305374,
+      "loss": 0.3386,
+      "step": 907
+    },
+    {
+      "epoch": 0.1906962091777801,
+      "grad_norm": 0.6420140862464905,
+      "learning_rate": 0.00019888160308613874,
+      "loss": 0.2952,
+      "step": 908
+    },
+    {
+      "epoch": 0.1909062270292975,
+      "grad_norm": 0.757895290851593,
+      "learning_rate": 0.00019887641704038688,
+      "loss": 0.299,
+      "step": 909
+    },
+    {
+      "epoch": 0.19111624488081486,
+      "grad_norm": 0.5329816937446594,
+      "learning_rate": 0.00019887121906642387,
+      "loss": 0.2005,
+      "step": 910
+    },
+    {
+      "epoch": 0.19132626273233225,
+      "grad_norm": 0.5069072842597961,
+      "learning_rate": 0.00019886600916487677,
+      "loss": 0.1971,
+      "step": 911
+    },
+    {
+      "epoch": 0.19153628058384964,
+      "grad_norm": 0.7712031602859497,
+      "learning_rate": 0.00019886078733637408,
+      "loss": 0.2952,
+      "step": 912
+    },
+    {
+      "epoch": 0.191746298435367,
+      "grad_norm": 0.6340819001197815,
+      "learning_rate": 0.00019885555358154574,
+      "loss": 0.2403,
+      "step": 913
+    },
+    {
+      "epoch": 0.1919563162868844,
+      "grad_norm": 0.707127034664154,
+      "learning_rate": 0.0001988503079010231,
+      "loss": 0.262,
+      "step": 914
+    },
+    {
+      "epoch": 0.19216633413840176,
+      "grad_norm": 0.5502609014511108,
+      "learning_rate": 0.00019884505029543908,
+      "loss": 0.1767,
+      "step": 915
+    },
+    {
+      "epoch": 0.19237635198991915,
+      "grad_norm": 0.6637031435966492,
+      "learning_rate": 0.00019883978076542787,
+      "loss": 0.317,
+      "step": 916
+    },
+    {
+      "epoch": 0.1925863698414365,
+      "grad_norm": 0.5921664237976074,
+      "learning_rate": 0.00019883449931162517,
+      "loss": 0.2848,
+      "step": 917
+    },
+    {
+      "epoch": 0.1927963876929539,
+      "grad_norm": 0.8460182547569275,
+      "learning_rate": 0.0001988292059346682,
+      "loss": 0.2741,
+      "step": 918
+    },
+    {
+      "epoch": 0.1930064055444713,
+      "grad_norm": 0.7577118277549744,
+      "learning_rate": 0.00019882390063519543,
+      "loss": 0.2589,
+      "step": 919
+    },
+    {
+      "epoch": 0.19321642339598866,
+      "grad_norm": 0.5957863330841064,
+      "learning_rate": 0.00019881858341384696,
+      "loss": 0.1834,
+      "step": 920
+    },
+    {
+      "epoch": 0.19342644124750605,
+      "grad_norm": 0.6584639549255371,
+      "learning_rate": 0.00019881325427126422,
+      "loss": 0.232,
+      "step": 921
+    },
+    {
+      "epoch": 0.1936364590990234,
+      "grad_norm": 0.6941714882850647,
+      "learning_rate": 0.0001988079132080901,
+      "loss": 0.2514,
+      "step": 922
+    },
+    {
+      "epoch": 0.1938464769505408,
+      "grad_norm": 0.829231321811676,
+      "learning_rate": 0.00019880256022496897,
+      "loss": 0.2023,
+      "step": 923
+    },
+    {
+      "epoch": 0.19405649480205817,
+      "grad_norm": 0.6720934510231018,
+      "learning_rate": 0.00019879719532254654,
+      "loss": 0.2535,
+      "step": 924
+    },
+    {
+      "epoch": 0.19426651265357556,
+      "grad_norm": 0.7159935832023621,
+      "learning_rate": 0.00019879181850147005,
+      "loss": 0.3129,
+      "step": 925
+    },
+    {
+      "epoch": 0.19447653050509292,
+      "grad_norm": 0.6411039233207703,
+      "learning_rate": 0.00019878642976238817,
+      "loss": 0.1729,
+      "step": 926
+    },
+    {
+      "epoch": 0.1946865483566103,
+      "grad_norm": 0.7253606915473938,
+      "learning_rate": 0.00019878102910595095,
+      "loss": 0.2599,
+      "step": 927
+    },
+    {
+      "epoch": 0.1948965662081277,
+      "grad_norm": 0.6732550859451294,
+      "learning_rate": 0.0001987756165328099,
+      "loss": 0.1881,
+      "step": 928
+    },
+    {
+      "epoch": 0.19510658405964507,
+      "grad_norm": 0.6675817966461182,
+      "learning_rate": 0.00019877019204361804,
+      "loss": 0.2417,
+      "step": 929
+    },
+    {
+      "epoch": 0.19531660191116246,
+      "grad_norm": 0.5525332093238831,
+      "learning_rate": 0.0001987647556390297,
+      "loss": 0.2445,
+      "step": 930
+    },
+    {
+      "epoch": 0.19552661976267982,
+      "grad_norm": 0.7800937294960022,
+      "learning_rate": 0.00019875930731970076,
+      "loss": 0.2401,
+      "step": 931
+    },
+    {
+      "epoch": 0.1957366376141972,
+      "grad_norm": 0.5669112205505371,
+      "learning_rate": 0.00019875384708628848,
+      "loss": 0.1925,
+      "step": 932
+    },
+    {
+      "epoch": 0.19594665546571458,
+      "grad_norm": 0.6367275714874268,
+      "learning_rate": 0.00019874837493945156,
+      "loss": 0.205,
+      "step": 933
+    },
+    {
+      "epoch": 0.19615667331723197,
+      "grad_norm": 0.6173298954963684,
+      "learning_rate": 0.00019874289087985013,
+      "loss": 0.2426,
+      "step": 934
+    },
+    {
+      "epoch": 0.19636669116874933,
+      "grad_norm": 0.7045214176177979,
+      "learning_rate": 0.00019873739490814583,
+      "loss": 0.1647,
+      "step": 935
+    },
+    {
+      "epoch": 0.19657670902026672,
+      "grad_norm": 0.5824179649353027,
+      "learning_rate": 0.00019873188702500163,
+      "loss": 0.1527,
+      "step": 936
+    },
+    {
+      "epoch": 0.1967867268717841,
+      "grad_norm": 0.585749626159668,
+      "learning_rate": 0.000198726367231082,
+      "loss": 0.2119,
+      "step": 937
+    },
+    {
+      "epoch": 0.19699674472330148,
+      "grad_norm": 0.679140031337738,
+      "learning_rate": 0.00019872083552705284,
+      "loss": 0.2037,
+      "step": 938
+    },
+    {
+      "epoch": 0.19720676257481887,
+      "grad_norm": 0.3865984380245209,
+      "learning_rate": 0.0001987152919135815,
+      "loss": 0.1508,
+      "step": 939
+    },
+    {
+      "epoch": 0.19741678042633623,
+      "grad_norm": 0.5994648933410645,
+      "learning_rate": 0.0001987097363913367,
+      "loss": 0.1536,
+      "step": 940
+    },
+    {
+      "epoch": 0.19762679827785362,
+      "grad_norm": 0.8374373912811279,
+      "learning_rate": 0.0001987041689609887,
+      "loss": 0.3113,
+      "step": 941
+    },
+    {
+      "epoch": 0.19783681612937098,
+      "grad_norm": 0.4448517858982086,
+      "learning_rate": 0.0001986985896232091,
+      "loss": 0.1523,
+      "step": 942
+    },
+    {
+      "epoch": 0.19804683398088838,
+      "grad_norm": 0.5031003952026367,
+      "learning_rate": 0.00019869299837867098,
+      "loss": 0.2351,
+      "step": 943
+    },
+    {
+      "epoch": 0.19825685183240577,
+      "grad_norm": 0.8319448232650757,
+      "learning_rate": 0.0001986873952280489,
+      "loss": 0.2768,
+      "step": 944
+    },
+    {
+      "epoch": 0.19846686968392313,
+      "grad_norm": 0.4768364429473877,
+      "learning_rate": 0.00019868178017201874,
+      "loss": 0.2041,
+      "step": 945
+    },
+    {
+      "epoch": 0.19867688753544052,
+      "grad_norm": 0.36797624826431274,
+      "learning_rate": 0.00019867615321125795,
+      "loss": 0.1703,
+      "step": 946
+    },
+    {
+      "epoch": 0.19888690538695789,
+      "grad_norm": 0.629489541053772,
+      "learning_rate": 0.0001986705143464453,
+      "loss": 0.1989,
+      "step": 947
+    },
+    {
+      "epoch": 0.19909692323847528,
+      "grad_norm": 0.757764458656311,
+      "learning_rate": 0.00019866486357826107,
+      "loss": 0.1972,
+      "step": 948
+    },
+    {
+      "epoch": 0.19930694108999264,
+      "grad_norm": 0.884556770324707,
+      "learning_rate": 0.00019865920090738698,
+      "loss": 0.2592,
+      "step": 949
+    },
+    {
+      "epoch": 0.19951695894151003,
+      "grad_norm": 0.5489534139633179,
+      "learning_rate": 0.00019865352633450614,
+      "loss": 0.2075,
+      "step": 950
+    },
+    {
+      "epoch": 0.1997269767930274,
+      "grad_norm": 0.6485860347747803,
+      "learning_rate": 0.00019864783986030314,
+      "loss": 0.3648,
+      "step": 951
+    },
+    {
+      "epoch": 0.19993699464454479,
+      "grad_norm": 0.8612170219421387,
+      "learning_rate": 0.00019864214148546393,
+      "loss": 0.2175,
+      "step": 952
+    },
+    {
+      "epoch": 0.20014701249606218,
+      "grad_norm": 0.6336376070976257,
+      "learning_rate": 0.00019863643121067597,
+      "loss": 0.2935,
+      "step": 953
+    },
+    {
+      "epoch": 0.20035703034757954,
+      "grad_norm": 0.7330135703086853,
+      "learning_rate": 0.00019863070903662817,
+      "loss": 0.4322,
+      "step": 954
+    },
+    {
+      "epoch": 0.20056704819909693,
+      "grad_norm": 0.6464625000953674,
+      "learning_rate": 0.0001986249749640108,
+      "loss": 0.242,
+      "step": 955
+    },
+    {
+      "epoch": 0.2007770660506143,
+      "grad_norm": 0.6884174942970276,
+      "learning_rate": 0.00019861922899351561,
+      "loss": 0.3043,
+      "step": 956
+    },
+    {
+      "epoch": 0.20098708390213169,
+      "grad_norm": 0.4948609471321106,
+      "learning_rate": 0.0001986134711258358,
+      "loss": 0.1735,
+      "step": 957
+    },
+    {
+      "epoch": 0.20119710175364905,
+      "grad_norm": 0.9207262396812439,
+      "learning_rate": 0.00019860770136166596,
+      "loss": 0.2473,
+      "step": 958
+    },
+    {
+      "epoch": 0.20140711960516644,
+      "grad_norm": 0.6444927453994751,
+      "learning_rate": 0.00019860191970170216,
+      "loss": 0.2995,
+      "step": 959
+    },
+    {
+      "epoch": 0.20161713745668383,
+      "grad_norm": 0.8041002750396729,
+      "learning_rate": 0.00019859612614664185,
+      "loss": 0.3079,
+      "step": 960
+    },
+    {
+      "epoch": 0.2018271553082012,
+      "grad_norm": 0.520293653011322,
+      "learning_rate": 0.000198590320697184,
+      "loss": 0.2038,
+      "step": 961
+    },
+    {
+      "epoch": 0.20203717315971859,
+      "grad_norm": 0.6968462467193604,
+      "learning_rate": 0.00019858450335402897,
+      "loss": 0.2791,
+      "step": 962
+    },
+    {
+      "epoch": 0.20224719101123595,
+      "grad_norm": 0.5260444283485413,
+      "learning_rate": 0.00019857867411787847,
+      "loss": 0.2164,
+      "step": 963
+    },
+    {
+      "epoch": 0.20245720886275334,
+      "grad_norm": 0.7742235660552979,
+      "learning_rate": 0.0001985728329894358,
+      "loss": 0.3005,
+      "step": 964
+    },
+    {
+      "epoch": 0.2026672267142707,
+      "grad_norm": 0.4388875961303711,
+      "learning_rate": 0.0001985669799694056,
+      "loss": 0.2046,
+      "step": 965
+    },
+    {
+      "epoch": 0.2028772445657881,
+      "grad_norm": 0.8159006237983704,
+      "learning_rate": 0.00019856111505849394,
+      "loss": 0.2219,
+      "step": 966
+    },
+    {
+      "epoch": 0.20308726241730546,
+      "grad_norm": 0.5616422295570374,
+      "learning_rate": 0.0001985552382574084,
+      "loss": 0.3792,
+      "step": 967
+    },
+    {
+      "epoch": 0.20329728026882285,
+      "grad_norm": 0.5863935351371765,
+      "learning_rate": 0.00019854934956685792,
+      "loss": 0.2077,
+      "step": 968
+    },
+    {
+      "epoch": 0.20350729812034024,
+      "grad_norm": 0.5828328728675842,
+      "learning_rate": 0.00019854344898755287,
+      "loss": 0.273,
+      "step": 969
+    },
+    {
+      "epoch": 0.2037173159718576,
+      "grad_norm": 0.5963171124458313,
+      "learning_rate": 0.00019853753652020507,
+      "loss": 0.2407,
+      "step": 970
+    },
+    {
+      "epoch": 0.203927333823375,
+      "grad_norm": 0.5114577412605286,
+      "learning_rate": 0.00019853161216552788,
+      "loss": 0.1663,
+      "step": 971
+    },
+    {
+      "epoch": 0.20413735167489236,
+      "grad_norm": 0.5106688737869263,
+      "learning_rate": 0.0001985256759242359,
+      "loss": 0.1823,
+      "step": 972
+    },
+    {
+      "epoch": 0.20434736952640975,
+      "grad_norm": 0.5732094645500183,
+      "learning_rate": 0.00019851972779704534,
+      "loss": 0.2206,
+      "step": 973
+    },
+    {
+      "epoch": 0.2045573873779271,
+      "grad_norm": 0.5627723932266235,
+      "learning_rate": 0.00019851376778467366,
+      "loss": 0.1715,
+      "step": 974
+    },
+    {
+      "epoch": 0.2047674052294445,
+      "grad_norm": 0.7939655184745789,
+      "learning_rate": 0.00019850779588783998,
+      "loss": 0.1669,
+      "step": 975
+    },
+    {
+      "epoch": 0.20497742308096187,
+      "grad_norm": 0.5675683617591858,
+      "learning_rate": 0.00019850181210726467,
+      "loss": 0.1696,
+      "step": 976
+    },
+    {
+      "epoch": 0.20518744093247926,
+      "grad_norm": 0.9706809520721436,
+      "learning_rate": 0.00019849581644366965,
+      "loss": 0.376,
+      "step": 977
+    },
+    {
+      "epoch": 0.20539745878399665,
+      "grad_norm": 0.6787039041519165,
+      "learning_rate": 0.00019848980889777815,
+      "loss": 0.1528,
+      "step": 978
+    },
+    {
+      "epoch": 0.205607476635514,
+      "grad_norm": 0.8098447918891907,
+      "learning_rate": 0.00019848378947031492,
+      "loss": 0.1659,
+      "step": 979
+    },
+    {
+      "epoch": 0.2058174944870314,
+      "grad_norm": 0.46255457401275635,
+      "learning_rate": 0.0001984777581620062,
+      "loss": 0.1445,
+      "step": 980
+    },
+    {
+      "epoch": 0.20602751233854877,
+      "grad_norm": 0.7909157872200012,
+      "learning_rate": 0.00019847171497357953,
+      "loss": 0.2294,
+      "step": 981
+    },
+    {
+      "epoch": 0.20623753019006616,
+      "grad_norm": 0.6739736795425415,
+      "learning_rate": 0.00019846565990576392,
+      "loss": 0.2624,
+      "step": 982
+    },
+    {
+      "epoch": 0.20644754804158352,
+      "grad_norm": 0.7638704180717468,
+      "learning_rate": 0.00019845959295928994,
+      "loss": 0.2863,
+      "step": 983
+    },
+    {
+      "epoch": 0.20665756589310091,
+      "grad_norm": 0.48239898681640625,
+      "learning_rate": 0.00019845351413488939,
+      "loss": 0.1564,
+      "step": 984
+    },
+    {
+      "epoch": 0.2068675837446183,
+      "grad_norm": 0.6511039137840271,
+      "learning_rate": 0.00019844742343329568,
+      "loss": 0.1856,
+      "step": 985
+    },
+    {
+      "epoch": 0.20707760159613567,
+      "grad_norm": 0.48949161171913147,
+      "learning_rate": 0.0001984413208552435,
+      "loss": 0.1862,
+      "step": 986
+    },
+    {
+      "epoch": 0.20728761944765306,
+      "grad_norm": 0.6529719829559326,
+      "learning_rate": 0.0001984352064014691,
+      "loss": 0.2008,
+      "step": 987
+    },
+    {
+      "epoch": 0.20749763729917042,
+      "grad_norm": 0.5295738577842712,
+      "learning_rate": 0.00019842908007271012,
+      "loss": 0.2141,
+      "step": 988
+    },
+    {
+      "epoch": 0.20770765515068781,
+      "grad_norm": 0.5440765023231506,
+      "learning_rate": 0.00019842294186970562,
+      "loss": 0.264,
+      "step": 989
+    },
+    {
+      "epoch": 0.20791767300220518,
+      "grad_norm": 0.5315092206001282,
+      "learning_rate": 0.00019841679179319606,
+      "loss": 0.2116,
+      "step": 990
+    },
+    {
+      "epoch": 0.20812769085372257,
+      "grad_norm": 0.4537929892539978,
+      "learning_rate": 0.0001984106298439234,
+      "loss": 0.1269,
+      "step": 991
+    },
+    {
+      "epoch": 0.20833770870523993,
+      "grad_norm": 0.5806244015693665,
+      "learning_rate": 0.000198404456022631,
+      "loss": 0.1808,
+      "step": 992
+    },
+    {
+      "epoch": 0.20854772655675732,
+      "grad_norm": 0.5772647261619568,
+      "learning_rate": 0.00019839827033006372,
+      "loss": 0.1637,
+      "step": 993
+    },
+    {
+      "epoch": 0.20875774440827471,
+      "grad_norm": 0.4130006432533264,
+      "learning_rate": 0.00019839207276696764,
+      "loss": 0.1398,
+      "step": 994
+    },
+    {
+      "epoch": 0.20896776225979208,
+      "grad_norm": 0.47043028473854065,
+      "learning_rate": 0.00019838586333409056,
+      "loss": 0.1209,
+      "step": 995
+    },
+    {
+      "epoch": 0.20917778011130947,
+      "grad_norm": 0.713445782661438,
+      "learning_rate": 0.00019837964203218148,
+      "loss": 0.2175,
+      "step": 996
+    },
+    {
+      "epoch": 0.20938779796282683,
+      "grad_norm": 0.7947505116462708,
+      "learning_rate": 0.00019837340886199096,
+      "loss": 0.3172,
+      "step": 997
+    },
+    {
+      "epoch": 0.20959781581434422,
+      "grad_norm": 0.7544185519218445,
+      "learning_rate": 0.00019836716382427096,
+      "loss": 0.2506,
+      "step": 998
+    },
+    {
+      "epoch": 0.2098078336658616,
+      "grad_norm": 0.8411846160888672,
+      "learning_rate": 0.00019836090691977484,
+      "loss": 0.2619,
+      "step": 999
+    },
+    {
+      "epoch": 0.21001785151737898,
+      "grad_norm": 0.7358798384666443,
+      "learning_rate": 0.00019835463814925745,
+      "loss": 0.264,
+      "step": 1000
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 9522,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.969459342303437e+16,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}