diff --git "a/checkpoint-1572/trainer_state.json" "b/checkpoint-1572/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/checkpoint-1572/trainer_state.json"
@@ -0,0 +1,10534 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.8650519031141869,
+  "eval_steps": 500,
+  "global_step": 1500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0005767012687427913,
+      "grad_norm": 0.5134636163711548,
+      "learning_rate": 0.0,
+      "loss": 1.6129628419876099,
+      "step": 1
+    },
+    {
+      "epoch": 0.0011534025374855825,
+      "grad_norm": 0.45678019523620605,
+      "learning_rate": 4e-05,
+      "loss": 1.713558554649353,
+      "step": 2
+    },
+    {
+      "epoch": 0.0017301038062283738,
+      "grad_norm": 0.6324027180671692,
+      "learning_rate": 8e-05,
+      "loss": 1.9871511459350586,
+      "step": 3
+    },
+    {
+      "epoch": 0.002306805074971165,
+      "grad_norm": 0.5307025909423828,
+      "learning_rate": 0.00012,
+      "loss": 1.6862211227416992,
+      "step": 4
+    },
+    {
+      "epoch": 0.0028835063437139563,
+      "grad_norm": 0.616538941860199,
+      "learning_rate": 0.00016,
+      "loss": 2.1033642292022705,
+      "step": 5
+    },
+    {
+      "epoch": 0.0034602076124567475,
+      "grad_norm": 0.7627953290939331,
+      "learning_rate": 0.0002,
+      "loss": 2.150984764099121,
+      "step": 6
+    },
+    {
+      "epoch": 0.004036908881199538,
+      "grad_norm": 0.8402333855628967,
+      "learning_rate": 0.00019996151625938042,
+      "loss": 2.0197458267211914,
+      "step": 7
+    },
+    {
+      "epoch": 0.00461361014994233,
+      "grad_norm": 3.813333034515381,
+      "learning_rate": 0.00019992303251876084,
+      "loss": 2.396656036376953,
+      "step": 8
+    },
+    {
+      "epoch": 0.005190311418685121,
+      "grad_norm": 0.9861733913421631,
+      "learning_rate": 0.00019988454877814126,
+      "loss": 2.1392970085144043,
+      "step": 9
+    },
+    {
+      "epoch": 0.0057670126874279125,
+      "grad_norm": 0.7931668758392334,
+      "learning_rate": 0.00019984606503752164,
+      "loss": 1.8062304258346558,
+      "step": 10
+    },
+    {
+      "epoch": 0.006343713956170703,
+      "grad_norm": 0.8828097581863403,
+      "learning_rate": 0.00019980758129690206,
+      "loss": 1.76358962059021,
+      "step": 11
+    },
+    {
+      "epoch": 0.006920415224913495,
+      "grad_norm": 0.7205682396888733,
+      "learning_rate": 0.00019976909755628247,
+      "loss": 1.3197358846664429,
+      "step": 12
+    },
+    {
+      "epoch": 0.007497116493656286,
+      "grad_norm": 1.2321408987045288,
+      "learning_rate": 0.0001997306138156629,
+      "loss": 1.7697328329086304,
+      "step": 13
+    },
+    {
+      "epoch": 0.008073817762399077,
+      "grad_norm": 0.9804911613464355,
+      "learning_rate": 0.0001996921300750433,
+      "loss": 1.7214155197143555,
+      "step": 14
+    },
+    {
+      "epoch": 0.00865051903114187,
+      "grad_norm": 0.9436901807785034,
+      "learning_rate": 0.00019965364633442372,
+      "loss": 1.6395944356918335,
+      "step": 15
+    },
+    {
+      "epoch": 0.00922722029988466,
+      "grad_norm": 1.6564269065856934,
+      "learning_rate": 0.00019961516259380414,
+      "loss": 1.8607707023620605,
+      "step": 16
+    },
+    {
+      "epoch": 0.00980392156862745,
+      "grad_norm": 1.0676305294036865,
+      "learning_rate": 0.00019957667885318455,
+      "loss": 1.4897263050079346,
+      "step": 17
+    },
+    {
+      "epoch": 0.010380622837370242,
+      "grad_norm": 0.9889469146728516,
+      "learning_rate": 0.00019953819511256494,
+      "loss": 1.7445942163467407,
+      "step": 18
+    },
+    {
+      "epoch": 0.010957324106113034,
+      "grad_norm": 0.8717456459999084,
+      "learning_rate": 0.00019949971137194535,
+      "loss": 1.4854474067687988,
+      "step": 19
+    },
+    {
+      "epoch": 0.011534025374855825,
+      "grad_norm": 1.110196590423584,
+      "learning_rate": 0.00019946122763132577,
+      "loss": 1.32136869430542,
+      "step": 20
+    },
+    {
+      "epoch": 0.012110726643598616,
+      "grad_norm": 0.7795314192771912,
+      "learning_rate": 0.00019942274389070618,
+      "loss": 1.7199318408966064,
+      "step": 21
+    },
+    {
+      "epoch": 0.012687427912341407,
+      "grad_norm": 0.7504187822341919,
+      "learning_rate": 0.0001993842601500866,
+      "loss": 1.2975201606750488,
+      "step": 22
+    },
+    {
+      "epoch": 0.0132641291810842,
+      "grad_norm": 0.8012252449989319,
+      "learning_rate": 0.00019934577640946702,
+      "loss": 1.2630457878112793,
+      "step": 23
+    },
+    {
+      "epoch": 0.01384083044982699,
+      "grad_norm": 0.9531145691871643,
+      "learning_rate": 0.00019930729266884743,
+      "loss": 1.6974424123764038,
+      "step": 24
+    },
+    {
+      "epoch": 0.01441753171856978,
+      "grad_norm": 1.020970106124878,
+      "learning_rate": 0.00019926880892822785,
+      "loss": 1.294957160949707,
+      "step": 25
+    },
+    {
+      "epoch": 0.014994232987312572,
+      "grad_norm": 1.7608129978179932,
+      "learning_rate": 0.00019923032518760823,
+      "loss": 1.801735520362854,
+      "step": 26
+    },
+    {
+      "epoch": 0.015570934256055362,
+      "grad_norm": 0.9601960182189941,
+      "learning_rate": 0.00019919184144698865,
+      "loss": 1.4538304805755615,
+      "step": 27
+    },
+    {
+      "epoch": 0.016147635524798153,
+      "grad_norm": 0.7025886178016663,
+      "learning_rate": 0.00019915335770636906,
+      "loss": 1.1746238470077515,
+      "step": 28
+    },
+    {
+      "epoch": 0.016724336793540944,
+      "grad_norm": 0.8506267666816711,
+      "learning_rate": 0.00019911487396574948,
+      "loss": 1.1891943216323853,
+      "step": 29
+    },
+    {
+      "epoch": 0.01730103806228374,
+      "grad_norm": 0.9117224216461182,
+      "learning_rate": 0.0001990763902251299,
+      "loss": 1.4325735569000244,
+      "step": 30
+    },
+    {
+      "epoch": 0.01787773933102653,
+      "grad_norm": 0.8756442070007324,
+      "learning_rate": 0.0001990379064845103,
+      "loss": 1.3962581157684326,
+      "step": 31
+    },
+    {
+      "epoch": 0.01845444059976932,
+      "grad_norm": 1.0293549299240112,
+      "learning_rate": 0.00019899942274389073,
+      "loss": 1.4936443567276,
+      "step": 32
+    },
+    {
+      "epoch": 0.01903114186851211,
+      "grad_norm": 0.8239012360572815,
+      "learning_rate": 0.00019896093900327114,
+      "loss": 1.1294159889221191,
+      "step": 33
+    },
+    {
+      "epoch": 0.0196078431372549,
+      "grad_norm": 0.6293753385543823,
+      "learning_rate": 0.00019892245526265153,
+      "loss": 1.219704031944275,
+      "step": 34
+    },
+    {
+      "epoch": 0.020184544405997693,
+      "grad_norm": 0.9778785109519958,
+      "learning_rate": 0.00019888397152203194,
+      "loss": 1.3405961990356445,
+      "step": 35
+    },
+    {
+      "epoch": 0.020761245674740483,
+      "grad_norm": 0.9916248917579651,
+      "learning_rate": 0.00019884548778141236,
+      "loss": 1.7191007137298584,
+      "step": 36
+    },
+    {
+      "epoch": 0.021337946943483274,
+      "grad_norm": 0.9758312106132507,
+      "learning_rate": 0.00019880700404079277,
+      "loss": 1.2949879169464111,
+      "step": 37
+    },
+    {
+      "epoch": 0.02191464821222607,
+      "grad_norm": 0.7310605645179749,
+      "learning_rate": 0.0001987685203001732,
+      "loss": 1.282931923866272,
+      "step": 38
+    },
+    {
+      "epoch": 0.02249134948096886,
+      "grad_norm": 0.6537899374961853,
+      "learning_rate": 0.0001987300365595536,
+      "loss": 1.4050456285476685,
+      "step": 39
+    },
+    {
+      "epoch": 0.02306805074971165,
+      "grad_norm": 0.6727839708328247,
+      "learning_rate": 0.00019869155281893402,
+      "loss": 1.3566672801971436,
+      "step": 40
+    },
+    {
+      "epoch": 0.02364475201845444,
+      "grad_norm": 0.6026540994644165,
+      "learning_rate": 0.00019865306907831444,
+      "loss": 1.6914572715759277,
+      "step": 41
+    },
+    {
+      "epoch": 0.02422145328719723,
+      "grad_norm": 0.7345203161239624,
+      "learning_rate": 0.00019861458533769482,
+      "loss": 1.3210856914520264,
+      "step": 42
+    },
+    {
+      "epoch": 0.024798154555940023,
+      "grad_norm": 1.7062476873397827,
+      "learning_rate": 0.00019857610159707524,
+      "loss": 1.6727783679962158,
+      "step": 43
+    },
+    {
+      "epoch": 0.025374855824682813,
+      "grad_norm": 0.7726621627807617,
+      "learning_rate": 0.00019853761785645565,
+      "loss": 1.7425484657287598,
+      "step": 44
+    },
+    {
+      "epoch": 0.025951557093425604,
+      "grad_norm": 0.6947644948959351,
+      "learning_rate": 0.00019849913411583607,
+      "loss": 1.0628504753112793,
+      "step": 45
+    },
+    {
+      "epoch": 0.0265282583621684,
+      "grad_norm": 0.7833652496337891,
+      "learning_rate": 0.00019846065037521649,
+      "loss": 1.4800021648406982,
+      "step": 46
+    },
+    {
+      "epoch": 0.02710495963091119,
+      "grad_norm": 0.8065851926803589,
+      "learning_rate": 0.0001984221666345969,
+      "loss": 1.2809616327285767,
+      "step": 47
+    },
+    {
+      "epoch": 0.02768166089965398,
+      "grad_norm": 1.044630527496338,
+      "learning_rate": 0.00019838368289397732,
+      "loss": 1.602962851524353,
+      "step": 48
+    },
+    {
+      "epoch": 0.02825836216839677,
+      "grad_norm": 0.5969672203063965,
+      "learning_rate": 0.00019834519915335773,
+      "loss": 1.5166534185409546,
+      "step": 49
+    },
+    {
+      "epoch": 0.02883506343713956,
+      "grad_norm": 0.848512589931488,
+      "learning_rate": 0.00019830671541273812,
+      "loss": 1.442568063735962,
+      "step": 50
+    },
+    {
+      "epoch": 0.029411764705882353,
+      "grad_norm": 0.5782500505447388,
+      "learning_rate": 0.00019826823167211853,
+      "loss": 1.3492627143859863,
+      "step": 51
+    },
+    {
+      "epoch": 0.029988465974625143,
+      "grad_norm": 0.850151777267456,
+      "learning_rate": 0.00019822974793149895,
+      "loss": 1.5313668251037598,
+      "step": 52
+    },
+    {
+      "epoch": 0.030565167243367934,
+      "grad_norm": 0.613896906375885,
+      "learning_rate": 0.00019819126419087937,
+      "loss": 1.0709185600280762,
+      "step": 53
+    },
+    {
+      "epoch": 0.031141868512110725,
+      "grad_norm": 0.9450347423553467,
+      "learning_rate": 0.00019815278045025978,
+      "loss": 1.5562160015106201,
+      "step": 54
+    },
+    {
+      "epoch": 0.031718569780853516,
+      "grad_norm": 0.9424428939819336,
+      "learning_rate": 0.0001981142967096402,
+      "loss": 1.764065146446228,
+      "step": 55
+    },
+    {
+      "epoch": 0.03229527104959631,
+      "grad_norm": 0.9744471311569214,
+      "learning_rate": 0.0001980758129690206,
+      "loss": 0.9400297403335571,
+      "step": 56
+    },
+    {
+      "epoch": 0.0328719723183391,
+      "grad_norm": 0.7247487902641296,
+      "learning_rate": 0.00019803732922840103,
+      "loss": 1.572107195854187,
+      "step": 57
+    },
+    {
+      "epoch": 0.03344867358708189,
+      "grad_norm": 0.6125597357749939,
+      "learning_rate": 0.00019799884548778141,
+      "loss": 1.2189209461212158,
+      "step": 58
+    },
+    {
+      "epoch": 0.034025374855824686,
+      "grad_norm": 1.0781699419021606,
+      "learning_rate": 0.00019796036174716183,
+      "loss": 1.3933414220809937,
+      "step": 59
+    },
+    {
+      "epoch": 0.03460207612456748,
+      "grad_norm": 0.8329439759254456,
+      "learning_rate": 0.00019792187800654224,
+      "loss": 1.4748475551605225,
+      "step": 60
+    },
+    {
+      "epoch": 0.03517877739331027,
+      "grad_norm": 0.7766849398612976,
+      "learning_rate": 0.00019788339426592266,
+      "loss": 1.4775745868682861,
+      "step": 61
+    },
+    {
+      "epoch": 0.03575547866205306,
+      "grad_norm": 0.7776947021484375,
+      "learning_rate": 0.00019784491052530308,
+      "loss": 1.4959548711776733,
+      "step": 62
+    },
+    {
+      "epoch": 0.03633217993079585,
+      "grad_norm": 0.7114179134368896,
+      "learning_rate": 0.0001978064267846835,
+      "loss": 1.4756664037704468,
+      "step": 63
+    },
+    {
+      "epoch": 0.03690888119953864,
+      "grad_norm": 0.675800621509552,
+      "learning_rate": 0.0001977679430440639,
+      "loss": 1.4753670692443848,
+      "step": 64
+    },
+    {
+      "epoch": 0.03748558246828143,
+      "grad_norm": 1.5709729194641113,
+      "learning_rate": 0.00019772945930344432,
+      "loss": 1.5947999954223633,
+      "step": 65
+    },
+    {
+      "epoch": 0.03806228373702422,
+      "grad_norm": 0.7363697290420532,
+      "learning_rate": 0.0001976909755628247,
+      "loss": 1.2786856889724731,
+      "step": 66
+    },
+    {
+      "epoch": 0.03863898500576701,
+      "grad_norm": 0.8212243318557739,
+      "learning_rate": 0.00019765249182220512,
+      "loss": 1.3553478717803955,
+      "step": 67
+    },
+    {
+      "epoch": 0.0392156862745098,
+      "grad_norm": 0.6724039912223816,
+      "learning_rate": 0.00019761400808158554,
+      "loss": 1.3045082092285156,
+      "step": 68
+    },
+    {
+      "epoch": 0.039792387543252594,
+      "grad_norm": 1.0372695922851562,
+      "learning_rate": 0.00019757552434096596,
+      "loss": 1.5149048566818237,
+      "step": 69
+    },
+    {
+      "epoch": 0.040369088811995385,
+      "grad_norm": 0.7058703303337097,
+      "learning_rate": 0.00019753704060034637,
+      "loss": 1.2227076292037964,
+      "step": 70
+    },
+    {
+      "epoch": 0.040945790080738176,
+      "grad_norm": 0.8637105226516724,
+      "learning_rate": 0.00019749855685972679,
+      "loss": 1.0762852430343628,
+      "step": 71
+    },
+    {
+      "epoch": 0.04152249134948097,
+      "grad_norm": 0.8108904957771301,
+      "learning_rate": 0.0001974600731191072,
+      "loss": 1.4130628108978271,
+      "step": 72
+    },
+    {
+      "epoch": 0.04209919261822376,
+      "grad_norm": 1.2491207122802734,
+      "learning_rate": 0.00019742158937848762,
+      "loss": 1.7983347177505493,
+      "step": 73
+    },
+    {
+      "epoch": 0.04267589388696655,
+      "grad_norm": 1.1523128747940063,
+      "learning_rate": 0.000197383105637868,
+      "loss": 1.5859603881835938,
+      "step": 74
+    },
+    {
+      "epoch": 0.04325259515570934,
+      "grad_norm": 0.7240892648696899,
+      "learning_rate": 0.00019734462189724842,
+      "loss": 1.4029178619384766,
+      "step": 75
+    },
+    {
+      "epoch": 0.04382929642445214,
+      "grad_norm": 0.7445366978645325,
+      "learning_rate": 0.00019730613815662884,
+      "loss": 1.351811170578003,
+      "step": 76
+    },
+    {
+      "epoch": 0.04440599769319493,
+      "grad_norm": 0.9881113767623901,
+      "learning_rate": 0.00019726765441600925,
+      "loss": 1.437370777130127,
+      "step": 77
+    },
+    {
+      "epoch": 0.04498269896193772,
+      "grad_norm": 1.0404249429702759,
+      "learning_rate": 0.00019722917067538967,
+      "loss": 1.0401325225830078,
+      "step": 78
+    },
+    {
+      "epoch": 0.04555940023068051,
+      "grad_norm": 0.998892605304718,
+      "learning_rate": 0.00019719068693477008,
+      "loss": 1.2733221054077148,
+      "step": 79
+    },
+    {
+      "epoch": 0.0461361014994233,
+      "grad_norm": 1.0299255847930908,
+      "learning_rate": 0.0001971522031941505,
+      "loss": 1.8878190517425537,
+      "step": 80
+    },
+    {
+      "epoch": 0.04671280276816609,
+      "grad_norm": 0.6168495416641235,
+      "learning_rate": 0.0001971137194535309,
+      "loss": 1.3375468254089355,
+      "step": 81
+    },
+    {
+      "epoch": 0.04728950403690888,
+      "grad_norm": 0.645830512046814,
+      "learning_rate": 0.0001970752357129113,
+      "loss": 0.986657440662384,
+      "step": 82
+    },
+    {
+      "epoch": 0.04786620530565167,
+      "grad_norm": 0.7971145510673523,
+      "learning_rate": 0.00019703675197229172,
+      "loss": 1.3205912113189697,
+      "step": 83
+    },
+    {
+      "epoch": 0.04844290657439446,
+      "grad_norm": 0.6297418475151062,
+      "learning_rate": 0.00019699826823167213,
+      "loss": 1.3360888957977295,
+      "step": 84
+    },
+    {
+      "epoch": 0.049019607843137254,
+      "grad_norm": 0.9845420718193054,
+      "learning_rate": 0.00019695978449105255,
+      "loss": 1.4006659984588623,
+      "step": 85
+    },
+    {
+      "epoch": 0.049596309111880045,
+      "grad_norm": 0.73700350522995,
+      "learning_rate": 0.00019692130075043296,
+      "loss": 1.1298922300338745,
+      "step": 86
+    },
+    {
+      "epoch": 0.050173010380622836,
+      "grad_norm": 0.7659608721733093,
+      "learning_rate": 0.00019688281700981338,
+      "loss": 1.2487225532531738,
+      "step": 87
+    },
+    {
+      "epoch": 0.05074971164936563,
+      "grad_norm": 0.7576966285705566,
+      "learning_rate": 0.0001968443332691938,
+      "loss": 1.346827507019043,
+      "step": 88
+    },
+    {
+      "epoch": 0.05132641291810842,
+      "grad_norm": 0.6777650117874146,
+      "learning_rate": 0.0001968058495285742,
+      "loss": 1.9484481811523438,
+      "step": 89
+    },
+    {
+      "epoch": 0.05190311418685121,
+      "grad_norm": 0.9935969114303589,
+      "learning_rate": 0.0001967673657879546,
+      "loss": 1.1737089157104492,
+      "step": 90
+    },
+    {
+      "epoch": 0.052479815455594,
+      "grad_norm": 1.0581051111221313,
+      "learning_rate": 0.000196728882047335,
+      "loss": 1.2755905389785767,
+      "step": 91
+    },
+    {
+      "epoch": 0.0530565167243368,
+      "grad_norm": 0.8372200131416321,
+      "learning_rate": 0.00019669039830671543,
+      "loss": 1.7988427877426147,
+      "step": 92
+    },
+    {
+      "epoch": 0.05363321799307959,
+      "grad_norm": 0.8300452828407288,
+      "learning_rate": 0.00019665191456609584,
+      "loss": 0.9904743432998657,
+      "step": 93
+    },
+    {
+      "epoch": 0.05420991926182238,
+      "grad_norm": 0.6703553199768066,
+      "learning_rate": 0.00019661343082547626,
+      "loss": 1.2092053890228271,
+      "step": 94
+    },
+    {
+      "epoch": 0.05478662053056517,
+      "grad_norm": 0.703804075717926,
+      "learning_rate": 0.00019657494708485667,
+      "loss": 1.1028215885162354,
+      "step": 95
+    },
+    {
+      "epoch": 0.05536332179930796,
+      "grad_norm": 0.8232657313346863,
+      "learning_rate": 0.0001965364633442371,
+      "loss": 1.3875727653503418,
+      "step": 96
+    },
+    {
+      "epoch": 0.05594002306805075,
+      "grad_norm": 0.6119164824485779,
+      "learning_rate": 0.00019649797960361747,
+      "loss": 1.161183476448059,
+      "step": 97
+    },
+    {
+      "epoch": 0.05651672433679354,
+      "grad_norm": 0.7460926175117493,
+      "learning_rate": 0.0001964594958629979,
+      "loss": 1.3667285442352295,
+      "step": 98
+    },
+    {
+      "epoch": 0.05709342560553633,
+      "grad_norm": 0.6345133185386658,
+      "learning_rate": 0.0001964210121223783,
+      "loss": 1.1740115880966187,
+      "step": 99
+    },
+    {
+      "epoch": 0.05767012687427912,
+      "grad_norm": 0.800463080406189,
+      "learning_rate": 0.00019638252838175872,
+      "loss": 1.1274670362472534,
+      "step": 100
+    },
+    {
+      "epoch": 0.058246828143021914,
+      "grad_norm": 0.6817663311958313,
+      "learning_rate": 0.00019634404464113914,
+      "loss": 1.2432150840759277,
+      "step": 101
+    },
+    {
+      "epoch": 0.058823529411764705,
+      "grad_norm": 0.7663673162460327,
+      "learning_rate": 0.00019630556090051955,
+      "loss": 1.2066948413848877,
+      "step": 102
+    },
+    {
+      "epoch": 0.059400230680507496,
+      "grad_norm": 1.0259535312652588,
+      "learning_rate": 0.00019626707715989997,
+      "loss": 1.3713116645812988,
+      "step": 103
+    },
+    {
+      "epoch": 0.05997693194925029,
+      "grad_norm": 0.6617158055305481,
+      "learning_rate": 0.00019622859341928038,
+      "loss": 1.0320123434066772,
+      "step": 104
+    },
+    {
+      "epoch": 0.06055363321799308,
+      "grad_norm": 1.0050235986709595,
+      "learning_rate": 0.00019619010967866077,
+      "loss": 1.5375267267227173,
+      "step": 105
+    },
+    {
+      "epoch": 0.06113033448673587,
+      "grad_norm": 0.5563177466392517,
+      "learning_rate": 0.00019615162593804119,
+      "loss": 0.9102802276611328,
+      "step": 106
+    },
+    {
+      "epoch": 0.06170703575547866,
+      "grad_norm": 0.9994164109230042,
+      "learning_rate": 0.0001961131421974216,
+      "loss": 1.6505589485168457,
+      "step": 107
+    },
+    {
+      "epoch": 0.06228373702422145,
+      "grad_norm": 0.907625675201416,
+      "learning_rate": 0.00019607465845680202,
+      "loss": 1.6013598442077637,
+      "step": 108
+    },
+    {
+      "epoch": 0.06286043829296424,
+      "grad_norm": 1.0009554624557495,
+      "learning_rate": 0.00019603617471618243,
+      "loss": 1.0403454303741455,
+      "step": 109
+    },
+    {
+      "epoch": 0.06343713956170703,
+      "grad_norm": 0.8243467807769775,
+      "learning_rate": 0.00019599769097556285,
+      "loss": 1.5382654666900635,
+      "step": 110
+    },
+    {
+      "epoch": 0.06401384083044982,
+      "grad_norm": 1.0160003900527954,
+      "learning_rate": 0.00019595920723494326,
+      "loss": 1.2732863426208496,
+      "step": 111
+    },
+    {
+      "epoch": 0.06459054209919261,
+      "grad_norm": 0.608269453048706,
+      "learning_rate": 0.00019592072349432368,
+      "loss": 1.070478916168213,
+      "step": 112
+    },
+    {
+      "epoch": 0.0651672433679354,
+      "grad_norm": 0.7176778316497803,
+      "learning_rate": 0.00019588223975370406,
+      "loss": 1.302718162536621,
+      "step": 113
+    },
+    {
+      "epoch": 0.0657439446366782,
+      "grad_norm": 0.551771879196167,
+      "learning_rate": 0.00019584375601308448,
+      "loss": 0.9242706894874573,
+      "step": 114
+    },
+    {
+      "epoch": 0.06632064590542099,
+      "grad_norm": 0.9680222868919373,
+      "learning_rate": 0.0001958052722724649,
+      "loss": 1.9658548831939697,
+      "step": 115
+    },
+    {
+      "epoch": 0.06689734717416378,
+      "grad_norm": 0.8025707602500916,
+      "learning_rate": 0.0001957667885318453,
+      "loss": 1.5753577947616577,
+      "step": 116
+    },
+    {
+      "epoch": 0.06747404844290658,
+      "grad_norm": 0.7211287021636963,
+      "learning_rate": 0.00019572830479122573,
+      "loss": 1.3677327632904053,
+      "step": 117
+    },
+    {
+      "epoch": 0.06805074971164937,
+      "grad_norm": 0.7547542452812195,
+      "learning_rate": 0.00019568982105060614,
+      "loss": 1.507096767425537,
+      "step": 118
+    },
+    {
+      "epoch": 0.06862745098039216,
+      "grad_norm": 0.6146650314331055,
+      "learning_rate": 0.00019565133730998656,
+      "loss": 1.1320711374282837,
+      "step": 119
+    },
+    {
+      "epoch": 0.06920415224913495,
+      "grad_norm": 0.7611070275306702,
+      "learning_rate": 0.00019561285356936697,
+      "loss": 1.207049012184143,
+      "step": 120
+    },
+    {
+      "epoch": 0.06978085351787774,
+      "grad_norm": 0.714883029460907,
+      "learning_rate": 0.00019557436982874736,
+      "loss": 1.3823729753494263,
+      "step": 121
+    },
+    {
+      "epoch": 0.07035755478662054,
+      "grad_norm": 0.6768732666969299,
+      "learning_rate": 0.00019553588608812778,
+      "loss": 1.3038188219070435,
+      "step": 122
+    },
+    {
+      "epoch": 0.07093425605536333,
+      "grad_norm": 0.6013675332069397,
+      "learning_rate": 0.0001954974023475082,
+      "loss": 1.056199073791504,
+      "step": 123
+    },
+    {
+      "epoch": 0.07151095732410612,
+      "grad_norm": 0.8240784406661987,
+      "learning_rate": 0.0001954589186068886,
+      "loss": 1.4242757558822632,
+      "step": 124
+    },
+    {
+      "epoch": 0.07208765859284891,
+      "grad_norm": 0.6539785265922546,
+      "learning_rate": 0.00019542043486626902,
+      "loss": 1.161075472831726,
+      "step": 125
+    },
+    {
+      "epoch": 0.0726643598615917,
+      "grad_norm": 0.6347744464874268,
+      "learning_rate": 0.00019538195112564944,
+      "loss": 1.179503321647644,
+      "step": 126
+    },
+    {
+      "epoch": 0.07324106113033449,
+      "grad_norm": 0.7294688820838928,
+      "learning_rate": 0.00019534346738502985,
+      "loss": 1.2521535158157349,
+      "step": 127
+    },
+    {
+      "epoch": 0.07381776239907728,
+      "grad_norm": 0.6087843179702759,
+      "learning_rate": 0.00019530498364441027,
+      "loss": 1.0938013792037964,
+      "step": 128
+    },
+    {
+      "epoch": 0.07439446366782007,
+      "grad_norm": 1.116716980934143,
+      "learning_rate": 0.00019526649990379066,
+      "loss": 1.74098539352417,
+      "step": 129
+    },
+    {
+      "epoch": 0.07497116493656286,
+      "grad_norm": 0.7590331435203552,
+      "learning_rate": 0.00019522801616317107,
+      "loss": 1.2943538427352905,
+      "step": 130
+    },
+    {
+      "epoch": 0.07554786620530565,
+      "grad_norm": 0.9142744541168213,
+      "learning_rate": 0.00019518953242255149,
+      "loss": 1.0948201417922974,
+      "step": 131
+    },
+    {
+      "epoch": 0.07612456747404844,
+      "grad_norm": 0.8165064454078674,
+      "learning_rate": 0.0001951510486819319,
+      "loss": 1.5152888298034668,
+      "step": 132
+    },
+    {
+      "epoch": 0.07670126874279123,
+      "grad_norm": 0.8904751539230347,
+      "learning_rate": 0.00019511256494131232,
+      "loss": 1.3492425680160522,
+      "step": 133
+    },
+    {
+      "epoch": 0.07727797001153403,
+      "grad_norm": 0.632338285446167,
+      "learning_rate": 0.00019507408120069273,
+      "loss": 1.1460604667663574,
+      "step": 134
+    },
+    {
+      "epoch": 0.07785467128027682,
+      "grad_norm": 0.6621445417404175,
+      "learning_rate": 0.00019503559746007315,
+      "loss": 1.153398871421814,
+      "step": 135
+    },
+    {
+      "epoch": 0.0784313725490196,
+      "grad_norm": 0.928593635559082,
+      "learning_rate": 0.00019499711371945356,
+      "loss": 1.4575080871582031,
+      "step": 136
+    },
+    {
+      "epoch": 0.0790080738177624,
+      "grad_norm": 0.9125704765319824,
+      "learning_rate": 0.00019495862997883395,
+      "loss": 1.176555871963501,
+      "step": 137
+    },
+    {
+      "epoch": 0.07958477508650519,
+      "grad_norm": 0.7735126614570618,
+      "learning_rate": 0.00019492014623821437,
+      "loss": 1.3028615713119507,
+      "step": 138
+    },
+    {
+      "epoch": 0.08016147635524798,
+      "grad_norm": 1.4182281494140625,
+      "learning_rate": 0.00019488166249759478,
+      "loss": 1.7123095989227295,
+      "step": 139
+    },
+    {
+      "epoch": 0.08073817762399077,
+      "grad_norm": 0.957777738571167,
+      "learning_rate": 0.0001948431787569752,
+      "loss": 1.2952847480773926,
+      "step": 140
+    },
+    {
+      "epoch": 0.08131487889273356,
+      "grad_norm": 0.6284865140914917,
+      "learning_rate": 0.0001948046950163556,
+      "loss": 1.063300609588623,
+      "step": 141
+    },
+    {
+      "epoch": 0.08189158016147635,
+      "grad_norm": 1.020240068435669,
+      "learning_rate": 0.00019476621127573603,
+      "loss": 1.0956578254699707,
+      "step": 142
+    },
+    {
+      "epoch": 0.08246828143021914,
+      "grad_norm": 0.9629870057106018,
+      "learning_rate": 0.00019472772753511644,
+      "loss": 1.6626744270324707,
+      "step": 143
+    },
+    {
+      "epoch": 0.08304498269896193,
+      "grad_norm": 0.723129391670227,
+      "learning_rate": 0.00019468924379449686,
+      "loss": 1.5930454730987549,
+      "step": 144
+    },
+    {
+      "epoch": 0.08362168396770472,
+      "grad_norm": 0.6031758785247803,
+      "learning_rate": 0.00019465076005387725,
+      "loss": 1.3550267219543457,
+      "step": 145
+    },
+    {
+      "epoch": 0.08419838523644751,
+      "grad_norm": 0.6608120799064636,
+      "learning_rate": 0.00019461227631325766,
+      "loss": 1.091226577758789,
+      "step": 146
+    },
+    {
+      "epoch": 0.0847750865051903,
+      "grad_norm": 0.8583825826644897,
+      "learning_rate": 0.00019457379257263808,
+      "loss": 1.2840064764022827,
+      "step": 147
+    },
+    {
+      "epoch": 0.0853517877739331,
+      "grad_norm": 0.6371753215789795,
+      "learning_rate": 0.0001945353088320185,
+      "loss": 1.0223405361175537,
+      "step": 148
+    },
+    {
+      "epoch": 0.08592848904267589,
+      "grad_norm": 0.6101475954055786,
+      "learning_rate": 0.0001944968250913989,
+      "loss": 1.2935165166854858,
+      "step": 149
+    },
+    {
+      "epoch": 0.08650519031141868,
+      "grad_norm": 0.8921840190887451,
+      "learning_rate": 0.00019445834135077932,
+      "loss": 1.3194819688796997,
+      "step": 150
+    },
+    {
+      "epoch": 0.08708189158016148,
+      "grad_norm": 1.0423651933670044,
+      "learning_rate": 0.0001944198576101597,
+      "loss": 1.162503957748413,
+      "step": 151
+    },
+    {
+      "epoch": 0.08765859284890427,
+      "grad_norm": 0.9011998772621155,
+      "learning_rate": 0.00019438137386954013,
+      "loss": 1.4854192733764648,
+      "step": 152
+    },
+    {
+      "epoch": 0.08823529411764706,
+      "grad_norm": 0.6850185990333557,
+      "learning_rate": 0.00019434289012892054,
+      "loss": 1.2653287649154663,
+      "step": 153
+    },
+    {
+      "epoch": 0.08881199538638986,
+      "grad_norm": 0.5742697715759277,
+      "learning_rate": 0.00019430440638830093,
+      "loss": 1.1639142036437988,
+      "step": 154
+    },
+    {
+      "epoch": 0.08938869665513265,
+      "grad_norm": 0.5625914931297302,
+      "learning_rate": 0.00019426592264768134,
+      "loss": 1.0387107133865356,
+      "step": 155
+    },
+    {
+      "epoch": 0.08996539792387544,
+      "grad_norm": 0.7183355689048767,
+      "learning_rate": 0.00019422743890706176,
+      "loss": 1.211965799331665,
+      "step": 156
+    },
+    {
+      "epoch": 0.09054209919261823,
+      "grad_norm": 0.8835011124610901,
+      "learning_rate": 0.00019418895516644217,
+      "loss": 1.0958670377731323,
+      "step": 157
+    },
+    {
+      "epoch": 0.09111880046136102,
+      "grad_norm": 0.6885069608688354,
+      "learning_rate": 0.0001941504714258226,
+      "loss": 1.297393798828125,
+      "step": 158
+    },
+    {
+      "epoch": 0.09169550173010381,
+      "grad_norm": 0.7518923878669739,
+      "learning_rate": 0.000194111987685203,
+      "loss": 1.1739790439605713,
+      "step": 159
+    },
+    {
+      "epoch": 0.0922722029988466,
+      "grad_norm": 0.8452180027961731,
+      "learning_rate": 0.00019407350394458342,
+      "loss": 1.2312185764312744,
+      "step": 160
+    },
+    {
+      "epoch": 0.09284890426758939,
+      "grad_norm": 0.8018324971199036,
+      "learning_rate": 0.00019403502020396384,
+      "loss": 1.392999291419983,
+      "step": 161
+    },
+    {
+      "epoch": 0.09342560553633218,
+      "grad_norm": 0.743302583694458,
+      "learning_rate": 0.00019399653646334422,
+      "loss": 1.1602349281311035,
+      "step": 162
+    },
+    {
+      "epoch": 0.09400230680507497,
+      "grad_norm": 0.551163911819458,
+      "learning_rate": 0.00019395805272272464,
+      "loss": 1.0061742067337036,
+      "step": 163
+    },
+    {
+      "epoch": 0.09457900807381776,
+      "grad_norm": 0.6732088327407837,
+      "learning_rate": 0.00019391956898210505,
+      "loss": 1.2422168254852295,
+      "step": 164
+    },
+    {
+      "epoch": 0.09515570934256055,
+      "grad_norm": 0.6432737708091736,
+      "learning_rate": 0.00019388108524148547,
+      "loss": 0.8992981910705566,
+      "step": 165
+    },
+    {
+      "epoch": 0.09573241061130335,
+      "grad_norm": 0.893099308013916,
+      "learning_rate": 0.00019384260150086589,
+      "loss": 1.4426004886627197,
+      "step": 166
+    },
+    {
+      "epoch": 0.09630911188004614,
+      "grad_norm": 0.7915064692497253,
+      "learning_rate": 0.0001938041177602463,
+      "loss": 1.1332988739013672,
+      "step": 167
+    },
+    {
+      "epoch": 0.09688581314878893,
+      "grad_norm": 0.7785482406616211,
+      "learning_rate": 0.00019376563401962672,
+      "loss": 1.1662797927856445,
+      "step": 168
+    },
+    {
+      "epoch": 0.09746251441753172,
+      "grad_norm": 0.7676025032997131,
+      "learning_rate": 0.00019372715027900713,
+      "loss": 1.276615858078003,
+      "step": 169
+    },
+    {
+      "epoch": 0.09803921568627451,
+      "grad_norm": 0.7058248519897461,
+      "learning_rate": 0.00019368866653838752,
+      "loss": 1.2280982732772827,
+      "step": 170
+    },
+    {
+      "epoch": 0.0986159169550173,
+      "grad_norm": 0.7814574241638184,
+      "learning_rate": 0.00019365018279776793,
+      "loss": 1.6545538902282715,
+      "step": 171
+    },
+    {
+      "epoch": 0.09919261822376009,
+      "grad_norm": 0.5429863333702087,
+      "learning_rate": 0.00019361169905714835,
+      "loss": 1.047904133796692,
+      "step": 172
+    },
+    {
+      "epoch": 0.09976931949250288,
+      "grad_norm": 0.7021914124488831,
+      "learning_rate": 0.00019357321531652876,
+      "loss": 1.3578035831451416,
+      "step": 173
+    },
+    {
+      "epoch": 0.10034602076124567,
+      "grad_norm": 0.7608473896980286,
+      "learning_rate": 0.00019353473157590918,
+      "loss": 1.3332273960113525,
+      "step": 174
+    },
+    {
+      "epoch": 0.10092272202998846,
+      "grad_norm": 0.8988219499588013,
+      "learning_rate": 0.0001934962478352896,
+      "loss": 1.5955560207366943,
+      "step": 175
+    },
+    {
+      "epoch": 0.10149942329873125,
+      "grad_norm": 0.8784334659576416,
+      "learning_rate": 0.00019345776409467,
+      "loss": 1.4267313480377197,
+      "step": 176
+    },
+    {
+      "epoch": 0.10207612456747404,
+      "grad_norm": 0.9006462097167969,
+      "learning_rate": 0.00019341928035405043,
+      "loss": 1.2960124015808105,
+      "step": 177
+    },
+    {
+      "epoch": 0.10265282583621683,
+      "grad_norm": 0.7736122608184814,
+      "learning_rate": 0.00019338079661343081,
+      "loss": 1.3841434717178345,
+      "step": 178
+    },
+    {
+      "epoch": 0.10322952710495963,
+      "grad_norm": 0.8202458620071411,
+      "learning_rate": 0.00019334231287281123,
+      "loss": 1.2962226867675781,
+      "step": 179
+    },
+    {
+      "epoch": 0.10380622837370242,
+      "grad_norm": 0.743390679359436,
+      "learning_rate": 0.00019330382913219164,
+      "loss": 1.010484218597412,
+      "step": 180
+    },
+    {
+      "epoch": 0.10438292964244521,
+      "grad_norm": 0.7926476001739502,
+      "learning_rate": 0.00019326534539157206,
+      "loss": 1.45333731174469,
+      "step": 181
+    },
+    {
+      "epoch": 0.104959630911188,
+      "grad_norm": 0.527367889881134,
+      "learning_rate": 0.00019322686165095248,
+      "loss": 0.7763160467147827,
+      "step": 182
+    },
+    {
+      "epoch": 0.10553633217993079,
+      "grad_norm": 1.0006170272827148,
+      "learning_rate": 0.0001931883779103329,
+      "loss": 1.089290738105774,
+      "step": 183
+    },
+    {
+      "epoch": 0.1061130334486736,
+      "grad_norm": 0.7497840523719788,
+      "learning_rate": 0.0001931498941697133,
+      "loss": 1.1641783714294434,
+      "step": 184
+    },
+    {
+      "epoch": 0.10668973471741638,
+      "grad_norm": 0.6732814908027649,
+      "learning_rate": 0.00019311141042909372,
+      "loss": 1.0954653024673462,
+      "step": 185
+    },
+    {
+      "epoch": 0.10726643598615918,
+      "grad_norm": 0.7817464470863342,
+      "learning_rate": 0.0001930729266884741,
+      "loss": 1.5050190687179565,
+      "step": 186
+    },
+    {
+      "epoch": 0.10784313725490197,
+      "grad_norm": 0.813869297504425,
+      "learning_rate": 0.00019303444294785452,
+      "loss": 1.5048751831054688,
+      "step": 187
+    },
+    {
+      "epoch": 0.10841983852364476,
+      "grad_norm": 0.6368386745452881,
+      "learning_rate": 0.00019299595920723494,
+      "loss": 1.0601242780685425,
+      "step": 188
+    },
+    {
+      "epoch": 0.10899653979238755,
+      "grad_norm": 0.817610502243042,
+      "learning_rate": 0.00019295747546661536,
+      "loss": 1.2267041206359863,
+      "step": 189
+    },
+    {
+      "epoch": 0.10957324106113034,
+      "grad_norm": 0.768892228603363,
+      "learning_rate": 0.00019291899172599577,
+      "loss": 1.0935152769088745,
+      "step": 190
+    },
+    {
+      "epoch": 0.11014994232987313,
+      "grad_norm": 0.8072124123573303,
+      "learning_rate": 0.00019288050798537619,
+      "loss": 1.5566798448562622,
+      "step": 191
+    },
+    {
+      "epoch": 0.11072664359861592,
+      "grad_norm": 0.7275574803352356,
+      "learning_rate": 0.0001928420242447566,
+      "loss": 1.5278323888778687,
+      "step": 192
+    },
+    {
+      "epoch": 0.11130334486735871,
+      "grad_norm": 0.6448370814323425,
+      "learning_rate": 0.00019280354050413702,
+      "loss": 1.2096084356307983,
+      "step": 193
+    },
+    {
+      "epoch": 0.1118800461361015,
+      "grad_norm": 0.9334590435028076,
+      "learning_rate": 0.0001927650567635174,
+      "loss": 1.2487378120422363,
+      "step": 194
+    },
+    {
+      "epoch": 0.11245674740484429,
+      "grad_norm": 0.6830427646636963,
+      "learning_rate": 0.00019272657302289782,
+      "loss": 1.3567012548446655,
+      "step": 195
+    },
+    {
+      "epoch": 0.11303344867358708,
+      "grad_norm": 0.9035089612007141,
+      "learning_rate": 0.00019268808928227823,
+      "loss": 1.1751577854156494,
+      "step": 196
+    },
+    {
+      "epoch": 0.11361014994232987,
+      "grad_norm": 0.5569579005241394,
+      "learning_rate": 0.00019264960554165865,
+      "loss": 1.0159823894500732,
+      "step": 197
+    },
+    {
+      "epoch": 0.11418685121107267,
+      "grad_norm": 0.6232113838195801,
+      "learning_rate": 0.00019261112180103907,
+      "loss": 1.0779603719711304,
+      "step": 198
+    },
+    {
+      "epoch": 0.11476355247981546,
+      "grad_norm": 0.7666590213775635,
+      "learning_rate": 0.00019257263806041948,
+      "loss": 1.2052793502807617,
+      "step": 199
+    },
+    {
+      "epoch": 0.11534025374855825,
+      "grad_norm": 0.6218665242195129,
+      "learning_rate": 0.0001925341543197999,
+      "loss": 1.2699958086013794,
+      "step": 200
+    },
+    {
+      "epoch": 0.11591695501730104,
+      "grad_norm": 0.6059345006942749,
+      "learning_rate": 0.0001924956705791803,
+      "loss": 1.0522977113723755,
+      "step": 201
+    },
+    {
+      "epoch": 0.11649365628604383,
+      "grad_norm": 0.6952403783798218,
+      "learning_rate": 0.0001924571868385607,
+      "loss": 1.3461261987686157,
+      "step": 202
+    },
+    {
+      "epoch": 0.11707035755478662,
+      "grad_norm": 0.7097076177597046,
+      "learning_rate": 0.00019241870309794111,
+      "loss": 1.0901520252227783,
+      "step": 203
+    },
+    {
+      "epoch": 0.11764705882352941,
+      "grad_norm": 1.3426554203033447,
+      "learning_rate": 0.00019238021935732153,
+      "loss": 1.8886399269104004,
+      "step": 204
+    },
+    {
+      "epoch": 0.1182237600922722,
+      "grad_norm": 1.00478196144104,
+      "learning_rate": 0.00019234173561670195,
+      "loss": 1.2172045707702637,
+      "step": 205
+    },
+    {
+      "epoch": 0.11880046136101499,
+      "grad_norm": 0.8586134314537048,
+      "learning_rate": 0.00019230325187608236,
+      "loss": 1.0469045639038086,
+      "step": 206
+    },
+    {
+      "epoch": 0.11937716262975778,
+      "grad_norm": 0.7872591018676758,
+      "learning_rate": 0.00019226476813546278,
+      "loss": 1.1137733459472656,
+      "step": 207
+    },
+    {
+      "epoch": 0.11995386389850057,
+      "grad_norm": 0.8721824884414673,
+      "learning_rate": 0.0001922262843948432,
+      "loss": 1.3743940591812134,
+      "step": 208
+    },
+    {
+      "epoch": 0.12053056516724336,
+      "grad_norm": 0.6212759613990784,
+      "learning_rate": 0.0001921878006542236,
+      "loss": 0.900457501411438,
+      "step": 209
+    },
+    {
+      "epoch": 0.12110726643598616,
+      "grad_norm": 1.0083750486373901,
+      "learning_rate": 0.000192149316913604,
+      "loss": 1.339089035987854,
+      "step": 210
+    },
+    {
+      "epoch": 0.12168396770472895,
+      "grad_norm": 0.794417142868042,
+      "learning_rate": 0.0001921108331729844,
+      "loss": 1.194704532623291,
+      "step": 211
+    },
+    {
+      "epoch": 0.12226066897347174,
+      "grad_norm": 1.1438184976577759,
+      "learning_rate": 0.00019207234943236483,
+      "loss": 1.3168675899505615,
+      "step": 212
+    },
+    {
+      "epoch": 0.12283737024221453,
+      "grad_norm": 0.5655554533004761,
+      "learning_rate": 0.00019203386569174524,
+      "loss": 1.008853793144226,
+      "step": 213
+    },
+    {
+      "epoch": 0.12341407151095732,
+      "grad_norm": 0.7868179082870483,
+      "learning_rate": 0.00019199538195112566,
+      "loss": 1.3174118995666504,
+      "step": 214
+    },
+    {
+      "epoch": 0.12399077277970011,
+      "grad_norm": 0.6736404299736023,
+      "learning_rate": 0.00019195689821050607,
+      "loss": 1.054055094718933,
+      "step": 215
+    },
+    {
+      "epoch": 0.1245674740484429,
+      "grad_norm": 0.7425172328948975,
+      "learning_rate": 0.00019191841446988649,
+      "loss": 1.2892072200775146,
+      "step": 216
+    },
+    {
+      "epoch": 0.1251441753171857,
+      "grad_norm": 0.7724793553352356,
+      "learning_rate": 0.00019187993072926687,
+      "loss": 1.3278907537460327,
+      "step": 217
+    },
+    {
+      "epoch": 0.12572087658592848,
+      "grad_norm": 0.7415600419044495,
+      "learning_rate": 0.0001918414469886473,
+      "loss": 1.1893579959869385,
+      "step": 218
+    },
+    {
+      "epoch": 0.12629757785467127,
+      "grad_norm": 0.8178536295890808,
+      "learning_rate": 0.0001918029632480277,
+      "loss": 1.3486452102661133,
+      "step": 219
+    },
+    {
+      "epoch": 0.12687427912341406,
+      "grad_norm": 0.803683340549469,
+      "learning_rate": 0.00019176447950740812,
+      "loss": 1.297539234161377,
+      "step": 220
+    },
+    {
+      "epoch": 0.12745098039215685,
+      "grad_norm": 0.6226982474327087,
+      "learning_rate": 0.00019172599576678854,
+      "loss": 1.0952314138412476,
+      "step": 221
+    },
+    {
+      "epoch": 0.12802768166089964,
+      "grad_norm": 0.652317225933075,
+      "learning_rate": 0.00019168751202616895,
+      "loss": 0.9360387325286865,
+      "step": 222
+    },
+    {
+      "epoch": 0.12860438292964244,
+      "grad_norm": 0.8147749900817871,
+      "learning_rate": 0.00019164902828554937,
+      "loss": 1.0632787942886353,
+      "step": 223
+    },
+    {
+      "epoch": 0.12918108419838523,
+      "grad_norm": 0.9202223420143127,
+      "learning_rate": 0.00019161054454492978,
+      "loss": 1.3678290843963623,
+      "step": 224
+    },
+    {
+      "epoch": 0.12975778546712802,
+      "grad_norm": 1.1951165199279785,
+      "learning_rate": 0.00019157206080431017,
+      "loss": 1.2670767307281494,
+      "step": 225
+    },
+    {
+      "epoch": 0.1303344867358708,
+      "grad_norm": 0.7266793847084045,
+      "learning_rate": 0.00019153357706369058,
+      "loss": 1.1158084869384766,
+      "step": 226
+    },
+    {
+      "epoch": 0.1309111880046136,
+      "grad_norm": 0.6181395649909973,
+      "learning_rate": 0.000191495093323071,
+      "loss": 1.1156044006347656,
+      "step": 227
+    },
+    {
+      "epoch": 0.1314878892733564,
+      "grad_norm": 0.7921776175498962,
+      "learning_rate": 0.00019145660958245142,
+      "loss": 1.001752257347107,
+      "step": 228
+    },
+    {
+      "epoch": 0.13206459054209918,
+      "grad_norm": 0.5998401045799255,
+      "learning_rate": 0.00019141812584183183,
+      "loss": 0.7688826322555542,
+      "step": 229
+    },
+    {
+      "epoch": 0.13264129181084197,
+      "grad_norm": 0.7660285234451294,
+      "learning_rate": 0.00019137964210121225,
+      "loss": 1.2462745904922485,
+      "step": 230
+    },
+    {
+      "epoch": 0.13321799307958476,
+      "grad_norm": 0.7925796508789062,
+      "learning_rate": 0.00019134115836059266,
+      "loss": 1.1053651571273804,
+      "step": 231
+    },
+    {
+      "epoch": 0.13379469434832755,
+      "grad_norm": 0.6407649517059326,
+      "learning_rate": 0.00019130267461997308,
+      "loss": 0.8710946440696716,
+      "step": 232
+    },
+    {
+      "epoch": 0.13437139561707034,
+      "grad_norm": 0.7516645789146423,
+      "learning_rate": 0.00019126419087935346,
+      "loss": 1.009436011314392,
+      "step": 233
+    },
+    {
+      "epoch": 0.13494809688581316,
+      "grad_norm": 0.5998948216438293,
+      "learning_rate": 0.00019122570713873388,
+      "loss": 1.0309457778930664,
+      "step": 234
+    },
+    {
+      "epoch": 0.13552479815455595,
+      "grad_norm": 1.1897567510604858,
+      "learning_rate": 0.0001911872233981143,
+      "loss": 0.9930981397628784,
+      "step": 235
+    },
+    {
+      "epoch": 0.13610149942329874,
+      "grad_norm": 0.7404462695121765,
+      "learning_rate": 0.0001911487396574947,
+      "loss": 1.1489670276641846,
+      "step": 236
+    },
+    {
+      "epoch": 0.13667820069204153,
+      "grad_norm": 0.7168471813201904,
+      "learning_rate": 0.00019111025591687513,
+      "loss": 1.202157735824585,
+      "step": 237
+    },
+    {
+      "epoch": 0.13725490196078433,
+      "grad_norm": 0.7502639293670654,
+      "learning_rate": 0.00019107177217625554,
+      "loss": 1.022951364517212,
+      "step": 238
+    },
+    {
+      "epoch": 0.13783160322952712,
+      "grad_norm": 0.6795151233673096,
+      "learning_rate": 0.00019103328843563596,
+      "loss": 1.1194236278533936,
+      "step": 239
+    },
+    {
+      "epoch": 0.1384083044982699,
+      "grad_norm": 0.7620200514793396,
+      "learning_rate": 0.00019099480469501637,
+      "loss": 0.8411365747451782,
+      "step": 240
+    },
+    {
+      "epoch": 0.1389850057670127,
+      "grad_norm": 0.6618032455444336,
+      "learning_rate": 0.00019095632095439676,
+      "loss": 0.7801553606987,
+      "step": 241
+    },
+    {
+      "epoch": 0.1395617070357555,
+      "grad_norm": 0.9366044402122498,
+      "learning_rate": 0.00019091783721377718,
+      "loss": 1.0621672868728638,
+      "step": 242
+    },
+    {
+      "epoch": 0.14013840830449828,
+      "grad_norm": 1.0874788761138916,
+      "learning_rate": 0.0001908793534731576,
+      "loss": 1.6787068843841553,
+      "step": 243
+    },
+    {
+      "epoch": 0.14071510957324107,
+      "grad_norm": 0.8962084054946899,
+      "learning_rate": 0.000190840869732538,
+      "loss": 1.1922732591629028,
+      "step": 244
+    },
+    {
+      "epoch": 0.14129181084198386,
+      "grad_norm": 0.7039315700531006,
+      "learning_rate": 0.00019080238599191842,
+      "loss": 1.177897334098816,
+      "step": 245
+    },
+    {
+      "epoch": 0.14186851211072665,
+      "grad_norm": 0.9172819256782532,
+      "learning_rate": 0.00019076390225129884,
+      "loss": 1.3276829719543457,
+      "step": 246
+    },
+    {
+      "epoch": 0.14244521337946944,
+      "grad_norm": 1.002533197402954,
+      "learning_rate": 0.00019072541851067925,
+      "loss": 1.11848783493042,
+      "step": 247
+    },
+    {
+      "epoch": 0.14302191464821223,
+      "grad_norm": 0.9164738059043884,
+      "learning_rate": 0.00019068693477005967,
+      "loss": 0.7153259515762329,
+      "step": 248
+    },
+    {
+      "epoch": 0.14359861591695502,
+      "grad_norm": 0.7163867354393005,
+      "learning_rate": 0.00019064845102944006,
+      "loss": 1.206921100616455,
+      "step": 249
+    },
+    {
+      "epoch": 0.14417531718569782,
+      "grad_norm": 0.8200199604034424,
+      "learning_rate": 0.00019060996728882047,
+      "loss": 0.9798004031181335,
+      "step": 250
+    },
+    {
+      "epoch": 0.1447520184544406,
+      "grad_norm": 0.9806034564971924,
+      "learning_rate": 0.00019057148354820089,
+      "loss": 1.0969898700714111,
+      "step": 251
+    },
+    {
+      "epoch": 0.1453287197231834,
+      "grad_norm": 1.0849624872207642,
+      "learning_rate": 0.0001905329998075813,
+      "loss": 1.2618253231048584,
+      "step": 252
+    },
+    {
+      "epoch": 0.1459054209919262,
+      "grad_norm": 0.8736698031425476,
+      "learning_rate": 0.00019049451606696172,
+      "loss": 1.1534979343414307,
+      "step": 253
+    },
+    {
+      "epoch": 0.14648212226066898,
+      "grad_norm": 0.6748337745666504,
+      "learning_rate": 0.00019045603232634213,
+      "loss": 0.9178370237350464,
+      "step": 254
+    },
+    {
+      "epoch": 0.14705882352941177,
+      "grad_norm": 0.8655548691749573,
+      "learning_rate": 0.00019041754858572255,
+      "loss": 1.157179355621338,
+      "step": 255
+    },
+    {
+      "epoch": 0.14763552479815456,
+      "grad_norm": 0.7558174133300781,
+      "learning_rate": 0.00019037906484510296,
+      "loss": 0.7844438552856445,
+      "step": 256
+    },
+    {
+      "epoch": 0.14821222606689735,
+      "grad_norm": 0.8278117179870605,
+      "learning_rate": 0.00019034058110448335,
+      "loss": 1.4085724353790283,
+      "step": 257
+    },
+    {
+      "epoch": 0.14878892733564014,
+      "grad_norm": 0.9563509225845337,
+      "learning_rate": 0.00019030209736386377,
+      "loss": 1.244802713394165,
+      "step": 258
+    },
+    {
+      "epoch": 0.14936562860438293,
+      "grad_norm": 0.8018333315849304,
+      "learning_rate": 0.00019026361362324418,
+      "loss": 0.801522970199585,
+      "step": 259
+    },
+    {
+      "epoch": 0.14994232987312572,
+      "grad_norm": 0.555248498916626,
+      "learning_rate": 0.0001902251298826246,
+      "loss": 0.8989696502685547,
+      "step": 260
+    },
+    {
+      "epoch": 0.15051903114186851,
+      "grad_norm": 0.5092940926551819,
+      "learning_rate": 0.000190186646142005,
+      "loss": 0.8229849338531494,
+      "step": 261
+    },
+    {
+      "epoch": 0.1510957324106113,
+      "grad_norm": 0.614162266254425,
+      "learning_rate": 0.00019014816240138543,
+      "loss": 1.14143705368042,
+      "step": 262
+    },
+    {
+      "epoch": 0.1516724336793541,
+      "grad_norm": 0.7050411701202393,
+      "learning_rate": 0.00019010967866076584,
+      "loss": 1.2602849006652832,
+      "step": 263
+    },
+    {
+      "epoch": 0.1522491349480969,
+      "grad_norm": 0.8917875289916992,
+      "learning_rate": 0.00019007119492014626,
+      "loss": 1.2684617042541504,
+      "step": 264
+    },
+    {
+      "epoch": 0.15282583621683968,
+      "grad_norm": 0.7177139520645142,
+      "learning_rate": 0.00019003271117952665,
+      "loss": 0.664681077003479,
+      "step": 265
+    },
+    {
+      "epoch": 0.15340253748558247,
+      "grad_norm": 0.7513463497161865,
+      "learning_rate": 0.00018999422743890706,
+      "loss": 0.9689874649047852,
+      "step": 266
+    },
+    {
+      "epoch": 0.15397923875432526,
+      "grad_norm": 0.8350100517272949,
+      "learning_rate": 0.00018995574369828748,
+      "loss": 1.222740888595581,
+      "step": 267
+    },
+    {
+      "epoch": 0.15455594002306805,
+      "grad_norm": 1.152787685394287,
+      "learning_rate": 0.0001899172599576679,
+      "loss": 1.0707926750183105,
+      "step": 268
+    },
+    {
+      "epoch": 0.15513264129181084,
+      "grad_norm": 0.7810789346694946,
+      "learning_rate": 0.0001898787762170483,
+      "loss": 1.1552890539169312,
+      "step": 269
+    },
+    {
+      "epoch": 0.15570934256055363,
+      "grad_norm": 0.864863395690918,
+      "learning_rate": 0.00018984029247642872,
+      "loss": 1.2455859184265137,
+      "step": 270
+    },
+    {
+      "epoch": 0.15628604382929642,
+      "grad_norm": 0.578794002532959,
+      "learning_rate": 0.00018980180873580914,
+      "loss": 0.9284070730209351,
+      "step": 271
+    },
+    {
+      "epoch": 0.1568627450980392,
+      "grad_norm": 0.9245108962059021,
+      "learning_rate": 0.00018976332499518955,
+      "loss": 0.8936307430267334,
+      "step": 272
+    },
+    {
+      "epoch": 0.157439446366782,
+      "grad_norm": 1.022964358329773,
+      "learning_rate": 0.00018972484125456994,
+      "loss": 1.2052812576293945,
+      "step": 273
+    },
+    {
+      "epoch": 0.1580161476355248,
+      "grad_norm": 0.6136555075645447,
+      "learning_rate": 0.00018968635751395036,
+      "loss": 0.9395220875740051,
+      "step": 274
+    },
+    {
+      "epoch": 0.15859284890426759,
+      "grad_norm": 0.49354949593544006,
+      "learning_rate": 0.00018964787377333077,
+      "loss": 0.7979940176010132,
+      "step": 275
+    },
+    {
+      "epoch": 0.15916955017301038,
+      "grad_norm": 0.8118260502815247,
+      "learning_rate": 0.00018960939003271119,
+      "loss": 1.3310189247131348,
+      "step": 276
+    },
+    {
+      "epoch": 0.15974625144175317,
+      "grad_norm": 0.7864040732383728,
+      "learning_rate": 0.0001895709062920916,
+      "loss": 0.995107889175415,
+      "step": 277
+    },
+    {
+      "epoch": 0.16032295271049596,
+      "grad_norm": 0.7795019149780273,
+      "learning_rate": 0.00018953242255147202,
+      "loss": 1.031097412109375,
+      "step": 278
+    },
+    {
+      "epoch": 0.16089965397923875,
+      "grad_norm": 0.7358199954032898,
+      "learning_rate": 0.00018949393881085243,
+      "loss": 1.2151832580566406,
+      "step": 279
+    },
+    {
+      "epoch": 0.16147635524798154,
+      "grad_norm": 0.592187225818634,
+      "learning_rate": 0.00018945545507023285,
+      "loss": 1.18082857131958,
+      "step": 280
+    },
+    {
+      "epoch": 0.16205305651672433,
+      "grad_norm": 0.6349275708198547,
+      "learning_rate": 0.00018941697132961324,
+      "loss": 1.0011241436004639,
+      "step": 281
+    },
+    {
+      "epoch": 0.16262975778546712,
+      "grad_norm": 0.827673614025116,
+      "learning_rate": 0.00018937848758899365,
+      "loss": 1.1634137630462646,
+      "step": 282
+    },
+    {
+      "epoch": 0.1632064590542099,
+      "grad_norm": 0.7459465861320496,
+      "learning_rate": 0.00018934000384837407,
+      "loss": 1.2054771184921265,
+      "step": 283
+    },
+    {
+      "epoch": 0.1637831603229527,
+      "grad_norm": 0.8688679337501526,
+      "learning_rate": 0.00018930152010775448,
+      "loss": 1.5523681640625,
+      "step": 284
+    },
+    {
+      "epoch": 0.1643598615916955,
+      "grad_norm": 0.5501953959465027,
+      "learning_rate": 0.0001892630363671349,
+      "loss": 0.8807846903800964,
+      "step": 285
+    },
+    {
+      "epoch": 0.16493656286043828,
+      "grad_norm": 0.9370623230934143,
+      "learning_rate": 0.0001892245526265153,
+      "loss": 1.480832815170288,
+      "step": 286
+    },
+    {
+      "epoch": 0.16551326412918108,
+      "grad_norm": 0.824664831161499,
+      "learning_rate": 0.00018918606888589573,
+      "loss": 1.1490377187728882,
+      "step": 287
+    },
+    {
+      "epoch": 0.16608996539792387,
+      "grad_norm": 0.6960827708244324,
+      "learning_rate": 0.00018914758514527614,
+      "loss": 0.9883493185043335,
+      "step": 288
+    },
+    {
+      "epoch": 0.16666666666666666,
+      "grad_norm": 0.5384089946746826,
+      "learning_rate": 0.00018910910140465653,
+      "loss": 0.9772455096244812,
+      "step": 289
+    },
+    {
+      "epoch": 0.16724336793540945,
+      "grad_norm": 0.5826528072357178,
+      "learning_rate": 0.00018907061766403695,
+      "loss": 0.80659019947052,
+      "step": 290
+    },
+    {
+      "epoch": 0.16782006920415224,
+      "grad_norm": 0.8662609457969666,
+      "learning_rate": 0.00018903213392341736,
+      "loss": 1.438920497894287,
+      "step": 291
+    },
+    {
+      "epoch": 0.16839677047289503,
+      "grad_norm": 0.8694437742233276,
+      "learning_rate": 0.00018899365018279778,
+      "loss": 1.594082236289978,
+      "step": 292
+    },
+    {
+      "epoch": 0.16897347174163782,
+      "grad_norm": 0.9895355701446533,
+      "learning_rate": 0.0001889551664421782,
+      "loss": 1.1623947620391846,
+      "step": 293
+    },
+    {
+      "epoch": 0.1695501730103806,
+      "grad_norm": 0.7757118940353394,
+      "learning_rate": 0.0001889166827015586,
+      "loss": 1.2969348430633545,
+      "step": 294
+    },
+    {
+      "epoch": 0.1701268742791234,
+      "grad_norm": 1.1235777139663696,
+      "learning_rate": 0.00018887819896093902,
+      "loss": 1.5447598695755005,
+      "step": 295
+    },
+    {
+      "epoch": 0.1707035755478662,
+      "grad_norm": 0.5995392799377441,
+      "learning_rate": 0.00018883971522031944,
+      "loss": 1.1860620975494385,
+      "step": 296
+    },
+    {
+      "epoch": 0.17128027681660898,
+      "grad_norm": 0.7350177764892578,
+      "learning_rate": 0.00018880123147969983,
+      "loss": 1.1964070796966553,
+      "step": 297
+    },
+    {
+      "epoch": 0.17185697808535177,
+      "grad_norm": 0.7769676446914673,
+      "learning_rate": 0.00018876274773908024,
+      "loss": 0.9732775688171387,
+      "step": 298
+    },
+    {
+      "epoch": 0.17243367935409457,
+      "grad_norm": 1.0317054986953735,
+      "learning_rate": 0.00018872426399846066,
+      "loss": 1.1931625604629517,
+      "step": 299
+    },
+    {
+      "epoch": 0.17301038062283736,
+      "grad_norm": 0.855571985244751,
+      "learning_rate": 0.00018868578025784107,
+      "loss": 1.2726032733917236,
+      "step": 300
+    },
+    {
+      "epoch": 0.17358708189158017,
+      "grad_norm": 1.0038337707519531,
+      "learning_rate": 0.0001886472965172215,
+      "loss": 1.3021737337112427,
+      "step": 301
+    },
+    {
+      "epoch": 0.17416378316032297,
+      "grad_norm": 1.05097496509552,
+      "learning_rate": 0.0001886088127766019,
+      "loss": 1.6369917392730713,
+      "step": 302
+    },
+    {
+      "epoch": 0.17474048442906576,
+      "grad_norm": 0.6620575189590454,
+      "learning_rate": 0.00018857032903598232,
+      "loss": 1.0873693227767944,
+      "step": 303
+    },
+    {
+      "epoch": 0.17531718569780855,
+      "grad_norm": 0.8430469036102295,
+      "learning_rate": 0.0001885318452953627,
+      "loss": 1.1750123500823975,
+      "step": 304
+    },
+    {
+      "epoch": 0.17589388696655134,
+      "grad_norm": 0.8181238174438477,
+      "learning_rate": 0.00018849336155474312,
+      "loss": 1.3522461652755737,
+      "step": 305
+    },
+    {
+      "epoch": 0.17647058823529413,
+      "grad_norm": 0.6994307041168213,
+      "learning_rate": 0.00018845487781412354,
+      "loss": 1.327797293663025,
+      "step": 306
+    },
+    {
+      "epoch": 0.17704728950403692,
+      "grad_norm": 0.7090145349502563,
+      "learning_rate": 0.00018841639407350395,
+      "loss": 1.3075491189956665,
+      "step": 307
+    },
+    {
+      "epoch": 0.1776239907727797,
+      "grad_norm": 0.7612029314041138,
+      "learning_rate": 0.00018837791033288437,
+      "loss": 1.0585792064666748,
+      "step": 308
+    },
+    {
+      "epoch": 0.1782006920415225,
+      "grad_norm": 0.8592241406440735,
+      "learning_rate": 0.00018833942659226478,
+      "loss": 0.6441008448600769,
+      "step": 309
+    },
+    {
+      "epoch": 0.1787773933102653,
+      "grad_norm": 1.0303255319595337,
+      "learning_rate": 0.0001883009428516452,
+      "loss": 1.520599365234375,
+      "step": 310
+    },
+    {
+      "epoch": 0.17935409457900808,
+      "grad_norm": 0.80874103307724,
+      "learning_rate": 0.0001882624591110256,
+      "loss": 0.902335524559021,
+      "step": 311
+    },
+    {
+      "epoch": 0.17993079584775087,
+      "grad_norm": 0.7039778232574463,
+      "learning_rate": 0.000188223975370406,
+      "loss": 1.0226070880889893,
+      "step": 312
+    },
+    {
+      "epoch": 0.18050749711649366,
+      "grad_norm": 0.7102690935134888,
+      "learning_rate": 0.00018818549162978642,
+      "loss": 1.0590555667877197,
+      "step": 313
+    },
+    {
+      "epoch": 0.18108419838523646,
+      "grad_norm": 1.0405141115188599,
+      "learning_rate": 0.00018814700788916683,
+      "loss": 1.4237335920333862,
+      "step": 314
+    },
+    {
+      "epoch": 0.18166089965397925,
+      "grad_norm": 0.6633170247077942,
+      "learning_rate": 0.00018810852414854725,
+      "loss": 0.9277420043945312,
+      "step": 315
+    },
+    {
+      "epoch": 0.18223760092272204,
+      "grad_norm": 0.6740328073501587,
+      "learning_rate": 0.00018807004040792766,
+      "loss": 1.053580403327942,
+      "step": 316
+    },
+    {
+      "epoch": 0.18281430219146483,
+      "grad_norm": 0.6842854619026184,
+      "learning_rate": 0.00018803155666730808,
+      "loss": 1.0379540920257568,
+      "step": 317
+    },
+    {
+      "epoch": 0.18339100346020762,
+      "grad_norm": 0.6766674518585205,
+      "learning_rate": 0.0001879930729266885,
+      "loss": 0.9214432835578918,
+      "step": 318
+    },
+    {
+      "epoch": 0.1839677047289504,
+      "grad_norm": 0.8358355164527893,
+      "learning_rate": 0.0001879545891860689,
+      "loss": 1.069684624671936,
+      "step": 319
+    },
+    {
+      "epoch": 0.1845444059976932,
+      "grad_norm": 0.9044516086578369,
+      "learning_rate": 0.0001879161054454493,
+      "loss": 1.4757916927337646,
+      "step": 320
+    },
+    {
+      "epoch": 0.185121107266436,
+      "grad_norm": 0.9662521481513977,
+      "learning_rate": 0.0001878776217048297,
+      "loss": 1.3449480533599854,
+      "step": 321
+    },
+    {
+      "epoch": 0.18569780853517878,
+      "grad_norm": 0.8681714534759521,
+      "learning_rate": 0.00018783913796421013,
+      "loss": 1.2057011127471924,
+      "step": 322
+    },
+    {
+      "epoch": 0.18627450980392157,
+      "grad_norm": 0.7318335175514221,
+      "learning_rate": 0.00018780065422359054,
+      "loss": 1.276970386505127,
+      "step": 323
+    },
+    {
+      "epoch": 0.18685121107266436,
+      "grad_norm": 0.798865556716919,
+      "learning_rate": 0.00018776217048297096,
+      "loss": 1.1334099769592285,
+      "step": 324
+    },
+    {
+      "epoch": 0.18742791234140715,
+      "grad_norm": 0.6787270903587341,
+      "learning_rate": 0.00018772368674235137,
+      "loss": 1.0829839706420898,
+      "step": 325
+    },
+    {
+      "epoch": 0.18800461361014995,
+      "grad_norm": 0.705894947052002,
+      "learning_rate": 0.0001876852030017318,
+      "loss": 1.3146710395812988,
+      "step": 326
+    },
+    {
+      "epoch": 0.18858131487889274,
+      "grad_norm": 0.7403978705406189,
+      "learning_rate": 0.0001876467192611122,
+      "loss": 0.7811852693557739,
+      "step": 327
+    },
+    {
+      "epoch": 0.18915801614763553,
+      "grad_norm": 0.8138331770896912,
+      "learning_rate": 0.0001876082355204926,
+      "loss": 1.3800559043884277,
+      "step": 328
+    },
+    {
+      "epoch": 0.18973471741637832,
+      "grad_norm": 1.0053505897521973,
+      "learning_rate": 0.000187569751779873,
+      "loss": 1.502892017364502,
+      "step": 329
+    },
+    {
+      "epoch": 0.1903114186851211,
+      "grad_norm": 1.2905986309051514,
+      "learning_rate": 0.00018753126803925342,
+      "loss": 1.6044906377792358,
+      "step": 330
+    },
+    {
+      "epoch": 0.1908881199538639,
+      "grad_norm": 0.7266846299171448,
+      "learning_rate": 0.00018749278429863384,
+      "loss": 0.8269582390785217,
+      "step": 331
+    },
+    {
+      "epoch": 0.1914648212226067,
+      "grad_norm": 0.9892683029174805,
+      "learning_rate": 0.00018745430055801425,
+      "loss": 1.2374012470245361,
+      "step": 332
+    },
+    {
+      "epoch": 0.19204152249134948,
+      "grad_norm": 0.8026344180107117,
+      "learning_rate": 0.00018741581681739467,
+      "loss": 0.9166598916053772,
+      "step": 333
+    },
+    {
+      "epoch": 0.19261822376009227,
+      "grad_norm": 0.7790790796279907,
+      "learning_rate": 0.00018737733307677508,
+      "loss": 0.8837241530418396,
+      "step": 334
+    },
+    {
+      "epoch": 0.19319492502883506,
+      "grad_norm": 0.8625907897949219,
+      "learning_rate": 0.0001873388493361555,
+      "loss": 1.0963804721832275,
+      "step": 335
+    },
+    {
+      "epoch": 0.19377162629757785,
+      "grad_norm": 0.8408490419387817,
+      "learning_rate": 0.00018730036559553589,
+      "loss": 1.2887423038482666,
+      "step": 336
+    },
+    {
+      "epoch": 0.19434832756632064,
+      "grad_norm": 0.8141940236091614,
+      "learning_rate": 0.0001872618818549163,
+      "loss": 1.234419584274292,
+      "step": 337
+    },
+    {
+      "epoch": 0.19492502883506344,
+      "grad_norm": 0.7913158535957336,
+      "learning_rate": 0.00018722339811429672,
+      "loss": 0.8931217193603516,
+      "step": 338
+    },
+    {
+      "epoch": 0.19550173010380623,
+      "grad_norm": 0.9377291202545166,
+      "learning_rate": 0.00018718491437367713,
+      "loss": 1.1958264112472534,
+      "step": 339
+    },
+    {
+      "epoch": 0.19607843137254902,
+      "grad_norm": 1.1096664667129517,
+      "learning_rate": 0.00018714643063305755,
+      "loss": 0.871677041053772,
+      "step": 340
+    },
+    {
+      "epoch": 0.1966551326412918,
+      "grad_norm": 0.7379001379013062,
+      "learning_rate": 0.00018710794689243796,
+      "loss": 0.9309886693954468,
+      "step": 341
+    },
+    {
+      "epoch": 0.1972318339100346,
+      "grad_norm": 0.738572895526886,
+      "learning_rate": 0.00018706946315181838,
+      "loss": 1.065298080444336,
+      "step": 342
+    },
+    {
+      "epoch": 0.1978085351787774,
+      "grad_norm": 0.8279491066932678,
+      "learning_rate": 0.0001870309794111988,
+      "loss": 1.0682514905929565,
+      "step": 343
+    },
+    {
+      "epoch": 0.19838523644752018,
+      "grad_norm": 0.9108213782310486,
+      "learning_rate": 0.00018699249567057918,
+      "loss": 1.2043181657791138,
+      "step": 344
+    },
+    {
+      "epoch": 0.19896193771626297,
+      "grad_norm": 0.9347065687179565,
+      "learning_rate": 0.0001869540119299596,
+      "loss": 1.5744340419769287,
+      "step": 345
+    },
+    {
+      "epoch": 0.19953863898500576,
+      "grad_norm": 0.5783383250236511,
+      "learning_rate": 0.00018691552818934,
+      "loss": 0.7808327674865723,
+      "step": 346
+    },
+    {
+      "epoch": 0.20011534025374855,
+      "grad_norm": 0.661321759223938,
+      "learning_rate": 0.00018687704444872043,
+      "loss": 0.9458237290382385,
+      "step": 347
+    },
+    {
+      "epoch": 0.20069204152249134,
+      "grad_norm": 0.5592895746231079,
+      "learning_rate": 0.00018683856070810084,
+      "loss": 0.8761368989944458,
+      "step": 348
+    },
+    {
+      "epoch": 0.20126874279123413,
+      "grad_norm": 0.6626494526863098,
+      "learning_rate": 0.00018680007696748126,
+      "loss": 0.9110841751098633,
+      "step": 349
+    },
+    {
+      "epoch": 0.20184544405997693,
+      "grad_norm": 0.8392354249954224,
+      "learning_rate": 0.00018676159322686167,
+      "loss": 1.234721302986145,
+      "step": 350
+    },
+    {
+      "epoch": 0.20242214532871972,
+      "grad_norm": 0.5596436262130737,
+      "learning_rate": 0.0001867231094862421,
+      "loss": 0.837221622467041,
+      "step": 351
+    },
+    {
+      "epoch": 0.2029988465974625,
+      "grad_norm": 0.5023308992385864,
+      "learning_rate": 0.00018668462574562248,
+      "loss": 0.7079763412475586,
+      "step": 352
+    },
+    {
+      "epoch": 0.2035755478662053,
+      "grad_norm": 0.7946610450744629,
+      "learning_rate": 0.0001866461420050029,
+      "loss": 1.3043620586395264,
+      "step": 353
+    },
+    {
+      "epoch": 0.2041522491349481,
+      "grad_norm": 0.8124772310256958,
+      "learning_rate": 0.0001866076582643833,
+      "loss": 1.1276662349700928,
+      "step": 354
+    },
+    {
+      "epoch": 0.20472895040369088,
+      "grad_norm": 0.5195242166519165,
+      "learning_rate": 0.00018656917452376372,
+      "loss": 0.737315833568573,
+      "step": 355
+    },
+    {
+      "epoch": 0.20530565167243367,
+      "grad_norm": 0.7146646976470947,
+      "learning_rate": 0.00018653069078314414,
+      "loss": 1.0838680267333984,
+      "step": 356
+    },
+    {
+      "epoch": 0.20588235294117646,
+      "grad_norm": 0.7928506135940552,
+      "learning_rate": 0.00018649220704252455,
+      "loss": 1.2697861194610596,
+      "step": 357
+    },
+    {
+      "epoch": 0.20645905420991925,
+      "grad_norm": 0.6152468919754028,
+      "learning_rate": 0.00018645372330190497,
+      "loss": 0.9355758428573608,
+      "step": 358
+    },
+    {
+      "epoch": 0.20703575547866204,
+      "grad_norm": 1.0809266567230225,
+      "learning_rate": 0.00018641523956128538,
+      "loss": 1.9420266151428223,
+      "step": 359
+    },
+    {
+      "epoch": 0.20761245674740483,
+      "grad_norm": 0.59016352891922,
+      "learning_rate": 0.00018637675582066577,
+      "loss": 0.9944459199905396,
+      "step": 360
+    },
+    {
+      "epoch": 0.20818915801614762,
+      "grad_norm": 0.7870339751243591,
+      "learning_rate": 0.0001863382720800462,
+      "loss": 1.0614302158355713,
+      "step": 361
+    },
+    {
+      "epoch": 0.20876585928489041,
+      "grad_norm": 0.7203708291053772,
+      "learning_rate": 0.0001862997883394266,
+      "loss": 0.9602723717689514,
+      "step": 362
+    },
+    {
+      "epoch": 0.2093425605536332,
+      "grad_norm": 0.532341480255127,
+      "learning_rate": 0.00018626130459880702,
+      "loss": 0.8718068599700928,
+      "step": 363
+    },
+    {
+      "epoch": 0.209919261822376,
+      "grad_norm": 0.9565883278846741,
+      "learning_rate": 0.00018622282085818743,
+      "loss": 1.278198480606079,
+      "step": 364
+    },
+    {
+      "epoch": 0.2104959630911188,
+      "grad_norm": 0.7197461724281311,
+      "learning_rate": 0.00018618433711756785,
+      "loss": 1.3148860931396484,
+      "step": 365
+    },
+    {
+      "epoch": 0.21107266435986158,
+      "grad_norm": 0.6119058728218079,
+      "learning_rate": 0.00018614585337694826,
+      "loss": 0.9266935586929321,
+      "step": 366
+    },
+    {
+      "epoch": 0.2116493656286044,
+      "grad_norm": 0.9047015309333801,
+      "learning_rate": 0.00018610736963632868,
+      "loss": 1.1473264694213867,
+      "step": 367
+    },
+    {
+      "epoch": 0.2122260668973472,
+      "grad_norm": 0.6796925663948059,
+      "learning_rate": 0.00018606888589570907,
+      "loss": 1.0393201112747192,
+      "step": 368
+    },
+    {
+      "epoch": 0.21280276816608998,
+      "grad_norm": 0.6059300303459167,
+      "learning_rate": 0.00018603040215508948,
+      "loss": 1.001380443572998,
+      "step": 369
+    },
+    {
+      "epoch": 0.21337946943483277,
+      "grad_norm": 0.6669148206710815,
+      "learning_rate": 0.0001859919184144699,
+      "loss": 0.8133573532104492,
+      "step": 370
+    },
+    {
+      "epoch": 0.21395617070357556,
+      "grad_norm": 0.6025424003601074,
+      "learning_rate": 0.0001859534346738503,
+      "loss": 0.9277598261833191,
+      "step": 371
+    },
+    {
+      "epoch": 0.21453287197231835,
+      "grad_norm": 0.8728757500648499,
+      "learning_rate": 0.00018591495093323073,
+      "loss": 1.1496421098709106,
+      "step": 372
+    },
+    {
+      "epoch": 0.21510957324106114,
+      "grad_norm": 0.587089478969574,
+      "learning_rate": 0.00018587646719261114,
+      "loss": 0.8672431707382202,
+      "step": 373
+    },
+    {
+      "epoch": 0.21568627450980393,
+      "grad_norm": 0.7482187747955322,
+      "learning_rate": 0.00018583798345199156,
+      "loss": 1.0713750123977661,
+      "step": 374
+    },
+    {
+      "epoch": 0.21626297577854672,
+      "grad_norm": 0.8591217398643494,
+      "learning_rate": 0.00018579949971137197,
+      "loss": 1.4045636653900146,
+      "step": 375
+    },
+    {
+      "epoch": 0.21683967704728951,
+      "grad_norm": 0.7630711793899536,
+      "learning_rate": 0.00018576101597075236,
+      "loss": 0.9842856526374817,
+      "step": 376
+    },
+    {
+      "epoch": 0.2174163783160323,
+      "grad_norm": 1.2762526273727417,
+      "learning_rate": 0.00018572253223013278,
+      "loss": 1.5381450653076172,
+      "step": 377
+    },
+    {
+      "epoch": 0.2179930795847751,
+      "grad_norm": 0.7234092950820923,
+      "learning_rate": 0.0001856840484895132,
+      "loss": 1.0782972574234009,
+      "step": 378
+    },
+    {
+      "epoch": 0.2185697808535179,
+      "grad_norm": 0.8868815898895264,
+      "learning_rate": 0.0001856455647488936,
+      "loss": 0.9910011291503906,
+      "step": 379
+    },
+    {
+      "epoch": 0.21914648212226068,
+      "grad_norm": 0.5880477428436279,
+      "learning_rate": 0.00018560708100827402,
+      "loss": 0.9178383946418762,
+      "step": 380
+    },
+    {
+      "epoch": 0.21972318339100347,
+      "grad_norm": 0.7115210294723511,
+      "learning_rate": 0.00018556859726765444,
+      "loss": 1.3695993423461914,
+      "step": 381
+    },
+    {
+      "epoch": 0.22029988465974626,
+      "grad_norm": 0.9036445617675781,
+      "learning_rate": 0.00018553011352703485,
+      "loss": 1.049261212348938,
+      "step": 382
+    },
+    {
+      "epoch": 0.22087658592848905,
+      "grad_norm": 1.044411540031433,
+      "learning_rate": 0.00018549162978641527,
+      "loss": 1.272240400314331,
+      "step": 383
+    },
+    {
+      "epoch": 0.22145328719723184,
+      "grad_norm": 0.6363574862480164,
+      "learning_rate": 0.00018545314604579566,
+      "loss": 1.0237360000610352,
+      "step": 384
+    },
+    {
+      "epoch": 0.22202998846597463,
+      "grad_norm": 0.7671105861663818,
+      "learning_rate": 0.00018541466230517607,
+      "loss": 0.9970401525497437,
+      "step": 385
+    },
+    {
+      "epoch": 0.22260668973471742,
+      "grad_norm": 1.170229434967041,
+      "learning_rate": 0.0001853761785645565,
+      "loss": 1.5654575824737549,
+      "step": 386
+    },
+    {
+      "epoch": 0.2231833910034602,
+      "grad_norm": 0.9486715793609619,
+      "learning_rate": 0.0001853376948239369,
+      "loss": 1.8445625305175781,
+      "step": 387
+    },
+    {
+      "epoch": 0.223760092272203,
+      "grad_norm": 0.7049561142921448,
+      "learning_rate": 0.00018529921108331732,
+      "loss": 1.147915005683899,
+      "step": 388
+    },
+    {
+      "epoch": 0.2243367935409458,
+      "grad_norm": 0.7626886963844299,
+      "learning_rate": 0.00018526072734269773,
+      "loss": 0.9354770183563232,
+      "step": 389
+    },
+    {
+      "epoch": 0.22491349480968859,
+      "grad_norm": 0.8018368482589722,
+      "learning_rate": 0.00018522224360207815,
+      "loss": 1.0617220401763916,
+      "step": 390
+    },
+    {
+      "epoch": 0.22549019607843138,
+      "grad_norm": 0.7590807676315308,
+      "learning_rate": 0.00018518375986145854,
+      "loss": 0.9120303988456726,
+      "step": 391
+    },
+    {
+      "epoch": 0.22606689734717417,
+      "grad_norm": 0.6623148918151855,
+      "learning_rate": 0.00018514527612083895,
+      "loss": 0.7569756507873535,
+      "step": 392
+    },
+    {
+      "epoch": 0.22664359861591696,
+      "grad_norm": 0.5547282099723816,
+      "learning_rate": 0.00018510679238021937,
+      "loss": 0.7989190816879272,
+      "step": 393
+    },
+    {
+      "epoch": 0.22722029988465975,
+      "grad_norm": 0.5765286087989807,
+      "learning_rate": 0.00018506830863959978,
+      "loss": 0.6133571863174438,
+      "step": 394
+    },
+    {
+      "epoch": 0.22779700115340254,
+      "grad_norm": 0.8331816792488098,
+      "learning_rate": 0.0001850298248989802,
+      "loss": 1.1577847003936768,
+      "step": 395
+    },
+    {
+      "epoch": 0.22837370242214533,
+      "grad_norm": 0.7655069231987,
+      "learning_rate": 0.0001849913411583606,
+      "loss": 1.0809553861618042,
+      "step": 396
+    },
+    {
+      "epoch": 0.22895040369088812,
+      "grad_norm": 0.7397854924201965,
+      "learning_rate": 0.00018495285741774103,
+      "loss": 0.9830250144004822,
+      "step": 397
+    },
+    {
+      "epoch": 0.2295271049596309,
+      "grad_norm": 0.6970857381820679,
+      "learning_rate": 0.00018491437367712144,
+      "loss": 0.8101853132247925,
+      "step": 398
+    },
+    {
+      "epoch": 0.2301038062283737,
+      "grad_norm": 0.5724602937698364,
+      "learning_rate": 0.00018487588993650183,
+      "loss": 0.70196932554245,
+      "step": 399
+    },
+    {
+      "epoch": 0.2306805074971165,
+      "grad_norm": 0.9593637585639954,
+      "learning_rate": 0.00018483740619588225,
+      "loss": 0.9378552436828613,
+      "step": 400
+    },
+    {
+      "epoch": 0.23125720876585928,
+      "grad_norm": 0.7079650163650513,
+      "learning_rate": 0.00018479892245526266,
+      "loss": 0.8764985799789429,
+      "step": 401
+    },
+    {
+      "epoch": 0.23183391003460208,
+      "grad_norm": 0.7374391555786133,
+      "learning_rate": 0.00018476043871464308,
+      "loss": 0.8556146025657654,
+      "step": 402
+    },
+    {
+      "epoch": 0.23241061130334487,
+      "grad_norm": 0.6992713809013367,
+      "learning_rate": 0.0001847219549740235,
+      "loss": 0.9657334089279175,
+      "step": 403
+    },
+    {
+      "epoch": 0.23298731257208766,
+      "grad_norm": 0.8299751281738281,
+      "learning_rate": 0.0001846834712334039,
+      "loss": 1.2171483039855957,
+      "step": 404
+    },
+    {
+      "epoch": 0.23356401384083045,
+      "grad_norm": 0.5866743922233582,
+      "learning_rate": 0.00018464498749278432,
+      "loss": 0.9809523820877075,
+      "step": 405
+    },
+    {
+      "epoch": 0.23414071510957324,
+      "grad_norm": 0.8412980437278748,
+      "learning_rate": 0.00018460650375216474,
+      "loss": 1.1848514080047607,
+      "step": 406
+    },
+    {
+      "epoch": 0.23471741637831603,
+      "grad_norm": 0.7566470503807068,
+      "learning_rate": 0.00018456802001154513,
+      "loss": 1.0939483642578125,
+      "step": 407
+    },
+    {
+      "epoch": 0.23529411764705882,
+      "grad_norm": 0.787800669670105,
+      "learning_rate": 0.00018452953627092554,
+      "loss": 1.2347867488861084,
+      "step": 408
+    },
+    {
+      "epoch": 0.2358708189158016,
+      "grad_norm": 0.8511201739311218,
+      "learning_rate": 0.00018449105253030596,
+      "loss": 0.9385696053504944,
+      "step": 409
+    },
+    {
+      "epoch": 0.2364475201845444,
+      "grad_norm": 0.9360937476158142,
+      "learning_rate": 0.00018445256878968637,
+      "loss": 1.3519483804702759,
+      "step": 410
+    },
+    {
+      "epoch": 0.2370242214532872,
+      "grad_norm": 0.556093692779541,
+      "learning_rate": 0.0001844140850490668,
+      "loss": 0.8482391238212585,
+      "step": 411
+    },
+    {
+      "epoch": 0.23760092272202998,
+      "grad_norm": 0.6390929818153381,
+      "learning_rate": 0.0001843756013084472,
+      "loss": 1.0374037027359009,
+      "step": 412
+    },
+    {
+      "epoch": 0.23817762399077277,
+      "grad_norm": 0.5385326743125916,
+      "learning_rate": 0.00018433711756782762,
+      "loss": 0.8951395750045776,
+      "step": 413
+    },
+    {
+      "epoch": 0.23875432525951557,
+      "grad_norm": 0.7417898774147034,
+      "learning_rate": 0.00018429863382720803,
+      "loss": 1.1854356527328491,
+      "step": 414
+    },
+    {
+      "epoch": 0.23933102652825836,
+      "grad_norm": 0.7092972993850708,
+      "learning_rate": 0.00018426015008658842,
+      "loss": 1.2556312084197998,
+      "step": 415
+    },
+    {
+      "epoch": 0.23990772779700115,
+      "grad_norm": 0.6026037931442261,
+      "learning_rate": 0.00018422166634596884,
+      "loss": 0.8205006718635559,
+      "step": 416
+    },
+    {
+      "epoch": 0.24048442906574394,
+      "grad_norm": 0.7460249662399292,
+      "learning_rate": 0.00018418318260534925,
+      "loss": 0.9955434203147888,
+      "step": 417
+    },
+    {
+      "epoch": 0.24106113033448673,
+      "grad_norm": 0.6313579082489014,
+      "learning_rate": 0.00018414469886472967,
+      "loss": 1.15024995803833,
+      "step": 418
+    },
+    {
+      "epoch": 0.24163783160322952,
+      "grad_norm": 0.7596423029899597,
+      "learning_rate": 0.00018410621512411008,
+      "loss": 1.196816325187683,
+      "step": 419
+    },
+    {
+      "epoch": 0.2422145328719723,
+      "grad_norm": 0.7336683869361877,
+      "learning_rate": 0.0001840677313834905,
+      "loss": 1.0791605710983276,
+      "step": 420
+    },
+    {
+      "epoch": 0.2427912341407151,
+      "grad_norm": 0.6802041530609131,
+      "learning_rate": 0.00018402924764287091,
+      "loss": 0.8439788222312927,
+      "step": 421
+    },
+    {
+      "epoch": 0.2433679354094579,
+      "grad_norm": 0.9311268329620361,
+      "learning_rate": 0.00018399076390225133,
+      "loss": 1.4188232421875,
+      "step": 422
+    },
+    {
+      "epoch": 0.24394463667820068,
+      "grad_norm": 0.9715989232063293,
+      "learning_rate": 0.00018395228016163172,
+      "loss": 1.149898648262024,
+      "step": 423
+    },
+    {
+      "epoch": 0.24452133794694347,
+      "grad_norm": 0.6722977161407471,
+      "learning_rate": 0.00018391379642101213,
+      "loss": 1.0626373291015625,
+      "step": 424
+    },
+    {
+      "epoch": 0.24509803921568626,
+      "grad_norm": 0.9417729377746582,
+      "learning_rate": 0.00018387531268039255,
+      "loss": 1.277899980545044,
+      "step": 425
+    },
+    {
+      "epoch": 0.24567474048442905,
+      "grad_norm": 0.8700136542320251,
+      "learning_rate": 0.00018383682893977296,
+      "loss": 1.106884479522705,
+      "step": 426
+    },
+    {
+      "epoch": 0.24625144175317185,
+      "grad_norm": 0.71380615234375,
+      "learning_rate": 0.00018379834519915338,
+      "loss": 1.1928266286849976,
+      "step": 427
+    },
+    {
+      "epoch": 0.24682814302191464,
+      "grad_norm": 0.7276275157928467,
+      "learning_rate": 0.0001837598614585338,
+      "loss": 1.2448585033416748,
+      "step": 428
+    },
+    {
+      "epoch": 0.24740484429065743,
+      "grad_norm": 0.8795212507247925,
+      "learning_rate": 0.0001837213777179142,
+      "loss": 1.317166805267334,
+      "step": 429
+    },
+    {
+      "epoch": 0.24798154555940022,
+      "grad_norm": 0.9904524087905884,
+      "learning_rate": 0.00018368289397729462,
+      "loss": 1.166348934173584,
+      "step": 430
+    },
+    {
+      "epoch": 0.248558246828143,
+      "grad_norm": 0.7632173299789429,
+      "learning_rate": 0.000183644410236675,
+      "loss": 1.5664170980453491,
+      "step": 431
+    },
+    {
+      "epoch": 0.2491349480968858,
+      "grad_norm": 0.8291054964065552,
+      "learning_rate": 0.00018360592649605543,
+      "loss": 1.4953291416168213,
+      "step": 432
+    },
+    {
+      "epoch": 0.2497116493656286,
+      "grad_norm": 0.6445023417472839,
+      "learning_rate": 0.00018356744275543584,
+      "loss": 0.8673335313796997,
+      "step": 433
+    },
+    {
+      "epoch": 0.2502883506343714,
+      "grad_norm": 1.2072186470031738,
+      "learning_rate": 0.00018352895901481626,
+      "loss": 1.59421968460083,
+      "step": 434
+    },
+    {
+      "epoch": 0.2508650519031142,
+      "grad_norm": 0.7409680485725403,
+      "learning_rate": 0.00018349047527419667,
+      "loss": 1.0224432945251465,
+      "step": 435
+    },
+    {
+      "epoch": 0.25144175317185696,
+      "grad_norm": 0.8207524418830872,
+      "learning_rate": 0.0001834519915335771,
+      "loss": 1.276658058166504,
+      "step": 436
+    },
+    {
+      "epoch": 0.2520184544405998,
+      "grad_norm": 0.8591949343681335,
+      "learning_rate": 0.0001834135077929575,
+      "loss": 1.1319093704223633,
+      "step": 437
+    },
+    {
+      "epoch": 0.25259515570934254,
+      "grad_norm": 0.6689372658729553,
+      "learning_rate": 0.00018337502405233792,
+      "loss": 0.9691576361656189,
+      "step": 438
+    },
+    {
+      "epoch": 0.25317185697808536,
+      "grad_norm": 0.9033296704292297,
+      "learning_rate": 0.0001833365403117183,
+      "loss": 1.4272680282592773,
+      "step": 439
+    },
+    {
+      "epoch": 0.2537485582468281,
+      "grad_norm": 0.6959604620933533,
+      "learning_rate": 0.0001832980565710987,
+      "loss": 1.1449182033538818,
+      "step": 440
+    },
+    {
+      "epoch": 0.25432525951557095,
+      "grad_norm": 0.6695550680160522,
+      "learning_rate": 0.0001832595728304791,
+      "loss": 1.0492792129516602,
+      "step": 441
+    },
+    {
+      "epoch": 0.2549019607843137,
+      "grad_norm": 0.710794985294342,
+      "learning_rate": 0.00018322108908985953,
+      "loss": 0.9534090757369995,
+      "step": 442
+    },
+    {
+      "epoch": 0.2554786620530565,
+      "grad_norm": 0.6955594420433044,
+      "learning_rate": 0.00018318260534923994,
+      "loss": 0.8743690252304077,
+      "step": 443
+    },
+    {
+      "epoch": 0.2560553633217993,
+      "grad_norm": 0.6831961274147034,
+      "learning_rate": 0.00018314412160862036,
+      "loss": 1.3500818014144897,
+      "step": 444
+    },
+    {
+      "epoch": 0.2566320645905421,
+      "grad_norm": 0.7839577198028564,
+      "learning_rate": 0.00018310563786800077,
+      "loss": 1.0105950832366943,
+      "step": 445
+    },
+    {
+      "epoch": 0.25720876585928487,
+      "grad_norm": 0.8791704773902893,
+      "learning_rate": 0.0001830671541273812,
+      "loss": 1.2243623733520508,
+      "step": 446
+    },
+    {
+      "epoch": 0.2577854671280277,
+      "grad_norm": 0.7005860209465027,
+      "learning_rate": 0.0001830286703867616,
+      "loss": 1.077842354774475,
+      "step": 447
+    },
+    {
+      "epoch": 0.25836216839677045,
+      "grad_norm": 0.822964072227478,
+      "learning_rate": 0.000182990186646142,
+      "loss": 1.2265344858169556,
+      "step": 448
+    },
+    {
+      "epoch": 0.25893886966551327,
+      "grad_norm": 0.773158609867096,
+      "learning_rate": 0.0001829517029055224,
+      "loss": 0.8715431690216064,
+      "step": 449
+    },
+    {
+      "epoch": 0.25951557093425603,
+      "grad_norm": 0.8603456616401672,
+      "learning_rate": 0.00018291321916490282,
+      "loss": 0.9889146089553833,
+      "step": 450
+    },
+    {
+      "epoch": 0.26009227220299885,
+      "grad_norm": 0.8188443779945374,
+      "learning_rate": 0.00018287473542428324,
+      "loss": 0.8885264992713928,
+      "step": 451
+    },
+    {
+      "epoch": 0.2606689734717416,
+      "grad_norm": 1.0877407789230347,
+      "learning_rate": 0.00018283625168366365,
+      "loss": 1.0748121738433838,
+      "step": 452
+    },
+    {
+      "epoch": 0.26124567474048443,
+      "grad_norm": 0.5481402277946472,
+      "learning_rate": 0.00018279776794304407,
+      "loss": 0.807957649230957,
+      "step": 453
+    },
+    {
+      "epoch": 0.2618223760092272,
+      "grad_norm": 0.8591419458389282,
+      "learning_rate": 0.00018275928420242448,
+      "loss": 1.3057336807250977,
+      "step": 454
+    },
+    {
+      "epoch": 0.26239907727797,
+      "grad_norm": 0.7936019897460938,
+      "learning_rate": 0.0001827208004618049,
+      "loss": 1.185962200164795,
+      "step": 455
+    },
+    {
+      "epoch": 0.2629757785467128,
+      "grad_norm": 0.6581904888153076,
+      "learning_rate": 0.00018268231672118529,
+      "loss": 0.8275895118713379,
+      "step": 456
+    },
+    {
+      "epoch": 0.2635524798154556,
+      "grad_norm": 0.831302285194397,
+      "learning_rate": 0.0001826438329805657,
+      "loss": 1.299217939376831,
+      "step": 457
+    },
+    {
+      "epoch": 0.26412918108419836,
+      "grad_norm": 0.6771467924118042,
+      "learning_rate": 0.00018260534923994612,
+      "loss": 0.8427085876464844,
+      "step": 458
+    },
+    {
+      "epoch": 0.2647058823529412,
+      "grad_norm": 0.7914313077926636,
+      "learning_rate": 0.00018256686549932653,
+      "loss": 1.369484305381775,
+      "step": 459
+    },
+    {
+      "epoch": 0.26528258362168394,
+      "grad_norm": 0.5916578769683838,
+      "learning_rate": 0.00018252838175870695,
+      "loss": 0.6241229772567749,
+      "step": 460
+    },
+    {
+      "epoch": 0.26585928489042676,
+      "grad_norm": 0.6836418509483337,
+      "learning_rate": 0.00018248989801808736,
+      "loss": 0.8050651550292969,
+      "step": 461
+    },
+    {
+      "epoch": 0.2664359861591695,
+      "grad_norm": 0.7545502185821533,
+      "learning_rate": 0.00018245141427746778,
+      "loss": 0.7911585569381714,
+      "step": 462
+    },
+    {
+      "epoch": 0.26701268742791234,
+      "grad_norm": 0.6010773181915283,
+      "learning_rate": 0.0001824129305368482,
+      "loss": 1.1161192655563354,
+      "step": 463
+    },
+    {
+      "epoch": 0.2675893886966551,
+      "grad_norm": 0.813204824924469,
+      "learning_rate": 0.00018237444679622858,
+      "loss": 1.096695065498352,
+      "step": 464
+    },
+    {
+      "epoch": 0.2681660899653979,
+      "grad_norm": 0.91140216588974,
+      "learning_rate": 0.000182335963055609,
+      "loss": 1.4385195970535278,
+      "step": 465
+    },
+    {
+      "epoch": 0.2687427912341407,
+      "grad_norm": 0.9745720624923706,
+      "learning_rate": 0.0001822974793149894,
+      "loss": 1.3157883882522583,
+      "step": 466
+    },
+    {
+      "epoch": 0.2693194925028835,
+      "grad_norm": 0.4999851584434509,
+      "learning_rate": 0.00018225899557436983,
+      "loss": 0.6729867458343506,
+      "step": 467
+    },
+    {
+      "epoch": 0.2698961937716263,
+      "grad_norm": 0.9021291732788086,
+      "learning_rate": 0.00018222051183375024,
+      "loss": 1.0553233623504639,
+      "step": 468
+    },
+    {
+      "epoch": 0.2704728950403691,
+      "grad_norm": 0.8061716556549072,
+      "learning_rate": 0.00018218202809313066,
+      "loss": 1.3081198930740356,
+      "step": 469
+    },
+    {
+      "epoch": 0.2710495963091119,
+      "grad_norm": 0.6820981502532959,
+      "learning_rate": 0.00018214354435251107,
+      "loss": 0.9388906359672546,
+      "step": 470
+    },
+    {
+      "epoch": 0.27162629757785467,
+      "grad_norm": 1.0991320610046387,
+      "learning_rate": 0.0001821050606118915,
+      "loss": 1.528028964996338,
+      "step": 471
+    },
+    {
+      "epoch": 0.2722029988465975,
+      "grad_norm": 0.7934592962265015,
+      "learning_rate": 0.00018206657687127188,
+      "loss": 1.2054097652435303,
+      "step": 472
+    },
+    {
+      "epoch": 0.27277970011534025,
+      "grad_norm": 0.7113450765609741,
+      "learning_rate": 0.0001820280931306523,
+      "loss": 1.0254576206207275,
+      "step": 473
+    },
+    {
+      "epoch": 0.27335640138408307,
+      "grad_norm": 0.7593767046928406,
+      "learning_rate": 0.0001819896093900327,
+      "loss": 1.284333348274231,
+      "step": 474
+    },
+    {
+      "epoch": 0.27393310265282583,
+      "grad_norm": 1.006116509437561,
+      "learning_rate": 0.00018195112564941312,
+      "loss": 1.3650097846984863,
+      "step": 475
+    },
+    {
+      "epoch": 0.27450980392156865,
+      "grad_norm": 0.8706763982772827,
+      "learning_rate": 0.00018191264190879354,
+      "loss": 1.6067880392074585,
+      "step": 476
+    },
+    {
+      "epoch": 0.2750865051903114,
+      "grad_norm": 0.7428901195526123,
+      "learning_rate": 0.00018187415816817395,
+      "loss": 1.373342514038086,
+      "step": 477
+    },
+    {
+      "epoch": 0.27566320645905423,
+      "grad_norm": 0.8846433162689209,
+      "learning_rate": 0.00018183567442755437,
+      "loss": 1.5520777702331543,
+      "step": 478
+    },
+    {
+      "epoch": 0.276239907727797,
+      "grad_norm": 0.8808581829071045,
+      "learning_rate": 0.00018179719068693478,
+      "loss": 1.1342291831970215,
+      "step": 479
+    },
+    {
+      "epoch": 0.2768166089965398,
+      "grad_norm": 0.7310512065887451,
+      "learning_rate": 0.00018175870694631517,
+      "loss": 0.7762906551361084,
+      "step": 480
+    },
+    {
+      "epoch": 0.2773933102652826,
+      "grad_norm": 0.8467727303504944,
+      "learning_rate": 0.0001817202232056956,
+      "loss": 0.990180253982544,
+      "step": 481
+    },
+    {
+      "epoch": 0.2779700115340254,
+      "grad_norm": 0.642230212688446,
+      "learning_rate": 0.000181681739465076,
+      "loss": 0.845292329788208,
+      "step": 482
+    },
+    {
+      "epoch": 0.27854671280276816,
+      "grad_norm": 0.7775582075119019,
+      "learning_rate": 0.00018164325572445642,
+      "loss": 1.279380202293396,
+      "step": 483
+    },
+    {
+      "epoch": 0.279123414071511,
+      "grad_norm": 0.6477130651473999,
+      "learning_rate": 0.00018160477198383683,
+      "loss": 0.8197907209396362,
+      "step": 484
+    },
+    {
+      "epoch": 0.27970011534025374,
+      "grad_norm": 0.6508778929710388,
+      "learning_rate": 0.00018156628824321725,
+      "loss": 0.9538026452064514,
+      "step": 485
+    },
+    {
+      "epoch": 0.28027681660899656,
+      "grad_norm": 0.9379159212112427,
+      "learning_rate": 0.00018152780450259766,
+      "loss": 1.2874410152435303,
+      "step": 486
+    },
+    {
+      "epoch": 0.2808535178777393,
+      "grad_norm": 0.8014243245124817,
+      "learning_rate": 0.00018148932076197808,
+      "loss": 1.364856481552124,
+      "step": 487
+    },
+    {
+      "epoch": 0.28143021914648214,
+      "grad_norm": 1.0049822330474854,
+      "learning_rate": 0.00018145083702135847,
+      "loss": 1.3461369276046753,
+      "step": 488
+    },
+    {
+      "epoch": 0.2820069204152249,
+      "grad_norm": 0.8764071464538574,
+      "learning_rate": 0.00018141235328073888,
+      "loss": 1.549091100692749,
+      "step": 489
+    },
+    {
+      "epoch": 0.2825836216839677,
+      "grad_norm": 0.6743770241737366,
+      "learning_rate": 0.0001813738695401193,
+      "loss": 0.8718385696411133,
+      "step": 490
+    },
+    {
+      "epoch": 0.2831603229527105,
+      "grad_norm": 0.8501721024513245,
+      "learning_rate": 0.0001813353857994997,
+      "loss": 0.9592713117599487,
+      "step": 491
+    },
+    {
+      "epoch": 0.2837370242214533,
+      "grad_norm": 0.6727166771888733,
+      "learning_rate": 0.00018129690205888013,
+      "loss": 1.0024611949920654,
+      "step": 492
+    },
+    {
+      "epoch": 0.28431372549019607,
+      "grad_norm": 0.7949026226997375,
+      "learning_rate": 0.00018125841831826054,
+      "loss": 0.889624297618866,
+      "step": 493
+    },
+    {
+      "epoch": 0.2848904267589389,
+      "grad_norm": 0.8814200758934021,
+      "learning_rate": 0.00018121993457764096,
+      "loss": 1.7483818531036377,
+      "step": 494
+    },
+    {
+      "epoch": 0.28546712802768165,
+      "grad_norm": 0.6116936206817627,
+      "learning_rate": 0.00018118145083702137,
+      "loss": 1.097643256187439,
+      "step": 495
+    },
+    {
+      "epoch": 0.28604382929642447,
+      "grad_norm": 0.6951889395713806,
+      "learning_rate": 0.00018114296709640176,
+      "loss": 0.9292160272598267,
+      "step": 496
+    },
+    {
+      "epoch": 0.28662053056516723,
+      "grad_norm": 0.9138390421867371,
+      "learning_rate": 0.00018110448335578218,
+      "loss": 1.174808144569397,
+      "step": 497
+    },
+    {
+      "epoch": 0.28719723183391005,
+      "grad_norm": 0.6442549824714661,
+      "learning_rate": 0.0001810659996151626,
+      "loss": 0.9390018582344055,
+      "step": 498
+    },
+    {
+      "epoch": 0.2877739331026528,
+      "grad_norm": 0.9683842658996582,
+      "learning_rate": 0.000181027515874543,
+      "loss": 1.4045450687408447,
+      "step": 499
+    },
+    {
+      "epoch": 0.28835063437139563,
+      "grad_norm": 0.7444068193435669,
+      "learning_rate": 0.00018098903213392342,
+      "loss": 0.9792321920394897,
+      "step": 500
+    },
+    {
+      "epoch": 0.2889273356401384,
+      "grad_norm": 0.7402380108833313,
+      "learning_rate": 0.00018095054839330384,
+      "loss": 1.231440782546997,
+      "step": 501
+    },
+    {
+      "epoch": 0.2895040369088812,
+      "grad_norm": 0.7022894024848938,
+      "learning_rate": 0.00018091206465268425,
+      "loss": 0.856300950050354,
+      "step": 502
+    },
+    {
+      "epoch": 0.290080738177624,
+      "grad_norm": 0.7641032338142395,
+      "learning_rate": 0.00018087358091206467,
+      "loss": 0.9729149341583252,
+      "step": 503
+    },
+    {
+      "epoch": 0.2906574394463668,
+      "grad_norm": 0.9500510096549988,
+      "learning_rate": 0.00018083509717144506,
+      "loss": 1.2449204921722412,
+      "step": 504
+    },
+    {
+      "epoch": 0.29123414071510956,
+      "grad_norm": 0.6954758763313293,
+      "learning_rate": 0.00018079661343082547,
+      "loss": 0.8000816106796265,
+      "step": 505
+    },
+    {
+      "epoch": 0.2918108419838524,
+      "grad_norm": 0.7313628196716309,
+      "learning_rate": 0.0001807581296902059,
+      "loss": 1.233512282371521,
+      "step": 506
+    },
+    {
+      "epoch": 0.29238754325259514,
+      "grad_norm": 0.8792680501937866,
+      "learning_rate": 0.0001807196459495863,
+      "loss": 1.092308521270752,
+      "step": 507
+    },
+    {
+      "epoch": 0.29296424452133796,
+      "grad_norm": 0.6230028867721558,
+      "learning_rate": 0.00018068116220896672,
+      "loss": 0.7719423174858093,
+      "step": 508
+    },
+    {
+      "epoch": 0.2935409457900807,
+      "grad_norm": 0.8965409398078918,
+      "learning_rate": 0.00018064267846834713,
+      "loss": 1.576930284500122,
+      "step": 509
+    },
+    {
+      "epoch": 0.29411764705882354,
+      "grad_norm": 0.756908118724823,
+      "learning_rate": 0.00018060419472772755,
+      "loss": 0.9762069582939148,
+      "step": 510
+    },
+    {
+      "epoch": 0.2946943483275663,
+      "grad_norm": 0.7524373531341553,
+      "learning_rate": 0.00018056571098710794,
+      "loss": 0.9206646680831909,
+      "step": 511
+    },
+    {
+      "epoch": 0.2952710495963091,
+      "grad_norm": 0.9292136430740356,
+      "learning_rate": 0.00018052722724648835,
+      "loss": 1.534470558166504,
+      "step": 512
+    },
+    {
+      "epoch": 0.2958477508650519,
+      "grad_norm": 1.0442750453948975,
+      "learning_rate": 0.00018048874350586877,
+      "loss": 1.2520341873168945,
+      "step": 513
+    },
+    {
+      "epoch": 0.2964244521337947,
+      "grad_norm": 0.8131316900253296,
+      "learning_rate": 0.00018045025976524918,
+      "loss": 1.5056309700012207,
+      "step": 514
+    },
+    {
+      "epoch": 0.29700115340253747,
+      "grad_norm": 0.7711693048477173,
+      "learning_rate": 0.0001804117760246296,
+      "loss": 1.2189143896102905,
+      "step": 515
+    },
+    {
+      "epoch": 0.2975778546712803,
+      "grad_norm": 0.6610523462295532,
+      "learning_rate": 0.00018037329228401,
+      "loss": 1.1120340824127197,
+      "step": 516
+    },
+    {
+      "epoch": 0.29815455594002305,
+      "grad_norm": 0.7343090772628784,
+      "learning_rate": 0.00018033480854339043,
+      "loss": 1.0496878623962402,
+      "step": 517
+    },
+    {
+      "epoch": 0.29873125720876587,
+      "grad_norm": 0.6952423453330994,
+      "learning_rate": 0.00018029632480277084,
+      "loss": 1.0725046396255493,
+      "step": 518
+    },
+    {
+      "epoch": 0.29930795847750863,
+      "grad_norm": 1.0385462045669556,
+      "learning_rate": 0.00018025784106215123,
+      "loss": 1.3104898929595947,
+      "step": 519
+    },
+    {
+      "epoch": 0.29988465974625145,
+      "grad_norm": 0.6035030484199524,
+      "learning_rate": 0.00018021935732153165,
+      "loss": 0.7342404127120972,
+      "step": 520
+    },
+    {
+      "epoch": 0.3004613610149942,
+      "grad_norm": 0.5726889371871948,
+      "learning_rate": 0.00018018087358091206,
+      "loss": 0.9352455139160156,
+      "step": 521
+    },
+    {
+      "epoch": 0.30103806228373703,
+      "grad_norm": 0.5148364305496216,
+      "learning_rate": 0.00018014238984029248,
+      "loss": 0.8527913093566895,
+      "step": 522
+    },
+    {
+      "epoch": 0.3016147635524798,
+      "grad_norm": 0.8307221531867981,
+      "learning_rate": 0.0001801039060996729,
+      "loss": 1.180746078491211,
+      "step": 523
+    },
+    {
+      "epoch": 0.3021914648212226,
+      "grad_norm": 0.8560492396354675,
+      "learning_rate": 0.0001800654223590533,
+      "loss": 1.4329997301101685,
+      "step": 524
+    },
+    {
+      "epoch": 0.3027681660899654,
+      "grad_norm": 0.5972908139228821,
+      "learning_rate": 0.00018002693861843372,
+      "loss": 0.7385514974594116,
+      "step": 525
+    },
+    {
+      "epoch": 0.3033448673587082,
+      "grad_norm": 0.5159963965415955,
+      "learning_rate": 0.00017998845487781414,
+      "loss": 0.646453320980072,
+      "step": 526
+    },
+    {
+      "epoch": 0.30392156862745096,
+      "grad_norm": 0.9237578511238098,
+      "learning_rate": 0.00017994997113719453,
+      "loss": 1.442482590675354,
+      "step": 527
+    },
+    {
+      "epoch": 0.3044982698961938,
+      "grad_norm": 0.9341033697128296,
+      "learning_rate": 0.00017991148739657494,
+      "loss": 1.3850878477096558,
+      "step": 528
+    },
+    {
+      "epoch": 0.30507497116493654,
+      "grad_norm": 0.5422039031982422,
+      "learning_rate": 0.00017987300365595536,
+      "loss": 0.6736562252044678,
+      "step": 529
+    },
+    {
+      "epoch": 0.30565167243367936,
+      "grad_norm": 0.6220455765724182,
+      "learning_rate": 0.00017983451991533577,
+      "loss": 0.7528645992279053,
+      "step": 530
+    },
+    {
+      "epoch": 0.3062283737024221,
+      "grad_norm": 0.8073663115501404,
+      "learning_rate": 0.0001797960361747162,
+      "loss": 1.2123267650604248,
+      "step": 531
+    },
+    {
+      "epoch": 0.30680507497116494,
+      "grad_norm": 0.5491252541542053,
+      "learning_rate": 0.0001797575524340966,
+      "loss": 0.5903505086898804,
+      "step": 532
+    },
+    {
+      "epoch": 0.3073817762399077,
+      "grad_norm": 1.9019479751586914,
+      "learning_rate": 0.00017971906869347702,
+      "loss": 1.4316587448120117,
+      "step": 533
+    },
+    {
+      "epoch": 0.3079584775086505,
+      "grad_norm": 0.45649734139442444,
+      "learning_rate": 0.00017968058495285743,
+      "loss": 0.659195065498352,
+      "step": 534
+    },
+    {
+      "epoch": 0.30853517877739334,
+      "grad_norm": 0.7406135201454163,
+      "learning_rate": 0.00017964210121223782,
+      "loss": 1.0346477031707764,
+      "step": 535
+    },
+    {
+      "epoch": 0.3091118800461361,
+      "grad_norm": 0.9768670201301575,
+      "learning_rate": 0.00017960361747161824,
+      "loss": 1.584676742553711,
+      "step": 536
+    },
+    {
+      "epoch": 0.3096885813148789,
+      "grad_norm": 0.7869756817817688,
+      "learning_rate": 0.00017956513373099865,
+      "loss": 1.0404967069625854,
+      "step": 537
+    },
+    {
+      "epoch": 0.3102652825836217,
+      "grad_norm": 0.6868966221809387,
+      "learning_rate": 0.00017952664999037907,
+      "loss": 0.8878238201141357,
+      "step": 538
+    },
+    {
+      "epoch": 0.3108419838523645,
+      "grad_norm": 0.7594157457351685,
+      "learning_rate": 0.00017948816624975948,
+      "loss": 1.0191287994384766,
+      "step": 539
+    },
+    {
+      "epoch": 0.31141868512110726,
+      "grad_norm": 0.8346229195594788,
+      "learning_rate": 0.0001794496825091399,
+      "loss": 1.021256923675537,
+      "step": 540
+    },
+    {
+      "epoch": 0.3119953863898501,
+      "grad_norm": 1.0493948459625244,
+      "learning_rate": 0.00017941119876852031,
+      "loss": 1.0015616416931152,
+      "step": 541
+    },
+    {
+      "epoch": 0.31257208765859285,
+      "grad_norm": 0.62034010887146,
+      "learning_rate": 0.00017937271502790073,
+      "loss": 0.9237149357795715,
+      "step": 542
+    },
+    {
+      "epoch": 0.31314878892733566,
+      "grad_norm": 0.7169587016105652,
+      "learning_rate": 0.00017933423128728112,
+      "loss": 0.8658795356750488,
+      "step": 543
+    },
+    {
+      "epoch": 0.3137254901960784,
+      "grad_norm": 0.7205992341041565,
+      "learning_rate": 0.00017929574754666153,
+      "loss": 1.1227588653564453,
+      "step": 544
+    },
+    {
+      "epoch": 0.31430219146482125,
+      "grad_norm": 0.7573957443237305,
+      "learning_rate": 0.00017925726380604195,
+      "loss": 0.9638352394104004,
+      "step": 545
+    },
+    {
+      "epoch": 0.314878892733564,
+      "grad_norm": 0.981253981590271,
+      "learning_rate": 0.00017921878006542236,
+      "loss": 1.0400216579437256,
+      "step": 546
+    },
+    {
+      "epoch": 0.3154555940023068,
+      "grad_norm": 0.6763452291488647,
+      "learning_rate": 0.00017918029632480278,
+      "loss": 1.0069935321807861,
+      "step": 547
+    },
+    {
+      "epoch": 0.3160322952710496,
+      "grad_norm": 0.5641304850578308,
+      "learning_rate": 0.0001791418125841832,
+      "loss": 0.7099517583847046,
+      "step": 548
+    },
+    {
+      "epoch": 0.3166089965397924,
+      "grad_norm": 0.542838454246521,
+      "learning_rate": 0.0001791033288435636,
+      "loss": 0.7347281575202942,
+      "step": 549
+    },
+    {
+      "epoch": 0.31718569780853517,
+      "grad_norm": 0.6865650415420532,
+      "learning_rate": 0.00017906484510294402,
+      "loss": 0.9269914031028748,
+      "step": 550
+    },
+    {
+      "epoch": 0.317762399077278,
+      "grad_norm": 0.6794233322143555,
+      "learning_rate": 0.0001790263613623244,
+      "loss": 0.8624827861785889,
+      "step": 551
+    },
+    {
+      "epoch": 0.31833910034602075,
+      "grad_norm": 0.9417468905448914,
+      "learning_rate": 0.00017898787762170483,
+      "loss": 1.2194072008132935,
+      "step": 552
+    },
+    {
+      "epoch": 0.31891580161476357,
+      "grad_norm": 0.8551915287971497,
+      "learning_rate": 0.00017894939388108524,
+      "loss": 1.1121107339859009,
+      "step": 553
+    },
+    {
+      "epoch": 0.31949250288350634,
+      "grad_norm": 1.0210304260253906,
+      "learning_rate": 0.00017891091014046566,
+      "loss": 1.3061752319335938,
+      "step": 554
+    },
+    {
+      "epoch": 0.32006920415224915,
+      "grad_norm": 0.9833082556724548,
+      "learning_rate": 0.00017887242639984607,
+      "loss": 1.3157097101211548,
+      "step": 555
+    },
+    {
+      "epoch": 0.3206459054209919,
+      "grad_norm": 0.8534771203994751,
+      "learning_rate": 0.0001788339426592265,
+      "loss": 1.1443736553192139,
+      "step": 556
+    },
+    {
+      "epoch": 0.32122260668973474,
+      "grad_norm": 0.5206373929977417,
+      "learning_rate": 0.0001787954589186069,
+      "loss": 0.9210702776908875,
+      "step": 557
+    },
+    {
+      "epoch": 0.3217993079584775,
+      "grad_norm": 0.9890329837799072,
+      "learning_rate": 0.00017875697517798732,
+      "loss": 1.1474642753601074,
+      "step": 558
+    },
+    {
+      "epoch": 0.3223760092272203,
+      "grad_norm": 1.033987045288086,
+      "learning_rate": 0.0001787184914373677,
+      "loss": 1.3469852209091187,
+      "step": 559
+    },
+    {
+      "epoch": 0.3229527104959631,
+      "grad_norm": 0.5397274494171143,
+      "learning_rate": 0.00017868000769674812,
+      "loss": 0.8606307506561279,
+      "step": 560
+    },
+    {
+      "epoch": 0.3235294117647059,
+      "grad_norm": 0.7607125639915466,
+      "learning_rate": 0.00017864152395612854,
+      "loss": 1.5313308238983154,
+      "step": 561
+    },
+    {
+      "epoch": 0.32410611303344866,
+      "grad_norm": 0.8187709450721741,
+      "learning_rate": 0.00017860304021550895,
+      "loss": 1.2671842575073242,
+      "step": 562
+    },
+    {
+      "epoch": 0.3246828143021915,
+      "grad_norm": 0.8652257919311523,
+      "learning_rate": 0.00017856455647488937,
+      "loss": 1.0011459589004517,
+      "step": 563
+    },
+    {
+      "epoch": 0.32525951557093424,
+      "grad_norm": 0.8205957412719727,
+      "learning_rate": 0.00017852607273426978,
+      "loss": 0.9995499849319458,
+      "step": 564
+    },
+    {
+      "epoch": 0.32583621683967706,
+      "grad_norm": 0.8630533814430237,
+      "learning_rate": 0.0001784875889936502,
+      "loss": 1.119580864906311,
+      "step": 565
+    },
+    {
+      "epoch": 0.3264129181084198,
+      "grad_norm": 0.6678904294967651,
+      "learning_rate": 0.00017844910525303061,
+      "loss": 0.9301247596740723,
+      "step": 566
+    },
+    {
+      "epoch": 0.32698961937716264,
+      "grad_norm": 0.7211806774139404,
+      "learning_rate": 0.000178410621512411,
+      "loss": 1.3346351385116577,
+      "step": 567
+    },
+    {
+      "epoch": 0.3275663206459054,
+      "grad_norm": 0.6392566561698914,
+      "learning_rate": 0.00017837213777179142,
+      "loss": 0.6997557878494263,
+      "step": 568
+    },
+    {
+      "epoch": 0.3281430219146482,
+      "grad_norm": 0.8357546329498291,
+      "learning_rate": 0.00017833365403117183,
+      "loss": 1.3044462203979492,
+      "step": 569
+    },
+    {
+      "epoch": 0.328719723183391,
+      "grad_norm": 0.7778827548027039,
+      "learning_rate": 0.00017829517029055225,
+      "loss": 0.9234685897827148,
+      "step": 570
+    },
+    {
+      "epoch": 0.3292964244521338,
+      "grad_norm": 0.7168182730674744,
+      "learning_rate": 0.00017825668654993266,
+      "loss": 1.532446265220642,
+      "step": 571
+    },
+    {
+      "epoch": 0.32987312572087657,
+      "grad_norm": 1.016398549079895,
+      "learning_rate": 0.00017821820280931308,
+      "loss": 1.4056748151779175,
+      "step": 572
+    },
+    {
+      "epoch": 0.3304498269896194,
+      "grad_norm": 0.8056113719940186,
+      "learning_rate": 0.0001781797190686935,
+      "loss": 1.0595710277557373,
+      "step": 573
+    },
+    {
+      "epoch": 0.33102652825836215,
+      "grad_norm": 0.6588327884674072,
+      "learning_rate": 0.0001781412353280739,
+      "loss": 0.849087655544281,
+      "step": 574
+    },
+    {
+      "epoch": 0.33160322952710497,
+      "grad_norm": 0.7659177184104919,
+      "learning_rate": 0.0001781027515874543,
+      "loss": 1.1442945003509521,
+      "step": 575
+    },
+    {
+      "epoch": 0.33217993079584773,
+      "grad_norm": 0.8960584402084351,
+      "learning_rate": 0.0001780642678468347,
+      "loss": 1.2777467966079712,
+      "step": 576
+    },
+    {
+      "epoch": 0.33275663206459055,
+      "grad_norm": 0.8990175724029541,
+      "learning_rate": 0.00017802578410621513,
+      "loss": 1.0199333429336548,
+      "step": 577
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 0.7010089159011841,
+      "learning_rate": 0.00017798730036559554,
+      "loss": 1.2177313566207886,
+      "step": 578
+    },
+    {
+      "epoch": 0.33391003460207613,
+      "grad_norm": 0.8779993057250977,
+      "learning_rate": 0.00017794881662497596,
+      "loss": 0.8511064648628235,
+      "step": 579
+    },
+    {
+      "epoch": 0.3344867358708189,
+      "grad_norm": 0.8380318880081177,
+      "learning_rate": 0.00017791033288435637,
+      "loss": 1.0792275667190552,
+      "step": 580
+    },
+    {
+      "epoch": 0.3350634371395617,
+      "grad_norm": 0.7335569858551025,
+      "learning_rate": 0.0001778718491437368,
+      "loss": 1.0502758026123047,
+      "step": 581
+    },
+    {
+      "epoch": 0.3356401384083045,
+      "grad_norm": 0.7759366631507874,
+      "learning_rate": 0.0001778333654031172,
+      "loss": 1.000847578048706,
+      "step": 582
+    },
+    {
+      "epoch": 0.3362168396770473,
+      "grad_norm": 0.565648078918457,
+      "learning_rate": 0.0001777948816624976,
+      "loss": 0.7337483167648315,
+      "step": 583
+    },
+    {
+      "epoch": 0.33679354094579006,
+      "grad_norm": 0.8646697998046875,
+      "learning_rate": 0.000177756397921878,
+      "loss": 1.2806568145751953,
+      "step": 584
+    },
+    {
+      "epoch": 0.3373702422145329,
+      "grad_norm": 0.9556112289428711,
+      "learning_rate": 0.00017771791418125842,
+      "loss": 1.1648443937301636,
+      "step": 585
+    },
+    {
+      "epoch": 0.33794694348327564,
+      "grad_norm": 0.6629974842071533,
+      "learning_rate": 0.00017767943044063884,
+      "loss": 1.0415198802947998,
+      "step": 586
+    },
+    {
+      "epoch": 0.33852364475201846,
+      "grad_norm": 0.5972018837928772,
+      "learning_rate": 0.00017764094670001925,
+      "loss": 0.6916914582252502,
+      "step": 587
+    },
+    {
+      "epoch": 0.3391003460207612,
+      "grad_norm": 0.7391757965087891,
+      "learning_rate": 0.00017760246295939967,
+      "loss": 1.194846510887146,
+      "step": 588
+    },
+    {
+      "epoch": 0.33967704728950404,
+      "grad_norm": 0.7234671711921692,
+      "learning_rate": 0.00017756397921878008,
+      "loss": 0.9572672247886658,
+      "step": 589
+    },
+    {
+      "epoch": 0.3402537485582468,
+      "grad_norm": 0.6949688792228699,
+      "learning_rate": 0.0001775254954781605,
+      "loss": 0.9968490600585938,
+      "step": 590
+    },
+    {
+      "epoch": 0.3408304498269896,
+      "grad_norm": 0.9384737610816956,
+      "learning_rate": 0.0001774870117375409,
+      "loss": 1.106278896331787,
+      "step": 591
+    },
+    {
+      "epoch": 0.3414071510957324,
+      "grad_norm": 0.8691385388374329,
+      "learning_rate": 0.0001774485279969213,
+      "loss": 0.8517290353775024,
+      "step": 592
+    },
+    {
+      "epoch": 0.3419838523644752,
+      "grad_norm": 0.6864728331565857,
+      "learning_rate": 0.00017741004425630172,
+      "loss": 0.9280612468719482,
+      "step": 593
+    },
+    {
+      "epoch": 0.34256055363321797,
+      "grad_norm": 0.7656051516532898,
+      "learning_rate": 0.00017737156051568213,
+      "loss": 1.0975104570388794,
+      "step": 594
+    },
+    {
+      "epoch": 0.3431372549019608,
+      "grad_norm": 0.6587508916854858,
+      "learning_rate": 0.00017733307677506255,
+      "loss": 0.9575508236885071,
+      "step": 595
+    },
+    {
+      "epoch": 0.34371395617070355,
+      "grad_norm": 0.8466372489929199,
+      "learning_rate": 0.00017729459303444296,
+      "loss": 1.2343617677688599,
+      "step": 596
+    },
+    {
+      "epoch": 0.34429065743944637,
+      "grad_norm": 1.0839906930923462,
+      "learning_rate": 0.00017725610929382338,
+      "loss": 1.3552396297454834,
+      "step": 597
+    },
+    {
+      "epoch": 0.34486735870818913,
+      "grad_norm": 0.7300306558609009,
+      "learning_rate": 0.00017721762555320377,
+      "loss": 1.0701713562011719,
+      "step": 598
+    },
+    {
+      "epoch": 0.34544405997693195,
+      "grad_norm": 0.737766683101654,
+      "learning_rate": 0.00017717914181258418,
+      "loss": 1.0968977212905884,
+      "step": 599
+    },
+    {
+      "epoch": 0.3460207612456747,
+      "grad_norm": 0.749933660030365,
+      "learning_rate": 0.0001771406580719646,
+      "loss": 1.3320926427841187,
+      "step": 600
+    },
+    {
+      "epoch": 0.34659746251441753,
+      "grad_norm": 1.0226854085922241,
+      "learning_rate": 0.000177102174331345,
+      "loss": 1.5281516313552856,
+      "step": 601
+    },
+    {
+      "epoch": 0.34717416378316035,
+      "grad_norm": 0.5458315014839172,
+      "learning_rate": 0.00017706369059072543,
+      "loss": 0.6243756413459778,
+      "step": 602
+    },
+    {
+      "epoch": 0.3477508650519031,
+      "grad_norm": 0.6592231392860413,
+      "learning_rate": 0.00017702520685010584,
+      "loss": 1.007111668586731,
+      "step": 603
+    },
+    {
+      "epoch": 0.34832756632064593,
+      "grad_norm": 0.7599675059318542,
+      "learning_rate": 0.00017698672310948626,
+      "loss": 1.059772253036499,
+      "step": 604
+    },
+    {
+      "epoch": 0.3489042675893887,
+      "grad_norm": 0.7249642610549927,
+      "learning_rate": 0.00017694823936886667,
+      "loss": 1.0405762195587158,
+      "step": 605
+    },
+    {
+      "epoch": 0.3494809688581315,
+      "grad_norm": 0.6669758558273315,
+      "learning_rate": 0.00017690975562824706,
+      "loss": 0.8157357573509216,
+      "step": 606
+    },
+    {
+      "epoch": 0.3500576701268743,
+      "grad_norm": 1.0521658658981323,
+      "learning_rate": 0.00017687127188762748,
+      "loss": 1.3226133584976196,
+      "step": 607
+    },
+    {
+      "epoch": 0.3506343713956171,
+      "grad_norm": 1.190586805343628,
+      "learning_rate": 0.0001768327881470079,
+      "loss": 0.9668002724647522,
+      "step": 608
+    },
+    {
+      "epoch": 0.35121107266435986,
+      "grad_norm": 0.7342950105667114,
+      "learning_rate": 0.0001767943044063883,
+      "loss": 1.0137907266616821,
+      "step": 609
+    },
+    {
+      "epoch": 0.3517877739331027,
+      "grad_norm": 0.8390425443649292,
+      "learning_rate": 0.00017675582066576872,
+      "loss": 1.2452900409698486,
+      "step": 610
+    },
+    {
+      "epoch": 0.35236447520184544,
+      "grad_norm": 0.7040269374847412,
+      "learning_rate": 0.00017671733692514914,
+      "loss": 1.1274709701538086,
+      "step": 611
+    },
+    {
+      "epoch": 0.35294117647058826,
+      "grad_norm": 0.6184991002082825,
+      "learning_rate": 0.00017667885318452955,
+      "loss": 0.8320228457450867,
+      "step": 612
+    },
+    {
+      "epoch": 0.353517877739331,
+      "grad_norm": 0.9174041748046875,
+      "learning_rate": 0.00017664036944390997,
+      "loss": 1.0515730381011963,
+      "step": 613
+    },
+    {
+      "epoch": 0.35409457900807384,
+      "grad_norm": 0.8032795786857605,
+      "learning_rate": 0.00017660188570329036,
+      "loss": 0.9692851901054382,
+      "step": 614
+    },
+    {
+      "epoch": 0.3546712802768166,
+      "grad_norm": 0.854794979095459,
+      "learning_rate": 0.00017656340196267077,
+      "loss": 0.9672110676765442,
+      "step": 615
+    },
+    {
+      "epoch": 0.3552479815455594,
+      "grad_norm": 0.8945924043655396,
+      "learning_rate": 0.0001765249182220512,
+      "loss": 1.1629329919815063,
+      "step": 616
+    },
+    {
+      "epoch": 0.3558246828143022,
+      "grad_norm": 0.8737151622772217,
+      "learning_rate": 0.0001764864344814316,
+      "loss": 1.022585153579712,
+      "step": 617
+    },
+    {
+      "epoch": 0.356401384083045,
+      "grad_norm": 0.7043283581733704,
+      "learning_rate": 0.00017644795074081202,
+      "loss": 1.825275182723999,
+      "step": 618
+    },
+    {
+      "epoch": 0.35697808535178777,
+      "grad_norm": 0.81025230884552,
+      "learning_rate": 0.00017640946700019243,
+      "loss": 1.1937224864959717,
+      "step": 619
+    },
+    {
+      "epoch": 0.3575547866205306,
+      "grad_norm": 0.6064541339874268,
+      "learning_rate": 0.00017637098325957285,
+      "loss": 1.144992709159851,
+      "step": 620
+    },
+    {
+      "epoch": 0.35813148788927335,
+      "grad_norm": 0.7281432747840881,
+      "learning_rate": 0.00017633249951895326,
+      "loss": 0.8976823091506958,
+      "step": 621
+    },
+    {
+      "epoch": 0.35870818915801617,
+      "grad_norm": 0.7124044895172119,
+      "learning_rate": 0.00017629401577833365,
+      "loss": 0.9814664721488953,
+      "step": 622
+    },
+    {
+      "epoch": 0.35928489042675893,
+      "grad_norm": 0.7080062031745911,
+      "learning_rate": 0.00017625553203771407,
+      "loss": 0.8040327429771423,
+      "step": 623
+    },
+    {
+      "epoch": 0.35986159169550175,
+      "grad_norm": 0.9307262897491455,
+      "learning_rate": 0.00017621704829709448,
+      "loss": 1.1769636869430542,
+      "step": 624
+    },
+    {
+      "epoch": 0.3604382929642445,
+      "grad_norm": 0.6040496230125427,
+      "learning_rate": 0.0001761785645564749,
+      "loss": 0.8058497905731201,
+      "step": 625
+    },
+    {
+      "epoch": 0.36101499423298733,
+      "grad_norm": 0.6352747678756714,
+      "learning_rate": 0.00017614008081585531,
+      "loss": 1.0901957750320435,
+      "step": 626
+    },
+    {
+      "epoch": 0.3615916955017301,
+      "grad_norm": 1.0686722993850708,
+      "learning_rate": 0.00017610159707523573,
+      "loss": 1.0280206203460693,
+      "step": 627
+    },
+    {
+      "epoch": 0.3621683967704729,
+      "grad_norm": 0.823551595211029,
+      "learning_rate": 0.00017606311333461614,
+      "loss": 1.1255362033843994,
+      "step": 628
+    },
+    {
+      "epoch": 0.3627450980392157,
+      "grad_norm": 0.8719285726547241,
+      "learning_rate": 0.00017602462959399656,
+      "loss": 1.1470766067504883,
+      "step": 629
+    },
+    {
+      "epoch": 0.3633217993079585,
+      "grad_norm": 0.8169400691986084,
+      "learning_rate": 0.00017598614585337695,
+      "loss": 1.0567045211791992,
+      "step": 630
+    },
+    {
+      "epoch": 0.36389850057670126,
+      "grad_norm": 1.0707166194915771,
+      "learning_rate": 0.00017594766211275736,
+      "loss": 1.3314507007598877,
+      "step": 631
+    },
+    {
+      "epoch": 0.3644752018454441,
+      "grad_norm": 0.6268380284309387,
+      "learning_rate": 0.00017590917837213778,
+      "loss": 1.100555419921875,
+      "step": 632
+    },
+    {
+      "epoch": 0.36505190311418684,
+      "grad_norm": 0.7382054328918457,
+      "learning_rate": 0.0001758706946315182,
+      "loss": 0.9670585989952087,
+      "step": 633
+    },
+    {
+      "epoch": 0.36562860438292966,
+      "grad_norm": 1.193224310874939,
+      "learning_rate": 0.0001758322108908986,
+      "loss": 1.3042614459991455,
+      "step": 634
+    },
+    {
+      "epoch": 0.3662053056516724,
+      "grad_norm": 1.0091503858566284,
+      "learning_rate": 0.00017579372715027902,
+      "loss": 1.3520644903182983,
+      "step": 635
+    },
+    {
+      "epoch": 0.36678200692041524,
+      "grad_norm": 0.6810548901557922,
+      "learning_rate": 0.00017575524340965944,
+      "loss": 0.8741036653518677,
+      "step": 636
+    },
+    {
+      "epoch": 0.367358708189158,
+      "grad_norm": 0.7155483365058899,
+      "learning_rate": 0.00017571675966903986,
+      "loss": 0.8751124143600464,
+      "step": 637
+    },
+    {
+      "epoch": 0.3679354094579008,
+      "grad_norm": 1.0436261892318726,
+      "learning_rate": 0.00017567827592842024,
+      "loss": 1.5696821212768555,
+      "step": 638
+    },
+    {
+      "epoch": 0.3685121107266436,
+      "grad_norm": 0.9394407868385315,
+      "learning_rate": 0.00017563979218780066,
+      "loss": 0.8675939440727234,
+      "step": 639
+    },
+    {
+      "epoch": 0.3690888119953864,
+      "grad_norm": 1.4341135025024414,
+      "learning_rate": 0.00017560130844718107,
+      "loss": 1.498160481452942,
+      "step": 640
+    },
+    {
+      "epoch": 0.36966551326412916,
+      "grad_norm": 1.006375789642334,
+      "learning_rate": 0.0001755628247065615,
+      "loss": 1.2490055561065674,
+      "step": 641
+    },
+    {
+      "epoch": 0.370242214532872,
+      "grad_norm": 0.6104082465171814,
+      "learning_rate": 0.0001755243409659419,
+      "loss": 0.8300263285636902,
+      "step": 642
+    },
+    {
+      "epoch": 0.37081891580161475,
+      "grad_norm": 0.8571838736534119,
+      "learning_rate": 0.00017548585722532232,
+      "loss": 1.146481990814209,
+      "step": 643
+    },
+    {
+      "epoch": 0.37139561707035756,
+      "grad_norm": 0.6824607253074646,
+      "learning_rate": 0.00017544737348470273,
+      "loss": 1.2418452501296997,
+      "step": 644
+    },
+    {
+      "epoch": 0.3719723183391003,
+      "grad_norm": 1.0891611576080322,
+      "learning_rate": 0.00017540888974408315,
+      "loss": 1.2160457372665405,
+      "step": 645
+    },
+    {
+      "epoch": 0.37254901960784315,
+      "grad_norm": 0.6260281801223755,
+      "learning_rate": 0.00017537040600346354,
+      "loss": 0.8934881091117859,
+      "step": 646
+    },
+    {
+      "epoch": 0.3731257208765859,
+      "grad_norm": 0.8351913690567017,
+      "learning_rate": 0.00017533192226284395,
+      "loss": 1.5422282218933105,
+      "step": 647
+    },
+    {
+      "epoch": 0.3737024221453287,
+      "grad_norm": 0.7572267055511475,
+      "learning_rate": 0.00017529343852222437,
+      "loss": 1.2659950256347656,
+      "step": 648
+    },
+    {
+      "epoch": 0.3742791234140715,
+      "grad_norm": 0.7712565064430237,
+      "learning_rate": 0.00017525495478160478,
+      "loss": 1.2143782377243042,
+      "step": 649
+    },
+    {
+      "epoch": 0.3748558246828143,
+      "grad_norm": 0.6880773305892944,
+      "learning_rate": 0.0001752164710409852,
+      "loss": 1.0878217220306396,
+      "step": 650
+    },
+    {
+      "epoch": 0.3754325259515571,
+      "grad_norm": 0.8996551632881165,
+      "learning_rate": 0.00017517798730036561,
+      "loss": 0.9668335914611816,
+      "step": 651
+    },
+    {
+      "epoch": 0.3760092272202999,
+      "grad_norm": 0.921444296836853,
+      "learning_rate": 0.00017513950355974603,
+      "loss": 1.1585900783538818,
+      "step": 652
+    },
+    {
+      "epoch": 0.37658592848904265,
+      "grad_norm": 0.8658480048179626,
+      "learning_rate": 0.00017510101981912645,
+      "loss": 1.1533393859863281,
+      "step": 653
+    },
+    {
+      "epoch": 0.3771626297577855,
+      "grad_norm": 0.6665229797363281,
+      "learning_rate": 0.00017506253607850683,
+      "loss": 0.8233336210250854,
+      "step": 654
+    },
+    {
+      "epoch": 0.37773933102652824,
+      "grad_norm": 0.746337890625,
+      "learning_rate": 0.00017502405233788725,
+      "loss": 1.099341630935669,
+      "step": 655
+    },
+    {
+      "epoch": 0.37831603229527105,
+      "grad_norm": 0.8498716354370117,
+      "learning_rate": 0.00017498556859726766,
+      "loss": 1.333115577697754,
+      "step": 656
+    },
+    {
+      "epoch": 0.3788927335640138,
+      "grad_norm": 0.7371817827224731,
+      "learning_rate": 0.00017494708485664808,
+      "loss": 1.05489182472229,
+      "step": 657
+    },
+    {
+      "epoch": 0.37946943483275664,
+      "grad_norm": 0.7369913458824158,
+      "learning_rate": 0.0001749086011160285,
+      "loss": 0.7275075912475586,
+      "step": 658
+    },
+    {
+      "epoch": 0.3800461361014994,
+      "grad_norm": 1.4918899536132812,
+      "learning_rate": 0.0001748701173754089,
+      "loss": 1.2430638074874878,
+      "step": 659
+    },
+    {
+      "epoch": 0.3806228373702422,
+      "grad_norm": 0.686100423336029,
+      "learning_rate": 0.00017483163363478933,
+      "loss": 0.7841339707374573,
+      "step": 660
+    },
+    {
+      "epoch": 0.381199538638985,
+      "grad_norm": 0.7799985408782959,
+      "learning_rate": 0.00017479314989416974,
+      "loss": 1.1784673929214478,
+      "step": 661
+    },
+    {
+      "epoch": 0.3817762399077278,
+      "grad_norm": 0.7435747385025024,
+      "learning_rate": 0.00017475466615355013,
+      "loss": 1.180450439453125,
+      "step": 662
+    },
+    {
+      "epoch": 0.38235294117647056,
+      "grad_norm": 0.7358818650245667,
+      "learning_rate": 0.00017471618241293054,
+      "loss": 0.9987742900848389,
+      "step": 663
+    },
+    {
+      "epoch": 0.3829296424452134,
+      "grad_norm": 0.7353511452674866,
+      "learning_rate": 0.00017467769867231096,
+      "loss": 1.1325185298919678,
+      "step": 664
+    },
+    {
+      "epoch": 0.38350634371395614,
+      "grad_norm": 0.7735626697540283,
+      "learning_rate": 0.00017463921493169137,
+      "loss": 1.0828659534454346,
+      "step": 665
+    },
+    {
+      "epoch": 0.38408304498269896,
+      "grad_norm": 0.6293249130249023,
+      "learning_rate": 0.0001746007311910718,
+      "loss": 0.9253727793693542,
+      "step": 666
+    },
+    {
+      "epoch": 0.3846597462514418,
+      "grad_norm": 0.6271319389343262,
+      "learning_rate": 0.0001745622474504522,
+      "loss": 0.7645162343978882,
+      "step": 667
+    },
+    {
+      "epoch": 0.38523644752018454,
+      "grad_norm": 0.6632966995239258,
+      "learning_rate": 0.00017452376370983262,
+      "loss": 0.9796670079231262,
+      "step": 668
+    },
+    {
+      "epoch": 0.38581314878892736,
+      "grad_norm": 0.8829965591430664,
+      "learning_rate": 0.00017448527996921304,
+      "loss": 0.9777094721794128,
+      "step": 669
+    },
+    {
+      "epoch": 0.3863898500576701,
+      "grad_norm": 0.7675085663795471,
+      "learning_rate": 0.00017444679622859342,
+      "loss": 1.0497252941131592,
+      "step": 670
+    },
+    {
+      "epoch": 0.38696655132641294,
+      "grad_norm": 0.9194138050079346,
+      "learning_rate": 0.00017440831248797384,
+      "loss": 1.0992257595062256,
+      "step": 671
+    },
+    {
+      "epoch": 0.3875432525951557,
+      "grad_norm": 1.0398883819580078,
+      "learning_rate": 0.00017436982874735425,
+      "loss": 1.25284743309021,
+      "step": 672
+    },
+    {
+      "epoch": 0.3881199538638985,
+      "grad_norm": 0.5921796560287476,
+      "learning_rate": 0.00017433134500673467,
+      "loss": 0.6763097047805786,
+      "step": 673
+    },
+    {
+      "epoch": 0.3886966551326413,
+      "grad_norm": 1.0226387977600098,
+      "learning_rate": 0.00017429286126611508,
+      "loss": 1.3273173570632935,
+      "step": 674
+    },
+    {
+      "epoch": 0.3892733564013841,
+      "grad_norm": 0.5252590179443359,
+      "learning_rate": 0.0001742543775254955,
+      "loss": 0.6646312475204468,
+      "step": 675
+    },
+    {
+      "epoch": 0.38985005767012687,
+      "grad_norm": 0.600639820098877,
+      "learning_rate": 0.00017421589378487592,
+      "loss": 0.7095688581466675,
+      "step": 676
+    },
+    {
+      "epoch": 0.3904267589388697,
+      "grad_norm": 0.7131365537643433,
+      "learning_rate": 0.00017417741004425633,
+      "loss": 1.2200595140457153,
+      "step": 677
+    },
+    {
+      "epoch": 0.39100346020761245,
+      "grad_norm": 0.9018159508705139,
+      "learning_rate": 0.00017413892630363672,
+      "loss": 0.9669409394264221,
+      "step": 678
+    },
+    {
+      "epoch": 0.39158016147635527,
+      "grad_norm": 0.9841684103012085,
+      "learning_rate": 0.00017410044256301713,
+      "loss": 1.028241515159607,
+      "step": 679
+    },
+    {
+      "epoch": 0.39215686274509803,
+      "grad_norm": 0.9678821563720703,
+      "learning_rate": 0.00017406195882239755,
+      "loss": 1.3122403621673584,
+      "step": 680
+    },
+    {
+      "epoch": 0.39273356401384085,
+      "grad_norm": 0.6439565420150757,
+      "learning_rate": 0.00017402347508177796,
+      "loss": 0.8441326022148132,
+      "step": 681
+    },
+    {
+      "epoch": 0.3933102652825836,
+      "grad_norm": 0.8460219502449036,
+      "learning_rate": 0.00017398499134115838,
+      "loss": 1.193575382232666,
+      "step": 682
+    },
+    {
+      "epoch": 0.39388696655132643,
+      "grad_norm": 0.8068860769271851,
+      "learning_rate": 0.0001739465076005388,
+      "loss": 1.209285020828247,
+      "step": 683
+    },
+    {
+      "epoch": 0.3944636678200692,
+      "grad_norm": 0.6420811414718628,
+      "learning_rate": 0.0001739080238599192,
+      "loss": 0.9203285574913025,
+      "step": 684
+    },
+    {
+      "epoch": 0.395040369088812,
+      "grad_norm": 1.1171250343322754,
+      "learning_rate": 0.0001738695401192996,
+      "loss": 1.5638062953948975,
+      "step": 685
+    },
+    {
+      "epoch": 0.3956170703575548,
+      "grad_norm": 0.7218726873397827,
+      "learning_rate": 0.00017383105637868001,
+      "loss": 1.1434835195541382,
+      "step": 686
+    },
+    {
+      "epoch": 0.3961937716262976,
+      "grad_norm": 0.9958249926567078,
+      "learning_rate": 0.00017379257263806043,
+      "loss": 0.7441573143005371,
+      "step": 687
+    },
+    {
+      "epoch": 0.39677047289504036,
+      "grad_norm": 0.8222061395645142,
+      "learning_rate": 0.00017375408889744084,
+      "loss": 1.2088245153427124,
+      "step": 688
+    },
+    {
+      "epoch": 0.3973471741637832,
+      "grad_norm": 0.5759637355804443,
+      "learning_rate": 0.00017371560515682126,
+      "loss": 0.9504674077033997,
+      "step": 689
+    },
+    {
+      "epoch": 0.39792387543252594,
+      "grad_norm": 0.8157130479812622,
+      "learning_rate": 0.00017367712141620168,
+      "loss": 1.319948673248291,
+      "step": 690
+    },
+    {
+      "epoch": 0.39850057670126876,
+      "grad_norm": 0.7266381978988647,
+      "learning_rate": 0.0001736386376755821,
+      "loss": 0.8739478588104248,
+      "step": 691
+    },
+    {
+      "epoch": 0.3990772779700115,
+      "grad_norm": 0.644598126411438,
+      "learning_rate": 0.0001736001539349625,
+      "loss": 0.9521651864051819,
+      "step": 692
+    },
+    {
+      "epoch": 0.39965397923875434,
+      "grad_norm": 0.5922922492027283,
+      "learning_rate": 0.0001735616701943429,
+      "loss": 0.7051569223403931,
+      "step": 693
+    },
+    {
+      "epoch": 0.4002306805074971,
+      "grad_norm": 0.6880702972412109,
+      "learning_rate": 0.0001735231864537233,
+      "loss": 1.1202598810195923,
+      "step": 694
+    },
+    {
+      "epoch": 0.4008073817762399,
+      "grad_norm": 1.1836776733398438,
+      "learning_rate": 0.00017348470271310372,
+      "loss": 1.2588169574737549,
+      "step": 695
+    },
+    {
+      "epoch": 0.4013840830449827,
+      "grad_norm": 0.965606689453125,
+      "learning_rate": 0.00017344621897248414,
+      "loss": 0.7970831990242004,
+      "step": 696
+    },
+    {
+      "epoch": 0.4019607843137255,
+      "grad_norm": 0.8883787989616394,
+      "learning_rate": 0.00017340773523186456,
+      "loss": 1.6653708219528198,
+      "step": 697
+    },
+    {
+      "epoch": 0.40253748558246827,
+      "grad_norm": 0.7349938750267029,
+      "learning_rate": 0.00017336925149124497,
+      "loss": 0.7324041724205017,
+      "step": 698
+    },
+    {
+      "epoch": 0.4031141868512111,
+      "grad_norm": 1.0731885433197021,
+      "learning_rate": 0.00017333076775062539,
+      "loss": 0.9731301069259644,
+      "step": 699
+    },
+    {
+      "epoch": 0.40369088811995385,
+      "grad_norm": 0.8691738843917847,
+      "learning_rate": 0.0001732922840100058,
+      "loss": 1.0968525409698486,
+      "step": 700
+    },
+    {
+      "epoch": 0.40426758938869667,
+      "grad_norm": 0.921116292476654,
+      "learning_rate": 0.0001732538002693862,
+      "loss": 1.3427119255065918,
+      "step": 701
+    },
+    {
+      "epoch": 0.40484429065743943,
+      "grad_norm": 0.8539203405380249,
+      "learning_rate": 0.0001732153165287666,
+      "loss": 1.2618871927261353,
+      "step": 702
+    },
+    {
+      "epoch": 0.40542099192618225,
+      "grad_norm": 0.6238696575164795,
+      "learning_rate": 0.00017317683278814702,
+      "loss": 0.7679486274719238,
+      "step": 703
+    },
+    {
+      "epoch": 0.405997693194925,
+      "grad_norm": 0.6827321648597717,
+      "learning_rate": 0.00017313834904752743,
+      "loss": 0.9498722553253174,
+      "step": 704
+    },
+    {
+      "epoch": 0.40657439446366783,
+      "grad_norm": 0.9637985229492188,
+      "learning_rate": 0.00017309986530690785,
+      "loss": 1.2945339679718018,
+      "step": 705
+    },
+    {
+      "epoch": 0.4071510957324106,
+      "grad_norm": 0.6361503601074219,
+      "learning_rate": 0.00017306138156628827,
+      "loss": 1.2040516138076782,
+      "step": 706
+    },
+    {
+      "epoch": 0.4077277970011534,
+      "grad_norm": 0.713758647441864,
+      "learning_rate": 0.00017302289782566868,
+      "loss": 1.1285666227340698,
+      "step": 707
+    },
+    {
+      "epoch": 0.4083044982698962,
+      "grad_norm": 1.0620390176773071,
+      "learning_rate": 0.0001729844140850491,
+      "loss": 1.2117018699645996,
+      "step": 708
+    },
+    {
+      "epoch": 0.408881199538639,
+      "grad_norm": 0.6957300305366516,
+      "learning_rate": 0.00017294593034442948,
+      "loss": 1.2091706991195679,
+      "step": 709
+    },
+    {
+      "epoch": 0.40945790080738176,
+      "grad_norm": 0.4594845771789551,
+      "learning_rate": 0.0001729074466038099,
+      "loss": 3.3324732780456543,
+      "step": 710
+    },
+    {
+      "epoch": 0.4100346020761246,
+      "grad_norm": 0.8902932405471802,
+      "learning_rate": 0.00017286896286319031,
+      "loss": 1.1579055786132812,
+      "step": 711
+    },
+    {
+      "epoch": 0.41061130334486734,
+      "grad_norm": 0.7140578031539917,
+      "learning_rate": 0.00017283047912257073,
+      "loss": 0.877116858959198,
+      "step": 712
+    },
+    {
+      "epoch": 0.41118800461361016,
+      "grad_norm": 0.8449535369873047,
+      "learning_rate": 0.00017279199538195115,
+      "loss": 1.2400063276290894,
+      "step": 713
+    },
+    {
+      "epoch": 0.4117647058823529,
+      "grad_norm": 1.0700358152389526,
+      "learning_rate": 0.00017275351164133156,
+      "loss": 1.1401453018188477,
+      "step": 714
+    },
+    {
+      "epoch": 0.41234140715109574,
+      "grad_norm": 0.6705982685089111,
+      "learning_rate": 0.00017271502790071198,
+      "loss": 0.8326209783554077,
+      "step": 715
+    },
+    {
+      "epoch": 0.4129181084198385,
+      "grad_norm": 0.7149010896682739,
+      "learning_rate": 0.0001726765441600924,
+      "loss": 1.0872998237609863,
+      "step": 716
+    },
+    {
+      "epoch": 0.4134948096885813,
+      "grad_norm": 0.46808966994285583,
+      "learning_rate": 0.00017263806041947278,
+      "loss": 0.6795035004615784,
+      "step": 717
+    },
+    {
+      "epoch": 0.4140715109573241,
+      "grad_norm": 0.8606752157211304,
+      "learning_rate": 0.0001725995766788532,
+      "loss": 1.0544252395629883,
+      "step": 718
+    },
+    {
+      "epoch": 0.4146482122260669,
+      "grad_norm": 0.5839232802391052,
+      "learning_rate": 0.0001725610929382336,
+      "loss": 0.7785719633102417,
+      "step": 719
+    },
+    {
+      "epoch": 0.41522491349480967,
+      "grad_norm": 0.8700772523880005,
+      "learning_rate": 0.00017252260919761403,
+      "loss": 0.988602340221405,
+      "step": 720
+    },
+    {
+      "epoch": 0.4158016147635525,
+      "grad_norm": 0.9886090159416199,
+      "learning_rate": 0.00017248412545699444,
+      "loss": 1.3493539094924927,
+      "step": 721
+    },
+    {
+      "epoch": 0.41637831603229525,
+      "grad_norm": 0.9088316559791565,
+      "learning_rate": 0.00017244564171637486,
+      "loss": 1.0131090879440308,
+      "step": 722
+    },
+    {
+      "epoch": 0.41695501730103807,
+      "grad_norm": 0.9066189527511597,
+      "learning_rate": 0.00017240715797575527,
+      "loss": 1.2530944347381592,
+      "step": 723
+    },
+    {
+      "epoch": 0.41753171856978083,
+      "grad_norm": 0.7733665704727173,
+      "learning_rate": 0.00017236867423513569,
+      "loss": 1.1255629062652588,
+      "step": 724
+    },
+    {
+      "epoch": 0.41810841983852365,
+      "grad_norm": 0.609832763671875,
+      "learning_rate": 0.00017233019049451607,
+      "loss": 0.7514859437942505,
+      "step": 725
+    },
+    {
+      "epoch": 0.4186851211072664,
+      "grad_norm": 0.6903802752494812,
+      "learning_rate": 0.0001722917067538965,
+      "loss": 0.8925538063049316,
+      "step": 726
+    },
+    {
+      "epoch": 0.41926182237600923,
+      "grad_norm": 0.7692581415176392,
+      "learning_rate": 0.0001722532230132769,
+      "loss": 1.103420376777649,
+      "step": 727
+    },
+    {
+      "epoch": 0.419838523644752,
+      "grad_norm": 0.7881311774253845,
+      "learning_rate": 0.0001722147392726573,
+      "loss": 1.3109550476074219,
+      "step": 728
+    },
+    {
+      "epoch": 0.4204152249134948,
+      "grad_norm": 0.6949164271354675,
+      "learning_rate": 0.0001721762555320377,
+      "loss": 1.0904300212860107,
+      "step": 729
+    },
+    {
+      "epoch": 0.4209919261822376,
+      "grad_norm": 0.6746834516525269,
+      "learning_rate": 0.00017213777179141812,
+      "loss": 1.240382194519043,
+      "step": 730
+    },
+    {
+      "epoch": 0.4215686274509804,
+      "grad_norm": 0.8831079602241516,
+      "learning_rate": 0.00017209928805079854,
+      "loss": 1.546260118484497,
+      "step": 731
+    },
+    {
+      "epoch": 0.42214532871972316,
+      "grad_norm": 0.917523205280304,
+      "learning_rate": 0.00017206080431017895,
+      "loss": 1.3464173078536987,
+      "step": 732
+    },
+    {
+      "epoch": 0.422722029988466,
+      "grad_norm": 0.729640007019043,
+      "learning_rate": 0.00017202232056955937,
+      "loss": 0.9092597961425781,
+      "step": 733
+    },
+    {
+      "epoch": 0.4232987312572088,
+      "grad_norm": 0.9597057104110718,
+      "learning_rate": 0.00017198383682893976,
+      "loss": 1.449595332145691,
+      "step": 734
+    },
+    {
+      "epoch": 0.42387543252595156,
+      "grad_norm": 0.570996880531311,
+      "learning_rate": 0.00017194535308832017,
+      "loss": 0.660990297794342,
+      "step": 735
+    },
+    {
+      "epoch": 0.4244521337946944,
+      "grad_norm": 0.8485130071640015,
+      "learning_rate": 0.0001719068693477006,
+      "loss": 1.009351372718811,
+      "step": 736
+    },
+    {
+      "epoch": 0.42502883506343714,
+      "grad_norm": 1.1340487003326416,
+      "learning_rate": 0.000171868385607081,
+      "loss": 1.186898946762085,
+      "step": 737
+    },
+    {
+      "epoch": 0.42560553633217996,
+      "grad_norm": 0.9666796326637268,
+      "learning_rate": 0.00017182990186646142,
+      "loss": 1.3713027238845825,
+      "step": 738
+    },
+    {
+      "epoch": 0.4261822376009227,
+      "grad_norm": 0.8104447722434998,
+      "learning_rate": 0.00017179141812584183,
+      "loss": 0.7822756767272949,
+      "step": 739
+    },
+    {
+      "epoch": 0.42675893886966554,
+      "grad_norm": 0.7587509155273438,
+      "learning_rate": 0.00017175293438522225,
+      "loss": 1.1129992008209229,
+      "step": 740
+    },
+    {
+      "epoch": 0.4273356401384083,
+      "grad_norm": 0.854256272315979,
+      "learning_rate": 0.00017171445064460266,
+      "loss": 1.1753698587417603,
+      "step": 741
+    },
+    {
+      "epoch": 0.4279123414071511,
+      "grad_norm": 0.7335513234138489,
+      "learning_rate": 0.00017167596690398305,
+      "loss": 1.1233677864074707,
+      "step": 742
+    },
+    {
+      "epoch": 0.4284890426758939,
+      "grad_norm": 1.1383814811706543,
+      "learning_rate": 0.00017163748316336347,
+      "loss": 1.6328407526016235,
+      "step": 743
+    },
+    {
+      "epoch": 0.4290657439446367,
+      "grad_norm": 0.5805800557136536,
+      "learning_rate": 0.00017159899942274388,
+      "loss": 0.8374234437942505,
+      "step": 744
+    },
+    {
+      "epoch": 0.42964244521337946,
+      "grad_norm": 0.5744853615760803,
+      "learning_rate": 0.0001715605156821243,
+      "loss": 0.7072418332099915,
+      "step": 745
+    },
+    {
+      "epoch": 0.4302191464821223,
+      "grad_norm": 1.0968151092529297,
+      "learning_rate": 0.00017152203194150471,
+      "loss": 0.9308477640151978,
+      "step": 746
+    },
+    {
+      "epoch": 0.43079584775086505,
+      "grad_norm": 0.7771037220954895,
+      "learning_rate": 0.00017148354820088513,
+      "loss": 1.0803910493850708,
+      "step": 747
+    },
+    {
+      "epoch": 0.43137254901960786,
+      "grad_norm": 0.760296106338501,
+      "learning_rate": 0.00017144506446026554,
+      "loss": 0.9416469931602478,
+      "step": 748
+    },
+    {
+      "epoch": 0.43194925028835063,
+      "grad_norm": 0.8478863835334778,
+      "learning_rate": 0.00017140658071964596,
+      "loss": 1.0037909746170044,
+      "step": 749
+    },
+    {
+      "epoch": 0.43252595155709345,
+      "grad_norm": 0.802010715007782,
+      "learning_rate": 0.00017136809697902635,
+      "loss": 1.2789827585220337,
+      "step": 750
+    },
+    {
+      "epoch": 0.4331026528258362,
+      "grad_norm": 0.7146703004837036,
+      "learning_rate": 0.00017132961323840676,
+      "loss": 0.925313413143158,
+      "step": 751
+    },
+    {
+      "epoch": 0.43367935409457903,
+      "grad_norm": 1.1419707536697388,
+      "learning_rate": 0.00017129112949778718,
+      "loss": 1.3266316652297974,
+      "step": 752
+    },
+    {
+      "epoch": 0.4342560553633218,
+      "grad_norm": 0.5337522029876709,
+      "learning_rate": 0.0001712526457571676,
+      "loss": 0.8182927966117859,
+      "step": 753
+    },
+    {
+      "epoch": 0.4348327566320646,
+      "grad_norm": 0.7067147493362427,
+      "learning_rate": 0.000171214162016548,
+      "loss": 1.01529061794281,
+      "step": 754
+    },
+    {
+      "epoch": 0.4354094579008074,
+      "grad_norm": 0.8742361664772034,
+      "learning_rate": 0.00017117567827592842,
+      "loss": 0.9216449856758118,
+      "step": 755
+    },
+    {
+      "epoch": 0.4359861591695502,
+      "grad_norm": 1.0121413469314575,
+      "learning_rate": 0.00017113719453530884,
+      "loss": 1.5315768718719482,
+      "step": 756
+    },
+    {
+      "epoch": 0.43656286043829295,
+      "grad_norm": 0.970582127571106,
+      "learning_rate": 0.00017109871079468925,
+      "loss": 1.1701881885528564,
+      "step": 757
+    },
+    {
+      "epoch": 0.4371395617070358,
+      "grad_norm": 0.8317894339561462,
+      "learning_rate": 0.00017106022705406964,
+      "loss": 1.1619702577590942,
+      "step": 758
+    },
+    {
+      "epoch": 0.43771626297577854,
+      "grad_norm": 0.6935670375823975,
+      "learning_rate": 0.00017102174331345006,
+      "loss": 1.0018664598464966,
+      "step": 759
+    },
+    {
+      "epoch": 0.43829296424452135,
+      "grad_norm": 1.0123279094696045,
+      "learning_rate": 0.00017098325957283047,
+      "loss": 1.1231794357299805,
+      "step": 760
+    },
+    {
+      "epoch": 0.4388696655132641,
+      "grad_norm": 0.7619280219078064,
+      "learning_rate": 0.0001709447758322109,
+      "loss": 1.0395662784576416,
+      "step": 761
+    },
+    {
+      "epoch": 0.43944636678200694,
+      "grad_norm": 0.8570308089256287,
+      "learning_rate": 0.0001709062920915913,
+      "loss": 1.4022446870803833,
+      "step": 762
+    },
+    {
+      "epoch": 0.4400230680507497,
+      "grad_norm": 1.178285837173462,
+      "learning_rate": 0.00017086780835097172,
+      "loss": 1.5245153903961182,
+      "step": 763
+    },
+    {
+      "epoch": 0.4405997693194925,
+      "grad_norm": 0.876589298248291,
+      "learning_rate": 0.00017082932461035213,
+      "loss": 1.482165813446045,
+      "step": 764
+    },
+    {
+      "epoch": 0.4411764705882353,
+      "grad_norm": 0.8614532947540283,
+      "learning_rate": 0.00017079084086973255,
+      "loss": 1.312232255935669,
+      "step": 765
+    },
+    {
+      "epoch": 0.4417531718569781,
+      "grad_norm": 0.6772201061248779,
+      "learning_rate": 0.00017075235712911294,
+      "loss": 1.1610076427459717,
+      "step": 766
+    },
+    {
+      "epoch": 0.44232987312572086,
+      "grad_norm": 0.805927038192749,
+      "learning_rate": 0.00017071387338849335,
+      "loss": 1.3874244689941406,
+      "step": 767
+    },
+    {
+      "epoch": 0.4429065743944637,
+      "grad_norm": 0.5419954061508179,
+      "learning_rate": 0.00017067538964787377,
+      "loss": 0.7610808610916138,
+      "step": 768
+    },
+    {
+      "epoch": 0.44348327566320644,
+      "grad_norm": 0.773598313331604,
+      "learning_rate": 0.00017063690590725418,
+      "loss": 0.8612810373306274,
+      "step": 769
+    },
+    {
+      "epoch": 0.44405997693194926,
+      "grad_norm": 0.6376165151596069,
+      "learning_rate": 0.0001705984221666346,
+      "loss": 0.8417828679084778,
+      "step": 770
+    },
+    {
+      "epoch": 0.444636678200692,
+      "grad_norm": 0.6870789527893066,
+      "learning_rate": 0.00017055993842601501,
+      "loss": 1.1764918565750122,
+      "step": 771
+    },
+    {
+      "epoch": 0.44521337946943484,
+      "grad_norm": 0.5562968254089355,
+      "learning_rate": 0.00017052145468539543,
+      "loss": 0.8358933925628662,
+      "step": 772
+    },
+    {
+      "epoch": 0.4457900807381776,
+      "grad_norm": 0.602963924407959,
+      "learning_rate": 0.00017048297094477585,
+      "loss": 1.197677731513977,
+      "step": 773
+    },
+    {
+      "epoch": 0.4463667820069204,
+      "grad_norm": 1.0190907716751099,
+      "learning_rate": 0.00017044448720415623,
+      "loss": 1.4355199337005615,
+      "step": 774
+    },
+    {
+      "epoch": 0.4469434832756632,
+      "grad_norm": 0.633346676826477,
+      "learning_rate": 0.00017040600346353665,
+      "loss": 0.7924656867980957,
+      "step": 775
+    },
+    {
+      "epoch": 0.447520184544406,
+      "grad_norm": 0.797099232673645,
+      "learning_rate": 0.00017036751972291706,
+      "loss": 1.2302619218826294,
+      "step": 776
+    },
+    {
+      "epoch": 0.44809688581314877,
+      "grad_norm": 0.7166492938995361,
+      "learning_rate": 0.00017032903598229748,
+      "loss": 1.063340187072754,
+      "step": 777
+    },
+    {
+      "epoch": 0.4486735870818916,
+      "grad_norm": 0.9511370062828064,
+      "learning_rate": 0.0001702905522416779,
+      "loss": 0.8998168706893921,
+      "step": 778
+    },
+    {
+      "epoch": 0.44925028835063435,
+      "grad_norm": 0.8487029075622559,
+      "learning_rate": 0.0001702520685010583,
+      "loss": 1.1850653886795044,
+      "step": 779
+    },
+    {
+      "epoch": 0.44982698961937717,
+      "grad_norm": 1.0267854928970337,
+      "learning_rate": 0.00017021358476043873,
+      "loss": 1.246724009513855,
+      "step": 780
+    },
+    {
+      "epoch": 0.45040369088811993,
+      "grad_norm": 1.155428409576416,
+      "learning_rate": 0.00017017510101981914,
+      "loss": 1.539854884147644,
+      "step": 781
+    },
+    {
+      "epoch": 0.45098039215686275,
+      "grad_norm": 0.6774823069572449,
+      "learning_rate": 0.00017013661727919953,
+      "loss": 0.7472063302993774,
+      "step": 782
+    },
+    {
+      "epoch": 0.4515570934256055,
+      "grad_norm": 0.7500667572021484,
+      "learning_rate": 0.00017009813353857994,
+      "loss": 0.9946876168251038,
+      "step": 783
+    },
+    {
+      "epoch": 0.45213379469434833,
+      "grad_norm": 0.7643426656723022,
+      "learning_rate": 0.00017005964979796036,
+      "loss": 0.8451071977615356,
+      "step": 784
+    },
+    {
+      "epoch": 0.4527104959630911,
+      "grad_norm": 0.721379816532135,
+      "learning_rate": 0.00017002116605734077,
+      "loss": 0.9988998174667358,
+      "step": 785
+    },
+    {
+      "epoch": 0.4532871972318339,
+      "grad_norm": 0.8850287199020386,
+      "learning_rate": 0.0001699826823167212,
+      "loss": 0.9789897203445435,
+      "step": 786
+    },
+    {
+      "epoch": 0.4538638985005767,
+      "grad_norm": 1.0076375007629395,
+      "learning_rate": 0.0001699441985761016,
+      "loss": 1.3830417394638062,
+      "step": 787
+    },
+    {
+      "epoch": 0.4544405997693195,
+      "grad_norm": 0.6105207204818726,
+      "learning_rate": 0.00016990571483548202,
+      "loss": 0.8870081901550293,
+      "step": 788
+    },
+    {
+      "epoch": 0.45501730103806226,
+      "grad_norm": 0.7732753157615662,
+      "learning_rate": 0.00016986723109486244,
+      "loss": 0.9958963990211487,
+      "step": 789
+    },
+    {
+      "epoch": 0.4555940023068051,
+      "grad_norm": 0.9871165156364441,
+      "learning_rate": 0.00016982874735424282,
+      "loss": 1.1141139268875122,
+      "step": 790
+    },
+    {
+      "epoch": 0.45617070357554784,
+      "grad_norm": 0.7117231488227844,
+      "learning_rate": 0.00016979026361362324,
+      "loss": 1.0168585777282715,
+      "step": 791
+    },
+    {
+      "epoch": 0.45674740484429066,
+      "grad_norm": 0.6954454183578491,
+      "learning_rate": 0.00016975177987300365,
+      "loss": 0.9319931268692017,
+      "step": 792
+    },
+    {
+      "epoch": 0.4573241061130334,
+      "grad_norm": 0.6463753581047058,
+      "learning_rate": 0.00016971329613238407,
+      "loss": 0.9734832644462585,
+      "step": 793
+    },
+    {
+      "epoch": 0.45790080738177624,
+      "grad_norm": 0.7156365513801575,
+      "learning_rate": 0.00016967481239176448,
+      "loss": 1.0014495849609375,
+      "step": 794
+    },
+    {
+      "epoch": 0.458477508650519,
+      "grad_norm": 0.8648508787155151,
+      "learning_rate": 0.0001696363286511449,
+      "loss": 1.3907616138458252,
+      "step": 795
+    },
+    {
+      "epoch": 0.4590542099192618,
+      "grad_norm": 0.8066338300704956,
+      "learning_rate": 0.00016959784491052532,
+      "loss": 1.0530327558517456,
+      "step": 796
+    },
+    {
+      "epoch": 0.4596309111880046,
+      "grad_norm": 0.8617266416549683,
+      "learning_rate": 0.00016955936116990573,
+      "loss": 1.7989249229431152,
+      "step": 797
+    },
+    {
+      "epoch": 0.4602076124567474,
+      "grad_norm": 0.7956259250640869,
+      "learning_rate": 0.00016952087742928612,
+      "loss": 0.928198516368866,
+      "step": 798
+    },
+    {
+      "epoch": 0.46078431372549017,
+      "grad_norm": 0.8778709173202515,
+      "learning_rate": 0.00016948239368866653,
+      "loss": 0.9466978907585144,
+      "step": 799
+    },
+    {
+      "epoch": 0.461361014994233,
+      "grad_norm": 0.8518659472465515,
+      "learning_rate": 0.00016944390994804695,
+      "loss": 1.0593540668487549,
+      "step": 800
+    },
+    {
+      "epoch": 0.4619377162629758,
+      "grad_norm": 0.79550701379776,
+      "learning_rate": 0.00016940542620742736,
+      "loss": 1.1164321899414062,
+      "step": 801
+    },
+    {
+      "epoch": 0.46251441753171857,
+      "grad_norm": 1.0006239414215088,
+      "learning_rate": 0.00016936694246680778,
+      "loss": 1.160499930381775,
+      "step": 802
+    },
+    {
+      "epoch": 0.4630911188004614,
+      "grad_norm": 0.8525403738021851,
+      "learning_rate": 0.0001693284587261882,
+      "loss": 1.0770652294158936,
+      "step": 803
+    },
+    {
+      "epoch": 0.46366782006920415,
+      "grad_norm": 0.6851354837417603,
+      "learning_rate": 0.0001692899749855686,
+      "loss": 1.0310590267181396,
+      "step": 804
+    },
+    {
+      "epoch": 0.46424452133794697,
+      "grad_norm": 0.6831552386283875,
+      "learning_rate": 0.000169251491244949,
+      "loss": 1.0782524347305298,
+      "step": 805
+    },
+    {
+      "epoch": 0.46482122260668973,
+      "grad_norm": 0.8892863988876343,
+      "learning_rate": 0.00016921300750432941,
+      "loss": 1.3154478073120117,
+      "step": 806
+    },
+    {
+      "epoch": 0.46539792387543255,
+      "grad_norm": 0.6863577961921692,
+      "learning_rate": 0.00016917452376370983,
+      "loss": 0.5912436842918396,
+      "step": 807
+    },
+    {
+      "epoch": 0.4659746251441753,
+      "grad_norm": 0.8612192869186401,
+      "learning_rate": 0.00016913604002309024,
+      "loss": 1.0140503644943237,
+      "step": 808
+    },
+    {
+      "epoch": 0.46655132641291813,
+      "grad_norm": 0.6565495729446411,
+      "learning_rate": 0.00016909755628247066,
+      "loss": 0.8388250470161438,
+      "step": 809
+    },
+    {
+      "epoch": 0.4671280276816609,
+      "grad_norm": 0.5729434490203857,
+      "learning_rate": 0.00016905907254185107,
+      "loss": 0.8662521839141846,
+      "step": 810
+    },
+    {
+      "epoch": 0.4677047289504037,
+      "grad_norm": 0.8261442184448242,
+      "learning_rate": 0.0001690205888012315,
+      "loss": 1.1527458429336548,
+      "step": 811
+    },
+    {
+      "epoch": 0.4682814302191465,
+      "grad_norm": 0.6182582974433899,
+      "learning_rate": 0.0001689821050606119,
+      "loss": 0.7817882895469666,
+      "step": 812
+    },
+    {
+      "epoch": 0.4688581314878893,
+      "grad_norm": 0.5987662672996521,
+      "learning_rate": 0.0001689436213199923,
+      "loss": 0.864625871181488,
+      "step": 813
+    },
+    {
+      "epoch": 0.46943483275663206,
+      "grad_norm": 0.8617327809333801,
+      "learning_rate": 0.0001689051375793727,
+      "loss": 1.1531751155853271,
+      "step": 814
+    },
+    {
+      "epoch": 0.4700115340253749,
+      "grad_norm": 0.8277755379676819,
+      "learning_rate": 0.00016886665383875312,
+      "loss": 0.928108811378479,
+      "step": 815
+    },
+    {
+      "epoch": 0.47058823529411764,
+      "grad_norm": 0.7510029673576355,
+      "learning_rate": 0.00016882817009813354,
+      "loss": 1.0068414211273193,
+      "step": 816
+    },
+    {
+      "epoch": 0.47116493656286046,
+      "grad_norm": 0.8691316246986389,
+      "learning_rate": 0.00016878968635751395,
+      "loss": 1.0941516160964966,
+      "step": 817
+    },
+    {
+      "epoch": 0.4717416378316032,
+      "grad_norm": 0.581984281539917,
+      "learning_rate": 0.00016875120261689437,
+      "loss": 0.6039727926254272,
+      "step": 818
+    },
+    {
+      "epoch": 0.47231833910034604,
+      "grad_norm": 0.7486310005187988,
+      "learning_rate": 0.00016871271887627479,
+      "loss": 1.140452265739441,
+      "step": 819
+    },
+    {
+      "epoch": 0.4728950403690888,
+      "grad_norm": 0.8794305324554443,
+      "learning_rate": 0.0001686742351356552,
+      "loss": 1.2717854976654053,
+      "step": 820
+    },
+    {
+      "epoch": 0.4734717416378316,
+      "grad_norm": 0.8812481164932251,
+      "learning_rate": 0.0001686357513950356,
+      "loss": 0.9813717007637024,
+      "step": 821
+    },
+    {
+      "epoch": 0.4740484429065744,
+      "grad_norm": 0.9091891646385193,
+      "learning_rate": 0.000168597267654416,
+      "loss": 1.2938401699066162,
+      "step": 822
+    },
+    {
+      "epoch": 0.4746251441753172,
+      "grad_norm": 0.9045780301094055,
+      "learning_rate": 0.00016855878391379642,
+      "loss": 1.312792181968689,
+      "step": 823
+    },
+    {
+      "epoch": 0.47520184544405997,
+      "grad_norm": 0.8430265784263611,
+      "learning_rate": 0.00016852030017317683,
+      "loss": 1.2679914236068726,
+      "step": 824
+    },
+    {
+      "epoch": 0.4757785467128028,
+      "grad_norm": 0.6870001554489136,
+      "learning_rate": 0.00016848181643255725,
+      "loss": 0.970576822757721,
+      "step": 825
+    },
+    {
+      "epoch": 0.47635524798154555,
+      "grad_norm": 0.8256406188011169,
+      "learning_rate": 0.00016844333269193767,
+      "loss": 1.302760362625122,
+      "step": 826
+    },
+    {
+      "epoch": 0.47693194925028837,
+      "grad_norm": 0.7057660222053528,
+      "learning_rate": 0.00016840484895131808,
+      "loss": 0.9811574220657349,
+      "step": 827
+    },
+    {
+      "epoch": 0.47750865051903113,
+      "grad_norm": 0.8487821817398071,
+      "learning_rate": 0.0001683663652106985,
+      "loss": 1.0537941455841064,
+      "step": 828
+    },
+    {
+      "epoch": 0.47808535178777395,
+      "grad_norm": 0.7474492788314819,
+      "learning_rate": 0.00016832788147007888,
+      "loss": 0.856541633605957,
+      "step": 829
+    },
+    {
+      "epoch": 0.4786620530565167,
+      "grad_norm": 0.9228368401527405,
+      "learning_rate": 0.0001682893977294593,
+      "loss": 1.0505741834640503,
+      "step": 830
+    },
+    {
+      "epoch": 0.47923875432525953,
+      "grad_norm": 0.9288182854652405,
+      "learning_rate": 0.00016825091398883971,
+      "loss": 1.3584654331207275,
+      "step": 831
+    },
+    {
+      "epoch": 0.4798154555940023,
+      "grad_norm": 1.4403129816055298,
+      "learning_rate": 0.00016821243024822013,
+      "loss": 1.911801815032959,
+      "step": 832
+    },
+    {
+      "epoch": 0.4803921568627451,
+      "grad_norm": 0.6283893585205078,
+      "learning_rate": 0.00016817394650760055,
+      "loss": 0.8583131432533264,
+      "step": 833
+    },
+    {
+      "epoch": 0.4809688581314879,
+      "grad_norm": 0.6910902261734009,
+      "learning_rate": 0.00016813546276698096,
+      "loss": 1.3508315086364746,
+      "step": 834
+    },
+    {
+      "epoch": 0.4815455594002307,
+      "grad_norm": 0.6606875658035278,
+      "learning_rate": 0.00016809697902636138,
+      "loss": 1.0815465450286865,
+      "step": 835
+    },
+    {
+      "epoch": 0.48212226066897346,
+      "grad_norm": 0.8546112775802612,
+      "learning_rate": 0.0001680584952857418,
+      "loss": 1.2201032638549805,
+      "step": 836
+    },
+    {
+      "epoch": 0.4826989619377163,
+      "grad_norm": 0.9130816459655762,
+      "learning_rate": 0.00016802001154512218,
+      "loss": 1.208343744277954,
+      "step": 837
+    },
+    {
+      "epoch": 0.48327566320645904,
+      "grad_norm": 0.7690496444702148,
+      "learning_rate": 0.0001679815278045026,
+      "loss": 1.0452954769134521,
+      "step": 838
+    },
+    {
+      "epoch": 0.48385236447520186,
+      "grad_norm": 0.7210266590118408,
+      "learning_rate": 0.000167943044063883,
+      "loss": 0.7897384166717529,
+      "step": 839
+    },
+    {
+      "epoch": 0.4844290657439446,
+      "grad_norm": 0.5705054402351379,
+      "learning_rate": 0.00016790456032326342,
+      "loss": 0.8288441896438599,
+      "step": 840
+    },
+    {
+      "epoch": 0.48500576701268744,
+      "grad_norm": 0.6143510341644287,
+      "learning_rate": 0.00016786607658264384,
+      "loss": 0.8081311583518982,
+      "step": 841
+    },
+    {
+      "epoch": 0.4855824682814302,
+      "grad_norm": 0.7222305536270142,
+      "learning_rate": 0.00016782759284202426,
+      "loss": 1.1107532978057861,
+      "step": 842
+    },
+    {
+      "epoch": 0.486159169550173,
+      "grad_norm": 0.6712546944618225,
+      "learning_rate": 0.00016778910910140467,
+      "loss": 0.8375999927520752,
+      "step": 843
+    },
+    {
+      "epoch": 0.4867358708189158,
+      "grad_norm": 0.9085020422935486,
+      "learning_rate": 0.00016775062536078509,
+      "loss": 0.9624453186988831,
+      "step": 844
+    },
+    {
+      "epoch": 0.4873125720876586,
+      "grad_norm": 0.773102879524231,
+      "learning_rate": 0.00016771214162016547,
+      "loss": 1.0454928874969482,
+      "step": 845
+    },
+    {
+      "epoch": 0.48788927335640137,
+      "grad_norm": 0.5635338425636292,
+      "learning_rate": 0.0001676736578795459,
+      "loss": 0.7329631447792053,
+      "step": 846
+    },
+    {
+      "epoch": 0.4884659746251442,
+      "grad_norm": 0.8183399438858032,
+      "learning_rate": 0.0001676351741389263,
+      "loss": 0.859244704246521,
+      "step": 847
+    },
+    {
+      "epoch": 0.48904267589388695,
+      "grad_norm": 0.7920128107070923,
+      "learning_rate": 0.00016759669039830672,
+      "loss": 0.9889219403266907,
+      "step": 848
+    },
+    {
+      "epoch": 0.48961937716262977,
+      "grad_norm": 1.1391570568084717,
+      "learning_rate": 0.00016755820665768714,
+      "loss": 1.146942138671875,
+      "step": 849
+    },
+    {
+      "epoch": 0.49019607843137253,
+      "grad_norm": 0.6648845076560974,
+      "learning_rate": 0.00016751972291706755,
+      "loss": 0.7090552449226379,
+      "step": 850
+    },
+    {
+      "epoch": 0.49077277970011535,
+      "grad_norm": 0.7156478762626648,
+      "learning_rate": 0.00016748123917644797,
+      "loss": 0.7772218585014343,
+      "step": 851
+    },
+    {
+      "epoch": 0.4913494809688581,
+      "grad_norm": 0.7279021739959717,
+      "learning_rate": 0.00016744275543582838,
+      "loss": 1.0468722581863403,
+      "step": 852
+    },
+    {
+      "epoch": 0.49192618223760093,
+      "grad_norm": 1.0862352848052979,
+      "learning_rate": 0.00016740427169520877,
+      "loss": 1.3199949264526367,
+      "step": 853
+    },
+    {
+      "epoch": 0.4925028835063437,
+      "grad_norm": 0.5989871025085449,
+      "learning_rate": 0.00016736578795458918,
+      "loss": 0.7066143751144409,
+      "step": 854
+    },
+    {
+      "epoch": 0.4930795847750865,
+      "grad_norm": 0.88418048620224,
+      "learning_rate": 0.0001673273042139696,
+      "loss": 0.9679941534996033,
+      "step": 855
+    },
+    {
+      "epoch": 0.4936562860438293,
+      "grad_norm": 0.7538619637489319,
+      "learning_rate": 0.00016728882047335002,
+      "loss": 0.906350314617157,
+      "step": 856
+    },
+    {
+      "epoch": 0.4942329873125721,
+      "grad_norm": 1.0406384468078613,
+      "learning_rate": 0.00016725033673273043,
+      "loss": 1.0761326551437378,
+      "step": 857
+    },
+    {
+      "epoch": 0.49480968858131485,
+      "grad_norm": 0.9118819236755371,
+      "learning_rate": 0.00016721185299211085,
+      "loss": 1.449715495109558,
+      "step": 858
+    },
+    {
+      "epoch": 0.4953863898500577,
+      "grad_norm": 0.7859880328178406,
+      "learning_rate": 0.00016717336925149126,
+      "loss": 1.0066848993301392,
+      "step": 859
+    },
+    {
+      "epoch": 0.49596309111880044,
+      "grad_norm": 0.7971929907798767,
+      "learning_rate": 0.00016713488551087168,
+      "loss": 1.0836429595947266,
+      "step": 860
+    },
+    {
+      "epoch": 0.49653979238754326,
+      "grad_norm": 0.7688129544258118,
+      "learning_rate": 0.00016709640177025206,
+      "loss": 0.8990678191184998,
+      "step": 861
+    },
+    {
+      "epoch": 0.497116493656286,
+      "grad_norm": 0.6911450028419495,
+      "learning_rate": 0.00016705791802963248,
+      "loss": 0.9118435382843018,
+      "step": 862
+    },
+    {
+      "epoch": 0.49769319492502884,
+      "grad_norm": 0.9296817183494568,
+      "learning_rate": 0.0001670194342890129,
+      "loss": 1.0580615997314453,
+      "step": 863
+    },
+    {
+      "epoch": 0.4982698961937716,
+      "grad_norm": 0.5820940732955933,
+      "learning_rate": 0.0001669809505483933,
+      "loss": 0.6944743394851685,
+      "step": 864
+    },
+    {
+      "epoch": 0.4988465974625144,
+      "grad_norm": 0.9766574501991272,
+      "learning_rate": 0.00016694246680777373,
+      "loss": 1.4097439050674438,
+      "step": 865
+    },
+    {
+      "epoch": 0.4994232987312572,
+      "grad_norm": 0.658211350440979,
+      "learning_rate": 0.00016690398306715414,
+      "loss": 0.7773644924163818,
+      "step": 866
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.7480500340461731,
+      "learning_rate": 0.00016686549932653456,
+      "loss": 1.1536113023757935,
+      "step": 867
+    },
+    {
+      "epoch": 0.5005767012687428,
+      "grad_norm": 0.5885343551635742,
+      "learning_rate": 0.00016682701558591497,
+      "loss": 0.5359970927238464,
+      "step": 868
+    },
+    {
+      "epoch": 0.5011534025374856,
+      "grad_norm": 0.7808444499969482,
+      "learning_rate": 0.00016678853184529536,
+      "loss": 0.6940274834632874,
+      "step": 869
+    },
+    {
+      "epoch": 0.5017301038062284,
+      "grad_norm": 0.8007370233535767,
+      "learning_rate": 0.00016675004810467577,
+      "loss": 1.3268241882324219,
+      "step": 870
+    },
+    {
+      "epoch": 0.5023068050749712,
+      "grad_norm": 0.6729685068130493,
+      "learning_rate": 0.0001667115643640562,
+      "loss": 0.9482746124267578,
+      "step": 871
+    },
+    {
+      "epoch": 0.5028835063437139,
+      "grad_norm": 0.648239016532898,
+      "learning_rate": 0.0001666730806234366,
+      "loss": 0.9904931783676147,
+      "step": 872
+    },
+    {
+      "epoch": 0.5034602076124568,
+      "grad_norm": 0.7997180223464966,
+      "learning_rate": 0.00016663459688281702,
+      "loss": 1.0594019889831543,
+      "step": 873
+    },
+    {
+      "epoch": 0.5040369088811996,
+      "grad_norm": 0.8298223614692688,
+      "learning_rate": 0.00016659611314219744,
+      "loss": 0.9604882597923279,
+      "step": 874
+    },
+    {
+      "epoch": 0.5046136101499423,
+      "grad_norm": 0.8724483251571655,
+      "learning_rate": 0.00016655762940157785,
+      "loss": 1.0515791177749634,
+      "step": 875
+    },
+    {
+      "epoch": 0.5051903114186851,
+      "grad_norm": 0.7477858662605286,
+      "learning_rate": 0.00016651914566095827,
+      "loss": 1.0346887111663818,
+      "step": 876
+    },
+    {
+      "epoch": 0.505767012687428,
+      "grad_norm": 0.6524494886398315,
+      "learning_rate": 0.00016648066192033865,
+      "loss": 0.8699806928634644,
+      "step": 877
+    },
+    {
+      "epoch": 0.5063437139561707,
+      "grad_norm": 0.7959410548210144,
+      "learning_rate": 0.00016644217817971907,
+      "loss": 1.0138338804244995,
+      "step": 878
+    },
+    {
+      "epoch": 0.5069204152249135,
+      "grad_norm": 0.7872818112373352,
+      "learning_rate": 0.00016640369443909949,
+      "loss": 1.0084038972854614,
+      "step": 879
+    },
+    {
+      "epoch": 0.5074971164936563,
+      "grad_norm": 0.9153385758399963,
+      "learning_rate": 0.0001663652106984799,
+      "loss": 0.9120053052902222,
+      "step": 880
+    },
+    {
+      "epoch": 0.5080738177623991,
+      "grad_norm": 0.8691549301147461,
+      "learning_rate": 0.00016632672695786032,
+      "loss": 0.9792031645774841,
+      "step": 881
+    },
+    {
+      "epoch": 0.5086505190311419,
+      "grad_norm": 0.7193480730056763,
+      "learning_rate": 0.00016628824321724073,
+      "loss": 0.9441159963607788,
+      "step": 882
+    },
+    {
+      "epoch": 0.5092272202998847,
+      "grad_norm": 0.5675065517425537,
+      "learning_rate": 0.00016624975947662115,
+      "loss": 0.7550349235534668,
+      "step": 883
+    },
+    {
+      "epoch": 0.5098039215686274,
+      "grad_norm": 0.45122864842414856,
+      "learning_rate": 0.00016621127573600156,
+      "loss": 0.494687020778656,
+      "step": 884
+    },
+    {
+      "epoch": 0.5103806228373703,
+      "grad_norm": 0.5535047650337219,
+      "learning_rate": 0.00016617279199538195,
+      "loss": 1.0048768520355225,
+      "step": 885
+    },
+    {
+      "epoch": 0.510957324106113,
+      "grad_norm": 1.1627446413040161,
+      "learning_rate": 0.00016613430825476237,
+      "loss": 1.3231415748596191,
+      "step": 886
+    },
+    {
+      "epoch": 0.5115340253748558,
+      "grad_norm": 0.5924594402313232,
+      "learning_rate": 0.00016609582451414278,
+      "loss": 0.8373284339904785,
+      "step": 887
+    },
+    {
+      "epoch": 0.5121107266435986,
+      "grad_norm": 1.071594476699829,
+      "learning_rate": 0.0001660573407735232,
+      "loss": 1.1695808172225952,
+      "step": 888
+    },
+    {
+      "epoch": 0.5126874279123415,
+      "grad_norm": 0.7243885397911072,
+      "learning_rate": 0.0001660188570329036,
+      "loss": 0.9688019156455994,
+      "step": 889
+    },
+    {
+      "epoch": 0.5132641291810842,
+      "grad_norm": 0.7857576012611389,
+      "learning_rate": 0.00016598037329228403,
+      "loss": 0.9062821269035339,
+      "step": 890
+    },
+    {
+      "epoch": 0.513840830449827,
+      "grad_norm": 0.6501168012619019,
+      "learning_rate": 0.00016594188955166444,
+      "loss": 0.7230191230773926,
+      "step": 891
+    },
+    {
+      "epoch": 0.5144175317185697,
+      "grad_norm": 0.7679166197776794,
+      "learning_rate": 0.00016590340581104483,
+      "loss": 0.9849987030029297,
+      "step": 892
+    },
+    {
+      "epoch": 0.5149942329873126,
+      "grad_norm": 0.5687773823738098,
+      "learning_rate": 0.00016586492207042524,
+      "loss": 0.5315793752670288,
+      "step": 893
+    },
+    {
+      "epoch": 0.5155709342560554,
+      "grad_norm": 0.5201639533042908,
+      "learning_rate": 0.00016582643832980566,
+      "loss": 0.833229660987854,
+      "step": 894
+    },
+    {
+      "epoch": 0.5161476355247981,
+      "grad_norm": 0.9703792333602905,
+      "learning_rate": 0.00016578795458918608,
+      "loss": 1.2787346839904785,
+      "step": 895
+    },
+    {
+      "epoch": 0.5167243367935409,
+      "grad_norm": 0.5964572429656982,
+      "learning_rate": 0.0001657494708485665,
+      "loss": 0.8054360151290894,
+      "step": 896
+    },
+    {
+      "epoch": 0.5173010380622838,
+      "grad_norm": 0.8156993389129639,
+      "learning_rate": 0.0001657109871079469,
+      "loss": 1.1183547973632812,
+      "step": 897
+    },
+    {
+      "epoch": 0.5178777393310265,
+      "grad_norm": 0.9944779276847839,
+      "learning_rate": 0.00016567250336732732,
+      "loss": 1.4230319261550903,
+      "step": 898
+    },
+    {
+      "epoch": 0.5184544405997693,
+      "grad_norm": 0.6466273069381714,
+      "learning_rate": 0.00016563401962670774,
+      "loss": 0.9248323440551758,
+      "step": 899
+    },
+    {
+      "epoch": 0.5190311418685121,
+      "grad_norm": 0.6486216187477112,
+      "learning_rate": 0.00016559553588608812,
+      "loss": 0.8279266357421875,
+      "step": 900
+    },
+    {
+      "epoch": 0.5196078431372549,
+      "grad_norm": 0.8492687940597534,
+      "learning_rate": 0.00016555705214546854,
+      "loss": 1.1167151927947998,
+      "step": 901
+    },
+    {
+      "epoch": 0.5201845444059977,
+      "grad_norm": 0.7403521537780762,
+      "learning_rate": 0.00016551856840484896,
+      "loss": 0.9129210710525513,
+      "step": 902
+    },
+    {
+      "epoch": 0.5207612456747405,
+      "grad_norm": 0.9525539875030518,
+      "learning_rate": 0.00016548008466422937,
+      "loss": 1.0805696249008179,
+      "step": 903
+    },
+    {
+      "epoch": 0.5213379469434832,
+      "grad_norm": 0.6410759091377258,
+      "learning_rate": 0.00016544160092360979,
+      "loss": 0.7183154821395874,
+      "step": 904
+    },
+    {
+      "epoch": 0.5219146482122261,
+      "grad_norm": 0.9240155816078186,
+      "learning_rate": 0.0001654031171829902,
+      "loss": 1.2977594137191772,
+      "step": 905
+    },
+    {
+      "epoch": 0.5224913494809689,
+      "grad_norm": 0.5909906625747681,
+      "learning_rate": 0.00016536463344237062,
+      "loss": 0.8771336078643799,
+      "step": 906
+    },
+    {
+      "epoch": 0.5230680507497116,
+      "grad_norm": 0.6739245653152466,
+      "learning_rate": 0.00016532614970175103,
+      "loss": 0.9435271620750427,
+      "step": 907
+    },
+    {
+      "epoch": 0.5236447520184544,
+      "grad_norm": 0.7840787172317505,
+      "learning_rate": 0.00016528766596113142,
+      "loss": 0.9116816520690918,
+      "step": 908
+    },
+    {
+      "epoch": 0.5242214532871973,
+      "grad_norm": 0.7001404762268066,
+      "learning_rate": 0.00016524918222051184,
+      "loss": 0.7686711549758911,
+      "step": 909
+    },
+    {
+      "epoch": 0.52479815455594,
+      "grad_norm": 0.7492363452911377,
+      "learning_rate": 0.00016521069847989225,
+      "loss": 0.894406795501709,
+      "step": 910
+    },
+    {
+      "epoch": 0.5253748558246828,
+      "grad_norm": 0.6643780469894409,
+      "learning_rate": 0.00016517221473927267,
+      "loss": 0.9077553153038025,
+      "step": 911
+    },
+    {
+      "epoch": 0.5259515570934256,
+      "grad_norm": 0.6426498889923096,
+      "learning_rate": 0.00016513373099865308,
+      "loss": 0.7784804701805115,
+      "step": 912
+    },
+    {
+      "epoch": 0.5265282583621684,
+      "grad_norm": 0.6445097923278809,
+      "learning_rate": 0.0001650952472580335,
+      "loss": 0.8351481556892395,
+      "step": 913
+    },
+    {
+      "epoch": 0.5271049596309112,
+      "grad_norm": 0.9749622344970703,
+      "learning_rate": 0.0001650567635174139,
+      "loss": 1.3779326677322388,
+      "step": 914
+    },
+    {
+      "epoch": 0.527681660899654,
+      "grad_norm": 1.0297281742095947,
+      "learning_rate": 0.00016501827977679433,
+      "loss": 1.4258373975753784,
+      "step": 915
+    },
+    {
+      "epoch": 0.5282583621683967,
+      "grad_norm": 0.8116568326950073,
+      "learning_rate": 0.00016497979603617472,
+      "loss": 1.120481252670288,
+      "step": 916
+    },
+    {
+      "epoch": 0.5288350634371396,
+      "grad_norm": 0.8832195401191711,
+      "learning_rate": 0.00016494131229555513,
+      "loss": 1.0475956201553345,
+      "step": 917
+    },
+    {
+      "epoch": 0.5294117647058824,
+      "grad_norm": 0.7668746709823608,
+      "learning_rate": 0.00016490282855493555,
+      "loss": 0.9356057643890381,
+      "step": 918
+    },
+    {
+      "epoch": 0.5299884659746251,
+      "grad_norm": 0.7938312292098999,
+      "learning_rate": 0.00016486434481431596,
+      "loss": 1.0766160488128662,
+      "step": 919
+    },
+    {
+      "epoch": 0.5305651672433679,
+      "grad_norm": 0.6379091739654541,
+      "learning_rate": 0.00016482586107369638,
+      "loss": 0.8664296865463257,
+      "step": 920
+    },
+    {
+      "epoch": 0.5311418685121108,
+      "grad_norm": 0.5966930389404297,
+      "learning_rate": 0.0001647873773330768,
+      "loss": 0.7848939299583435,
+      "step": 921
+    },
+    {
+      "epoch": 0.5317185697808535,
+      "grad_norm": 0.7270369529724121,
+      "learning_rate": 0.0001647488935924572,
+      "loss": 0.8690502643585205,
+      "step": 922
+    },
+    {
+      "epoch": 0.5322952710495963,
+      "grad_norm": 0.7373891472816467,
+      "learning_rate": 0.00016471040985183762,
+      "loss": 0.9187401533126831,
+      "step": 923
+    },
+    {
+      "epoch": 0.532871972318339,
+      "grad_norm": 0.6114344596862793,
+      "learning_rate": 0.000164671926111218,
+      "loss": 0.7336284518241882,
+      "step": 924
+    },
+    {
+      "epoch": 0.5334486735870819,
+      "grad_norm": 0.7629640102386475,
+      "learning_rate": 0.00016463344237059843,
+      "loss": 1.0568023920059204,
+      "step": 925
+    },
+    {
+      "epoch": 0.5340253748558247,
+      "grad_norm": 0.5172185897827148,
+      "learning_rate": 0.00016459495862997884,
+      "loss": 0.6043404936790466,
+      "step": 926
+    },
+    {
+      "epoch": 0.5346020761245674,
+      "grad_norm": 0.6732125282287598,
+      "learning_rate": 0.00016455647488935926,
+      "loss": 0.7869133353233337,
+      "step": 927
+    },
+    {
+      "epoch": 0.5351787773933102,
+      "grad_norm": 0.993881344795227,
+      "learning_rate": 0.00016451799114873967,
+      "loss": 1.3750996589660645,
+      "step": 928
+    },
+    {
+      "epoch": 0.5357554786620531,
+      "grad_norm": 0.6748846173286438,
+      "learning_rate": 0.0001644795074081201,
+      "loss": 0.7957302331924438,
+      "step": 929
+    },
+    {
+      "epoch": 0.5363321799307958,
+      "grad_norm": 0.5961597561836243,
+      "learning_rate": 0.0001644410236675005,
+      "loss": 0.817986786365509,
+      "step": 930
+    },
+    {
+      "epoch": 0.5369088811995386,
+      "grad_norm": 0.8336942195892334,
+      "learning_rate": 0.00016440253992688092,
+      "loss": 1.071876883506775,
+      "step": 931
+    },
+    {
+      "epoch": 0.5374855824682814,
+      "grad_norm": 0.8322470784187317,
+      "learning_rate": 0.0001643640561862613,
+      "loss": 0.9675548672676086,
+      "step": 932
+    },
+    {
+      "epoch": 0.5380622837370242,
+      "grad_norm": 0.8054575324058533,
+      "learning_rate": 0.00016432557244564172,
+      "loss": 1.0018256902694702,
+      "step": 933
+    },
+    {
+      "epoch": 0.538638985005767,
+      "grad_norm": 0.7546166181564331,
+      "learning_rate": 0.00016428708870502214,
+      "loss": 0.9199832677841187,
+      "step": 934
+    },
+    {
+      "epoch": 0.5392156862745098,
+      "grad_norm": 0.6384134292602539,
+      "learning_rate": 0.00016424860496440255,
+      "loss": 0.5693946480751038,
+      "step": 935
+    },
+    {
+      "epoch": 0.5397923875432526,
+      "grad_norm": 0.8509575128555298,
+      "learning_rate": 0.00016421012122378297,
+      "loss": 1.3604402542114258,
+      "step": 936
+    },
+    {
+      "epoch": 0.5403690888119954,
+      "grad_norm": 1.0863171815872192,
+      "learning_rate": 0.00016417163748316338,
+      "loss": 1.441767692565918,
+      "step": 937
+    },
+    {
+      "epoch": 0.5409457900807382,
+      "grad_norm": 0.7680332064628601,
+      "learning_rate": 0.0001641331537425438,
+      "loss": 0.8990482091903687,
+      "step": 938
+    },
+    {
+      "epoch": 0.5415224913494809,
+      "grad_norm": 0.9804447889328003,
+      "learning_rate": 0.0001640946700019242,
+      "loss": 1.0421537160873413,
+      "step": 939
+    },
+    {
+      "epoch": 0.5420991926182238,
+      "grad_norm": 1.0693145990371704,
+      "learning_rate": 0.0001640561862613046,
+      "loss": 1.1600146293640137,
+      "step": 940
+    },
+    {
+      "epoch": 0.5426758938869666,
+      "grad_norm": 0.8488958477973938,
+      "learning_rate": 0.00016401770252068502,
+      "loss": 1.2710307836532593,
+      "step": 941
+    },
+    {
+      "epoch": 0.5432525951557093,
+      "grad_norm": 1.048317313194275,
+      "learning_rate": 0.00016397921878006543,
+      "loss": 0.8453274369239807,
+      "step": 942
+    },
+    {
+      "epoch": 0.5438292964244521,
+      "grad_norm": 0.7326422929763794,
+      "learning_rate": 0.00016394073503944585,
+      "loss": 1.0167326927185059,
+      "step": 943
+    },
+    {
+      "epoch": 0.544405997693195,
+      "grad_norm": 0.877862274646759,
+      "learning_rate": 0.00016390225129882626,
+      "loss": 0.9589974880218506,
+      "step": 944
+    },
+    {
+      "epoch": 0.5449826989619377,
+      "grad_norm": 0.8096463680267334,
+      "learning_rate": 0.00016386376755820668,
+      "loss": 0.8364965915679932,
+      "step": 945
+    },
+    {
+      "epoch": 0.5455594002306805,
+      "grad_norm": 0.9232637882232666,
+      "learning_rate": 0.0001638252838175871,
+      "loss": 0.9332213997840881,
+      "step": 946
+    },
+    {
+      "epoch": 0.5461361014994233,
+      "grad_norm": 0.7885507941246033,
+      "learning_rate": 0.0001637868000769675,
+      "loss": 1.0532820224761963,
+      "step": 947
+    },
+    {
+      "epoch": 0.5467128027681661,
+      "grad_norm": 0.914097249507904,
+      "learning_rate": 0.0001637483163363479,
+      "loss": 0.8059665560722351,
+      "step": 948
+    },
+    {
+      "epoch": 0.5472895040369089,
+      "grad_norm": 0.8124399781227112,
+      "learning_rate": 0.0001637098325957283,
+      "loss": 0.7342300415039062,
+      "step": 949
+    },
+    {
+      "epoch": 0.5478662053056517,
+      "grad_norm": 0.8677952289581299,
+      "learning_rate": 0.00016367134885510873,
+      "loss": 1.2200864553451538,
+      "step": 950
+    },
+    {
+      "epoch": 0.5484429065743944,
+      "grad_norm": 0.8235622048377991,
+      "learning_rate": 0.00016363286511448914,
+      "loss": 1.2276276350021362,
+      "step": 951
+    },
+    {
+      "epoch": 0.5490196078431373,
+      "grad_norm": 0.8734779953956604,
+      "learning_rate": 0.00016359438137386956,
+      "loss": 1.481785535812378,
+      "step": 952
+    },
+    {
+      "epoch": 0.5495963091118801,
+      "grad_norm": 0.7058696746826172,
+      "learning_rate": 0.00016355589763324997,
+      "loss": 0.8971320390701294,
+      "step": 953
+    },
+    {
+      "epoch": 0.5501730103806228,
+      "grad_norm": 0.7818495035171509,
+      "learning_rate": 0.0001635174138926304,
+      "loss": 0.9900298118591309,
+      "step": 954
+    },
+    {
+      "epoch": 0.5507497116493656,
+      "grad_norm": 0.9933992028236389,
+      "learning_rate": 0.0001634789301520108,
+      "loss": 1.377812147140503,
+      "step": 955
+    },
+    {
+      "epoch": 0.5513264129181085,
+      "grad_norm": 0.6487358808517456,
+      "learning_rate": 0.0001634404464113912,
+      "loss": 0.8082116842269897,
+      "step": 956
+    },
+    {
+      "epoch": 0.5519031141868512,
+      "grad_norm": 0.7896233201026917,
+      "learning_rate": 0.0001634019626707716,
+      "loss": 0.8894538879394531,
+      "step": 957
+    },
+    {
+      "epoch": 0.552479815455594,
+      "grad_norm": 0.5499460697174072,
+      "learning_rate": 0.00016336347893015202,
+      "loss": 0.7779909372329712,
+      "step": 958
+    },
+    {
+      "epoch": 0.5530565167243368,
+      "grad_norm": 0.7304683327674866,
+      "learning_rate": 0.00016332499518953244,
+      "loss": 0.9466789960861206,
+      "step": 959
+    },
+    {
+      "epoch": 0.5536332179930796,
+      "grad_norm": 0.8766285181045532,
+      "learning_rate": 0.00016328651144891285,
+      "loss": 0.654015064239502,
+      "step": 960
+    },
+    {
+      "epoch": 0.5542099192618224,
+      "grad_norm": 0.5168980956077576,
+      "learning_rate": 0.00016324802770829327,
+      "loss": 0.7942756414413452,
+      "step": 961
+    },
+    {
+      "epoch": 0.5547866205305652,
+      "grad_norm": 0.8975361585617065,
+      "learning_rate": 0.00016320954396767368,
+      "loss": 1.1166660785675049,
+      "step": 962
+    },
+    {
+      "epoch": 0.5553633217993079,
+      "grad_norm": 0.559033215045929,
+      "learning_rate": 0.0001631710602270541,
+      "loss": 0.7238450050354004,
+      "step": 963
+    },
+    {
+      "epoch": 0.5559400230680508,
+      "grad_norm": 0.5114202499389648,
+      "learning_rate": 0.00016313257648643449,
+      "loss": 0.8229402303695679,
+      "step": 964
+    },
+    {
+      "epoch": 0.5565167243367936,
+      "grad_norm": 0.8146692514419556,
+      "learning_rate": 0.0001630940927458149,
+      "loss": 0.9510258436203003,
+      "step": 965
+    },
+    {
+      "epoch": 0.5570934256055363,
+      "grad_norm": 0.7686490416526794,
+      "learning_rate": 0.00016305560900519532,
+      "loss": 1.3754280805587769,
+      "step": 966
+    },
+    {
+      "epoch": 0.5576701268742791,
+      "grad_norm": 0.6895797252655029,
+      "learning_rate": 0.00016301712526457573,
+      "loss": 0.9850455522537231,
+      "step": 967
+    },
+    {
+      "epoch": 0.558246828143022,
+      "grad_norm": 0.6049807667732239,
+      "learning_rate": 0.00016297864152395615,
+      "loss": 0.6829259395599365,
+      "step": 968
+    },
+    {
+      "epoch": 0.5588235294117647,
+      "grad_norm": 0.7376249432563782,
+      "learning_rate": 0.00016294015778333656,
+      "loss": 0.7787905931472778,
+      "step": 969
+    },
+    {
+      "epoch": 0.5594002306805075,
+      "grad_norm": 0.5940505862236023,
+      "learning_rate": 0.00016290167404271698,
+      "loss": 0.7658302783966064,
+      "step": 970
+    },
+    {
+      "epoch": 0.5599769319492502,
+      "grad_norm": 0.8353221416473389,
+      "learning_rate": 0.0001628631903020974,
+      "loss": 1.0191570520401,
+      "step": 971
+    },
+    {
+      "epoch": 0.5605536332179931,
+      "grad_norm": 0.6136527061462402,
+      "learning_rate": 0.00016282470656147778,
+      "loss": 0.9413414001464844,
+      "step": 972
+    },
+    {
+      "epoch": 0.5611303344867359,
+      "grad_norm": 0.64887535572052,
+      "learning_rate": 0.0001627862228208582,
+      "loss": 0.763261616230011,
+      "step": 973
+    },
+    {
+      "epoch": 0.5617070357554786,
+      "grad_norm": 0.8027318716049194,
+      "learning_rate": 0.0001627477390802386,
+      "loss": 1.1142311096191406,
+      "step": 974
+    },
+    {
+      "epoch": 0.5622837370242214,
+      "grad_norm": 0.6630944609642029,
+      "learning_rate": 0.00016270925533961903,
+      "loss": 0.8240130543708801,
+      "step": 975
+    },
+    {
+      "epoch": 0.5628604382929643,
+      "grad_norm": 0.7404500246047974,
+      "learning_rate": 0.00016267077159899944,
+      "loss": 0.9690840244293213,
+      "step": 976
+    },
+    {
+      "epoch": 0.563437139561707,
+      "grad_norm": 1.0134172439575195,
+      "learning_rate": 0.00016263228785837986,
+      "loss": 1.4774882793426514,
+      "step": 977
+    },
+    {
+      "epoch": 0.5640138408304498,
+      "grad_norm": 0.8651242256164551,
+      "learning_rate": 0.00016259380411776027,
+      "loss": 0.898904025554657,
+      "step": 978
+    },
+    {
+      "epoch": 0.5645905420991926,
+      "grad_norm": 0.6225872039794922,
+      "learning_rate": 0.00016255532037714066,
+      "loss": 1.149839162826538,
+      "step": 979
+    },
+    {
+      "epoch": 0.5651672433679354,
+      "grad_norm": 0.5773558020591736,
+      "learning_rate": 0.00016251683663652108,
+      "loss": 0.516633152961731,
+      "step": 980
+    },
+    {
+      "epoch": 0.5657439446366782,
+      "grad_norm": 0.6350861191749573,
+      "learning_rate": 0.0001624783528959015,
+      "loss": 1.0271410942077637,
+      "step": 981
+    },
+    {
+      "epoch": 0.566320645905421,
+      "grad_norm": 0.8134899139404297,
+      "learning_rate": 0.0001624398691552819,
+      "loss": 0.8847084045410156,
+      "step": 982
+    },
+    {
+      "epoch": 0.5668973471741637,
+      "grad_norm": 0.793136477470398,
+      "learning_rate": 0.00016240138541466232,
+      "loss": 1.0517855882644653,
+      "step": 983
+    },
+    {
+      "epoch": 0.5674740484429066,
+      "grad_norm": 0.6838855743408203,
+      "learning_rate": 0.00016236290167404274,
+      "loss": 0.9592060446739197,
+      "step": 984
+    },
+    {
+      "epoch": 0.5680507497116494,
+      "grad_norm": 0.77060467004776,
+      "learning_rate": 0.00016232441793342315,
+      "loss": 1.1476876735687256,
+      "step": 985
+    },
+    {
+      "epoch": 0.5686274509803921,
+      "grad_norm": 0.6759986281394958,
+      "learning_rate": 0.00016228593419280357,
+      "loss": 0.9518548846244812,
+      "step": 986
+    },
+    {
+      "epoch": 0.5692041522491349,
+      "grad_norm": 0.6088658571243286,
+      "learning_rate": 0.00016224745045218396,
+      "loss": 0.6659010648727417,
+      "step": 987
+    },
+    {
+      "epoch": 0.5697808535178778,
+      "grad_norm": 0.9436719417572021,
+      "learning_rate": 0.00016220896671156437,
+      "loss": 1.1346865892410278,
+      "step": 988
+    },
+    {
+      "epoch": 0.5703575547866205,
+      "grad_norm": 1.0091006755828857,
+      "learning_rate": 0.0001621704829709448,
+      "loss": 1.1687716245651245,
+      "step": 989
+    },
+    {
+      "epoch": 0.5709342560553633,
+      "grad_norm": 0.9080367684364319,
+      "learning_rate": 0.0001621319992303252,
+      "loss": 1.0989638566970825,
+      "step": 990
+    },
+    {
+      "epoch": 0.5715109573241061,
+      "grad_norm": 0.7519204020500183,
+      "learning_rate": 0.00016209351548970562,
+      "loss": 1.3017445802688599,
+      "step": 991
+    },
+    {
+      "epoch": 0.5720876585928489,
+      "grad_norm": 0.545911431312561,
+      "learning_rate": 0.00016205503174908603,
+      "loss": 0.7622886300086975,
+      "step": 992
+    },
+    {
+      "epoch": 0.5726643598615917,
+      "grad_norm": 0.9163870215415955,
+      "learning_rate": 0.00016201654800846645,
+      "loss": 1.2744814157485962,
+      "step": 993
+    },
+    {
+      "epoch": 0.5732410611303345,
+      "grad_norm": 0.7644914388656616,
+      "learning_rate": 0.00016197806426784686,
+      "loss": 0.9071030616760254,
+      "step": 994
+    },
+    {
+      "epoch": 0.5738177623990772,
+      "grad_norm": 0.761933445930481,
+      "learning_rate": 0.00016193958052722725,
+      "loss": 1.0261884927749634,
+      "step": 995
+    },
+    {
+      "epoch": 0.5743944636678201,
+      "grad_norm": 0.5850253701210022,
+      "learning_rate": 0.00016190109678660767,
+      "loss": 0.8700547814369202,
+      "step": 996
+    },
+    {
+      "epoch": 0.5749711649365629,
+      "grad_norm": 0.8303119540214539,
+      "learning_rate": 0.00016186261304598808,
+      "loss": 0.7401360273361206,
+      "step": 997
+    },
+    {
+      "epoch": 0.5755478662053056,
+      "grad_norm": 0.8335464000701904,
+      "learning_rate": 0.0001618241293053685,
+      "loss": 1.058925986289978,
+      "step": 998
+    },
+    {
+      "epoch": 0.5761245674740484,
+      "grad_norm": 0.6967325806617737,
+      "learning_rate": 0.0001617856455647489,
+      "loss": 1.3550879955291748,
+      "step": 999
+    },
+    {
+      "epoch": 0.5767012687427913,
+      "grad_norm": 1.0509662628173828,
+      "learning_rate": 0.00016174716182412933,
+      "loss": 1.3809900283813477,
+      "step": 1000
+    },
+    {
+      "epoch": 0.577277970011534,
+      "grad_norm": 0.7688459157943726,
+      "learning_rate": 0.00016170867808350974,
+      "loss": 0.7888709306716919,
+      "step": 1001
+    },
+    {
+      "epoch": 0.5778546712802768,
+      "grad_norm": 1.4081027507781982,
+      "learning_rate": 0.00016167019434289016,
+      "loss": 0.8922286033630371,
+      "step": 1002
+    },
+    {
+      "epoch": 0.5784313725490197,
+      "grad_norm": 0.8513575196266174,
+      "learning_rate": 0.00016163171060227055,
+      "loss": 0.9064381718635559,
+      "step": 1003
+    },
+    {
+      "epoch": 0.5790080738177624,
+      "grad_norm": 0.8020631670951843,
+      "learning_rate": 0.00016159322686165096,
+      "loss": 1.0038318634033203,
+      "step": 1004
+    },
+    {
+      "epoch": 0.5795847750865052,
+      "grad_norm": 0.6308439373970032,
+      "learning_rate": 0.00016155474312103138,
+      "loss": 1.0535993576049805,
+      "step": 1005
+    },
+    {
+      "epoch": 0.580161476355248,
+      "grad_norm": 0.9487643837928772,
+      "learning_rate": 0.0001615162593804118,
+      "loss": 1.0733325481414795,
+      "step": 1006
+    },
+    {
+      "epoch": 0.5807381776239908,
+      "grad_norm": 0.5813226699829102,
+      "learning_rate": 0.0001614777756397922,
+      "loss": 0.6475256085395813,
+      "step": 1007
+    },
+    {
+      "epoch": 0.5813148788927336,
+      "grad_norm": 0.8787825703620911,
+      "learning_rate": 0.00016143929189917262,
+      "loss": 1.2669293880462646,
+      "step": 1008
+    },
+    {
+      "epoch": 0.5818915801614764,
+      "grad_norm": 0.5114219784736633,
+      "learning_rate": 0.00016140080815855304,
+      "loss": 0.5243850946426392,
+      "step": 1009
+    },
+    {
+      "epoch": 0.5824682814302191,
+      "grad_norm": 0.9315117597579956,
+      "learning_rate": 0.00016136232441793345,
+      "loss": 1.0958704948425293,
+      "step": 1010
+    },
+    {
+      "epoch": 0.583044982698962,
+      "grad_norm": 0.7866684794425964,
+      "learning_rate": 0.00016132384067731384,
+      "loss": 1.0202006101608276,
+      "step": 1011
+    },
+    {
+      "epoch": 0.5836216839677048,
+      "grad_norm": 0.9690834283828735,
+      "learning_rate": 0.00016128535693669426,
+      "loss": 0.7898403406143188,
+      "step": 1012
+    },
+    {
+      "epoch": 0.5841983852364475,
+      "grad_norm": 1.17559015750885,
+      "learning_rate": 0.00016124687319607467,
+      "loss": 1.0564637184143066,
+      "step": 1013
+    },
+    {
+      "epoch": 0.5847750865051903,
+      "grad_norm": 0.9403568506240845,
+      "learning_rate": 0.0001612083894554551,
+      "loss": 1.1451847553253174,
+      "step": 1014
+    },
+    {
+      "epoch": 0.5853517877739332,
+      "grad_norm": 0.7303722500801086,
+      "learning_rate": 0.0001611699057148355,
+      "loss": 1.143730878829956,
+      "step": 1015
+    },
+    {
+      "epoch": 0.5859284890426759,
+      "grad_norm": 0.9661723375320435,
+      "learning_rate": 0.00016113142197421592,
+      "loss": 1.1612937450408936,
+      "step": 1016
+    },
+    {
+      "epoch": 0.5865051903114187,
+      "grad_norm": 0.9506820440292358,
+      "learning_rate": 0.0001610929382335963,
+      "loss": 1.3300495147705078,
+      "step": 1017
+    },
+    {
+      "epoch": 0.5870818915801614,
+      "grad_norm": 0.9524713754653931,
+      "learning_rate": 0.00016105445449297672,
+      "loss": 1.4797887802124023,
+      "step": 1018
+    },
+    {
+      "epoch": 0.5876585928489043,
+      "grad_norm": 0.8756133317947388,
+      "learning_rate": 0.00016101597075235714,
+      "loss": 1.0017035007476807,
+      "step": 1019
+    },
+    {
+      "epoch": 0.5882352941176471,
+      "grad_norm": 0.8561094403266907,
+      "learning_rate": 0.00016097748701173752,
+      "loss": 1.4500423669815063,
+      "step": 1020
+    },
+    {
+      "epoch": 0.5888119953863898,
+      "grad_norm": 0.7503087520599365,
+      "learning_rate": 0.00016093900327111794,
+      "loss": 1.0606659650802612,
+      "step": 1021
+    },
+    {
+      "epoch": 0.5893886966551326,
+      "grad_norm": 0.5415161848068237,
+      "learning_rate": 0.00016090051953049836,
+      "loss": 0.6421483159065247,
+      "step": 1022
+    },
+    {
+      "epoch": 0.5899653979238755,
+      "grad_norm": 0.6148718595504761,
+      "learning_rate": 0.00016086203578987877,
+      "loss": 0.94537353515625,
+      "step": 1023
+    },
+    {
+      "epoch": 0.5905420991926182,
+      "grad_norm": 0.7274061441421509,
+      "learning_rate": 0.00016082355204925919,
+      "loss": 1.1045122146606445,
+      "step": 1024
+    },
+    {
+      "epoch": 0.591118800461361,
+      "grad_norm": 1.0995570421218872,
+      "learning_rate": 0.0001607850683086396,
+      "loss": 1.0006502866744995,
+      "step": 1025
+    },
+    {
+      "epoch": 0.5916955017301038,
+      "grad_norm": 0.6411669850349426,
+      "learning_rate": 0.00016074658456802002,
+      "loss": 0.8185054063796997,
+      "step": 1026
+    },
+    {
+      "epoch": 0.5922722029988466,
+      "grad_norm": 0.8972517848014832,
+      "learning_rate": 0.00016070810082740043,
+      "loss": 1.0834156274795532,
+      "step": 1027
+    },
+    {
+      "epoch": 0.5928489042675894,
+      "grad_norm": 1.3362998962402344,
+      "learning_rate": 0.00016066961708678082,
+      "loss": 1.3157958984375,
+      "step": 1028
+    },
+    {
+      "epoch": 0.5934256055363322,
+      "grad_norm": 0.9085165858268738,
+      "learning_rate": 0.00016063113334616124,
+      "loss": 1.0817850828170776,
+      "step": 1029
+    },
+    {
+      "epoch": 0.5940023068050749,
+      "grad_norm": 1.028162956237793,
+      "learning_rate": 0.00016059264960554165,
+      "loss": 1.324896216392517,
+      "step": 1030
+    },
+    {
+      "epoch": 0.5945790080738178,
+      "grad_norm": 0.6264161467552185,
+      "learning_rate": 0.00016055416586492207,
+      "loss": 0.7769796848297119,
+      "step": 1031
+    },
+    {
+      "epoch": 0.5951557093425606,
+      "grad_norm": 0.6027923822402954,
+      "learning_rate": 0.00016051568212430248,
+      "loss": 0.7691771388053894,
+      "step": 1032
+    },
+    {
+      "epoch": 0.5957324106113033,
+      "grad_norm": 1.1957632303237915,
+      "learning_rate": 0.0001604771983836829,
+      "loss": 1.5915735960006714,
+      "step": 1033
+    },
+    {
+      "epoch": 0.5963091118800461,
+      "grad_norm": 0.8243029713630676,
+      "learning_rate": 0.0001604387146430633,
+      "loss": 1.4467861652374268,
+      "step": 1034
+    },
+    {
+      "epoch": 0.596885813148789,
+      "grad_norm": 0.9241074919700623,
+      "learning_rate": 0.00016040023090244373,
+      "loss": 1.2037115097045898,
+      "step": 1035
+    },
+    {
+      "epoch": 0.5974625144175317,
+      "grad_norm": 0.7573208212852478,
+      "learning_rate": 0.00016036174716182411,
+      "loss": 1.111187219619751,
+      "step": 1036
+    },
+    {
+      "epoch": 0.5980392156862745,
+      "grad_norm": 0.9766779541969299,
+      "learning_rate": 0.00016032326342120453,
+      "loss": 1.3394712209701538,
+      "step": 1037
+    },
+    {
+      "epoch": 0.5986159169550173,
+      "grad_norm": 0.7223910093307495,
+      "learning_rate": 0.00016028477968058495,
+      "loss": 0.9714270830154419,
+      "step": 1038
+    },
+    {
+      "epoch": 0.5991926182237601,
+      "grad_norm": 0.8372020721435547,
+      "learning_rate": 0.00016024629593996536,
+      "loss": 0.9755414724349976,
+      "step": 1039
+    },
+    {
+      "epoch": 0.5997693194925029,
+      "grad_norm": 1.060224175453186,
+      "learning_rate": 0.00016020781219934578,
+      "loss": 1.0653870105743408,
+      "step": 1040
+    },
+    {
+      "epoch": 0.6003460207612457,
+      "grad_norm": 1.0068564414978027,
+      "learning_rate": 0.0001601693284587262,
+      "loss": 1.1695475578308105,
+      "step": 1041
+    },
+    {
+      "epoch": 0.6009227220299884,
+      "grad_norm": 0.8202903866767883,
+      "learning_rate": 0.0001601308447181066,
+      "loss": 1.430415391921997,
+      "step": 1042
+    },
+    {
+      "epoch": 0.6014994232987313,
+      "grad_norm": 0.6556461453437805,
+      "learning_rate": 0.00016009236097748702,
+      "loss": 0.6565566658973694,
+      "step": 1043
+    },
+    {
+      "epoch": 0.6020761245674741,
+      "grad_norm": 1.0711745023727417,
+      "learning_rate": 0.0001600538772368674,
+      "loss": 1.4629727602005005,
+      "step": 1044
+    },
+    {
+      "epoch": 0.6026528258362168,
+      "grad_norm": 0.857792317867279,
+      "learning_rate": 0.00016001539349624783,
+      "loss": 1.375361442565918,
+      "step": 1045
+    },
+    {
+      "epoch": 0.6032295271049596,
+      "grad_norm": 0.8610656261444092,
+      "learning_rate": 0.00015997690975562824,
+      "loss": 1.319663166999817,
+      "step": 1046
+    },
+    {
+      "epoch": 0.6038062283737025,
+      "grad_norm": 0.5466272830963135,
+      "learning_rate": 0.00015993842601500866,
+      "loss": 0.9326815009117126,
+      "step": 1047
+    },
+    {
+      "epoch": 0.6043829296424452,
+      "grad_norm": 0.5424578189849854,
+      "learning_rate": 0.00015989994227438907,
+      "loss": 0.8943756818771362,
+      "step": 1048
+    },
+    {
+      "epoch": 0.604959630911188,
+      "grad_norm": 1.0392166376113892,
+      "learning_rate": 0.00015986145853376949,
+      "loss": 1.1610779762268066,
+      "step": 1049
+    },
+    {
+      "epoch": 0.6055363321799307,
+      "grad_norm": 0.7397944331169128,
+      "learning_rate": 0.0001598229747931499,
+      "loss": 0.9297494888305664,
+      "step": 1050
+    },
+    {
+      "epoch": 0.6061130334486736,
+      "grad_norm": 0.7921435832977295,
+      "learning_rate": 0.00015978449105253032,
+      "loss": 0.9271104335784912,
+      "step": 1051
+    },
+    {
+      "epoch": 0.6066897347174164,
+      "grad_norm": 1.0713645219802856,
+      "learning_rate": 0.0001597460073119107,
+      "loss": 1.429350733757019,
+      "step": 1052
+    },
+    {
+      "epoch": 0.6072664359861591,
+      "grad_norm": 0.7312497496604919,
+      "learning_rate": 0.00015970752357129112,
+      "loss": 0.9167627096176147,
+      "step": 1053
+    },
+    {
+      "epoch": 0.6078431372549019,
+      "grad_norm": 0.7499086260795593,
+      "learning_rate": 0.00015966903983067154,
+      "loss": 0.7258137464523315,
+      "step": 1054
+    },
+    {
+      "epoch": 0.6084198385236448,
+      "grad_norm": 0.7300564646720886,
+      "learning_rate": 0.00015963055609005195,
+      "loss": 1.058071494102478,
+      "step": 1055
+    },
+    {
+      "epoch": 0.6089965397923875,
+      "grad_norm": 0.652527928352356,
+      "learning_rate": 0.00015959207234943237,
+      "loss": 0.6544615030288696,
+      "step": 1056
+    },
+    {
+      "epoch": 0.6095732410611303,
+      "grad_norm": 0.7193166613578796,
+      "learning_rate": 0.00015955358860881278,
+      "loss": 0.7395502328872681,
+      "step": 1057
+    },
+    {
+      "epoch": 0.6101499423298731,
+      "grad_norm": 0.7402684092521667,
+      "learning_rate": 0.0001595151048681932,
+      "loss": 0.8958665728569031,
+      "step": 1058
+    },
+    {
+      "epoch": 0.610726643598616,
+      "grad_norm": 1.0471738576889038,
+      "learning_rate": 0.0001594766211275736,
+      "loss": 1.383862018585205,
+      "step": 1059
+    },
+    {
+      "epoch": 0.6113033448673587,
+      "grad_norm": 0.926358699798584,
+      "learning_rate": 0.000159438137386954,
+      "loss": 1.3329360485076904,
+      "step": 1060
+    },
+    {
+      "epoch": 0.6118800461361015,
+      "grad_norm": 1.3576291799545288,
+      "learning_rate": 0.00015939965364633442,
+      "loss": 1.4153847694396973,
+      "step": 1061
+    },
+    {
+      "epoch": 0.6124567474048442,
+      "grad_norm": 1.043614387512207,
+      "learning_rate": 0.00015936116990571483,
+      "loss": 1.1355584859848022,
+      "step": 1062
+    },
+    {
+      "epoch": 0.6130334486735871,
+      "grad_norm": 0.6180047988891602,
+      "learning_rate": 0.00015932268616509525,
+      "loss": 0.7877006530761719,
+      "step": 1063
+    },
+    {
+      "epoch": 0.6136101499423299,
+      "grad_norm": 1.188005805015564,
+      "learning_rate": 0.00015928420242447566,
+      "loss": 1.185757040977478,
+      "step": 1064
+    },
+    {
+      "epoch": 0.6141868512110726,
+      "grad_norm": 0.6937184929847717,
+      "learning_rate": 0.00015924571868385608,
+      "loss": 0.8133529424667358,
+      "step": 1065
+    },
+    {
+      "epoch": 0.6147635524798154,
+      "grad_norm": 0.5152422785758972,
+      "learning_rate": 0.0001592072349432365,
+      "loss": 0.6955524682998657,
+      "step": 1066
+    },
+    {
+      "epoch": 0.6153402537485583,
+      "grad_norm": 0.8295215964317322,
+      "learning_rate": 0.0001591687512026169,
+      "loss": 0.9180642366409302,
+      "step": 1067
+    },
+    {
+      "epoch": 0.615916955017301,
+      "grad_norm": 1.131622314453125,
+      "learning_rate": 0.0001591302674619973,
+      "loss": 1.2194663286209106,
+      "step": 1068
+    },
+    {
+      "epoch": 0.6164936562860438,
+      "grad_norm": 0.744301438331604,
+      "learning_rate": 0.0001590917837213777,
+      "loss": 0.9852138161659241,
+      "step": 1069
+    },
+    {
+      "epoch": 0.6170703575547867,
+      "grad_norm": 0.7841970920562744,
+      "learning_rate": 0.00015905329998075813,
+      "loss": 1.302487850189209,
+      "step": 1070
+    },
+    {
+      "epoch": 0.6176470588235294,
+      "grad_norm": 0.6610711216926575,
+      "learning_rate": 0.00015901481624013854,
+      "loss": 0.8427870273590088,
+      "step": 1071
+    },
+    {
+      "epoch": 0.6182237600922722,
+      "grad_norm": 0.9735661745071411,
+      "learning_rate": 0.00015897633249951896,
+      "loss": 1.1720025539398193,
+      "step": 1072
+    },
+    {
+      "epoch": 0.618800461361015,
+      "grad_norm": 0.6673301458358765,
+      "learning_rate": 0.00015893784875889937,
+      "loss": 1.0172441005706787,
+      "step": 1073
+    },
+    {
+      "epoch": 0.6193771626297578,
+      "grad_norm": 1.0327497720718384,
+      "learning_rate": 0.0001588993650182798,
+      "loss": 1.168729305267334,
+      "step": 1074
+    },
+    {
+      "epoch": 0.6199538638985006,
+      "grad_norm": 0.6887943744659424,
+      "learning_rate": 0.0001588608812776602,
+      "loss": 0.9284838438034058,
+      "step": 1075
+    },
+    {
+      "epoch": 0.6205305651672434,
+      "grad_norm": 0.6660910844802856,
+      "learning_rate": 0.0001588223975370406,
+      "loss": 1.1769919395446777,
+      "step": 1076
+    },
+    {
+      "epoch": 0.6211072664359861,
+      "grad_norm": 0.7416674494743347,
+      "learning_rate": 0.000158783913796421,
+      "loss": 0.750725269317627,
+      "step": 1077
+    },
+    {
+      "epoch": 0.621683967704729,
+      "grad_norm": 0.6302111148834229,
+      "learning_rate": 0.00015874543005580142,
+      "loss": 0.8207563161849976,
+      "step": 1078
+    },
+    {
+      "epoch": 0.6222606689734718,
+      "grad_norm": 0.720021665096283,
+      "learning_rate": 0.00015870694631518184,
+      "loss": 1.133636474609375,
+      "step": 1079
+    },
+    {
+      "epoch": 0.6228373702422145,
+      "grad_norm": 0.9188029170036316,
+      "learning_rate": 0.00015866846257456225,
+      "loss": 1.5215458869934082,
+      "step": 1080
+    },
+    {
+      "epoch": 0.6234140715109573,
+      "grad_norm": 0.7337254881858826,
+      "learning_rate": 0.00015862997883394267,
+      "loss": 0.9544572830200195,
+      "step": 1081
+    },
+    {
+      "epoch": 0.6239907727797002,
+      "grad_norm": 1.0431314706802368,
+      "learning_rate": 0.00015859149509332308,
+      "loss": 1.0790281295776367,
+      "step": 1082
+    },
+    {
+      "epoch": 0.6245674740484429,
+      "grad_norm": 0.6344501376152039,
+      "learning_rate": 0.0001585530113527035,
+      "loss": 0.9151628017425537,
+      "step": 1083
+    },
+    {
+      "epoch": 0.6251441753171857,
+      "grad_norm": 1.332190752029419,
+      "learning_rate": 0.00015851452761208389,
+      "loss": 1.5466241836547852,
+      "step": 1084
+    },
+    {
+      "epoch": 0.6257208765859285,
+      "grad_norm": 0.7802074551582336,
+      "learning_rate": 0.0001584760438714643,
+      "loss": 1.1575053930282593,
+      "step": 1085
+    },
+    {
+      "epoch": 0.6262975778546713,
+      "grad_norm": 0.5755362510681152,
+      "learning_rate": 0.00015843756013084472,
+      "loss": 0.6923443078994751,
+      "step": 1086
+    },
+    {
+      "epoch": 0.6268742791234141,
+      "grad_norm": 0.8710469007492065,
+      "learning_rate": 0.00015839907639022513,
+      "loss": 1.0893003940582275,
+      "step": 1087
+    },
+    {
+      "epoch": 0.6274509803921569,
+      "grad_norm": 0.6689137816429138,
+      "learning_rate": 0.00015836059264960555,
+      "loss": 0.9777762293815613,
+      "step": 1088
+    },
+    {
+      "epoch": 0.6280276816608996,
+      "grad_norm": 0.9923802614212036,
+      "learning_rate": 0.00015832210890898596,
+      "loss": 1.2578145265579224,
+      "step": 1089
+    },
+    {
+      "epoch": 0.6286043829296425,
+      "grad_norm": 0.7596067190170288,
+      "learning_rate": 0.00015828362516836638,
+      "loss": 1.0804511308670044,
+      "step": 1090
+    },
+    {
+      "epoch": 0.6291810841983853,
+      "grad_norm": 0.9255754947662354,
+      "learning_rate": 0.0001582451414277468,
+      "loss": 1.2536742687225342,
+      "step": 1091
+    },
+    {
+      "epoch": 0.629757785467128,
+      "grad_norm": 0.6089752912521362,
+      "learning_rate": 0.00015820665768712718,
+      "loss": 0.8234043121337891,
+      "step": 1092
+    },
+    {
+      "epoch": 0.6303344867358708,
+      "grad_norm": 0.8412203192710876,
+      "learning_rate": 0.0001581681739465076,
+      "loss": 0.8689320683479309,
+      "step": 1093
+    },
+    {
+      "epoch": 0.6309111880046137,
+      "grad_norm": 0.6300414204597473,
+      "learning_rate": 0.000158129690205888,
+      "loss": 0.8836315274238586,
+      "step": 1094
+    },
+    {
+      "epoch": 0.6314878892733564,
+      "grad_norm": 0.8622999787330627,
+      "learning_rate": 0.00015809120646526843,
+      "loss": 0.8355990648269653,
+      "step": 1095
+    },
+    {
+      "epoch": 0.6320645905420992,
+      "grad_norm": 1.0277838706970215,
+      "learning_rate": 0.00015805272272464884,
+      "loss": 1.0228278636932373,
+      "step": 1096
+    },
+    {
+      "epoch": 0.6326412918108419,
+      "grad_norm": 0.7297544479370117,
+      "learning_rate": 0.00015801423898402926,
+      "loss": 0.9207032918930054,
+      "step": 1097
+    },
+    {
+      "epoch": 0.6332179930795848,
+      "grad_norm": 0.6923787593841553,
+      "learning_rate": 0.00015797575524340967,
+      "loss": 0.8914310932159424,
+      "step": 1098
+    },
+    {
+      "epoch": 0.6337946943483276,
+      "grad_norm": 0.984605073928833,
+      "learning_rate": 0.00015793727150279006,
+      "loss": 1.030419945716858,
+      "step": 1099
+    },
+    {
+      "epoch": 0.6343713956170703,
+      "grad_norm": 0.7933477759361267,
+      "learning_rate": 0.00015789878776217048,
+      "loss": 0.8263508081436157,
+      "step": 1100
+    },
+    {
+      "epoch": 0.6349480968858131,
+      "grad_norm": 0.6690862774848938,
+      "learning_rate": 0.0001578603040215509,
+      "loss": 0.8062323927879333,
+      "step": 1101
+    },
+    {
+      "epoch": 0.635524798154556,
+      "grad_norm": 1.1080838441848755,
+      "learning_rate": 0.0001578218202809313,
+      "loss": 1.0695234537124634,
+      "step": 1102
+    },
+    {
+      "epoch": 0.6361014994232987,
+      "grad_norm": 0.7373805046081543,
+      "learning_rate": 0.00015778333654031172,
+      "loss": 0.7782353162765503,
+      "step": 1103
+    },
+    {
+      "epoch": 0.6366782006920415,
+      "grad_norm": 0.9623069167137146,
+      "learning_rate": 0.00015774485279969214,
+      "loss": 1.299721121788025,
+      "step": 1104
+    },
+    {
+      "epoch": 0.6372549019607843,
+      "grad_norm": 0.8447510004043579,
+      "learning_rate": 0.00015770636905907255,
+      "loss": 0.751670241355896,
+      "step": 1105
+    },
+    {
+      "epoch": 0.6378316032295271,
+      "grad_norm": 0.7200034260749817,
+      "learning_rate": 0.00015766788531845297,
+      "loss": 0.8565016388893127,
+      "step": 1106
+    },
+    {
+      "epoch": 0.6384083044982699,
+      "grad_norm": 0.791018545627594,
+      "learning_rate": 0.00015762940157783336,
+      "loss": 1.014164924621582,
+      "step": 1107
+    },
+    {
+      "epoch": 0.6389850057670127,
+      "grad_norm": 0.7488639950752258,
+      "learning_rate": 0.00015759091783721377,
+      "loss": 0.7353352904319763,
+      "step": 1108
+    },
+    {
+      "epoch": 0.6395617070357554,
+      "grad_norm": 0.6376444697380066,
+      "learning_rate": 0.00015755243409659419,
+      "loss": 0.8452020287513733,
+      "step": 1109
+    },
+    {
+      "epoch": 0.6401384083044983,
+      "grad_norm": 0.7400408387184143,
+      "learning_rate": 0.0001575139503559746,
+      "loss": 0.8612061738967896,
+      "step": 1110
+    },
+    {
+      "epoch": 0.6407151095732411,
+      "grad_norm": 0.630378007888794,
+      "learning_rate": 0.00015747546661535502,
+      "loss": 0.8225241899490356,
+      "step": 1111
+    },
+    {
+      "epoch": 0.6412918108419838,
+      "grad_norm": 0.7687711715698242,
+      "learning_rate": 0.00015743698287473543,
+      "loss": 1.0129132270812988,
+      "step": 1112
+    },
+    {
+      "epoch": 0.6418685121107266,
+      "grad_norm": 0.8225964903831482,
+      "learning_rate": 0.00015739849913411585,
+      "loss": 1.0317823886871338,
+      "step": 1113
+    },
+    {
+      "epoch": 0.6424452133794695,
+      "grad_norm": 0.8062997460365295,
+      "learning_rate": 0.00015736001539349626,
+      "loss": 1.2668901681900024,
+      "step": 1114
+    },
+    {
+      "epoch": 0.6430219146482122,
+      "grad_norm": 0.7937533855438232,
+      "learning_rate": 0.00015732153165287665,
+      "loss": 0.5984291434288025,
+      "step": 1115
+    },
+    {
+      "epoch": 0.643598615916955,
+      "grad_norm": 0.6556064486503601,
+      "learning_rate": 0.00015728304791225707,
+      "loss": 0.6811074018478394,
+      "step": 1116
+    },
+    {
+      "epoch": 0.6441753171856978,
+      "grad_norm": 0.6815225481987,
+      "learning_rate": 0.00015724456417163748,
+      "loss": 0.8315191268920898,
+      "step": 1117
+    },
+    {
+      "epoch": 0.6447520184544406,
+      "grad_norm": 0.8624749779701233,
+      "learning_rate": 0.0001572060804310179,
+      "loss": 1.024225115776062,
+      "step": 1118
+    },
+    {
+      "epoch": 0.6453287197231834,
+      "grad_norm": 0.9867150187492371,
+      "learning_rate": 0.0001571675966903983,
+      "loss": 1.1838812828063965,
+      "step": 1119
+    },
+    {
+      "epoch": 0.6459054209919262,
+      "grad_norm": 0.9800993204116821,
+      "learning_rate": 0.00015712911294977873,
+      "loss": 1.0964932441711426,
+      "step": 1120
+    },
+    {
+      "epoch": 0.6464821222606689,
+      "grad_norm": 0.6755380034446716,
+      "learning_rate": 0.00015709062920915914,
+      "loss": 0.6732958555221558,
+      "step": 1121
+    },
+    {
+      "epoch": 0.6470588235294118,
+      "grad_norm": 0.6237842440605164,
+      "learning_rate": 0.00015705214546853956,
+      "loss": 0.769539475440979,
+      "step": 1122
+    },
+    {
+      "epoch": 0.6476355247981546,
+      "grad_norm": 0.9327729344367981,
+      "learning_rate": 0.00015701366172791995,
+      "loss": 1.2593892812728882,
+      "step": 1123
+    },
+    {
+      "epoch": 0.6482122260668973,
+      "grad_norm": 0.7165786623954773,
+      "learning_rate": 0.00015697517798730036,
+      "loss": 0.8721244931221008,
+      "step": 1124
+    },
+    {
+      "epoch": 0.6487889273356401,
+      "grad_norm": 0.7718213200569153,
+      "learning_rate": 0.00015693669424668078,
+      "loss": 0.9298558235168457,
+      "step": 1125
+    },
+    {
+      "epoch": 0.649365628604383,
+      "grad_norm": 0.7327983975410461,
+      "learning_rate": 0.0001568982105060612,
+      "loss": 0.9947003722190857,
+      "step": 1126
+    },
+    {
+      "epoch": 0.6499423298731257,
+      "grad_norm": 0.8242558240890503,
+      "learning_rate": 0.0001568597267654416,
+      "loss": 1.3076270818710327,
+      "step": 1127
+    },
+    {
+      "epoch": 0.6505190311418685,
+      "grad_norm": 0.5866062641143799,
+      "learning_rate": 0.00015682124302482202,
+      "loss": 0.7161552309989929,
+      "step": 1128
+    },
+    {
+      "epoch": 0.6510957324106112,
+      "grad_norm": 0.690351665019989,
+      "learning_rate": 0.00015678275928420244,
+      "loss": 0.7334930896759033,
+      "step": 1129
+    },
+    {
+      "epoch": 0.6516724336793541,
+      "grad_norm": 0.7475882172584534,
+      "learning_rate": 0.00015674427554358285,
+      "loss": 0.8960260152816772,
+      "step": 1130
+    },
+    {
+      "epoch": 0.6522491349480969,
+      "grad_norm": 0.7973214983940125,
+      "learning_rate": 0.00015670579180296324,
+      "loss": 0.9681750535964966,
+      "step": 1131
+    },
+    {
+      "epoch": 0.6528258362168397,
+      "grad_norm": 0.7747503519058228,
+      "learning_rate": 0.00015666730806234366,
+      "loss": 1.051071047782898,
+      "step": 1132
+    },
+    {
+      "epoch": 0.6534025374855824,
+      "grad_norm": 0.6149755120277405,
+      "learning_rate": 0.00015662882432172407,
+      "loss": 1.0745124816894531,
+      "step": 1133
+    },
+    {
+      "epoch": 0.6539792387543253,
+      "grad_norm": 0.8245506286621094,
+      "learning_rate": 0.0001565903405811045,
+      "loss": 1.3383489847183228,
+      "step": 1134
+    },
+    {
+      "epoch": 0.654555940023068,
+      "grad_norm": 0.754502534866333,
+      "learning_rate": 0.0001565518568404849,
+      "loss": 0.709721028804779,
+      "step": 1135
+    },
+    {
+      "epoch": 0.6551326412918108,
+      "grad_norm": 0.5991480946540833,
+      "learning_rate": 0.00015651337309986532,
+      "loss": 0.6601396203041077,
+      "step": 1136
+    },
+    {
+      "epoch": 0.6557093425605537,
+      "grad_norm": 0.7160611152648926,
+      "learning_rate": 0.00015647488935924573,
+      "loss": 1.244566559791565,
+      "step": 1137
+    },
+    {
+      "epoch": 0.6562860438292965,
+      "grad_norm": 0.6996898055076599,
+      "learning_rate": 0.00015643640561862615,
+      "loss": 0.7976762056350708,
+      "step": 1138
+    },
+    {
+      "epoch": 0.6568627450980392,
+      "grad_norm": 1.1391624212265015,
+      "learning_rate": 0.00015639792187800654,
+      "loss": 1.1150181293487549,
+      "step": 1139
+    },
+    {
+      "epoch": 0.657439446366782,
+      "grad_norm": 0.6305305361747742,
+      "learning_rate": 0.00015635943813738695,
+      "loss": 0.9086626768112183,
+      "step": 1140
+    },
+    {
+      "epoch": 0.6580161476355249,
+      "grad_norm": 1.1590427160263062,
+      "learning_rate": 0.00015632095439676737,
+      "loss": 1.2399204969406128,
+      "step": 1141
+    },
+    {
+      "epoch": 0.6585928489042676,
+      "grad_norm": 0.6845443844795227,
+      "learning_rate": 0.00015628247065614778,
+      "loss": 0.9434126019477844,
+      "step": 1142
+    },
+    {
+      "epoch": 0.6591695501730104,
+      "grad_norm": 0.8011909127235413,
+      "learning_rate": 0.0001562439869155282,
+      "loss": 0.9793667197227478,
+      "step": 1143
+    },
+    {
+      "epoch": 0.6597462514417531,
+      "grad_norm": 0.7350550293922424,
+      "learning_rate": 0.0001562055031749086,
+      "loss": 1.27531099319458,
+      "step": 1144
+    },
+    {
+      "epoch": 0.660322952710496,
+      "grad_norm": 0.9062415361404419,
+      "learning_rate": 0.00015616701943428903,
+      "loss": 0.9977236986160278,
+      "step": 1145
+    },
+    {
+      "epoch": 0.6608996539792388,
+      "grad_norm": 0.8427753448486328,
+      "learning_rate": 0.00015612853569366944,
+      "loss": 1.3097494840621948,
+      "step": 1146
+    },
+    {
+      "epoch": 0.6614763552479815,
+      "grad_norm": 0.7309291958808899,
+      "learning_rate": 0.00015609005195304983,
+      "loss": 1.1841623783111572,
+      "step": 1147
+    },
+    {
+      "epoch": 0.6620530565167243,
+      "grad_norm": 0.8518312573432922,
+      "learning_rate": 0.00015605156821243025,
+      "loss": 1.0959196090698242,
+      "step": 1148
+    },
+    {
+      "epoch": 0.6626297577854672,
+      "grad_norm": 0.7902095317840576,
+      "learning_rate": 0.00015601308447181066,
+      "loss": 1.186163067817688,
+      "step": 1149
+    },
+    {
+      "epoch": 0.6632064590542099,
+      "grad_norm": 0.8482567071914673,
+      "learning_rate": 0.00015597460073119108,
+      "loss": 0.9569811820983887,
+      "step": 1150
+    },
+    {
+      "epoch": 0.6637831603229527,
+      "grad_norm": 0.5328805446624756,
+      "learning_rate": 0.0001559361169905715,
+      "loss": 0.6388610005378723,
+      "step": 1151
+    },
+    {
+      "epoch": 0.6643598615916955,
+      "grad_norm": 0.6060228943824768,
+      "learning_rate": 0.0001558976332499519,
+      "loss": 0.7743721008300781,
+      "step": 1152
+    },
+    {
+      "epoch": 0.6649365628604383,
+      "grad_norm": 0.615100085735321,
+      "learning_rate": 0.00015585914950933232,
+      "loss": 0.8808379769325256,
+      "step": 1153
+    },
+    {
+      "epoch": 0.6655132641291811,
+      "grad_norm": 1.1238489151000977,
+      "learning_rate": 0.00015582066576871274,
+      "loss": 1.2252037525177002,
+      "step": 1154
+    },
+    {
+      "epoch": 0.6660899653979239,
+      "grad_norm": 0.8212980628013611,
+      "learning_rate": 0.00015578218202809313,
+      "loss": 1.0264016389846802,
+      "step": 1155
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.8575494885444641,
+      "learning_rate": 0.00015574369828747354,
+      "loss": 0.9453893899917603,
+      "step": 1156
+    },
+    {
+      "epoch": 0.6672433679354095,
+      "grad_norm": 0.8559103608131409,
+      "learning_rate": 0.00015570521454685396,
+      "loss": 1.01399564743042,
+      "step": 1157
+    },
+    {
+      "epoch": 0.6678200692041523,
+      "grad_norm": 0.8769490122795105,
+      "learning_rate": 0.00015566673080623437,
+      "loss": 1.1861730813980103,
+      "step": 1158
+    },
+    {
+      "epoch": 0.668396770472895,
+      "grad_norm": 0.5112201571464539,
+      "learning_rate": 0.0001556282470656148,
+      "loss": 0.6198689341545105,
+      "step": 1159
+    },
+    {
+      "epoch": 0.6689734717416378,
+      "grad_norm": 0.6346172094345093,
+      "learning_rate": 0.0001555897633249952,
+      "loss": 0.757227897644043,
+      "step": 1160
+    },
+    {
+      "epoch": 0.6695501730103807,
+      "grad_norm": 0.7918882966041565,
+      "learning_rate": 0.00015555127958437562,
+      "loss": 0.7224777936935425,
+      "step": 1161
+    },
+    {
+      "epoch": 0.6701268742791234,
+      "grad_norm": 0.5124825835227966,
+      "learning_rate": 0.00015551279584375603,
+      "loss": 0.7446980476379395,
+      "step": 1162
+    },
+    {
+      "epoch": 0.6707035755478662,
+      "grad_norm": 0.6950685977935791,
+      "learning_rate": 0.00015547431210313642,
+      "loss": 0.8628665804862976,
+      "step": 1163
+    },
+    {
+      "epoch": 0.671280276816609,
+      "grad_norm": 0.8380517363548279,
+      "learning_rate": 0.00015543582836251684,
+      "loss": 1.0211181640625,
+      "step": 1164
+    },
+    {
+      "epoch": 0.6718569780853518,
+      "grad_norm": 0.732266902923584,
+      "learning_rate": 0.00015539734462189725,
+      "loss": 0.7137742042541504,
+      "step": 1165
+    },
+    {
+      "epoch": 0.6724336793540946,
+      "grad_norm": 0.7325503826141357,
+      "learning_rate": 0.00015535886088127767,
+      "loss": 1.0089268684387207,
+      "step": 1166
+    },
+    {
+      "epoch": 0.6730103806228374,
+      "grad_norm": 0.8091567158699036,
+      "learning_rate": 0.00015532037714065808,
+      "loss": 1.0261311531066895,
+      "step": 1167
+    },
+    {
+      "epoch": 0.6735870818915801,
+      "grad_norm": 0.8078528642654419,
+      "learning_rate": 0.0001552818934000385,
+      "loss": 1.0196332931518555,
+      "step": 1168
+    },
+    {
+      "epoch": 0.674163783160323,
+      "grad_norm": 0.5558749437332153,
+      "learning_rate": 0.0001552434096594189,
+      "loss": 0.8882730007171631,
+      "step": 1169
+    },
+    {
+      "epoch": 0.6747404844290658,
+      "grad_norm": 0.7303665280342102,
+      "learning_rate": 0.00015520492591879933,
+      "loss": 0.9657995700836182,
+      "step": 1170
+    },
+    {
+      "epoch": 0.6753171856978085,
+      "grad_norm": 0.7512165904045105,
+      "learning_rate": 0.00015516644217817972,
+      "loss": 1.0741921663284302,
+      "step": 1171
+    },
+    {
+      "epoch": 0.6758938869665513,
+      "grad_norm": 0.7227686047554016,
+      "learning_rate": 0.0001549788542868128,
+      "loss": 1.0935313701629639,
+      "step": 1172
+    },
+    {
+      "epoch": 0.6764705882352942,
+      "grad_norm": 0.9613728523254395,
+      "learning_rate": 0.00015494040753556324,
+      "loss": 1.0458366870880127,
+      "step": 1173
+    },
+    {
+      "epoch": 0.6770472895040369,
+      "grad_norm": 0.7592456936836243,
+      "learning_rate": 0.00015490196078431375,
+      "loss": 1.048318862915039,
+      "step": 1174
+    },
+    {
+      "epoch": 0.6776239907727797,
+      "grad_norm": 0.6358122229576111,
+      "learning_rate": 0.0001548635140330642,
+      "loss": 0.9271713495254517,
+      "step": 1175
+    },
+    {
+      "epoch": 0.6782006920415224,
+      "grad_norm": 0.6779629588127136,
+      "learning_rate": 0.0001548250672818147,
+      "loss": 0.8732894062995911,
+      "step": 1176
+    },
+    {
+      "epoch": 0.6787773933102653,
+      "grad_norm": 0.7252342700958252,
+      "learning_rate": 0.00015478662053056518,
+      "loss": 1.016528606414795,
+      "step": 1177
+    },
+    {
+      "epoch": 0.6793540945790081,
+      "grad_norm": 0.5252419710159302,
+      "learning_rate": 0.00015474817377931566,
+      "loss": 0.6656200885772705,
+      "step": 1178
+    },
+    {
+      "epoch": 0.6799307958477508,
+      "grad_norm": 0.7480099201202393,
+      "learning_rate": 0.00015470972702806614,
+      "loss": 0.9825901389122009,
+      "step": 1179
+    },
+    {
+      "epoch": 0.6805074971164936,
+      "grad_norm": 0.5403528809547424,
+      "learning_rate": 0.00015467128027681662,
+      "loss": 0.8263649344444275,
+      "step": 1180
+    },
+    {
+      "epoch": 0.6810841983852365,
+      "grad_norm": 0.909685492515564,
+      "learning_rate": 0.0001546328335255671,
+      "loss": 1.1039624214172363,
+      "step": 1181
+    },
+    {
+      "epoch": 0.6816608996539792,
+      "grad_norm": 0.6782054305076599,
+      "learning_rate": 0.00015459438677431757,
+      "loss": 0.8667647242546082,
+      "step": 1182
+    },
+    {
+      "epoch": 0.682237600922722,
+      "grad_norm": 0.9437413811683655,
+      "learning_rate": 0.00015455594002306805,
+      "loss": 1.0089085102081299,
+      "step": 1183
+    },
+    {
+      "epoch": 0.6828143021914648,
+      "grad_norm": 0.631352424621582,
+      "learning_rate": 0.00015451749327181856,
+      "loss": 0.8900731801986694,
+      "step": 1184
+    },
+    {
+      "epoch": 0.6833910034602076,
+      "grad_norm": 0.9895037412643433,
+      "learning_rate": 0.000154479046520569,
+      "loss": 1.4409505128860474,
+      "step": 1185
+    },
+    {
+      "epoch": 0.6839677047289504,
+      "grad_norm": 0.655288815498352,
+      "learning_rate": 0.00015444059976931951,
+      "loss": 0.8149420022964478,
+      "step": 1186
+    },
+    {
+      "epoch": 0.6845444059976932,
+      "grad_norm": 0.906093418598175,
+      "learning_rate": 0.00015440215301806997,
+      "loss": 1.440996527671814,
+      "step": 1187
+    },
+    {
+      "epoch": 0.6851211072664359,
+      "grad_norm": 0.7067789435386658,
+      "learning_rate": 0.00015436370626682047,
+      "loss": 0.6415053009986877,
+      "step": 1188
+    },
+    {
+      "epoch": 0.6856978085351788,
+      "grad_norm": 0.7950546741485596,
+      "learning_rate": 0.00015432525951557095,
+      "loss": 1.0555880069732666,
+      "step": 1189
+    },
+    {
+      "epoch": 0.6862745098039216,
+      "grad_norm": 0.7521815299987793,
+      "learning_rate": 0.00015428681276432143,
+      "loss": 1.0289030075073242,
+      "step": 1190
+    },
+    {
+      "epoch": 0.6868512110726643,
+      "grad_norm": 0.8053890466690063,
+      "learning_rate": 0.0001542483660130719,
+      "loss": 1.0104256868362427,
+      "step": 1191
+    },
+    {
+      "epoch": 0.6874279123414071,
+      "grad_norm": 0.8960652351379395,
+      "learning_rate": 0.00015420991926182238,
+      "loss": 1.3124630451202393,
+      "step": 1192
+    },
+    {
+      "epoch": 0.68800461361015,
+      "grad_norm": 0.6445242762565613,
+      "learning_rate": 0.00015417147251057286,
+      "loss": 0.7147958278656006,
+      "step": 1193
+    },
+    {
+      "epoch": 0.6885813148788927,
+      "grad_norm": 0.8771377801895142,
+      "learning_rate": 0.00015413302575932334,
+      "loss": 1.1068731546401978,
+      "step": 1194
+    },
+    {
+      "epoch": 0.6891580161476355,
+      "grad_norm": 0.746562659740448,
+      "learning_rate": 0.00015409457900807382,
+      "loss": 0.8577734231948853,
+      "step": 1195
+    },
+    {
+      "epoch": 0.6897347174163783,
+      "grad_norm": 0.8225957155227661,
+      "learning_rate": 0.00015405613225682432,
+      "loss": 1.137495994567871,
+      "step": 1196
+    },
+    {
+      "epoch": 0.6903114186851211,
+      "grad_norm": 1.2180874347686768,
+      "learning_rate": 0.00015401768550557478,
+      "loss": 1.3055964708328247,
+      "step": 1197
+    },
+    {
+      "epoch": 0.6908881199538639,
+      "grad_norm": 0.8417837619781494,
+      "learning_rate": 0.00015397923875432528,
+      "loss": 0.719217836856842,
+      "step": 1198
+    },
+    {
+      "epoch": 0.6914648212226067,
+      "grad_norm": 0.5893595218658447,
+      "learning_rate": 0.00015394079200307573,
+      "loss": 0.7719886302947998,
+      "step": 1199
+    },
+    {
+      "epoch": 0.6920415224913494,
+      "grad_norm": 0.6734403371810913,
+      "learning_rate": 0.00015390234525182624,
+      "loss": 0.960877537727356,
+      "step": 1200
+    },
+    {
+      "epoch": 0.6926182237600923,
+      "grad_norm": 0.7350678443908691,
+      "learning_rate": 0.00015386389850057672,
+      "loss": 1.039952278137207,
+      "step": 1201
+    },
+    {
+      "epoch": 0.6931949250288351,
+      "grad_norm": 0.8072929978370667,
+      "learning_rate": 0.0001538254517493272,
+      "loss": 0.9792311787605286,
+      "step": 1202
+    },
+    {
+      "epoch": 0.6937716262975778,
+      "grad_norm": 0.6742820739746094,
+      "learning_rate": 0.00015378700499807767,
+      "loss": 0.8704882860183716,
+      "step": 1203
+    },
+    {
+      "epoch": 0.6943483275663207,
+      "grad_norm": 0.6590847969055176,
+      "learning_rate": 0.00015374855824682815,
+      "loss": 0.7836930155754089,
+      "step": 1204
+    },
+    {
+      "epoch": 0.6949250288350635,
+      "grad_norm": 0.6364882588386536,
+      "learning_rate": 0.00015371011149557863,
+      "loss": 0.6790116429328918,
+      "step": 1205
+    },
+    {
+      "epoch": 0.6955017301038062,
+      "grad_norm": 0.8620322346687317,
+      "learning_rate": 0.0001536716647443291,
+      "loss": 1.1667858362197876,
+      "step": 1206
+    },
+    {
+      "epoch": 0.696078431372549,
+      "grad_norm": 0.9262224435806274,
+      "learning_rate": 0.00015363321799307959,
+      "loss": 1.2684681415557861,
+      "step": 1207
+    },
+    {
+      "epoch": 0.6966551326412919,
+      "grad_norm": 0.7098090052604675,
+      "learning_rate": 0.0001535947712418301,
+      "loss": 1.108170986175537,
+      "step": 1208
+    },
+    {
+      "epoch": 0.6972318339100346,
+      "grad_norm": 0.8219681978225708,
+      "learning_rate": 0.00015355632449058054,
+      "loss": 1.1987258195877075,
+      "step": 1209
+    },
+    {
+      "epoch": 0.6978085351787774,
+      "grad_norm": 0.7267138957977295,
+      "learning_rate": 0.00015351787773933105,
+      "loss": 0.8790909051895142,
+      "step": 1210
+    },
+    {
+      "epoch": 0.6983852364475202,
+      "grad_norm": 0.9880861043930054,
+      "learning_rate": 0.0001534794309880815,
+      "loss": 0.7550561428070068,
+      "step": 1211
+    },
+    {
+      "epoch": 0.698961937716263,
+      "grad_norm": 1.0179109573364258,
+      "learning_rate": 0.000153440984236832,
+      "loss": 1.2887327671051025,
+      "step": 1212
+    },
+    {
+      "epoch": 0.6995386389850058,
+      "grad_norm": 1.0065605640411377,
+      "learning_rate": 0.00015340253748558246,
+      "loss": 1.3018262386322021,
+      "step": 1213
+    },
+    {
+      "epoch": 0.7001153402537486,
+      "grad_norm": 0.7868698835372925,
+      "learning_rate": 0.00015336409073433296,
+      "loss": 1.0050418376922607,
+      "step": 1214
+    },
+    {
+      "epoch": 0.7006920415224913,
+      "grad_norm": 1.2052333354949951,
+      "learning_rate": 0.00015332564398308344,
+      "loss": 1.4229861497879028,
+      "step": 1215
+    },
+    {
+      "epoch": 0.7012687427912342,
+      "grad_norm": 0.7077322006225586,
+      "learning_rate": 0.00015328719723183392,
+      "loss": 0.6043359041213989,
+      "step": 1216
+    },
+    {
+      "epoch": 0.701845444059977,
+      "grad_norm": 0.587632417678833,
+      "learning_rate": 0.0001532487504805844,
+      "loss": 0.6483091115951538,
+      "step": 1217
+    },
+    {
+      "epoch": 0.7024221453287197,
+      "grad_norm": 0.5759986042976379,
+      "learning_rate": 0.00015321030372933487,
+      "loss": 0.8392894864082336,
+      "step": 1218
+    },
+    {
+      "epoch": 0.7029988465974625,
+      "grad_norm": 0.6800678372383118,
+      "learning_rate": 0.00015317185697808535,
+      "loss": 0.8921798467636108,
+      "step": 1219
+    },
+    {
+      "epoch": 0.7035755478662054,
+      "grad_norm": 0.7683438658714294,
+      "learning_rate": 0.00015313341022683586,
+      "loss": 0.9112846851348877,
+      "step": 1220
+    },
+    {
+      "epoch": 0.7041522491349481,
+      "grad_norm": 1.0117342472076416,
+      "learning_rate": 0.0001530949634755863,
+      "loss": 1.4151829481124878,
+      "step": 1221
+    },
+    {
+      "epoch": 0.7047289504036909,
+      "grad_norm": 0.889950156211853,
+      "learning_rate": 0.00015305651672433681,
+      "loss": 1.190742015838623,
+      "step": 1222
+    },
+    {
+      "epoch": 0.7053056516724336,
+      "grad_norm": 0.7858697772026062,
+      "learning_rate": 0.00015301806997308727,
+      "loss": 1.0679411888122559,
+      "step": 1223
+    },
+    {
+      "epoch": 0.7058823529411765,
+      "grad_norm": 0.894363522529602,
+      "learning_rate": 0.00015297962322183777,
+      "loss": 1.1472891569137573,
+      "step": 1224
+    },
+    {
+      "epoch": 0.7064590542099193,
+      "grad_norm": 0.7669128775596619,
+      "learning_rate": 0.00015294117647058822,
+      "loss": 1.1536177396774292,
+      "step": 1225
+    },
+    {
+      "epoch": 0.707035755478662,
+      "grad_norm": 0.6551662683486938,
+      "learning_rate": 0.00015290272971933873,
+      "loss": 1.1004867553710938,
+      "step": 1226
+    },
+    {
+      "epoch": 0.7076124567474048,
+      "grad_norm": 1.0020555257797241,
+      "learning_rate": 0.0001528642829680892,
+      "loss": 1.2485133409500122,
+      "step": 1227
+    },
+    {
+      "epoch": 0.7081891580161477,
+      "grad_norm": 0.725662887096405,
+      "learning_rate": 0.00015282583621683968,
+      "loss": 0.8090496063232422,
+      "step": 1228
+    },
+    {
+      "epoch": 0.7087658592848904,
+      "grad_norm": 0.8500173091888428,
+      "learning_rate": 0.00015278738946559016,
+      "loss": 1.1222527027130127,
+      "step": 1229
+    },
+    {
+      "epoch": 0.7093425605536332,
+      "grad_norm": 0.7580368518829346,
+      "learning_rate": 0.00015274894271434064,
+      "loss": 0.8194168210029602,
+      "step": 1230
+    },
+    {
+      "epoch": 0.709919261822376,
+      "grad_norm": 0.936622679233551,
+      "learning_rate": 0.00015271049596309112,
+      "loss": 0.9981272220611572,
+      "step": 1231
+    },
+    {
+      "epoch": 0.7104959630911188,
+      "grad_norm": 0.8283603191375732,
+      "learning_rate": 0.00015267204921184162,
+      "loss": 0.9328891634941101,
+      "step": 1232
+    },
+    {
+      "epoch": 0.7110726643598616,
+      "grad_norm": 1.0028311014175415,
+      "learning_rate": 0.00015263360246059208,
+      "loss": 0.9482144117355347,
+      "step": 1233
+    },
+    {
+      "epoch": 0.7116493656286044,
+      "grad_norm": 1.1841291189193726,
+      "learning_rate": 0.00015259515570934258,
+      "loss": 1.4021642208099365,
+      "step": 1234
+    },
+    {
+      "epoch": 0.7122260668973471,
+      "grad_norm": 1.0274176597595215,
+      "learning_rate": 0.00015255670895809303,
+      "loss": 1.1408722400665283,
+      "step": 1235
+    },
+    {
+      "epoch": 0.71280276816609,
+      "grad_norm": 0.8339233994483948,
+      "learning_rate": 0.00015251826220684354,
+      "loss": 1.2026294469833374,
+      "step": 1236
+    },
+    {
+      "epoch": 0.7133794694348328,
+      "grad_norm": 0.8232172727584839,
+      "learning_rate": 0.000152479815455594,
+      "loss": 1.0658057928085327,
+      "step": 1237
+    },
+    {
+      "epoch": 0.7139561707035755,
+      "grad_norm": 0.6768394708633423,
+      "learning_rate": 0.0001524413687043445,
+      "loss": 0.7539021968841553,
+      "step": 1238
+    },
+    {
+      "epoch": 0.7145328719723183,
+      "grad_norm": 1.0153294801712036,
+      "learning_rate": 0.00015240292195309497,
+      "loss": 1.1792476177215576,
+      "step": 1239
+    },
+    {
+      "epoch": 0.7151095732410612,
+      "grad_norm": 1.2099579572677612,
+      "learning_rate": 0.00015236447520184545,
+      "loss": 1.482499599456787,
+      "step": 1240
+    },
+    {
+      "epoch": 0.7156862745098039,
+      "grad_norm": 0.5826729536056519,
+      "learning_rate": 0.00015232602845059593,
+      "loss": 0.7845430374145508,
+      "step": 1241
+    },
+    {
+      "epoch": 0.7162629757785467,
+      "grad_norm": 0.7632762789726257,
+      "learning_rate": 0.0001522875816993464,
+      "loss": 0.8908877968788147,
+      "step": 1242
+    },
+    {
+      "epoch": 0.7168396770472895,
+      "grad_norm": 0.835464358329773,
+      "learning_rate": 0.00015224913494809689,
+      "loss": 1.0795903205871582,
+      "step": 1243
+    },
+    {
+      "epoch": 0.7174163783160323,
+      "grad_norm": 0.998972475528717,
+      "learning_rate": 0.0001522106881968474,
+      "loss": 0.9715967178344727,
+      "step": 1244
+    },
+    {
+      "epoch": 0.7179930795847751,
+      "grad_norm": 0.5176213383674622,
+      "learning_rate": 0.00015217224144559784,
+      "loss": 0.7307795286178589,
+      "step": 1245
+    },
+    {
+      "epoch": 0.7185697808535179,
+      "grad_norm": 1.0009640455245972,
+      "learning_rate": 0.00015213379469434835,
+      "loss": 1.253312587738037,
+      "step": 1246
+    },
+    {
+      "epoch": 0.7191464821222606,
+      "grad_norm": 1.1499648094177246,
+      "learning_rate": 0.0001520953479430988,
+      "loss": 1.2523915767669678,
+      "step": 1247
+    },
+    {
+      "epoch": 0.7197231833910035,
+      "grad_norm": 0.9233465790748596,
+      "learning_rate": 0.0001520569011918493,
+      "loss": 1.025418996810913,
+      "step": 1248
+    },
+    {
+      "epoch": 0.7202998846597463,
+      "grad_norm": 0.5469316840171814,
+      "learning_rate": 0.00015201845444059975,
+      "loss": 0.6671372652053833,
+      "step": 1249
+    },
+    {
+      "epoch": 0.720876585928489,
+      "grad_norm": 0.7743379473686218,
+      "learning_rate": 0.00015198000768935026,
+      "loss": 1.2212378978729248,
+      "step": 1250
+    },
+    {
+      "epoch": 0.7214532871972318,
+      "grad_norm": 0.971682608127594,
+      "learning_rate": 0.00015194156093810074,
+      "loss": 1.2435131072998047,
+      "step": 1251
+    },
+    {
+      "epoch": 0.7220299884659747,
+      "grad_norm": 0.9899376630783081,
+      "learning_rate": 0.00015190311418685122,
+      "loss": 1.2595231533050537,
+      "step": 1252
+    },
+    {
+      "epoch": 0.7226066897347174,
+      "grad_norm": 0.8441123962402344,
+      "learning_rate": 0.0001518646674356017,
+      "loss": 0.9278808832168579,
+      "step": 1253
+    },
+    {
+      "epoch": 0.7231833910034602,
+      "grad_norm": 0.5254001021385193,
+      "learning_rate": 0.00015182622068435217,
+      "loss": 0.786496102809906,
+      "step": 1254
+    },
+    {
+      "epoch": 0.723760092272203,
+      "grad_norm": 0.9715943932533264,
+      "learning_rate": 0.00015178777393310265,
+      "loss": 0.9957152605056763,
+      "step": 1255
+    },
+    {
+      "epoch": 0.7243367935409458,
+      "grad_norm": 0.9919838905334473,
+      "learning_rate": 0.00015174932718185316,
+      "loss": 1.3595893383026123,
+      "step": 1256
+    },
+    {
+      "epoch": 0.7249134948096886,
+      "grad_norm": 0.7739357352256775,
+      "learning_rate": 0.0001517108804306036,
+      "loss": 0.7901654839515686,
+      "step": 1257
+    },
+    {
+      "epoch": 0.7254901960784313,
+      "grad_norm": 0.996926486492157,
+      "learning_rate": 0.00015167243367935411,
+      "loss": 1.0908658504486084,
+      "step": 1258
+    },
+    {
+      "epoch": 0.7260668973471741,
+      "grad_norm": 0.6757825016975403,
+      "learning_rate": 0.00015163398692810456,
+      "loss": 0.7795881032943726,
+      "step": 1259
+    },
+    {
+      "epoch": 0.726643598615917,
+      "grad_norm": 0.9458150863647461,
+      "learning_rate": 0.00015159554017685507,
+      "loss": 1.0505211353302002,
+      "step": 1260
+    },
+    {
+      "epoch": 0.7272202998846597,
+      "grad_norm": 0.8086127638816833,
+      "learning_rate": 0.00015155709342560552,
+      "loss": 0.9041070938110352,
+      "step": 1261
+    },
+    {
+      "epoch": 0.7277970011534025,
+      "grad_norm": 0.6491602659225464,
+      "learning_rate": 0.00015151864667435603,
+      "loss": 0.9067816734313965,
+      "step": 1262
+    },
+    {
+      "epoch": 0.7283737024221453,
+      "grad_norm": 0.5835777521133423,
+      "learning_rate": 0.0001514801999231065,
+      "loss": 0.7853602170944214,
+      "step": 1263
+    },
+    {
+      "epoch": 0.7289504036908881,
+      "grad_norm": 0.8881536722183228,
+      "learning_rate": 0.00015144175317185698,
+      "loss": 1.2767361402511597,
+      "step": 1264
+    },
+    {
+      "epoch": 0.7295271049596309,
+      "grad_norm": 0.6160046458244324,
+      "learning_rate": 0.00015140330642060746,
+      "loss": 0.7595696449279785,
+      "step": 1265
+    },
+    {
+      "epoch": 0.7301038062283737,
+      "grad_norm": 0.7877328991889954,
+      "learning_rate": 0.00015136485966935794,
+      "loss": 0.9727606773376465,
+      "step": 1266
+    },
+    {
+      "epoch": 0.7306805074971164,
+      "grad_norm": 0.6233464479446411,
+      "learning_rate": 0.00015132641291810842,
+      "loss": 0.6097822785377502,
+      "step": 1267
+    },
+    {
+      "epoch": 0.7312572087658593,
+      "grad_norm": 0.8846599459648132,
+      "learning_rate": 0.00015128796616685892,
+      "loss": 1.314606785774231,
+      "step": 1268
+    },
+    {
+      "epoch": 0.7318339100346021,
+      "grad_norm": 0.6752328872680664,
+      "learning_rate": 0.00015124951941560937,
+      "loss": 0.9257625341415405,
+      "step": 1269
+    },
+    {
+      "epoch": 0.7324106113033448,
+      "grad_norm": 0.6147440075874329,
+      "learning_rate": 0.00015121107266435988,
+      "loss": 0.7304266691207886,
+      "step": 1270
+    },
+    {
+      "epoch": 0.7329873125720877,
+      "grad_norm": 0.8625065088272095,
+      "learning_rate": 0.00015117262591311033,
+      "loss": 1.2385823726654053,
+      "step": 1271
+    },
+    {
+      "epoch": 0.7335640138408305,
+      "grad_norm": 0.6224170923233032,
+      "learning_rate": 0.00015113417916186084,
+      "loss": 0.7687395215034485,
+      "step": 1272
+    },
+    {
+      "epoch": 0.7341407151095732,
+      "grad_norm": 0.839799165725708,
+      "learning_rate": 0.0001510957324106113,
+      "loss": 1.0231621265411377,
+      "step": 1273
+    },
+    {
+      "epoch": 0.734717416378316,
+      "grad_norm": 0.8609519600868225,
+      "learning_rate": 0.0001510572856593618,
+      "loss": 1.1030302047729492,
+      "step": 1274
+    },
+    {
+      "epoch": 0.7352941176470589,
+      "grad_norm": 0.8059080243110657,
+      "learning_rate": 0.00015101883890811227,
+      "loss": 1.307667851448059,
+      "step": 1275
+    },
+    {
+      "epoch": 0.7358708189158016,
+      "grad_norm": 0.7881230115890503,
+      "learning_rate": 0.00015098039215686275,
+      "loss": 0.8685023784637451,
+      "step": 1276
+    },
+    {
+      "epoch": 0.7364475201845444,
+      "grad_norm": 0.6535466909408569,
+      "learning_rate": 0.00015094194540561323,
+      "loss": 0.8849316835403442,
+      "step": 1277
+    },
+    {
+      "epoch": 0.7370242214532872,
+      "grad_norm": 0.664448082447052,
+      "learning_rate": 0.0001509034986543637,
+      "loss": 0.809040904045105,
+      "step": 1278
+    },
+    {
+      "epoch": 0.73760092272203,
+      "grad_norm": 0.9526609182357788,
+      "learning_rate": 0.00015086505190311418,
+      "loss": 1.2887682914733887,
+      "step": 1279
+    },
+    {
+      "epoch": 0.7381776239907728,
+      "grad_norm": 0.8947210907936096,
+      "learning_rate": 0.00015082660515186466,
+      "loss": 1.0613007545471191,
+      "step": 1280
+    },
+    {
+      "epoch": 0.7387543252595156,
+      "grad_norm": 0.9127343893051147,
+      "learning_rate": 0.00015078815840061514,
+      "loss": 0.9401702284812927,
+      "step": 1281
+    },
+    {
+      "epoch": 0.7393310265282583,
+      "grad_norm": 1.0288292169570923,
+      "learning_rate": 0.00015074971164936565,
+      "loss": 1.2102299928665161,
+      "step": 1282
+    },
+    {
+      "epoch": 0.7399077277970012,
+      "grad_norm": 0.6608892679214478,
+      "learning_rate": 0.0001507112648981161,
+      "loss": 0.7817317247390747,
+      "step": 1283
+    },
+    {
+      "epoch": 0.740484429065744,
+      "grad_norm": 0.5857222080230713,
+      "learning_rate": 0.0001506728181468666,
+      "loss": 0.7468012571334839,
+      "step": 1284
+    },
+    {
+      "epoch": 0.7410611303344867,
+      "grad_norm": 0.6499783992767334,
+      "learning_rate": 0.00015063437139561708,
+      "loss": 0.7113574147224426,
+      "step": 1285
+    },
+    {
+      "epoch": 0.7416378316032295,
+      "grad_norm": 0.718450129032135,
+      "learning_rate": 0.00015059592464436756,
+      "loss": 0.9823046326637268,
+      "step": 1286
+    },
+    {
+      "epoch": 0.7422145328719724,
+      "grad_norm": 0.7987701296806335,
+      "learning_rate": 0.00015055747789311804,
+      "loss": 0.9410796761512756,
+      "step": 1287
+    },
+    {
+      "epoch": 0.7427912341407151,
+      "grad_norm": 0.7227610349655151,
+      "learning_rate": 0.00015051903114186852,
+      "loss": 0.7366760969161987,
+      "step": 1288
+    },
+    {
+      "epoch": 0.7433679354094579,
+      "grad_norm": 0.9411056637763977,
+      "learning_rate": 0.000150480584390619,
+      "loss": 0.9475510120391846,
+      "step": 1289
+    },
+    {
+      "epoch": 0.7439446366782007,
+      "grad_norm": 0.5987991690635681,
+      "learning_rate": 0.00015044213763936947,
+      "loss": 0.8084846138954163,
+      "step": 1290
+    },
+    {
+      "epoch": 0.7445213379469435,
+      "grad_norm": 0.6214851140975952,
+      "learning_rate": 0.00015040369088811995,
+      "loss": 0.6952444911003113,
+      "step": 1291
+    },
+    {
+      "epoch": 0.7450980392156863,
+      "grad_norm": 0.7398913502693176,
+      "learning_rate": 0.00015036524413687043,
+      "loss": 0.8432753086090088,
+      "step": 1292
+    },
+    {
+      "epoch": 0.745674740484429,
+      "grad_norm": 0.8513553142547607,
+      "learning_rate": 0.0001503267973856209,
+      "loss": 0.8751744627952576,
+      "step": 1293
+    },
+    {
+      "epoch": 0.7462514417531718,
+      "grad_norm": 0.7704481482505798,
+      "learning_rate": 0.0001502883506343714,
+      "loss": 0.9727562665939331,
+      "step": 1294
+    },
+    {
+      "epoch": 0.7468281430219147,
+      "grad_norm": 0.6925477385520935,
+      "learning_rate": 0.0001502499038831219,
+      "loss": 1.044316291809082,
+      "step": 1295
+    },
+    {
+      "epoch": 0.7474048442906575,
+      "grad_norm": 0.8089653253555298,
+      "learning_rate": 0.00015021145713187237,
+      "loss": 0.9385859966278076,
+      "step": 1296
+    },
+    {
+      "epoch": 0.7479815455594002,
+      "grad_norm": 0.8045443296432495,
+      "learning_rate": 0.00015017301038062285,
+      "loss": 1.093725562095642,
+      "step": 1297
+    },
+    {
+      "epoch": 0.748558246828143,
+      "grad_norm": 0.8403393626213074,
+      "learning_rate": 0.00015013456362937333,
+      "loss": 0.7081382870674133,
+      "step": 1298
+    },
+    {
+      "epoch": 0.7491349480968859,
+      "grad_norm": 0.8455471992492676,
+      "learning_rate": 0.0001500961168781238,
+      "loss": 1.2357611656188965,
+      "step": 1299
+    },
+    {
+      "epoch": 0.7497116493656286,
+      "grad_norm": 0.8819023966789246,
+      "learning_rate": 0.00015005767012687428,
+      "loss": 1.2907012701034546,
+      "step": 1300
+    },
+    {
+      "epoch": 0.7502883506343714,
+      "grad_norm": 0.6467103362083435,
+      "learning_rate": 0.00015001922337562476,
+      "loss": 0.7991781830787659,
+      "step": 1301
+    },
+    {
+      "epoch": 0.7508650519031141,
+      "grad_norm": 1.0841728448867798,
+      "learning_rate": 0.00014998077662437524,
+      "loss": 1.156419038772583,
+      "step": 1302
+    },
+    {
+      "epoch": 0.751441753171857,
+      "grad_norm": 0.4863538146018982,
+      "learning_rate": 0.00014994232987312572,
+      "loss": 0.5481974482536316,
+      "step": 1303
+    },
+    {
+      "epoch": 0.7520184544405998,
+      "grad_norm": 0.631119966506958,
+      "learning_rate": 0.0001499038831218762,
+      "loss": 0.7421573996543884,
+      "step": 1304
+    },
+    {
+      "epoch": 0.7525951557093425,
+      "grad_norm": 0.6919093728065491,
+      "learning_rate": 0.0001498654363706267,
+      "loss": 0.6554936170578003,
+      "step": 1305
+    },
+    {
+      "epoch": 0.7531718569780853,
+      "grad_norm": 0.7746281027793884,
+      "learning_rate": 0.00014982698961937718,
+      "loss": 0.9226951599121094,
+      "step": 1306
+    },
+    {
+      "epoch": 0.7537485582468282,
+      "grad_norm": 0.821020245552063,
+      "learning_rate": 0.00014978854286812766,
+      "loss": 1.2231357097625732,
+      "step": 1307
+    },
+    {
+      "epoch": 0.754325259515571,
+      "grad_norm": 0.6167652606964111,
+      "learning_rate": 0.00014975009611687814,
+      "loss": 0.9597879648208618,
+      "step": 1308
+    },
+    {
+      "epoch": 0.7549019607843137,
+      "grad_norm": 0.6786548495292664,
+      "learning_rate": 0.00014971164936562861,
+      "loss": 0.8253003358840942,
+      "step": 1309
+    },
+    {
+      "epoch": 0.7554786620530565,
+      "grad_norm": 0.9683876037597656,
+      "learning_rate": 0.0001496732026143791,
+      "loss": 1.1294584274291992,
+      "step": 1310
+    },
+    {
+      "epoch": 0.7560553633217993,
+      "grad_norm": 0.8556981086730957,
+      "learning_rate": 0.00014963475586312957,
+      "loss": 1.009643316268921,
+      "step": 1311
+    },
+    {
+      "epoch": 0.7566320645905421,
+      "grad_norm": 0.7639108896255493,
+      "learning_rate": 0.00014959630911188005,
+      "loss": 0.8871880769729614,
+      "step": 1312
+    },
+    {
+      "epoch": 0.7572087658592849,
+      "grad_norm": 0.9662507176399231,
+      "learning_rate": 0.00014955786236063053,
+      "loss": 1.2890512943267822,
+      "step": 1313
+    },
+    {
+      "epoch": 0.7577854671280276,
+      "grad_norm": 0.7260032892227173,
+      "learning_rate": 0.000149519415609381,
+      "loss": 1.2696185111999512,
+      "step": 1314
+    },
+    {
+      "epoch": 0.7583621683967705,
+      "grad_norm": 1.0413408279418945,
+      "learning_rate": 0.0001494809688581315,
+      "loss": 1.2239567041397095,
+      "step": 1315
+    },
+    {
+      "epoch": 0.7589388696655133,
+      "grad_norm": 0.9003005623817444,
+      "learning_rate": 0.00014944252210688196,
+      "loss": 1.248561143875122,
+      "step": 1316
+    },
+    {
+      "epoch": 0.759515570934256,
+      "grad_norm": 0.9604087471961975,
+      "learning_rate": 0.00014940407535563247,
+      "loss": 1.2369884252548218,
+      "step": 1317
+    },
+    {
+      "epoch": 0.7600922722029988,
+      "grad_norm": 0.7198401093482971,
+      "learning_rate": 0.00014936562860438295,
+      "loss": 0.743487536907196,
+      "step": 1318
+    },
+    {
+      "epoch": 0.7606689734717417,
+      "grad_norm": 0.7526591420173645,
+      "learning_rate": 0.00014932718185313342,
+      "loss": 0.7714953422546387,
+      "step": 1319
+    },
+    {
+      "epoch": 0.7612456747404844,
+      "grad_norm": 1.1336771249771118,
+      "learning_rate": 0.0001492887351018839,
+      "loss": 1.1577683687210083,
+      "step": 1320
+    },
+    {
+      "epoch": 0.7618223760092272,
+      "grad_norm": 0.7607272267341614,
+      "learning_rate": 0.00014925028835063438,
+      "loss": 0.903020977973938,
+      "step": 1321
+    },
+    {
+      "epoch": 0.76239907727797,
+      "grad_norm": 0.7855517268180847,
+      "learning_rate": 0.00014921184159938486,
+      "loss": 0.9421197772026062,
+      "step": 1322
+    },
+    {
+      "epoch": 0.7629757785467128,
+      "grad_norm": 0.9380967020988464,
+      "learning_rate": 0.00014917339484813534,
+      "loss": 1.0594120025634766,
+      "step": 1323
+    },
+    {
+      "epoch": 0.7635524798154556,
+      "grad_norm": 0.9255303740501404,
+      "learning_rate": 0.00014913494809688582,
+      "loss": 1.1912791728973389,
+      "step": 1324
+    },
+    {
+      "epoch": 0.7641291810841984,
+      "grad_norm": 0.7085497379302979,
+      "learning_rate": 0.00014909650134563632,
+      "loss": 0.7702199816703796,
+      "step": 1325
+    },
+    {
+      "epoch": 0.7647058823529411,
+      "grad_norm": 0.8080468773841858,
+      "learning_rate": 0.00014905805459438677,
+      "loss": 0.9640858769416809,
+      "step": 1326
+    },
+    {
+      "epoch": 0.765282583621684,
+      "grad_norm": 0.8854598999023438,
+      "learning_rate": 0.00014901960784313728,
+      "loss": 1.0912519693374634,
+      "step": 1327
+    },
+    {
+      "epoch": 0.7658592848904268,
+      "grad_norm": 1.158070683479309,
+      "learning_rate": 0.00014898116109188773,
+      "loss": 1.259207010269165,
+      "step": 1328
+    },
+    {
+      "epoch": 0.7664359861591695,
+      "grad_norm": 0.7163742780685425,
+      "learning_rate": 0.00014894271434063823,
+      "loss": 0.9091912508010864,
+      "step": 1329
+    },
+    {
+      "epoch": 0.7670126874279123,
+      "grad_norm": 0.6578546762466431,
+      "learning_rate": 0.0001489042675893887,
+      "loss": 1.13603937625885,
+      "step": 1330
+    },
+    {
+      "epoch": 0.7675893886966552,
+      "grad_norm": 0.641118586063385,
+      "learning_rate": 0.0001488658208381392,
+      "loss": 0.6926564574241638,
+      "step": 1331
+    },
+    {
+      "epoch": 0.7681660899653979,
+      "grad_norm": 1.3342225551605225,
+      "learning_rate": 0.00014882737408688967,
+      "loss": 1.1259536743164062,
+      "step": 1332
+    },
+    {
+      "epoch": 0.7687427912341407,
+      "grad_norm": 0.6777533292770386,
+      "learning_rate": 0.00014878892733564015,
+      "loss": 0.8380722403526306,
+      "step": 1333
+    },
+    {
+      "epoch": 0.7693194925028836,
+      "grad_norm": 0.5475529432296753,
+      "learning_rate": 0.00014875048058439063,
+      "loss": 0.7194100618362427,
+      "step": 1334
+    },
+    {
+      "epoch": 0.7698961937716263,
+      "grad_norm": 0.7109413743019104,
+      "learning_rate": 0.0001487120338331411,
+      "loss": 0.7877069711685181,
+      "step": 1335
+    },
+    {
+      "epoch": 0.7704728950403691,
+      "grad_norm": 0.5451337099075317,
+      "learning_rate": 0.00014867358708189158,
+      "loss": 0.7354110479354858,
+      "step": 1336
+    },
+    {
+      "epoch": 0.7710495963091119,
+      "grad_norm": 0.7789444327354431,
+      "learning_rate": 0.0001486351403306421,
+      "loss": 0.9675291776657104,
+      "step": 1337
+    },
+    {
+      "epoch": 0.7716262975778547,
+      "grad_norm": 0.7246870398521423,
+      "learning_rate": 0.00014859669357939254,
+      "loss": 0.9592723846435547,
+      "step": 1338
+    },
+    {
+      "epoch": 0.7722029988465975,
+      "grad_norm": 0.7461789846420288,
+      "learning_rate": 0.00014855824682814304,
+      "loss": 1.062403678894043,
+      "step": 1339
+    },
+    {
+      "epoch": 0.7727797001153403,
+      "grad_norm": 0.6598569750785828,
+      "learning_rate": 0.0001485198000768935,
+      "loss": 0.959195077419281,
+      "step": 1340
+    },
+    {
+      "epoch": 0.773356401384083,
+      "grad_norm": 0.8688694834709167,
+      "learning_rate": 0.000148481353325644,
+      "loss": 1.3393487930297852,
+      "step": 1341
+    },
+    {
+      "epoch": 0.7739331026528259,
+      "grad_norm": 0.7083797454833984,
+      "learning_rate": 0.00014844290657439448,
+      "loss": 0.9515122175216675,
+      "step": 1342
+    },
+    {
+      "epoch": 0.7745098039215687,
+      "grad_norm": 0.7261124849319458,
+      "learning_rate": 0.00014840445982314496,
+      "loss": 1.048977017402649,
+      "step": 1343
+    },
+    {
+      "epoch": 0.7750865051903114,
+      "grad_norm": 0.9450129270553589,
+      "learning_rate": 0.00014836601307189544,
+      "loss": 1.1335430145263672,
+      "step": 1344
+    },
+    {
+      "epoch": 0.7756632064590542,
+      "grad_norm": 0.47535234689712524,
+      "learning_rate": 0.00014832756632064591,
+      "loss": 0.6887091398239136,
+      "step": 1345
+    },
+    {
+      "epoch": 0.776239907727797,
+      "grad_norm": 0.714235782623291,
+      "learning_rate": 0.0001482891195693964,
+      "loss": 0.9414650201797485,
+      "step": 1346
+    },
+    {
+      "epoch": 0.7768166089965398,
+      "grad_norm": 0.6094812750816345,
+      "learning_rate": 0.00014825067281814687,
+      "loss": 0.8214763402938843,
+      "step": 1347
+    },
+    {
+      "epoch": 0.7773933102652826,
+      "grad_norm": 0.7122801542282104,
+      "learning_rate": 0.00014821222606689735,
+      "loss": 0.9144871830940247,
+      "step": 1348
+    },
+    {
+      "epoch": 0.7779700115340253,
+      "grad_norm": 0.8147172927856445,
+      "learning_rate": 0.00014817377931564785,
+      "loss": 1.1212399005889893,
+      "step": 1349
+    },
+    {
+      "epoch": 0.7785467128027682,
+      "grad_norm": 0.5866456627845764,
+      "learning_rate": 0.0001481353325643983,
+      "loss": 0.6841553449630737,
+      "step": 1350
+    },
+    {
+      "epoch": 0.779123414071511,
+      "grad_norm": 1.2120155096054077,
+      "learning_rate": 0.0001480968858131488,
+      "loss": 1.1782194375991821,
+      "step": 1351
+    },
+    {
+      "epoch": 0.7797001153402537,
+      "grad_norm": 0.8661918640136719,
+      "learning_rate": 0.00014805843906189926,
+      "loss": 1.1883846521377563,
+      "step": 1352
+    },
+    {
+      "epoch": 0.7802768166089965,
+      "grad_norm": 1.2335827350616455,
+      "learning_rate": 0.00014801999231064977,
+      "loss": 1.199598789215088,
+      "step": 1353
+    },
+    {
+      "epoch": 0.7808535178777394,
+      "grad_norm": 0.8413060307502747,
+      "learning_rate": 0.00014798154555940025,
+      "loss": 1.0878143310546875,
+      "step": 1354
+    },
+    {
+      "epoch": 0.7814302191464821,
+      "grad_norm": 1.042397379875183,
+      "learning_rate": 0.00014794309880815072,
+      "loss": 1.5179508924484253,
+      "step": 1355
+    },
+    {
+      "epoch": 0.7820069204152249,
+      "grad_norm": 1.2029002904891968,
+      "learning_rate": 0.0001479046520569012,
+      "loss": 1.361120343208313,
+      "step": 1356
+    },
+    {
+      "epoch": 0.7825836216839677,
+      "grad_norm": 0.9056934714317322,
+      "learning_rate": 0.00014786620530565168,
+      "loss": 1.0812435150146484,
+      "step": 1357
+    },
+    {
+      "epoch": 0.7831603229527105,
+      "grad_norm": 0.7730829119682312,
+      "learning_rate": 0.00014782775855440216,
+      "loss": 1.0833256244659424,
+      "step": 1358
+    },
+    {
+      "epoch": 0.7837370242214533,
+      "grad_norm": 0.8789440393447876,
+      "learning_rate": 0.00014778931180315264,
+      "loss": 1.0179883241653442,
+      "step": 1359
+    },
+    {
+      "epoch": 0.7843137254901961,
+      "grad_norm": 0.775190532207489,
+      "learning_rate": 0.00014775086505190312,
+      "loss": 1.0584783554077148,
+      "step": 1360
+    },
+    {
+      "epoch": 0.7848904267589388,
+      "grad_norm": 0.7954389452934265,
+      "learning_rate": 0.00014771241830065362,
+      "loss": 1.1697866916656494,
+      "step": 1361
+    },
+    {
+      "epoch": 0.7854671280276817,
+      "grad_norm": 0.8194144368171692,
+      "learning_rate": 0.00014767397154940407,
+      "loss": 0.9788481593132019,
+      "step": 1362
+    },
+    {
+      "epoch": 0.7860438292964245,
+      "grad_norm": 0.7247309684753418,
+      "learning_rate": 0.00014763552479815458,
+      "loss": 0.9953986406326294,
+      "step": 1363
+    },
+    {
+      "epoch": 0.7866205305651672,
+      "grad_norm": 0.8735687136650085,
+      "learning_rate": 0.00014759707804690503,
+      "loss": 1.108184576034546,
+      "step": 1364
+    },
+    {
+      "epoch": 0.78719723183391,
+      "grad_norm": 0.8578454256057739,
+      "learning_rate": 0.00014755863129565553,
+      "loss": 1.0608623027801514,
+      "step": 1365
+    },
+    {
+      "epoch": 0.7877739331026529,
+      "grad_norm": 1.038670301437378,
+      "learning_rate": 0.000147520184544406,
+      "loss": 1.2398217916488647,
+      "step": 1366
+    },
+    {
+      "epoch": 0.7883506343713956,
+      "grad_norm": 0.832326352596283,
+      "learning_rate": 0.0001474817377931565,
+      "loss": 1.5559954643249512,
+      "step": 1367
+    },
+    {
+      "epoch": 0.7889273356401384,
+      "grad_norm": 0.5325842499732971,
+      "learning_rate": 0.00014744329104190697,
+      "loss": 0.6711868047714233,
+      "step": 1368
+    },
+    {
+      "epoch": 0.7895040369088812,
+      "grad_norm": 0.6845494508743286,
+      "learning_rate": 0.00014740484429065745,
+      "loss": 0.9054516553878784,
+      "step": 1369
+    },
+    {
+      "epoch": 0.790080738177624,
+      "grad_norm": 0.8053160309791565,
+      "learning_rate": 0.00014736639753940793,
+      "loss": 1.1551737785339355,
+      "step": 1370
+    },
+    {
+      "epoch": 0.7906574394463668,
+      "grad_norm": 0.9268645644187927,
+      "learning_rate": 0.0001473279507881584,
+      "loss": 0.9230217933654785,
+      "step": 1371
+    },
+    {
+      "epoch": 0.7912341407151096,
+      "grad_norm": 1.0553678274154663,
+      "learning_rate": 0.00014728950403690888,
+      "loss": 1.2223023176193237,
+      "step": 1372
+    },
+    {
+      "epoch": 0.7918108419838523,
+      "grad_norm": 0.6177469491958618,
+      "learning_rate": 0.0001472510572856594,
+      "loss": 0.8992686867713928,
+      "step": 1373
+    },
+    {
+      "epoch": 0.7923875432525952,
+      "grad_norm": 1.138965368270874,
+      "learning_rate": 0.00014721261053440984,
+      "loss": 0.8630029559135437,
+      "step": 1374
+    },
+    {
+      "epoch": 0.792964244521338,
+      "grad_norm": 0.5512900948524475,
+      "learning_rate": 0.00014717416378316034,
+      "loss": 0.8302984237670898,
+      "step": 1375
+    },
+    {
+      "epoch": 0.7935409457900807,
+      "grad_norm": 0.6091440916061401,
+      "learning_rate": 0.0001471357170319108,
+      "loss": 0.7380212545394897,
+      "step": 1376
+    },
+    {
+      "epoch": 0.7941176470588235,
+      "grad_norm": 0.909902811050415,
+      "learning_rate": 0.0001470972702806613,
+      "loss": 1.0644478797912598,
+      "step": 1377
+    },
+    {
+      "epoch": 0.7946943483275664,
+      "grad_norm": 0.9841009378433228,
+      "learning_rate": 0.00014705882352941178,
+      "loss": 1.5122861862182617,
+      "step": 1378
+    },
+    {
+      "epoch": 0.7952710495963091,
+      "grad_norm": 0.7682785391807556,
+      "learning_rate": 0.00014702037677816226,
+      "loss": 0.8122522830963135,
+      "step": 1379
+    },
+    {
+      "epoch": 0.7958477508650519,
+      "grad_norm": 0.8022129535675049,
+      "learning_rate": 0.00014698193002691274,
+      "loss": 0.7516300678253174,
+      "step": 1380
+    },
+    {
+      "epoch": 0.7964244521337946,
+      "grad_norm": 0.8423136472702026,
+      "learning_rate": 0.00014694348327566321,
+      "loss": 0.9571545124053955,
+      "step": 1381
+    },
+    {
+      "epoch": 0.7970011534025375,
+      "grad_norm": 0.61954665184021,
+      "learning_rate": 0.0001469050365244137,
+      "loss": 0.8543866872787476,
+      "step": 1382
+    },
+    {
+      "epoch": 0.7975778546712803,
+      "grad_norm": 0.5888648629188538,
+      "learning_rate": 0.00014686658977316417,
+      "loss": 0.6958523988723755,
+      "step": 1383
+    },
+    {
+      "epoch": 0.798154555940023,
+      "grad_norm": 0.9419842958450317,
+      "learning_rate": 0.00014682814302191465,
+      "loss": 1.3051813840866089,
+      "step": 1384
+    },
+    {
+      "epoch": 0.7987312572087658,
+      "grad_norm": 1.1472746133804321,
+      "learning_rate": 0.00014678969627066515,
+      "loss": 1.284635305404663,
+      "step": 1385
+    },
+    {
+      "epoch": 0.7993079584775087,
+      "grad_norm": 0.5858578681945801,
+      "learning_rate": 0.0001467512495194156,
+      "loss": 0.7809937596321106,
+      "step": 1386
+    },
+    {
+      "epoch": 0.7998846597462514,
+      "grad_norm": 0.7086213231086731,
+      "learning_rate": 0.0001467128027681661,
+      "loss": 0.6571354269981384,
+      "step": 1387
+    },
+    {
+      "epoch": 0.8004613610149942,
+      "grad_norm": 0.8438594341278076,
+      "learning_rate": 0.00014667435601691656,
+      "loss": 0.9461796283721924,
+      "step": 1388
+    },
+    {
+      "epoch": 0.801038062283737,
+      "grad_norm": 0.6701700687408447,
+      "learning_rate": 0.00014663590926566707,
+      "loss": 0.7518469095230103,
+      "step": 1389
+    },
+    {
+      "epoch": 0.8016147635524798,
+      "grad_norm": 0.7239779233932495,
+      "learning_rate": 0.00014659746251441755,
+      "loss": 0.98681640625,
+      "step": 1390
+    },
+    {
+      "epoch": 0.8021914648212226,
+      "grad_norm": 0.9055145382881165,
+      "learning_rate": 0.00014655901576316802,
+      "loss": 1.038681983947754,
+      "step": 1391
+    },
+    {
+      "epoch": 0.8027681660899654,
+      "grad_norm": 0.674439013004303,
+      "learning_rate": 0.0001465205690119185,
+      "loss": 0.7289140820503235,
+      "step": 1392
+    },
+    {
+      "epoch": 0.8033448673587081,
+      "grad_norm": 0.6101412773132324,
+      "learning_rate": 0.00014648212226066898,
+      "loss": 0.8470169901847839,
+      "step": 1393
+    },
+    {
+      "epoch": 0.803921568627451,
+      "grad_norm": 1.0043631792068481,
+      "learning_rate": 0.00014644367550941946,
+      "loss": 0.9277285933494568,
+      "step": 1394
+    },
+    {
+      "epoch": 0.8044982698961938,
+      "grad_norm": 0.8795577883720398,
+      "learning_rate": 0.00014640522875816994,
+      "loss": 1.2433722019195557,
+      "step": 1395
+    },
+    {
+      "epoch": 0.8050749711649365,
+      "grad_norm": 0.469595730304718,
+      "learning_rate": 0.00014636678200692042,
+      "loss": 0.5572987794876099,
+      "step": 1396
+    },
+    {
+      "epoch": 0.8056516724336793,
+      "grad_norm": 0.8809022903442383,
+      "learning_rate": 0.00014632833525567092,
+      "loss": 1.1597031354904175,
+      "step": 1397
+    },
+    {
+      "epoch": 0.8062283737024222,
+      "grad_norm": 0.9675459861755371,
+      "learning_rate": 0.00014628988850442137,
+      "loss": 1.0070991516113281,
+      "step": 1398
+    },
+    {
+      "epoch": 0.8068050749711649,
+      "grad_norm": 0.8547102212905884,
+      "learning_rate": 0.00014625144175317188,
+      "loss": 0.9210143089294434,
+      "step": 1399
+    },
+    {
+      "epoch": 0.8073817762399077,
+      "grad_norm": 0.5635284185409546,
+      "learning_rate": 0.00014621299500192233,
+      "loss": 0.5849195122718811,
+      "step": 1400
+    },
+    {
+      "epoch": 0.8079584775086506,
+      "grad_norm": 0.8755897283554077,
+      "learning_rate": 0.00014617454825067283,
+      "loss": 1.014789342880249,
+      "step": 1401
+    },
+    {
+      "epoch": 0.8085351787773933,
+      "grad_norm": 0.6002927422523499,
+      "learning_rate": 0.00014613610149942328,
+      "loss": 0.8705483675003052,
+      "step": 1402
+    },
+    {
+      "epoch": 0.8091118800461361,
+      "grad_norm": 0.9547945857048035,
+      "learning_rate": 0.0001460976547481738,
+      "loss": 1.0433237552642822,
+      "step": 1403
+    },
+    {
+      "epoch": 0.8096885813148789,
+      "grad_norm": 0.8594508767127991,
+      "learning_rate": 0.00014605920799692427,
+      "loss": 0.857754111289978,
+      "step": 1404
+    },
+    {
+      "epoch": 0.8102652825836217,
+      "grad_norm": 0.632087230682373,
+      "learning_rate": 0.00014602076124567475,
+      "loss": 1.0932989120483398,
+      "step": 1405
+    },
+    {
+      "epoch": 0.8108419838523645,
+      "grad_norm": 0.6727497577667236,
+      "learning_rate": 0.00014598231449442523,
+      "loss": 1.1335169076919556,
+      "step": 1406
+    },
+    {
+      "epoch": 0.8114186851211073,
+      "grad_norm": 1.050377368927002,
+      "learning_rate": 0.0001459438677431757,
+      "loss": 1.1787501573562622,
+      "step": 1407
+    },
+    {
+      "epoch": 0.81199538638985,
+      "grad_norm": 0.624580442905426,
+      "learning_rate": 0.00014590542099192618,
+      "loss": 0.8040243983268738,
+      "step": 1408
+    },
+    {
+      "epoch": 0.8125720876585929,
+      "grad_norm": 0.644497275352478,
+      "learning_rate": 0.0001458669742406767,
+      "loss": 0.9769735336303711,
+      "step": 1409
+    },
+    {
+      "epoch": 0.8131487889273357,
+      "grad_norm": 0.8106479048728943,
+      "learning_rate": 0.00014582852748942714,
+      "loss": 1.2847563028335571,
+      "step": 1410
+    },
+    {
+      "epoch": 0.8137254901960784,
+      "grad_norm": 0.6234838962554932,
+      "learning_rate": 0.00014579008073817764,
+      "loss": 0.7418760061264038,
+      "step": 1411
+    },
+    {
+      "epoch": 0.8143021914648212,
+      "grad_norm": 0.7591360807418823,
+      "learning_rate": 0.0001457516339869281,
+      "loss": 1.0062642097473145,
+      "step": 1412
+    },
+    {
+      "epoch": 0.8148788927335641,
+      "grad_norm": 0.7684062123298645,
+      "learning_rate": 0.0001457131872356786,
+      "loss": 0.9963294267654419,
+      "step": 1413
+    },
+    {
+      "epoch": 0.8154555940023068,
+      "grad_norm": 0.8234810829162598,
+      "learning_rate": 0.00014567474048442905,
+      "loss": 0.9132286310195923,
+      "step": 1414
+    },
+    {
+      "epoch": 0.8160322952710496,
+      "grad_norm": 1.3752492666244507,
+      "learning_rate": 0.00014563629373317956,
+      "loss": 1.3458770513534546,
+      "step": 1415
+    },
+    {
+      "epoch": 0.8166089965397924,
+      "grad_norm": 0.8771060109138489,
+      "learning_rate": 0.00014559784698193004,
+      "loss": 0.9146612882614136,
+      "step": 1416
+    },
+    {
+      "epoch": 0.8171856978085352,
+      "grad_norm": 0.5799472332000732,
+      "learning_rate": 0.0001455594002306805,
+      "loss": 0.8132292032241821,
+      "step": 1417
+    },
+    {
+      "epoch": 0.817762399077278,
+      "grad_norm": 1.0692527294158936,
+      "learning_rate": 0.000145520953479431,
+      "loss": 1.0524235963821411,
+      "step": 1418
+    },
+    {
+      "epoch": 0.8183391003460208,
+      "grad_norm": 0.6880149245262146,
+      "learning_rate": 0.00014548250672818147,
+      "loss": 0.8549849987030029,
+      "step": 1419
+    },
+    {
+      "epoch": 0.8189158016147635,
+      "grad_norm": 0.9311429858207703,
+      "learning_rate": 0.00014544405997693195,
+      "loss": 1.2363505363464355,
+      "step": 1420
+    },
+    {
+      "epoch": 0.8194925028835064,
+      "grad_norm": 0.6105409860610962,
+      "learning_rate": 0.00014540561322568245,
+      "loss": 0.8256676197052002,
+      "step": 1421
+    },
+    {
+      "epoch": 0.8200692041522492,
+      "grad_norm": 0.9718572497367859,
+      "learning_rate": 0.0001453671664744329,
+      "loss": 1.349236249923706,
+      "step": 1422
+    },
+    {
+      "epoch": 0.8206459054209919,
+      "grad_norm": 0.9589305520057678,
+      "learning_rate": 0.0001453287197231834,
+      "loss": 0.8896529674530029,
+      "step": 1423
+    },
+    {
+      "epoch": 0.8212226066897347,
+      "grad_norm": 1.1475483179092407,
+      "learning_rate": 0.00014529027297193386,
+      "loss": 1.392863154411316,
+      "step": 1424
+    },
+    {
+      "epoch": 0.8217993079584776,
+      "grad_norm": 0.9420047402381897,
+      "learning_rate": 0.00014525182622068437,
+      "loss": 1.1920685768127441,
+      "step": 1425
+    },
+    {
+      "epoch": 0.8223760092272203,
+      "grad_norm": 0.584073007106781,
+      "learning_rate": 0.00014521337946943482,
+      "loss": 0.5488528609275818,
+      "step": 1426
+    },
+    {
+      "epoch": 0.8229527104959631,
+      "grad_norm": 0.6110360622406006,
+      "learning_rate": 0.00014517493271818532,
+      "loss": 0.7226777672767639,
+      "step": 1427
+    },
+    {
+      "epoch": 0.8235294117647058,
+      "grad_norm": 0.5320557355880737,
+      "learning_rate": 0.0001451364859669358,
+      "loss": 0.5602037906646729,
+      "step": 1428
+    },
+    {
+      "epoch": 0.8241061130334487,
+      "grad_norm": 0.5847785472869873,
+      "learning_rate": 0.00014509803921568628,
+      "loss": 0.632820725440979,
+      "step": 1429
+    },
+    {
+      "epoch": 0.8246828143021915,
+      "grad_norm": 1.1915888786315918,
+      "learning_rate": 0.00014505959246443676,
+      "loss": 1.2395484447479248,
+      "step": 1430
+    },
+    {
+      "epoch": 0.8252595155709342,
+      "grad_norm": 0.7745262980461121,
+      "learning_rate": 0.00014502114571318724,
+      "loss": 0.9293632507324219,
+      "step": 1431
+    },
+    {
+      "epoch": 0.825836216839677,
+      "grad_norm": 0.9716136455535889,
+      "learning_rate": 0.00014498269896193771,
+      "loss": 1.2587440013885498,
+      "step": 1432
+    },
+    {
+      "epoch": 0.8264129181084199,
+      "grad_norm": 0.6674740314483643,
+      "learning_rate": 0.00014494425221068822,
+      "loss": 0.9000645875930786,
+      "step": 1433
+    },
+    {
+      "epoch": 0.8269896193771626,
+      "grad_norm": 0.9345766305923462,
+      "learning_rate": 0.00014490580545943867,
+      "loss": 0.9881076812744141,
+      "step": 1434
+    },
+    {
+      "epoch": 0.8275663206459054,
+      "grad_norm": 0.8641346096992493,
+      "learning_rate": 0.00014486735870818918,
+      "loss": 1.0706219673156738,
+      "step": 1435
+    },
+    {
+      "epoch": 0.8281430219146482,
+      "grad_norm": 0.8997068405151367,
+      "learning_rate": 0.00014482891195693963,
+      "loss": 0.932431697845459,
+      "step": 1436
+    },
+    {
+      "epoch": 0.828719723183391,
+      "grad_norm": 0.7539141774177551,
+      "learning_rate": 0.00014479046520569013,
+      "loss": 0.8891205191612244,
+      "step": 1437
+    },
+    {
+      "epoch": 0.8292964244521338,
+      "grad_norm": 0.8675488233566284,
+      "learning_rate": 0.00014475201845444058,
+      "loss": 0.9973325729370117,
+      "step": 1438
+    },
+    {
+      "epoch": 0.8298731257208766,
+      "grad_norm": 0.7566542029380798,
+      "learning_rate": 0.0001447135717031911,
+      "loss": 1.1265358924865723,
+      "step": 1439
+    },
+    {
+      "epoch": 0.8304498269896193,
+      "grad_norm": 0.902654230594635,
+      "learning_rate": 0.00014467512495194157,
+      "loss": 1.0915746688842773,
+      "step": 1440
+    },
+    {
+      "epoch": 0.8310265282583622,
+      "grad_norm": 0.618813693523407,
+      "learning_rate": 0.00014463667820069205,
+      "loss": 0.6798044443130493,
+      "step": 1441
+    },
+    {
+      "epoch": 0.831603229527105,
+      "grad_norm": 0.6372320055961609,
+      "learning_rate": 0.00014459823144944252,
+      "loss": 0.8383584022521973,
+      "step": 1442
+    },
+    {
+      "epoch": 0.8321799307958477,
+      "grad_norm": 0.742468535900116,
+      "learning_rate": 0.000144559784698193,
+      "loss": 1.0003979206085205,
+      "step": 1443
+    },
+    {
+      "epoch": 0.8327566320645905,
+      "grad_norm": 0.9815142750740051,
+      "learning_rate": 0.00014452133794694348,
+      "loss": 1.2571461200714111,
+      "step": 1444
+    },
+    {
+      "epoch": 0.8333333333333334,
+      "grad_norm": 0.7362657785415649,
+      "learning_rate": 0.000144482891195694,
+      "loss": 0.9890142679214478,
+      "step": 1445
+    },
+    {
+      "epoch": 0.8339100346020761,
+      "grad_norm": 1.047896385192871,
+      "learning_rate": 0.00014444444444444444,
+      "loss": 0.7491689920425415,
+      "step": 1446
+    },
+    {
+      "epoch": 0.8344867358708189,
+      "grad_norm": 1.0869019031524658,
+      "learning_rate": 0.00014440599769319494,
+      "loss": 1.0598435401916504,
+      "step": 1447
+    },
+    {
+      "epoch": 0.8350634371395617,
+      "grad_norm": 0.8003841042518616,
+      "learning_rate": 0.0001443675509419454,
+      "loss": 0.7503578662872314,
+      "step": 1448
+    },
+    {
+      "epoch": 0.8356401384083045,
+      "grad_norm": 1.3352385759353638,
+      "learning_rate": 0.0001443291041906959,
+      "loss": 1.7147669792175293,
+      "step": 1449
+    },
+    {
+      "epoch": 0.8362168396770473,
+      "grad_norm": 0.7203720808029175,
+      "learning_rate": 0.00014429065743944635,
+      "loss": 0.7103738188743591,
+      "step": 1450
+    },
+    {
+      "epoch": 0.8367935409457901,
+      "grad_norm": 0.7292425036430359,
+      "learning_rate": 0.00014425221068819686,
+      "loss": 0.9089938402175903,
+      "step": 1451
+    },
+    {
+      "epoch": 0.8373702422145328,
+      "grad_norm": 1.5864981412887573,
+      "learning_rate": 0.00014421376393694733,
+      "loss": 1.2735176086425781,
+      "step": 1452
+    },
+    {
+      "epoch": 0.8379469434832757,
+      "grad_norm": 0.5966582298278809,
+      "learning_rate": 0.0001441753171856978,
+      "loss": 0.8211960196495056,
+      "step": 1453
+    },
+    {
+      "epoch": 0.8385236447520185,
+      "grad_norm": 0.6568999886512756,
+      "learning_rate": 0.0001441368704344483,
+      "loss": 0.9273509979248047,
+      "step": 1454
+    },
+    {
+      "epoch": 0.8391003460207612,
+      "grad_norm": 0.6672592163085938,
+      "learning_rate": 0.00014409842368319877,
+      "loss": 0.7854159474372864,
+      "step": 1455
+    },
+    {
+      "epoch": 0.839677047289504,
+      "grad_norm": 1.1119751930236816,
+      "learning_rate": 0.00014405997693194925,
+      "loss": 1.2850849628448486,
+      "step": 1456
+    },
+    {
+      "epoch": 0.8402537485582469,
+      "grad_norm": 0.8437113165855408,
+      "learning_rate": 0.00014402153018069975,
+      "loss": 0.9052360653877258,
+      "step": 1457
+    },
+    {
+      "epoch": 0.8408304498269896,
+      "grad_norm": 1.1120409965515137,
+      "learning_rate": 0.0001439830834294502,
+      "loss": 1.4261767864227295,
+      "step": 1458
+    },
+    {
+      "epoch": 0.8414071510957324,
+      "grad_norm": 0.6494320631027222,
+      "learning_rate": 0.0001439446366782007,
+      "loss": 0.8434788584709167,
+      "step": 1459
+    },
+    {
+      "epoch": 0.8419838523644751,
+      "grad_norm": 0.5622795820236206,
+      "learning_rate": 0.00014390618992695116,
+      "loss": 0.646868109703064,
+      "step": 1460
+    },
+    {
+      "epoch": 0.842560553633218,
+      "grad_norm": 0.8375677466392517,
+      "learning_rate": 0.00014386774317570167,
+      "loss": 1.0123827457427979,
+      "step": 1461
+    },
+    {
+      "epoch": 0.8431372549019608,
+      "grad_norm": 0.6013731956481934,
+      "learning_rate": 0.00014382929642445214,
+      "loss": 0.7129334211349487,
+      "step": 1462
+    },
+    {
+      "epoch": 0.8437139561707035,
+      "grad_norm": 0.7148757576942444,
+      "learning_rate": 0.00014379084967320262,
+      "loss": 0.7350738048553467,
+      "step": 1463
+    },
+    {
+      "epoch": 0.8442906574394463,
+      "grad_norm": 0.7380696535110474,
+      "learning_rate": 0.0001437524029219531,
+      "loss": 0.7962418794631958,
+      "step": 1464
+    },
+    {
+      "epoch": 0.8448673587081892,
+      "grad_norm": 0.6836022734642029,
+      "learning_rate": 0.00014371395617070358,
+      "loss": 1.0249385833740234,
+      "step": 1465
+    },
+    {
+      "epoch": 0.845444059976932,
+      "grad_norm": 0.8065418004989624,
+      "learning_rate": 0.00014367550941945406,
+      "loss": 1.0036308765411377,
+      "step": 1466
+    },
+    {
+      "epoch": 0.8460207612456747,
+      "grad_norm": 0.8336586356163025,
+      "learning_rate": 0.00014363706266820454,
+      "loss": 0.9442139863967896,
+      "step": 1467
+    },
+    {
+      "epoch": 0.8465974625144176,
+      "grad_norm": 0.9105651378631592,
+      "learning_rate": 0.00014359861591695501,
+      "loss": 1.198281168937683,
+      "step": 1468
+    },
+    {
+      "epoch": 0.8471741637831603,
+      "grad_norm": 0.6932002902030945,
+      "learning_rate": 0.0001435601691657055,
+      "loss": 0.76617431640625,
+      "step": 1469
+    },
+    {
+      "epoch": 0.8477508650519031,
+      "grad_norm": 0.6474612951278687,
+      "learning_rate": 0.00014352172241445597,
+      "loss": 0.9350631237030029,
+      "step": 1470
+    },
+    {
+      "epoch": 0.8483275663206459,
+      "grad_norm": 1.0232489109039307,
+      "learning_rate": 0.00014348327566320648,
+      "loss": 1.2790873050689697,
+      "step": 1471
+    },
+    {
+      "epoch": 0.8489042675893888,
+      "grad_norm": 0.5638800263404846,
+      "learning_rate": 0.00014344482891195695,
+      "loss": 0.6640872359275818,
+      "step": 1472
+    },
+    {
+      "epoch": 0.8494809688581315,
+      "grad_norm": 0.7060153484344482,
+      "learning_rate": 0.00014340638216070743,
+      "loss": 0.549694299697876,
+      "step": 1473
+    },
+    {
+      "epoch": 0.8500576701268743,
+      "grad_norm": 0.7553113698959351,
+      "learning_rate": 0.0001433679354094579,
+      "loss": 0.6748926639556885,
+      "step": 1474
+    },
+    {
+      "epoch": 0.850634371395617,
+      "grad_norm": 1.0750683546066284,
+      "learning_rate": 0.0001433294886582084,
+      "loss": 1.2567592859268188,
+      "step": 1475
+    },
+    {
+      "epoch": 0.8512110726643599,
+      "grad_norm": 0.8767377138137817,
+      "learning_rate": 0.00014329104190695887,
+      "loss": 0.8606712818145752,
+      "step": 1476
+    },
+    {
+      "epoch": 0.8517877739331027,
+      "grad_norm": 0.8583175539970398,
+      "learning_rate": 0.00014325259515570935,
+      "loss": 1.0961095094680786,
+      "step": 1477
+    },
+    {
+      "epoch": 0.8523644752018454,
+      "grad_norm": 0.8185640573501587,
+      "learning_rate": 0.00014321414840445982,
+      "loss": 0.9456279277801514,
+      "step": 1478
+    },
+    {
+      "epoch": 0.8529411764705882,
+      "grad_norm": 0.7922638058662415,
+      "learning_rate": 0.0001431757016532103,
+      "loss": 0.8527402281761169,
+      "step": 1479
+    },
+    {
+      "epoch": 0.8535178777393311,
+      "grad_norm": 0.8317216634750366,
+      "learning_rate": 0.00014313725490196078,
+      "loss": 1.0812233686447144,
+      "step": 1480
+    },
+    {
+      "epoch": 0.8540945790080738,
+      "grad_norm": 0.5592607855796814,
+      "learning_rate": 0.00014309880815071126,
+      "loss": 0.6856215000152588,
+      "step": 1481
+    },
+    {
+      "epoch": 0.8546712802768166,
+      "grad_norm": 0.6144684553146362,
+      "learning_rate": 0.00014306036139946174,
+      "loss": 0.8217105269432068,
+      "step": 1482
+    },
+    {
+      "epoch": 0.8552479815455594,
+      "grad_norm": 0.8721742630004883,
+      "learning_rate": 0.00014302191464821224,
+      "loss": 1.1268048286437988,
+      "step": 1483
+    },
+    {
+      "epoch": 0.8558246828143022,
+      "grad_norm": 0.7512510418891907,
+      "learning_rate": 0.00014298346789696272,
+      "loss": 0.7509297132492065,
+      "step": 1484
+    },
+    {
+      "epoch": 0.856401384083045,
+      "grad_norm": 0.7145662307739258,
+      "learning_rate": 0.0001429450211457132,
+      "loss": 0.787600040435791,
+      "step": 1485
+    },
+    {
+      "epoch": 0.8569780853517878,
+      "grad_norm": 0.5714643597602844,
+      "learning_rate": 0.00014290657439446368,
+      "loss": 0.5843244791030884,
+      "step": 1486
+    },
+    {
+      "epoch": 0.8575547866205305,
+      "grad_norm": 0.567432701587677,
+      "learning_rate": 0.00014286812764321416,
+      "loss": 0.5819793939590454,
+      "step": 1487
+    },
+    {
+      "epoch": 0.8581314878892734,
+      "grad_norm": 0.7957308888435364,
+      "learning_rate": 0.00014282968089196463,
+      "loss": 1.127239465713501,
+      "step": 1488
+    },
+    {
+      "epoch": 0.8587081891580162,
+      "grad_norm": 0.6828871369361877,
+      "learning_rate": 0.0001427912341407151,
+      "loss": 0.8339288234710693,
+      "step": 1489
+    },
+    {
+      "epoch": 0.8592848904267589,
+      "grad_norm": 0.6947774887084961,
+      "learning_rate": 0.0001427527873894656,
+      "loss": 0.8848856687545776,
+      "step": 1490
+    },
+    {
+      "epoch": 0.8598615916955017,
+      "grad_norm": 0.7703558802604675,
+      "learning_rate": 0.00014271434063821607,
+      "loss": 1.1964079141616821,
+      "step": 1491
+    },
+    {
+      "epoch": 0.8604382929642446,
+      "grad_norm": 0.9820204973220825,
+      "learning_rate": 0.00014267589388696655,
+      "loss": 1.3156203031539917,
+      "step": 1492
+    },
+    {
+      "epoch": 0.8610149942329873,
+      "grad_norm": 0.663357138633728,
+      "learning_rate": 0.00014263744713571703,
+      "loss": 1.1208245754241943,
+      "step": 1493
+    },
+    {
+      "epoch": 0.8615916955017301,
+      "grad_norm": 0.6204859018325806,
+      "learning_rate": 0.00014259900038446753,
+      "loss": 0.8412761688232422,
+      "step": 1494
+    },
+    {
+      "epoch": 0.8621683967704729,
+      "grad_norm": 0.8673816323280334,
+      "learning_rate": 0.000142560553633218,
+      "loss": 0.9236775040626526,
+      "step": 1495
+    },
+    {
+      "epoch": 0.8627450980392157,
+      "grad_norm": 0.6511439681053162,
+      "learning_rate": 0.0001425221068819685,
+      "loss": 0.8711351156234741,
+      "step": 1496
+    },
+    {
+      "epoch": 0.8633217993079585,
+      "grad_norm": 0.5167029500007629,
+      "learning_rate": 0.00014248366013071897,
+      "loss": 0.6116561889648438,
+      "step": 1497
+    },
+    {
+      "epoch": 0.8638985005767013,
+      "grad_norm": 0.6007522940635681,
+      "learning_rate": 0.00014244521337946944,
+      "loss": 0.7663001418113708,
+      "step": 1498
+    },
+    {
+      "epoch": 0.864475201845444,
+      "grad_norm": 0.5924880504608154,
+      "learning_rate": 0.00014240676662821992,
+      "loss": 0.6707437038421631,
+      "step": 1499
+    },
+    {
+      "epoch": 0.8650519031141869,
+      "grad_norm": 0.859641969203949,
+      "learning_rate": 0.0001423683198769704,
+      "loss": 1.0436668395996094,
+      "step": 1500
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 5202,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 9.361771665599693e+16,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}