Training in progress, step 8000

Browse files

Files changed (6) hide show

model.safetensors +1 -1
optimizer.pt +1 -1
rng_state.pth +1 -1
runs/Jun07_12-33-16_DESKTOP-69FPKCK/events.out.tfevents.1717788805.DESKTOP-69FPKCK +2 -2
scheduler.pt +1 -1
trainer_state.json +712 -3

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:94cc0b49081e080919d2be8f5e4a517f58ccdc361d43f153ac49ba5755081b84
 size 44644496

 version https://git-lfs.github.com/spec/v1
+oid sha256:adf638b849568db5d9d7a536bdb1edafa1e8d90c38459b1d7b3a6552d6da7ad0
 size 44644496

optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d6ad718095808c9c5f6a957cf2ca65f59b9ed75b8a432e2fa2fb65c556f2d360
 size 11230198

 version https://git-lfs.github.com/spec/v1
+oid sha256:abda9e7a12534ef2affd8d0c860673e26661a5152bce292672896e64d2a0cdaf
 size 11230198

rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8d138cfe3a4adf21f048848ee35837c9a757a0a3616ff7adbb45b69aac247435
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:6b3ee827a7a00012c0a116546df467feee35e70376d81a7a85b1a70eb90414d3
 size 14244

runs/Jun07_12-33-16_DESKTOP-69FPKCK/events.out.tfevents.1717788805.DESKTOP-69FPKCK CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:eff3fe11f185c6d041edc58c2d8b069ad576bfecba786d61fc1bac322d07a7f6
-size 88649

 version https://git-lfs.github.com/spec/v1
+oid sha256:4a3c8d5d8b582a0b0b9fd14feef2ad6f7e897a52df0c558e8aa72703bc07e62b
+size 89915

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4aead1720be621f91a94a09ab21ddeec8c9e93c1a9e50cc6992710fbf1fedb49
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:5b35aaeefae1777c5b0cc2a6a699a6e86dbf10049e0c78d4a59c18dcf3571dfd
 size 1064

trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.02714698736307738,
   "eval_steps": 2000,
-  "global_step": 6000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -2134,6 +2134,715 @@
       "eval_samples_per_second": 2784.158,
       "eval_steps_per_second": 10.878,
       "step": 6000
     }
   ],
   "logging_steps": 20,
@@ -2141,7 +2850,7 @@
   "num_input_tokens_seen": 0,
   "num_train_epochs": 3,
   "save_steps": 100,
-  "total_flos": 2157543161856000.0,
   "train_batch_size": 256,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.03619598315076984,
   "eval_steps": 2000,
+  "global_step": 8000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 2784.158,
       "eval_steps_per_second": 10.878,
       "step": 6000
+    },
+    {
+      "epoch": 0.027237477320954308,
+      "grad_norm": 7.100822925567627,
+      "learning_rate": 8.167134196000362e-05,
+      "loss": 8.4131,
+      "step": 6020
+    },
+    {
+      "epoch": 0.02732796727883123,
+      "grad_norm": 8.460954666137695,
+      "learning_rate": 8.194281060537508e-05,
+      "loss": 8.4087,
+      "step": 6040
+    },
+    {
+      "epoch": 0.027418457236708157,
+      "grad_norm": 7.642125129699707,
+      "learning_rate": 8.221427925074653e-05,
+      "loss": 8.3806,
+      "step": 6060
+    },
+    {
+      "epoch": 0.02750894719458508,
+      "grad_norm": 8.104974746704102,
+      "learning_rate": 8.2485747896118e-05,
+      "loss": 8.404,
+      "step": 6080
+    },
+    {
+      "epoch": 0.027599437152462006,
+      "grad_norm": 8.082459449768066,
+      "learning_rate": 8.275721654148946e-05,
+      "loss": 8.3865,
+      "step": 6100
+    },
+    {
+      "epoch": 0.02768992711033893,
+      "grad_norm": 8.786911010742188,
+      "learning_rate": 8.302868518686092e-05,
+      "loss": 8.3475,
+      "step": 6120
+    },
+    {
+      "epoch": 0.027780417068215855,
+      "grad_norm": 7.780808925628662,
+      "learning_rate": 8.330015383223237e-05,
+      "loss": 8.3798,
+      "step": 6140
+    },
+    {
+      "epoch": 0.02787090702609278,
+      "grad_norm": 10.508188247680664,
+      "learning_rate": 8.357162247760384e-05,
+      "loss": 8.3718,
+      "step": 6160
+    },
+    {
+      "epoch": 0.027961396983969704,
+      "grad_norm": 9.833992004394531,
+      "learning_rate": 8.38430911229753e-05,
+      "loss": 8.3952,
+      "step": 6180
+    },
+    {
+      "epoch": 0.02805188694184663,
+      "grad_norm": 9.917244911193848,
+      "learning_rate": 8.411455976834675e-05,
+      "loss": 8.3828,
+      "step": 6200
+    },
+    {
+      "epoch": 0.028142376899723553,
+      "grad_norm": 8.893899917602539,
+      "learning_rate": 8.438602841371821e-05,
+      "loss": 8.3853,
+      "step": 6220
+    },
+    {
+      "epoch": 0.02823286685760048,
+      "grad_norm": 8.206876754760742,
+      "learning_rate": 8.465749705908967e-05,
+      "loss": 8.3686,
+      "step": 6240
+    },
+    {
+      "epoch": 0.028323356815477402,
+      "grad_norm": 6.771660327911377,
+      "learning_rate": 8.492896570446114e-05,
+      "loss": 8.3699,
+      "step": 6260
+    },
+    {
+      "epoch": 0.028413846773354328,
+      "grad_norm": 8.602880477905273,
+      "learning_rate": 8.52004343498326e-05,
+      "loss": 8.3388,
+      "step": 6280
+    },
+    {
+      "epoch": 0.02850433673123125,
+      "grad_norm": 12.602445602416992,
+      "learning_rate": 8.547190299520405e-05,
+      "loss": 8.3127,
+      "step": 6300
+    },
+    {
+      "epoch": 0.028594826689108177,
+      "grad_norm": 6.581843852996826,
+      "learning_rate": 8.57433716405755e-05,
+      "loss": 8.3345,
+      "step": 6320
+    },
+    {
+      "epoch": 0.0286853166469851,
+      "grad_norm": 11.11732292175293,
+      "learning_rate": 8.601484028594696e-05,
+      "loss": 8.3442,
+      "step": 6340
+    },
+    {
+      "epoch": 0.028775806604862026,
+      "grad_norm": 7.795157432556152,
+      "learning_rate": 8.628630893131843e-05,
+      "loss": 8.3477,
+      "step": 6360
+    },
+    {
+      "epoch": 0.02886629656273895,
+      "grad_norm": 7.013496398925781,
+      "learning_rate": 8.655777757668989e-05,
+      "loss": 8.3444,
+      "step": 6380
+    },
+    {
+      "epoch": 0.028956786520615875,
+      "grad_norm": 7.039948463439941,
+      "learning_rate": 8.682924622206135e-05,
+      "loss": 8.3242,
+      "step": 6400
+    },
+    {
+      "epoch": 0.029047276478492798,
+      "grad_norm": 9.261716842651367,
+      "learning_rate": 8.710071486743282e-05,
+      "loss": 8.3209,
+      "step": 6420
+    },
+    {
+      "epoch": 0.029137766436369724,
+      "grad_norm": 7.255875587463379,
+      "learning_rate": 8.737218351280428e-05,
+      "loss": 8.304,
+      "step": 6440
+    },
+    {
+      "epoch": 0.029228256394246647,
+      "grad_norm": 7.955538749694824,
+      "learning_rate": 8.764365215817573e-05,
+      "loss": 8.2953,
+      "step": 6460
+    },
+    {
+      "epoch": 0.029318746352123573,
+      "grad_norm": 9.364811897277832,
+      "learning_rate": 8.791512080354718e-05,
+      "loss": 8.2936,
+      "step": 6480
+    },
+    {
+      "epoch": 0.0294092363100005,
+      "grad_norm": 9.385396957397461,
+      "learning_rate": 8.818658944891864e-05,
+      "loss": 8.3276,
+      "step": 6500
+    },
+    {
+      "epoch": 0.029499726267877422,
+      "grad_norm": 8.448295593261719,
+      "learning_rate": 8.84580580942901e-05,
+      "loss": 8.2975,
+      "step": 6520
+    },
+    {
+      "epoch": 0.02959021622575435,
+      "grad_norm": 9.282604217529297,
+      "learning_rate": 8.872952673966157e-05,
+      "loss": 8.3217,
+      "step": 6540
+    },
+    {
+      "epoch": 0.02968070618363127,
+      "grad_norm": 7.898446559906006,
+      "learning_rate": 8.900099538503303e-05,
+      "loss": 8.3006,
+      "step": 6560
+    },
+    {
+      "epoch": 0.029771196141508197,
+      "grad_norm": 9.186493873596191,
+      "learning_rate": 8.927246403040448e-05,
+      "loss": 8.2981,
+      "step": 6580
+    },
+    {
+      "epoch": 0.02986168609938512,
+      "grad_norm": 9.346575736999512,
+      "learning_rate": 8.954393267577595e-05,
+      "loss": 8.2883,
+      "step": 6600
+    },
+    {
+      "epoch": 0.029952176057262046,
+      "grad_norm": 6.458785057067871,
+      "learning_rate": 8.981540132114741e-05,
+      "loss": 8.2966,
+      "step": 6620
+    },
+    {
+      "epoch": 0.03004266601513897,
+      "grad_norm": 8.704976081848145,
+      "learning_rate": 9.008686996651886e-05,
+      "loss": 8.2986,
+      "step": 6640
+    },
+    {
+      "epoch": 0.030133155973015895,
+      "grad_norm": 7.744259357452393,
+      "learning_rate": 9.035833861189032e-05,
+      "loss": 8.2868,
+      "step": 6660
+    },
+    {
+      "epoch": 0.030223645930892818,
+      "grad_norm": 8.345844268798828,
+      "learning_rate": 9.062980725726179e-05,
+      "loss": 8.2931,
+      "step": 6680
+    },
+    {
+      "epoch": 0.030314135888769744,
+      "grad_norm": 7.604759216308594,
+      "learning_rate": 9.090127590263323e-05,
+      "loss": 8.2847,
+      "step": 6700
+    },
+    {
+      "epoch": 0.030404625846646667,
+      "grad_norm": 10.3920259475708,
+      "learning_rate": 9.11727445480047e-05,
+      "loss": 8.273,
+      "step": 6720
+    },
+    {
+      "epoch": 0.030495115804523593,
+      "grad_norm": 7.095389366149902,
+      "learning_rate": 9.144421319337616e-05,
+      "loss": 8.2768,
+      "step": 6740
+    },
+    {
+      "epoch": 0.030585605762400516,
+      "grad_norm": 7.211811542510986,
+      "learning_rate": 9.171568183874762e-05,
+      "loss": 8.2918,
+      "step": 6760
+    },
+    {
+      "epoch": 0.030676095720277442,
+      "grad_norm": 8.639713287353516,
+      "learning_rate": 9.198715048411909e-05,
+      "loss": 8.2845,
+      "step": 6780
+    },
+    {
+      "epoch": 0.03076658567815437,
+      "grad_norm": 7.687414169311523,
+      "learning_rate": 9.225861912949055e-05,
+      "loss": 8.2992,
+      "step": 6800
+    },
+    {
+      "epoch": 0.03085707563603129,
+      "grad_norm": 8.479426383972168,
+      "learning_rate": 9.2530087774862e-05,
+      "loss": 8.2848,
+      "step": 6820
+    },
+    {
+      "epoch": 0.030947565593908218,
+      "grad_norm": 8.185149192810059,
+      "learning_rate": 9.280155642023345e-05,
+      "loss": 8.3037,
+      "step": 6840
+    },
+    {
+      "epoch": 0.03103805555178514,
+      "grad_norm": 8.295937538146973,
+      "learning_rate": 9.307302506560491e-05,
+      "loss": 8.3179,
+      "step": 6860
+    },
+    {
+      "epoch": 0.031128545509662067,
+      "grad_norm": 10.772727012634277,
+      "learning_rate": 9.334449371097638e-05,
+      "loss": 8.264,
+      "step": 6880
+    },
+    {
+      "epoch": 0.03121903546753899,
+      "grad_norm": 8.465076446533203,
+      "learning_rate": 9.361596235634784e-05,
+      "loss": 8.2303,
+      "step": 6900
+    },
+    {
+      "epoch": 0.031309525425415916,
+      "grad_norm": 9.096773147583008,
+      "learning_rate": 9.38874310017193e-05,
+      "loss": 8.2473,
+      "step": 6920
+    },
+    {
+      "epoch": 0.03140001538329284,
+      "grad_norm": 10.57555866241455,
+      "learning_rate": 9.415889964709077e-05,
+      "loss": 8.27,
+      "step": 6940
+    },
+    {
+      "epoch": 0.03149050534116976,
+      "grad_norm": 7.5089850425720215,
+      "learning_rate": 9.443036829246222e-05,
+      "loss": 8.27,
+      "step": 6960
+    },
+    {
+      "epoch": 0.03158099529904669,
+      "grad_norm": 10.865699768066406,
+      "learning_rate": 9.470183693783368e-05,
+      "loss": 8.2451,
+      "step": 6980
+    },
+    {
+      "epoch": 0.031671485256923614,
+      "grad_norm": 12.514881134033203,
+      "learning_rate": 9.497330558320513e-05,
+      "loss": 8.259,
+      "step": 7000
+    },
+    {
+      "epoch": 0.031761975214800536,
+      "grad_norm": 9.914373397827148,
+      "learning_rate": 9.524477422857659e-05,
+      "loss": 8.2727,
+      "step": 7020
+    },
+    {
+      "epoch": 0.03185246517267746,
+      "grad_norm": 7.3313984870910645,
+      "learning_rate": 9.551624287394806e-05,
+      "loss": 8.2421,
+      "step": 7040
+    },
+    {
+      "epoch": 0.03194295513055439,
+      "grad_norm": 5.989616394042969,
+      "learning_rate": 9.578771151931952e-05,
+      "loss": 8.2363,
+      "step": 7060
+    },
+    {
+      "epoch": 0.03203344508843131,
+      "grad_norm": 7.4773430824279785,
+      "learning_rate": 9.605918016469098e-05,
+      "loss": 8.2718,
+      "step": 7080
+    },
+    {
+      "epoch": 0.032123935046308234,
+      "grad_norm": 6.605820655822754,
+      "learning_rate": 9.633064881006243e-05,
+      "loss": 8.257,
+      "step": 7100
+    },
+    {
+      "epoch": 0.03221442500418516,
+      "grad_norm": 8.294914245605469,
+      "learning_rate": 9.658854402316532e-05,
+      "loss": 8.2478,
+      "step": 7120
+    },
+    {
+      "epoch": 0.03230491496206209,
+      "grad_norm": 10.011855125427246,
+      "learning_rate": 9.686001266853678e-05,
+      "loss": 8.2525,
+      "step": 7140
+    },
+    {
+      "epoch": 0.03239540491993901,
+      "grad_norm": 7.529365062713623,
+      "learning_rate": 9.713148131390823e-05,
+      "loss": 8.2728,
+      "step": 7160
+    },
+    {
+      "epoch": 0.03248589487781593,
+      "grad_norm": 8.781538009643555,
+      "learning_rate": 9.74029499592797e-05,
+      "loss": 8.2305,
+      "step": 7180
+    },
+    {
+      "epoch": 0.03257638483569286,
+      "grad_norm": 12.758204460144043,
+      "learning_rate": 9.767441860465116e-05,
+      "loss": 8.2382,
+      "step": 7200
+    },
+    {
+      "epoch": 0.032666874793569785,
+      "grad_norm": 10.523704528808594,
+      "learning_rate": 9.794588725002262e-05,
+      "loss": 8.2364,
+      "step": 7220
+    },
+    {
+      "epoch": 0.03275736475144671,
+      "grad_norm": 6.50457239151001,
+      "learning_rate": 9.821735589539409e-05,
+      "loss": 8.2384,
+      "step": 7240
+    },
+    {
+      "epoch": 0.03284785470932363,
+      "grad_norm": 9.191271781921387,
+      "learning_rate": 9.848882454076555e-05,
+      "loss": 8.2148,
+      "step": 7260
+    },
+    {
+      "epoch": 0.03293834466720056,
+      "grad_norm": 8.93270206451416,
+      "learning_rate": 9.8760293186137e-05,
+      "loss": 8.2352,
+      "step": 7280
+    },
+    {
+      "epoch": 0.03302883462507748,
+      "grad_norm": 9.895100593566895,
+      "learning_rate": 9.903176183150845e-05,
+      "loss": 8.2376,
+      "step": 7300
+    },
+    {
+      "epoch": 0.033119324582954406,
+      "grad_norm": 10.420171737670898,
+      "learning_rate": 9.930323047687991e-05,
+      "loss": 8.2479,
+      "step": 7320
+    },
+    {
+      "epoch": 0.03320981454083133,
+      "grad_norm": 9.649170875549316,
+      "learning_rate": 9.957469912225138e-05,
+      "loss": 8.2557,
+      "step": 7340
+    },
+    {
+      "epoch": 0.03330030449870826,
+      "grad_norm": 7.854948043823242,
+      "learning_rate": 9.984616776762284e-05,
+      "loss": 8.2145,
+      "step": 7360
+    },
+    {
+      "epoch": 0.03339079445658518,
+      "grad_norm": 8.486404418945312,
+      "learning_rate": 0.0001001176364129943,
+      "loss": 8.2132,
+      "step": 7380
+    },
+    {
+      "epoch": 0.033481284414462104,
+      "grad_norm": 11.286945343017578,
+      "learning_rate": 0.00010038910505836577,
+      "loss": 8.2169,
+      "step": 7400
+    },
+    {
+      "epoch": 0.033571774372339026,
+      "grad_norm": 6.662302494049072,
+      "learning_rate": 0.00010066057370373721,
+      "loss": 8.2318,
+      "step": 7420
+    },
+    {
+      "epoch": 0.033662264330215956,
+      "grad_norm": 10.467026710510254,
+      "learning_rate": 0.00010093204234910868,
+      "loss": 8.2089,
+      "step": 7440
+    },
+    {
+      "epoch": 0.03375275428809288,
+      "grad_norm": 12.113288879394531,
+      "learning_rate": 0.00010120351099448013,
+      "loss": 8.2194,
+      "step": 7460
+    },
+    {
+      "epoch": 0.0338432442459698,
+      "grad_norm": 13.295260429382324,
+      "learning_rate": 0.00010147497963985159,
+      "loss": 8.2526,
+      "step": 7480
+    },
+    {
+      "epoch": 0.03393373420384673,
+      "grad_norm": 9.79587173461914,
+      "learning_rate": 0.00010174644828522305,
+      "loss": 8.2253,
+      "step": 7500
+    },
+    {
+      "epoch": 0.034024224161723654,
+      "grad_norm": 10.251439094543457,
+      "learning_rate": 0.00010201791693059452,
+      "loss": 8.2248,
+      "step": 7520
+    },
+    {
+      "epoch": 0.03411471411960058,
+      "grad_norm": 10.583033561706543,
+      "learning_rate": 0.00010228938557596597,
+      "loss": 8.211,
+      "step": 7540
+    },
+    {
+      "epoch": 0.0342052040774775,
+      "grad_norm": 10.661384582519531,
+      "learning_rate": 0.00010256085422133743,
+      "loss": 8.2053,
+      "step": 7560
+    },
+    {
+      "epoch": 0.03429569403535443,
+      "grad_norm": 8.133881568908691,
+      "learning_rate": 0.0001028323228667089,
+      "loss": 8.1948,
+      "step": 7580
+    },
+    {
+      "epoch": 0.03438618399323135,
+      "grad_norm": 9.278162002563477,
+      "learning_rate": 0.00010310379151208036,
+      "loss": 8.2235,
+      "step": 7600
+    },
+    {
+      "epoch": 0.034476673951108275,
+      "grad_norm": 10.354171752929688,
+      "learning_rate": 0.00010337526015745181,
+      "loss": 8.1704,
+      "step": 7620
+    },
+    {
+      "epoch": 0.0345671639089852,
+      "grad_norm": 9.4600830078125,
+      "learning_rate": 0.00010364672880282327,
+      "loss": 8.2008,
+      "step": 7640
+    },
+    {
+      "epoch": 0.03465765386686213,
+      "grad_norm": 10.290422439575195,
+      "learning_rate": 0.00010391819744819473,
+      "loss": 8.2084,
+      "step": 7660
+    },
+    {
+      "epoch": 0.03474814382473905,
+      "grad_norm": 9.98493480682373,
+      "learning_rate": 0.00010418966609356618,
+      "loss": 8.1878,
+      "step": 7680
+    },
+    {
+      "epoch": 0.03483863378261597,
+      "grad_norm": 8.021723747253418,
+      "learning_rate": 0.00010446113473893765,
+      "loss": 8.1865,
+      "step": 7700
+    },
+    {
+      "epoch": 0.034929123740492896,
+      "grad_norm": 6.915677070617676,
+      "learning_rate": 0.00010473260338430911,
+      "loss": 8.1795,
+      "step": 7720
+    },
+    {
+      "epoch": 0.035019613698369825,
+      "grad_norm": 9.64877986907959,
+      "learning_rate": 0.00010500407202968057,
+      "loss": 8.1756,
+      "step": 7740
+    },
+    {
+      "epoch": 0.03511010365624675,
+      "grad_norm": 9.673460960388184,
+      "learning_rate": 0.00010527554067505204,
+      "loss": 8.1877,
+      "step": 7760
+    },
+    {
+      "epoch": 0.03520059361412367,
+      "grad_norm": 10.429800033569336,
+      "learning_rate": 0.0001055470093204235,
+      "loss": 8.1803,
+      "step": 7780
+    },
+    {
+      "epoch": 0.0352910835720006,
+      "grad_norm": 9.610269546508789,
+      "learning_rate": 0.00010581847796579494,
+      "loss": 8.214,
+      "step": 7800
+    },
+    {
+      "epoch": 0.03538157352987752,
+      "grad_norm": 9.696439743041992,
+      "learning_rate": 0.0001060899466111664,
+      "loss": 8.1585,
+      "step": 7820
+    },
+    {
+      "epoch": 0.035472063487754446,
+      "grad_norm": 10.302108764648438,
+      "learning_rate": 0.00010636141525653786,
+      "loss": 8.1495,
+      "step": 7840
+    },
+    {
+      "epoch": 0.03556255344563137,
+      "grad_norm": 10.439906120300293,
+      "learning_rate": 0.00010663288390190933,
+      "loss": 8.1636,
+      "step": 7860
+    },
+    {
+      "epoch": 0.0356530434035083,
+      "grad_norm": 13.941293716430664,
+      "learning_rate": 0.00010690435254728079,
+      "loss": 8.1674,
+      "step": 7880
+    },
+    {
+      "epoch": 0.03574353336138522,
+      "grad_norm": 11.378789901733398,
+      "learning_rate": 0.00010717582119265225,
+      "loss": 8.1704,
+      "step": 7900
+    },
+    {
+      "epoch": 0.035834023319262144,
+      "grad_norm": 10.802684783935547,
+      "learning_rate": 0.00010744728983802372,
+      "loss": 8.1902,
+      "step": 7920
+    },
+    {
+      "epoch": 0.03592451327713907,
+      "grad_norm": 13.995284080505371,
+      "learning_rate": 0.00010771875848339517,
+      "loss": 8.1502,
+      "step": 7940
+    },
+    {
+      "epoch": 0.036015003235016,
+      "grad_norm": 11.473008155822754,
+      "learning_rate": 0.00010799022712876663,
+      "loss": 8.2082,
+      "step": 7960
+    },
+    {
+      "epoch": 0.03610549319289292,
+      "grad_norm": 9.314510345458984,
+      "learning_rate": 0.00010826169577413808,
+      "loss": 8.19,
+      "step": 7980
+    },
+    {
+      "epoch": 0.03619598315076984,
+      "grad_norm": 11.141118049621582,
+      "learning_rate": 0.00010853316441950954,
+      "loss": 8.2093,
+      "step": 8000
+    },
+    {
+      "epoch": 0.03619598315076984,
+      "eval_accuracy": 0.11013720949528932,
+      "eval_loss": 8.173333168029785,
+      "eval_runtime": 219.4541,
+      "eval_samples_per_second": 2769.782,
+      "eval_steps_per_second": 10.822,
+      "step": 8000
     }
   ],
   "logging_steps": 20,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 3,
   "save_steps": 100,
+  "total_flos": 2876724215808000.0,
   "train_batch_size": 256,
   "trial_name": null,
   "trial_params": null