Training in progress, step 10000

Browse files

Files changed (6) hide show

model.safetensors +1 -1
optimizer.pt +1 -1
rng_state.pth +1 -1
runs/Jun07_12-33-16_DESKTOP-69FPKCK/events.out.tfevents.1717788805.DESKTOP-69FPKCK +2 -2
scheduler.pt +1 -1
trainer_state.json +712 -3

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f363676186105730c1d25197e2c0305ebfbf6706de03ba2c07404ec86bf25c03
 size 44644496

 version https://git-lfs.github.com/spec/v1
+oid sha256:90e0cf7ce1f314bfb364f44b1b9395ac7510ae076e5664e54d0b81cb80e0f2d4
 size 44644496

optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:abda9e7a12534ef2affd8d0c860673e26661a5152bce292672896e64d2a0cdaf
 size 11230198

 version https://git-lfs.github.com/spec/v1
+oid sha256:bf57752b7fe8b0f27d2c11bd437ea0d0546c101a5edf6c2b8ef58464ff8128e9
 size 11230198

rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6b3ee827a7a00012c0a116546df467feee35e70376d81a7a85b1a70eb90414d3
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:c062f7f375beded48b5337f5a3f3a5cb38807fa3e85dbf3e294c0ab6b627bfc2
 size 14244

runs/Jun07_12-33-16_DESKTOP-69FPKCK/events.out.tfevents.1717788805.DESKTOP-69FPKCK CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:edcf8641734c1d64588936cae32831ed874d304e627a7c2312709108b8fd418e
-size 110072

 version https://git-lfs.github.com/spec/v1
+oid sha256:be506d81ada1b7dc41d27acbdb06762dd92f15fc140e5efbef58d1d2d1e179db
+size 111338

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5b35aaeefae1777c5b0cc2a6a699a6e86dbf10049e0c78d4a59c18dcf3571dfd
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:c903d70ba831f7bc91d767743519849df9eeb11f7c11a55a187111672ce37e65
 size 1064

trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.03619598315076984,
   "eval_steps": 2000,
-  "global_step": 8000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -2843,6 +2843,715 @@
       "eval_samples_per_second": 2769.782,
       "eval_steps_per_second": 10.822,
       "step": 8000
     }
   ],
   "logging_steps": 20,
@@ -2850,7 +3559,7 @@
   "num_input_tokens_seen": 0,
   "num_train_epochs": 3,
   "save_steps": 100,
-  "total_flos": 2876724215808000.0,
   "train_batch_size": 256,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.045244978938462306,
   "eval_steps": 2000,
+  "global_step": 10000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 2769.782,
       "eval_steps_per_second": 10.822,
       "step": 8000
+    },
+    {
+      "epoch": 0.036286473108646765,
+      "grad_norm": 12.62540054321289,
+      "learning_rate": 0.000108804633064881,
+      "loss": 8.1561,
+      "step": 8020
+    },
+    {
+      "epoch": 0.036376963066523695,
+      "grad_norm": 12.97541332244873,
+      "learning_rate": 0.00010907610171025247,
+      "loss": 8.1708,
+      "step": 8040
+    },
+    {
+      "epoch": 0.03646745302440062,
+      "grad_norm": 8.305766105651855,
+      "learning_rate": 0.00010934757035562392,
+      "loss": 8.1671,
+      "step": 8060
+    },
+    {
+      "epoch": 0.03655794298227754,
+      "grad_norm": 14.076859474182129,
+      "learning_rate": 0.00010961903900099538,
+      "loss": 8.1659,
+      "step": 8080
+    },
+    {
+      "epoch": 0.03664843294015447,
+      "grad_norm": 11.951278686523438,
+      "learning_rate": 0.00010989050764636684,
+      "loss": 8.1893,
+      "step": 8100
+    },
+    {
+      "epoch": 0.03673892289803139,
+      "grad_norm": 10.796624183654785,
+      "learning_rate": 0.00011016197629173831,
+      "loss": 8.1942,
+      "step": 8120
+    },
+    {
+      "epoch": 0.036829412855908315,
+      "grad_norm": 10.49177074432373,
+      "learning_rate": 0.00011043344493710976,
+      "loss": 8.1589,
+      "step": 8140
+    },
+    {
+      "epoch": 0.03691990281378524,
+      "grad_norm": 12.82060432434082,
+      "learning_rate": 0.00011070491358248122,
+      "loss": 8.1957,
+      "step": 8160
+    },
+    {
+      "epoch": 0.03701039277166217,
+      "grad_norm": 11.00941276550293,
+      "learning_rate": 0.00011097638222785267,
+      "loss": 8.1609,
+      "step": 8180
+    },
+    {
+      "epoch": 0.03710088272953909,
+      "grad_norm": 10.24111270904541,
+      "learning_rate": 0.00011124785087322413,
+      "loss": 8.1769,
+      "step": 8200
+    },
+    {
+      "epoch": 0.03719137268741601,
+      "grad_norm": 11.292909622192383,
+      "learning_rate": 0.0001115193195185956,
+      "loss": 8.1628,
+      "step": 8220
+    },
+    {
+      "epoch": 0.037281862645292936,
+      "grad_norm": 9.362674713134766,
+      "learning_rate": 0.00011179078816396706,
+      "loss": 8.1638,
+      "step": 8240
+    },
+    {
+      "epoch": 0.037372352603169866,
+      "grad_norm": 12.9249906539917,
+      "learning_rate": 0.00011206225680933852,
+      "loss": 8.1957,
+      "step": 8260
+    },
+    {
+      "epoch": 0.03746284256104679,
+      "grad_norm": 10.386489868164062,
+      "learning_rate": 0.00011233372545470999,
+      "loss": 8.1525,
+      "step": 8280
+    },
+    {
+      "epoch": 0.03755333251892371,
+      "grad_norm": 12.65300464630127,
+      "learning_rate": 0.00011260519410008144,
+      "loss": 8.1558,
+      "step": 8300
+    },
+    {
+      "epoch": 0.037643822476800634,
+      "grad_norm": 11.562602996826172,
+      "learning_rate": 0.0001128766627454529,
+      "loss": 8.148,
+      "step": 8320
+    },
+    {
+      "epoch": 0.037734312434677564,
+      "grad_norm": 14.783183097839355,
+      "learning_rate": 0.00011314813139082436,
+      "loss": 8.1448,
+      "step": 8340
+    },
+    {
+      "epoch": 0.03782480239255449,
+      "grad_norm": 15.469168663024902,
+      "learning_rate": 0.00011341960003619583,
+      "loss": 8.1801,
+      "step": 8360
+    },
+    {
+      "epoch": 0.03791529235043141,
+      "grad_norm": 11.361299514770508,
+      "learning_rate": 0.00011369106868156726,
+      "loss": 8.1549,
+      "step": 8380
+    },
+    {
+      "epoch": 0.03800578230830833,
+      "grad_norm": 9.814708709716797,
+      "learning_rate": 0.00011396253732693873,
+      "loss": 8.1663,
+      "step": 8400
+    },
+    {
+      "epoch": 0.03809627226618526,
+      "grad_norm": 10.522832870483398,
+      "learning_rate": 0.00011423400597231019,
+      "loss": 8.1459,
+      "step": 8420
+    },
+    {
+      "epoch": 0.038186762224062185,
+      "grad_norm": 10.637961387634277,
+      "learning_rate": 0.00011450547461768165,
+      "loss": 8.1554,
+      "step": 8440
+    },
+    {
+      "epoch": 0.03827725218193911,
+      "grad_norm": 14.578750610351562,
+      "learning_rate": 0.00011477694326305312,
+      "loss": 8.1758,
+      "step": 8460
+    },
+    {
+      "epoch": 0.03836774213981604,
+      "grad_norm": 12.179791450500488,
+      "learning_rate": 0.00011504841190842457,
+      "loss": 8.1117,
+      "step": 8480
+    },
+    {
+      "epoch": 0.03845823209769296,
+      "grad_norm": 11.189960479736328,
+      "learning_rate": 0.00011531988055379603,
+      "loss": 8.1517,
+      "step": 8500
+    },
+    {
+      "epoch": 0.03854872205556988,
+      "grad_norm": 11.662614822387695,
+      "learning_rate": 0.00011559134919916749,
+      "loss": 8.129,
+      "step": 8520
+    },
+    {
+      "epoch": 0.038639212013446805,
+      "grad_norm": 9.089029312133789,
+      "learning_rate": 0.00011584924441227038,
+      "loss": 8.1452,
+      "step": 8540
+    },
+    {
+      "epoch": 0.038729701971323735,
+      "grad_norm": 15.1500825881958,
+      "learning_rate": 0.00011612071305764184,
+      "loss": 8.1623,
+      "step": 8560
+    },
+    {
+      "epoch": 0.03882019192920066,
+      "grad_norm": 15.177955627441406,
+      "learning_rate": 0.0001163921817030133,
+      "loss": 8.1138,
+      "step": 8580
+    },
+    {
+      "epoch": 0.03891068188707758,
+      "grad_norm": 9.620798110961914,
+      "learning_rate": 0.00011666365034838476,
+      "loss": 8.1472,
+      "step": 8600
+    },
+    {
+      "epoch": 0.0390011718449545,
+      "grad_norm": 13.227412223815918,
+      "learning_rate": 0.00011693511899375622,
+      "loss": 8.1436,
+      "step": 8620
+    },
+    {
+      "epoch": 0.03909166180283143,
+      "grad_norm": 12.561627388000488,
+      "learning_rate": 0.00011720658763912768,
+      "loss": 8.1478,
+      "step": 8640
+    },
+    {
+      "epoch": 0.039182151760708356,
+      "grad_norm": 12.864951133728027,
+      "learning_rate": 0.00011747805628449915,
+      "loss": 8.1727,
+      "step": 8660
+    },
+    {
+      "epoch": 0.03927264171858528,
+      "grad_norm": 12.883962631225586,
+      "learning_rate": 0.00011774952492987061,
+      "loss": 8.1396,
+      "step": 8680
+    },
+    {
+      "epoch": 0.0393631316764622,
+      "grad_norm": 7.435621738433838,
+      "learning_rate": 0.00011802099357524204,
+      "loss": 8.1774,
+      "step": 8700
+    },
+    {
+      "epoch": 0.03945362163433913,
+      "grad_norm": 12.7384672164917,
+      "learning_rate": 0.00011829246222061351,
+      "loss": 8.1297,
+      "step": 8720
+    },
+    {
+      "epoch": 0.039544111592216054,
+      "grad_norm": 14.0343017578125,
+      "learning_rate": 0.00011856393086598497,
+      "loss": 8.1406,
+      "step": 8740
+    },
+    {
+      "epoch": 0.03963460155009298,
+      "grad_norm": 15.325870513916016,
+      "learning_rate": 0.00011883539951135643,
+      "loss": 8.1619,
+      "step": 8760
+    },
+    {
+      "epoch": 0.039725091507969906,
+      "grad_norm": 21.650548934936523,
+      "learning_rate": 0.00011910686815672788,
+      "loss": 8.193,
+      "step": 8780
+    },
+    {
+      "epoch": 0.03981558146584683,
+      "grad_norm": 15.605712890625,
+      "learning_rate": 0.00011937833680209935,
+      "loss": 8.1709,
+      "step": 8800
+    },
+    {
+      "epoch": 0.03990607142372375,
+      "grad_norm": 10.788895606994629,
+      "learning_rate": 0.00011964980544747081,
+      "loss": 8.1451,
+      "step": 8820
+    },
+    {
+      "epoch": 0.039996561381600675,
+      "grad_norm": 16.377477645874023,
+      "learning_rate": 0.00011992127409284227,
+      "loss": 8.134,
+      "step": 8840
+    },
+    {
+      "epoch": 0.040087051339477604,
+      "grad_norm": 13.106194496154785,
+      "learning_rate": 0.00012019274273821374,
+      "loss": 8.1352,
+      "step": 8860
+    },
+    {
+      "epoch": 0.04017754129735453,
+      "grad_norm": 11.152835845947266,
+      "learning_rate": 0.0001204642113835852,
+      "loss": 8.1138,
+      "step": 8880
+    },
+    {
+      "epoch": 0.04026803125523145,
+      "grad_norm": 9.210712432861328,
+      "learning_rate": 0.00012073568002895666,
+      "loss": 8.1769,
+      "step": 8900
+    },
+    {
+      "epoch": 0.04035852121310837,
+      "grad_norm": 12.555234909057617,
+      "learning_rate": 0.00012100714867432813,
+      "loss": 8.1383,
+      "step": 8920
+    },
+    {
+      "epoch": 0.0404490111709853,
+      "grad_norm": 12.013688087463379,
+      "learning_rate": 0.00012127861731969958,
+      "loss": 8.1564,
+      "step": 8940
+    },
+    {
+      "epoch": 0.040539501128862225,
+      "grad_norm": 9.827411651611328,
+      "learning_rate": 0.00012155008596507101,
+      "loss": 8.1348,
+      "step": 8960
+    },
+    {
+      "epoch": 0.04062999108673915,
+      "grad_norm": 11.609356880187988,
+      "learning_rate": 0.00012182155461044248,
+      "loss": 8.1646,
+      "step": 8980
+    },
+    {
+      "epoch": 0.04072048104461607,
+      "grad_norm": 13.045088768005371,
+      "learning_rate": 0.00012209302325581395,
+      "loss": 8.1628,
+      "step": 9000
+    },
+    {
+      "epoch": 0.040810971002493,
+      "grad_norm": 12.780691146850586,
+      "learning_rate": 0.00012236449190118542,
+      "loss": 8.1487,
+      "step": 9020
+    },
+    {
+      "epoch": 0.04090146096036992,
+      "grad_norm": 10.65334701538086,
+      "learning_rate": 0.00012263596054655685,
+      "loss": 8.1275,
+      "step": 9040
+    },
+    {
+      "epoch": 0.040991950918246846,
+      "grad_norm": 8.080134391784668,
+      "learning_rate": 0.00012290742919192832,
+      "loss": 8.1356,
+      "step": 9060
+    },
+    {
+      "epoch": 0.041082440876123776,
+      "grad_norm": 12.708916664123535,
+      "learning_rate": 0.00012317889783729978,
+      "loss": 8.1606,
+      "step": 9080
+    },
+    {
+      "epoch": 0.0411729308340007,
+      "grad_norm": 13.570298194885254,
+      "learning_rate": 0.00012345036648267124,
+      "loss": 8.1389,
+      "step": 9100
+    },
+    {
+      "epoch": 0.04126342079187762,
+      "grad_norm": 13.237983703613281,
+      "learning_rate": 0.0001237218351280427,
+      "loss": 8.1243,
+      "step": 9120
+    },
+    {
+      "epoch": 0.041353910749754544,
+      "grad_norm": 14.53023910522461,
+      "learning_rate": 0.00012399330377341417,
+      "loss": 8.1191,
+      "step": 9140
+    },
+    {
+      "epoch": 0.041444400707631474,
+      "grad_norm": 11.765192031860352,
+      "learning_rate": 0.00012426477241878563,
+      "loss": 8.1031,
+      "step": 9160
+    },
+    {
+      "epoch": 0.041534890665508396,
+      "grad_norm": 11.261069297790527,
+      "learning_rate": 0.0001245362410641571,
+      "loss": 8.1504,
+      "step": 9180
+    },
+    {
+      "epoch": 0.04162538062338532,
+      "grad_norm": 13.039865493774414,
+      "learning_rate": 0.00012480770970952856,
+      "loss": 8.1186,
+      "step": 9200
+    },
+    {
+      "epoch": 0.04171587058126224,
+      "grad_norm": 11.21242904663086,
+      "learning_rate": 0.0001250791783549,
+      "loss": 8.1244,
+      "step": 9220
+    },
+    {
+      "epoch": 0.04180636053913917,
+      "grad_norm": 13.84521770477295,
+      "learning_rate": 0.00012535064700027146,
+      "loss": 8.1442,
+      "step": 9240
+    },
+    {
+      "epoch": 0.041896850497016094,
+      "grad_norm": 14.333518981933594,
+      "learning_rate": 0.00012562211564564292,
+      "loss": 8.1628,
+      "step": 9260
+    },
+    {
+      "epoch": 0.04198734045489302,
+      "grad_norm": 12.016851425170898,
+      "learning_rate": 0.00012589358429101438,
+      "loss": 8.1037,
+      "step": 9280
+    },
+    {
+      "epoch": 0.04207783041276994,
+      "grad_norm": 9.183259010314941,
+      "learning_rate": 0.00012616505293638585,
+      "loss": 8.1429,
+      "step": 9300
+    },
+    {
+      "epoch": 0.04216832037064687,
+      "grad_norm": 13.651033401489258,
+      "learning_rate": 0.0001264365215817573,
+      "loss": 8.1202,
+      "step": 9320
+    },
+    {
+      "epoch": 0.04225881032852379,
+      "grad_norm": 11.869391441345215,
+      "learning_rate": 0.00012670799022712877,
+      "loss": 8.1125,
+      "step": 9340
+    },
+    {
+      "epoch": 0.042349300286400715,
+      "grad_norm": 15.943286895751953,
+      "learning_rate": 0.00012697945887250024,
+      "loss": 8.1694,
+      "step": 9360
+    },
+    {
+      "epoch": 0.04243979024427764,
+      "grad_norm": 13.450387001037598,
+      "learning_rate": 0.00012725092751787167,
+      "loss": 8.1379,
+      "step": 9380
+    },
+    {
+      "epoch": 0.04253028020215457,
+      "grad_norm": 15.152196884155273,
+      "learning_rate": 0.00012752239616324314,
+      "loss": 8.1391,
+      "step": 9400
+    },
+    {
+      "epoch": 0.04262077016003149,
+      "grad_norm": 15.109274864196777,
+      "learning_rate": 0.0001277938648086146,
+      "loss": 8.0963,
+      "step": 9420
+    },
+    {
+      "epoch": 0.04271126011790841,
+      "grad_norm": 10.3173189163208,
+      "learning_rate": 0.00012806533345398606,
+      "loss": 8.1557,
+      "step": 9440
+    },
+    {
+      "epoch": 0.04280175007578534,
+      "grad_norm": 11.38595962524414,
+      "learning_rate": 0.00012833680209935753,
+      "loss": 8.173,
+      "step": 9460
+    },
+    {
+      "epoch": 0.042892240033662266,
+      "grad_norm": 11.458219528198242,
+      "learning_rate": 0.00012859469731246043,
+      "loss": 8.2542,
+      "step": 9480
+    },
+    {
+      "epoch": 0.04298272999153919,
+      "grad_norm": 14.253256797790527,
+      "learning_rate": 0.00012886616595783186,
+      "loss": 8.1687,
+      "step": 9500
+    },
+    {
+      "epoch": 0.04307321994941611,
+      "grad_norm": 14.074560165405273,
+      "learning_rate": 0.00012913763460320333,
+      "loss": 8.1175,
+      "step": 9520
+    },
+    {
+      "epoch": 0.04316370990729304,
+      "grad_norm": 14.521282196044922,
+      "learning_rate": 0.00012939552981630623,
+      "loss": 8.1456,
+      "step": 9540
+    },
+    {
+      "epoch": 0.043254199865169964,
+      "grad_norm": 12.537208557128906,
+      "learning_rate": 0.0001296669984616777,
+      "loss": 8.1432,
+      "step": 9560
+    },
+    {
+      "epoch": 0.043344689823046886,
+      "grad_norm": 10.885902404785156,
+      "learning_rate": 0.00012993846710704915,
+      "loss": 8.1875,
+      "step": 9580
+    },
+    {
+      "epoch": 0.04343517978092381,
+      "grad_norm": 10.156676292419434,
+      "learning_rate": 0.0001302099357524206,
+      "loss": 8.1728,
+      "step": 9600
+    },
+    {
+      "epoch": 0.04352566973880074,
+      "grad_norm": 13.31322193145752,
+      "learning_rate": 0.00013048140439779205,
+      "loss": 8.1394,
+      "step": 9620
+    },
+    {
+      "epoch": 0.04361615969667766,
+      "grad_norm": 7.779819488525391,
+      "learning_rate": 0.0001307528730431635,
+      "loss": 8.139,
+      "step": 9640
+    },
+    {
+      "epoch": 0.043706649654554584,
+      "grad_norm": 12.208565711975098,
+      "learning_rate": 0.00013102434168853495,
+      "loss": 8.1346,
+      "step": 9660
+    },
+    {
+      "epoch": 0.04379713961243151,
+      "grad_norm": 11.362008094787598,
+      "learning_rate": 0.00013129581033390642,
+      "loss": 8.1419,
+      "step": 9680
+    },
+    {
+      "epoch": 0.04388762957030844,
+      "grad_norm": 11.86789321899414,
+      "learning_rate": 0.00013156727897927788,
+      "loss": 8.1475,
+      "step": 9700
+    },
+    {
+      "epoch": 0.04397811952818536,
+      "grad_norm": 14.61185073852539,
+      "learning_rate": 0.00013183874762464934,
+      "loss": 8.1582,
+      "step": 9720
+    },
+    {
+      "epoch": 0.04406860948606228,
+      "grad_norm": 11.60112190246582,
+      "learning_rate": 0.0001321102162700208,
+      "loss": 8.1073,
+      "step": 9740
+    },
+    {
+      "epoch": 0.04415909944393921,
+      "grad_norm": 13.442856788635254,
+      "learning_rate": 0.00013238168491539227,
+      "loss": 8.1358,
+      "step": 9760
+    },
+    {
+      "epoch": 0.044249589401816135,
+      "grad_norm": 11.524395942687988,
+      "learning_rate": 0.00013265315356076373,
+      "loss": 8.1083,
+      "step": 9780
+    },
+    {
+      "epoch": 0.04434007935969306,
+      "grad_norm": 13.528814315795898,
+      "learning_rate": 0.0001329246222061352,
+      "loss": 8.1392,
+      "step": 9800
+    },
+    {
+      "epoch": 0.04443056931756998,
+      "grad_norm": 18.11868667602539,
+      "learning_rate": 0.00013319609085150666,
+      "loss": 8.1784,
+      "step": 9820
+    },
+    {
+      "epoch": 0.04452105927544691,
+      "grad_norm": 15.858280181884766,
+      "learning_rate": 0.00013346755949687812,
+      "loss": 8.1597,
+      "step": 9840
+    },
+    {
+      "epoch": 0.04461154923332383,
+      "grad_norm": 14.466769218444824,
+      "learning_rate": 0.00013373902814224956,
+      "loss": 8.1632,
+      "step": 9860
+    },
+    {
+      "epoch": 0.044702039191200756,
+      "grad_norm": 11.416616439819336,
+      "learning_rate": 0.00013401049678762102,
+      "loss": 8.1681,
+      "step": 9880
+    },
+    {
+      "epoch": 0.04479252914907768,
+      "grad_norm": 39.87081527709961,
+      "learning_rate": 0.00013428196543299249,
+      "loss": 8.1384,
+      "step": 9900
+    },
+    {
+      "epoch": 0.04488301910695461,
+      "grad_norm": 11.689374923706055,
+      "learning_rate": 0.0001345398606460954,
+      "loss": 8.5619,
+      "step": 9920
+    },
+    {
+      "epoch": 0.04497350906483153,
+      "grad_norm": 10.53484058380127,
+      "learning_rate": 0.00013481132929146682,
+      "loss": 9.1495,
+      "step": 9940
+    },
+    {
+      "epoch": 0.045063999022708454,
+      "grad_norm": 12.07006549835205,
+      "learning_rate": 0.00013508279793683829,
+      "loss": 9.1771,
+      "step": 9960
+    },
+    {
+      "epoch": 0.045154488980585376,
+      "grad_norm": 9.795348167419434,
+      "learning_rate": 0.00013535426658220975,
+      "loss": 9.1545,
+      "step": 9980
+    },
+    {
+      "epoch": 0.045244978938462306,
+      "grad_norm": 10.068339347839355,
+      "learning_rate": 0.0001356257352275812,
+      "loss": 9.1969,
+      "step": 10000
+    },
+    {
+      "epoch": 0.045244978938462306,
+      "eval_accuracy": 0.022879129772772476,
+      "eval_loss": 9.148832321166992,
+      "eval_runtime": 212.7494,
+      "eval_samples_per_second": 2857.071,
+      "eval_steps_per_second": 11.163,
+      "step": 10000
     }
   ],
   "logging_steps": 20,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 3,
   "save_steps": 100,
+  "total_flos": 3595905269760000.0,
   "train_batch_size": 256,
   "trial_name": null,
   "trial_params": null