Upload 9 files

Browse files

Files changed (8) hide show

config.json +2 -2
model.safetensors +1 -1
rng_state.pth +3 -0
scheduler.pt +1 -1
tokenizer.json +16 -2
tokenizer_config.json +7 -0
trainer_state.json +687 -680
training_args.bin +2 -2

config.json CHANGED Viewed

@@ -4,7 +4,7 @@
   ],
   "attention_bias": false,
   "attention_dropout": 0.0,
-  "attn_implementation": "flash_attention_2",
   "bos_token_id": 50281,
   "classifier_activation": "gelu",
   "classifier_bias": false,
@@ -41,6 +41,6 @@
   "sparse_pred_ignore_index": -100,
   "sparse_prediction": false,
   "torch_dtype": "float32",
-  "transformers_version": "4.54.0",
   "vocab_size": 50368
 }

   ],
   "attention_bias": false,
   "attention_dropout": 0.0,
+  "attn_implementation": "sdpa",
   "bos_token_id": 50281,
   "classifier_activation": "gelu",
   "classifier_bias": false,
   "sparse_pred_ignore_index": -100,
   "sparse_prediction": false,
   "torch_dtype": "float32",
+  "transformers_version": "4.55.4",
   "vocab_size": 50368
 }

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2fcb90912bf8a5e13d4b275773666d2c047446e5f2ab572b02420914e69cdc0a
 size 1583544840

 version https://git-lfs.github.com/spec/v1
+oid sha256:50ab72ee4006e90d132d254bd261b095cdf9f599f538918a37986c6071d1773e
 size 1583544840

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cd7d1ab5fa201d20d98db247d9f5d0ebc8dcb20aa2e1128cb5af2b40e8ae23a1
+size 14645

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e7a9c66bbb17dafafd22a57810bb9248986283807454f0cb603b90488f8220f2
 size 1465

 version https://git-lfs.github.com/spec/v1
+oid sha256:683f9d438efb114c8ac5a1515e4472a3e865f2fc3aea8a4b95df341b9ab5537f
 size 1465

tokenizer.json CHANGED Viewed

@@ -1,7 +1,21 @@
 {
   "version": "1.0",
-  "truncation": null,
-  "padding": null,
   "added_tokens": [
     {
       "id": 0,

 {
   "version": "1.0",
+  "truncation": {
+    "direction": "Right",
+    "max_length": 512,
+    "strategy": "LongestFirst",
+    "stride": 0
+  },
+  "padding": {
+    "strategy": {
+      "Fixed": 512
+    },
+    "direction": "Right",
+    "pad_to_multiple_of": null,
+    "pad_id": 50283,
+    "pad_type_id": 0,
+    "pad_token": "[PAD]"
+  },
   "added_tokens": [
     {
       "id": 0,

tokenizer_config.json CHANGED Viewed

@@ -933,13 +933,20 @@
   "cls_token": "[CLS]",
   "extra_special_tokens": {},
   "mask_token": "[MASK]",
   "model_input_names": [
     "input_ids",
     "attention_mask"
   ],
   "model_max_length": 512,
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
   "tokenizer_class": "PreTrainedTokenizerFast",
   "unk_token": "[UNK]"
 }

   "cls_token": "[CLS]",
   "extra_special_tokens": {},
   "mask_token": "[MASK]",
+  "max_length": 512,
   "model_input_names": [
     "input_ids",
     "attention_mask"
   ],
   "model_max_length": 512,
+  "pad_to_multiple_of": null,
   "pad_token": "[PAD]",
+  "pad_token_type_id": 0,
+  "padding_side": "right",
   "sep_token": "[SEP]",
+  "stride": 0,
   "tokenizer_class": "PreTrainedTokenizerFast",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
   "unk_token": "[UNK]"
 }

trainer_state.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.4771220000954244,
   "eval_steps": 1000,
   "global_step": 15000,
   "is_hyper_param_search": false,
@@ -10,1180 +10,1187 @@
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.003180813333969496,
-      "grad_norm": 1.571864128112793,
-      "learning_rate": 2.5142857142857143e-05,
-      "loss": 9.1183,
       "step": 100
     },
     {
-      "epoch": 0.006361626667938992,
-      "grad_norm": 3.1469738483428955,
-      "learning_rate": 5.053968253968254e-05,
-      "loss": 6.8369,
       "step": 200
     },
     {
-      "epoch": 0.009542440001908488,
-      "grad_norm": 1.7220861911773682,
-      "learning_rate": 7.593650793650794e-05,
-      "loss": 5.6821,
       "step": 300
     },
     {
-      "epoch": 0.012723253335877984,
-      "grad_norm": 1.6516789197921753,
-      "learning_rate": 7.999856221185598e-05,
-      "loss": 4.8522,
       "step": 400
     },
     {
-      "epoch": 0.01590406666984748,
-      "grad_norm": 1.4884405136108398,
-      "learning_rate": 7.99931013821163e-05,
-      "loss": 4.4132,
       "step": 500
     },
     {
-      "epoch": 0.019084880003816976,
-      "grad_norm": 1.4667552709579468,
-      "learning_rate": 7.998356590220083e-05,
-      "loss": 4.1364,
       "step": 600
     },
     {
-      "epoch": 0.022265693337786472,
-      "grad_norm": 1.3577353954315186,
-      "learning_rate": 7.996995674362075e-05,
-      "loss": 3.9333,
       "step": 700
     },
     {
-      "epoch": 0.025446506671755968,
-      "grad_norm": 1.5057123899459839,
-      "learning_rate": 7.995227529292917e-05,
-      "loss": 3.7778,
       "step": 800
     },
     {
-      "epoch": 0.028627320005725463,
-      "grad_norm": 1.2254068851470947,
-      "learning_rate": 7.993052335157994e-05,
-      "loss": 3.6373,
       "step": 900
     },
     {
-      "epoch": 0.03180813333969496,
-      "grad_norm": 1.2805088758468628,
-      "learning_rate": 7.99047031357441e-05,
-      "loss": 3.5242,
       "step": 1000
     },
     {
-      "epoch": 0.03180813333969496,
-      "eval_loss": 3.466144561767578,
-      "eval_runtime": 83.4397,
-      "eval_samples_per_second": 193.11,
-      "eval_steps_per_second": 0.384,
       "step": 1000
     },
     {
-      "epoch": 0.03498894667366446,
-      "grad_norm": 1.2110868692398071,
-      "learning_rate": 7.987481727608398e-05,
-      "loss": 3.4105,
       "step": 1100
     },
     {
-      "epoch": 0.03816976000763395,
-      "grad_norm": 1.2545825242996216,
-      "learning_rate": 7.984086881748532e-05,
-      "loss": 3.3196,
       "step": 1200
     },
     {
-      "epoch": 0.04135057334160345,
-      "grad_norm": 1.193214774131775,
-      "learning_rate": 7.980286121874697e-05,
-      "loss": 3.2374,
       "step": 1300
     },
     {
-      "epoch": 0.044531386675572944,
-      "grad_norm": 1.2139487266540527,
-      "learning_rate": 7.976079835222848e-05,
-      "loss": 3.1653,
       "step": 1400
     },
     {
-      "epoch": 0.04771220000954244,
-      "grad_norm": 1.1818259954452515,
-      "learning_rate": 7.971468450345564e-05,
-      "loss": 3.1029,
       "step": 1500
     },
     {
-      "epoch": 0.050893013343511935,
-      "grad_norm": 1.1244895458221436,
-      "learning_rate": 7.966452437068377e-05,
-      "loss": 3.0479,
       "step": 1600
     },
     {
-      "epoch": 0.05407382667748143,
-      "grad_norm": 1.1834404468536377,
-      "learning_rate": 7.961032306441911e-05,
-      "loss": 2.9967,
       "step": 1700
     },
     {
-      "epoch": 0.057254640011450926,
-      "grad_norm": 1.0691077709197998,
-      "learning_rate": 7.95520861068981e-05,
-      "loss": 2.9565,
       "step": 1800
     },
     {
-      "epoch": 0.06043545334542042,
-      "grad_norm": 1.0717471837997437,
-      "learning_rate": 7.948981943152477e-05,
-      "loss": 2.9167,
       "step": 1900
     },
     {
-      "epoch": 0.06361626667938992,
-      "grad_norm": 1.0454130172729492,
-      "learning_rate": 7.942352938226626e-05,
-      "loss": 2.8795,
       "step": 2000
     },
     {
-      "epoch": 0.06361626667938992,
-      "eval_loss": 2.8664157390594482,
-      "eval_runtime": 74.1422,
-      "eval_samples_per_second": 217.326,
-      "eval_steps_per_second": 0.432,
       "step": 2000
     },
     {
-      "epoch": 0.06679708001335942,
-      "grad_norm": 1.0212640762329102,
-      "learning_rate": 7.935322271300637e-05,
-      "loss": 2.8447,
       "step": 2100
     },
     {
-      "epoch": 0.06997789334732891,
-      "grad_norm": 0.9942176342010498,
-      "learning_rate": 7.927890658685757e-05,
-      "loss": 2.8107,
       "step": 2200
     },
     {
-      "epoch": 0.07315870668129841,
-      "grad_norm": 1.0249791145324707,
-      "learning_rate": 7.92005885754311e-05,
-      "loss": 2.7803,
       "step": 2300
     },
     {
-      "epoch": 0.0763395200152679,
-      "grad_norm": 0.9678166508674622,
-      "learning_rate": 7.911827665806558e-05,
-      "loss": 2.7587,
       "step": 2400
     },
     {
-      "epoch": 0.0795203333492374,
-      "grad_norm": 0.9619715213775635,
-      "learning_rate": 7.90319792210141e-05,
-      "loss": 2.7303,
       "step": 2500
     },
     {
-      "epoch": 0.0827011466832069,
-      "grad_norm": 0.9805331826210022,
-      "learning_rate": 7.894170505658968e-05,
-      "loss": 2.706,
       "step": 2600
     },
     {
-      "epoch": 0.08588196001717639,
-      "grad_norm": 1.0143381357192993,
-      "learning_rate": 7.884746336226957e-05,
-      "loss": 2.6882,
       "step": 2700
     },
     {
-      "epoch": 0.08906277335114589,
-      "grad_norm": 0.9475836753845215,
-      "learning_rate": 7.874926373975814e-05,
-      "loss": 2.6674,
       "step": 2800
     },
     {
-      "epoch": 0.09224358668511538,
-      "grad_norm": 0.9616538882255554,
-      "learning_rate": 7.864711619400865e-05,
-      "loss": 2.6405,
       "step": 2900
     },
     {
-      "epoch": 0.09542440001908488,
-      "grad_norm": 0.9427627325057983,
-      "learning_rate": 7.854103113220384e-05,
-      "loss": 2.6261,
       "step": 3000
     },
     {
-      "epoch": 0.09542440001908488,
-      "eval_loss": 2.6171655654907227,
-      "eval_runtime": 74.2572,
-      "eval_samples_per_second": 216.989,
-      "eval_steps_per_second": 0.431,
       "step": 3000
     },
     {
-      "epoch": 0.09860521335305437,
-      "grad_norm": 0.9843456149101257,
-      "learning_rate": 7.843101936269565e-05,
-      "loss": 2.6075,
       "step": 3100
     },
     {
-      "epoch": 0.10178602668702387,
-      "grad_norm": 0.9504939913749695,
-      "learning_rate": 7.831709209390408e-05,
-      "loss": 2.5906,
       "step": 3200
     },
     {
-      "epoch": 0.10496684002099337,
-      "grad_norm": 0.9153879880905151,
-      "learning_rate": 7.819926093317513e-05,
-      "loss": 2.578,
       "step": 3300
     },
     {
-      "epoch": 0.10814765335496286,
-      "grad_norm": 0.9085161685943604,
-      "learning_rate": 7.807753788559824e-05,
-      "loss": 2.56,
       "step": 3400
     },
     {
-      "epoch": 0.11132846668893236,
-      "grad_norm": 0.9289081692695618,
-      "learning_rate": 7.795193535278322e-05,
-      "loss": 2.5432,
       "step": 3500
     },
     {
-      "epoch": 0.11450928002290185,
-      "grad_norm": 0.9010806083679199,
-      "learning_rate": 7.782246613159663e-05,
-      "loss": 2.5272,
       "step": 3600
     },
     {
-      "epoch": 0.11769009335687135,
-      "grad_norm": 0.9009876847267151,
-      "learning_rate": 7.768914341285804e-05,
-      "loss": 2.5154,
       "step": 3700
     },
     {
-      "epoch": 0.12087090669084084,
-      "grad_norm": 0.920699417591095,
-      "learning_rate": 7.755198077999615e-05,
-      "loss": 2.503,
       "step": 3800
     },
     {
-      "epoch": 0.12405172002481034,
-      "grad_norm": 0.8694368600845337,
-      "learning_rate": 7.74109922076647e-05,
-      "loss": 2.4917,
       "step": 3900
     },
     {
-      "epoch": 0.12723253335877985,
-      "grad_norm": 0.8826000094413757,
-      "learning_rate": 7.726619206031888e-05,
-      "loss": 2.4808,
       "step": 4000
     },
     {
-      "epoch": 0.12723253335877985,
-      "eval_loss": 2.4718618392944336,
-      "eval_runtime": 74.2591,
-      "eval_samples_per_second": 216.984,
-      "eval_steps_per_second": 0.431,
       "step": 4000
     },
     {
-      "epoch": 0.13041334669274934,
-      "grad_norm": 0.920994222164154,
-      "learning_rate": 7.711759509075166e-05,
-      "loss": 2.4688,
       "step": 4100
     },
     {
-      "epoch": 0.13359416002671884,
-      "grad_norm": 0.8877164125442505,
-      "learning_rate": 7.696521643859082e-05,
-      "loss": 2.463,
       "step": 4200
     },
     {
-      "epoch": 0.13677497336068833,
-      "grad_norm": 0.8722163438796997,
-      "learning_rate": 7.680907162875641e-05,
-      "loss": 2.4462,
       "step": 4300
     },
     {
-      "epoch": 0.13995578669465783,
-      "grad_norm": 0.8788607716560364,
-      "learning_rate": 7.664917656987906e-05,
-      "loss": 2.435,
       "step": 4400
     },
     {
-      "epoch": 0.14313660002862733,
-      "grad_norm": 0.8874791264533997,
-      "learning_rate": 7.648554755267907e-05,
-      "loss": 2.4276,
       "step": 4500
     },
     {
-      "epoch": 0.14631741336259682,
-      "grad_norm": 0.921125054359436,
-      "learning_rate": 7.631820124830674e-05,
-      "loss": 2.4179,
       "step": 4600
     },
     {
-      "epoch": 0.14949822669656632,
-      "grad_norm": 0.8657805323600769,
-      "learning_rate": 7.614715470664379e-05,
-      "loss": 2.4098,
       "step": 4700
     },
     {
-      "epoch": 0.1526790400305358,
-      "grad_norm": 0.863875150680542,
-      "learning_rate": 7.597242535456632e-05,
-      "loss": 2.3982,
       "step": 4800
     },
     {
-      "epoch": 0.1558598533645053,
-      "grad_norm": 0.8500732779502869,
-      "learning_rate": 7.579403099416917e-05,
-      "loss": 2.3869,
       "step": 4900
     },
     {
-      "epoch": 0.1590406666984748,
-      "grad_norm": 0.8550170063972473,
-      "learning_rate": 7.561198980095229e-05,
-      "loss": 2.3803,
       "step": 5000
     },
     {
-      "epoch": 0.1590406666984748,
-      "eval_loss": 2.3734896183013916,
-      "eval_runtime": 74.3836,
-      "eval_samples_per_second": 216.62,
-      "eval_steps_per_second": 0.43,
       "step": 5000
     },
     {
-      "epoch": 0.1622214800324443,
-      "grad_norm": 0.8349749445915222,
-      "learning_rate": 7.54263203219689e-05,
-      "loss": 2.3696,
       "step": 5100
     },
     {
-      "epoch": 0.1654022933664138,
-      "grad_norm": 0.8674113750457764,
-      "learning_rate": 7.523704147393588e-05,
-      "loss": 2.364,
       "step": 5200
     },
     {
-      "epoch": 0.1685831067003833,
-      "grad_norm": 0.865531861782074,
-      "learning_rate": 7.50441725413064e-05,
-      "loss": 2.3548,
       "step": 5300
     },
     {
-      "epoch": 0.17176392003435278,
-      "grad_norm": 0.848044753074646,
-      "learning_rate": 7.484773317430521e-05,
-      "loss": 2.3488,
       "step": 5400
     },
     {
-      "epoch": 0.17494473336832228,
-      "grad_norm": 0.8770771026611328,
-      "learning_rate": 7.464774338692655e-05,
-      "loss": 2.3418,
       "step": 5500
     },
     {
-      "epoch": 0.17812554670229178,
-      "grad_norm": 0.8298258781433105,
-      "learning_rate": 7.44442235548951e-05,
-      "loss": 2.3362,
       "step": 5600
     },
     {
-      "epoch": 0.18130636003626127,
-      "grad_norm": 0.844881534576416,
-      "learning_rate": 7.423719441358998e-05,
-      "loss": 2.3303,
       "step": 5700
     },
     {
-      "epoch": 0.18448717337023077,
-      "grad_norm": 0.8413675427436829,
-      "learning_rate": 7.402667705593215e-05,
-      "loss": 2.3192,
       "step": 5800
     },
     {
-      "epoch": 0.18766798670420026,
-      "grad_norm": 0.8078741431236267,
-      "learning_rate": 7.381269293023544e-05,
-      "loss": 2.3108,
       "step": 5900
     },
     {
-      "epoch": 0.19084880003816976,
-      "grad_norm": 0.8276827931404114,
-      "learning_rate": 7.359526383802122e-05,
-      "loss": 2.3051,
       "step": 6000
     },
     {
-      "epoch": 0.19084880003816976,
-      "eval_loss": 2.304142951965332,
-      "eval_runtime": 74.2683,
-      "eval_samples_per_second": 216.957,
-      "eval_steps_per_second": 0.431,
       "step": 6000
     },
     {
-      "epoch": 0.19402961337213925,
-      "grad_norm": 0.86152184009552,
-      "learning_rate": 7.337441193179726e-05,
-      "loss": 2.2977,
       "step": 6100
     },
     {
-      "epoch": 0.19721042670610875,
-      "grad_norm": 0.8484801650047302,
-      "learning_rate": 7.31501597128007e-05,
-      "loss": 2.2929,
       "step": 6200
     },
     {
-      "epoch": 0.20039124004007824,
-      "grad_norm": 0.8117874264717102,
-      "learning_rate": 7.292253002870558e-05,
-      "loss": 2.2837,
       "step": 6300
     },
     {
-      "epoch": 0.20357205337404774,
-      "grad_norm": 0.8349144458770752,
-      "learning_rate": 7.269154607129499e-05,
-      "loss": 2.2795,
       "step": 6400
     },
     {
-      "epoch": 0.20675286670801724,
-      "grad_norm": 0.8227733373641968,
-      "learning_rate": 7.24572313740982e-05,
-      "loss": 2.2722,
       "step": 6500
     },
     {
-      "epoch": 0.20993368004198673,
-      "grad_norm": 0.7939384579658508,
-      "learning_rate": 7.221960980999302e-05,
-      "loss": 2.271,
       "step": 6600
     },
     {
-      "epoch": 0.21311449337595623,
-      "grad_norm": 0.8117570877075195,
-      "learning_rate": 7.19787055887735e-05,
-      "loss": 2.2645,
       "step": 6700
     },
     {
-      "epoch": 0.21629530670992572,
-      "grad_norm": 0.8136454224586487,
-      "learning_rate": 7.173454325468337e-05,
-      "loss": 2.2569,
       "step": 6800
     },
     {
-      "epoch": 0.21947612004389522,
-      "grad_norm": 0.7970232367515564,
-      "learning_rate": 7.148714768391532e-05,
-      "loss": 2.2515,
       "step": 6900
     },
     {
-      "epoch": 0.2226569333778647,
-      "grad_norm": 0.8086152672767639,
-      "learning_rate": 7.123654408207664e-05,
-      "loss": 2.246,
       "step": 7000
     },
     {
-      "epoch": 0.2226569333778647,
-      "eval_loss": 2.243811845779419,
-      "eval_runtime": 74.3461,
-      "eval_samples_per_second": 216.729,
-      "eval_steps_per_second": 0.43,
       "step": 7000
     },
     {
-      "epoch": 0.2258377467118342,
-      "grad_norm": 0.8161323666572571,
-      "learning_rate": 7.098275798162101e-05,
-      "loss": 2.2429,
       "step": 7100
     },
     {
-      "epoch": 0.2290185600458037,
-      "grad_norm": 0.804811418056488,
-      "learning_rate": 7.072581523924731e-05,
-      "loss": 2.2357,
       "step": 7200
     },
     {
-      "epoch": 0.2321993733797732,
-      "grad_norm": 0.8152891993522644,
-      "learning_rate": 7.046574203326515e-05,
-      "loss": 2.229,
       "step": 7300
     },
     {
-      "epoch": 0.2353801867137427,
-      "grad_norm": 0.8120368719100952,
-      "learning_rate": 7.020256486092776e-05,
-      "loss": 2.2261,
       "step": 7400
     },
     {
-      "epoch": 0.2385610000477122,
-      "grad_norm": 0.7821657061576843,
-      "learning_rate": 6.99363105357323e-05,
-      "loss": 2.2212,
       "step": 7500
     },
     {
-      "epoch": 0.24174181338168169,
-      "grad_norm": 0.7894716858863831,
-      "learning_rate": 6.96670061846881e-05,
-      "loss": 2.217,
       "step": 7600
     },
     {
-      "epoch": 0.24492262671565118,
-      "grad_norm": 0.7840438485145569,
-      "learning_rate": 6.939467924555274e-05,
-      "loss": 2.2139,
       "step": 7700
     },
     {
-      "epoch": 0.24810344004962068,
-      "grad_norm": 0.7956035137176514,
-      "learning_rate": 6.911935746403668e-05,
-      "loss": 2.2013,
       "step": 7800
     },
     {
-      "epoch": 0.2512842533835902,
-      "grad_norm": 0.7977150082588196,
-      "learning_rate": 6.884106889097633e-05,
-      "loss": 2.2011,
       "step": 7900
     },
     {
-      "epoch": 0.2544650667175597,
-      "grad_norm": 0.7777612805366516,
-      "learning_rate": 6.85598418794762e-05,
-      "loss": 2.1963,
       "step": 8000
     },
     {
-      "epoch": 0.2544650667175597,
-      "eval_loss": 2.1914148330688477,
-      "eval_runtime": 74.2078,
-      "eval_samples_per_second": 217.134,
-      "eval_steps_per_second": 0.431,
       "step": 8000
     },
     {
-      "epoch": 0.25764588005152916,
-      "grad_norm": 0.8219434022903442,
-      "learning_rate": 6.827570508202017e-05,
-      "loss": 2.1922,
       "step": 8100
     },
     {
-      "epoch": 0.2608266933854987,
-      "grad_norm": 0.8048291802406311,
-      "learning_rate": 6.798868744755221e-05,
-      "loss": 2.1835,
       "step": 8200
     },
     {
-      "epoch": 0.26400750671946815,
-      "grad_norm": 0.8045642971992493,
-      "learning_rate": 6.7698818218527e-05,
-      "loss": 2.18,
       "step": 8300
     },
     {
-      "epoch": 0.2671883200534377,
-      "grad_norm": 0.7891286611557007,
-      "learning_rate": 6.740612692793061e-05,
-      "loss": 2.1796,
       "step": 8400
     },
     {
-      "epoch": 0.27036913338740715,
-      "grad_norm": 0.7675222158432007,
-      "learning_rate": 6.711064339627148e-05,
-      "loss": 2.175,
       "step": 8500
     },
     {
-      "epoch": 0.27354994672137667,
-      "grad_norm": 0.7776646018028259,
-      "learning_rate": 6.68123977285423e-05,
-      "loss": 2.1669,
       "step": 8600
     },
     {
-      "epoch": 0.27673076005534614,
-      "grad_norm": 0.7806198000907898,
-      "learning_rate": 6.651142031115273e-05,
-      "loss": 2.163,
       "step": 8700
     },
     {
-      "epoch": 0.27991157338931566,
-      "grad_norm": 0.7961702942848206,
-      "learning_rate": 6.620774180883357e-05,
-      "loss": 2.1603,
       "step": 8800
     },
     {
-      "epoch": 0.2830923867232851,
-      "grad_norm": 0.7584977149963379,
-      "learning_rate": 6.590139316151244e-05,
-      "loss": 2.1585,
       "step": 8900
     },
     {
-      "epoch": 0.28627320005725465,
-      "grad_norm": 0.7702009677886963,
-      "learning_rate": 6.559240558116156e-05,
-      "loss": 2.1564,
       "step": 9000
     },
     {
-      "epoch": 0.28627320005725465,
-      "eval_loss": 2.152564525604248,
-      "eval_runtime": 74.312,
-      "eval_samples_per_second": 216.829,
-      "eval_steps_per_second": 0.431,
       "step": 9000
     },
     {
-      "epoch": 0.2894540133912241,
-      "grad_norm": 0.764265239238739,
-      "learning_rate": 6.528081054861775e-05,
-      "loss": 2.1482,
       "step": 9100
     },
     {
-      "epoch": 0.29263482672519364,
-      "grad_norm": 0.7961864471435547,
-      "learning_rate": 6.496663981037506e-05,
-      "loss": 2.1442,
       "step": 9200
     },
     {
-      "epoch": 0.2958156400591631,
-      "grad_norm": 0.7772250175476074,
-      "learning_rate": 6.464992537535025e-05,
-      "loss": 2.1456,
       "step": 9300
     },
     {
-      "epoch": 0.29899645339313263,
-      "grad_norm": 0.7651770114898682,
-      "learning_rate": 6.433069951162168e-05,
-      "loss": 2.1428,
       "step": 9400
     },
     {
-      "epoch": 0.3021772667271021,
-      "grad_norm": 0.7702131867408752,
-      "learning_rate": 6.400899474314163e-05,
-      "loss": 2.1336,
       "step": 9500
     },
     {
-      "epoch": 0.3053580800610716,
-      "grad_norm": 0.7733346819877625,
-      "learning_rate": 6.368484384642269e-05,
-      "loss": 2.1321,
       "step": 9600
     },
     {
-      "epoch": 0.3085388933950411,
-      "grad_norm": 0.7849894762039185,
-      "learning_rate": 6.335827984719839e-05,
-      "loss": 2.1319,
       "step": 9700
     },
     {
-      "epoch": 0.3117197067290106,
-      "grad_norm": 0.7571319341659546,
-      "learning_rate": 6.302933601705836e-05,
-      "loss": 2.1239,
       "step": 9800
     },
     {
-      "epoch": 0.3149005200629801,
-      "grad_norm": 0.749380886554718,
-      "learning_rate": 6.269804587005846e-05,
-      "loss": 2.1172,
       "step": 9900
     },
     {
-      "epoch": 0.3180813333969496,
-      "grad_norm": 0.7648999691009521,
-      "learning_rate": 6.236444315930636e-05,
-      "loss": 2.1154,
       "step": 10000
     },
     {
-      "epoch": 0.3180813333969496,
-      "eval_loss": 2.114206075668335,
-      "eval_runtime": 74.2204,
-      "eval_samples_per_second": 217.097,
-      "eval_steps_per_second": 0.431,
       "step": 10000
     },
     {
-      "epoch": 0.3212621467309191,
-      "grad_norm": 0.7476580142974854,
-      "learning_rate": 6.202856187352252e-05,
-      "loss": 2.1124,
       "step": 10100
     },
     {
-      "epoch": 0.3244429600648886,
-      "grad_norm": 0.8032057285308838,
-      "learning_rate": 6.169043623357739e-05,
-      "loss": 2.111,
       "step": 10200
     },
     {
-      "epoch": 0.32762377339885806,
-      "grad_norm": 0.7571981549263,
-      "learning_rate": 6.135010068900474e-05,
-      "loss": 2.1051,
       "step": 10300
     },
     {
-      "epoch": 0.3308045867328276,
-      "grad_norm": 0.7669061422348022,
-      "learning_rate": 6.100758991449196e-05,
-      "loss": 2.107,
       "step": 10400
     },
     {
-      "epoch": 0.33398540006679706,
-      "grad_norm": 0.7569290399551392,
-      "learning_rate": 6.0662938806347125e-05,
-      "loss": 2.1036,
       "step": 10500
     },
     {
-      "epoch": 0.3371662134007666,
-      "grad_norm": 0.764243483543396,
-      "learning_rate": 6.03161824789437e-05,
-      "loss": 2.0988,
       "step": 10600
     },
     {
-      "epoch": 0.34034702673473605,
-      "grad_norm": 0.7787636518478394,
-      "learning_rate": 5.9967356261142986e-05,
-      "loss": 2.0955,
       "step": 10700
     },
     {
-      "epoch": 0.34352784006870557,
-      "grad_norm": 0.7693697810173035,
-      "learning_rate": 5.961649569269457e-05,
-      "loss": 2.0928,
       "step": 10800
     },
     {
-      "epoch": 0.34670865340267504,
-      "grad_norm": 0.7563138604164124,
-      "learning_rate": 5.9263636520615524e-05,
-      "loss": 2.0909,
       "step": 10900
     },
     {
-      "epoch": 0.34988946673664456,
-      "grad_norm": 0.7634861469268799,
-      "learning_rate": 5.8908814695548266e-05,
-      "loss": 2.0856,
       "step": 11000
     },
     {
-      "epoch": 0.34988946673664456,
-      "eval_loss": 2.0881729125976562,
-      "eval_runtime": 74.282,
-      "eval_samples_per_second": 216.917,
-      "eval_steps_per_second": 0.431,
       "step": 11000
     },
     {
-      "epoch": 0.35307028007061403,
-      "grad_norm": 0.751215934753418,
-      "learning_rate": 5.8552066368097823e-05,
-      "loss": 2.0809,
       "step": 11100
     },
     {
-      "epoch": 0.35625109340458355,
-      "grad_norm": 0.743963897228241,
-      "learning_rate": 5.819342788514867e-05,
-      "loss": 2.0829,
       "step": 11200
     },
     {
-      "epoch": 0.3594319067385531,
-      "grad_norm": 0.7820264101028442,
-      "learning_rate": 5.783293578616153e-05,
-      "loss": 2.0766,
       "step": 11300
     },
     {
-      "epoch": 0.36261272007252254,
-      "grad_norm": 0.765643298625946,
-      "learning_rate": 5.747062679945063e-05,
-      "loss": 2.0698,
       "step": 11400
     },
     {
-      "epoch": 0.36579353340649207,
-      "grad_norm": 0.7529120445251465,
-      "learning_rate": 5.710653783844168e-05,
-      "loss": 2.0716,
       "step": 11500
     },
     {
-      "epoch": 0.36897434674046153,
-      "grad_norm": 0.7667486667633057,
-      "learning_rate": 5.6740705997910966e-05,
-      "loss": 2.0664,
       "step": 11600
     },
     {
-      "epoch": 0.37215516007443106,
-      "grad_norm": 0.7487459182739258,
-      "learning_rate": 5.6373168550206016e-05,
-      "loss": 2.0637,
       "step": 11700
     },
     {
-      "epoch": 0.3753359734084005,
-      "grad_norm": 0.7455624341964722,
-      "learning_rate": 5.600396294144813e-05,
-      "loss": 2.0648,
       "step": 11800
     },
     {
-      "epoch": 0.37851678674237005,
-      "grad_norm": 0.7680267691612244,
-      "learning_rate": 5.5633126787717255e-05,
-      "loss": 2.0616,
       "step": 11900
     },
     {
-      "epoch": 0.3816976000763395,
-      "grad_norm": 0.7449765205383301,
-      "learning_rate": 5.52606978712195e-05,
-      "loss": 2.0581,
       "step": 12000
     },
     {
-      "epoch": 0.3816976000763395,
-      "eval_loss": 2.061724901199341,
-      "eval_runtime": 74.3191,
-      "eval_samples_per_second": 216.808,
-      "eval_steps_per_second": 0.431,
       "step": 12000
     },
     {
-      "epoch": 0.38487841341030904,
-      "grad_norm": 0.7986348867416382,
-      "learning_rate": 5.4886714136437705e-05,
-      "loss": 2.0562,
       "step": 12100
     },
     {
-      "epoch": 0.3880592267442785,
-      "grad_norm": 0.7683764100074768,
-      "learning_rate": 5.451121368626559e-05,
-      "loss": 2.0566,
       "step": 12200
     },
     {
-      "epoch": 0.39124004007824803,
-      "grad_norm": 0.7579689621925354,
-      "learning_rate": 5.413423477812561e-05,
-      "loss": 2.0502,
       "step": 12300
     },
     {
-      "epoch": 0.3944208534122175,
-      "grad_norm": 0.7543194890022278,
-      "learning_rate": 5.375581582007121e-05,
-      "loss": 2.0442,
       "step": 12400
     },
     {
-      "epoch": 0.397601666746187,
-      "grad_norm": 0.7310113906860352,
-      "learning_rate": 5.337599536687357e-05,
-      "loss": 2.0436,
       "step": 12500
     },
     {
-      "epoch": 0.4007824800801565,
-      "grad_norm": 0.7546029090881348,
-      "learning_rate": 5.2994812116093606e-05,
-      "loss": 2.0397,
       "step": 12600
     },
     {
-      "epoch": 0.403963293414126,
-      "grad_norm": 0.7738301753997803,
-      "learning_rate": 5.261230490413925e-05,
-      "loss": 2.0403,
       "step": 12700
     },
     {
-      "epoch": 0.4071441067480955,
-      "grad_norm": 0.7298979759216309,
-      "learning_rate": 5.2228512702308684e-05,
-      "loss": 2.0343,
       "step": 12800
     },
     {
-      "epoch": 0.410324920082065,
-      "grad_norm": 0.7821021676063538,
-      "learning_rate": 5.1843474612819726e-05,
-      "loss": 2.0361,
       "step": 12900
     },
     {
-      "epoch": 0.41350573341603447,
-      "grad_norm": 0.7907674312591553,
-      "learning_rate": 5.145722986482604e-05,
-      "loss": 2.0343,
       "step": 13000
     },
     {
-      "epoch": 0.41350573341603447,
-      "eval_loss": 2.0375359058380127,
-      "eval_runtime": 74.169,
-      "eval_samples_per_second": 217.247,
-      "eval_steps_per_second": 0.431,
       "step": 13000
     },
     {
-      "epoch": 0.416686546750004,
-      "grad_norm": 0.7722309231758118,
-      "learning_rate": 5.106981781042026e-05,
-      "loss": 2.0349,
       "step": 13100
     },
     {
-      "epoch": 0.41986736008397346,
-      "grad_norm": 0.7431825995445251,
-      "learning_rate": 5.068127792062466e-05,
-      "loss": 2.0277,
       "step": 13200
     },
     {
-      "epoch": 0.423048173417943,
-      "grad_norm": 0.739189624786377,
-      "learning_rate": 5.0291649781369725e-05,
-      "loss": 2.0269,
       "step": 13300
     },
     {
-      "epoch": 0.42622898675191245,
-      "grad_norm": 0.7400371432304382,
-      "learning_rate": 4.990097308946089e-05,
-      "loss": 2.0251,
       "step": 13400
     },
     {
-      "epoch": 0.429409800085882,
-      "grad_norm": 0.7629276514053345,
-      "learning_rate": 4.950928764853423e-05,
-      "loss": 2.0235,
       "step": 13500
     },
     {
-      "epoch": 0.43259061341985144,
-      "grad_norm": 0.7603936195373535,
-      "learning_rate": 4.911663336500097e-05,
-      "loss": 2.0175,
       "step": 13600
     },
     {
-      "epoch": 0.43577142675382097,
-      "grad_norm": 0.7549597024917603,
-      "learning_rate": 4.872305024398175e-05,
-      "loss": 2.018,
       "step": 13700
     },
     {
-      "epoch": 0.43895224008779044,
-      "grad_norm": 0.7367106676101685,
-      "learning_rate": 4.8328578385230744e-05,
-      "loss": 2.016,
       "step": 13800
     },
     {
-      "epoch": 0.44213305342175996,
-      "grad_norm": 0.7753973007202148,
-      "learning_rate": 4.7933257979050066e-05,
-      "loss": 2.0103,
       "step": 13900
     },
     {
-      "epoch": 0.4453138667557294,
-      "grad_norm": 0.7437424063682556,
-      "learning_rate": 4.7537129302195154e-05,
-      "loss": 2.008,
       "step": 14000
     },
     {
-      "epoch": 0.4453138667557294,
-      "eval_loss": 2.004824161529541,
-      "eval_runtime": 74.2767,
-      "eval_samples_per_second": 216.932,
-      "eval_steps_per_second": 0.431,
       "step": 14000
     },
     {
-      "epoch": 0.44849468008969895,
-      "grad_norm": 0.74482262134552,
-      "learning_rate": 4.714023271377113e-05,
-      "loss": 2.0059,
       "step": 14100
     },
     {
-      "epoch": 0.4516754934236684,
-      "grad_norm": 0.7460625767707825,
-      "learning_rate": 4.674260865112088e-05,
-      "loss": 2.0087,
       "step": 14200
     },
     {
-      "epoch": 0.45485630675763794,
-      "grad_norm": 0.7695781588554382,
-      "learning_rate": 4.6344297625705144e-05,
-      "loss": 2.0011,
       "step": 14300
     },
     {
-      "epoch": 0.4580371200916074,
-      "grad_norm": 0.7401705980300903,
-      "learning_rate": 4.594534021897507e-05,
-      "loss": 2.0021,
       "step": 14400
     },
     {
-      "epoch": 0.46121793342557693,
-      "grad_norm": 0.7603822946548462,
-      "learning_rate": 4.554577707823759e-05,
-      "loss": 1.9973,
       "step": 14500
     },
     {
-      "epoch": 0.4643987467595464,
-      "grad_norm": 0.7397817969322205,
-      "learning_rate": 4.514564891251417e-05,
-      "loss": 1.9998,
       "step": 14600
     },
     {
-      "epoch": 0.4675795600935159,
-      "grad_norm": 0.7768462300300598,
-      "learning_rate": 4.474499648839313e-05,
-      "loss": 1.9943,
       "step": 14700
     },
     {
-      "epoch": 0.4707603734274854,
-      "grad_norm": 0.7363395094871521,
-      "learning_rate": 4.434386062587629e-05,
-      "loss": 1.9959,
       "step": 14800
     },
     {
-      "epoch": 0.4739411867614549,
-      "grad_norm": 0.7700894474983215,
-      "learning_rate": 4.3942282194219984e-05,
-      "loss": 1.9934,
       "step": 14900
     },
     {
-      "epoch": 0.4771220000954244,
-      "grad_norm": 0.7271220088005066,
-      "learning_rate": 4.3540302107771206e-05,
-      "loss": 1.9899,
       "step": 15000
     },
     {
-      "epoch": 0.4771220000954244,
-      "eval_loss": 1.9911503791809082,
-      "eval_runtime": 74.3028,
-      "eval_samples_per_second": 216.856,
-      "eval_steps_per_second": 0.431,
       "step": 15000
     }
   ],
   "logging_steps": 100,
-  "max_steps": 31439,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 1,
   "save_steps": 5000,
   "stateful_callbacks": {
     "TrainerControl": {
@@ -1197,8 +1204,8 @@
       "attributes": {}
     }
   },
-  "total_flos": 8.123170431172608e+18,
-  "train_batch_size": 32,
   "trial_name": null,
   "trial_params": null
 }

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.015,
   "eval_steps": 1000,
   "global_step": 15000,
   "is_hyper_param_search": false,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 1e-06,
+      "grad_norm": 1.1795536279678345,
+      "learning_rate": 0.0,
+      "loss": 1.4139,
+      "step": 1
+    },
+    {
+      "epoch": 0.0001,
+      "grad_norm": 1.1734141111373901,
+      "learning_rate": 9.900000000000001e-08,
+      "loss": 1.387,
       "step": 100
     },
     {
+      "epoch": 0.0002,
+      "grad_norm": 1.1503151655197144,
+      "learning_rate": 1.9900000000000002e-07,
+      "loss": 1.3882,
       "step": 200
     },
     {
+      "epoch": 0.0003,
+      "grad_norm": 1.1478229761123657,
+      "learning_rate": 2.99e-07,
+      "loss": 1.386,
       "step": 300
     },
     {
+      "epoch": 0.0004,
+      "grad_norm": 1.1559761762619019,
+      "learning_rate": 3.99e-07,
+      "loss": 1.3823,
       "step": 400
     },
     {
+      "epoch": 0.0005,
+      "grad_norm": 1.1433175802230835,
+      "learning_rate": 4.99e-07,
+      "loss": 1.381,
       "step": 500
     },
     {
+      "epoch": 0.0006,
+      "grad_norm": 1.1483551263809204,
+      "learning_rate": 5.990000000000001e-07,
+      "loss": 1.3807,
       "step": 600
     },
     {
+      "epoch": 0.0007,
+      "grad_norm": 1.161496877670288,
+      "learning_rate": 6.990000000000001e-07,
+      "loss": 1.3833,
       "step": 700
     },
     {
+      "epoch": 0.0008,
+      "grad_norm": 1.139211654663086,
+      "learning_rate": 7.990000000000001e-07,
+      "loss": 1.3835,
       "step": 800
     },
     {
+      "epoch": 0.0009,
+      "grad_norm": 1.133931040763855,
+      "learning_rate": 8.99e-07,
+      "loss": 1.3719,
       "step": 900
     },
     {
+      "epoch": 0.001,
+      "grad_norm": 1.1143814325332642,
+      "learning_rate": 9.99e-07,
+      "loss": 1.3761,
       "step": 1000
     },
     {
+      "epoch": 0.001,
+      "eval_loss": 1.4045685529708862,
+      "eval_runtime": 27.4497,
+      "eval_samples_per_second": 182.152,
+      "eval_steps_per_second": 2.878,
       "step": 1000
     },
     {
+      "epoch": 0.0011,
+      "grad_norm": 1.153507947921753,
+      "learning_rate": 1.099e-06,
+      "loss": 1.3669,
       "step": 1100
     },
     {
+      "epoch": 0.0012,
+      "grad_norm": 1.1281546354293823,
+      "learning_rate": 1.199e-06,
+      "loss": 1.375,
       "step": 1200
     },
     {
+      "epoch": 0.0013,
+      "grad_norm": 1.1093217134475708,
+      "learning_rate": 1.299e-06,
+      "loss": 1.3726,
       "step": 1300
     },
     {
+      "epoch": 0.0014,
+      "grad_norm": 1.1526917219161987,
+      "learning_rate": 1.399e-06,
+      "loss": 1.3696,
       "step": 1400
     },
     {
+      "epoch": 0.0015,
+      "grad_norm": 1.1092661619186401,
+      "learning_rate": 1.4990000000000002e-06,
+      "loss": 1.3699,
       "step": 1500
     },
     {
+      "epoch": 0.0016,
+      "grad_norm": 1.5104150772094727,
+      "learning_rate": 1.599e-06,
+      "loss": 1.3734,
       "step": 1600
     },
     {
+      "epoch": 0.0017,
+      "grad_norm": 1.1301764249801636,
+      "learning_rate": 1.6990000000000002e-06,
+      "loss": 1.3719,
       "step": 1700
     },
     {
+      "epoch": 0.0018,
+      "grad_norm": 1.120370626449585,
+      "learning_rate": 1.7990000000000003e-06,
+      "loss": 1.3695,
       "step": 1800
     },
     {
+      "epoch": 0.0019,
+      "grad_norm": 1.145676612854004,
+      "learning_rate": 1.8990000000000004e-06,
+      "loss": 1.3675,
       "step": 1900
     },
     {
+      "epoch": 0.002,
+      "grad_norm": 1.1365715265274048,
+      "learning_rate": 1.9990000000000003e-06,
+      "loss": 1.3616,
       "step": 2000
     },
     {
+      "epoch": 0.002,
+      "eval_loss": 1.3989644050598145,
+      "eval_runtime": 24.4886,
+      "eval_samples_per_second": 204.177,
+      "eval_steps_per_second": 3.226,
       "step": 2000
     },
     {
+      "epoch": 0.0021,
+      "grad_norm": 1.118861198425293,
+      "learning_rate": 2.099e-06,
+      "loss": 1.3635,
       "step": 2100
     },
     {
+      "epoch": 0.0022,
+      "grad_norm": 1.1307072639465332,
+      "learning_rate": 2.1990000000000005e-06,
+      "loss": 1.375,
       "step": 2200
     },
     {
+      "epoch": 0.0023,
+      "grad_norm": 1.088172197341919,
+      "learning_rate": 2.299e-06,
+      "loss": 1.3627,
       "step": 2300
     },
     {
+      "epoch": 0.0024,
+      "grad_norm": 1.1681883335113525,
+      "learning_rate": 2.3990000000000002e-06,
+      "loss": 1.3607,
       "step": 2400
     },
     {
+      "epoch": 0.0025,
+      "grad_norm": 1.1483210325241089,
+      "learning_rate": 2.499e-06,
+      "loss": 1.3687,
       "step": 2500
     },
     {
+      "epoch": 0.0026,
+      "grad_norm": 1.1572397947311401,
+      "learning_rate": 2.5990000000000004e-06,
+      "loss": 1.3695,
       "step": 2600
     },
     {
+      "epoch": 0.0027,
+      "grad_norm": 1.124837875366211,
+      "learning_rate": 2.699e-06,
+      "loss": 1.3532,
       "step": 2700
     },
     {
+      "epoch": 0.0028,
+      "grad_norm": 1.0974047183990479,
+      "learning_rate": 2.7990000000000002e-06,
+      "loss": 1.3577,
       "step": 2800
     },
     {
+      "epoch": 0.0029,
+      "grad_norm": 1.1722006797790527,
+      "learning_rate": 2.899e-06,
+      "loss": 1.3673,
       "step": 2900
     },
     {
+      "epoch": 0.003,
+      "grad_norm": 1.106062650680542,
+      "learning_rate": 2.9990000000000004e-06,
+      "loss": 1.36,
       "step": 3000
     },
     {
+      "epoch": 0.003,
+      "eval_loss": 1.3754355907440186,
+      "eval_runtime": 24.5927,
+      "eval_samples_per_second": 203.312,
+      "eval_steps_per_second": 3.212,
       "step": 3000
     },
     {
+      "epoch": 0.0031,
+      "grad_norm": 1.1039618253707886,
+      "learning_rate": 3.0990000000000003e-06,
+      "loss": 1.3567,
       "step": 3100
     },
     {
+      "epoch": 0.0032,
+      "grad_norm": 1.1439259052276611,
+      "learning_rate": 3.1990000000000006e-06,
+      "loss": 1.3543,
       "step": 3200
     },
     {
+      "epoch": 0.0033,
+      "grad_norm": 1.1732087135314941,
+      "learning_rate": 3.2990000000000005e-06,
+      "loss": 1.3464,
       "step": 3300
     },
     {
+      "epoch": 0.0034,
+      "grad_norm": 1.0517069101333618,
+      "learning_rate": 3.399e-06,
+      "loss": 1.3398,
       "step": 3400
     },
     {
+      "epoch": 0.0035,
+      "grad_norm": 1.0987197160720825,
+      "learning_rate": 3.4990000000000003e-06,
+      "loss": 1.356,
       "step": 3500
     },
     {
+      "epoch": 0.0036,
+      "grad_norm": 1.1524548530578613,
+      "learning_rate": 3.599e-06,
+      "loss": 1.3481,
       "step": 3600
     },
     {
+      "epoch": 0.0037,
+      "grad_norm": 1.10309636592865,
+      "learning_rate": 3.6990000000000005e-06,
+      "loss": 1.3515,
       "step": 3700
     },
     {
+      "epoch": 0.0038,
+      "grad_norm": 1.1285984516143799,
+      "learning_rate": 3.7990000000000004e-06,
+      "loss": 1.3541,
       "step": 3800
     },
     {
+      "epoch": 0.0039,
+      "grad_norm": 1.1621686220169067,
+      "learning_rate": 3.899e-06,
+      "loss": 1.3532,
       "step": 3900
     },
     {
+      "epoch": 0.004,
+      "grad_norm": 1.078803300857544,
+      "learning_rate": 3.999e-06,
+      "loss": 1.3468,
       "step": 4000
     },
     {
+      "epoch": 0.004,
+      "eval_loss": 1.3711252212524414,
+      "eval_runtime": 24.5467,
+      "eval_samples_per_second": 203.693,
+      "eval_steps_per_second": 3.218,
       "step": 4000
     },
     {
+      "epoch": 0.0041,
+      "grad_norm": 1.1375211477279663,
+      "learning_rate": 4.099e-06,
+      "loss": 1.341,
       "step": 4100
     },
     {
+      "epoch": 0.0042,
+      "grad_norm": 1.0922551155090332,
+      "learning_rate": 4.199e-06,
+      "loss": 1.3427,
       "step": 4200
     },
     {
+      "epoch": 0.0043,
+      "grad_norm": 1.124060034751892,
+      "learning_rate": 4.299000000000001e-06,
+      "loss": 1.3409,
       "step": 4300
     },
     {
+      "epoch": 0.0044,
+      "grad_norm": 1.125467300415039,
+      "learning_rate": 4.3990000000000006e-06,
+      "loss": 1.3467,
       "step": 4400
     },
     {
+      "epoch": 0.0045,
+      "grad_norm": 1.1384063959121704,
+      "learning_rate": 4.4990000000000005e-06,
+      "loss": 1.3426,
       "step": 4500
     },
     {
+      "epoch": 0.0046,
+      "grad_norm": 1.1456679105758667,
+      "learning_rate": 4.599e-06,
+      "loss": 1.3445,
       "step": 4600
     },
     {
+      "epoch": 0.0047,
+      "grad_norm": 1.1553903818130493,
+      "learning_rate": 4.699e-06,
+      "loss": 1.3372,
       "step": 4700
     },
     {
+      "epoch": 0.0048,
+      "grad_norm": 1.1315921545028687,
+      "learning_rate": 4.799e-06,
+      "loss": 1.3398,
       "step": 4800
     },
     {
+      "epoch": 0.0049,
+      "grad_norm": 1.08122980594635,
+      "learning_rate": 4.899e-06,
+      "loss": 1.3364,
       "step": 4900
     },
     {
+      "epoch": 0.005,
+      "grad_norm": 1.09906804561615,
+      "learning_rate": 4.999000000000001e-06,
+      "loss": 1.3366,
       "step": 5000
     },
     {
+      "epoch": 0.005,
+      "eval_loss": 1.3536914587020874,
+      "eval_runtime": 24.5551,
+      "eval_samples_per_second": 203.624,
+      "eval_steps_per_second": 3.217,
       "step": 5000
     },
     {
+      "epoch": 0.0051,
+      "grad_norm": 1.1291029453277588,
+      "learning_rate": 5.099000000000001e-06,
+      "loss": 1.3396,
       "step": 5100
     },
     {
+      "epoch": 0.0052,
+      "grad_norm": 1.1673402786254883,
+      "learning_rate": 5.1990000000000005e-06,
+      "loss": 1.3358,
       "step": 5200
     },
     {
+      "epoch": 0.0053,
+      "grad_norm": 1.1300634145736694,
+      "learning_rate": 5.2990000000000004e-06,
+      "loss": 1.3384,
       "step": 5300
     },
     {
+      "epoch": 0.0054,
+      "grad_norm": 1.1179150342941284,
+      "learning_rate": 5.399000000000001e-06,
+      "loss": 1.3332,
       "step": 5400
     },
     {
+      "epoch": 0.0055,
+      "grad_norm": 1.091856837272644,
+      "learning_rate": 5.499000000000001e-06,
+      "loss": 1.3348,
       "step": 5500
     },
     {
+      "epoch": 0.0056,
+      "grad_norm": 1.0551645755767822,
+      "learning_rate": 5.599e-06,
+      "loss": 1.336,
       "step": 5600
     },
     {
+      "epoch": 0.0057,
+      "grad_norm": 1.1457860469818115,
+      "learning_rate": 5.699e-06,
+      "loss": 1.333,
       "step": 5700
     },
     {
+      "epoch": 0.0058,
+      "grad_norm": 1.1662046909332275,
+      "learning_rate": 5.799e-06,
+      "loss": 1.3299,
       "step": 5800
     },
     {
+      "epoch": 0.0059,
+      "grad_norm": 1.1879452466964722,
+      "learning_rate": 5.899000000000001e-06,
+      "loss": 1.3354,
       "step": 5900
     },
     {
+      "epoch": 0.006,
+      "grad_norm": 1.1441973447799683,
+      "learning_rate": 5.9990000000000005e-06,
+      "loss": 1.3329,
       "step": 6000
     },
     {
+      "epoch": 0.006,
+      "eval_loss": 1.3535875082015991,
+      "eval_runtime": 24.3908,
+      "eval_samples_per_second": 204.995,
+      "eval_steps_per_second": 3.239,
       "step": 6000
     },
     {
+      "epoch": 0.0061,
+      "grad_norm": 1.121394395828247,
+      "learning_rate": 6.099e-06,
+      "loss": 1.3295,
       "step": 6100
     },
     {
+      "epoch": 0.0062,
+      "grad_norm": 1.1496130228042603,
+      "learning_rate": 6.199e-06,
+      "loss": 1.3303,
       "step": 6200
     },
     {
+      "epoch": 0.0063,
+      "grad_norm": 1.2465569972991943,
+      "learning_rate": 6.299000000000001e-06,
+      "loss": 1.3268,
       "step": 6300
     },
     {
+      "epoch": 0.0064,
+      "grad_norm": 1.1363328695297241,
+      "learning_rate": 6.399000000000001e-06,
+      "loss": 1.3248,
       "step": 6400
     },
     {
+      "epoch": 0.0065,
+      "grad_norm": 1.1142207384109497,
+      "learning_rate": 6.499000000000001e-06,
+      "loss": 1.3212,
       "step": 6500
     },
     {
+      "epoch": 0.0066,
+      "grad_norm": 1.1020450592041016,
+      "learning_rate": 6.599000000000001e-06,
+      "loss": 1.3305,
       "step": 6600
     },
     {
+      "epoch": 0.0067,
+      "grad_norm": 1.0636595487594604,
+      "learning_rate": 6.699000000000001e-06,
+      "loss": 1.3343,
       "step": 6700
     },
     {
+      "epoch": 0.0068,
+      "grad_norm": 1.0846408605575562,
+      "learning_rate": 6.7990000000000005e-06,
+      "loss": 1.3306,
       "step": 6800
     },
     {
+      "epoch": 0.0069,
+      "grad_norm": 1.2017494440078735,
+      "learning_rate": 6.899e-06,
+      "loss": 1.3191,
       "step": 6900
     },
     {
+      "epoch": 0.007,
+      "grad_norm": 1.159947156906128,
+      "learning_rate": 6.999e-06,
+      "loss": 1.334,
       "step": 7000
     },
     {
+      "epoch": 0.007,
+      "eval_loss": 1.3385692834854126,
+      "eval_runtime": 24.4488,
+      "eval_samples_per_second": 204.509,
+      "eval_steps_per_second": 3.231,
       "step": 7000
     },
     {
+      "epoch": 0.0071,
+      "grad_norm": 1.1962409019470215,
+      "learning_rate": 7.099e-06,
+      "loss": 1.323,
       "step": 7100
     },
     {
+      "epoch": 0.0072,
+      "grad_norm": 1.1551247835159302,
+      "learning_rate": 7.199e-06,
+      "loss": 1.3119,
       "step": 7200
     },
     {
+      "epoch": 0.0073,
+      "grad_norm": 1.1543225049972534,
+      "learning_rate": 7.299000000000001e-06,
+      "loss": 1.3261,
       "step": 7300
     },
     {
+      "epoch": 0.0074,
+      "grad_norm": 1.133355975151062,
+      "learning_rate": 7.399000000000001e-06,
+      "loss": 1.3241,
       "step": 7400
     },
     {
+      "epoch": 0.0075,
+      "grad_norm": 1.1490956544876099,
+      "learning_rate": 7.4990000000000005e-06,
+      "loss": 1.3293,
       "step": 7500
     },
     {
+      "epoch": 0.0076,
+      "grad_norm": 1.0732618570327759,
+      "learning_rate": 7.5990000000000004e-06,
+      "loss": 1.3216,
       "step": 7600
     },
     {
+      "epoch": 0.0077,
+      "grad_norm": 1.170203685760498,
+      "learning_rate": 7.699e-06,
+      "loss": 1.3193,
       "step": 7700
     },
     {
+      "epoch": 0.0078,
+      "grad_norm": 1.0613148212432861,
+      "learning_rate": 7.799000000000001e-06,
+      "loss": 1.329,
       "step": 7800
     },
     {
+      "epoch": 0.0079,
+      "grad_norm": 1.2019593715667725,
+      "learning_rate": 7.899000000000002e-06,
+      "loss": 1.315,
       "step": 7900
     },
     {
+      "epoch": 0.008,
+      "grad_norm": 1.1080353260040283,
+      "learning_rate": 7.999e-06,
+      "loss": 1.3181,
       "step": 8000
     },
     {
+      "epoch": 0.008,
+      "eval_loss": 1.3239587545394897,
+      "eval_runtime": 24.4556,
+      "eval_samples_per_second": 204.452,
+      "eval_steps_per_second": 3.23,
       "step": 8000
     },
     {
+      "epoch": 0.0081,
+      "grad_norm": 1.1273937225341797,
+      "learning_rate": 8.099e-06,
+      "loss": 1.3252,
       "step": 8100
     },
     {
+      "epoch": 0.0082,
+      "grad_norm": 1.0942583084106445,
+      "learning_rate": 8.199e-06,
+      "loss": 1.3164,
       "step": 8200
     },
     {
+      "epoch": 0.0083,
+      "grad_norm": 1.1845577955245972,
+      "learning_rate": 8.299e-06,
+      "loss": 1.32,
       "step": 8300
     },
     {
+      "epoch": 0.0084,
+      "grad_norm": 1.2376071214675903,
+      "learning_rate": 8.399e-06,
+      "loss": 1.314,
       "step": 8400
     },
     {
+      "epoch": 0.0085,
+      "grad_norm": 1.5554766654968262,
+      "learning_rate": 8.499000000000001e-06,
+      "loss": 1.4128,
       "step": 8500
     },
     {
+      "epoch": 0.0086,
+      "grad_norm": 1.736693024635315,
+      "learning_rate": 8.599e-06,
+      "loss": 1.5028,
       "step": 8600
     },
     {
+      "epoch": 0.0087,
+      "grad_norm": 1.8339451551437378,
+      "learning_rate": 8.699000000000001e-06,
+      "loss": 1.5346,
       "step": 8700
     },
     {
+      "epoch": 0.0088,
+      "grad_norm": 1.827017068862915,
+      "learning_rate": 8.799000000000002e-06,
+      "loss": 1.5309,
       "step": 8800
     },
     {
+      "epoch": 0.0089,
+      "grad_norm": 1.7209491729736328,
+      "learning_rate": 8.899e-06,
+      "loss": 1.5202,
       "step": 8900
     },
     {
+      "epoch": 0.009,
+      "grad_norm": 1.7649836540222168,
+      "learning_rate": 8.999000000000001e-06,
+      "loss": 1.5311,
       "step": 9000
     },
     {
+      "epoch": 0.009,
+      "eval_loss": 1.3453279733657837,
+      "eval_runtime": 24.4639,
+      "eval_samples_per_second": 204.383,
+      "eval_steps_per_second": 3.229,
       "step": 9000
     },
     {
+      "epoch": 0.0091,
+      "grad_norm": 1.758984923362732,
+      "learning_rate": 9.099e-06,
+      "loss": 1.5277,
       "step": 9100
     },
     {
+      "epoch": 0.0092,
+      "grad_norm": 1.5517253875732422,
+      "learning_rate": 9.199000000000001e-06,
+      "loss": 1.5331,
       "step": 9200
     },
     {
+      "epoch": 0.0093,
+      "grad_norm": 1.7491697072982788,
+      "learning_rate": 9.299e-06,
+      "loss": 1.5376,
       "step": 9300
     },
     {
+      "epoch": 0.0094,
+      "grad_norm": 1.7253761291503906,
+      "learning_rate": 9.399000000000001e-06,
+      "loss": 1.5319,
       "step": 9400
     },
     {
+      "epoch": 0.0095,
+      "grad_norm": 1.7779654264450073,
+      "learning_rate": 9.499e-06,
+      "loss": 1.5455,
       "step": 9500
     },
     {
+      "epoch": 0.0096,
+      "grad_norm": 1.8502960205078125,
+      "learning_rate": 9.599e-06,
+      "loss": 1.5256,
       "step": 9600
     },
     {
+      "epoch": 0.0097,
+      "grad_norm": 1.595805287361145,
+      "learning_rate": 9.699e-06,
+      "loss": 1.5338,
       "step": 9700
     },
     {
+      "epoch": 0.0098,
+      "grad_norm": 1.7826145887374878,
+      "learning_rate": 9.799e-06,
+      "loss": 1.5297,
       "step": 9800
     },
     {
+      "epoch": 0.0099,
+      "grad_norm": 1.8574384450912476,
+      "learning_rate": 9.899000000000001e-06,
+      "loss": 1.537,
       "step": 9900
     },
     {
+      "epoch": 0.01,
+      "grad_norm": 1.6225100755691528,
+      "learning_rate": 9.999e-06,
+      "loss": 1.5373,
       "step": 10000
     },
     {
+      "epoch": 0.01,
+      "eval_loss": 1.3474788665771484,
+      "eval_runtime": 24.6009,
+      "eval_samples_per_second": 203.244,
+      "eval_steps_per_second": 3.211,
       "step": 10000
     },
     {
+      "epoch": 0.0101,
+      "grad_norm": 1.7013579607009888,
+      "learning_rate": 9.999999753259893e-06,
+      "loss": 1.5213,
       "step": 10100
     },
     {
+      "epoch": 0.0102,
+      "grad_norm": 1.8451807498931885,
+      "learning_rate": 9.999999003045122e-06,
+      "loss": 1.5252,
       "step": 10200
     },
     {
+      "epoch": 0.0103,
+      "grad_norm": 1.6487650871276855,
+      "learning_rate": 9.999997749330588e-06,
+      "loss": 1.5313,
       "step": 10300
     },
     {
+      "epoch": 0.0104,
+      "grad_norm": 1.7240970134735107,
+      "learning_rate": 9.999995992116415e-06,
+      "loss": 1.5375,
       "step": 10400
     },
     {
+      "epoch": 0.0105,
+      "grad_norm": 1.5860111713409424,
+      "learning_rate": 9.999993731402786e-06,
+      "loss": 1.535,
       "step": 10500
     },
     {
+      "epoch": 0.0106,
+      "grad_norm": 1.6990783214569092,
+      "learning_rate": 9.999990967189924e-06,
+      "loss": 1.5415,
       "step": 10600
     },
     {
+      "epoch": 0.0107,
+      "grad_norm": 1.7421098947525024,
+      "learning_rate": 9.999987699478109e-06,
+      "loss": 1.5266,
       "step": 10700
     },
     {
+      "epoch": 0.0108,
+      "grad_norm": 1.6578110456466675,
+      "learning_rate": 9.999983928267668e-06,
+      "loss": 1.5256,
       "step": 10800
     },
     {
+      "epoch": 0.0109,
+      "grad_norm": 1.8193341493606567,
+      "learning_rate": 9.999979653558982e-06,
+      "loss": 1.54,
       "step": 10900
     },
     {
+      "epoch": 0.011,
+      "grad_norm": 1.7376822233200073,
+      "learning_rate": 9.999974875352482e-06,
+      "loss": 1.5345,
       "step": 11000
     },
     {
+      "epoch": 0.011,
+      "eval_loss": 1.3439626693725586,
+      "eval_runtime": 24.6158,
+      "eval_samples_per_second": 203.122,
+      "eval_steps_per_second": 3.209,
       "step": 11000
     },
     {
+      "epoch": 0.0111,
+      "grad_norm": 1.7770408391952515,
+      "learning_rate": 9.999969593648651e-06,
+      "loss": 1.5257,
       "step": 11100
     },
     {
+      "epoch": 0.0112,
+      "grad_norm": 1.703754186630249,
+      "learning_rate": 9.999963808448016e-06,
+      "loss": 1.523,
       "step": 11200
     },
     {
+      "epoch": 0.0113,
+      "grad_norm": 1.7194414138793945,
+      "learning_rate": 9.999957519751165e-06,
+      "loss": 1.5404,
       "step": 11300
     },
     {
+      "epoch": 0.0114,
+      "grad_norm": 1.694810390472412,
+      "learning_rate": 9.999950727558727e-06,
+      "loss": 1.534,
       "step": 11400
     },
     {
+      "epoch": 0.0115,
+      "grad_norm": 1.644400715827942,
+      "learning_rate": 9.999943431871388e-06,
+      "loss": 1.531,
       "step": 11500
     },
     {
+      "epoch": 0.0116,
+      "grad_norm": 1.792406678199768,
+      "learning_rate": 9.99993563268988e-06,
+      "loss": 1.5298,
       "step": 11600
     },
     {
+      "epoch": 0.0117,
+      "grad_norm": 1.9580830335617065,
+      "learning_rate": 9.999927330014993e-06,
+      "loss": 1.5268,
       "step": 11700
     },
     {
+      "epoch": 0.0118,
+      "grad_norm": 1.6442023515701294,
+      "learning_rate": 9.99991852384756e-06,
+      "loss": 1.5257,
       "step": 11800
     },
     {
+      "epoch": 0.0119,
+      "grad_norm": 1.680830478668213,
+      "learning_rate": 9.99990921418847e-06,
+      "loss": 1.5191,
       "step": 11900
     },
     {
+      "epoch": 0.012,
+      "grad_norm": 1.6746671199798584,
+      "learning_rate": 9.999899401038656e-06,
+      "loss": 1.5372,
       "step": 12000
     },
     {
+      "epoch": 0.012,
+      "eval_loss": 1.3511897325515747,
+      "eval_runtime": 24.555,
+      "eval_samples_per_second": 203.625,
+      "eval_steps_per_second": 3.217,
       "step": 12000
     },
     {
+      "epoch": 0.0121,
+      "grad_norm": 1.7775862216949463,
+      "learning_rate": 9.99988908439911e-06,
+      "loss": 1.5182,
       "step": 12100
     },
     {
+      "epoch": 0.0122,
+      "grad_norm": 1.5296705961227417,
+      "learning_rate": 9.999878264270871e-06,
+      "loss": 1.5303,
       "step": 12200
     },
     {
+      "epoch": 0.0123,
+      "grad_norm": 1.7957079410552979,
+      "learning_rate": 9.999866940655027e-06,
+      "loss": 1.5328,
       "step": 12300
     },
     {
+      "epoch": 0.0124,
+      "grad_norm": 1.8484801054000854,
+      "learning_rate": 9.99985511355272e-06,
+      "loss": 1.5162,
       "step": 12400
     },
     {
+      "epoch": 0.0125,
+      "grad_norm": 1.7253010272979736,
+      "learning_rate": 9.999842782965139e-06,
+      "loss": 1.5178,
       "step": 12500
     },
     {
+      "epoch": 0.0126,
+      "grad_norm": 1.7495081424713135,
+      "learning_rate": 9.999829948893528e-06,
+      "loss": 1.5233,
       "step": 12600
     },
     {
+      "epoch": 0.0127,
+      "grad_norm": 1.6750719547271729,
+      "learning_rate": 9.999816611339175e-06,
+      "loss": 1.5203,
       "step": 12700
     },
     {
+      "epoch": 0.0128,
+      "grad_norm": 1.7870038747787476,
+      "learning_rate": 9.999802770303427e-06,
+      "loss": 1.5106,
       "step": 12800
     },
     {
+      "epoch": 0.0129,
+      "grad_norm": 1.6229153871536255,
+      "learning_rate": 9.999788425787678e-06,
+      "loss": 1.5399,
       "step": 12900
     },
     {
+      "epoch": 0.013,
+      "grad_norm": 1.7483490705490112,
+      "learning_rate": 9.99977357779337e-06,
+      "loss": 1.519,
       "step": 13000
     },
     {
+      "epoch": 0.013,
+      "eval_loss": 1.3341424465179443,
+      "eval_runtime": 24.5433,
+      "eval_samples_per_second": 203.722,
+      "eval_steps_per_second": 3.219,
       "step": 13000
     },
     {
+      "epoch": 0.0131,
+      "grad_norm": 1.7631748914718628,
+      "learning_rate": 9.999758226322e-06,
+      "loss": 1.5232,
       "step": 13100
     },
     {
+      "epoch": 0.0132,
+      "grad_norm": 1.6134735345840454,
+      "learning_rate": 9.999742371375114e-06,
+      "loss": 1.5352,
       "step": 13200
     },
     {
+      "epoch": 0.0133,
+      "grad_norm": 1.8494335412979126,
+      "learning_rate": 9.999726012954308e-06,
+      "loss": 1.5254,
       "step": 13300
     },
     {
+      "epoch": 0.0134,
+      "grad_norm": 1.9245802164077759,
+      "learning_rate": 9.999709151061228e-06,
+      "loss": 1.5358,
       "step": 13400
     },
     {
+      "epoch": 0.0135,
+      "grad_norm": 1.755018711090088,
+      "learning_rate": 9.999691785697574e-06,
+      "loss": 1.5204,
       "step": 13500
     },
     {
+      "epoch": 0.0136,
+      "grad_norm": 1.8922946453094482,
+      "learning_rate": 9.999673916865094e-06,
+      "loss": 1.5267,
       "step": 13600
     },
     {
+      "epoch": 0.0137,
+      "grad_norm": 1.9781936407089233,
+      "learning_rate": 9.999655544565587e-06,
+      "loss": 1.5213,
       "step": 13700
     },
     {
+      "epoch": 0.0138,
+      "grad_norm": 1.8312381505966187,
+      "learning_rate": 9.999636668800905e-06,
+      "loss": 1.517,
       "step": 13800
     },
     {
+      "epoch": 0.0139,
+      "grad_norm": 1.6503413915634155,
+      "learning_rate": 9.999617289572946e-06,
+      "loss": 1.5169,
       "step": 13900
     },
     {
+      "epoch": 0.014,
+      "grad_norm": 1.8612747192382812,
+      "learning_rate": 9.999597406883664e-06,
+      "loss": 1.5277,
       "step": 14000
     },
     {
+      "epoch": 0.014,
+      "eval_loss": 1.3367455005645752,
+      "eval_runtime": 24.5718,
+      "eval_samples_per_second": 203.485,
+      "eval_steps_per_second": 3.215,
       "step": 14000
     },
     {
+      "epoch": 0.0141,
+      "grad_norm": 1.8900790214538574,
+      "learning_rate": 9.999577020735059e-06,
+      "loss": 1.5276,
       "step": 14100
     },
     {
+      "epoch": 0.0142,
+      "grad_norm": 1.720528244972229,
+      "learning_rate": 9.999556131129184e-06,
+      "loss": 1.5209,
       "step": 14200
     },
     {
+      "epoch": 0.0143,
+      "grad_norm": 1.713659405708313,
+      "learning_rate": 9.999534738068145e-06,
+      "loss": 1.5194,
       "step": 14300
     },
     {
+      "epoch": 0.0144,
+      "grad_norm": 1.662377119064331,
+      "learning_rate": 9.999512841554093e-06,
+      "loss": 1.5179,
       "step": 14400
     },
     {
+      "epoch": 0.0145,
+      "grad_norm": 1.6507668495178223,
+      "learning_rate": 9.999490441589235e-06,
+      "loss": 1.5181,
       "step": 14500
     },
     {
+      "epoch": 0.0146,
+      "grad_norm": 1.7075133323669434,
+      "learning_rate": 9.999467538175827e-06,
+      "loss": 1.5203,
       "step": 14600
     },
     {
+      "epoch": 0.0147,
+      "grad_norm": 1.686068058013916,
+      "learning_rate": 9.999444131316173e-06,
+      "loss": 1.5156,
       "step": 14700
     },
     {
+      "epoch": 0.0148,
+      "grad_norm": 1.6891603469848633,
+      "learning_rate": 9.999420221012635e-06,
+      "loss": 1.5195,
       "step": 14800
     },
     {
+      "epoch": 0.0149,
+      "grad_norm": 1.784029245376587,
+      "learning_rate": 9.999395807267616e-06,
+      "loss": 1.509,
       "step": 14900
     },
     {
+      "epoch": 0.015,
+      "grad_norm": 1.6361267566680908,
+      "learning_rate": 9.999370890083575e-06,
+      "loss": 1.5248,
       "step": 15000
     },
     {
+      "epoch": 0.015,
+      "eval_loss": 1.3349226713180542,
+      "eval_runtime": 24.5747,
+      "eval_samples_per_second": 203.461,
+      "eval_steps_per_second": 3.215,
       "step": 15000
     }
   ],
   "logging_steps": 100,
+  "max_steps": 1000000,
   "num_input_tokens_seen": 0,
+  "num_train_epochs": 9223372036854775807,
   "save_steps": 5000,
   "stateful_callbacks": {
     "TrainerControl": {
       "attributes": {}
     }
   },
+  "total_flos": 2.03079253229568e+18,
+  "train_batch_size": 64,
   "trial_name": null,
   "trial_params": null
 }

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ecf959d2b9b962920cb00233e188da75df99308a37612910a391c7651f6d60fb
-size 5841

 version https://git-lfs.github.com/spec/v1
+oid sha256:5045bb023fc9f9ce18adc5b4d8a1c05111e1d7d6f92b7d0e1888eae4ede00e23
+size 5777