Training in progress, step 200, checkpoint

Browse files

Files changed (6) hide show

last-checkpoint/adapter_config.json +5 -5
last-checkpoint/adapter_model.safetensors +2 -2
last-checkpoint/optimizer.pt +2 -2
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/trainer_state.json +615 -615
last-checkpoint/training_args.bin +1 -1

last-checkpoint/adapter_config.json CHANGED Viewed

@@ -10,23 +10,23 @@
   "layers_pattern": null,
   "layers_to_transform": null,
   "loftq_config": {},
-  "lora_alpha": 128,
   "lora_dropout": 0.15,
   "megatron_config": null,
   "megatron_core": "megatron.core",
   "modules_to_save": null,
   "peft_type": "LORA",
-  "r": 64,
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
     "gate_proj",
     "o_proj",
-    "q_proj",
-    "k_proj",
     "up_proj",
     "v_proj",
-    "down_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

   "layers_pattern": null,
   "layers_to_transform": null,
   "loftq_config": {},
+  "lora_alpha": 16,
   "lora_dropout": 0.15,
   "megatron_config": null,
   "megatron_core": "megatron.core",
   "modules_to_save": null,
   "peft_type": "LORA",
+  "r": 8,
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
+    "k_proj",
+    "down_proj",
     "gate_proj",
     "o_proj",
     "up_proj",
     "v_proj",
+    "q_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

last-checkpoint/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:db5cce6156c4621517be68ed6604412d1e180059ddcba2665cbdb58955f9bb05
-size 180385008

 version https://git-lfs.github.com/spec/v1
+oid sha256:7140be330dda6372b6231f1c967402a64cbc852cb4999f9baf8ceb68d4fd23ab
+size 22573704

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bfcc971a71e688b4db954d5f9e261787333eb8279ea692ae9db960cb16db16c5
-size 137651322

 version https://git-lfs.github.com/spec/v1
+oid sha256:1c11b5cd2010a67482310a3fced3b7b838bf6893d6d31933481f02786b688254
+size 17437626

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:11309a88af1da04c34187de7c9fa4eeb4751eebe97a4effc8b29c06633b89aa3
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:4e5701b93a28e5cda28f54c982c99c9f1cc13d09b125c8060476f4a1658335c0
 size 14244

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
-  "best_metric": 0.7864285707473755,
   "best_model_checkpoint": "miner_id_24/checkpoint-200",
-  "epoch": 0.003217076240685559,
   "eval_steps": 200,
   "global_step": 200,
   "is_hyper_param_search": false,
@@ -9,1424 +9,1424 @@
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 1.6085381203427795e-05,
-      "grad_norm": 0.9673656821250916,
       "learning_rate": 2e-05,
-      "loss": 0.5297,
       "step": 1
     },
     {
-      "epoch": 1.6085381203427795e-05,
-      "eval_loss": 0.46725764870643616,
-      "eval_runtime": 25.4799,
-      "eval_samples_per_second": 9.772,
-      "eval_steps_per_second": 9.772,
       "step": 1
     },
     {
-      "epoch": 3.217076240685559e-05,
-      "grad_norm": 0.9954096078872681,
       "learning_rate": 4e-05,
-      "loss": 0.6221,
       "step": 2
     },
     {
-      "epoch": 4.825614361028339e-05,
-      "grad_norm": 0.8733547329902649,
       "learning_rate": 6e-05,
-      "loss": 0.4143,
       "step": 3
     },
     {
-      "epoch": 6.434152481371118e-05,
-      "grad_norm": 0.8365621566772461,
       "learning_rate": 8e-05,
-      "loss": 0.2579,
       "step": 4
     },
     {
-      "epoch": 8.042690601713898e-05,
-      "grad_norm": 1.2265547513961792,
       "learning_rate": 0.0001,
-      "loss": 0.4713,
       "step": 5
     },
     {
-      "epoch": 9.651228722056678e-05,
-      "grad_norm": 1.1219959259033203,
       "learning_rate": 0.00012,
-      "loss": 0.4131,
       "step": 6
     },
     {
-      "epoch": 0.00011259766842399456,
-      "grad_norm": 1.630370855331421,
       "learning_rate": 0.00014,
-      "loss": 0.5203,
       "step": 7
     },
     {
-      "epoch": 0.00012868304962742236,
-      "grad_norm": 1.959912896156311,
       "learning_rate": 0.00016,
-      "loss": 0.6619,
       "step": 8
     },
     {
-      "epoch": 0.00014476843083085014,
-      "grad_norm": 2.2232961654663086,
       "learning_rate": 0.00018,
-      "loss": 0.5758,
       "step": 9
     },
     {
-      "epoch": 0.00016085381203427795,
-      "grad_norm": 2.4021875858306885,
       "learning_rate": 0.0002,
-      "loss": 0.5578,
       "step": 10
     },
     {
-      "epoch": 0.00017693919323770574,
-      "grad_norm": 2.4358997344970703,
       "learning_rate": 0.0002,
-      "loss": 0.5025,
       "step": 11
     },
     {
-      "epoch": 0.00019302457444113355,
-      "grad_norm": 2.9442031383514404,
       "learning_rate": 0.0002,
-      "loss": 0.6399,
       "step": 12
     },
     {
-      "epoch": 0.00020910995564456133,
-      "grad_norm": 3.2934744358062744,
       "learning_rate": 0.0002,
-      "loss": 0.7238,
       "step": 13
     },
     {
-      "epoch": 0.00022519533684798912,
-      "grad_norm": 2.135126829147339,
       "learning_rate": 0.0002,
-      "loss": 0.6701,
       "step": 14
     },
     {
-      "epoch": 0.00024128071805141693,
-      "grad_norm": 3.4425387382507324,
       "learning_rate": 0.0002,
-      "loss": 0.7285,
       "step": 15
     },
     {
-      "epoch": 0.0002573660992548447,
-      "grad_norm": 4.053037166595459,
       "learning_rate": 0.0002,
-      "loss": 0.6067,
       "step": 16
     },
     {
-      "epoch": 0.0002734514804582725,
-      "grad_norm": 1.9749451875686646,
       "learning_rate": 0.0002,
-      "loss": 0.6545,
       "step": 17
     },
     {
-      "epoch": 0.0002895368616617003,
-      "grad_norm": 2.6539998054504395,
       "learning_rate": 0.0002,
-      "loss": 0.5582,
       "step": 18
     },
     {
-      "epoch": 0.0003056222428651281,
-      "grad_norm": 4.1893205642700195,
       "learning_rate": 0.0002,
-      "loss": 0.6518,
       "step": 19
     },
     {
-      "epoch": 0.0003217076240685559,
-      "grad_norm": 2.7660045623779297,
       "learning_rate": 0.0002,
-      "loss": 0.619,
       "step": 20
     },
     {
-      "epoch": 0.00033779300527198367,
-      "grad_norm": 3.1297731399536133,
       "learning_rate": 0.0002,
-      "loss": 0.6169,
       "step": 21
     },
     {
-      "epoch": 0.0003538783864754115,
-      "grad_norm": 2.4766297340393066,
       "learning_rate": 0.0002,
-      "loss": 0.671,
       "step": 22
     },
     {
-      "epoch": 0.0003699637676788393,
-      "grad_norm": 1.840955376625061,
       "learning_rate": 0.0002,
-      "loss": 0.5704,
       "step": 23
     },
     {
-      "epoch": 0.0003860491488822671,
-      "grad_norm": 2.017615556716919,
       "learning_rate": 0.0002,
-      "loss": 0.4936,
       "step": 24
     },
     {
-      "epoch": 0.00040213453008569486,
-      "grad_norm": 2.527812957763672,
       "learning_rate": 0.0002,
-      "loss": 0.436,
       "step": 25
     },
     {
-      "epoch": 0.00041821991128912267,
-      "grad_norm": 2.738335132598877,
       "learning_rate": 0.0002,
-      "loss": 0.6511,
       "step": 26
     },
     {
-      "epoch": 0.0004343052924925505,
-      "grad_norm": 2.6857173442840576,
       "learning_rate": 0.0002,
-      "loss": 0.8459,
       "step": 27
     },
     {
-      "epoch": 0.00045039067369597824,
-      "grad_norm": 3.223954200744629,
       "learning_rate": 0.0002,
-      "loss": 0.5558,
       "step": 28
     },
     {
-      "epoch": 0.00046647605489940605,
-      "grad_norm": 2.828322649002075,
       "learning_rate": 0.0002,
-      "loss": 0.7201,
       "step": 29
     },
     {
-      "epoch": 0.00048256143610283386,
-      "grad_norm": 3.2195804119110107,
       "learning_rate": 0.0002,
-      "loss": 0.5933,
       "step": 30
     },
     {
-      "epoch": 0.0004986468173062617,
-      "grad_norm": 2.4919071197509766,
       "learning_rate": 0.0002,
-      "loss": 0.5764,
       "step": 31
     },
     {
-      "epoch": 0.0005147321985096894,
-      "grad_norm": 4.92438268661499,
       "learning_rate": 0.0002,
-      "loss": 0.9201,
       "step": 32
     },
     {
-      "epoch": 0.0005308175797131172,
-      "grad_norm": 2.232290267944336,
       "learning_rate": 0.0002,
-      "loss": 0.5863,
       "step": 33
     },
     {
-      "epoch": 0.000546902960916545,
-      "grad_norm": 3.7385706901550293,
       "learning_rate": 0.0002,
-      "loss": 0.9086,
       "step": 34
     },
     {
-      "epoch": 0.0005629883421199728,
-      "grad_norm": 3.262006998062134,
       "learning_rate": 0.0002,
-      "loss": 0.6261,
       "step": 35
     },
     {
-      "epoch": 0.0005790737233234006,
-      "grad_norm": 2.7973763942718506,
       "learning_rate": 0.0002,
-      "loss": 0.6955,
       "step": 36
     },
     {
-      "epoch": 0.0005951591045268284,
-      "grad_norm": 3.127302885055542,
       "learning_rate": 0.0002,
-      "loss": 0.7446,
       "step": 37
     },
     {
-      "epoch": 0.0006112444857302562,
-      "grad_norm": 2.1533172130584717,
       "learning_rate": 0.0002,
-      "loss": 0.5484,
       "step": 38
     },
     {
-      "epoch": 0.000627329866933684,
-      "grad_norm": 4.116796016693115,
       "learning_rate": 0.0002,
-      "loss": 0.7521,
       "step": 39
     },
     {
-      "epoch": 0.0006434152481371118,
-      "grad_norm": 4.400921821594238,
       "learning_rate": 0.0002,
-      "loss": 0.9317,
       "step": 40
     },
     {
-      "epoch": 0.0006595006293405396,
-      "grad_norm": 2.6137619018554688,
       "learning_rate": 0.0002,
-      "loss": 0.7086,
       "step": 41
     },
     {
-      "epoch": 0.0006755860105439673,
-      "grad_norm": 2.341974973678589,
       "learning_rate": 0.0002,
-      "loss": 0.5551,
       "step": 42
     },
     {
-      "epoch": 0.0006916713917473952,
-      "grad_norm": 2.7685954570770264,
       "learning_rate": 0.0002,
-      "loss": 0.7665,
       "step": 43
     },
     {
-      "epoch": 0.000707756772950823,
-      "grad_norm": 3.1898794174194336,
       "learning_rate": 0.0002,
-      "loss": 0.8037,
       "step": 44
     },
     {
-      "epoch": 0.0007238421541542507,
-      "grad_norm": 3.215623617172241,
       "learning_rate": 0.0002,
-      "loss": 0.9811,
       "step": 45
     },
     {
-      "epoch": 0.0007399275353576786,
-      "grad_norm": 3.3365135192871094,
       "learning_rate": 0.0002,
-      "loss": 0.7127,
       "step": 46
     },
     {
-      "epoch": 0.0007560129165611063,
-      "grad_norm": 4.518591403961182,
       "learning_rate": 0.0002,
-      "loss": 0.797,
       "step": 47
     },
     {
-      "epoch": 0.0007720982977645342,
-      "grad_norm": 2.179842948913574,
       "learning_rate": 0.0002,
-      "loss": 0.7091,
       "step": 48
     },
     {
-      "epoch": 0.000788183678967962,
-      "grad_norm": 2.5702974796295166,
       "learning_rate": 0.0002,
-      "loss": 0.8829,
       "step": 49
     },
     {
-      "epoch": 0.0008042690601713897,
-      "grad_norm": 2.2742362022399902,
       "learning_rate": 0.0002,
-      "loss": 0.5818,
       "step": 50
     },
     {
-      "epoch": 0.0008203544413748176,
-      "grad_norm": 3.2687766551971436,
       "learning_rate": 0.0002,
-      "loss": 0.7228,
       "step": 51
     },
     {
-      "epoch": 0.0008364398225782453,
-      "grad_norm": 3.5674126148223877,
       "learning_rate": 0.0002,
-      "loss": 0.8874,
       "step": 52
     },
     {
-      "epoch": 0.0008525252037816731,
-      "grad_norm": 2.703923225402832,
       "learning_rate": 0.0002,
-      "loss": 0.6596,
       "step": 53
     },
     {
-      "epoch": 0.000868610584985101,
-      "grad_norm": 2.3442795276641846,
       "learning_rate": 0.0002,
-      "loss": 0.8213,
       "step": 54
     },
     {
-      "epoch": 0.0008846959661885287,
-      "grad_norm": 3.142275094985962,
       "learning_rate": 0.0002,
-      "loss": 0.8181,
       "step": 55
     },
     {
-      "epoch": 0.0009007813473919565,
-      "grad_norm": 4.0531487464904785,
       "learning_rate": 0.0002,
-      "loss": 0.5939,
       "step": 56
     },
     {
-      "epoch": 0.0009168667285953843,
-      "grad_norm": 4.309750556945801,
       "learning_rate": 0.0002,
-      "loss": 0.867,
       "step": 57
     },
     {
-      "epoch": 0.0009329521097988121,
-      "grad_norm": 3.4528746604919434,
       "learning_rate": 0.0002,
-      "loss": 0.5944,
       "step": 58
     },
     {
-      "epoch": 0.0009490374910022399,
-      "grad_norm": 3.531193494796753,
       "learning_rate": 0.0002,
-      "loss": 0.7985,
       "step": 59
     },
     {
-      "epoch": 0.0009651228722056677,
-      "grad_norm": 3.000215768814087,
       "learning_rate": 0.0002,
-      "loss": 0.7939,
       "step": 60
     },
     {
-      "epoch": 0.0009812082534090955,
-      "grad_norm": 4.317079067230225,
       "learning_rate": 0.0002,
-      "loss": 0.6823,
       "step": 61
     },
     {
-      "epoch": 0.0009972936346125233,
-      "grad_norm": 3.4617133140563965,
       "learning_rate": 0.0002,
-      "loss": 0.7672,
       "step": 62
     },
     {
-      "epoch": 0.001013379015815951,
-      "grad_norm": 3.625797986984253,
       "learning_rate": 0.0002,
-      "loss": 0.7985,
       "step": 63
     },
     {
-      "epoch": 0.0010294643970193789,
-      "grad_norm": 4.261772632598877,
       "learning_rate": 0.0002,
-      "loss": 0.8154,
       "step": 64
     },
     {
-      "epoch": 0.0010455497782228067,
-      "grad_norm": 3.3078057765960693,
       "learning_rate": 0.0002,
-      "loss": 0.7663,
       "step": 65
     },
     {
-      "epoch": 0.0010616351594262344,
-      "grad_norm": 2.1908516883850098,
       "learning_rate": 0.0002,
-      "loss": 0.6996,
       "step": 66
     },
     {
-      "epoch": 0.0010777205406296622,
-      "grad_norm": 2.491776943206787,
       "learning_rate": 0.0002,
-      "loss": 0.659,
       "step": 67
     },
     {
-      "epoch": 0.00109380592183309,
-      "grad_norm": 2.7965214252471924,
       "learning_rate": 0.0002,
-      "loss": 0.6798,
       "step": 68
     },
     {
-      "epoch": 0.0011098913030365178,
-      "grad_norm": 3.3033552169799805,
       "learning_rate": 0.0002,
-      "loss": 0.9425,
       "step": 69
     },
     {
-      "epoch": 0.0011259766842399456,
-      "grad_norm": 2.6152732372283936,
       "learning_rate": 0.0002,
-      "loss": 0.9675,
       "step": 70
     },
     {
-      "epoch": 0.0011420620654433735,
-      "grad_norm": 2.942465305328369,
       "learning_rate": 0.0002,
-      "loss": 0.8886,
       "step": 71
     },
     {
-      "epoch": 0.0011581474466468011,
-      "grad_norm": 3.2040352821350098,
       "learning_rate": 0.0002,
-      "loss": 0.7208,
       "step": 72
     },
     {
-      "epoch": 0.001174232827850229,
-      "grad_norm": 5.6633501052856445,
       "learning_rate": 0.0002,
-      "loss": 0.9701,
       "step": 73
     },
     {
-      "epoch": 0.0011903182090536569,
-      "grad_norm": 2.924656867980957,
       "learning_rate": 0.0002,
-      "loss": 0.6366,
       "step": 74
     },
     {
-      "epoch": 0.0012064035902570845,
-      "grad_norm": 3.251835584640503,
       "learning_rate": 0.0002,
-      "loss": 0.8638,
       "step": 75
     },
     {
-      "epoch": 0.0012224889714605124,
-      "grad_norm": 3.145000696182251,
       "learning_rate": 0.0002,
-      "loss": 0.6692,
       "step": 76
     },
     {
-      "epoch": 0.0012385743526639402,
-      "grad_norm": 2.7392325401306152,
       "learning_rate": 0.0002,
-      "loss": 0.7459,
       "step": 77
     },
     {
-      "epoch": 0.001254659733867368,
-      "grad_norm": 2.8011040687561035,
       "learning_rate": 0.0002,
-      "loss": 0.7722,
       "step": 78
     },
     {
-      "epoch": 0.0012707451150707958,
-      "grad_norm": 3.5295469760894775,
       "learning_rate": 0.0002,
-      "loss": 0.7733,
       "step": 79
     },
     {
-      "epoch": 0.0012868304962742236,
-      "grad_norm": 2.9453213214874268,
       "learning_rate": 0.0002,
-      "loss": 0.6945,
       "step": 80
     },
     {
-      "epoch": 0.0013029158774776513,
-      "grad_norm": 3.2154369354248047,
       "learning_rate": 0.0002,
-      "loss": 0.8776,
       "step": 81
     },
     {
-      "epoch": 0.0013190012586810791,
-      "grad_norm": 3.536776065826416,
       "learning_rate": 0.0002,
-      "loss": 0.8774,
       "step": 82
     },
     {
-      "epoch": 0.001335086639884507,
-      "grad_norm": 2.8547418117523193,
       "learning_rate": 0.0002,
-      "loss": 0.7109,
       "step": 83
     },
     {
-      "epoch": 0.0013511720210879347,
-      "grad_norm": 3.4063565731048584,
       "learning_rate": 0.0002,
-      "loss": 0.8466,
       "step": 84
     },
     {
-      "epoch": 0.0013672574022913625,
-      "grad_norm": 5.920643329620361,
       "learning_rate": 0.0002,
-      "loss": 0.8423,
       "step": 85
     },
     {
-      "epoch": 0.0013833427834947904,
-      "grad_norm": 4.299768924713135,
       "learning_rate": 0.0002,
-      "loss": 1.0802,
       "step": 86
     },
     {
-      "epoch": 0.001399428164698218,
-      "grad_norm": 3.5304558277130127,
       "learning_rate": 0.0002,
-      "loss": 0.8542,
       "step": 87
     },
     {
-      "epoch": 0.001415513545901646,
-      "grad_norm": 3.0248117446899414,
       "learning_rate": 0.0002,
-      "loss": 0.6346,
       "step": 88
     },
     {
-      "epoch": 0.0014315989271050738,
-      "grad_norm": 3.5863444805145264,
       "learning_rate": 0.0002,
-      "loss": 0.9679,
       "step": 89
     },
     {
-      "epoch": 0.0014476843083085014,
-      "grad_norm": 3.6556644439697266,
       "learning_rate": 0.0002,
-      "loss": 0.7355,
       "step": 90
     },
     {
-      "epoch": 0.0014637696895119293,
-      "grad_norm": 3.691444158554077,
       "learning_rate": 0.0002,
-      "loss": 0.8556,
       "step": 91
     },
     {
-      "epoch": 0.0014798550707153572,
-      "grad_norm": 3.8535704612731934,
       "learning_rate": 0.0002,
-      "loss": 1.0531,
       "step": 92
     },
     {
-      "epoch": 0.0014959404519187848,
-      "grad_norm": 3.402984619140625,
       "learning_rate": 0.0002,
-      "loss": 0.7127,
       "step": 93
     },
     {
-      "epoch": 0.0015120258331222127,
-      "grad_norm": 2.967519760131836,
       "learning_rate": 0.0002,
-      "loss": 0.7416,
       "step": 94
     },
     {
-      "epoch": 0.0015281112143256405,
-      "grad_norm": 4.5817718505859375,
       "learning_rate": 0.0002,
-      "loss": 0.6667,
       "step": 95
     },
     {
-      "epoch": 0.0015441965955290684,
-      "grad_norm": 4.2193379402160645,
       "learning_rate": 0.0002,
-      "loss": 0.6914,
       "step": 96
     },
     {
-      "epoch": 0.001560281976732496,
-      "grad_norm": 4.412436485290527,
       "learning_rate": 0.0002,
-      "loss": 0.6476,
       "step": 97
     },
     {
-      "epoch": 0.001576367357935924,
-      "grad_norm": 3.960810661315918,
       "learning_rate": 0.0002,
-      "loss": 0.6829,
       "step": 98
     },
     {
-      "epoch": 0.0015924527391393518,
-      "grad_norm": 4.494846343994141,
       "learning_rate": 0.0002,
-      "loss": 0.899,
       "step": 99
     },
     {
-      "epoch": 0.0016085381203427794,
-      "grad_norm": 5.150880813598633,
       "learning_rate": 0.0002,
-      "loss": 0.8743,
       "step": 100
     },
     {
-      "epoch": 0.0016246235015462073,
-      "grad_norm": 3.156965970993042,
       "learning_rate": 0.0002,
-      "loss": 0.754,
       "step": 101
     },
     {
-      "epoch": 0.0016407088827496352,
-      "grad_norm": 3.00789213180542,
       "learning_rate": 0.0002,
-      "loss": 0.8606,
       "step": 102
     },
     {
-      "epoch": 0.0016567942639530628,
-      "grad_norm": 3.9045052528381348,
       "learning_rate": 0.0002,
-      "loss": 0.833,
       "step": 103
     },
     {
-      "epoch": 0.0016728796451564907,
-      "grad_norm": 3.0179498195648193,
       "learning_rate": 0.0002,
-      "loss": 0.6971,
       "step": 104
     },
     {
-      "epoch": 0.0016889650263599185,
-      "grad_norm": 3.441555976867676,
       "learning_rate": 0.0002,
-      "loss": 0.9697,
       "step": 105
     },
     {
-      "epoch": 0.0017050504075633462,
-      "grad_norm": 3.4271888732910156,
       "learning_rate": 0.0002,
-      "loss": 0.8264,
       "step": 106
     },
     {
-      "epoch": 0.001721135788766774,
-      "grad_norm": 3.3394598960876465,
       "learning_rate": 0.0002,
-      "loss": 0.7529,
       "step": 107
     },
     {
-      "epoch": 0.001737221169970202,
-      "grad_norm": 4.098421573638916,
       "learning_rate": 0.0002,
-      "loss": 0.7967,
       "step": 108
     },
     {
-      "epoch": 0.0017533065511736296,
-      "grad_norm": 5.323544979095459,
       "learning_rate": 0.0002,
-      "loss": 0.9429,
       "step": 109
     },
     {
-      "epoch": 0.0017693919323770574,
-      "grad_norm": 3.8546035289764404,
       "learning_rate": 0.0002,
-      "loss": 0.8392,
       "step": 110
     },
     {
-      "epoch": 0.0017854773135804853,
-      "grad_norm": 3.514596939086914,
       "learning_rate": 0.0002,
-      "loss": 0.904,
       "step": 111
     },
     {
-      "epoch": 0.001801562694783913,
-      "grad_norm": 4.436436653137207,
       "learning_rate": 0.0002,
-      "loss": 0.8841,
       "step": 112
     },
     {
-      "epoch": 0.0018176480759873408,
-      "grad_norm": 3.042628049850464,
       "learning_rate": 0.0002,
-      "loss": 0.6856,
       "step": 113
     },
     {
-      "epoch": 0.0018337334571907687,
-      "grad_norm": 3.558793306350708,
       "learning_rate": 0.0002,
-      "loss": 0.9463,
       "step": 114
     },
     {
-      "epoch": 0.0018498188383941963,
-      "grad_norm": 3.0797207355499268,
       "learning_rate": 0.0002,
-      "loss": 0.7813,
       "step": 115
     },
     {
-      "epoch": 0.0018659042195976242,
-      "grad_norm": 3.2403101921081543,
       "learning_rate": 0.0002,
-      "loss": 0.9499,
       "step": 116
     },
     {
-      "epoch": 0.001881989600801052,
-      "grad_norm": 3.385939121246338,
       "learning_rate": 0.0002,
-      "loss": 0.6545,
       "step": 117
     },
     {
-      "epoch": 0.0018980749820044797,
-      "grad_norm": 3.525153636932373,
       "learning_rate": 0.0002,
-      "loss": 0.9449,
       "step": 118
     },
     {
-      "epoch": 0.0019141603632079076,
-      "grad_norm": 2.670220375061035,
       "learning_rate": 0.0002,
-      "loss": 0.6208,
       "step": 119
     },
     {
-      "epoch": 0.0019302457444113354,
-      "grad_norm": 3.3499555587768555,
       "learning_rate": 0.0002,
-      "loss": 0.833,
       "step": 120
     },
     {
-      "epoch": 0.001946331125614763,
-      "grad_norm": 5.413862705230713,
       "learning_rate": 0.0002,
-      "loss": 1.2186,
       "step": 121
     },
     {
-      "epoch": 0.001962416506818191,
-      "grad_norm": 3.637068271636963,
       "learning_rate": 0.0002,
-      "loss": 0.8746,
       "step": 122
     },
     {
-      "epoch": 0.0019785018880216186,
-      "grad_norm": 6.209028244018555,
       "learning_rate": 0.0002,
-      "loss": 1.1379,
       "step": 123
     },
     {
-      "epoch": 0.0019945872692250467,
-      "grad_norm": 4.2924418449401855,
       "learning_rate": 0.0002,
-      "loss": 1.0075,
       "step": 124
     },
     {
-      "epoch": 0.0020106726504284743,
-      "grad_norm": 2.749718427658081,
       "learning_rate": 0.0002,
-      "loss": 0.694,
       "step": 125
     },
     {
-      "epoch": 0.002026758031631902,
-      "grad_norm": 4.217276573181152,
       "learning_rate": 0.0002,
-      "loss": 0.778,
       "step": 126
     },
     {
-      "epoch": 0.00204284341283533,
-      "grad_norm": 3.031771421432495,
       "learning_rate": 0.0002,
-      "loss": 0.9696,
       "step": 127
     },
     {
-      "epoch": 0.0020589287940387577,
-      "grad_norm": 3.4838218688964844,
       "learning_rate": 0.0002,
-      "loss": 0.6629,
       "step": 128
     },
     {
-      "epoch": 0.0020750141752421854,
-      "grad_norm": 3.218451738357544,
       "learning_rate": 0.0002,
-      "loss": 0.6899,
       "step": 129
     },
     {
-      "epoch": 0.0020910995564456135,
-      "grad_norm": 3.4607691764831543,
       "learning_rate": 0.0002,
-      "loss": 0.6832,
       "step": 130
     },
     {
-      "epoch": 0.002107184937649041,
-      "grad_norm": 3.70224666595459,
       "learning_rate": 0.0002,
-      "loss": 0.7241,
       "step": 131
     },
     {
-      "epoch": 0.0021232703188524688,
-      "grad_norm": 4.122409820556641,
       "learning_rate": 0.0002,
-      "loss": 0.8109,
       "step": 132
     },
     {
-      "epoch": 0.002139355700055897,
-      "grad_norm": 3.3417394161224365,
       "learning_rate": 0.0002,
-      "loss": 0.6684,
       "step": 133
     },
     {
-      "epoch": 0.0021554410812593245,
-      "grad_norm": 3.019958972930908,
       "learning_rate": 0.0002,
-      "loss": 0.7826,
       "step": 134
     },
     {
-      "epoch": 0.002171526462462752,
-      "grad_norm": 3.201491117477417,
       "learning_rate": 0.0002,
-      "loss": 0.7875,
       "step": 135
     },
     {
-      "epoch": 0.00218761184366618,
-      "grad_norm": 5.85605525970459,
       "learning_rate": 0.0002,
-      "loss": 1.1128,
       "step": 136
     },
     {
-      "epoch": 0.002203697224869608,
-      "grad_norm": 3.976530075073242,
       "learning_rate": 0.0002,
-      "loss": 0.8679,
       "step": 137
     },
     {
-      "epoch": 0.0022197826060730355,
-      "grad_norm": 3.621382713317871,
       "learning_rate": 0.0002,
-      "loss": 0.7601,
       "step": 138
     },
     {
-      "epoch": 0.0022358679872764636,
-      "grad_norm": 18.2700252532959,
       "learning_rate": 0.0002,
-      "loss": 0.9312,
       "step": 139
     },
     {
-      "epoch": 0.0022519533684798912,
-      "grad_norm": 3.050555467605591,
       "learning_rate": 0.0002,
-      "loss": 0.9431,
       "step": 140
     },
     {
-      "epoch": 0.002268038749683319,
-      "grad_norm": 4.187278747558594,
       "learning_rate": 0.0002,
-      "loss": 1.16,
       "step": 141
     },
     {
-      "epoch": 0.002284124130886747,
-      "grad_norm": 2.9168365001678467,
       "learning_rate": 0.0002,
-      "loss": 0.7853,
       "step": 142
     },
     {
-      "epoch": 0.0023002095120901746,
-      "grad_norm": 118.312744140625,
       "learning_rate": 0.0002,
-      "loss": 1.1003,
       "step": 143
     },
     {
-      "epoch": 0.0023162948932936023,
-      "grad_norm": 4.7243971824646,
       "learning_rate": 0.0002,
-      "loss": 0.694,
       "step": 144
     },
     {
-      "epoch": 0.0023323802744970304,
-      "grad_norm": 4.773429870605469,
       "learning_rate": 0.0002,
-      "loss": 0.7167,
       "step": 145
     },
     {
-      "epoch": 0.002348465655700458,
-      "grad_norm": 6.2195868492126465,
       "learning_rate": 0.0002,
-      "loss": 0.7979,
       "step": 146
     },
     {
-      "epoch": 0.0023645510369038857,
-      "grad_norm": 12.494455337524414,
       "learning_rate": 0.0002,
-      "loss": 1.2257,
       "step": 147
     },
     {
-      "epoch": 0.0023806364181073137,
-      "grad_norm": 6.841114521026611,
       "learning_rate": 0.0002,
-      "loss": 1.28,
       "step": 148
     },
     {
-      "epoch": 0.0023967217993107414,
-      "grad_norm": 5.901433944702148,
       "learning_rate": 0.0002,
-      "loss": 0.826,
       "step": 149
     },
     {
-      "epoch": 0.002412807180514169,
-      "grad_norm": 7.198768615722656,
       "learning_rate": 0.0002,
-      "loss": 0.7969,
       "step": 150
     },
     {
-      "epoch": 0.002428892561717597,
-      "grad_norm": 9.673176765441895,
       "learning_rate": 0.0002,
-      "loss": 0.8828,
       "step": 151
     },
     {
-      "epoch": 0.0024449779429210248,
-      "grad_norm": 10.305676460266113,
       "learning_rate": 0.0002,
-      "loss": 0.8668,
       "step": 152
     },
     {
-      "epoch": 0.0024610633241244524,
-      "grad_norm": 14.00606632232666,
       "learning_rate": 0.0002,
-      "loss": 0.9462,
       "step": 153
     },
     {
-      "epoch": 0.0024771487053278805,
-      "grad_norm": 6.559825897216797,
       "learning_rate": 0.0002,
-      "loss": 0.7042,
       "step": 154
     },
     {
-      "epoch": 0.002493234086531308,
-      "grad_norm": 3.9966037273406982,
       "learning_rate": 0.0002,
-      "loss": 0.8798,
       "step": 155
     },
     {
-      "epoch": 0.002509319467734736,
-      "grad_norm": 5.800797462463379,
       "learning_rate": 0.0002,
-      "loss": 0.7377,
       "step": 156
     },
     {
-      "epoch": 0.002525404848938164,
-      "grad_norm": 7.694753646850586,
       "learning_rate": 0.0002,
-      "loss": 0.9589,
       "step": 157
     },
     {
-      "epoch": 0.0025414902301415915,
-      "grad_norm": 4.698418617248535,
       "learning_rate": 0.0002,
-      "loss": 0.826,
       "step": 158
     },
     {
-      "epoch": 0.002557575611345019,
-      "grad_norm": 3.7439236640930176,
       "learning_rate": 0.0002,
-      "loss": 0.874,
       "step": 159
     },
     {
-      "epoch": 0.0025736609925484473,
-      "grad_norm": 4.441625118255615,
       "learning_rate": 0.0002,
-      "loss": 0.8844,
       "step": 160
     },
     {
-      "epoch": 0.002589746373751875,
-      "grad_norm": 4.822892665863037,
       "learning_rate": 0.0002,
-      "loss": 0.9741,
       "step": 161
     },
     {
-      "epoch": 0.0026058317549553026,
-      "grad_norm": 5.727447986602783,
       "learning_rate": 0.0002,
-      "loss": 1.228,
       "step": 162
     },
     {
-      "epoch": 0.0026219171361587306,
-      "grad_norm": 4.084842681884766,
       "learning_rate": 0.0002,
-      "loss": 0.8113,
       "step": 163
     },
     {
-      "epoch": 0.0026380025173621583,
-      "grad_norm": 4.884864330291748,
       "learning_rate": 0.0002,
-      "loss": 0.9853,
       "step": 164
     },
     {
-      "epoch": 0.002654087898565586,
-      "grad_norm": 4.315978527069092,
       "learning_rate": 0.0002,
-      "loss": 0.7985,
       "step": 165
     },
     {
-      "epoch": 0.002670173279769014,
-      "grad_norm": 3.958301544189453,
       "learning_rate": 0.0002,
-      "loss": 0.8639,
       "step": 166
     },
     {
-      "epoch": 0.0026862586609724417,
-      "grad_norm": 5.930337905883789,
       "learning_rate": 0.0002,
-      "loss": 0.9575,
       "step": 167
     },
     {
-      "epoch": 0.0027023440421758693,
-      "grad_norm": 3.374218702316284,
       "learning_rate": 0.0002,
-      "loss": 0.5752,
       "step": 168
     },
     {
-      "epoch": 0.0027184294233792974,
-      "grad_norm": 7.738460063934326,
       "learning_rate": 0.0002,
-      "loss": 1.1104,
       "step": 169
     },
     {
-      "epoch": 0.002734514804582725,
-      "grad_norm": 6.493184566497803,
       "learning_rate": 0.0002,
-      "loss": 0.9614,
       "step": 170
     },
     {
-      "epoch": 0.0027506001857861527,
-      "grad_norm": 7.904129981994629,
       "learning_rate": 0.0002,
-      "loss": 1.1735,
       "step": 171
     },
     {
-      "epoch": 0.002766685566989581,
-      "grad_norm": 6.135262489318848,
       "learning_rate": 0.0002,
-      "loss": 1.1976,
       "step": 172
     },
     {
-      "epoch": 0.0027827709481930084,
-      "grad_norm": 6.674580097198486,
       "learning_rate": 0.0002,
-      "loss": 0.7546,
       "step": 173
     },
     {
-      "epoch": 0.002798856329396436,
-      "grad_norm": 3.6253364086151123,
       "learning_rate": 0.0002,
-      "loss": 0.8027,
       "step": 174
     },
     {
-      "epoch": 0.002814941710599864,
-      "grad_norm": 3.2293593883514404,
       "learning_rate": 0.0002,
-      "loss": 0.8404,
       "step": 175
     },
     {
-      "epoch": 0.002831027091803292,
-      "grad_norm": 4.404852867126465,
       "learning_rate": 0.0002,
-      "loss": 0.8233,
       "step": 176
     },
     {
-      "epoch": 0.0028471124730067195,
-      "grad_norm": 9.036417007446289,
       "learning_rate": 0.0002,
-      "loss": 1.2197,
       "step": 177
     },
     {
-      "epoch": 0.0028631978542101475,
-      "grad_norm": 3.6753194332122803,
       "learning_rate": 0.0002,
-      "loss": 0.8155,
       "step": 178
     },
     {
-      "epoch": 0.002879283235413575,
-      "grad_norm": 4.148676872253418,
       "learning_rate": 0.0002,
-      "loss": 1.0028,
       "step": 179
     },
     {
-      "epoch": 0.002895368616617003,
-      "grad_norm": 10.267266273498535,
       "learning_rate": 0.0002,
-      "loss": 0.8078,
       "step": 180
     },
     {
-      "epoch": 0.002911453997820431,
-      "grad_norm": 5.570545673370361,
       "learning_rate": 0.0002,
-      "loss": 0.9974,
       "step": 181
     },
     {
-      "epoch": 0.0029275393790238586,
-      "grad_norm": 6.258678436279297,
       "learning_rate": 0.0002,
-      "loss": 1.1986,
       "step": 182
     },
     {
-      "epoch": 0.0029436247602272862,
-      "grad_norm": 11.766939163208008,
       "learning_rate": 0.0002,
-      "loss": 0.8153,
       "step": 183
     },
     {
-      "epoch": 0.0029597101414307143,
-      "grad_norm": 4.668914318084717,
       "learning_rate": 0.0002,
-      "loss": 0.7482,
       "step": 184
     },
     {
-      "epoch": 0.002975795522634142,
-      "grad_norm": 3.728922128677368,
       "learning_rate": 0.0002,
-      "loss": 0.7389,
       "step": 185
     },
     {
-      "epoch": 0.0029918809038375696,
-      "grad_norm": 3.9253530502319336,
       "learning_rate": 0.0002,
-      "loss": 0.8526,
       "step": 186
     },
     {
-      "epoch": 0.0030079662850409977,
-      "grad_norm": 4.449740409851074,
       "learning_rate": 0.0002,
-      "loss": 0.8117,
       "step": 187
     },
     {
-      "epoch": 0.0030240516662444253,
-      "grad_norm": 3.856152296066284,
       "learning_rate": 0.0002,
-      "loss": 0.6481,
       "step": 188
     },
     {
-      "epoch": 0.0030401370474478534,
-      "grad_norm": 140.99961853027344,
       "learning_rate": 0.0002,
-      "loss": 2.8234,
       "step": 189
     },
     {
-      "epoch": 0.003056222428651281,
-      "grad_norm": 4.190764904022217,
       "learning_rate": 0.0002,
-      "loss": 0.7266,
       "step": 190
     },
     {
-      "epoch": 0.0030723078098547087,
-      "grad_norm": 3.9606616497039795,
       "learning_rate": 0.0002,
-      "loss": 0.8465,
       "step": 191
     },
     {
-      "epoch": 0.003088393191058137,
-      "grad_norm": 4.197356700897217,
       "learning_rate": 0.0002,
-      "loss": 0.7764,
       "step": 192
     },
     {
-      "epoch": 0.0031044785722615644,
-      "grad_norm": 4.308269023895264,
       "learning_rate": 0.0002,
-      "loss": 0.6308,
       "step": 193
     },
     {
-      "epoch": 0.003120563953464992,
-      "grad_norm": 7.85593843460083,
       "learning_rate": 0.0002,
-      "loss": 1.2231,
       "step": 194
     },
     {
-      "epoch": 0.00313664933466842,
-      "grad_norm": 5.271966934204102,
       "learning_rate": 0.0002,
-      "loss": 0.6263,
       "step": 195
     },
     {
-      "epoch": 0.003152734715871848,
-      "grad_norm": 4.99168062210083,
       "learning_rate": 0.0002,
-      "loss": 0.8379,
       "step": 196
     },
     {
-      "epoch": 0.0031688200970752755,
-      "grad_norm": 4.923642635345459,
       "learning_rate": 0.0002,
-      "loss": 0.7982,
       "step": 197
     },
     {
-      "epoch": 0.0031849054782787036,
-      "grad_norm": 8.511445999145508,
       "learning_rate": 0.0002,
-      "loss": 0.8379,
       "step": 198
     },
     {
-      "epoch": 0.003200990859482131,
-      "grad_norm": 6.066445350646973,
       "learning_rate": 0.0002,
-      "loss": 0.7347,
       "step": 199
     },
     {
-      "epoch": 0.003217076240685559,
-      "grad_norm": 6.310784339904785,
       "learning_rate": 0.0002,
-      "loss": 0.9526,
       "step": 200
     },
     {
-      "epoch": 0.003217076240685559,
-      "eval_loss": 0.7864285707473755,
-      "eval_runtime": 25.6512,
-      "eval_samples_per_second": 9.707,
-      "eval_steps_per_second": 9.707,
       "step": 200
     }
   ],
   "logging_steps": 1,
-  "max_steps": 186504,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 3,
   "save_steps": 200,
@@ -1451,8 +1451,8 @@
       "attributes": {}
     }
   },
-  "total_flos": 5911953172070400.0,
-  "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null
 }

 {
+  "best_metric": 0.3310258388519287,
   "best_model_checkpoint": "miner_id_24/checkpoint-200",
+  "epoch": 0.006434126607526319,
   "eval_steps": 200,
   "global_step": 200,
   "is_hyper_param_search": false,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 3.21706330376316e-05,
+      "grad_norm": 0.2609178423881531,
       "learning_rate": 2e-05,
+      "loss": 0.5746,
       "step": 1
     },
     {
+      "epoch": 3.21706330376316e-05,
+      "eval_loss": 0.46101704239845276,
+      "eval_runtime": 28.8598,
+      "eval_samples_per_second": 8.628,
+      "eval_steps_per_second": 4.331,
       "step": 1
     },
     {
+      "epoch": 6.43412660752632e-05,
+      "grad_norm": 0.23752833902835846,
       "learning_rate": 4e-05,
+      "loss": 0.3828,
       "step": 2
     },
     {
+      "epoch": 9.65118991128948e-05,
+      "grad_norm": 0.25489795207977295,
       "learning_rate": 6e-05,
+      "loss": 0.4657,
       "step": 3
     },
     {
+      "epoch": 0.0001286825321505264,
+      "grad_norm": 0.3867496848106384,
       "learning_rate": 8e-05,
+      "loss": 0.5709,
       "step": 4
     },
     {
+      "epoch": 0.000160853165188158,
+      "grad_norm": 0.21718668937683105,
       "learning_rate": 0.0001,
+      "loss": 0.4053,
       "step": 5
     },
     {
+      "epoch": 0.0001930237982257896,
+      "grad_norm": 0.22927387058734894,
       "learning_rate": 0.00012,
+      "loss": 0.3848,
       "step": 6
     },
     {
+      "epoch": 0.00022519443126342117,
+      "grad_norm": 0.2708449363708496,
       "learning_rate": 0.00014,
+      "loss": 0.3925,
       "step": 7
     },
     {
+      "epoch": 0.0002573650643010528,
+      "grad_norm": 0.378393292427063,
       "learning_rate": 0.00016,
+      "loss": 0.3656,
       "step": 8
     },
     {
+      "epoch": 0.0002895356973386844,
+      "grad_norm": 0.5767518281936646,
       "learning_rate": 0.00018,
+      "loss": 0.3179,
       "step": 9
     },
     {
+      "epoch": 0.000321706330376316,
+      "grad_norm": 0.6105942726135254,
       "learning_rate": 0.0002,
+      "loss": 0.3719,
       "step": 10
     },
     {
+      "epoch": 0.00035387696341394756,
+      "grad_norm": 0.4722791016101837,
       "learning_rate": 0.0002,
+      "loss": 0.3447,
       "step": 11
     },
     {
+      "epoch": 0.0003860475964515792,
+      "grad_norm": 0.4688263237476349,
       "learning_rate": 0.0002,
+      "loss": 0.3055,
       "step": 12
     },
     {
+      "epoch": 0.00041821822948921077,
+      "grad_norm": 0.35639023780822754,
       "learning_rate": 0.0002,
+      "loss": 0.2548,
       "step": 13
     },
     {
+      "epoch": 0.00045038886252684235,
+      "grad_norm": 0.4272077679634094,
       "learning_rate": 0.0002,
+      "loss": 0.3488,
       "step": 14
     },
     {
+      "epoch": 0.000482559495564474,
+      "grad_norm": 0.3379782736301422,
       "learning_rate": 0.0002,
+      "loss": 0.3432,
       "step": 15
     },
     {
+      "epoch": 0.0005147301286021056,
+      "grad_norm": 0.6516053676605225,
       "learning_rate": 0.0002,
+      "loss": 0.3964,
       "step": 16
     },
     {
+      "epoch": 0.0005469007616397372,
+      "grad_norm": 0.6698662042617798,
       "learning_rate": 0.0002,
+      "loss": 0.3674,
       "step": 17
     },
     {
+      "epoch": 0.0005790713946773688,
+      "grad_norm": 0.9461187124252319,
       "learning_rate": 0.0002,
+      "loss": 0.2705,
       "step": 18
     },
     {
+      "epoch": 0.0006112420277150003,
+      "grad_norm": 0.5860435366630554,
       "learning_rate": 0.0002,
+      "loss": 0.3152,
       "step": 19
     },
     {
+      "epoch": 0.000643412660752632,
+      "grad_norm": 0.4475187659263611,
       "learning_rate": 0.0002,
+      "loss": 0.4237,
       "step": 20
     },
     {
+      "epoch": 0.0006755832937902636,
+      "grad_norm": 0.5749617218971252,
       "learning_rate": 0.0002,
+      "loss": 0.2669,
       "step": 21
     },
     {
+      "epoch": 0.0007077539268278951,
+      "grad_norm": 0.6806007623672485,
       "learning_rate": 0.0002,
+      "loss": 0.3513,
       "step": 22
     },
     {
+      "epoch": 0.0007399245598655268,
+      "grad_norm": 0.6327475905418396,
       "learning_rate": 0.0002,
+      "loss": 0.3948,
       "step": 23
     },
     {
+      "epoch": 0.0007720951929031584,
+      "grad_norm": 0.5336435437202454,
       "learning_rate": 0.0002,
+      "loss": 0.3854,
       "step": 24
     },
     {
+      "epoch": 0.0008042658259407899,
+      "grad_norm": 0.5399162173271179,
       "learning_rate": 0.0002,
+      "loss": 0.3333,
       "step": 25
     },
     {
+      "epoch": 0.0008364364589784215,
+      "grad_norm": 0.7226356863975525,
       "learning_rate": 0.0002,
+      "loss": 0.3978,
       "step": 26
     },
     {
+      "epoch": 0.0008686070920160532,
+      "grad_norm": 0.6512770652770996,
       "learning_rate": 0.0002,
+      "loss": 0.3304,
       "step": 27
     },
     {
+      "epoch": 0.0009007777250536847,
+      "grad_norm": 0.7261360287666321,
       "learning_rate": 0.0002,
+      "loss": 0.3004,
       "step": 28
     },
     {
+      "epoch": 0.0009329483580913163,
+      "grad_norm": 0.5120699405670166,
       "learning_rate": 0.0002,
+      "loss": 0.3675,
       "step": 29
     },
     {
+      "epoch": 0.000965118991128948,
+      "grad_norm": 0.5695130228996277,
       "learning_rate": 0.0002,
+      "loss": 0.3728,
       "step": 30
     },
     {
+      "epoch": 0.0009972896241665795,
+      "grad_norm": 0.5845438838005066,
       "learning_rate": 0.0002,
+      "loss": 0.2799,
       "step": 31
     },
     {
+      "epoch": 0.001029460257204211,
+      "grad_norm": 0.5468902587890625,
       "learning_rate": 0.0002,
+      "loss": 0.3652,
       "step": 32
     },
     {
+      "epoch": 0.0010616308902418427,
+      "grad_norm": 0.5148847699165344,
       "learning_rate": 0.0002,
+      "loss": 0.3211,
       "step": 33
     },
     {
+      "epoch": 0.0010938015232794744,
+      "grad_norm": 0.4925091862678528,
       "learning_rate": 0.0002,
+      "loss": 0.2873,
       "step": 34
     },
     {
+      "epoch": 0.001125972156317106,
+      "grad_norm": 0.45565104484558105,
       "learning_rate": 0.0002,
+      "loss": 0.4159,
       "step": 35
     },
     {
+      "epoch": 0.0011581427893547376,
+      "grad_norm": 0.44539037346839905,
       "learning_rate": 0.0002,
+      "loss": 0.3451,
       "step": 36
     },
     {
+      "epoch": 0.001190313422392369,
+      "grad_norm": 0.5995281338691711,
       "learning_rate": 0.0002,
+      "loss": 0.3294,
       "step": 37
     },
     {
+      "epoch": 0.0012224840554300007,
+      "grad_norm": 0.5937873125076294,
       "learning_rate": 0.0002,
+      "loss": 0.3174,
       "step": 38
     },
     {
+      "epoch": 0.0012546546884676323,
+      "grad_norm": 0.5223010182380676,
       "learning_rate": 0.0002,
+      "loss": 0.3538,
       "step": 39
     },
     {
+      "epoch": 0.001286825321505264,
+      "grad_norm": 0.45076924562454224,
       "learning_rate": 0.0002,
+      "loss": 0.2965,
       "step": 40
     },
     {
+      "epoch": 0.0013189959545428956,
+      "grad_norm": 0.5829368233680725,
       "learning_rate": 0.0002,
+      "loss": 0.3505,
       "step": 41
     },
     {
+      "epoch": 0.0013511665875805272,
+      "grad_norm": 0.5640948414802551,
       "learning_rate": 0.0002,
+      "loss": 0.3274,
       "step": 42
     },
     {
+      "epoch": 0.0013833372206181586,
+      "grad_norm": 0.5946338772773743,
       "learning_rate": 0.0002,
+      "loss": 0.3784,
       "step": 43
     },
     {
+      "epoch": 0.0014155078536557902,
+      "grad_norm": 0.49370312690734863,
       "learning_rate": 0.0002,
+      "loss": 0.3396,
       "step": 44
     },
     {
+      "epoch": 0.0014476784866934219,
+      "grad_norm": 0.552584707736969,
       "learning_rate": 0.0002,
+      "loss": 0.3028,
       "step": 45
     },
     {
+      "epoch": 0.0014798491197310535,
+      "grad_norm": 0.6281300187110901,
       "learning_rate": 0.0002,
+      "loss": 0.4233,
       "step": 46
     },
     {
+      "epoch": 0.0015120197527686851,
+      "grad_norm": 0.4612821638584137,
       "learning_rate": 0.0002,
+      "loss": 0.2995,
       "step": 47
     },
     {
+      "epoch": 0.0015441903858063168,
+      "grad_norm": 0.4370185434818268,
       "learning_rate": 0.0002,
+      "loss": 0.243,
       "step": 48
     },
     {
+      "epoch": 0.0015763610188439484,
+      "grad_norm": 0.6153799891471863,
       "learning_rate": 0.0002,
+      "loss": 0.2805,
       "step": 49
     },
     {
+      "epoch": 0.0016085316518815798,
+      "grad_norm": 0.6847407817840576,
       "learning_rate": 0.0002,
+      "loss": 0.3014,
       "step": 50
     },
     {
+      "epoch": 0.0016407022849192114,
+      "grad_norm": 0.49228036403656006,
       "learning_rate": 0.0002,
+      "loss": 0.3074,
       "step": 51
     },
     {
+      "epoch": 0.001672872917956843,
+      "grad_norm": 0.5665944218635559,
       "learning_rate": 0.0002,
+      "loss": 0.2657,
       "step": 52
     },
     {
+      "epoch": 0.0017050435509944747,
+      "grad_norm": 0.5495525002479553,
       "learning_rate": 0.0002,
+      "loss": 0.4076,
       "step": 53
     },
     {
+      "epoch": 0.0017372141840321063,
+      "grad_norm": 0.7097938656806946,
       "learning_rate": 0.0002,
+      "loss": 0.3506,
       "step": 54
     },
     {
+      "epoch": 0.001769384817069738,
+      "grad_norm": 0.6005829572677612,
       "learning_rate": 0.0002,
+      "loss": 0.3041,
       "step": 55
     },
     {
+      "epoch": 0.0018015554501073694,
+      "grad_norm": 0.4742415249347687,
       "learning_rate": 0.0002,
+      "loss": 0.3457,
       "step": 56
     },
     {
+      "epoch": 0.001833726083145001,
+      "grad_norm": 0.6460862159729004,
       "learning_rate": 0.0002,
+      "loss": 0.352,
       "step": 57
     },
     {
+      "epoch": 0.0018658967161826326,
+      "grad_norm": 0.5325047373771667,
       "learning_rate": 0.0002,
+      "loss": 0.3353,
       "step": 58
     },
     {
+      "epoch": 0.0018980673492202643,
+      "grad_norm": 0.550370991230011,
       "learning_rate": 0.0002,
+      "loss": 0.3186,
       "step": 59
     },
     {
+      "epoch": 0.001930237982257896,
+      "grad_norm": 0.5427353978157043,
       "learning_rate": 0.0002,
+      "loss": 0.2949,
       "step": 60
     },
     {
+      "epoch": 0.0019624086152955273,
+      "grad_norm": 0.7852073907852173,
       "learning_rate": 0.0002,
+      "loss": 0.4574,
       "step": 61
     },
     {
+      "epoch": 0.001994579248333159,
+      "grad_norm": 0.584457516670227,
       "learning_rate": 0.0002,
+      "loss": 0.3599,
       "step": 62
     },
     {
+      "epoch": 0.0020267498813707906,
+      "grad_norm": 0.6022618412971497,
       "learning_rate": 0.0002,
+      "loss": 0.2653,
       "step": 63
     },
     {
+      "epoch": 0.002058920514408422,
+      "grad_norm": 0.5993865728378296,
       "learning_rate": 0.0002,
+      "loss": 0.3466,
       "step": 64
     },
     {
+      "epoch": 0.002091091147446054,
+      "grad_norm": 0.5613014698028564,
       "learning_rate": 0.0002,
+      "loss": 0.2818,
       "step": 65
     },
     {
+      "epoch": 0.0021232617804836855,
+      "grad_norm": 0.5827286243438721,
       "learning_rate": 0.0002,
+      "loss": 0.2641,
       "step": 66
     },
     {
+      "epoch": 0.002155432413521317,
+      "grad_norm": 0.6258942484855652,
       "learning_rate": 0.0002,
+      "loss": 0.3016,
       "step": 67
     },
     {
+      "epoch": 0.0021876030465589487,
+      "grad_norm": 0.5805741548538208,
       "learning_rate": 0.0002,
+      "loss": 0.4094,
       "step": 68
     },
     {
+      "epoch": 0.0022197736795965804,
+      "grad_norm": 0.6247344017028809,
       "learning_rate": 0.0002,
+      "loss": 0.2884,
       "step": 69
     },
     {
+      "epoch": 0.002251944312634212,
+      "grad_norm": 0.6786600351333618,
       "learning_rate": 0.0002,
+      "loss": 0.352,
       "step": 70
     },
     {
+      "epoch": 0.0022841149456718436,
+      "grad_norm": 0.5860627889633179,
       "learning_rate": 0.0002,
+      "loss": 0.409,
       "step": 71
     },
     {
+      "epoch": 0.0023162855787094753,
+      "grad_norm": 0.5486606955528259,
       "learning_rate": 0.0002,
+      "loss": 0.3088,
       "step": 72
     },
     {
+      "epoch": 0.0023484562117471065,
+      "grad_norm": 0.7216318249702454,
       "learning_rate": 0.0002,
+      "loss": 0.3077,
       "step": 73
     },
     {
+      "epoch": 0.002380626844784738,
+      "grad_norm": 0.7599589228630066,
       "learning_rate": 0.0002,
+      "loss": 0.4633,
       "step": 74
     },
     {
+      "epoch": 0.0024127974778223697,
+      "grad_norm": 0.5651103854179382,
       "learning_rate": 0.0002,
+      "loss": 0.297,
       "step": 75
     },
     {
+      "epoch": 0.0024449681108600014,
+      "grad_norm": 0.5803356170654297,
       "learning_rate": 0.0002,
+      "loss": 0.2458,
       "step": 76
     },
     {
+      "epoch": 0.002477138743897633,
+      "grad_norm": 0.5019489526748657,
       "learning_rate": 0.0002,
+      "loss": 0.3269,
       "step": 77
     },
     {
+      "epoch": 0.0025093093769352646,
+      "grad_norm": 0.5693783760070801,
       "learning_rate": 0.0002,
+      "loss": 0.2905,
       "step": 78
     },
     {
+      "epoch": 0.0025414800099728962,
+      "grad_norm": 0.7466827034950256,
       "learning_rate": 0.0002,
+      "loss": 0.3504,
       "step": 79
     },
     {
+      "epoch": 0.002573650643010528,
+      "grad_norm": 0.6029163599014282,
       "learning_rate": 0.0002,
+      "loss": 0.2978,
       "step": 80
     },
     {
+      "epoch": 0.0026058212760481595,
+      "grad_norm": 0.6179245114326477,
       "learning_rate": 0.0002,
+      "loss": 0.4017,
       "step": 81
     },
     {
+      "epoch": 0.002637991909085791,
+      "grad_norm": 0.7006585001945496,
       "learning_rate": 0.0002,
+      "loss": 0.3377,
       "step": 82
     },
     {
+      "epoch": 0.0026701625421234228,
+      "grad_norm": 0.5809662938117981,
       "learning_rate": 0.0002,
+      "loss": 0.3141,
       "step": 83
     },
     {
+      "epoch": 0.0027023331751610544,
+      "grad_norm": 0.58149254322052,
       "learning_rate": 0.0002,
+      "loss": 0.2874,
       "step": 84
     },
     {
+      "epoch": 0.002734503808198686,
+      "grad_norm": 0.8158010840415955,
       "learning_rate": 0.0002,
+      "loss": 0.4517,
       "step": 85
     },
     {
+      "epoch": 0.0027666744412363172,
+      "grad_norm": 0.6752007603645325,
       "learning_rate": 0.0002,
+      "loss": 0.4586,
       "step": 86
     },
     {
+      "epoch": 0.002798845074273949,
+      "grad_norm": 0.6040322780609131,
       "learning_rate": 0.0002,
+      "loss": 0.2622,
       "step": 87
     },
     {
+      "epoch": 0.0028310157073115805,
+      "grad_norm": 0.7154407501220703,
       "learning_rate": 0.0002,
+      "loss": 0.3639,
       "step": 88
     },
     {
+      "epoch": 0.002863186340349212,
+      "grad_norm": 0.6612291932106018,
       "learning_rate": 0.0002,
+      "loss": 0.3774,
       "step": 89
     },
     {
+      "epoch": 0.0028953569733868438,
+      "grad_norm": 0.7525337338447571,
       "learning_rate": 0.0002,
+      "loss": 0.4023,
       "step": 90
     },
     {
+      "epoch": 0.0029275276064244754,
+      "grad_norm": 0.6839393377304077,
       "learning_rate": 0.0002,
+      "loss": 0.456,
       "step": 91
     },
     {
+      "epoch": 0.002959698239462107,
+      "grad_norm": 0.6505508422851562,
       "learning_rate": 0.0002,
+      "loss": 0.2841,
       "step": 92
     },
     {
+      "epoch": 0.0029918688724997386,
+      "grad_norm": 0.541343092918396,
       "learning_rate": 0.0002,
+      "loss": 0.301,
       "step": 93
     },
     {
+      "epoch": 0.0030240395055373703,
+      "grad_norm": 0.5484374165534973,
       "learning_rate": 0.0002,
+      "loss": 0.2425,
       "step": 94
     },
     {
+      "epoch": 0.003056210138575002,
+      "grad_norm": 0.5703783631324768,
       "learning_rate": 0.0002,
+      "loss": 0.6352,
       "step": 95
     },
     {
+      "epoch": 0.0030883807716126335,
+      "grad_norm": 0.5479252934455872,
       "learning_rate": 0.0002,
+      "loss": 0.3013,
       "step": 96
     },
     {
+      "epoch": 0.003120551404650265,
+      "grad_norm": 0.7930196523666382,
       "learning_rate": 0.0002,
+      "loss": 0.3676,
       "step": 97
     },
     {
+      "epoch": 0.003152722037687897,
+      "grad_norm": 0.6894263029098511,
       "learning_rate": 0.0002,
+      "loss": 0.26,
       "step": 98
     },
     {
+      "epoch": 0.003184892670725528,
+      "grad_norm": 0.639010488986969,
       "learning_rate": 0.0002,
+      "loss": 0.2685,
       "step": 99
     },
     {
+      "epoch": 0.0032170633037631596,
+      "grad_norm": 0.6424719095230103,
       "learning_rate": 0.0002,
+      "loss": 0.2628,
       "step": 100
     },
     {
+      "epoch": 0.0032492339368007913,
+      "grad_norm": 0.7405576109886169,
       "learning_rate": 0.0002,
+      "loss": 0.3001,
       "step": 101
     },
     {
+      "epoch": 0.003281404569838423,
+      "grad_norm": 0.6489754915237427,
       "learning_rate": 0.0002,
+      "loss": 0.3407,
       "step": 102
     },
     {
+      "epoch": 0.0033135752028760545,
+      "grad_norm": 0.6659820675849915,
       "learning_rate": 0.0002,
+      "loss": 0.3678,
       "step": 103
     },
     {
+      "epoch": 0.003345745835913686,
+      "grad_norm": 0.706896960735321,
       "learning_rate": 0.0002,
+      "loss": 0.3577,
       "step": 104
     },
     {
+      "epoch": 0.003377916468951318,
+      "grad_norm": 0.6583238840103149,
       "learning_rate": 0.0002,
+      "loss": 0.3567,
       "step": 105
     },
     {
+      "epoch": 0.0034100871019889494,
+      "grad_norm": 0.7842928767204285,
       "learning_rate": 0.0002,
+      "loss": 0.3746,
       "step": 106
     },
     {
+      "epoch": 0.003442257735026581,
+      "grad_norm": 0.7192911505699158,
       "learning_rate": 0.0002,
+      "loss": 0.381,
       "step": 107
     },
     {
+      "epoch": 0.0034744283680642127,
+      "grad_norm": 0.8255159258842468,
       "learning_rate": 0.0002,
+      "loss": 0.4093,
       "step": 108
     },
     {
+      "epoch": 0.0035065990011018443,
+      "grad_norm": 0.731376588344574,
       "learning_rate": 0.0002,
+      "loss": 0.2749,
       "step": 109
     },
     {
+      "epoch": 0.003538769634139476,
+      "grad_norm": 0.7096914649009705,
       "learning_rate": 0.0002,
+      "loss": 0.2585,
       "step": 110
     },
     {
+      "epoch": 0.0035709402671771076,
+      "grad_norm": 0.7141759991645813,
       "learning_rate": 0.0002,
+      "loss": 0.3652,
       "step": 111
     },
     {
+      "epoch": 0.0036031109002147388,
+      "grad_norm": 0.8442528247833252,
       "learning_rate": 0.0002,
+      "loss": 0.301,
       "step": 112
     },
     {
+      "epoch": 0.0036352815332523704,
+      "grad_norm": 0.8419767618179321,
       "learning_rate": 0.0002,
+      "loss": 0.4241,
       "step": 113
     },
     {
+      "epoch": 0.003667452166290002,
+      "grad_norm": 0.7170063257217407,
       "learning_rate": 0.0002,
+      "loss": 0.4237,
       "step": 114
     },
     {
+      "epoch": 0.0036996227993276337,
+      "grad_norm": 0.7070204019546509,
       "learning_rate": 0.0002,
+      "loss": 0.4084,
       "step": 115
     },
     {
+      "epoch": 0.0037317934323652653,
+      "grad_norm": 0.6054997444152832,
       "learning_rate": 0.0002,
+      "loss": 0.3128,
       "step": 116
     },
     {
+      "epoch": 0.003763964065402897,
+      "grad_norm": 0.5738762021064758,
       "learning_rate": 0.0002,
+      "loss": 0.4329,
       "step": 117
     },
     {
+      "epoch": 0.0037961346984405286,
+      "grad_norm": 0.6349337100982666,
       "learning_rate": 0.0002,
+      "loss": 0.3271,
       "step": 118
     },
     {
+      "epoch": 0.00382830533147816,
+      "grad_norm": 0.6344738006591797,
       "learning_rate": 0.0002,
+      "loss": 0.3099,
       "step": 119
     },
     {
+      "epoch": 0.003860475964515792,
+      "grad_norm": 0.7301223874092102,
       "learning_rate": 0.0002,
+      "loss": 0.4126,
       "step": 120
     },
     {
+      "epoch": 0.0038926465975534234,
+      "grad_norm": 0.7683565616607666,
       "learning_rate": 0.0002,
+      "loss": 0.3152,
       "step": 121
     },
     {
+      "epoch": 0.003924817230591055,
+      "grad_norm": 0.5495012402534485,
       "learning_rate": 0.0002,
+      "loss": 0.2863,
       "step": 122
     },
     {
+      "epoch": 0.003956987863628686,
+      "grad_norm": 0.6135990023612976,
       "learning_rate": 0.0002,
+      "loss": 0.3381,
       "step": 123
     },
     {
+      "epoch": 0.003989158496666318,
+      "grad_norm": 0.8360633850097656,
       "learning_rate": 0.0002,
+      "loss": 0.3739,
       "step": 124
     },
     {
+      "epoch": 0.0040213291297039495,
+      "grad_norm": 0.7187512516975403,
       "learning_rate": 0.0002,
+      "loss": 0.24,
       "step": 125
     },
     {
+      "epoch": 0.004053499762741581,
+      "grad_norm": 0.7280769348144531,
       "learning_rate": 0.0002,
+      "loss": 0.3286,
       "step": 126
     },
     {
+      "epoch": 0.004085670395779213,
+      "grad_norm": 0.6523069143295288,
       "learning_rate": 0.0002,
+      "loss": 0.2758,
       "step": 127
     },
     {
+      "epoch": 0.004117841028816844,
+      "grad_norm": 0.6205531358718872,
       "learning_rate": 0.0002,
+      "loss": 0.3053,
       "step": 128
     },
     {
+      "epoch": 0.004150011661854476,
+      "grad_norm": 0.8631265163421631,
       "learning_rate": 0.0002,
+      "loss": 0.3149,
       "step": 129
     },
     {
+      "epoch": 0.004182182294892108,
+      "grad_norm": 0.7578058242797852,
       "learning_rate": 0.0002,
+      "loss": 0.5391,
       "step": 130
     },
     {
+      "epoch": 0.004214352927929739,
+      "grad_norm": 0.7494041323661804,
       "learning_rate": 0.0002,
+      "loss": 0.2909,
       "step": 131
     },
     {
+      "epoch": 0.004246523560967371,
+      "grad_norm": 0.6875420808792114,
       "learning_rate": 0.0002,
+      "loss": 0.3464,
       "step": 132
     },
     {
+      "epoch": 0.004278694194005003,
+      "grad_norm": 0.6999627947807312,
       "learning_rate": 0.0002,
+      "loss": 0.374,
       "step": 133
     },
     {
+      "epoch": 0.004310864827042634,
+      "grad_norm": 0.7990955114364624,
       "learning_rate": 0.0002,
+      "loss": 0.3734,
       "step": 134
     },
     {
+      "epoch": 0.004343035460080266,
+      "grad_norm": 0.8626115918159485,
       "learning_rate": 0.0002,
+      "loss": 0.4212,
       "step": 135
     },
     {
+      "epoch": 0.0043752060931178975,
+      "grad_norm": 0.7310900092124939,
       "learning_rate": 0.0002,
+      "loss": 0.3488,
       "step": 136
     },
     {
+      "epoch": 0.004407376726155529,
+      "grad_norm": 0.8717100024223328,
       "learning_rate": 0.0002,
+      "loss": 0.3812,
       "step": 137
     },
     {
+      "epoch": 0.004439547359193161,
+      "grad_norm": 0.6076570153236389,
       "learning_rate": 0.0002,
+      "loss": 0.2651,
       "step": 138
     },
     {
+      "epoch": 0.004471717992230792,
+      "grad_norm": 0.9252959489822388,
       "learning_rate": 0.0002,
+      "loss": 0.352,
       "step": 139
     },
     {
+      "epoch": 0.004503888625268424,
+      "grad_norm": 0.9629406929016113,
       "learning_rate": 0.0002,
+      "loss": 0.3687,
       "step": 140
     },
     {
+      "epoch": 0.004536059258306056,
+      "grad_norm": 0.6811290979385376,
       "learning_rate": 0.0002,
+      "loss": 0.3433,
       "step": 141
     },
     {
+      "epoch": 0.004568229891343687,
+      "grad_norm": 1.0294033288955688,
       "learning_rate": 0.0002,
+      "loss": 0.4794,
       "step": 142
     },
     {
+      "epoch": 0.004600400524381319,
+      "grad_norm": 0.6730893850326538,
       "learning_rate": 0.0002,
+      "loss": 0.3088,
       "step": 143
     },
     {
+      "epoch": 0.0046325711574189505,
+      "grad_norm": 0.8225754499435425,
       "learning_rate": 0.0002,
+      "loss": 0.3625,
       "step": 144
     },
     {
+      "epoch": 0.004664741790456581,
+      "grad_norm": 0.6656695008277893,
       "learning_rate": 0.0002,
+      "loss": 0.3459,
       "step": 145
     },
     {
+      "epoch": 0.004696912423494213,
+      "grad_norm": 0.6955097913742065,
       "learning_rate": 0.0002,
+      "loss": 0.3035,
       "step": 146
     },
     {
+      "epoch": 0.0047290830565318446,
+      "grad_norm": 0.9222290515899658,
       "learning_rate": 0.0002,
+      "loss": 0.3599,
       "step": 147
     },
     {
+      "epoch": 0.004761253689569476,
+      "grad_norm": 0.7328464984893799,
       "learning_rate": 0.0002,
+      "loss": 0.2753,
       "step": 148
     },
     {
+      "epoch": 0.004793424322607108,
+      "grad_norm": 0.6858335137367249,
       "learning_rate": 0.0002,
+      "loss": 0.3916,
       "step": 149
     },
     {
+      "epoch": 0.0048255949556447394,
+      "grad_norm": 0.8160132765769958,
       "learning_rate": 0.0002,
+      "loss": 0.3318,
       "step": 150
     },
     {
+      "epoch": 0.004857765588682371,
+      "grad_norm": 0.7775120735168457,
       "learning_rate": 0.0002,
+      "loss": 0.2875,
       "step": 151
     },
     {
+      "epoch": 0.004889936221720003,
+      "grad_norm": 0.7653348445892334,
       "learning_rate": 0.0002,
+      "loss": 0.2932,
       "step": 152
     },
     {
+      "epoch": 0.004922106854757634,
+      "grad_norm": 0.7305892705917358,
       "learning_rate": 0.0002,
+      "loss": 0.3026,
       "step": 153
     },
     {
+      "epoch": 0.004954277487795266,
+      "grad_norm": 0.7535127997398376,
       "learning_rate": 0.0002,
+      "loss": 0.2856,
       "step": 154
     },
     {
+      "epoch": 0.004986448120832898,
+      "grad_norm": 0.7200407981872559,
       "learning_rate": 0.0002,
+      "loss": 0.3097,
       "step": 155
     },
     {
+      "epoch": 0.005018618753870529,
+      "grad_norm": 0.8162491917610168,
       "learning_rate": 0.0002,
+      "loss": 0.3619,
       "step": 156
     },
     {
+      "epoch": 0.005050789386908161,
+      "grad_norm": 1.155956506729126,
       "learning_rate": 0.0002,
+      "loss": 0.4073,
       "step": 157
     },
     {
+      "epoch": 0.0050829600199457925,
+      "grad_norm": 0.7546277046203613,
       "learning_rate": 0.0002,
+      "loss": 0.349,
       "step": 158
     },
     {
+      "epoch": 0.005115130652983424,
+      "grad_norm": 0.5961102247238159,
       "learning_rate": 0.0002,
+      "loss": 0.2786,
       "step": 159
     },
     {
+      "epoch": 0.005147301286021056,
+      "grad_norm": 0.7311742901802063,
       "learning_rate": 0.0002,
+      "loss": 0.3039,
       "step": 160
     },
     {
+      "epoch": 0.005179471919058687,
+      "grad_norm": 1.035477876663208,
       "learning_rate": 0.0002,
+      "loss": 0.4212,
       "step": 161
     },
     {
+      "epoch": 0.005211642552096319,
+      "grad_norm": 0.8163110017776489,
       "learning_rate": 0.0002,
+      "loss": 0.387,
       "step": 162
     },
     {
+      "epoch": 0.005243813185133951,
+      "grad_norm": 0.6917060017585754,
       "learning_rate": 0.0002,
+      "loss": 0.3123,
       "step": 163
     },
     {
+      "epoch": 0.005275983818171582,
+      "grad_norm": 0.8115301132202148,
       "learning_rate": 0.0002,
+      "loss": 0.3335,
       "step": 164
     },
     {
+      "epoch": 0.005308154451209214,
+      "grad_norm": 0.972899854183197,
       "learning_rate": 0.0002,
+      "loss": 0.3542,
       "step": 165
     },
     {
+      "epoch": 0.0053403250842468455,
+      "grad_norm": 0.8446269035339355,
       "learning_rate": 0.0002,
+      "loss": 0.3183,
       "step": 166
     },
     {
+      "epoch": 0.005372495717284477,
+      "grad_norm": 0.7965036630630493,
       "learning_rate": 0.0002,
+      "loss": 0.352,
       "step": 167
     },
     {
+      "epoch": 0.005404666350322109,
+      "grad_norm": 0.7153119444847107,
       "learning_rate": 0.0002,
+      "loss": 0.3603,
       "step": 168
     },
     {
+      "epoch": 0.0054368369833597404,
+      "grad_norm": 0.7775716185569763,
       "learning_rate": 0.0002,
+      "loss": 0.3528,
       "step": 169
     },
     {
+      "epoch": 0.005469007616397372,
+      "grad_norm": 0.7184603214263916,
       "learning_rate": 0.0002,
+      "loss": 0.365,
       "step": 170
     },
     {
+      "epoch": 0.005501178249435003,
+      "grad_norm": 0.6972705125808716,
       "learning_rate": 0.0002,
+      "loss": 0.2906,
       "step": 171
     },
     {
+      "epoch": 0.0055333488824726345,
+      "grad_norm": 0.9904060363769531,
       "learning_rate": 0.0002,
+      "loss": 0.4492,
       "step": 172
     },
     {
+      "epoch": 0.005565519515510266,
+      "grad_norm": 0.852296769618988,
       "learning_rate": 0.0002,
+      "loss": 0.2981,
       "step": 173
     },
     {
+      "epoch": 0.005597690148547898,
+      "grad_norm": 0.6921360492706299,
       "learning_rate": 0.0002,
+      "loss": 0.2875,
       "step": 174
     },
     {
+      "epoch": 0.005629860781585529,
+      "grad_norm": 0.7813829779624939,
       "learning_rate": 0.0002,
+      "loss": 0.3991,
       "step": 175
     },
     {
+      "epoch": 0.005662031414623161,
+      "grad_norm": 0.8460421562194824,
       "learning_rate": 0.0002,
+      "loss": 0.3269,
       "step": 176
     },
     {
+      "epoch": 0.005694202047660793,
+      "grad_norm": 0.793835461139679,
       "learning_rate": 0.0002,
+      "loss": 0.2896,
       "step": 177
     },
     {
+      "epoch": 0.005726372680698424,
+      "grad_norm": 0.8878104090690613,
       "learning_rate": 0.0002,
+      "loss": 0.402,
       "step": 178
     },
     {
+      "epoch": 0.005758543313736056,
+      "grad_norm": 0.8582636713981628,
       "learning_rate": 0.0002,
+      "loss": 0.4386,
       "step": 179
     },
     {
+      "epoch": 0.0057907139467736875,
+      "grad_norm": 0.8274714350700378,
       "learning_rate": 0.0002,
+      "loss": 0.3228,
       "step": 180
     },
     {
+      "epoch": 0.005822884579811319,
+      "grad_norm": 0.8849393725395203,
       "learning_rate": 0.0002,
+      "loss": 0.3835,
       "step": 181
     },
     {
+      "epoch": 0.005855055212848951,
+      "grad_norm": 1.2293494939804077,
       "learning_rate": 0.0002,
+      "loss": 0.2947,
       "step": 182
     },
     {
+      "epoch": 0.005887225845886582,
+      "grad_norm": 0.766805112361908,
       "learning_rate": 0.0002,
+      "loss": 0.4024,
       "step": 183
     },
     {
+      "epoch": 0.005919396478924214,
+      "grad_norm": 1.023227572441101,
       "learning_rate": 0.0002,
+      "loss": 0.2848,
       "step": 184
     },
     {
+      "epoch": 0.005951567111961846,
+      "grad_norm": 0.8333758115768433,
       "learning_rate": 0.0002,
+      "loss": 0.3701,
       "step": 185
     },
     {
+      "epoch": 0.005983737744999477,
+      "grad_norm": 0.9221575260162354,
       "learning_rate": 0.0002,
+      "loss": 0.2953,
       "step": 186
     },
     {
+      "epoch": 0.006015908378037109,
+      "grad_norm": 0.7268536686897278,
       "learning_rate": 0.0002,
+      "loss": 0.3487,
       "step": 187
     },
     {
+      "epoch": 0.0060480790110747406,
+      "grad_norm": 0.7841563820838928,
       "learning_rate": 0.0002,
+      "loss": 0.3737,
       "step": 188
     },
     {
+      "epoch": 0.006080249644112372,
+      "grad_norm": 0.7304165959358215,
       "learning_rate": 0.0002,
+      "loss": 0.3718,
       "step": 189
     },
     {
+      "epoch": 0.006112420277150004,
+      "grad_norm": 0.9500126838684082,
       "learning_rate": 0.0002,
+      "loss": 0.3661,
       "step": 190
     },
     {
+      "epoch": 0.0061445909101876355,
+      "grad_norm": 1.027346134185791,
       "learning_rate": 0.0002,
+      "loss": 0.3755,
       "step": 191
     },
     {
+      "epoch": 0.006176761543225267,
+      "grad_norm": 0.6862695217132568,
       "learning_rate": 0.0002,
+      "loss": 0.3208,
       "step": 192
     },
     {
+      "epoch": 0.006208932176262899,
+      "grad_norm": 0.7714293003082275,
       "learning_rate": 0.0002,
+      "loss": 0.361,
       "step": 193
     },
     {
+      "epoch": 0.00624110280930053,
+      "grad_norm": 0.8124901056289673,
       "learning_rate": 0.0002,
+      "loss": 0.3565,
       "step": 194
     },
     {
+      "epoch": 0.006273273442338162,
+      "grad_norm": 0.7867235541343689,
       "learning_rate": 0.0002,
+      "loss": 0.3192,
       "step": 195
     },
     {
+      "epoch": 0.006305444075375794,
+      "grad_norm": 0.7322407364845276,
       "learning_rate": 0.0002,
+      "loss": 0.3075,
       "step": 196
     },
     {
+      "epoch": 0.006337614708413425,
+      "grad_norm": 0.9365407228469849,
       "learning_rate": 0.0002,
+      "loss": 0.2552,
       "step": 197
     },
     {
+      "epoch": 0.006369785341451056,
+      "grad_norm": 0.9422205686569214,
       "learning_rate": 0.0002,
+      "loss": 0.336,
       "step": 198
     },
     {
+      "epoch": 0.006401955974488688,
+      "grad_norm": 0.76619952917099,
       "learning_rate": 0.0002,
+      "loss": 0.2907,
       "step": 199
     },
     {
+      "epoch": 0.006434126607526319,
+      "grad_norm": 0.7989760041236877,
       "learning_rate": 0.0002,
+      "loss": 0.3402,
       "step": 200
     },
     {
+      "epoch": 0.006434126607526319,
+      "eval_loss": 0.3310258388519287,
+      "eval_runtime": 28.8107,
+      "eval_samples_per_second": 8.643,
+      "eval_steps_per_second": 4.339,
       "step": 200
     }
   ],
   "logging_steps": 1,
+  "max_steps": 93252,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 3,
   "save_steps": 200,
       "attributes": {}
     }
   },
+  "total_flos": 1.298945783365632e+16,
+  "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null
 }

last-checkpoint/training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c5bfb8f9ee0d17252ff4577fc9c15127560771b7e188338420238d872618fd3b
 size 6776

 version https://git-lfs.github.com/spec/v1
+oid sha256:559463d95a91aa4519eb17ce5aba32cad078cdd9196a6b9560209fa7cf008a3b
 size 6776