Training in progress, step 20, checkpoint

Browse files

Files changed (7) hide show

last-checkpoint/adapter_config.json +3 -3
last-checkpoint/adapter_model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +78 -818
last-checkpoint/training_args.bin +1 -1

last-checkpoint/adapter_config.json CHANGED Viewed

@@ -20,13 +20,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
     "k_proj",
-    "gate_proj",
     "v_proj",
     "down_proj",
     "o_proj",
-    "up_proj",
-    "q_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

   "rank_pattern": {},
   "revision": null,
   "target_modules": [
+    "q_proj",
     "k_proj",
     "v_proj",
+    "gate_proj",
     "down_proj",
     "o_proj",
+    "up_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

last-checkpoint/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e36bb4966b3713f17079f0f0073225f3c17789e78598436f125bc5847c546220
 size 45118424

 version https://git-lfs.github.com/spec/v1
+oid sha256:0bf014d1a50f271c41f7422261b08ca5ec84dc1faabd04c29231ef2836d36445
 size 45118424

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:31631c7141c9eed8d3d67722b7f007bb55e7b4644efb82e4b7c07b72a46d6b5f
 size 23159290

 version https://git-lfs.github.com/spec/v1
+oid sha256:b9606721fc8617b61d2e0dc2ec8042ef4e6afb22d62a62595ce9ec2026c0ad30
 size 23159290

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:330e765b24011cd6e18b8db74d77f7195e5780a184071a5df72e72c642350c23
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:490faae3574e0545627c6c066345113a5ec4be88337cd4484537a0d75c6be16a
 size 14244

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:61c2b4927e3039b26d377375be782c03ce853d193f96b5868ccf559441e84af9
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:1c297c5cf11a27c75d9f99f1df69752f78c3ad41b0275adf50cdd1b67f9d0bb3
 size 1064

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,914 +1,174 @@
 {
-  "best_metric": 1.1519354581832886,
-  "best_model_checkpoint": "miner_id_besimray/checkpoint-60",
-  "epoch": 2.526315789473684,
   "eval_steps": 20,
-  "global_step": 120,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.021052631578947368,
-      "grad_norm": 0.7695803046226501,
       "learning_rate": 2e-05,
-      "loss": 1.3028,
       "step": 1
     },
     {
-      "epoch": 0.021052631578947368,
-      "eval_loss": 1.2579221725463867,
-      "eval_runtime": 2.0651,
-      "eval_samples_per_second": 48.423,
-      "eval_steps_per_second": 4.842,
       "step": 1
     },
     {
-      "epoch": 0.042105263157894736,
-      "grad_norm": 0.7731568217277527,
       "learning_rate": 4e-05,
-      "loss": 1.4572,
       "step": 2
     },
     {
-      "epoch": 0.06315789473684211,
-      "grad_norm": 0.6739473342895508,
       "learning_rate": 6e-05,
-      "loss": 1.2761,
       "step": 3
     },
     {
-      "epoch": 0.08421052631578947,
-      "grad_norm": 0.713449239730835,
       "learning_rate": 8e-05,
-      "loss": 1.4221,
       "step": 4
     },
     {
-      "epoch": 0.10526315789473684,
-      "grad_norm": 0.5318827629089355,
       "learning_rate": 0.0001,
-      "loss": 1.2373,
       "step": 5
     },
     {
-      "epoch": 0.12631578947368421,
-      "grad_norm": 0.5601332783699036,
       "learning_rate": 0.00012,
-      "loss": 1.3898,
       "step": 6
     },
     {
-      "epoch": 0.14736842105263157,
-      "grad_norm": 0.6797667741775513,
       "learning_rate": 0.00014,
-      "loss": 1.3347,
       "step": 7
     },
     {
-      "epoch": 0.16842105263157894,
-      "grad_norm": 0.5191617012023926,
       "learning_rate": 0.00016,
-      "loss": 1.2194,
       "step": 8
     },
     {
-      "epoch": 0.18947368421052632,
-      "grad_norm": 0.5978218913078308,
       "learning_rate": 0.00018,
-      "loss": 1.2025,
       "step": 9
     },
     {
-      "epoch": 0.21052631578947367,
-      "grad_norm": 0.4920961558818817,
       "learning_rate": 0.0002,
-      "loss": 1.378,
       "step": 10
     },
     {
-      "epoch": 0.23157894736842105,
-      "grad_norm": 0.44265127182006836,
       "learning_rate": 0.00019997482349425066,
-      "loss": 1.1907,
       "step": 11
     },
     {
-      "epoch": 0.25263157894736843,
-      "grad_norm": 0.3402289152145386,
       "learning_rate": 0.00019989930665413147,
-      "loss": 1.2153,
       "step": 12
     },
     {
-      "epoch": 0.2736842105263158,
-      "grad_norm": 0.33481013774871826,
       "learning_rate": 0.0001997734875046456,
-      "loss": 1.0648,
       "step": 13
     },
     {
-      "epoch": 0.29473684210526313,
-      "grad_norm": 0.3752918243408203,
       "learning_rate": 0.00019959742939952392,
-      "loss": 1.0774,
       "step": 14
     },
     {
-      "epoch": 0.3157894736842105,
-      "grad_norm": 0.37364915013313293,
       "learning_rate": 0.00019937122098932428,
-      "loss": 1.003,
       "step": 15
     },
     {
-      "epoch": 0.3368421052631579,
-      "grad_norm": 0.3115549683570862,
       "learning_rate": 0.00019909497617679348,
-      "loss": 1.2112,
       "step": 16
     },
     {
-      "epoch": 0.35789473684210527,
-      "grad_norm": 0.3663255572319031,
       "learning_rate": 0.00019876883405951377,
-      "loss": 1.281,
       "step": 17
     },
     {
-      "epoch": 0.37894736842105264,
-      "grad_norm": 0.325300008058548,
       "learning_rate": 0.00019839295885986296,
-      "loss": 1.2251,
       "step": 18
     },
     {
-      "epoch": 0.4,
-      "grad_norm": 0.3866478204727173,
       "learning_rate": 0.00019796753984232358,
-      "loss": 1.2657,
       "step": 19
     },
     {
-      "epoch": 0.42105263157894735,
-      "grad_norm": 0.3811936378479004,
       "learning_rate": 0.00019749279121818235,
-      "loss": 1.3521,
       "step": 20
     },
     {
-      "epoch": 0.42105263157894735,
-      "eval_loss": 1.1702154874801636,
-      "eval_runtime": 2.0888,
-      "eval_samples_per_second": 47.875,
-      "eval_steps_per_second": 4.788,
       "step": 20
-    },
-    {
-      "epoch": 0.4421052631578947,
-      "grad_norm": 0.4274454414844513,
-      "learning_rate": 0.0001969689520376687,
-      "loss": 1.0972,
-      "step": 21
-    },
-    {
-      "epoch": 0.4631578947368421,
-      "grad_norm": 0.3145941197872162,
-      "learning_rate": 0.00019639628606958533,
-      "loss": 1.1746,
-      "step": 22
-    },
-    {
-      "epoch": 0.4842105263157895,
-      "grad_norm": 0.34570032358169556,
-      "learning_rate": 0.00019577508166849304,
-      "loss": 1.2273,
-      "step": 23
-    },
-    {
-      "epoch": 0.5052631578947369,
-      "grad_norm": 0.4847642481327057,
-      "learning_rate": 0.00019510565162951537,
-      "loss": 1.2152,
-      "step": 24
-    },
-    {
-      "epoch": 0.5263157894736842,
-      "grad_norm": 0.3375917375087738,
-      "learning_rate": 0.00019438833303083678,
-      "loss": 1.1977,
-      "step": 25
-    },
-    {
-      "epoch": 0.5473684210526316,
-      "grad_norm": 0.42273998260498047,
-      "learning_rate": 0.00019362348706397373,
-      "loss": 1.24,
-      "step": 26
-    },
-    {
-      "epoch": 0.5684210526315789,
-      "grad_norm": 0.4141988456249237,
-      "learning_rate": 0.0001928114988519039,
-      "loss": 1.2562,
-      "step": 27
-    },
-    {
-      "epoch": 0.5894736842105263,
-      "grad_norm": 0.382915198802948,
-      "learning_rate": 0.0001919527772551451,
-      "loss": 1.226,
-      "step": 28
-    },
-    {
-      "epoch": 0.6105263157894737,
-      "grad_norm": 0.37382128834724426,
-      "learning_rate": 0.00019104775466588161,
-      "loss": 1.3248,
-      "step": 29
-    },
-    {
-      "epoch": 0.631578947368421,
-      "grad_norm": 0.3107808232307434,
-      "learning_rate": 0.0001900968867902419,
-      "loss": 1.1354,
-      "step": 30
-    },
-    {
-      "epoch": 0.6526315789473685,
-      "grad_norm": 0.34155750274658203,
-      "learning_rate": 0.0001891006524188368,
-      "loss": 1.0803,
-      "step": 31
-    },
-    {
-      "epoch": 0.6736842105263158,
-      "grad_norm": 0.3141622543334961,
-      "learning_rate": 0.0001880595531856738,
-      "loss": 1.1457,
-      "step": 32
-    },
-    {
-      "epoch": 0.6947368421052632,
-      "grad_norm": 0.35257869958877563,
-      "learning_rate": 0.00018697411331556956,
-      "loss": 1.2322,
-      "step": 33
-    },
-    {
-      "epoch": 0.7157894736842105,
-      "grad_norm": 0.42415115237236023,
-      "learning_rate": 0.00018584487936018661,
-      "loss": 1.179,
-      "step": 34
-    },
-    {
-      "epoch": 0.7368421052631579,
-      "grad_norm": 0.3805026710033417,
-      "learning_rate": 0.00018467241992282843,
-      "loss": 1.126,
-      "step": 35
-    },
-    {
-      "epoch": 0.7578947368421053,
-      "grad_norm": 0.3650873601436615,
-      "learning_rate": 0.00018345732537213027,
-      "loss": 1.2333,
-      "step": 36
-    },
-    {
-      "epoch": 0.7789473684210526,
-      "grad_norm": 0.3254134953022003,
-      "learning_rate": 0.00018220020754479102,
-      "loss": 1.2721,
-      "step": 37
-    },
-    {
-      "epoch": 0.8,
-      "grad_norm": 0.36688175797462463,
-      "learning_rate": 0.00018090169943749476,
-      "loss": 1.2615,
-      "step": 38
-    },
-    {
-      "epoch": 0.8210526315789474,
-      "grad_norm": 0.3320186734199524,
-      "learning_rate": 0.00017956245488817812,
-      "loss": 1.1474,
-      "step": 39
-    },
-    {
-      "epoch": 0.8421052631578947,
-      "grad_norm": 0.37685626745224,
-      "learning_rate": 0.000178183148246803,
-      "loss": 1.1977,
-      "step": 40
-    },
-    {
-      "epoch": 0.8421052631578947,
-      "eval_loss": 1.153311014175415,
-      "eval_runtime": 2.0851,
-      "eval_samples_per_second": 47.96,
-      "eval_steps_per_second": 4.796,
-      "step": 40
-    },
-    {
-      "epoch": 0.8631578947368421,
-      "grad_norm": 0.43908432126045227,
-      "learning_rate": 0.0001767644740358011,
-      "loss": 1.2448,
-      "step": 41
-    },
-    {
-      "epoch": 0.8842105263157894,
-      "grad_norm": 0.3212919235229492,
-      "learning_rate": 0.00017530714660036112,
-      "loss": 1.1699,
-      "step": 42
-    },
-    {
-      "epoch": 0.9052631578947369,
-      "grad_norm": 0.339679479598999,
-      "learning_rate": 0.00017381189974873407,
-      "loss": 1.1572,
-      "step": 43
-    },
-    {
-      "epoch": 0.9263157894736842,
-      "grad_norm": 0.3269651234149933,
-      "learning_rate": 0.00017227948638273916,
-      "loss": 1.1494,
-      "step": 44
-    },
-    {
-      "epoch": 0.9473684210526315,
-      "grad_norm": 0.34232962131500244,
-      "learning_rate": 0.00017071067811865476,
-      "loss": 1.2262,
-      "step": 45
-    },
-    {
-      "epoch": 0.968421052631579,
-      "grad_norm": 0.34240803122520447,
-      "learning_rate": 0.00016910626489868649,
-      "loss": 1.1834,
-      "step": 46
-    },
-    {
-      "epoch": 0.9894736842105263,
-      "grad_norm": 0.35933125019073486,
-      "learning_rate": 0.00016746705459320745,
-      "loss": 1.0391,
-      "step": 47
-    },
-    {
-      "epoch": 1.0105263157894737,
-      "grad_norm": 0.3355937898159027,
-      "learning_rate": 0.00016579387259397127,
-      "loss": 1.2872,
-      "step": 48
-    },
-    {
-      "epoch": 1.0315789473684212,
-      "grad_norm": 0.3706349730491638,
-      "learning_rate": 0.0001640875613985024,
-      "loss": 1.0775,
-      "step": 49
-    },
-    {
-      "epoch": 1.0526315789473684,
-      "grad_norm": 0.427852988243103,
-      "learning_rate": 0.00016234898018587337,
-      "loss": 1.1524,
-      "step": 50
-    },
-    {
-      "epoch": 1.0736842105263158,
-      "grad_norm": 0.3533117473125458,
-      "learning_rate": 0.000160579004384082,
-      "loss": 1.1181,
-      "step": 51
-    },
-    {
-      "epoch": 1.0947368421052632,
-      "grad_norm": 0.3712696135044098,
-      "learning_rate": 0.00015877852522924732,
-      "loss": 1.0986,
-      "step": 52
-    },
-    {
-      "epoch": 1.1157894736842104,
-      "grad_norm": 0.3790956437587738,
-      "learning_rate": 0.0001569484493168452,
-      "loss": 1.1749,
-      "step": 53
-    },
-    {
-      "epoch": 1.1368421052631579,
-      "grad_norm": 0.3779037296772003,
-      "learning_rate": 0.00015508969814521025,
-      "loss": 1.1089,
-      "step": 54
-    },
-    {
-      "epoch": 1.1578947368421053,
-      "grad_norm": 0.36196696758270264,
-      "learning_rate": 0.00015320320765153367,
-      "loss": 1.0186,
-      "step": 55
-    },
-    {
-      "epoch": 1.1789473684210527,
-      "grad_norm": 0.3449699282646179,
-      "learning_rate": 0.00015128992774059063,
-      "loss": 1.064,
-      "step": 56
-    },
-    {
-      "epoch": 1.2,
-      "grad_norm": 0.43372786045074463,
-      "learning_rate": 0.0001493508218064347,
-      "loss": 1.007,
-      "step": 57
-    },
-    {
-      "epoch": 1.2210526315789474,
-      "grad_norm": 0.35580453276634216,
-      "learning_rate": 0.00014738686624729986,
-      "loss": 1.0295,
-      "step": 58
-    },
-    {
-      "epoch": 1.2421052631578948,
-      "grad_norm": 0.47308239340782166,
-      "learning_rate": 0.00014539904997395468,
-      "loss": 1.1361,
-      "step": 59
-    },
-    {
-      "epoch": 1.263157894736842,
-      "grad_norm": 0.3692001402378082,
-      "learning_rate": 0.00014338837391175582,
-      "loss": 1.099,
-      "step": 60
-    },
-    {
-      "epoch": 1.263157894736842,
-      "eval_loss": 1.1519354581832886,
-      "eval_runtime": 2.053,
-      "eval_samples_per_second": 48.71,
-      "eval_steps_per_second": 4.871,
-      "step": 60
-    },
-    {
-      "epoch": 1.2842105263157895,
-      "grad_norm": 0.4118487238883972,
-      "learning_rate": 0.00014135585049665207,
-      "loss": 0.9891,
-      "step": 61
-    },
-    {
-      "epoch": 1.305263157894737,
-      "grad_norm": 0.32802432775497437,
-      "learning_rate": 0.00013930250316539238,
-      "loss": 0.9878,
-      "step": 62
-    },
-    {
-      "epoch": 1.3263157894736843,
-      "grad_norm": 0.41467538475990295,
-      "learning_rate": 0.00013722936584019453,
-      "loss": 1.0542,
-      "step": 63
-    },
-    {
-      "epoch": 1.3473684210526315,
-      "grad_norm": 0.39795804023742676,
-      "learning_rate": 0.0001351374824081343,
-      "loss": 1.1358,
-      "step": 64
-    },
-    {
-      "epoch": 1.368421052631579,
-      "grad_norm": 0.3385366201400757,
-      "learning_rate": 0.00013302790619551674,
-      "loss": 1.1107,
-      "step": 65
-    },
-    {
-      "epoch": 1.3894736842105262,
-      "grad_norm": 0.4300186336040497,
-      "learning_rate": 0.00013090169943749476,
-      "loss": 1.0554,
-      "step": 66
-    },
-    {
-      "epoch": 1.4105263157894736,
-      "grad_norm": 0.4523608982563019,
-      "learning_rate": 0.00012875993274320173,
-      "loss": 1.1442,
-      "step": 67
-    },
-    {
-      "epoch": 1.431578947368421,
-      "grad_norm": 0.48153308033943176,
-      "learning_rate": 0.00012660368455666752,
-      "loss": 1.1677,
-      "step": 68
-    },
-    {
-      "epoch": 1.4526315789473685,
-      "grad_norm": 0.46898069977760315,
-      "learning_rate": 0.0001244340406137894,
-      "loss": 1.1212,
-      "step": 69
-    },
-    {
-      "epoch": 1.4736842105263157,
-      "grad_norm": 0.3733386695384979,
-      "learning_rate": 0.00012225209339563145,
-      "loss": 0.9843,
-      "step": 70
-    },
-    {
-      "epoch": 1.4947368421052631,
-      "grad_norm": 0.4410829544067383,
-      "learning_rate": 0.00012005894157832729,
-      "loss": 1.1679,
-      "step": 71
-    },
-    {
-      "epoch": 1.5157894736842106,
-      "grad_norm": 0.46537336707115173,
-      "learning_rate": 0.00011785568947986367,
-      "loss": 1.0453,
-      "step": 72
-    },
-    {
-      "epoch": 1.5368421052631578,
-      "grad_norm": 0.39270663261413574,
-      "learning_rate": 0.0001156434465040231,
-      "loss": 1.1019,
-      "step": 73
-    },
-    {
-      "epoch": 1.5578947368421052,
-      "grad_norm": 0.3547813296318054,
-      "learning_rate": 0.00011342332658176555,
-      "loss": 0.9807,
-      "step": 74
-    },
-    {
-      "epoch": 1.5789473684210527,
-      "grad_norm": 0.33064335584640503,
-      "learning_rate": 0.00011119644761033078,
-      "loss": 0.9903,
-      "step": 75
-    },
-    {
-      "epoch": 1.6,
-      "grad_norm": 0.41019386053085327,
-      "learning_rate": 0.00010896393089034336,
-      "loss": 0.9956,
-      "step": 76
-    },
-    {
-      "epoch": 1.6210526315789475,
-      "grad_norm": 0.43731600046157837,
-      "learning_rate": 0.00010672690056120399,
-      "loss": 0.9657,
-      "step": 77
-    },
-    {
-      "epoch": 1.6421052631578947,
-      "grad_norm": 0.38457056879997253,
-      "learning_rate": 0.00010448648303505151,
-      "loss": 1.1255,
-      "step": 78
-    },
-    {
-      "epoch": 1.663157894736842,
-      "grad_norm": 0.4372155964374542,
-      "learning_rate": 0.00010224380642958052,
-      "loss": 1.105,
-      "step": 79
-    },
-    {
-      "epoch": 1.6842105263157894,
-      "grad_norm": 0.4701666533946991,
-      "learning_rate": 0.0001,
-      "loss": 1.0658,
-      "step": 80
-    },
-    {
-      "epoch": 1.6842105263157894,
-      "eval_loss": 1.152337908744812,
-      "eval_runtime": 2.0462,
-      "eval_samples_per_second": 48.871,
-      "eval_steps_per_second": 4.887,
-      "step": 80
-    },
-    {
-      "epoch": 1.7052631578947368,
-      "grad_norm": 0.44070982933044434,
-      "learning_rate": 9.775619357041952e-05,
-      "loss": 1.1024,
-      "step": 81
-    },
-    {
-      "epoch": 1.7263157894736842,
-      "grad_norm": 0.5059276819229126,
-      "learning_rate": 9.551351696494854e-05,
-      "loss": 1.1214,
-      "step": 82
-    },
-    {
-      "epoch": 1.7473684210526317,
-      "grad_norm": 0.4155433773994446,
-      "learning_rate": 9.327309943879604e-05,
-      "loss": 1.1853,
-      "step": 83
-    },
-    {
-      "epoch": 1.768421052631579,
-      "grad_norm": 0.5396384596824646,
-      "learning_rate": 9.103606910965666e-05,
-      "loss": 1.1497,
-      "step": 84
-    },
-    {
-      "epoch": 1.7894736842105263,
-      "grad_norm": 0.43235623836517334,
-      "learning_rate": 8.880355238966923e-05,
-      "loss": 1.1753,
-      "step": 85
-    },
-    {
-      "epoch": 1.8105263157894735,
-      "grad_norm": 0.3918503224849701,
-      "learning_rate": 8.657667341823448e-05,
-      "loss": 1.1254,
-      "step": 86
-    },
-    {
-      "epoch": 1.831578947368421,
-      "grad_norm": 0.4692346155643463,
-      "learning_rate": 8.435655349597689e-05,
-      "loss": 1.3666,
-      "step": 87
-    },
-    {
-      "epoch": 1.8526315789473684,
-      "grad_norm": 0.4968159794807434,
-      "learning_rate": 8.214431052013634e-05,
-      "loss": 0.9668,
-      "step": 88
-    },
-    {
-      "epoch": 1.8736842105263158,
-      "grad_norm": 0.4856269061565399,
-      "learning_rate": 7.994105842167273e-05,
-      "loss": 1.1482,
-      "step": 89
-    },
-    {
-      "epoch": 1.8947368421052633,
-      "grad_norm": 0.5288775563240051,
-      "learning_rate": 7.774790660436858e-05,
-      "loss": 1.13,
-      "step": 90
-    },
-    {
-      "epoch": 1.9157894736842105,
-      "grad_norm": 0.5403844118118286,
-      "learning_rate": 7.556595938621058e-05,
-      "loss": 1.1483,
-      "step": 91
-    },
-    {
-      "epoch": 1.936842105263158,
-      "grad_norm": 0.45445382595062256,
-      "learning_rate": 7.339631544333249e-05,
-      "loss": 1.0528,
-      "step": 92
-    },
-    {
-      "epoch": 1.9578947368421051,
-      "grad_norm": 0.48713403940200806,
-      "learning_rate": 7.124006725679828e-05,
-      "loss": 1.2208,
-      "step": 93
-    },
-    {
-      "epoch": 1.9789473684210526,
-      "grad_norm": 0.4627130627632141,
-      "learning_rate": 6.909830056250527e-05,
-      "loss": 1.0794,
-      "step": 94
-    },
-    {
-      "epoch": 2.0,
-      "grad_norm": 0.46807029843330383,
-      "learning_rate": 6.697209380448333e-05,
-      "loss": 1.12,
-      "step": 95
-    },
-    {
-      "epoch": 2.0210526315789474,
-      "grad_norm": 0.41066575050354004,
-      "learning_rate": 6.486251759186572e-05,
-      "loss": 1.0634,
-      "step": 96
-    },
-    {
-      "epoch": 2.042105263157895,
-      "grad_norm": 0.3904050886631012,
-      "learning_rate": 6.277063415980549e-05,
-      "loss": 0.9888,
-      "step": 97
-    },
-    {
-      "epoch": 2.0631578947368423,
-      "grad_norm": 0.49676060676574707,
-      "learning_rate": 6.069749683460765e-05,
-      "loss": 0.8783,
-      "step": 98
-    },
-    {
-      "epoch": 2.0842105263157893,
-      "grad_norm": 0.46549147367477417,
-      "learning_rate": 5.864414950334796e-05,
-      "loss": 0.9815,
-      "step": 99
-    },
-    {
-      "epoch": 2.1052631578947367,
-      "grad_norm": 0.5622740387916565,
-      "learning_rate": 5.6611626088244194e-05,
-      "loss": 1.0091,
-      "step": 100
-    },
-    {
-      "epoch": 2.1052631578947367,
-      "eval_loss": 1.1575236320495605,
-      "eval_runtime": 2.0589,
-      "eval_samples_per_second": 48.569,
-      "eval_steps_per_second": 4.857,
-      "step": 100
-    },
-    {
-      "epoch": 2.126315789473684,
-      "grad_norm": 0.47087791562080383,
-      "learning_rate": 5.4600950026045326e-05,
-      "loss": 0.994,
-      "step": 101
-    },
-    {
-      "epoch": 2.1473684210526316,
-      "grad_norm": 0.46321335434913635,
-      "learning_rate": 5.261313375270014e-05,
-      "loss": 0.8965,
-      "step": 102
-    },
-    {
-      "epoch": 2.168421052631579,
-      "grad_norm": 0.48722636699676514,
-      "learning_rate": 5.0649178193565314e-05,
-      "loss": 1.0028,
-      "step": 103
-    },
-    {
-      "epoch": 2.1894736842105265,
-      "grad_norm": 0.5477016568183899,
-      "learning_rate": 4.87100722594094e-05,
-      "loss": 0.9755,
-      "step": 104
-    },
-    {
-      "epoch": 2.2105263157894735,
-      "grad_norm": 0.43870726227760315,
-      "learning_rate": 4.6796792348466356e-05,
-      "loss": 0.9023,
-      "step": 105
-    },
-    {
-      "epoch": 2.231578947368421,
-      "grad_norm": 0.4974609911441803,
-      "learning_rate": 4.491030185478976e-05,
-      "loss": 1.0978,
-      "step": 106
-    },
-    {
-      "epoch": 2.2526315789473683,
-      "grad_norm": 0.48663774132728577,
-      "learning_rate": 4.305155068315481e-05,
-      "loss": 1.1326,
-      "step": 107
-    },
-    {
-      "epoch": 2.2736842105263158,
-      "grad_norm": 0.47879499197006226,
-      "learning_rate": 4.12214747707527e-05,
-      "loss": 0.8403,
-      "step": 108
-    },
-    {
-      "epoch": 2.294736842105263,
-      "grad_norm": 0.4391883909702301,
-      "learning_rate": 3.942099561591802e-05,
-      "loss": 1.0096,
-      "step": 109
-    },
-    {
-      "epoch": 2.3157894736842106,
-      "grad_norm": 0.5225970149040222,
-      "learning_rate": 3.7651019814126654e-05,
-      "loss": 0.9684,
-      "step": 110
-    },
-    {
-      "epoch": 2.336842105263158,
-      "grad_norm": 0.529344379901886,
-      "learning_rate": 3.591243860149759e-05,
-      "loss": 0.9164,
-      "step": 111
-    },
-    {
-      "epoch": 2.3578947368421055,
-      "grad_norm": 0.4865782856941223,
-      "learning_rate": 3.4206127406028745e-05,
-      "loss": 1.0993,
-      "step": 112
-    },
-    {
-      "epoch": 2.3789473684210525,
-      "grad_norm": 0.4908663332462311,
-      "learning_rate": 3.253294540679257e-05,
-      "loss": 1.1203,
-      "step": 113
-    },
-    {
-      "epoch": 2.4,
-      "grad_norm": 0.4688137471675873,
-      "learning_rate": 3.089373510131354e-05,
-      "loss": 0.8358,
-      "step": 114
-    },
-    {
-      "epoch": 2.4210526315789473,
-      "grad_norm": 0.5007145404815674,
-      "learning_rate": 2.9289321881345254e-05,
-      "loss": 1.0975,
-      "step": 115
-    },
-    {
-      "epoch": 2.442105263157895,
-      "grad_norm": 0.4280741214752197,
-      "learning_rate": 2.7720513617260856e-05,
-      "loss": 1.0134,
-      "step": 116
-    },
-    {
-      "epoch": 2.463157894736842,
-      "grad_norm": 0.5474169850349426,
-      "learning_rate": 2.6188100251265945e-05,
-      "loss": 0.9781,
-      "step": 117
-    },
-    {
-      "epoch": 2.4842105263157896,
-      "grad_norm": 0.4554167091846466,
-      "learning_rate": 2.4692853399638917e-05,
-      "loss": 1.082,
-      "step": 118
-    },
-    {
-      "epoch": 2.5052631578947366,
-      "grad_norm": 0.5812304615974426,
-      "learning_rate": 2.323552596419889e-05,
-      "loss": 0.9826,
-      "step": 119
-    },
-    {
-      "epoch": 2.526315789473684,
-      "grad_norm": 0.4756172001361847,
-      "learning_rate": 2.181685175319702e-05,
-      "loss": 1.1045,
-      "step": 120
-    },
-    {
-      "epoch": 2.526315789473684,
-      "eval_loss": 1.1679396629333496,
-      "eval_runtime": 2.0595,
-      "eval_samples_per_second": 48.555,
-      "eval_steps_per_second": 4.856,
-      "step": 120
     }
   ],
   "logging_steps": 1,
   "max_steps": 150,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 4,
   "save_steps": 20,
   "stateful_callbacks": {
     "EarlyStoppingCallback": {
@@ -917,7 +177,7 @@
         "early_stopping_threshold": 0.0
       },
       "attributes": {
-        "early_stopping_patience_counter": 3
       }
     },
     "TrainerControl": {
@@ -926,12 +186,12 @@
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
-        "should_training_stop": true
       },
       "attributes": {}
     }
   },
-  "total_flos": 1.214189411500032e+16,
   "train_batch_size": 10,
   "trial_name": null,
   "trial_params": null

 {
+  "best_metric": 0.8169272541999817,
+  "best_model_checkpoint": "miner_id_besimray/checkpoint-20",
+  "epoch": 0.2572347266881029,
   "eval_steps": 20,
+  "global_step": 20,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.012861736334405145,
+      "grad_norm": 0.5877827405929565,
       "learning_rate": 2e-05,
+      "loss": 0.9595,
       "step": 1
     },
     {
+      "epoch": 0.012861736334405145,
+      "eval_loss": 0.9745924472808838,
+      "eval_runtime": 5.6911,
+      "eval_samples_per_second": 28.817,
+      "eval_steps_per_second": 2.987,
       "step": 1
     },
     {
+      "epoch": 0.02572347266881029,
+      "grad_norm": 0.8297654986381531,
       "learning_rate": 4e-05,
+      "loss": 0.9813,
       "step": 2
     },
     {
+      "epoch": 0.03858520900321544,
+      "grad_norm": 0.8195751905441284,
       "learning_rate": 6e-05,
+      "loss": 1.0,
       "step": 3
     },
     {
+      "epoch": 0.05144694533762058,
+      "grad_norm": 0.6416041851043701,
       "learning_rate": 8e-05,
+      "loss": 0.8282,
       "step": 4
     },
     {
+      "epoch": 0.06430868167202572,
+      "grad_norm": 0.7084880471229553,
       "learning_rate": 0.0001,
+      "loss": 1.0059,
       "step": 5
     },
     {
+      "epoch": 0.07717041800643087,
+      "grad_norm": 0.4804805517196655,
       "learning_rate": 0.00012,
+      "loss": 0.8269,
       "step": 6
     },
     {
+      "epoch": 0.09003215434083602,
+      "grad_norm": 0.4058835804462433,
       "learning_rate": 0.00014,
+      "loss": 0.9492,
       "step": 7
     },
     {
+      "epoch": 0.10289389067524116,
+      "grad_norm": 0.3298371434211731,
       "learning_rate": 0.00016,
+      "loss": 0.8769,
       "step": 8
     },
     {
+      "epoch": 0.1157556270096463,
+      "grad_norm": 0.2648942470550537,
       "learning_rate": 0.00018,
+      "loss": 0.8254,
       "step": 9
     },
     {
+      "epoch": 0.12861736334405144,
+      "grad_norm": 0.22385652363300323,
       "learning_rate": 0.0002,
+      "loss": 0.8403,
       "step": 10
     },
     {
+      "epoch": 0.1414790996784566,
+      "grad_norm": 0.38447538018226624,
       "learning_rate": 0.00019997482349425066,
+      "loss": 0.8706,
       "step": 11
     },
     {
+      "epoch": 0.15434083601286175,
+      "grad_norm": 0.41497623920440674,
       "learning_rate": 0.00019989930665413147,
+      "loss": 0.8532,
       "step": 12
     },
     {
+      "epoch": 0.16720257234726688,
+      "grad_norm": 0.38628965616226196,
       "learning_rate": 0.0001997734875046456,
+      "loss": 0.7667,
       "step": 13
     },
     {
+      "epoch": 0.18006430868167203,
+      "grad_norm": 0.29869189858436584,
       "learning_rate": 0.00019959742939952392,
+      "loss": 0.8069,
       "step": 14
     },
     {
+      "epoch": 0.19292604501607716,
+      "grad_norm": 0.3177284002304077,
       "learning_rate": 0.00019937122098932428,
+      "loss": 0.7532,
       "step": 15
     },
     {
+      "epoch": 0.2057877813504823,
+      "grad_norm": 0.3028796315193176,
       "learning_rate": 0.00019909497617679348,
+      "loss": 0.7722,
       "step": 16
     },
     {
+      "epoch": 0.21864951768488747,
+      "grad_norm": 0.31497061252593994,
       "learning_rate": 0.00019876883405951377,
+      "loss": 0.8742,
       "step": 17
     },
     {
+      "epoch": 0.2315112540192926,
+      "grad_norm": 0.26743173599243164,
       "learning_rate": 0.00019839295885986296,
+      "loss": 0.7541,
       "step": 18
     },
     {
+      "epoch": 0.24437299035369775,
+      "grad_norm": 0.2908126711845398,
       "learning_rate": 0.00019796753984232358,
+      "loss": 0.9543,
       "step": 19
     },
     {
+      "epoch": 0.2572347266881029,
+      "grad_norm": 0.26621854305267334,
       "learning_rate": 0.00019749279121818235,
+      "loss": 0.9359,
       "step": 20
     },
     {
+      "epoch": 0.2572347266881029,
+      "eval_loss": 0.8169272541999817,
+      "eval_runtime": 5.7691,
+      "eval_samples_per_second": 28.427,
+      "eval_steps_per_second": 2.947,
       "step": 20
     }
   ],
   "logging_steps": 1,
   "max_steps": 150,
   "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
   "save_steps": 20,
   "stateful_callbacks": {
     "EarlyStoppingCallback": {
         "early_stopping_threshold": 0.0
       },
       "attributes": {
+        "early_stopping_patience_counter": 0
       }
     },
     "TrainerControl": {
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
+        "should_training_stop": false
       },
       "attributes": {}
     }
   },
+  "total_flos": 5141026150809600.0,
   "train_batch_size": 10,
   "trial_name": null,
   "trial_params": null

last-checkpoint/training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d4cb7ce651922e8f53dabf2b1364985d613e09d28a1319890e22f5a25dfbce85
 size 6648

 version https://git-lfs.github.com/spec/v1
+oid sha256:ccb34079e4accf483c3a38a7fbb5ed53ad4dbca33ee39f87bdede0297b6d0cff
 size 6648