Training in progress, step 20, checkpoint

Browse files

Files changed (7) hide show

last-checkpoint/adapter_config.json +4 -4
last-checkpoint/adapter_model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +54 -1012
last-checkpoint/training_args.bin +1 -1

last-checkpoint/adapter_config.json CHANGED Viewed

@@ -20,13 +20,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "gate_proj",
-    "down_proj",
     "k_proj",
     "o_proj",
-    "up_proj",
     "v_proj",
-    "q_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

   "rank_pattern": {},
   "revision": null,
   "target_modules": [
+    "up_proj",
     "k_proj",
     "o_proj",
     "v_proj",
+    "q_proj",
+    "down_proj",
+    "gate_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

last-checkpoint/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ee822693fef528317ba083c5d48f88c30b9c61025611d616bab3d9d798072246
 size 45118424

 version https://git-lfs.github.com/spec/v1
+oid sha256:c2ea1e6a39b9b019bb68f8bcb9e4113a9450d3a145c326c3d17c60e7914f4870
 size 45118424

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:460835f7008ae886b43dbf42bb872deabb83175b9129a32fbc50d714ed9014db
 size 23159290

 version https://git-lfs.github.com/spec/v1
+oid sha256:aa015a1e42d23eb4a3ea0fa718ceb403d42f019d9d2191fa6d5727830e4b7375
 size 23159290

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4d11c7a6a55b1fb167809df49e86bfb5922b63262ff5a72c4acf98ce212bd8ed
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:1e691a6f9c4d5048643b02e3232a9cfa061e68f527f466d35a53bb462800e4b3
 size 14244

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:194456d3c9e165255d5406a0f3f62973b0bede79d91784f72431350783e27ae7
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:1c297c5cf11a27c75d9f99f1df69752f78c3ad41b0275adf50cdd1b67f9d0bb3
 size 1064

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,1126 +1,168 @@
 {
-  "best_metric": 0.7633899450302124,
-  "best_model_checkpoint": "miner_id_besimray/checkpoint-140",
-  "epoch": 1.9292604501607717,
   "eval_steps": 20,
-  "global_step": 150,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.012861736334405145,
-      "grad_norm": 0.6353400945663452,
       "learning_rate": 2e-05,
-      "loss": 0.9595,
       "step": 1
     },
     {
       "epoch": 0.012861736334405145,
-      "eval_loss": 0.9745924472808838,
-      "eval_runtime": 5.7213,
-      "eval_samples_per_second": 28.665,
-      "eval_steps_per_second": 2.971,
       "step": 1
     },
     {
       "epoch": 0.02572347266881029,
-      "grad_norm": 0.9050183296203613,
       "learning_rate": 4e-05,
-      "loss": 0.9813,
       "step": 2
     },
     {
       "epoch": 0.03858520900321544,
-      "grad_norm": 0.8838196992874146,
       "learning_rate": 6e-05,
-      "loss": 0.9972,
       "step": 3
     },
     {
       "epoch": 0.05144694533762058,
-      "grad_norm": 0.6954429149627686,
       "learning_rate": 8e-05,
-      "loss": 0.8263,
       "step": 4
     },
     {
       "epoch": 0.06430868167202572,
-      "grad_norm": 0.7475744485855103,
       "learning_rate": 0.0001,
-      "loss": 1.0028,
       "step": 5
     },
     {
       "epoch": 0.07717041800643087,
-      "grad_norm": 0.48648685216903687,
       "learning_rate": 0.00012,
-      "loss": 0.8235,
       "step": 6
     },
     {
       "epoch": 0.09003215434083602,
-      "grad_norm": 0.4148712754249573,
       "learning_rate": 0.00014,
-      "loss": 0.9482,
       "step": 7
     },
     {
       "epoch": 0.10289389067524116,
-      "grad_norm": 0.3340390622615814,
       "learning_rate": 0.00016,
-      "loss": 0.8758,
       "step": 8
     },
     {
       "epoch": 0.1157556270096463,
-      "grad_norm": 0.26932457089424133,
       "learning_rate": 0.00018,
-      "loss": 0.8252,
       "step": 9
     },
     {
       "epoch": 0.12861736334405144,
-      "grad_norm": 0.23018883168697357,
       "learning_rate": 0.0002,
-      "loss": 0.8404,
       "step": 10
     },
     {
       "epoch": 0.1414790996784566,
-      "grad_norm": 0.39730432629585266,
       "learning_rate": 0.00019997482349425066,
-      "loss": 0.8719,
       "step": 11
     },
     {
       "epoch": 0.15434083601286175,
-      "grad_norm": 0.42967018485069275,
       "learning_rate": 0.00019989930665413147,
-      "loss": 0.8539,
       "step": 12
     },
     {
       "epoch": 0.16720257234726688,
-      "grad_norm": 0.39713603258132935,
       "learning_rate": 0.0001997734875046456,
-      "loss": 0.7647,
       "step": 13
     },
     {
       "epoch": 0.18006430868167203,
-      "grad_norm": 0.3015749156475067,
       "learning_rate": 0.00019959742939952392,
-      "loss": 0.806,
       "step": 14
     },
     {
       "epoch": 0.19292604501607716,
-      "grad_norm": 0.32679829001426697,
       "learning_rate": 0.00019937122098932428,
-      "loss": 0.7549,
       "step": 15
     },
     {
       "epoch": 0.2057877813504823,
-      "grad_norm": 0.31075620651245117,
       "learning_rate": 0.00019909497617679348,
-      "loss": 0.7734,
       "step": 16
     },
     {
       "epoch": 0.21864951768488747,
-      "grad_norm": 0.3161105215549469,
       "learning_rate": 0.00019876883405951377,
-      "loss": 0.8734,
       "step": 17
     },
     {
       "epoch": 0.2315112540192926,
-      "grad_norm": 0.26673561334609985,
       "learning_rate": 0.00019839295885986296,
-      "loss": 0.7541,
       "step": 18
     },
     {
       "epoch": 0.24437299035369775,
-      "grad_norm": 0.2973664104938507,
       "learning_rate": 0.00019796753984232358,
-      "loss": 0.9559,
       "step": 19
     },
     {
       "epoch": 0.2572347266881029,
-      "grad_norm": 0.2668478190898895,
       "learning_rate": 0.00019749279121818235,
-      "loss": 0.9389,
       "step": 20
     },
     {
       "epoch": 0.2572347266881029,
-      "eval_loss": 0.8176891803741455,
-      "eval_runtime": 5.7831,
-      "eval_samples_per_second": 28.358,
-      "eval_steps_per_second": 2.94,
       "step": 20
-    },
-    {
-      "epoch": 0.27009646302250806,
-      "grad_norm": 0.21167340874671936,
-      "learning_rate": 0.0001969689520376687,
-      "loss": 0.7046,
-      "step": 21
-    },
-    {
-      "epoch": 0.2829581993569132,
-      "grad_norm": 0.23888267576694489,
-      "learning_rate": 0.00019639628606958533,
-      "loss": 0.769,
-      "step": 22
-    },
-    {
-      "epoch": 0.2958199356913183,
-      "grad_norm": 0.2215345799922943,
-      "learning_rate": 0.00019577508166849304,
-      "loss": 0.7524,
-      "step": 23
-    },
-    {
-      "epoch": 0.3086816720257235,
-      "grad_norm": 0.2053850293159485,
-      "learning_rate": 0.00019510565162951537,
-      "loss": 0.7305,
-      "step": 24
-    },
-    {
-      "epoch": 0.3215434083601286,
-      "grad_norm": 0.21348363161087036,
-      "learning_rate": 0.00019438833303083678,
-      "loss": 0.8105,
-      "step": 25
-    },
-    {
-      "epoch": 0.33440514469453375,
-      "grad_norm": 0.21149109303951263,
-      "learning_rate": 0.00019362348706397373,
-      "loss": 0.8249,
-      "step": 26
-    },
-    {
-      "epoch": 0.34726688102893893,
-      "grad_norm": 0.22312600910663605,
-      "learning_rate": 0.0001928114988519039,
-      "loss": 0.8205,
-      "step": 27
-    },
-    {
-      "epoch": 0.36012861736334406,
-      "grad_norm": 0.20689311623573303,
-      "learning_rate": 0.0001919527772551451,
-      "loss": 0.7488,
-      "step": 28
-    },
-    {
-      "epoch": 0.3729903536977492,
-      "grad_norm": 0.22930146753787994,
-      "learning_rate": 0.00019104775466588161,
-      "loss": 0.8267,
-      "step": 29
-    },
-    {
-      "epoch": 0.3858520900321543,
-      "grad_norm": 0.24117408692836761,
-      "learning_rate": 0.0001900968867902419,
-      "loss": 0.8412,
-      "step": 30
-    },
-    {
-      "epoch": 0.3987138263665595,
-      "grad_norm": 0.23889240622520447,
-      "learning_rate": 0.0001891006524188368,
-      "loss": 0.8336,
-      "step": 31
-    },
-    {
-      "epoch": 0.4115755627009646,
-      "grad_norm": 0.22367344796657562,
-      "learning_rate": 0.0001880595531856738,
-      "loss": 0.7905,
-      "step": 32
-    },
-    {
-      "epoch": 0.42443729903536975,
-      "grad_norm": 0.22343823313713074,
-      "learning_rate": 0.00018697411331556956,
-      "loss": 0.7842,
-      "step": 33
-    },
-    {
-      "epoch": 0.43729903536977494,
-      "grad_norm": 0.21874739229679108,
-      "learning_rate": 0.00018584487936018661,
-      "loss": 0.7624,
-      "step": 34
-    },
-    {
-      "epoch": 0.45016077170418006,
-      "grad_norm": 0.21489335596561432,
-      "learning_rate": 0.00018467241992282843,
-      "loss": 0.7956,
-      "step": 35
-    },
-    {
-      "epoch": 0.4630225080385852,
-      "grad_norm": 0.2225746512413025,
-      "learning_rate": 0.00018345732537213027,
-      "loss": 0.788,
-      "step": 36
-    },
-    {
-      "epoch": 0.4758842443729904,
-      "grad_norm": 0.25866109132766724,
-      "learning_rate": 0.00018220020754479102,
-      "loss": 0.9176,
-      "step": 37
-    },
-    {
-      "epoch": 0.4887459807073955,
-      "grad_norm": 0.2253965586423874,
-      "learning_rate": 0.00018090169943749476,
-      "loss": 0.837,
-      "step": 38
-    },
-    {
-      "epoch": 0.5016077170418006,
-      "grad_norm": 0.23456409573554993,
-      "learning_rate": 0.00017956245488817812,
-      "loss": 0.8215,
-      "step": 39
-    },
-    {
-      "epoch": 0.5144694533762058,
-      "grad_norm": 0.22983534634113312,
-      "learning_rate": 0.000178183148246803,
-      "loss": 0.7865,
-      "step": 40
-    },
-    {
-      "epoch": 0.5144694533762058,
-      "eval_loss": 0.790475070476532,
-      "eval_runtime": 7.5635,
-      "eval_samples_per_second": 21.683,
-      "eval_steps_per_second": 2.248,
-      "step": 40
-    },
-    {
-      "epoch": 0.5273311897106109,
-      "grad_norm": 0.21388499438762665,
-      "learning_rate": 0.0001767644740358011,
-      "loss": 0.7596,
-      "step": 41
-    },
-    {
-      "epoch": 0.5401929260450161,
-      "grad_norm": 0.23246338963508606,
-      "learning_rate": 0.00017530714660036112,
-      "loss": 0.8301,
-      "step": 42
-    },
-    {
-      "epoch": 0.5530546623794212,
-      "grad_norm": 0.2237369567155838,
-      "learning_rate": 0.00017381189974873407,
-      "loss": 0.7615,
-      "step": 43
-    },
-    {
-      "epoch": 0.5659163987138264,
-      "grad_norm": 0.2338048666715622,
-      "learning_rate": 0.00017227948638273916,
-      "loss": 0.7508,
-      "step": 44
-    },
-    {
-      "epoch": 0.5787781350482315,
-      "grad_norm": 0.23250731825828552,
-      "learning_rate": 0.00017071067811865476,
-      "loss": 0.7531,
-      "step": 45
-    },
-    {
-      "epoch": 0.5916398713826366,
-      "grad_norm": 0.23482945561408997,
-      "learning_rate": 0.00016910626489868649,
-      "loss": 0.747,
-      "step": 46
-    },
-    {
-      "epoch": 0.6045016077170418,
-      "grad_norm": 0.24034211039543152,
-      "learning_rate": 0.00016746705459320745,
-      "loss": 0.8101,
-      "step": 47
-    },
-    {
-      "epoch": 0.617363344051447,
-      "grad_norm": 0.22306109964847565,
-      "learning_rate": 0.00016579387259397127,
-      "loss": 0.7662,
-      "step": 48
-    },
-    {
-      "epoch": 0.6302250803858521,
-      "grad_norm": 0.2227921038866043,
-      "learning_rate": 0.0001640875613985024,
-      "loss": 0.7652,
-      "step": 49
-    },
-    {
-      "epoch": 0.6430868167202572,
-      "grad_norm": 0.23712162673473358,
-      "learning_rate": 0.00016234898018587337,
-      "loss": 0.7383,
-      "step": 50
-    },
-    {
-      "epoch": 0.6559485530546624,
-      "grad_norm": 0.22555230557918549,
-      "learning_rate": 0.000160579004384082,
-      "loss": 0.7069,
-      "step": 51
-    },
-    {
-      "epoch": 0.6688102893890675,
-      "grad_norm": 0.20635604858398438,
-      "learning_rate": 0.00015877852522924732,
-      "loss": 0.7082,
-      "step": 52
-    },
-    {
-      "epoch": 0.6816720257234726,
-      "grad_norm": 0.24219252169132233,
-      "learning_rate": 0.0001569484493168452,
-      "loss": 0.8613,
-      "step": 53
-    },
-    {
-      "epoch": 0.6945337620578779,
-      "grad_norm": 0.21117131412029266,
-      "learning_rate": 0.00015508969814521025,
-      "loss": 0.7367,
-      "step": 54
-    },
-    {
-      "epoch": 0.707395498392283,
-      "grad_norm": 0.19816330075263977,
-      "learning_rate": 0.00015320320765153367,
-      "loss": 0.6701,
-      "step": 55
-    },
-    {
-      "epoch": 0.7202572347266881,
-      "grad_norm": 0.24127283692359924,
-      "learning_rate": 0.00015128992774059063,
-      "loss": 0.7212,
-      "step": 56
-    },
-    {
-      "epoch": 0.7331189710610932,
-      "grad_norm": 0.2404824048280716,
-      "learning_rate": 0.0001493508218064347,
-      "loss": 0.6806,
-      "step": 57
-    },
-    {
-      "epoch": 0.7459807073954984,
-      "grad_norm": 0.2355654090642929,
-      "learning_rate": 0.00014738686624729986,
-      "loss": 0.8262,
-      "step": 58
-    },
-    {
-      "epoch": 0.7588424437299035,
-      "grad_norm": 0.24600790441036224,
-      "learning_rate": 0.00014539904997395468,
-      "loss": 0.8672,
-      "step": 59
-    },
-    {
-      "epoch": 0.7717041800643086,
-      "grad_norm": 0.22222688794136047,
-      "learning_rate": 0.00014338837391175582,
-      "loss": 0.7565,
-      "step": 60
-    },
-    {
-      "epoch": 0.7717041800643086,
-      "eval_loss": 0.7795044779777527,
-      "eval_runtime": 7.0042,
-      "eval_samples_per_second": 23.414,
-      "eval_steps_per_second": 2.427,
-      "step": 60
-    },
-    {
-      "epoch": 0.7845659163987139,
-      "grad_norm": 0.23646439611911774,
-      "learning_rate": 0.00014135585049665207,
-      "loss": 0.8112,
-      "step": 61
-    },
-    {
-      "epoch": 0.797427652733119,
-      "grad_norm": 0.2320852130651474,
-      "learning_rate": 0.00013930250316539238,
-      "loss": 0.8133,
-      "step": 62
-    },
-    {
-      "epoch": 0.8102893890675241,
-      "grad_norm": 0.22437462210655212,
-      "learning_rate": 0.00013722936584019453,
-      "loss": 0.7881,
-      "step": 63
-    },
-    {
-      "epoch": 0.8231511254019293,
-      "grad_norm": 0.22626082599163055,
-      "learning_rate": 0.0001351374824081343,
-      "loss": 0.7188,
-      "step": 64
-    },
-    {
-      "epoch": 0.8360128617363344,
-      "grad_norm": 0.2286273092031479,
-      "learning_rate": 0.00013302790619551674,
-      "loss": 0.7627,
-      "step": 65
-    },
-    {
-      "epoch": 0.8488745980707395,
-      "grad_norm": 0.20803673565387726,
-      "learning_rate": 0.00013090169943749476,
-      "loss": 0.7243,
-      "step": 66
-    },
-    {
-      "epoch": 0.8617363344051447,
-      "grad_norm": 0.2412373572587967,
-      "learning_rate": 0.00012875993274320173,
-      "loss": 0.7587,
-      "step": 67
-    },
-    {
-      "epoch": 0.8745980707395499,
-      "grad_norm": 0.23493675887584686,
-      "learning_rate": 0.00012660368455666752,
-      "loss": 0.7248,
-      "step": 68
-    },
-    {
-      "epoch": 0.887459807073955,
-      "grad_norm": 0.2631540596485138,
-      "learning_rate": 0.0001244340406137894,
-      "loss": 0.8143,
-      "step": 69
-    },
-    {
-      "epoch": 0.9003215434083601,
-      "grad_norm": 0.22676923871040344,
-      "learning_rate": 0.00012225209339563145,
-      "loss": 0.8182,
-      "step": 70
-    },
-    {
-      "epoch": 0.9131832797427653,
-      "grad_norm": 0.2400771528482437,
-      "learning_rate": 0.00012005894157832729,
-      "loss": 0.8467,
-      "step": 71
-    },
-    {
-      "epoch": 0.9260450160771704,
-      "grad_norm": 0.22387422621250153,
-      "learning_rate": 0.00011785568947986367,
-      "loss": 0.7709,
-      "step": 72
-    },
-    {
-      "epoch": 0.9389067524115756,
-      "grad_norm": 0.2363765388727188,
-      "learning_rate": 0.0001156434465040231,
-      "loss": 0.8006,
-      "step": 73
-    },
-    {
-      "epoch": 0.9517684887459807,
-      "grad_norm": 0.23003999888896942,
-      "learning_rate": 0.00011342332658176555,
-      "loss": 0.6959,
-      "step": 74
-    },
-    {
-      "epoch": 0.9646302250803859,
-      "grad_norm": 0.2013099491596222,
-      "learning_rate": 0.00011119644761033078,
-      "loss": 0.73,
-      "step": 75
-    },
-    {
-      "epoch": 0.977491961414791,
-      "grad_norm": 0.23860198259353638,
-      "learning_rate": 0.00010896393089034336,
-      "loss": 0.8589,
-      "step": 76
-    },
-    {
-      "epoch": 0.9903536977491961,
-      "grad_norm": 0.2546059489250183,
-      "learning_rate": 0.00010672690056120399,
-      "loss": 0.7953,
-      "step": 77
-    },
-    {
-      "epoch": 1.0032154340836013,
-      "grad_norm": 0.3725316822528839,
-      "learning_rate": 0.00010448648303505151,
-      "loss": 0.6716,
-      "step": 78
-    },
-    {
-      "epoch": 1.0160771704180065,
-      "grad_norm": 0.2085859179496765,
-      "learning_rate": 0.00010224380642958052,
-      "loss": 0.6845,
-      "step": 79
-    },
-    {
-      "epoch": 1.0289389067524115,
-      "grad_norm": 0.24546077847480774,
-      "learning_rate": 0.0001,
-      "loss": 0.8137,
-      "step": 80
-    },
-    {
-      "epoch": 1.0289389067524115,
-      "eval_loss": 0.7710850238800049,
-      "eval_runtime": 6.8626,
-      "eval_samples_per_second": 23.898,
-      "eval_steps_per_second": 2.477,
-      "step": 80
-    },
-    {
-      "epoch": 1.0418006430868167,
-      "grad_norm": 0.21984128654003143,
-      "learning_rate": 9.775619357041952e-05,
-      "loss": 0.6629,
-      "step": 81
-    },
-    {
-      "epoch": 1.0546623794212218,
-      "grad_norm": 0.21213646233081818,
-      "learning_rate": 9.551351696494854e-05,
-      "loss": 0.6813,
-      "step": 82
-    },
-    {
-      "epoch": 1.067524115755627,
-      "grad_norm": 0.22041082382202148,
-      "learning_rate": 9.327309943879604e-05,
-      "loss": 0.7533,
-      "step": 83
-    },
-    {
-      "epoch": 1.0803858520900322,
-      "grad_norm": 0.23151585459709167,
-      "learning_rate": 9.103606910965666e-05,
-      "loss": 0.7094,
-      "step": 84
-    },
-    {
-      "epoch": 1.0932475884244373,
-      "grad_norm": 0.24404524266719818,
-      "learning_rate": 8.880355238966923e-05,
-      "loss": 0.7239,
-      "step": 85
-    },
-    {
-      "epoch": 1.1061093247588425,
-      "grad_norm": 0.21060094237327576,
-      "learning_rate": 8.657667341823448e-05,
-      "loss": 0.6813,
-      "step": 86
-    },
-    {
-      "epoch": 1.1189710610932475,
-      "grad_norm": 0.21228162944316864,
-      "learning_rate": 8.435655349597689e-05,
-      "loss": 0.6777,
-      "step": 87
-    },
-    {
-      "epoch": 1.1318327974276527,
-      "grad_norm": 0.2304028421640396,
-      "learning_rate": 8.214431052013634e-05,
-      "loss": 0.7201,
-      "step": 88
-    },
-    {
-      "epoch": 1.144694533762058,
-      "grad_norm": 0.2450607568025589,
-      "learning_rate": 7.994105842167273e-05,
-      "loss": 0.6912,
-      "step": 89
-    },
-    {
-      "epoch": 1.157556270096463,
-      "grad_norm": 0.22919628024101257,
-      "learning_rate": 7.774790660436858e-05,
-      "loss": 0.6571,
-      "step": 90
-    },
-    {
-      "epoch": 1.1704180064308682,
-      "grad_norm": 0.23411712050437927,
-      "learning_rate": 7.556595938621058e-05,
-      "loss": 0.7856,
-      "step": 91
-    },
-    {
-      "epoch": 1.1832797427652733,
-      "grad_norm": 0.23763220012187958,
-      "learning_rate": 7.339631544333249e-05,
-      "loss": 0.6961,
-      "step": 92
-    },
-    {
-      "epoch": 1.1961414790996785,
-      "grad_norm": 0.2141312211751938,
-      "learning_rate": 7.124006725679828e-05,
-      "loss": 0.6091,
-      "step": 93
-    },
-    {
-      "epoch": 1.2090032154340835,
-      "grad_norm": 0.21600951254367828,
-      "learning_rate": 6.909830056250527e-05,
-      "loss": 0.6597,
-      "step": 94
-    },
-    {
-      "epoch": 1.2218649517684887,
-      "grad_norm": 0.24440795183181763,
-      "learning_rate": 6.697209380448333e-05,
-      "loss": 0.7335,
-      "step": 95
-    },
-    {
-      "epoch": 1.234726688102894,
-      "grad_norm": 0.23137834668159485,
-      "learning_rate": 6.486251759186572e-05,
-      "loss": 0.649,
-      "step": 96
-    },
-    {
-      "epoch": 1.247588424437299,
-      "grad_norm": 0.24493689835071564,
-      "learning_rate": 6.277063415980549e-05,
-      "loss": 0.7592,
-      "step": 97
-    },
-    {
-      "epoch": 1.2604501607717042,
-      "grad_norm": 0.2131170779466629,
-      "learning_rate": 6.069749683460765e-05,
-      "loss": 0.6984,
-      "step": 98
-    },
-    {
-      "epoch": 1.2733118971061093,
-      "grad_norm": 0.26320311427116394,
-      "learning_rate": 5.864414950334796e-05,
-      "loss": 0.7309,
-      "step": 99
-    },
-    {
-      "epoch": 1.2861736334405145,
-      "grad_norm": 0.24698734283447266,
-      "learning_rate": 5.6611626088244194e-05,
-      "loss": 0.7243,
-      "step": 100
-    },
-    {
-      "epoch": 1.2861736334405145,
-      "eval_loss": 0.7675071954727173,
-      "eval_runtime": 6.8995,
-      "eval_samples_per_second": 23.77,
-      "eval_steps_per_second": 2.464,
-      "step": 100
-    },
-    {
-      "epoch": 1.2990353697749195,
-      "grad_norm": 0.2751113474369049,
-      "learning_rate": 5.4600950026045326e-05,
-      "loss": 0.8179,
-      "step": 101
-    },
-    {
-      "epoch": 1.3118971061093248,
-      "grad_norm": 0.24326056241989136,
-      "learning_rate": 5.261313375270014e-05,
-      "loss": 0.7323,
-      "step": 102
-    },
-    {
-      "epoch": 1.32475884244373,
-      "grad_norm": 0.26097771525382996,
-      "learning_rate": 5.0649178193565314e-05,
-      "loss": 0.8192,
-      "step": 103
-    },
-    {
-      "epoch": 1.337620578778135,
-      "grad_norm": 0.28192853927612305,
-      "learning_rate": 4.87100722594094e-05,
-      "loss": 0.7968,
-      "step": 104
-    },
-    {
-      "epoch": 1.3504823151125402,
-      "grad_norm": 0.25281116366386414,
-      "learning_rate": 4.6796792348466356e-05,
-      "loss": 0.7235,
-      "step": 105
-    },
-    {
-      "epoch": 1.3633440514469453,
-      "grad_norm": 0.24736233055591583,
-      "learning_rate": 4.491030185478976e-05,
-      "loss": 0.8161,
-      "step": 106
-    },
-    {
-      "epoch": 1.3762057877813505,
-      "grad_norm": 0.22223138809204102,
-      "learning_rate": 4.305155068315481e-05,
-      "loss": 0.6488,
-      "step": 107
-    },
-    {
-      "epoch": 1.3890675241157555,
-      "grad_norm": 0.2827548682689667,
-      "learning_rate": 4.12214747707527e-05,
-      "loss": 0.7366,
-      "step": 108
-    },
-    {
-      "epoch": 1.4019292604501608,
-      "grad_norm": 0.2146385759115219,
-      "learning_rate": 3.942099561591802e-05,
-      "loss": 0.6998,
-      "step": 109
-    },
-    {
-      "epoch": 1.414790996784566,
-      "grad_norm": 0.24802613258361816,
-      "learning_rate": 3.7651019814126654e-05,
-      "loss": 0.7389,
-      "step": 110
-    },
-    {
-      "epoch": 1.427652733118971,
-      "grad_norm": 0.21664857864379883,
-      "learning_rate": 3.591243860149759e-05,
-      "loss": 0.6622,
-      "step": 111
-    },
-    {
-      "epoch": 1.4405144694533762,
-      "grad_norm": 0.2532820403575897,
-      "learning_rate": 3.4206127406028745e-05,
-      "loss": 0.6946,
-      "step": 112
-    },
-    {
-      "epoch": 1.4533762057877815,
-      "grad_norm": 0.2482985109090805,
-      "learning_rate": 3.253294540679257e-05,
-      "loss": 0.6731,
-      "step": 113
-    },
-    {
-      "epoch": 1.4662379421221865,
-      "grad_norm": 0.2581978440284729,
-      "learning_rate": 3.089373510131354e-05,
-      "loss": 0.729,
-      "step": 114
-    },
-    {
-      "epoch": 1.4790996784565915,
-      "grad_norm": 0.2761397957801819,
-      "learning_rate": 2.9289321881345254e-05,
-      "loss": 0.7888,
-      "step": 115
-    },
-    {
-      "epoch": 1.4919614147909968,
-      "grad_norm": 0.2787714898586273,
-      "learning_rate": 2.7720513617260856e-05,
-      "loss": 0.7264,
-      "step": 116
-    },
-    {
-      "epoch": 1.504823151125402,
-      "grad_norm": 0.24963301420211792,
-      "learning_rate": 2.6188100251265945e-05,
-      "loss": 0.6779,
-      "step": 117
-    },
-    {
-      "epoch": 1.517684887459807,
-      "grad_norm": 0.27637454867362976,
-      "learning_rate": 2.4692853399638917e-05,
-      "loss": 0.7916,
-      "step": 118
-    },
-    {
-      "epoch": 1.5305466237942122,
-      "grad_norm": 0.2721528708934784,
-      "learning_rate": 2.323552596419889e-05,
-      "loss": 0.8018,
-      "step": 119
-    },
-    {
-      "epoch": 1.5434083601286175,
-      "grad_norm": 0.25732940435409546,
-      "learning_rate": 2.181685175319702e-05,
-      "loss": 0.8195,
-      "step": 120
-    },
-    {
-      "epoch": 1.5434083601286175,
-      "eval_loss": 0.764509916305542,
-      "eval_runtime": 6.806,
-      "eval_samples_per_second": 24.097,
-      "eval_steps_per_second": 2.498,
-      "step": 120
-    },
-    {
-      "epoch": 1.5562700964630225,
-      "grad_norm": 0.288256973028183,
-      "learning_rate": 2.043754511182191e-05,
-      "loss": 0.8183,
-      "step": 121
-    },
-    {
-      "epoch": 1.5691318327974275,
-      "grad_norm": 0.254046231508255,
-      "learning_rate": 1.9098300562505266e-05,
-      "loss": 0.6721,
-      "step": 122
-    },
-    {
-      "epoch": 1.5819935691318328,
-      "grad_norm": 0.24051472544670105,
-      "learning_rate": 1.7799792455209018e-05,
-      "loss": 0.6993,
-      "step": 123
-    },
-    {
-      "epoch": 1.594855305466238,
-      "grad_norm": 0.2690548002719879,
-      "learning_rate": 1.6542674627869737e-05,
-      "loss": 0.7357,
-      "step": 124
-    },
-    {
-      "epoch": 1.607717041800643,
-      "grad_norm": 0.2249222695827484,
-      "learning_rate": 1.5327580077171587e-05,
-      "loss": 0.6112,
-      "step": 125
-    },
-    {
-      "epoch": 1.6205787781350482,
-      "grad_norm": 0.2525765597820282,
-      "learning_rate": 1.415512063981339e-05,
-      "loss": 0.7281,
-      "step": 126
-    },
-    {
-      "epoch": 1.6334405144694535,
-      "grad_norm": 0.2448454648256302,
-      "learning_rate": 1.3025886684430467e-05,
-      "loss": 0.6699,
-      "step": 127
-    },
-    {
-      "epoch": 1.6463022508038585,
-      "grad_norm": 0.27227962017059326,
-      "learning_rate": 1.19404468143262e-05,
-      "loss": 0.7431,
-      "step": 128
-    },
-    {
-      "epoch": 1.6591639871382635,
-      "grad_norm": 0.26319149136543274,
-      "learning_rate": 1.0899347581163221e-05,
-      "loss": 0.6885,
-      "step": 129
-    },
-    {
-      "epoch": 1.6720257234726688,
-      "grad_norm": 0.2802058160305023,
-      "learning_rate": 9.903113209758096e-06,
-      "loss": 0.7451,
-      "step": 130
-    },
-    {
-      "epoch": 1.684887459807074,
-      "grad_norm": 0.23295214772224426,
-      "learning_rate": 8.952245334118414e-06,
-      "loss": 0.6393,
-      "step": 131
-    },
-    {
-      "epoch": 1.697749196141479,
-      "grad_norm": 0.2382490485906601,
-      "learning_rate": 8.047222744854943e-06,
-      "loss": 0.621,
-      "step": 132
-    },
-    {
-      "epoch": 1.7106109324758842,
-      "grad_norm": 0.26903268694877625,
-      "learning_rate": 7.1885011480961164e-06,
-      "loss": 0.8464,
-      "step": 133
-    },
-    {
-      "epoch": 1.7234726688102895,
-      "grad_norm": 0.22437304258346558,
-      "learning_rate": 6.37651293602628e-06,
-      "loss": 0.526,
-      "step": 134
-    },
-    {
-      "epoch": 1.7363344051446945,
-      "grad_norm": 0.2693169414997101,
-      "learning_rate": 5.611666969163243e-06,
-      "loss": 0.7231,
-      "step": 135
-    },
-    {
-      "epoch": 1.7491961414790995,
-      "grad_norm": 0.24426168203353882,
-      "learning_rate": 4.8943483704846475e-06,
-      "loss": 0.6771,
-      "step": 136
-    },
-    {
-      "epoch": 1.762057877813505,
-      "grad_norm": 0.24735158681869507,
-      "learning_rate": 4.224918331506955e-06,
-      "loss": 0.7126,
-      "step": 137
-    },
-    {
-      "epoch": 1.77491961414791,
-      "grad_norm": 0.2898198962211609,
-      "learning_rate": 3.6037139304146762e-06,
-      "loss": 0.838,
-      "step": 138
-    },
-    {
-      "epoch": 1.787781350482315,
-      "grad_norm": 0.22501428425312042,
-      "learning_rate": 3.0310479623313127e-06,
-      "loss": 0.6003,
-      "step": 139
-    },
-    {
-      "epoch": 1.8006430868167203,
-      "grad_norm": 0.2505525052547455,
-      "learning_rate": 2.5072087818176382e-06,
-      "loss": 0.6793,
-      "step": 140
-    },
-    {
-      "epoch": 1.8006430868167203,
-      "eval_loss": 0.7633899450302124,
-      "eval_runtime": 6.8979,
-      "eval_samples_per_second": 23.775,
-      "eval_steps_per_second": 2.465,
-      "step": 140
-    },
-    {
-      "epoch": 1.8135048231511255,
-      "grad_norm": 0.2789839506149292,
-      "learning_rate": 2.032460157676452e-06,
-      "loss": 0.6984,
-      "step": 141
-    },
-    {
-      "epoch": 1.8263665594855305,
-      "grad_norm": 0.2638675272464752,
-      "learning_rate": 1.6070411401370334e-06,
-      "loss": 0.7894,
-      "step": 142
-    },
-    {
-      "epoch": 1.8392282958199357,
-      "grad_norm": 0.22497716546058655,
-      "learning_rate": 1.231165940486234e-06,
-      "loss": 0.6259,
-      "step": 143
-    },
-    {
-      "epoch": 1.852090032154341,
-      "grad_norm": 0.2598622143268585,
-      "learning_rate": 9.0502382320653e-07,
-      "loss": 0.7023,
-      "step": 144
-    },
-    {
-      "epoch": 1.864951768488746,
-      "grad_norm": 0.24743396043777466,
-      "learning_rate": 6.287790106757396e-07,
-      "loss": 0.7361,
-      "step": 145
-    },
-    {
-      "epoch": 1.877813504823151,
-      "grad_norm": 0.22825051844120026,
-      "learning_rate": 4.025706004760932e-07,
-      "loss": 0.6144,
-      "step": 146
-    },
-    {
-      "epoch": 1.8906752411575563,
-      "grad_norm": 0.24232302606105804,
-      "learning_rate": 2.265124953543918e-07,
-      "loss": 0.738,
-      "step": 147
-    },
-    {
-      "epoch": 1.9035369774919615,
-      "grad_norm": 0.23391401767730713,
-      "learning_rate": 1.0069334586854107e-07,
-      "loss": 0.671,
-      "step": 148
-    },
-    {
-      "epoch": 1.9163987138263665,
-      "grad_norm": 0.2621341943740845,
-      "learning_rate": 2.5176505749346936e-08,
-      "loss": 0.7556,
-      "step": 149
-    },
-    {
-      "epoch": 1.9292604501607717,
-      "grad_norm": 0.30414289236068726,
-      "learning_rate": 0.0,
-      "loss": 0.8086,
-      "step": 150
     }
   ],
   "logging_steps": 1,
@@ -1144,12 +186,12 @@
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
-        "should_training_stop": true
       },
       "attributes": {}
     }
   },
-  "total_flos": 3.58265259884544e+16,
   "train_batch_size": 10,
   "trial_name": null,
   "trial_params": null

 {
+  "best_metric": 0.3471006155014038,
+  "best_model_checkpoint": "miner_id_besimray/checkpoint-20",
+  "epoch": 0.2572347266881029,
   "eval_steps": 20,
+  "global_step": 20,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.012861736334405145,
+      "grad_norm": 14.967682838439941,
       "learning_rate": 2e-05,
+      "loss": 9.1,
       "step": 1
     },
     {
       "epoch": 0.012861736334405145,
+      "eval_loss": 8.996231079101562,
+      "eval_runtime": 5.6709,
+      "eval_samples_per_second": 28.919,
+      "eval_steps_per_second": 2.998,
       "step": 1
     },
     {
       "epoch": 0.02572347266881029,
+      "grad_norm": 17.858224868774414,
       "learning_rate": 4e-05,
+      "loss": 8.9697,
       "step": 2
     },
     {
       "epoch": 0.03858520900321544,
+      "grad_norm": 17.052230834960938,
       "learning_rate": 6e-05,
+      "loss": 9.3384,
       "step": 3
     },
     {
       "epoch": 0.05144694533762058,
+      "grad_norm": 15.359251022338867,
       "learning_rate": 8e-05,
+      "loss": 8.6911,
       "step": 4
     },
     {
       "epoch": 0.06430868167202572,
+      "grad_norm": 14.528264045715332,
       "learning_rate": 0.0001,
+      "loss": 7.9832,
       "step": 5
     },
     {
       "epoch": 0.07717041800643087,
+      "grad_norm": 12.933821678161621,
       "learning_rate": 0.00012,
+      "loss": 6.9516,
       "step": 6
     },
     {
       "epoch": 0.09003215434083602,
+      "grad_norm": 11.693599700927734,
       "learning_rate": 0.00014,
+      "loss": 6.192,
       "step": 7
     },
     {
       "epoch": 0.10289389067524116,
+      "grad_norm": 9.695719718933105,
       "learning_rate": 0.00016,
+      "loss": 5.1497,
       "step": 8
     },
     {
       "epoch": 0.1157556270096463,
+      "grad_norm": 11.275097846984863,
       "learning_rate": 0.00018,
+      "loss": 4.0494,
       "step": 9
     },
     {
       "epoch": 0.12861736334405144,
+      "grad_norm": 13.86536693572998,
       "learning_rate": 0.0002,
+      "loss": 2.8475,
       "step": 10
     },
     {
       "epoch": 0.1414790996784566,
+      "grad_norm": 11.746561050415039,
       "learning_rate": 0.00019997482349425066,
+      "loss": 1.7198,
       "step": 11
     },
     {
       "epoch": 0.15434083601286175,
+      "grad_norm": 6.656251907348633,
       "learning_rate": 0.00019989930665413147,
+      "loss": 0.991,
       "step": 12
     },
     {
       "epoch": 0.16720257234726688,
+      "grad_norm": 7.672077655792236,
       "learning_rate": 0.0001997734875046456,
+      "loss": 0.8849,
       "step": 13
     },
     {
       "epoch": 0.18006430868167203,
+      "grad_norm": 5.506864547729492,
       "learning_rate": 0.00019959742939952392,
+      "loss": 0.5775,
       "step": 14
     },
     {
       "epoch": 0.19292604501607716,
+      "grad_norm": 6.015160083770752,
       "learning_rate": 0.00019937122098932428,
+      "loss": 0.4869,
       "step": 15
     },
     {
       "epoch": 0.2057877813504823,
+      "grad_norm": 6.050337314605713,
       "learning_rate": 0.00019909497617679348,
+      "loss": 0.5208,
       "step": 16
     },
     {
       "epoch": 0.21864951768488747,
+      "grad_norm": 5.535327911376953,
       "learning_rate": 0.00019876883405951377,
+      "loss": 0.5163,
       "step": 17
     },
     {
       "epoch": 0.2315112540192926,
+      "grad_norm": 3.005343437194824,
       "learning_rate": 0.00019839295885986296,
+      "loss": 0.4186,
       "step": 18
     },
     {
       "epoch": 0.24437299035369775,
+      "grad_norm": 3.88594388961792,
       "learning_rate": 0.00019796753984232358,
+      "loss": 0.3806,
       "step": 19
     },
     {
       "epoch": 0.2572347266881029,
+      "grad_norm": 3.7315642833709717,
       "learning_rate": 0.00019749279121818235,
+      "loss": 0.3746,
       "step": 20
     },
     {
       "epoch": 0.2572347266881029,
+      "eval_loss": 0.3471006155014038,
+      "eval_runtime": 5.7323,
+      "eval_samples_per_second": 28.61,
+      "eval_steps_per_second": 2.966,
       "step": 20
     }
   ],
   "logging_steps": 1,
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
+        "should_training_stop": false
       },
       "attributes": {}
     }
   },
+  "total_flos": 5141026150809600.0,
   "train_batch_size": 10,
   "trial_name": null,
   "trial_params": null

last-checkpoint/training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:36d0f18aea43d7612a7191b41a7eb6bd0de838ab0f1cc6cc2b200f546360c06a
 size 6648

 version https://git-lfs.github.com/spec/v1
+oid sha256:a61f177c4f35816461aeee7877425472ed07bd0e989be98a55cef3f50bbeb021
 size 6648