Upload folder using huggingface_hub

Browse files

Files changed (7) hide show

adapter_config.json +2 -2
adapter_model.safetensors +1 -1
optimizer.pt +1 -1
rng_state.pth +1 -1
scheduler.pt +1 -1
trainer_state.json +373 -247
training_args.bin +1 -1

adapter_config.json CHANGED Viewed

@@ -24,8 +24,8 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "q_proj",
-    "v_proj"
   ],
   "task_type": "CAUSAL_LM",
   "trainable_token_indices": null,

   "rank_pattern": {},
   "revision": null,
   "target_modules": [
+    "v_proj",
+    "q_proj"
   ],
   "task_type": "CAUSAL_LM",
   "trainable_token_indices": null,

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e92e7c0e65d124154f882db6e6b89ae7dedbcf6ca149012ec2de51653e86644f
 size 3158328

 version https://git-lfs.github.com/spec/v1
+oid sha256:a4dc822dedc384ff1e1bf9f824c1d8ed16fc0c84c4ec2e909074b5b393597299
 size 3158328

optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b0701747dc14bac44bc2768ca59a83df1f871b155a2e2c5492794c93f184fd54
 size 6372346

 version https://git-lfs.github.com/spec/v1
+oid sha256:92f877d5fa714528710fdcc22554f62d68df70d84f8061dcc948239ff02e4497
 size 6372346

rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ae6ca1c4675db6b88190f773cd2468c2e86958ece56a79a0a2ac9f552b528d5d
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:d570f8e841b98b38546fa6f5424e476748b4ed0291b07935c047166c46cb18c9
 size 14244

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:71c858e7d80e455b20613a1a6bfebd32fda797d631d427b2c8f97c0128f7aba4
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:e56410807552dcace5d3d612d3c7e7dae7c6a7e2bc67c923c248c2898d752168
 size 1064

trainer_state.json CHANGED Viewed

@@ -2,579 +2,705 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 4.0,
   "eval_steps": 500,
-  "global_step": 4000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.05,
-      "grad_norm": 0.07341445982456207,
-      "learning_rate": 4.93875e-05,
-      "loss": 2.2129,
       "step": 50
     },
     {
       "epoch": 0.1,
-      "grad_norm": 0.05993746593594551,
-      "learning_rate": 4.87625e-05,
-      "loss": 1.9761,
       "step": 100
     },
     {
       "epoch": 0.15,
-      "grad_norm": 0.09222695976495743,
-      "learning_rate": 4.8137500000000005e-05,
-      "loss": 2.1171,
       "step": 150
     },
     {
       "epoch": 0.2,
-      "grad_norm": 0.10137380659580231,
-      "learning_rate": 4.75125e-05,
-      "loss": 2.1274,
       "step": 200
     },
     {
       "epoch": 0.25,
-      "grad_norm": 0.1008363887667656,
-      "learning_rate": 4.68875e-05,
-      "loss": 1.9111,
       "step": 250
     },
     {
       "epoch": 0.3,
-      "grad_norm": 0.09414055943489075,
-      "learning_rate": 4.6262500000000006e-05,
-      "loss": 1.9712,
       "step": 300
     },
     {
       "epoch": 0.35,
-      "grad_norm": 0.08397311717271805,
-      "learning_rate": 4.56375e-05,
-      "loss": 1.9442,
       "step": 350
     },
     {
       "epoch": 0.4,
-      "grad_norm": 0.11413953453302383,
-      "learning_rate": 4.50125e-05,
-      "loss": 1.8472,
       "step": 400
     },
     {
       "epoch": 0.45,
-      "grad_norm": 0.1147843673825264,
-      "learning_rate": 4.43875e-05,
-      "loss": 1.88,
       "step": 450
     },
     {
       "epoch": 0.5,
-      "grad_norm": 0.15569846332073212,
-      "learning_rate": 4.37625e-05,
-      "loss": 1.8664,
       "step": 500
     },
     {
       "epoch": 0.55,
-      "grad_norm": 0.1375017762184143,
-      "learning_rate": 4.3137500000000005e-05,
-      "loss": 1.8417,
       "step": 550
     },
     {
       "epoch": 0.6,
-      "grad_norm": 0.10806793719530106,
-      "learning_rate": 4.2512499999999997e-05,
-      "loss": 1.8872,
       "step": 600
     },
     {
       "epoch": 0.65,
-      "grad_norm": 0.12453680485486984,
-      "learning_rate": 4.18875e-05,
-      "loss": 1.8628,
       "step": 650
     },
     {
       "epoch": 0.7,
-      "grad_norm": 0.1216558963060379,
-      "learning_rate": 4.126250000000001e-05,
-      "loss": 1.9252,
       "step": 700
     },
     {
       "epoch": 0.75,
-      "grad_norm": 0.10894916951656342,
-      "learning_rate": 4.06375e-05,
-      "loss": 1.8206,
       "step": 750
     },
     {
       "epoch": 0.8,
-      "grad_norm": 0.11225900053977966,
-      "learning_rate": 4.0012500000000004e-05,
-      "loss": 1.8024,
       "step": 800
     },
     {
       "epoch": 0.85,
-      "grad_norm": 0.0950043648481369,
-      "learning_rate": 3.93875e-05,
-      "loss": 1.8965,
       "step": 850
     },
     {
       "epoch": 0.9,
-      "grad_norm": 0.15110863745212555,
-      "learning_rate": 3.87625e-05,
-      "loss": 1.892,
       "step": 900
     },
     {
       "epoch": 0.95,
-      "grad_norm": 0.11794973164796829,
-      "learning_rate": 3.8137500000000005e-05,
-      "loss": 1.846,
       "step": 950
     },
     {
       "epoch": 1.0,
-      "grad_norm": 0.09780783951282501,
-      "learning_rate": 3.7512500000000004e-05,
-      "loss": 1.8793,
       "step": 1000
     },
     {
       "epoch": 1.05,
-      "grad_norm": 0.10895536839962006,
-      "learning_rate": 3.68875e-05,
-      "loss": 1.8671,
       "step": 1050
     },
     {
       "epoch": 1.1,
-      "grad_norm": 0.11591296643018723,
-      "learning_rate": 3.62625e-05,
-      "loss": 1.8809,
       "step": 1100
     },
     {
       "epoch": 1.15,
-      "grad_norm": 0.12027107179164886,
-      "learning_rate": 3.56375e-05,
-      "loss": 1.8601,
       "step": 1150
     },
     {
       "epoch": 1.2,
-      "grad_norm": 0.17965424060821533,
-      "learning_rate": 3.5012500000000004e-05,
-      "loss": 1.79,
       "step": 1200
     },
     {
       "epoch": 1.25,
-      "grad_norm": 0.10349484533071518,
-      "learning_rate": 3.43875e-05,
-      "loss": 1.911,
       "step": 1250
     },
     {
       "epoch": 1.3,
-      "grad_norm": 0.11761331558227539,
-      "learning_rate": 3.37625e-05,
-      "loss": 1.8324,
       "step": 1300
     },
     {
       "epoch": 1.35,
-      "grad_norm": 0.10469675809144974,
-      "learning_rate": 3.31375e-05,
-      "loss": 1.8358,
       "step": 1350
     },
     {
       "epoch": 1.4,
-      "grad_norm": 0.1126856803894043,
-      "learning_rate": 3.2512500000000004e-05,
-      "loss": 1.7683,
       "step": 1400
     },
     {
       "epoch": 1.45,
-      "grad_norm": 0.13452214002609253,
-      "learning_rate": 3.18875e-05,
-      "loss": 1.8371,
       "step": 1450
     },
     {
       "epoch": 1.5,
-      "grad_norm": 0.13590937852859497,
-      "learning_rate": 3.12625e-05,
-      "loss": 1.8016,
       "step": 1500
     },
     {
       "epoch": 1.55,
-      "grad_norm": 0.16644452512264252,
-      "learning_rate": 3.06375e-05,
-      "loss": 1.828,
       "step": 1550
     },
     {
       "epoch": 1.6,
-      "grad_norm": 0.1445678323507309,
-      "learning_rate": 3.0012499999999998e-05,
-      "loss": 1.8417,
       "step": 1600
     },
     {
       "epoch": 1.65,
-      "grad_norm": 0.10797803103923798,
-      "learning_rate": 2.9387500000000003e-05,
-      "loss": 1.8364,
       "step": 1650
     },
     {
       "epoch": 1.7,
-      "grad_norm": 0.14495541155338287,
-      "learning_rate": 2.8762500000000005e-05,
-      "loss": 1.7685,
       "step": 1700
     },
     {
       "epoch": 1.75,
-      "grad_norm": 0.19478319585323334,
-      "learning_rate": 2.81375e-05,
-      "loss": 1.8443,
       "step": 1750
     },
     {
       "epoch": 1.8,
-      "grad_norm": 0.1155581995844841,
-      "learning_rate": 2.75125e-05,
-      "loss": 1.8308,
       "step": 1800
     },
     {
       "epoch": 1.85,
-      "grad_norm": 0.16222324967384338,
-      "learning_rate": 2.68875e-05,
-      "loss": 1.8612,
       "step": 1850
     },
     {
       "epoch": 1.9,
-      "grad_norm": 0.12850694358348846,
-      "learning_rate": 2.62625e-05,
-      "loss": 1.7941,
       "step": 1900
     },
     {
       "epoch": 1.95,
-      "grad_norm": 0.15454839169979095,
-      "learning_rate": 2.5637500000000003e-05,
-      "loss": 1.8409,
       "step": 1950
     },
     {
       "epoch": 2.0,
-      "grad_norm": 0.13590778410434723,
-      "learning_rate": 2.50125e-05,
-      "loss": 1.8786,
       "step": 2000
     },
     {
       "epoch": 2.05,
-      "grad_norm": 0.16667228937149048,
-      "learning_rate": 2.4387500000000003e-05,
-      "loss": 1.8638,
       "step": 2050
     },
     {
       "epoch": 2.1,
-      "grad_norm": 0.1315215826034546,
-      "learning_rate": 2.37625e-05,
-      "loss": 1.8444,
       "step": 2100
     },
     {
       "epoch": 2.15,
-      "grad_norm": 0.12784986197948456,
-      "learning_rate": 2.31375e-05,
-      "loss": 1.8937,
       "step": 2150
     },
     {
       "epoch": 2.2,
-      "grad_norm": 0.1757715791463852,
-      "learning_rate": 2.2512500000000002e-05,
-      "loss": 1.8579,
       "step": 2200
     },
     {
       "epoch": 2.25,
-      "grad_norm": 0.13813862204551697,
-      "learning_rate": 2.18875e-05,
-      "loss": 1.8044,
       "step": 2250
     },
     {
       "epoch": 2.3,
-      "grad_norm": 0.1028163880109787,
-      "learning_rate": 2.1262500000000002e-05,
-      "loss": 1.8371,
       "step": 2300
     },
     {
       "epoch": 2.35,
-      "grad_norm": 0.1259194314479828,
-      "learning_rate": 2.06375e-05,
-      "loss": 1.7808,
       "step": 2350
     },
     {
       "epoch": 2.4,
-      "grad_norm": 0.14981134235858917,
-      "learning_rate": 2.0012500000000002e-05,
-      "loss": 1.791,
       "step": 2400
     },
     {
       "epoch": 2.45,
-      "grad_norm": 0.17923866212368011,
-      "learning_rate": 1.93875e-05,
-      "loss": 1.7812,
       "step": 2450
     },
     {
       "epoch": 2.5,
-      "grad_norm": 0.1661010980606079,
-      "learning_rate": 1.87625e-05,
-      "loss": 1.8722,
       "step": 2500
     },
     {
       "epoch": 2.55,
-      "grad_norm": 0.1746157556772232,
-      "learning_rate": 1.81375e-05,
-      "loss": 1.7818,
       "step": 2550
     },
     {
       "epoch": 2.6,
-      "grad_norm": 0.13667204976081848,
-      "learning_rate": 1.7512500000000002e-05,
-      "loss": 1.8312,
       "step": 2600
     },
     {
       "epoch": 2.65,
-      "grad_norm": 0.149635910987854,
-      "learning_rate": 1.68875e-05,
-      "loss": 1.813,
       "step": 2650
     },
     {
       "epoch": 2.7,
-      "grad_norm": 0.1654106080532074,
-      "learning_rate": 1.62625e-05,
-      "loss": 1.8739,
       "step": 2700
     },
     {
       "epoch": 2.75,
-      "grad_norm": 0.14472809433937073,
-      "learning_rate": 1.56375e-05,
-      "loss": 1.7726,
       "step": 2750
     },
     {
       "epoch": 2.8,
-      "grad_norm": 0.14457519352436066,
-      "learning_rate": 1.5012500000000002e-05,
-      "loss": 1.7593,
       "step": 2800
     },
     {
       "epoch": 2.85,
-      "grad_norm": 0.12135745584964752,
-      "learning_rate": 1.43875e-05,
-      "loss": 1.8547,
       "step": 2850
     },
     {
       "epoch": 2.9,
-      "grad_norm": 0.18297483026981354,
-      "learning_rate": 1.37625e-05,
-      "loss": 1.852,
       "step": 2900
     },
     {
       "epoch": 2.95,
-      "grad_norm": 0.13699814677238464,
-      "learning_rate": 1.31375e-05,
-      "loss": 1.8064,
       "step": 2950
     },
     {
       "epoch": 3.0,
-      "grad_norm": 0.1180616170167923,
-      "learning_rate": 1.25125e-05,
-      "loss": 1.8457,
       "step": 3000
     },
     {
       "epoch": 3.05,
-      "grad_norm": 0.13587267696857452,
-      "learning_rate": 1.18875e-05,
-      "loss": 1.8615,
       "step": 3050
     },
     {
       "epoch": 3.1,
-      "grad_norm": 0.13397973775863647,
-      "learning_rate": 1.1262500000000001e-05,
-      "loss": 1.8077,
       "step": 3100
     },
     {
       "epoch": 3.15,
-      "grad_norm": 0.13365299999713898,
-      "learning_rate": 1.0637500000000001e-05,
-      "loss": 1.9439,
       "step": 3150
     },
     {
       "epoch": 3.2,
-      "grad_norm": 0.1443161964416504,
-      "learning_rate": 1.0012500000000001e-05,
-      "loss": 1.8582,
       "step": 3200
     },
     {
       "epoch": 3.25,
-      "grad_norm": 0.14696797728538513,
-      "learning_rate": 9.387500000000001e-06,
-      "loss": 1.781,
       "step": 3250
     },
     {
       "epoch": 3.3,
-      "grad_norm": 0.15039022266864777,
-      "learning_rate": 8.7625e-06,
-      "loss": 1.8631,
       "step": 3300
     },
     {
       "epoch": 3.35,
-      "grad_norm": 0.1311933696269989,
-      "learning_rate": 8.137500000000001e-06,
-      "loss": 1.8544,
       "step": 3350
     },
     {
       "epoch": 3.4,
-      "grad_norm": 0.15743456780910492,
-      "learning_rate": 7.5125000000000005e-06,
-      "loss": 1.7619,
       "step": 3400
     },
     {
       "epoch": 3.45,
-      "grad_norm": 0.1521286964416504,
-      "learning_rate": 6.8875000000000005e-06,
-      "loss": 1.8013,
       "step": 3450
     },
     {
       "epoch": 3.5,
-      "grad_norm": 0.22098106145858765,
-      "learning_rate": 6.262500000000001e-06,
-      "loss": 1.799,
       "step": 3500
     },
     {
       "epoch": 3.55,
-      "grad_norm": 0.17960400879383087,
-      "learning_rate": 5.637500000000001e-06,
-      "loss": 1.7757,
       "step": 3550
     },
     {
       "epoch": 3.6,
-      "grad_norm": 0.13921096920967102,
-      "learning_rate": 5.012500000000001e-06,
-      "loss": 1.8261,
       "step": 3600
     },
     {
       "epoch": 3.65,
-      "grad_norm": 0.1500893533229828,
-      "learning_rate": 4.3875e-06,
-      "loss": 1.8084,
       "step": 3650
     },
     {
       "epoch": 3.7,
-      "grad_norm": 0.16856829822063446,
-      "learning_rate": 3.7625e-06,
-      "loss": 1.8691,
       "step": 3700
     },
     {
       "epoch": 3.75,
-      "grad_norm": 0.14953631162643433,
-      "learning_rate": 3.1375e-06,
-      "loss": 1.7682,
       "step": 3750
     },
     {
       "epoch": 3.8,
-      "grad_norm": 0.14736834168434143,
-      "learning_rate": 2.5125e-06,
-      "loss": 1.7549,
       "step": 3800
     },
     {
       "epoch": 3.85,
-      "grad_norm": 0.1247793510556221,
-      "learning_rate": 1.8875e-06,
-      "loss": 1.8505,
       "step": 3850
     },
     {
       "epoch": 3.9,
-      "grad_norm": 0.1904279887676239,
-      "learning_rate": 1.2625000000000002e-06,
-      "loss": 1.8485,
       "step": 3900
     },
     {
       "epoch": 3.95,
-      "grad_norm": 0.13930699229240417,
-      "learning_rate": 6.375e-07,
-      "loss": 1.8034,
       "step": 3950
     },
     {
       "epoch": 4.0,
-      "grad_norm": 0.1202196553349495,
-      "learning_rate": 1.2500000000000001e-08,
-      "loss": 1.8422,
       "step": 4000
     }
   ],
   "logging_steps": 50,
-  "max_steps": 4000,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 4,
-  "save_steps": 10,
   "stateful_callbacks": {
     "TrainerControl": {
       "args": {
@@ -582,12 +708,12 @@
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
-        "should_training_stop": true
       },
       "attributes": {}
     }
   },
-  "total_flos": 5.360735748096e+16,
   "train_batch_size": 10,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 4.9,
   "eval_steps": 500,
+  "global_step": 4900,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.05,
+      "grad_norm": 0.045439597219228745,
+      "learning_rate": 4.951e-05,
+      "loss": 2.0674,
       "step": 50
     },
     {
       "epoch": 0.1,
+      "grad_norm": 0.16585414111614227,
+      "learning_rate": 4.901e-05,
+      "loss": 2.0627,
       "step": 100
     },
     {
       "epoch": 0.15,
+      "grad_norm": 0.09927507489919662,
+      "learning_rate": 4.851e-05,
+      "loss": 1.9676,
       "step": 150
     },
     {
       "epoch": 0.2,
+      "grad_norm": 0.12874533236026764,
+      "learning_rate": 4.801e-05,
+      "loss": 1.9972,
       "step": 200
     },
     {
       "epoch": 0.25,
+      "grad_norm": 0.10141663253307343,
+      "learning_rate": 4.7510000000000004e-05,
+      "loss": 1.9816,
       "step": 250
     },
     {
       "epoch": 0.3,
+      "grad_norm": 0.10262555629014969,
+      "learning_rate": 4.7010000000000006e-05,
+      "loss": 1.956,
       "step": 300
     },
     {
       "epoch": 0.35,
+      "grad_norm": 0.13878460228443146,
+      "learning_rate": 4.651e-05,
+      "loss": 1.8552,
       "step": 350
     },
     {
       "epoch": 0.4,
+      "grad_norm": 0.1261318475008011,
+      "learning_rate": 4.601e-05,
+      "loss": 1.8788,
       "step": 400
     },
     {
       "epoch": 0.45,
+      "grad_norm": 0.09696529805660248,
+      "learning_rate": 4.551e-05,
+      "loss": 1.8678,
       "step": 450
     },
     {
       "epoch": 0.5,
+      "grad_norm": 0.0807134360074997,
+      "learning_rate": 4.5010000000000004e-05,
+      "loss": 1.9138,
       "step": 500
     },
     {
       "epoch": 0.55,
+      "grad_norm": 0.11735808849334717,
+      "learning_rate": 4.451e-05,
+      "loss": 1.7924,
       "step": 550
     },
     {
       "epoch": 0.6,
+      "grad_norm": 0.10592185705900192,
+      "learning_rate": 4.401e-05,
+      "loss": 1.8553,
       "step": 600
     },
     {
       "epoch": 0.65,
+      "grad_norm": 0.09940163046121597,
+      "learning_rate": 4.351e-05,
+      "loss": 1.8049,
       "step": 650
     },
     {
       "epoch": 0.7,
+      "grad_norm": 0.12494263797998428,
+      "learning_rate": 4.301e-05,
+      "loss": 1.7692,
       "step": 700
     },
     {
       "epoch": 0.75,
+      "grad_norm": 0.09799129515886307,
+      "learning_rate": 4.251e-05,
+      "loss": 1.6875,
       "step": 750
     },
     {
       "epoch": 0.8,
+      "grad_norm": 0.12961390614509583,
+      "learning_rate": 4.201e-05,
+      "loss": 1.913,
       "step": 800
     },
     {
       "epoch": 0.85,
+      "grad_norm": 0.11012327671051025,
+      "learning_rate": 4.151000000000001e-05,
+      "loss": 1.8881,
       "step": 850
     },
     {
       "epoch": 0.9,
+      "grad_norm": 0.10871279984712601,
+      "learning_rate": 4.101e-05,
+      "loss": 1.9143,
       "step": 900
     },
     {
       "epoch": 0.95,
+      "grad_norm": 0.08704905211925507,
+      "learning_rate": 4.0510000000000003e-05,
+      "loss": 1.787,
       "step": 950
     },
     {
       "epoch": 1.0,
+      "grad_norm": 0.14718344807624817,
+      "learning_rate": 4.0010000000000005e-05,
+      "loss": 1.7454,
       "step": 1000
     },
     {
       "epoch": 1.05,
+      "grad_norm": 0.16144457459449768,
+      "learning_rate": 3.951e-05,
+      "loss": 1.8849,
       "step": 1050
     },
     {
       "epoch": 1.1,
+      "grad_norm": 0.14586377143859863,
+      "learning_rate": 3.901e-05,
+      "loss": 1.8774,
       "step": 1100
     },
     {
       "epoch": 1.15,
+      "grad_norm": 0.12431728094816208,
+      "learning_rate": 3.851e-05,
+      "loss": 1.7688,
       "step": 1150
     },
     {
       "epoch": 1.2,
+      "grad_norm": 0.13206268846988678,
+      "learning_rate": 3.8010000000000004e-05,
+      "loss": 1.8348,
       "step": 1200
     },
     {
       "epoch": 1.25,
+      "grad_norm": 0.12747374176979065,
+      "learning_rate": 3.751e-05,
+      "loss": 1.8073,
       "step": 1250
     },
     {
       "epoch": 1.3,
+      "grad_norm": 0.12160924822092056,
+      "learning_rate": 3.701e-05,
+      "loss": 1.8575,
       "step": 1300
     },
     {
       "epoch": 1.35,
+      "grad_norm": 0.09841366857290268,
+      "learning_rate": 3.651e-05,
+      "loss": 1.8004,
       "step": 1350
     },
     {
       "epoch": 1.4,
+      "grad_norm": 0.11598911881446838,
+      "learning_rate": 3.601e-05,
+      "loss": 1.8464,
       "step": 1400
     },
     {
       "epoch": 1.45,
+      "grad_norm": 0.13689620792865753,
+      "learning_rate": 3.5510000000000004e-05,
+      "loss": 1.7985,
       "step": 1450
     },
     {
       "epoch": 1.5,
+      "grad_norm": 0.13403603434562683,
+      "learning_rate": 3.5010000000000005e-05,
+      "loss": 1.7672,
       "step": 1500
     },
     {
       "epoch": 1.55,
+      "grad_norm": 0.14851945638656616,
+      "learning_rate": 3.451000000000001e-05,
+      "loss": 1.7729,
       "step": 1550
     },
     {
       "epoch": 1.6,
+      "grad_norm": 0.16562113165855408,
+      "learning_rate": 3.401e-05,
+      "loss": 1.8064,
       "step": 1600
     },
     {
       "epoch": 1.65,
+      "grad_norm": 0.1761932522058487,
+      "learning_rate": 3.351e-05,
+      "loss": 1.8096,
       "step": 1650
     },
     {
       "epoch": 1.7,
+      "grad_norm": 0.1335448920726776,
+      "learning_rate": 3.3010000000000004e-05,
+      "loss": 1.7384,
       "step": 1700
     },
     {
       "epoch": 1.75,
+      "grad_norm": 0.12576375901699066,
+      "learning_rate": 3.251e-05,
+      "loss": 1.7785,
       "step": 1750
     },
     {
       "epoch": 1.8,
+      "grad_norm": 0.15707552433013916,
+      "learning_rate": 3.201e-05,
+      "loss": 1.8143,
       "step": 1800
     },
     {
       "epoch": 1.85,
+      "grad_norm": 0.1951502561569214,
+      "learning_rate": 3.151e-05,
+      "loss": 1.827,
       "step": 1850
     },
     {
       "epoch": 1.9,
+      "grad_norm": 0.15844249725341797,
+      "learning_rate": 3.101e-05,
+      "loss": 1.7335,
       "step": 1900
     },
     {
       "epoch": 1.95,
+      "grad_norm": 0.13916510343551636,
+      "learning_rate": 3.051e-05,
+      "loss": 1.8175,
       "step": 1950
     },
     {
       "epoch": 2.0,
+      "grad_norm": 0.13753607869148254,
+      "learning_rate": 3.001e-05,
+      "loss": 1.7886,
       "step": 2000
     },
     {
       "epoch": 2.05,
+      "grad_norm": 0.12483993917703629,
+      "learning_rate": 2.951e-05,
+      "loss": 1.79,
       "step": 2050
     },
     {
       "epoch": 2.1,
+      "grad_norm": 0.15688075125217438,
+      "learning_rate": 2.9010000000000005e-05,
+      "loss": 1.7924,
       "step": 2100
     },
     {
       "epoch": 2.15,
+      "grad_norm": 0.14715777337551117,
+      "learning_rate": 2.8510000000000003e-05,
+      "loss": 1.7819,
       "step": 2150
     },
     {
       "epoch": 2.2,
+      "grad_norm": 0.14971637725830078,
+      "learning_rate": 2.8010000000000005e-05,
+      "loss": 1.7986,
       "step": 2200
     },
     {
       "epoch": 2.25,
+      "grad_norm": 0.12904739379882812,
+      "learning_rate": 2.7510000000000003e-05,
+      "loss": 1.7566,
       "step": 2250
     },
     {
       "epoch": 2.3,
+      "grad_norm": 0.12834960222244263,
+      "learning_rate": 2.701e-05,
+      "loss": 1.8059,
       "step": 2300
     },
     {
       "epoch": 2.35,
+      "grad_norm": 0.13853800296783447,
+      "learning_rate": 2.6510000000000002e-05,
+      "loss": 1.8128,
       "step": 2350
     },
     {
       "epoch": 2.4,
+      "grad_norm": 0.1513364017009735,
+      "learning_rate": 2.601e-05,
+      "loss": 1.7884,
       "step": 2400
     },
     {
       "epoch": 2.45,
+      "grad_norm": 0.12462843209505081,
+      "learning_rate": 2.551e-05,
+      "loss": 1.795,
       "step": 2450
     },
     {
       "epoch": 2.5,
+      "grad_norm": 0.13411079347133636,
+      "learning_rate": 2.501e-05,
+      "loss": 1.8142,
       "step": 2500
     },
     {
       "epoch": 2.55,
+      "grad_norm": 0.19330520927906036,
+      "learning_rate": 2.451e-05,
+      "loss": 1.8254,
       "step": 2550
     },
     {
       "epoch": 2.6,
+      "grad_norm": 0.128371000289917,
+      "learning_rate": 2.4010000000000002e-05,
+      "loss": 1.7903,
       "step": 2600
     },
     {
       "epoch": 2.65,
+      "grad_norm": 0.14016272127628326,
+      "learning_rate": 2.351e-05,
+      "loss": 1.8395,
       "step": 2650
     },
     {
       "epoch": 2.7,
+      "grad_norm": 0.12970508635044098,
+      "learning_rate": 2.301e-05,
+      "loss": 1.7158,
       "step": 2700
     },
     {
       "epoch": 2.75,
+      "grad_norm": 0.12173590809106827,
+      "learning_rate": 2.251e-05,
+      "loss": 1.7364,
       "step": 2750
     },
     {
       "epoch": 2.8,
+      "grad_norm": 0.15061478316783905,
+      "learning_rate": 2.201e-05,
+      "loss": 1.7521,
       "step": 2800
     },
     {
       "epoch": 2.85,
+      "grad_norm": 0.14284451305866241,
+      "learning_rate": 2.1510000000000002e-05,
+      "loss": 1.7876,
       "step": 2850
     },
     {
       "epoch": 2.9,
+      "grad_norm": 0.1959935575723648,
+      "learning_rate": 2.101e-05,
+      "loss": 1.7806,
       "step": 2900
     },
     {
       "epoch": 2.95,
+      "grad_norm": 0.16236719489097595,
+      "learning_rate": 2.0510000000000002e-05,
+      "loss": 1.764,
       "step": 2950
     },
     {
       "epoch": 3.0,
+      "grad_norm": 0.176457017660141,
+      "learning_rate": 2.001e-05,
+      "loss": 1.8309,
       "step": 3000
     },
     {
       "epoch": 3.05,
+      "grad_norm": 0.16891098022460938,
+      "learning_rate": 1.951e-05,
+      "loss": 1.7853,
       "step": 3050
     },
     {
       "epoch": 3.1,
+      "grad_norm": 0.17052677273750305,
+      "learning_rate": 1.901e-05,
+      "loss": 1.8199,
       "step": 3100
     },
     {
       "epoch": 3.15,
+      "grad_norm": 0.1866779923439026,
+      "learning_rate": 1.851e-05,
+      "loss": 1.7815,
       "step": 3150
     },
     {
       "epoch": 3.2,
+      "grad_norm": 0.15316179394721985,
+      "learning_rate": 1.8010000000000002e-05,
+      "loss": 1.773,
       "step": 3200
     },
     {
       "epoch": 3.25,
+      "grad_norm": 0.1431683450937271,
+      "learning_rate": 1.751e-05,
+      "loss": 1.8048,
       "step": 3250
     },
     {
       "epoch": 3.3,
+      "grad_norm": 0.13355782628059387,
+      "learning_rate": 1.701e-05,
+      "loss": 1.7749,
       "step": 3300
     },
     {
       "epoch": 3.35,
+      "grad_norm": 0.13734985888004303,
+      "learning_rate": 1.651e-05,
+      "loss": 1.8291,
       "step": 3350
     },
     {
       "epoch": 3.4,
+      "grad_norm": 0.15869005024433136,
+      "learning_rate": 1.601e-05,
+      "loss": 1.7562,
       "step": 3400
     },
     {
       "epoch": 3.45,
+      "grad_norm": 0.18163730204105377,
+      "learning_rate": 1.551e-05,
+      "loss": 1.7663,
       "step": 3450
     },
     {
       "epoch": 3.5,
+      "grad_norm": 0.157146155834198,
+      "learning_rate": 1.5010000000000002e-05,
+      "loss": 1.7134,
       "step": 3500
     },
     {
       "epoch": 3.55,
+      "grad_norm": 0.10092534869909286,
+      "learning_rate": 1.4510000000000002e-05,
+      "loss": 1.7135,
       "step": 3550
     },
     {
       "epoch": 3.6,
+      "grad_norm": 0.19938626885414124,
+      "learning_rate": 1.4010000000000001e-05,
+      "loss": 1.83,
       "step": 3600
     },
     {
       "epoch": 3.65,
+      "grad_norm": 0.16330371797084808,
+      "learning_rate": 1.3510000000000001e-05,
+      "loss": 1.757,
       "step": 3650
     },
     {
       "epoch": 3.7,
+      "grad_norm": 0.15412551164627075,
+      "learning_rate": 1.301e-05,
+      "loss": 1.7907,
       "step": 3700
     },
     {
       "epoch": 3.75,
+      "grad_norm": 0.17748574912548065,
+      "learning_rate": 1.2509999999999999e-05,
+      "loss": 1.7425,
       "step": 3750
     },
     {
       "epoch": 3.8,
+      "grad_norm": 0.1464386135339737,
+      "learning_rate": 1.201e-05,
+      "loss": 1.7368,
       "step": 3800
     },
     {
       "epoch": 3.85,
+      "grad_norm": 0.19307634234428406,
+      "learning_rate": 1.151e-05,
+      "loss": 1.7958,
       "step": 3850
     },
     {
       "epoch": 3.9,
+      "grad_norm": 0.15492843091487885,
+      "learning_rate": 1.1010000000000001e-05,
+      "loss": 1.7608,
       "step": 3900
     },
     {
       "epoch": 3.95,
+      "grad_norm": 0.24907787144184113,
+      "learning_rate": 1.0510000000000001e-05,
+      "loss": 1.8087,
       "step": 3950
     },
     {
       "epoch": 4.0,
+      "grad_norm": 0.17466576397418976,
+      "learning_rate": 1.001e-05,
+      "loss": 1.8596,
       "step": 4000
+    },
+    {
+      "epoch": 4.05,
+      "grad_norm": 0.15683214366436005,
+      "learning_rate": 9.51e-06,
+      "loss": 1.7841,
+      "step": 4050
+    },
+    {
+      "epoch": 4.1,
+      "grad_norm": 0.26058587431907654,
+      "learning_rate": 9.01e-06,
+      "loss": 1.715,
+      "step": 4100
+    },
+    {
+      "epoch": 4.15,
+      "grad_norm": 0.15742363035678864,
+      "learning_rate": 8.51e-06,
+      "loss": 1.7996,
+      "step": 4150
+    },
+    {
+      "epoch": 4.2,
+      "grad_norm": 0.1513935923576355,
+      "learning_rate": 8.010000000000001e-06,
+      "loss": 1.8159,
+      "step": 4200
+    },
+    {
+      "epoch": 4.25,
+      "grad_norm": 0.14027664065361023,
+      "learning_rate": 7.51e-06,
+      "loss": 1.7364,
+      "step": 4250
+    },
+    {
+      "epoch": 4.3,
+      "grad_norm": 0.1800561249256134,
+      "learning_rate": 7.01e-06,
+      "loss": 1.8001,
+      "step": 4300
+    },
+    {
+      "epoch": 4.35,
+      "grad_norm": 0.15748870372772217,
+      "learning_rate": 6.510000000000001e-06,
+      "loss": 1.7893,
+      "step": 4350
+    },
+    {
+      "epoch": 4.4,
+      "grad_norm": 0.16933466494083405,
+      "learning_rate": 6.01e-06,
+      "loss": 1.7317,
+      "step": 4400
+    },
+    {
+      "epoch": 4.45,
+      "grad_norm": 0.1617567241191864,
+      "learning_rate": 5.510000000000001e-06,
+      "loss": 1.7532,
+      "step": 4450
+    },
+    {
+      "epoch": 4.5,
+      "grad_norm": 0.14668306708335876,
+      "learning_rate": 5.01e-06,
+      "loss": 1.746,
+      "step": 4500
+    },
+    {
+      "epoch": 4.55,
+      "grad_norm": 0.1313694268465042,
+      "learning_rate": 4.51e-06,
+      "loss": 1.7548,
+      "step": 4550
+    },
+    {
+      "epoch": 4.6,
+      "grad_norm": 0.15111179649829865,
+      "learning_rate": 4.01e-06,
+      "loss": 1.7585,
+      "step": 4600
+    },
+    {
+      "epoch": 4.65,
+      "grad_norm": 0.14186547696590424,
+      "learning_rate": 3.5100000000000003e-06,
+      "loss": 1.7825,
+      "step": 4650
+    },
+    {
+      "epoch": 4.7,
+      "grad_norm": 0.18073762953281403,
+      "learning_rate": 3.01e-06,
+      "loss": 1.7919,
+      "step": 4700
+    },
+    {
+      "epoch": 4.75,
+      "grad_norm": 0.11522198468446732,
+      "learning_rate": 2.51e-06,
+      "loss": 1.7825,
+      "step": 4750
+    },
+    {
+      "epoch": 4.8,
+      "grad_norm": 0.22390571236610413,
+      "learning_rate": 2.0100000000000002e-06,
+      "loss": 1.8044,
+      "step": 4800
+    },
+    {
+      "epoch": 4.85,
+      "grad_norm": 0.22593551874160767,
+      "learning_rate": 1.5100000000000002e-06,
+      "loss": 1.7819,
+      "step": 4850
+    },
+    {
+      "epoch": 4.9,
+      "grad_norm": 0.15215976536273956,
+      "learning_rate": 1.01e-06,
+      "loss": 1.8139,
+      "step": 4900
     }
   ],
   "logging_steps": 50,
+  "max_steps": 5000,
   "num_input_tokens_seen": 0,
+  "num_train_epochs": 5,
+  "save_steps": 100,
   "stateful_callbacks": {
     "TrainerControl": {
       "args": {
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
+        "should_training_stop": false
       },
       "attributes": {}
     }
   },
+  "total_flos": 6.5669012914176e+16,
   "train_batch_size": 10,
   "trial_name": null,
   "trial_params": null

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:76d1d18100c90f968b787f300279b75f704d674df1dc4e0364972389fca68a1f
 size 5304

 version https://git-lfs.github.com/spec/v1
+oid sha256:13a4c7e28f90020a96f4f94c164188da8eb58874dae9cf897001abae146280b4
 size 5304