CharlesLi
/

mistral_cot_simplest_qlora

@@ -20,7 +20,7 @@ should probably proofread and complete it, then remove this comment. -->
 This model is a fine-tuned version of [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) on the generator dataset.
 It achieves the following results on the evaluation set:
-- Loss: 0.8973
 ## Model description
@@ -55,7 +55,7 @@ The following hyperparameters were used during training:
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:-----:|:----:|:---------------:|
-| 1.8904        | 0.8   | 2    | 0.8973          |
 ### Framework versions

 This model is a fine-tuned version of [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) on the generator dataset.
 It achieves the following results on the evaluation set:
+- Loss: 0.8974
 ## Model description
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:-----:|:----:|:---------------:|
+| 1.8904        | 0.8   | 2    | 0.8974          |
 ### Framework versions

all_results.json CHANGED Viewed

@@ -2,8 +2,8 @@
     "epoch": 0.8,
     "total_flos": 1406258997362688.0,
     "train_loss": 1.9418977499008179,
-    "train_runtime": 371.4468,
     "train_samples": 100,
-    "train_samples_per_second": 0.054,
-    "train_steps_per_second": 0.005
 }

     "epoch": 0.8,
     "total_flos": 1406258997362688.0,
     "train_loss": 1.9418977499008179,
+    "train_runtime": 22.4337,
     "train_samples": 100,
+    "train_samples_per_second": 0.892,
+    "train_steps_per_second": 0.089
 }

train_results.json CHANGED Viewed

@@ -2,8 +2,8 @@
     "epoch": 0.8,
     "total_flos": 1406258997362688.0,
     "train_loss": 1.9418977499008179,
-    "train_runtime": 371.4468,
     "train_samples": 100,
-    "train_samples_per_second": 0.054,
-    "train_steps_per_second": 0.005
 }

     "epoch": 0.8,
     "total_flos": 1406258997362688.0,
     "train_loss": 1.9418977499008179,
+    "train_runtime": 22.4337,
     "train_samples": 100,
+    "train_samples_per_second": 0.892,
+    "train_steps_per_second": 0.089
 }

trainer_state.json CHANGED Viewed

@@ -10,17 +10,17 @@
   "log_history": [
     {
       "epoch": 0.4,
-      "grad_norm": 1.2140933275222778,
       "learning_rate": 0.0002,
       "loss": 1.8904,
       "step": 1
     },
     {
       "epoch": 0.8,
-      "eval_loss": 0.8972685933113098,
-      "eval_runtime": 0.9262,
-      "eval_samples_per_second": 3.239,
-      "eval_steps_per_second": 1.08,
       "step": 2
     },
     {
@@ -28,16 +28,16 @@
       "step": 2,
       "total_flos": 1406258997362688.0,
       "train_loss": 1.9418977499008179,
-      "train_runtime": 371.4468,
-      "train_samples_per_second": 0.054,
-      "train_steps_per_second": 0.005
     }
   ],
   "logging_steps": 5,
   "max_steps": 2,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
-  "save_steps": 100,
   "stateful_callbacks": {
     "TrainerControl": {
       "args": {

   "log_history": [
     {
       "epoch": 0.4,
+      "grad_norm": 1.2139586210250854,
       "learning_rate": 0.0002,
       "loss": 1.8904,
       "step": 1
     },
     {
       "epoch": 0.8,
+      "eval_loss": 0.8973897099494934,
+      "eval_runtime": 0.919,
+      "eval_samples_per_second": 3.264,
+      "eval_steps_per_second": 1.088,
       "step": 2
     },
     {
       "step": 2,
       "total_flos": 1406258997362688.0,
       "train_loss": 1.9418977499008179,
+      "train_runtime": 22.4337,
+      "train_samples_per_second": 0.892,
+      "train_steps_per_second": 0.089
     }
   ],
   "logging_steps": 5,
   "max_steps": 2,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
+  "save_steps": 1000,
   "stateful_callbacks": {
     "TrainerControl": {
       "args": {