End of training

Browse files

Files changed (6) hide show

README.md +4 -1
all_results.json +12 -0
eval_results.json +7 -0
train_results.json +8 -0
trainer_state.json +267 -0
training_loss.png +0 -0

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ license: mit
 base_model: CMU-AIRe/e3-1.7B
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: e3-sft
@@ -15,7 +16,9 @@ should probably proofread and complete it, then remove this comment. -->
 # e3-sft
-This model is a fine-tuned version of [CMU-AIRe/e3-1.7B](https://huggingface.co/CMU-AIRe/e3-1.7B) on an unknown dataset.
 ## Model description

 base_model: CMU-AIRe/e3-1.7B
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: e3-sft
 # e3-sft
+This model is a fine-tuned version of [CMU-AIRe/e3-1.7B](https://huggingface.co/CMU-AIRe/e3-1.7B) on the hardmath_sft_2 dataset.
+It achieves the following results on the evaluation set:
+- Loss: 0.7587
 ## Model description

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 8.0,
+    "eval_loss": 0.7586517930030823,
+    "eval_runtime": 0.7792,
+    "eval_samples_per_second": 16.684,
+    "eval_steps_per_second": 16.684,
+    "total_flos": 1.073226288070656e+16,
+    "train_loss": 0.8035880327224731,
+    "train_runtime": 252.7497,
+    "train_samples_per_second": 3.482,
+    "train_steps_per_second": 0.127
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 8.0,
+    "eval_loss": 0.7586517930030823,
+    "eval_runtime": 0.7792,
+    "eval_samples_per_second": 16.684,
+    "eval_steps_per_second": 16.684
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 8.0,
+    "total_flos": 1.073226288070656e+16,
+    "train_loss": 0.8035880327224731,
+    "train_runtime": 252.7497,
+    "train_samples_per_second": 3.482,
+    "train_steps_per_second": 0.127
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,267 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 8.0,
+  "eval_steps": 100,
+  "global_step": 32,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.2909090909090909,
+      "grad_norm": 10.45223617553711,
+      "learning_rate": 0.0,
+      "loss": 0.7859,
+      "step": 1
+    },
+    {
+      "epoch": 0.5818181818181818,
+      "grad_norm": 9.886490821838379,
+      "learning_rate": 2.5e-08,
+      "loss": 0.7965,
+      "step": 2
+    },
+    {
+      "epoch": 0.8727272727272727,
+      "grad_norm": 10.403158187866211,
+      "learning_rate": 5e-08,
+      "loss": 0.7893,
+      "step": 3
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 12.796398162841797,
+      "learning_rate": 7.5e-08,
+      "loss": 0.8701,
+      "step": 4
+    },
+    {
+      "epoch": 1.290909090909091,
+      "grad_norm": 10.29249095916748,
+      "learning_rate": 1e-07,
+      "loss": 0.7784,
+      "step": 5
+    },
+    {
+      "epoch": 1.5818181818181818,
+      "grad_norm": 9.718952178955078,
+      "learning_rate": 9.971704944519592e-08,
+      "loss": 0.7589,
+      "step": 6
+    },
+    {
+      "epoch": 1.8727272727272726,
+      "grad_norm": 10.934309959411621,
+      "learning_rate": 9.887175604818205e-08,
+      "loss": 0.8307,
+      "step": 7
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 11.188789367675781,
+      "learning_rate": 9.747474986387654e-08,
+      "loss": 0.8821,
+      "step": 8
+    },
+    {
+      "epoch": 2.290909090909091,
+      "grad_norm": 9.692912101745605,
+      "learning_rate": 9.554359905560885e-08,
+      "loss": 0.7998,
+      "step": 9
+    },
+    {
+      "epoch": 2.581818181818182,
+      "grad_norm": 11.390420913696289,
+      "learning_rate": 9.310258896527278e-08,
+      "loss": 0.8196,
+      "step": 10
+    },
+    {
+      "epoch": 2.8727272727272726,
+      "grad_norm": 10.247771263122559,
+      "learning_rate": 9.018241671106134e-08,
+      "loss": 0.7766,
+      "step": 11
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 11.571775436401367,
+      "learning_rate": 8.681980515339464e-08,
+      "loss": 0.8289,
+      "step": 12
+    },
+    {
+      "epoch": 3.290909090909091,
+      "grad_norm": 10.701568603515625,
+      "learning_rate": 8.305704108364301e-08,
+      "loss": 0.8375,
+      "step": 13
+    },
+    {
+      "epoch": 3.581818181818182,
+      "grad_norm": 10.49411678314209,
+      "learning_rate": 7.894144344319013e-08,
+      "loss": 0.8383,
+      "step": 14
+    },
+    {
+      "epoch": 3.8727272727272726,
+      "grad_norm": 9.895997047424316,
+      "learning_rate": 7.452476826029011e-08,
+      "loss": 0.772,
+      "step": 15
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 10.467330932617188,
+      "learning_rate": 6.986255778798252e-08,
+      "loss": 0.7012,
+      "step": 16
+    },
+    {
+      "epoch": 4.290909090909091,
+      "grad_norm": 9.836618423461914,
+      "learning_rate": 6.501344202803413e-08,
+      "loss": 0.777,
+      "step": 17
+    },
+    {
+      "epoch": 4.581818181818182,
+      "grad_norm": 11.242887496948242,
+      "learning_rate": 6.003840142464886e-08,
+      "loss": 0.8631,
+      "step": 18
+    },
+    {
+      "epoch": 4.872727272727273,
+      "grad_norm": 10.001364707946777,
+      "learning_rate": 5.5e-08,
+      "loss": 0.7819,
+      "step": 19
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 10.092758178710938,
+      "learning_rate": 4.996159857535115e-08,
+      "loss": 0.7722,
+      "step": 20
+    },
+    {
+      "epoch": 5.290909090909091,
+      "grad_norm": 9.45466423034668,
+      "learning_rate": 4.498655797196585e-08,
+      "loss": 0.7416,
+      "step": 21
+    },
+    {
+      "epoch": 5.581818181818182,
+      "grad_norm": 10.496912956237793,
+      "learning_rate": 4.0137442212017494e-08,
+      "loss": 0.8161,
+      "step": 22
+    },
+    {
+      "epoch": 5.872727272727273,
+      "grad_norm": 10.202836036682129,
+      "learning_rate": 3.5475231739709885e-08,
+      "loss": 0.823,
+      "step": 23
+    },
+    {
+      "epoch": 6.0,
+      "grad_norm": 12.920607566833496,
+      "learning_rate": 3.105855655680986e-08,
+      "loss": 0.8315,
+      "step": 24
+    },
+    {
+      "epoch": 6.290909090909091,
+      "grad_norm": 10.253811836242676,
+      "learning_rate": 2.6942958916356994e-08,
+      "loss": 0.8316,
+      "step": 25
+    },
+    {
+      "epoch": 6.581818181818182,
+      "grad_norm": 9.783924102783203,
+      "learning_rate": 2.3180194846605363e-08,
+      "loss": 0.7542,
+      "step": 26
+    },
+    {
+      "epoch": 6.872727272727273,
+      "grad_norm": 10.855210304260254,
+      "learning_rate": 1.981758328893866e-08,
+      "loss": 0.8357,
+      "step": 27
+    },
+    {
+      "epoch": 7.0,
+      "grad_norm": 10.147912979125977,
+      "learning_rate": 1.6897411034727217e-08,
+      "loss": 0.7426,
+      "step": 28
+    },
+    {
+      "epoch": 7.290909090909091,
+      "grad_norm": 10.078908920288086,
+      "learning_rate": 1.4456400944391145e-08,
+      "loss": 0.7832,
+      "step": 29
+    },
+    {
+      "epoch": 7.581818181818182,
+      "grad_norm": 10.833037376403809,
+      "learning_rate": 1.2525250136123459e-08,
+      "loss": 0.7954,
+      "step": 30
+    },
+    {
+      "epoch": 7.872727272727273,
+      "grad_norm": 9.931336402893066,
+      "learning_rate": 1.1128243951817936e-08,
+      "loss": 0.7893,
+      "step": 31
+    },
+    {
+      "epoch": 8.0,
+      "grad_norm": 10.87130355834961,
+      "learning_rate": 1.0282950554804083e-08,
+      "loss": 0.9104,
+      "step": 32
+    },
+    {
+      "epoch": 8.0,
+      "step": 32,
+      "total_flos": 1.073226288070656e+16,
+      "train_loss": 0.8035880327224731,
+      "train_runtime": 252.7497,
+      "train_samples_per_second": 3.482,
+      "train_steps_per_second": 0.127
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 32,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 16,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.073226288070656e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

training_loss.png ADDED Viewed