End of training

Browse files

Files changed (7) hide show

README.md +2 -1
all_results.json +12 -0
eval_results.json +7 -0
train_results.json +8 -0
trainer_state.json +661 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ license: llama3.1
 base_model: meta-llama/Meta-Llama-3.1-8B
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: oh_v1.3_metamath_x2
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 # oh_v1.3_metamath_x2
-This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.7103

 base_model: meta-llama/Meta-Llama-3.1-8B
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: oh_v1.3_metamath_x2
 # oh_v1.3_metamath_x2
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on the mlfoundations-dev/oh_v1.3_metamath_x2 dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.7103

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 2.9956063268892796,
+    "eval_loss": 0.7103263735771179,
+    "eval_runtime": 302.3687,
+    "eval_samples_per_second": 25.356,
+    "eval_steps_per_second": 0.397,
+    "total_flos": 1426922353459200.0,
+    "train_loss": 0.6913109551852857,
+    "train_runtime": 50624.2334,
+    "train_samples_per_second": 8.632,
+    "train_steps_per_second": 0.017
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 2.9956063268892796,
+    "eval_loss": 0.7103263735771179,
+    "eval_runtime": 302.3687,
+    "eval_samples_per_second": 25.356,
+    "eval_steps_per_second": 0.397
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 2.9956063268892796,
+    "total_flos": 1426922353459200.0,
+    "train_loss": 0.6913109551852857,
+    "train_runtime": 50624.2334,
+    "train_samples_per_second": 8.632,
+    "train_steps_per_second": 0.017
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,661 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.9956063268892796,
+  "eval_steps": 500,
+  "global_step": 852,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0351493848857645,
+      "grad_norm": 6.865658287926604,
+      "learning_rate": 5e-06,
+      "loss": 1.0224,
+      "step": 10
+    },
+    {
+      "epoch": 0.070298769771529,
+      "grad_norm": 1.300129454515723,
+      "learning_rate": 5e-06,
+      "loss": 0.8893,
+      "step": 20
+    },
+    {
+      "epoch": 0.1054481546572935,
+      "grad_norm": 1.662141333626877,
+      "learning_rate": 5e-06,
+      "loss": 0.8461,
+      "step": 30
+    },
+    {
+      "epoch": 0.140597539543058,
+      "grad_norm": 1.1867317674423972,
+      "learning_rate": 5e-06,
+      "loss": 0.8221,
+      "step": 40
+    },
+    {
+      "epoch": 0.1757469244288225,
+      "grad_norm": 1.3406663541941202,
+      "learning_rate": 5e-06,
+      "loss": 0.8082,
+      "step": 50
+    },
+    {
+      "epoch": 0.210896309314587,
+      "grad_norm": 0.9748883756928828,
+      "learning_rate": 5e-06,
+      "loss": 0.7926,
+      "step": 60
+    },
+    {
+      "epoch": 0.2460456942003515,
+      "grad_norm": 0.9049088495136057,
+      "learning_rate": 5e-06,
+      "loss": 0.7786,
+      "step": 70
+    },
+    {
+      "epoch": 0.281195079086116,
+      "grad_norm": 0.7756246074861972,
+      "learning_rate": 5e-06,
+      "loss": 0.7672,
+      "step": 80
+    },
+    {
+      "epoch": 0.3163444639718805,
+      "grad_norm": 0.6803113923201367,
+      "learning_rate": 5e-06,
+      "loss": 0.7666,
+      "step": 90
+    },
+    {
+      "epoch": 0.351493848857645,
+      "grad_norm": 0.7087390396178761,
+      "learning_rate": 5e-06,
+      "loss": 0.7626,
+      "step": 100
+    },
+    {
+      "epoch": 0.3866432337434095,
+      "grad_norm": 0.6904558645177432,
+      "learning_rate": 5e-06,
+      "loss": 0.7558,
+      "step": 110
+    },
+    {
+      "epoch": 0.421792618629174,
+      "grad_norm": 0.5406381121382178,
+      "learning_rate": 5e-06,
+      "loss": 0.751,
+      "step": 120
+    },
+    {
+      "epoch": 0.45694200351493847,
+      "grad_norm": 0.7263515515258443,
+      "learning_rate": 5e-06,
+      "loss": 0.7541,
+      "step": 130
+    },
+    {
+      "epoch": 0.492091388400703,
+      "grad_norm": 0.6381225954297634,
+      "learning_rate": 5e-06,
+      "loss": 0.7507,
+      "step": 140
+    },
+    {
+      "epoch": 0.5272407732864675,
+      "grad_norm": 0.9282926398227679,
+      "learning_rate": 5e-06,
+      "loss": 0.7425,
+      "step": 150
+    },
+    {
+      "epoch": 0.562390158172232,
+      "grad_norm": 0.703837658050583,
+      "learning_rate": 5e-06,
+      "loss": 0.7473,
+      "step": 160
+    },
+    {
+      "epoch": 0.5975395430579965,
+      "grad_norm": 0.7762432087380096,
+      "learning_rate": 5e-06,
+      "loss": 0.7408,
+      "step": 170
+    },
+    {
+      "epoch": 0.632688927943761,
+      "grad_norm": 0.5947662547404722,
+      "learning_rate": 5e-06,
+      "loss": 0.7347,
+      "step": 180
+    },
+    {
+      "epoch": 0.6678383128295254,
+      "grad_norm": 0.628944705791063,
+      "learning_rate": 5e-06,
+      "loss": 0.7406,
+      "step": 190
+    },
+    {
+      "epoch": 0.70298769771529,
+      "grad_norm": 0.5977406811055224,
+      "learning_rate": 5e-06,
+      "loss": 0.7347,
+      "step": 200
+    },
+    {
+      "epoch": 0.7381370826010545,
+      "grad_norm": 0.582107769314153,
+      "learning_rate": 5e-06,
+      "loss": 0.737,
+      "step": 210
+    },
+    {
+      "epoch": 0.773286467486819,
+      "grad_norm": 0.6326552735959291,
+      "learning_rate": 5e-06,
+      "loss": 0.7328,
+      "step": 220
+    },
+    {
+      "epoch": 0.8084358523725835,
+      "grad_norm": 0.6637547116847639,
+      "learning_rate": 5e-06,
+      "loss": 0.7311,
+      "step": 230
+    },
+    {
+      "epoch": 0.843585237258348,
+      "grad_norm": 0.6997143410926964,
+      "learning_rate": 5e-06,
+      "loss": 0.7341,
+      "step": 240
+    },
+    {
+      "epoch": 0.8787346221441125,
+      "grad_norm": 0.6162729226466245,
+      "learning_rate": 5e-06,
+      "loss": 0.7332,
+      "step": 250
+    },
+    {
+      "epoch": 0.9138840070298769,
+      "grad_norm": 0.6199166403621413,
+      "learning_rate": 5e-06,
+      "loss": 0.7262,
+      "step": 260
+    },
+    {
+      "epoch": 0.9490333919156415,
+      "grad_norm": 0.6034966296550427,
+      "learning_rate": 5e-06,
+      "loss": 0.729,
+      "step": 270
+    },
+    {
+      "epoch": 0.984182776801406,
+      "grad_norm": 0.6195682554180708,
+      "learning_rate": 5e-06,
+      "loss": 0.7264,
+      "step": 280
+    },
+    {
+      "epoch": 0.9982425307557118,
+      "eval_loss": 0.721891462802887,
+      "eval_runtime": 302.9942,
+      "eval_samples_per_second": 25.304,
+      "eval_steps_per_second": 0.396,
+      "step": 284
+    },
+    {
+      "epoch": 1.0197715289982425,
+      "grad_norm": 0.6617437136575776,
+      "learning_rate": 5e-06,
+      "loss": 0.7503,
+      "step": 290
+    },
+    {
+      "epoch": 1.054920913884007,
+      "grad_norm": 0.596469668606961,
+      "learning_rate": 5e-06,
+      "loss": 0.6752,
+      "step": 300
+    },
+    {
+      "epoch": 1.0900702987697715,
+      "grad_norm": 0.7286257897811691,
+      "learning_rate": 5e-06,
+      "loss": 0.6745,
+      "step": 310
+    },
+    {
+      "epoch": 1.1252196836555362,
+      "grad_norm": 0.6703518701287363,
+      "learning_rate": 5e-06,
+      "loss": 0.6772,
+      "step": 320
+    },
+    {
+      "epoch": 1.1603690685413006,
+      "grad_norm": 0.6678193952959378,
+      "learning_rate": 5e-06,
+      "loss": 0.6757,
+      "step": 330
+    },
+    {
+      "epoch": 1.195518453427065,
+      "grad_norm": 0.6272061731880971,
+      "learning_rate": 5e-06,
+      "loss": 0.6772,
+      "step": 340
+    },
+    {
+      "epoch": 1.2306678383128296,
+      "grad_norm": 0.5618776589312474,
+      "learning_rate": 5e-06,
+      "loss": 0.6767,
+      "step": 350
+    },
+    {
+      "epoch": 1.265817223198594,
+      "grad_norm": 0.656461597570214,
+      "learning_rate": 5e-06,
+      "loss": 0.6742,
+      "step": 360
+    },
+    {
+      "epoch": 1.3009666080843585,
+      "grad_norm": 0.95088298783439,
+      "learning_rate": 5e-06,
+      "loss": 0.6803,
+      "step": 370
+    },
+    {
+      "epoch": 1.336115992970123,
+      "grad_norm": 0.6256062888068228,
+      "learning_rate": 5e-06,
+      "loss": 0.6789,
+      "step": 380
+    },
+    {
+      "epoch": 1.3712653778558876,
+      "grad_norm": 0.5178529345876333,
+      "learning_rate": 5e-06,
+      "loss": 0.6757,
+      "step": 390
+    },
+    {
+      "epoch": 1.406414762741652,
+      "grad_norm": 0.6633111117626306,
+      "learning_rate": 5e-06,
+      "loss": 0.6786,
+      "step": 400
+    },
+    {
+      "epoch": 1.4415641476274166,
+      "grad_norm": 0.5753214727933854,
+      "learning_rate": 5e-06,
+      "loss": 0.6686,
+      "step": 410
+    },
+    {
+      "epoch": 1.476713532513181,
+      "grad_norm": 0.7023169996268164,
+      "learning_rate": 5e-06,
+      "loss": 0.674,
+      "step": 420
+    },
+    {
+      "epoch": 1.5118629173989455,
+      "grad_norm": 0.601050736097527,
+      "learning_rate": 5e-06,
+      "loss": 0.676,
+      "step": 430
+    },
+    {
+      "epoch": 1.54701230228471,
+      "grad_norm": 0.6375081303020413,
+      "learning_rate": 5e-06,
+      "loss": 0.6779,
+      "step": 440
+    },
+    {
+      "epoch": 1.5821616871704745,
+      "grad_norm": 0.6076189491485879,
+      "learning_rate": 5e-06,
+      "loss": 0.6811,
+      "step": 450
+    },
+    {
+      "epoch": 1.6173110720562391,
+      "grad_norm": 0.6123755131309624,
+      "learning_rate": 5e-06,
+      "loss": 0.6749,
+      "step": 460
+    },
+    {
+      "epoch": 1.6524604569420034,
+      "grad_norm": 0.5847478738087437,
+      "learning_rate": 5e-06,
+      "loss": 0.6747,
+      "step": 470
+    },
+    {
+      "epoch": 1.687609841827768,
+      "grad_norm": 0.6041574755100807,
+      "learning_rate": 5e-06,
+      "loss": 0.6681,
+      "step": 480
+    },
+    {
+      "epoch": 1.7227592267135325,
+      "grad_norm": 0.6972984159432736,
+      "learning_rate": 5e-06,
+      "loss": 0.6686,
+      "step": 490
+    },
+    {
+      "epoch": 1.757908611599297,
+      "grad_norm": 0.592332789109461,
+      "learning_rate": 5e-06,
+      "loss": 0.6763,
+      "step": 500
+    },
+    {
+      "epoch": 1.7930579964850615,
+      "grad_norm": 0.7081266254056617,
+      "learning_rate": 5e-06,
+      "loss": 0.6707,
+      "step": 510
+    },
+    {
+      "epoch": 1.828207381370826,
+      "grad_norm": 0.5655551892586738,
+      "learning_rate": 5e-06,
+      "loss": 0.675,
+      "step": 520
+    },
+    {
+      "epoch": 1.8633567662565906,
+      "grad_norm": 0.5912936045849521,
+      "learning_rate": 5e-06,
+      "loss": 0.6746,
+      "step": 530
+    },
+    {
+      "epoch": 1.8985061511423549,
+      "grad_norm": 0.6418543236430647,
+      "learning_rate": 5e-06,
+      "loss": 0.6743,
+      "step": 540
+    },
+    {
+      "epoch": 1.9336555360281196,
+      "grad_norm": 0.8406203952305934,
+      "learning_rate": 5e-06,
+      "loss": 0.6777,
+      "step": 550
+    },
+    {
+      "epoch": 1.968804920913884,
+      "grad_norm": 0.6740623987469322,
+      "learning_rate": 5e-06,
+      "loss": 0.6783,
+      "step": 560
+    },
+    {
+      "epoch": 1.9969244288224957,
+      "eval_loss": 0.7088373899459839,
+      "eval_runtime": 302.5633,
+      "eval_samples_per_second": 25.34,
+      "eval_steps_per_second": 0.397,
+      "step": 568
+    },
+    {
+      "epoch": 2.0043936731107204,
+      "grad_norm": 0.8982450142296012,
+      "learning_rate": 5e-06,
+      "loss": 0.7079,
+      "step": 570
+    },
+    {
+      "epoch": 2.039543057996485,
+      "grad_norm": 0.9691008221084222,
+      "learning_rate": 5e-06,
+      "loss": 0.6203,
+      "step": 580
+    },
+    {
+      "epoch": 2.0746924428822497,
+      "grad_norm": 0.6447824341633516,
+      "learning_rate": 5e-06,
+      "loss": 0.6244,
+      "step": 590
+    },
+    {
+      "epoch": 2.109841827768014,
+      "grad_norm": 0.7064323342581214,
+      "learning_rate": 5e-06,
+      "loss": 0.6189,
+      "step": 600
+    },
+    {
+      "epoch": 2.1449912126537787,
+      "grad_norm": 0.5819596596280016,
+      "learning_rate": 5e-06,
+      "loss": 0.6207,
+      "step": 610
+    },
+    {
+      "epoch": 2.180140597539543,
+      "grad_norm": 0.7981926624790863,
+      "learning_rate": 5e-06,
+      "loss": 0.6203,
+      "step": 620
+    },
+    {
+      "epoch": 2.2152899824253076,
+      "grad_norm": 0.6853162161955834,
+      "learning_rate": 5e-06,
+      "loss": 0.6281,
+      "step": 630
+    },
+    {
+      "epoch": 2.2504393673110723,
+      "grad_norm": 0.6819271490957453,
+      "learning_rate": 5e-06,
+      "loss": 0.6246,
+      "step": 640
+    },
+    {
+      "epoch": 2.2855887521968365,
+      "grad_norm": 0.678545369804577,
+      "learning_rate": 5e-06,
+      "loss": 0.6277,
+      "step": 650
+    },
+    {
+      "epoch": 2.3207381370826012,
+      "grad_norm": 0.6597702524075268,
+      "learning_rate": 5e-06,
+      "loss": 0.631,
+      "step": 660
+    },
+    {
+      "epoch": 2.3558875219683655,
+      "grad_norm": 0.5352899370053985,
+      "learning_rate": 5e-06,
+      "loss": 0.627,
+      "step": 670
+    },
+    {
+      "epoch": 2.39103690685413,
+      "grad_norm": 0.575976735916134,
+      "learning_rate": 5e-06,
+      "loss": 0.6252,
+      "step": 680
+    },
+    {
+      "epoch": 2.4261862917398944,
+      "grad_norm": 0.6538224434833726,
+      "learning_rate": 5e-06,
+      "loss": 0.631,
+      "step": 690
+    },
+    {
+      "epoch": 2.461335676625659,
+      "grad_norm": 0.8281376962806699,
+      "learning_rate": 5e-06,
+      "loss": 0.6238,
+      "step": 700
+    },
+    {
+      "epoch": 2.4964850615114234,
+      "grad_norm": 0.5971561231648772,
+      "learning_rate": 5e-06,
+      "loss": 0.6244,
+      "step": 710
+    },
+    {
+      "epoch": 2.531634446397188,
+      "grad_norm": 0.5668390272889466,
+      "learning_rate": 5e-06,
+      "loss": 0.6254,
+      "step": 720
+    },
+    {
+      "epoch": 2.5667838312829527,
+      "grad_norm": 0.7378544776528181,
+      "learning_rate": 5e-06,
+      "loss": 0.6248,
+      "step": 730
+    },
+    {
+      "epoch": 2.601933216168717,
+      "grad_norm": 0.6067368368819991,
+      "learning_rate": 5e-06,
+      "loss": 0.6256,
+      "step": 740
+    },
+    {
+      "epoch": 2.6370826010544817,
+      "grad_norm": 0.6816545127839443,
+      "learning_rate": 5e-06,
+      "loss": 0.6286,
+      "step": 750
+    },
+    {
+      "epoch": 2.672231985940246,
+      "grad_norm": 0.787032141068753,
+      "learning_rate": 5e-06,
+      "loss": 0.628,
+      "step": 760
+    },
+    {
+      "epoch": 2.7073813708260106,
+      "grad_norm": 0.6393338928189319,
+      "learning_rate": 5e-06,
+      "loss": 0.6267,
+      "step": 770
+    },
+    {
+      "epoch": 2.7425307557117753,
+      "grad_norm": 0.5562264277034894,
+      "learning_rate": 5e-06,
+      "loss": 0.6261,
+      "step": 780
+    },
+    {
+      "epoch": 2.7776801405975395,
+      "grad_norm": 0.5896436524802737,
+      "learning_rate": 5e-06,
+      "loss": 0.6256,
+      "step": 790
+    },
+    {
+      "epoch": 2.812829525483304,
+      "grad_norm": 0.5828475505687344,
+      "learning_rate": 5e-06,
+      "loss": 0.6247,
+      "step": 800
+    },
+    {
+      "epoch": 2.8479789103690685,
+      "grad_norm": 0.634394806473084,
+      "learning_rate": 5e-06,
+      "loss": 0.6269,
+      "step": 810
+    },
+    {
+      "epoch": 2.883128295254833,
+      "grad_norm": 0.6117384621451529,
+      "learning_rate": 5e-06,
+      "loss": 0.6279,
+      "step": 820
+    },
+    {
+      "epoch": 2.9182776801405974,
+      "grad_norm": 0.5540272640106404,
+      "learning_rate": 5e-06,
+      "loss": 0.6212,
+      "step": 830
+    },
+    {
+      "epoch": 2.953427065026362,
+      "grad_norm": 0.5600169828318418,
+      "learning_rate": 5e-06,
+      "loss": 0.6282,
+      "step": 840
+    },
+    {
+      "epoch": 2.9885764499121263,
+      "grad_norm": 0.7592332443324643,
+      "learning_rate": 5e-06,
+      "loss": 0.6277,
+      "step": 850
+    },
+    {
+      "epoch": 2.9956063268892796,
+      "eval_loss": 0.7103263735771179,
+      "eval_runtime": 302.7761,
+      "eval_samples_per_second": 25.322,
+      "eval_steps_per_second": 0.396,
+      "step": 852
+    },
+    {
+      "epoch": 2.9956063268892796,
+      "step": 852,
+      "total_flos": 1426922353459200.0,
+      "train_loss": 0.6913109551852857,
+      "train_runtime": 50624.2334,
+      "train_samples_per_second": 8.632,
+      "train_steps_per_second": 0.017
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 852,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1426922353459200.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed