End of training

- README.md +2 -1
- all_results.json +12 -0
- eval_results.json +7 -0
- train_results.json +8 -0
- trainer_state.json +612 -0
- training_eval_loss.png +0 -0
- training_loss.png +0 -0
README.md
CHANGED
@@ -4,6 +4,7 @@ license: llama3.1
 base_model: meta-llama/Meta-Llama-3.1-8B
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: oh_v1.3_metamath_x.25
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 # oh_v1.3_metamath_x.25
 
-This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on the mlfoundations-dev/oh_v1.3_metamath_x.25 dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.7495
 
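For anyone picking this checkpoint up from the card above, a minimal usage sketch is below. It assumes only the standard transformers API; the repository id is inferred from the model name and may differ.

```python
# Minimal sketch of loading the fine-tuned model with transformers.
# The repo id is inferred from the model name in the card and may differ.
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "mlfoundations-dev/oh_v1.3_metamath_x.25"  # assumed repo id
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype="auto", device_map="auto")

inputs = tokenizer("What is 17 * 23?", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```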
all_results.json
ADDED
@@ -0,0 +1,12 @@
+{
+    "epoch": 2.9961795606494745,
+    "eval_loss": 0.7494649887084961,
+    "eval_runtime": 279.2244,
+    "eval_samples_per_second": 25.259,
+    "eval_steps_per_second": 0.398,
+    "total_flos": 1311344783523840.0,
+    "train_loss": 0.7237713550090181,
+    "train_runtime": 46210.0208,
+    "train_samples_per_second": 8.699,
+    "train_steps_per_second": 0.017
+}
eval_results.json
ADDED
@@ -0,0 +1,7 @@
+{
+    "epoch": 2.9961795606494745,
+    "eval_loss": 0.7494649887084961,
+    "eval_runtime": 279.2244,
+    "eval_samples_per_second": 25.259,
+    "eval_steps_per_second": 0.398
+}
train_results.json
ADDED
@@ -0,0 +1,8 @@
+{
+    "epoch": 2.9961795606494745,
+    "total_flos": 1311344783523840.0,
+    "train_loss": 0.7237713550090181,
+    "train_runtime": 46210.0208,
+    "train_samples_per_second": 8.699,
+    "train_steps_per_second": 0.017
+}
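As a sanity check on these throughput numbers (an approximation derived from the logged values, not something recorded in the file): dividing samples per second by steps per second gives the effective global batch size, and scaling by the step count gives the approximate per-epoch sample count.

```python
# Back-of-the-envelope check on the logged throughput values (approximate, derived).
train_samples_per_second = 8.699
train_steps_per_second = 0.017
total_steps = 783
num_epochs = 3

samples_per_step = train_samples_per_second / train_steps_per_second   # ~512, effective global batch size
samples_per_epoch = samples_per_step * total_steps / num_epochs        # ~134k training samples per epoch
print(round(samples_per_step), round(samples_per_epoch))
```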
trainer_state.json
ADDED
@@ -0,0 +1,612 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.9961795606494745,
+  "eval_steps": 500,
+  "global_step": 783,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.038204393505253106,
+      "grad_norm": 17.106196027203886,
+      "learning_rate": 5e-06,
+      "loss": 1.0699,
+      "step": 10
+    },
+    {
+      "epoch": 0.07640878701050621,
+      "grad_norm": 3.7575313843264073,
+      "learning_rate": 5e-06,
+      "loss": 0.9412,
+      "step": 20
+    },
+    {
+      "epoch": 0.11461318051575932,
+      "grad_norm": 2.0783280143086302,
+      "learning_rate": 5e-06,
+      "loss": 0.8972,
+      "step": 30
+    },
+    {
+      "epoch": 0.15281757402101243,
+      "grad_norm": 1.419216051491444,
+      "learning_rate": 5e-06,
+      "loss": 0.8682,
+      "step": 40
+    },
+    {
+      "epoch": 0.19102196752626552,
+      "grad_norm": 1.5492013803153966,
+      "learning_rate": 5e-06,
+      "loss": 0.8507,
+      "step": 50
+    },
+    {
+      "epoch": 0.22922636103151864,
+      "grad_norm": 0.9979261775157234,
+      "learning_rate": 5e-06,
+      "loss": 0.8317,
+      "step": 60
+    },
+    {
+      "epoch": 0.26743075453677173,
+      "grad_norm": 1.1453499414283712,
+      "learning_rate": 5e-06,
+      "loss": 0.8257,
+      "step": 70
+    },
+    {
+      "epoch": 0.30563514804202485,
+      "grad_norm": 1.0657509981340374,
+      "learning_rate": 5e-06,
+      "loss": 0.8136,
+      "step": 80
+    },
+    {
+      "epoch": 0.3438395415472779,
+      "grad_norm": 0.8154629296380118,
+      "learning_rate": 5e-06,
+      "loss": 0.8012,
+      "step": 90
+    },
+    {
+      "epoch": 0.38204393505253104,
+      "grad_norm": 0.7783174666894658,
+      "learning_rate": 5e-06,
+      "loss": 0.7944,
+      "step": 100
+    },
+    {
+      "epoch": 0.42024832855778416,
+      "grad_norm": 0.7557295182511866,
+      "learning_rate": 5e-06,
+      "loss": 0.7976,
+      "step": 110
+    },
+    {
+      "epoch": 0.4584527220630373,
+      "grad_norm": 0.7419048503084669,
+      "learning_rate": 5e-06,
+      "loss": 0.7838,
+      "step": 120
+    },
+    {
+      "epoch": 0.49665711556829034,
+      "grad_norm": 0.6023078446753443,
+      "learning_rate": 5e-06,
+      "loss": 0.7861,
+      "step": 130
+    },
+    {
+      "epoch": 0.5348615090735435,
+      "grad_norm": 0.8029384495653265,
+      "learning_rate": 5e-06,
+      "loss": 0.7794,
+      "step": 140
+    },
+    {
+      "epoch": 0.5730659025787965,
+      "grad_norm": 0.8049936585803824,
+      "learning_rate": 5e-06,
+      "loss": 0.7824,
+      "step": 150
+    },
+    {
+      "epoch": 0.6112702960840497,
+      "grad_norm": 0.8903153692892993,
+      "learning_rate": 5e-06,
+      "loss": 0.7716,
+      "step": 160
+    },
+    {
+      "epoch": 0.6494746895893028,
+      "grad_norm": 0.7481918362618383,
+      "learning_rate": 5e-06,
+      "loss": 0.7781,
+      "step": 170
+    },
+    {
+      "epoch": 0.6876790830945558,
+      "grad_norm": 0.8571380886679489,
+      "learning_rate": 5e-06,
+      "loss": 0.7728,
+      "step": 180
+    },
+    {
+      "epoch": 0.725883476599809,
+      "grad_norm": 0.603577733589318,
+      "learning_rate": 5e-06,
+      "loss": 0.7693,
+      "step": 190
+    },
+    {
+      "epoch": 0.7640878701050621,
+      "grad_norm": 0.6986000212250895,
+      "learning_rate": 5e-06,
+      "loss": 0.7708,
+      "step": 200
+    },
+    {
+      "epoch": 0.8022922636103151,
+      "grad_norm": 0.6654581267839026,
+      "learning_rate": 5e-06,
+      "loss": 0.7686,
+      "step": 210
+    },
+    {
+      "epoch": 0.8404966571155683,
+      "grad_norm": 0.7161329148587753,
+      "learning_rate": 5e-06,
+      "loss": 0.7639,
+      "step": 220
+    },
+    {
+      "epoch": 0.8787010506208214,
+      "grad_norm": 0.7291450225202621,
+      "learning_rate": 5e-06,
+      "loss": 0.7653,
+      "step": 230
+    },
+    {
+      "epoch": 0.9169054441260746,
+      "grad_norm": 0.7588585120562266,
+      "learning_rate": 5e-06,
+      "loss": 0.7633,
+      "step": 240
+    },
+    {
+      "epoch": 0.9551098376313276,
+      "grad_norm": 0.8527681409489486,
+      "learning_rate": 5e-06,
+      "loss": 0.758,
+      "step": 250
+    },
+    {
+      "epoch": 0.9933142311365807,
+      "grad_norm": 0.5733961547084557,
+      "learning_rate": 5e-06,
+      "loss": 0.7556,
+      "step": 260
+    },
+    {
+      "epoch": 0.997134670487106,
+      "eval_loss": 0.7598350644111633,
+      "eval_runtime": 277.7141,
+      "eval_samples_per_second": 25.397,
+      "eval_steps_per_second": 0.4,
+      "step": 261
+    },
+    {
+      "epoch": 1.033906399235912,
+      "grad_norm": 0.8279780719142003,
+      "learning_rate": 5e-06,
+      "loss": 0.7812,
+      "step": 270
+    },
+    {
+      "epoch": 1.0721107927411653,
+      "grad_norm": 0.8247306859370325,
+      "learning_rate": 5e-06,
+      "loss": 0.7055,
+      "step": 280
+    },
+    {
+      "epoch": 1.1103151862464182,
+      "grad_norm": 0.6966889761138212,
+      "learning_rate": 5e-06,
+      "loss": 0.7083,
+      "step": 290
+    },
+    {
+      "epoch": 1.1485195797516714,
+      "grad_norm": 0.7158212384271236,
+      "learning_rate": 5e-06,
+      "loss": 0.7028,
+      "step": 300
+    },
+    {
+      "epoch": 1.1867239732569246,
+      "grad_norm": 0.6513073930120158,
+      "learning_rate": 5e-06,
+      "loss": 0.7105,
+      "step": 310
+    },
+    {
+      "epoch": 1.2249283667621778,
+      "grad_norm": 0.6782643522162839,
+      "learning_rate": 5e-06,
+      "loss": 0.7092,
+      "step": 320
+    },
+    {
+      "epoch": 1.2631327602674307,
+      "grad_norm": 0.7229522703779768,
+      "learning_rate": 5e-06,
+      "loss": 0.7027,
+      "step": 330
+    },
+    {
+      "epoch": 1.3013371537726839,
+      "grad_norm": 0.686689277377695,
+      "learning_rate": 5e-06,
+      "loss": 0.7092,
+      "step": 340
+    },
+    {
+      "epoch": 1.3395415472779368,
+      "grad_norm": 0.6686838666832164,
+      "learning_rate": 5e-06,
+      "loss": 0.7076,
+      "step": 350
+    },
+    {
+      "epoch": 1.37774594078319,
+      "grad_norm": 0.8906041633095456,
+      "learning_rate": 5e-06,
+      "loss": 0.703,
+      "step": 360
+    },
+    {
+      "epoch": 1.4159503342884432,
+      "grad_norm": 0.6554966723136705,
+      "learning_rate": 5e-06,
+      "loss": 0.7078,
+      "step": 370
+    },
+    {
+      "epoch": 1.4541547277936964,
+      "grad_norm": 0.6044694699607192,
+      "learning_rate": 5e-06,
+      "loss": 0.7064,
+      "step": 380
+    },
+    {
+      "epoch": 1.4923591212989493,
+      "grad_norm": 0.6795649168516978,
+      "learning_rate": 5e-06,
+      "loss": 0.7042,
+      "step": 390
+    },
+    {
+      "epoch": 1.5305635148042025,
+      "grad_norm": 0.6518471106071317,
+      "learning_rate": 5e-06,
+      "loss": 0.7055,
+      "step": 400
+    },
+    {
+      "epoch": 1.5687679083094554,
+      "grad_norm": 0.656923516770704,
+      "learning_rate": 5e-06,
+      "loss": 0.707,
+      "step": 410
+    },
+    {
+      "epoch": 1.6069723018147086,
+      "grad_norm": 0.6225351538335282,
+      "learning_rate": 5e-06,
+      "loss": 0.7045,
+      "step": 420
+    },
+    {
+      "epoch": 1.6451766953199618,
+      "grad_norm": 0.777993155808027,
+      "learning_rate": 5e-06,
+      "loss": 0.7056,
+      "step": 430
+    },
+    {
+      "epoch": 1.683381088825215,
+      "grad_norm": 0.6859522609411438,
+      "learning_rate": 5e-06,
+      "loss": 0.7047,
+      "step": 440
+    },
+    {
+      "epoch": 1.7215854823304682,
+      "grad_norm": 0.6651740116728566,
+      "learning_rate": 5e-06,
+      "loss": 0.7004,
+      "step": 450
+    },
+    {
+      "epoch": 1.759789875835721,
+      "grad_norm": 0.5969267067929888,
+      "learning_rate": 5e-06,
+      "loss": 0.7012,
+      "step": 460
+    },
+    {
+      "epoch": 1.797994269340974,
+      "grad_norm": 0.613011364797191,
+      "learning_rate": 5e-06,
+      "loss": 0.7073,
+      "step": 470
+    },
+    {
+      "epoch": 1.8361986628462272,
+      "grad_norm": 0.6207675725240294,
+      "learning_rate": 5e-06,
+      "loss": 0.7002,
+      "step": 480
+    },
+    {
+      "epoch": 1.8744030563514804,
+      "grad_norm": 0.5957548029122114,
+      "learning_rate": 5e-06,
+      "loss": 0.7063,
+      "step": 490
+    },
+    {
+      "epoch": 1.9126074498567336,
+      "grad_norm": 0.6972405923078006,
+      "learning_rate": 5e-06,
+      "loss": 0.7046,
+      "step": 500
+    },
+    {
+      "epoch": 1.9508118433619868,
+      "grad_norm": 0.6246007349854923,
+      "learning_rate": 5e-06,
+      "loss": 0.7074,
+      "step": 510
+    },
+    {
+      "epoch": 1.9890162368672397,
+      "grad_norm": 0.6291588284508969,
+      "learning_rate": 5e-06,
+      "loss": 0.6989,
+      "step": 520
+    },
+    {
+      "epoch": 1.9966571155682904,
+      "eval_loss": 0.7465963363647461,
+      "eval_runtime": 278.0536,
+      "eval_samples_per_second": 25.366,
+      "eval_steps_per_second": 0.399,
+      "step": 522
+    },
+    {
+      "epoch": 2.029608404966571,
+      "grad_norm": 0.9162417337081611,
+      "learning_rate": 5e-06,
+      "loss": 0.7192,
+      "step": 530
+    },
+    {
+      "epoch": 2.067812798471824,
+      "grad_norm": 0.6666792440916017,
+      "learning_rate": 5e-06,
+      "loss": 0.6447,
+      "step": 540
+    },
+    {
+      "epoch": 2.1060171919770774,
+      "grad_norm": 0.6320270044953581,
+      "learning_rate": 5e-06,
+      "loss": 0.6504,
+      "step": 550
+    },
+    {
+      "epoch": 2.1442215854823305,
+      "grad_norm": 0.8407791173957067,
+      "learning_rate": 5e-06,
+      "loss": 0.6486,
+      "step": 560
+    },
+    {
+      "epoch": 2.1824259789875837,
+      "grad_norm": 0.9821964292842589,
+      "learning_rate": 5e-06,
+      "loss": 0.6528,
+      "step": 570
+    },
+    {
+      "epoch": 2.2206303724928365,
+      "grad_norm": 0.9530943320810575,
+      "learning_rate": 5e-06,
+      "loss": 0.6536,
+      "step": 580
+    },
+    {
+      "epoch": 2.2588347659980896,
+      "grad_norm": 0.7065407366963496,
+      "learning_rate": 5e-06,
+      "loss": 0.6515,
+      "step": 590
+    },
+    {
+      "epoch": 2.297039159503343,
+      "grad_norm": 0.9143814042284045,
+      "learning_rate": 5e-06,
+      "loss": 0.6516,
+      "step": 600
+    },
+    {
+      "epoch": 2.335243553008596,
+      "grad_norm": 1.0356120889247198,
+      "learning_rate": 5e-06,
+      "loss": 0.6522,
+      "step": 610
+    },
+    {
+      "epoch": 2.373447946513849,
+      "grad_norm": 0.5955479754526213,
+      "learning_rate": 5e-06,
+      "loss": 0.6484,
+      "step": 620
+    },
+    {
+      "epoch": 2.4116523400191023,
+      "grad_norm": 0.668667957683453,
+      "learning_rate": 5e-06,
+      "loss": 0.6517,
+      "step": 630
+    },
+    {
+      "epoch": 2.4498567335243555,
+      "grad_norm": 0.5916712868955213,
+      "learning_rate": 5e-06,
+      "loss": 0.6492,
+      "step": 640
+    },
+    {
+      "epoch": 2.4880611270296082,
+      "grad_norm": 0.7078011059499841,
+      "learning_rate": 5e-06,
+      "loss": 0.6499,
+      "step": 650
+    },
+    {
+      "epoch": 2.5262655205348614,
+      "grad_norm": 0.7113167536030103,
+      "learning_rate": 5e-06,
+      "loss": 0.6491,
+      "step": 660
+    },
+    {
+      "epoch": 2.5644699140401146,
+      "grad_norm": 0.7286729281959395,
+      "learning_rate": 5e-06,
+      "loss": 0.6559,
+      "step": 670
+    },
+    {
+      "epoch": 2.6026743075453678,
+      "grad_norm": 0.8393175922718014,
+      "learning_rate": 5e-06,
+      "loss": 0.6537,
+      "step": 680
+    },
+    {
+      "epoch": 2.640878701050621,
+      "grad_norm": 0.9635082597402534,
+      "learning_rate": 5e-06,
+      "loss": 0.645,
+      "step": 690
+    },
+    {
+      "epoch": 2.6790830945558737,
+      "grad_norm": 0.6376260449080609,
+      "learning_rate": 5e-06,
+      "loss": 0.6516,
+      "step": 700
+    },
+    {
+      "epoch": 2.7172874880611273,
+      "grad_norm": 0.9042073773765085,
+      "learning_rate": 5e-06,
+      "loss": 0.6538,
+      "step": 710
+    },
+    {
+      "epoch": 2.75549188156638,
+      "grad_norm": 0.8795780646670239,
+      "learning_rate": 5e-06,
+      "loss": 0.6537,
+      "step": 720
+    },
+    {
+      "epoch": 2.793696275071633,
+      "grad_norm": 0.7101546769683508,
+      "learning_rate": 5e-06,
+      "loss": 0.6507,
+      "step": 730
+    },
+    {
+      "epoch": 2.8319006685768864,
+      "grad_norm": 0.6112049740364579,
+      "learning_rate": 5e-06,
+      "loss": 0.6536,
+      "step": 740
+    },
+    {
+      "epoch": 2.8701050620821396,
+      "grad_norm": 0.6240740305145582,
+      "learning_rate": 5e-06,
+      "loss": 0.6525,
+      "step": 750
+    },
+    {
+      "epoch": 2.9083094555873927,
+      "grad_norm": 0.6687610050145816,
+      "learning_rate": 5e-06,
+      "loss": 0.6569,
+      "step": 760
+    },
+    {
+      "epoch": 2.9465138490926455,
+      "grad_norm": 0.7981405552978358,
+      "learning_rate": 5e-06,
+      "loss": 0.655,
+      "step": 770
+    },
+    {
+      "epoch": 2.9847182425978986,
+      "grad_norm": 0.6901040178181519,
+      "learning_rate": 5e-06,
+      "loss": 0.6567,
+      "step": 780
+    },
+    {
+      "epoch": 2.9961795606494745,
+      "eval_loss": 0.7494649887084961,
+      "eval_runtime": 280.3949,
+      "eval_samples_per_second": 25.154,
+      "eval_steps_per_second": 0.396,
+      "step": 783
+    },
+    {
+      "epoch": 2.9961795606494745,
+      "step": 783,
+      "total_flos": 1311344783523840.0,
+      "train_loss": 0.7237713550090181,
+      "train_runtime": 46210.0208,
+      "train_samples_per_second": 8.699,
+      "train_steps_per_second": 0.017
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 783,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1311344783523840.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
training_eval_loss.png
ADDED
training_loss.png
ADDED
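The two PNGs are the loss curves for this run; they can be reproduced from the log_history entries in trainer_state.json. A minimal sketch follows (matplotlib assumed; the output filenames are illustrative):

```python
# Sketch: rebuild the training and eval loss curves from trainer_state.json.
import json
import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

plt.plot(*zip(*train))
plt.xlabel("step"); plt.ylabel("train loss")
plt.savefig("training_loss.png")
plt.clf()
plt.plot(*zip(*evals), marker="o")
plt.xlabel("step"); plt.ylabel("eval loss")
plt.savefig("training_eval_loss.png")
```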