End of training

Browse files

Files changed (7) hide show

README.md +2 -1
all_results.json +12 -0
eval_results.json +7 -0
train_results.json +8 -0
trainer_state.json +1005 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ license: gemma
 base_model: google/gemma-2-9b
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: hp_ablations_gemma_epoch4_dcftv1.2
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 # hp_ablations_gemma_epoch4_dcftv1.2
-This model is a fine-tuned version of [google/gemma-2-9b](https://huggingface.co/google/gemma-2-9b) on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.6798

 base_model: google/gemma-2-9b
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: hp_ablations_gemma_epoch4_dcftv1.2
 # hp_ablations_gemma_epoch4_dcftv1.2
+This model is a fine-tuned version of [google/gemma-2-9b](https://huggingface.co/google/gemma-2-9b) on the mlfoundations-dev/oh-dcft-v1.2_no-curation_gpt-4o-mini dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.6798

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 3.9985035540591096,
+    "eval_loss": 0.6797953248023987,
+    "eval_runtime": 269.0155,
+    "eval_samples_per_second": 33.459,
+    "eval_steps_per_second": 0.524,
+    "total_flos": 5090631865073664.0,
+    "train_loss": 0.5299089724075294,
+    "train_runtime": 62096.6704,
+    "train_samples_per_second": 11.016,
+    "train_steps_per_second": 0.022
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 3.9985035540591096,
+    "eval_loss": 0.6797953248023987,
+    "eval_runtime": 269.0155,
+    "eval_samples_per_second": 33.459,
+    "eval_steps_per_second": 0.524
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 3.9985035540591096,
+    "total_flos": 5090631865073664.0,
+    "train_loss": 0.5299089724075294,
+    "train_runtime": 62096.6704,
+    "train_samples_per_second": 11.016,
+    "train_steps_per_second": 0.022
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1005 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.9985035540591096,
+  "eval_steps": 500,
+  "global_step": 1336,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.029928918817807706,
+      "grad_norm": 1.3197502899687559,
+      "learning_rate": 5e-06,
+      "loss": 0.7729,
+      "step": 10
+    },
+    {
+      "epoch": 0.05985783763561541,
+      "grad_norm": 0.7957246856225982,
+      "learning_rate": 5e-06,
+      "loss": 0.694,
+      "step": 20
+    },
+    {
+      "epoch": 0.08978675645342311,
+      "grad_norm": 0.7023919119728642,
+      "learning_rate": 5e-06,
+      "loss": 0.6692,
+      "step": 30
+    },
+    {
+      "epoch": 0.11971567527123082,
+      "grad_norm": 0.6852471491957217,
+      "learning_rate": 5e-06,
+      "loss": 0.6652,
+      "step": 40
+    },
+    {
+      "epoch": 0.14964459408903855,
+      "grad_norm": 0.7441284057804172,
+      "learning_rate": 5e-06,
+      "loss": 0.6567,
+      "step": 50
+    },
+    {
+      "epoch": 0.17957351290684623,
+      "grad_norm": 0.6737131800519109,
+      "learning_rate": 5e-06,
+      "loss": 0.6489,
+      "step": 60
+    },
+    {
+      "epoch": 0.20950243172465394,
+      "grad_norm": 0.8645698704938743,
+      "learning_rate": 5e-06,
+      "loss": 0.6458,
+      "step": 70
+    },
+    {
+      "epoch": 0.23943135054246165,
+      "grad_norm": 0.6824403788440216,
+      "learning_rate": 5e-06,
+      "loss": 0.6472,
+      "step": 80
+    },
+    {
+      "epoch": 0.26936026936026936,
+      "grad_norm": 0.8355879527708924,
+      "learning_rate": 5e-06,
+      "loss": 0.6382,
+      "step": 90
+    },
+    {
+      "epoch": 0.2992891881780771,
+      "grad_norm": 0.6566317269166482,
+      "learning_rate": 5e-06,
+      "loss": 0.6394,
+      "step": 100
+    },
+    {
+      "epoch": 0.3292181069958848,
+      "grad_norm": 0.7025002610859795,
+      "learning_rate": 5e-06,
+      "loss": 0.6352,
+      "step": 110
+    },
+    {
+      "epoch": 0.35914702581369246,
+      "grad_norm": 0.7294514273893201,
+      "learning_rate": 5e-06,
+      "loss": 0.6341,
+      "step": 120
+    },
+    {
+      "epoch": 0.3890759446315002,
+      "grad_norm": 0.7204998726570041,
+      "learning_rate": 5e-06,
+      "loss": 0.6342,
+      "step": 130
+    },
+    {
+      "epoch": 0.4190048634493079,
+      "grad_norm": 0.9245929000779519,
+      "learning_rate": 5e-06,
+      "loss": 0.6279,
+      "step": 140
+    },
+    {
+      "epoch": 0.4489337822671156,
+      "grad_norm": 0.8312008040431372,
+      "learning_rate": 5e-06,
+      "loss": 0.6298,
+      "step": 150
+    },
+    {
+      "epoch": 0.4788627010849233,
+      "grad_norm": 0.6941447661619787,
+      "learning_rate": 5e-06,
+      "loss": 0.6287,
+      "step": 160
+    },
+    {
+      "epoch": 0.508791619902731,
+      "grad_norm": 0.7880713474277835,
+      "learning_rate": 5e-06,
+      "loss": 0.623,
+      "step": 170
+    },
+    {
+      "epoch": 0.5387205387205387,
+      "grad_norm": 0.7199931353143368,
+      "learning_rate": 5e-06,
+      "loss": 0.6247,
+      "step": 180
+    },
+    {
+      "epoch": 0.5686494575383464,
+      "grad_norm": 0.7680737861171968,
+      "learning_rate": 5e-06,
+      "loss": 0.6223,
+      "step": 190
+    },
+    {
+      "epoch": 0.5985783763561542,
+      "grad_norm": 0.7601491643468152,
+      "learning_rate": 5e-06,
+      "loss": 0.6299,
+      "step": 200
+    },
+    {
+      "epoch": 0.6285072951739619,
+      "grad_norm": 0.76786698349262,
+      "learning_rate": 5e-06,
+      "loss": 0.6236,
+      "step": 210
+    },
+    {
+      "epoch": 0.6584362139917695,
+      "grad_norm": 0.9029566246000676,
+      "learning_rate": 5e-06,
+      "loss": 0.6224,
+      "step": 220
+    },
+    {
+      "epoch": 0.6883651328095772,
+      "grad_norm": 0.7045261038164553,
+      "learning_rate": 5e-06,
+      "loss": 0.6245,
+      "step": 230
+    },
+    {
+      "epoch": 0.7182940516273849,
+      "grad_norm": 0.6774924026654922,
+      "learning_rate": 5e-06,
+      "loss": 0.6234,
+      "step": 240
+    },
+    {
+      "epoch": 0.7482229704451927,
+      "grad_norm": 0.6011441610841004,
+      "learning_rate": 5e-06,
+      "loss": 0.6201,
+      "step": 250
+    },
+    {
+      "epoch": 0.7781518892630004,
+      "grad_norm": 0.6589701033868924,
+      "learning_rate": 5e-06,
+      "loss": 0.6188,
+      "step": 260
+    },
+    {
+      "epoch": 0.8080808080808081,
+      "grad_norm": 0.7793955701511873,
+      "learning_rate": 5e-06,
+      "loss": 0.6263,
+      "step": 270
+    },
+    {
+      "epoch": 0.8380097268986157,
+      "grad_norm": 0.6801997659823543,
+      "learning_rate": 5e-06,
+      "loss": 0.6164,
+      "step": 280
+    },
+    {
+      "epoch": 0.8679386457164235,
+      "grad_norm": 0.7863034594758392,
+      "learning_rate": 5e-06,
+      "loss": 0.6133,
+      "step": 290
+    },
+    {
+      "epoch": 0.8978675645342312,
+      "grad_norm": 0.8097674195506819,
+      "learning_rate": 5e-06,
+      "loss": 0.6145,
+      "step": 300
+    },
+    {
+      "epoch": 0.9277964833520389,
+      "grad_norm": 0.6976711471967793,
+      "learning_rate": 5e-06,
+      "loss": 0.6132,
+      "step": 310
+    },
+    {
+      "epoch": 0.9577254021698466,
+      "grad_norm": 0.6845188306806823,
+      "learning_rate": 5e-06,
+      "loss": 0.615,
+      "step": 320
+    },
+    {
+      "epoch": 0.9876543209876543,
+      "grad_norm": 0.9820656021369418,
+      "learning_rate": 5e-06,
+      "loss": 0.6085,
+      "step": 330
+    },
+    {
+      "epoch": 0.9996258885147774,
+      "eval_loss": 0.6192271709442139,
+      "eval_runtime": 268.4518,
+      "eval_samples_per_second": 33.529,
+      "eval_steps_per_second": 0.525,
+      "step": 334
+    },
+    {
+      "epoch": 1.017583239805462,
+      "grad_norm": 0.8670558349646079,
+      "learning_rate": 5e-06,
+      "loss": 0.6314,
+      "step": 340
+    },
+    {
+      "epoch": 1.0475121586232696,
+      "grad_norm": 1.0194196930172406,
+      "learning_rate": 5e-06,
+      "loss": 0.5525,
+      "step": 350
+    },
+    {
+      "epoch": 1.0774410774410774,
+      "grad_norm": 0.7571264421758325,
+      "learning_rate": 5e-06,
+      "loss": 0.5475,
+      "step": 360
+    },
+    {
+      "epoch": 1.1073699962588852,
+      "grad_norm": 0.716142300432686,
+      "learning_rate": 5e-06,
+      "loss": 0.5479,
+      "step": 370
+    },
+    {
+      "epoch": 1.1372989150766928,
+      "grad_norm": 0.7134968159345548,
+      "learning_rate": 5e-06,
+      "loss": 0.5483,
+      "step": 380
+    },
+    {
+      "epoch": 1.1672278338945006,
+      "grad_norm": 0.7093422370162528,
+      "learning_rate": 5e-06,
+      "loss": 0.5497,
+      "step": 390
+    },
+    {
+      "epoch": 1.1971567527123081,
+      "grad_norm": 0.6758306313904245,
+      "learning_rate": 5e-06,
+      "loss": 0.5499,
+      "step": 400
+    },
+    {
+      "epoch": 1.227085671530116,
+      "grad_norm": 0.6590188596738886,
+      "learning_rate": 5e-06,
+      "loss": 0.5523,
+      "step": 410
+    },
+    {
+      "epoch": 1.2570145903479237,
+      "grad_norm": 0.7115281205352587,
+      "learning_rate": 5e-06,
+      "loss": 0.557,
+      "step": 420
+    },
+    {
+      "epoch": 1.2869435091657313,
+      "grad_norm": 0.6651956769462775,
+      "learning_rate": 5e-06,
+      "loss": 0.5521,
+      "step": 430
+    },
+    {
+      "epoch": 1.316872427983539,
+      "grad_norm": 0.686904033471436,
+      "learning_rate": 5e-06,
+      "loss": 0.5542,
+      "step": 440
+    },
+    {
+      "epoch": 1.3468013468013469,
+      "grad_norm": 0.7052326227629313,
+      "learning_rate": 5e-06,
+      "loss": 0.5473,
+      "step": 450
+    },
+    {
+      "epoch": 1.3767302656191545,
+      "grad_norm": 0.6603203892732427,
+      "learning_rate": 5e-06,
+      "loss": 0.558,
+      "step": 460
+    },
+    {
+      "epoch": 1.4066591844369623,
+      "grad_norm": 0.7204930864199384,
+      "learning_rate": 5e-06,
+      "loss": 0.557,
+      "step": 470
+    },
+    {
+      "epoch": 1.43658810325477,
+      "grad_norm": 0.6582974125304011,
+      "learning_rate": 5e-06,
+      "loss": 0.5598,
+      "step": 480
+    },
+    {
+      "epoch": 1.4665170220725776,
+      "grad_norm": 0.653408089340934,
+      "learning_rate": 5e-06,
+      "loss": 0.5589,
+      "step": 490
+    },
+    {
+      "epoch": 1.4964459408903854,
+      "grad_norm": 0.7722703692356943,
+      "learning_rate": 5e-06,
+      "loss": 0.5549,
+      "step": 500
+    },
+    {
+      "epoch": 1.5263748597081932,
+      "grad_norm": 0.6410463952946445,
+      "learning_rate": 5e-06,
+      "loss": 0.5571,
+      "step": 510
+    },
+    {
+      "epoch": 1.5563037785260008,
+      "grad_norm": 0.6788292489082296,
+      "learning_rate": 5e-06,
+      "loss": 0.5567,
+      "step": 520
+    },
+    {
+      "epoch": 1.5862326973438083,
+      "grad_norm": 0.7347495173956178,
+      "learning_rate": 5e-06,
+      "loss": 0.5557,
+      "step": 530
+    },
+    {
+      "epoch": 1.6161616161616161,
+      "grad_norm": 0.7489697605253047,
+      "learning_rate": 5e-06,
+      "loss": 0.56,
+      "step": 540
+    },
+    {
+      "epoch": 1.646090534979424,
+      "grad_norm": 0.6649594456868578,
+      "learning_rate": 5e-06,
+      "loss": 0.5596,
+      "step": 550
+    },
+    {
+      "epoch": 1.6760194537972315,
+      "grad_norm": 0.6944801894329058,
+      "learning_rate": 5e-06,
+      "loss": 0.5499,
+      "step": 560
+    },
+    {
+      "epoch": 1.7059483726150393,
+      "grad_norm": 0.7516636245416078,
+      "learning_rate": 5e-06,
+      "loss": 0.5519,
+      "step": 570
+    },
+    {
+      "epoch": 1.735877291432847,
+      "grad_norm": 0.834145985540098,
+      "learning_rate": 5e-06,
+      "loss": 0.5525,
+      "step": 580
+    },
+    {
+      "epoch": 1.7658062102506547,
+      "grad_norm": 0.6940488546001392,
+      "learning_rate": 5e-06,
+      "loss": 0.5561,
+      "step": 590
+    },
+    {
+      "epoch": 1.7957351290684624,
+      "grad_norm": 0.6996951151429136,
+      "learning_rate": 5e-06,
+      "loss": 0.5568,
+      "step": 600
+    },
+    {
+      "epoch": 1.8256640478862702,
+      "grad_norm": 0.6321044767548653,
+      "learning_rate": 5e-06,
+      "loss": 0.5543,
+      "step": 610
+    },
+    {
+      "epoch": 1.8555929667040778,
+      "grad_norm": 0.6380400908901183,
+      "learning_rate": 5e-06,
+      "loss": 0.5521,
+      "step": 620
+    },
+    {
+      "epoch": 1.8855218855218854,
+      "grad_norm": 0.6726516418071744,
+      "learning_rate": 5e-06,
+      "loss": 0.5536,
+      "step": 630
+    },
+    {
+      "epoch": 1.9154508043396934,
+      "grad_norm": 0.6952484734366503,
+      "learning_rate": 5e-06,
+      "loss": 0.5556,
+      "step": 640
+    },
+    {
+      "epoch": 1.945379723157501,
+      "grad_norm": 0.6339074390401458,
+      "learning_rate": 5e-06,
+      "loss": 0.554,
+      "step": 650
+    },
+    {
+      "epoch": 1.9753086419753085,
+      "grad_norm": 0.8335015680516275,
+      "learning_rate": 5e-06,
+      "loss": 0.5595,
+      "step": 660
+    },
+    {
+      "epoch": 1.9992517770295548,
+      "eval_loss": 0.615513026714325,
+      "eval_runtime": 267.9582,
+      "eval_samples_per_second": 33.591,
+      "eval_steps_per_second": 0.526,
+      "step": 668
+    },
+    {
+      "epoch": 2.0052375607931165,
+      "grad_norm": 1.095470898789856,
+      "learning_rate": 5e-06,
+      "loss": 0.5957,
+      "step": 670
+    },
+    {
+      "epoch": 2.035166479610924,
+      "grad_norm": 0.7925440628175368,
+      "learning_rate": 5e-06,
+      "loss": 0.4851,
+      "step": 680
+    },
+    {
+      "epoch": 2.0650953984287317,
+      "grad_norm": 0.7600969395946293,
+      "learning_rate": 5e-06,
+      "loss": 0.4799,
+      "step": 690
+    },
+    {
+      "epoch": 2.0950243172465393,
+      "grad_norm": 0.8685890982294241,
+      "learning_rate": 5e-06,
+      "loss": 0.4845,
+      "step": 700
+    },
+    {
+      "epoch": 2.1249532360643473,
+      "grad_norm": 0.7159762779954674,
+      "learning_rate": 5e-06,
+      "loss": 0.4862,
+      "step": 710
+    },
+    {
+      "epoch": 2.154882154882155,
+      "grad_norm": 0.7850425626912287,
+      "learning_rate": 5e-06,
+      "loss": 0.4882,
+      "step": 720
+    },
+    {
+      "epoch": 2.1848110736999624,
+      "grad_norm": 0.7829173560959974,
+      "learning_rate": 5e-06,
+      "loss": 0.4894,
+      "step": 730
+    },
+    {
+      "epoch": 2.2147399925177704,
+      "grad_norm": 0.7053202412118417,
+      "learning_rate": 5e-06,
+      "loss": 0.4898,
+      "step": 740
+    },
+    {
+      "epoch": 2.244668911335578,
+      "grad_norm": 0.7275797577145928,
+      "learning_rate": 5e-06,
+      "loss": 0.4917,
+      "step": 750
+    },
+    {
+      "epoch": 2.2745978301533856,
+      "grad_norm": 0.684143630508004,
+      "learning_rate": 5e-06,
+      "loss": 0.4878,
+      "step": 760
+    },
+    {
+      "epoch": 2.3045267489711936,
+      "grad_norm": 0.778690697436679,
+      "learning_rate": 5e-06,
+      "loss": 0.4936,
+      "step": 770
+    },
+    {
+      "epoch": 2.334455667789001,
+      "grad_norm": 0.6973756438711023,
+      "learning_rate": 5e-06,
+      "loss": 0.4885,
+      "step": 780
+    },
+    {
+      "epoch": 2.3643845866068087,
+      "grad_norm": 0.7512378015475496,
+      "learning_rate": 5e-06,
+      "loss": 0.4902,
+      "step": 790
+    },
+    {
+      "epoch": 2.3943135054246163,
+      "grad_norm": 0.6954041240036626,
+      "learning_rate": 5e-06,
+      "loss": 0.4947,
+      "step": 800
+    },
+    {
+      "epoch": 2.4242424242424243,
+      "grad_norm": 0.7661445266388807,
+      "learning_rate": 5e-06,
+      "loss": 0.4995,
+      "step": 810
+    },
+    {
+      "epoch": 2.454171343060232,
+      "grad_norm": 0.7288724567709918,
+      "learning_rate": 5e-06,
+      "loss": 0.4979,
+      "step": 820
+    },
+    {
+      "epoch": 2.48410026187804,
+      "grad_norm": 0.7507674417043292,
+      "learning_rate": 5e-06,
+      "loss": 0.4968,
+      "step": 830
+    },
+    {
+      "epoch": 2.5140291806958475,
+      "grad_norm": 0.6886877322873068,
+      "learning_rate": 5e-06,
+      "loss": 0.4951,
+      "step": 840
+    },
+    {
+      "epoch": 2.543958099513655,
+      "grad_norm": 0.710314562589874,
+      "learning_rate": 5e-06,
+      "loss": 0.498,
+      "step": 850
+    },
+    {
+      "epoch": 2.5738870183314626,
+      "grad_norm": 0.6994762876301733,
+      "learning_rate": 5e-06,
+      "loss": 0.4959,
+      "step": 860
+    },
+    {
+      "epoch": 2.6038159371492706,
+      "grad_norm": 0.7582356365854407,
+      "learning_rate": 5e-06,
+      "loss": 0.4938,
+      "step": 870
+    },
+    {
+      "epoch": 2.633744855967078,
+      "grad_norm": 0.731935619090177,
+      "learning_rate": 5e-06,
+      "loss": 0.4921,
+      "step": 880
+    },
+    {
+      "epoch": 2.6636737747848858,
+      "grad_norm": 0.74782144362319,
+      "learning_rate": 5e-06,
+      "loss": 0.4977,
+      "step": 890
+    },
+    {
+      "epoch": 2.6936026936026938,
+      "grad_norm": 0.6942188030457457,
+      "learning_rate": 5e-06,
+      "loss": 0.5011,
+      "step": 900
+    },
+    {
+      "epoch": 2.7235316124205013,
+      "grad_norm": 0.6881327401867374,
+      "learning_rate": 5e-06,
+      "loss": 0.4947,
+      "step": 910
+    },
+    {
+      "epoch": 2.753460531238309,
+      "grad_norm": 0.6856202337817314,
+      "learning_rate": 5e-06,
+      "loss": 0.4959,
+      "step": 920
+    },
+    {
+      "epoch": 2.7833894500561165,
+      "grad_norm": 0.7141040450174527,
+      "learning_rate": 5e-06,
+      "loss": 0.5016,
+      "step": 930
+    },
+    {
+      "epoch": 2.8133183688739245,
+      "grad_norm": 0.6857610208401852,
+      "learning_rate": 5e-06,
+      "loss": 0.4985,
+      "step": 940
+    },
+    {
+      "epoch": 2.843247287691732,
+      "grad_norm": 0.6698180625003869,
+      "learning_rate": 5e-06,
+      "loss": 0.4986,
+      "step": 950
+    },
+    {
+      "epoch": 2.87317620650954,
+      "grad_norm": 0.8039520213911328,
+      "learning_rate": 5e-06,
+      "loss": 0.502,
+      "step": 960
+    },
+    {
+      "epoch": 2.9031051253273477,
+      "grad_norm": 0.7415409936401505,
+      "learning_rate": 5e-06,
+      "loss": 0.4959,
+      "step": 970
+    },
+    {
+      "epoch": 2.9330340441451552,
+      "grad_norm": 0.7856625436324756,
+      "learning_rate": 5e-06,
+      "loss": 0.5023,
+      "step": 980
+    },
+    {
+      "epoch": 2.962962962962963,
+      "grad_norm": 0.761345605606732,
+      "learning_rate": 5e-06,
+      "loss": 0.5048,
+      "step": 990
+    },
+    {
+      "epoch": 2.992891881780771,
+      "grad_norm": 0.7302412373936236,
+      "learning_rate": 5e-06,
+      "loss": 0.5047,
+      "step": 1000
+    },
+    {
+      "epoch": 2.9988776655443323,
+      "eval_loss": 0.6354114413261414,
+      "eval_runtime": 269.2475,
+      "eval_samples_per_second": 33.43,
+      "eval_steps_per_second": 0.524,
+      "step": 1002
+    },
+    {
+      "epoch": 3.0228208005985784,
+      "grad_norm": 1.1087903273234458,
+      "learning_rate": 5e-06,
+      "loss": 0.4757,
+      "step": 1010
+    },
+    {
+      "epoch": 3.052749719416386,
+      "grad_norm": 0.8684090527265274,
+      "learning_rate": 5e-06,
+      "loss": 0.4189,
+      "step": 1020
+    },
+    {
+      "epoch": 3.082678638234194,
+      "grad_norm": 0.8005907139036144,
+      "learning_rate": 5e-06,
+      "loss": 0.42,
+      "step": 1030
+    },
+    {
+      "epoch": 3.1126075570520015,
+      "grad_norm": 0.7642262444187367,
+      "learning_rate": 5e-06,
+      "loss": 0.4184,
+      "step": 1040
+    },
+    {
+      "epoch": 3.142536475869809,
+      "grad_norm": 0.7766668581347643,
+      "learning_rate": 5e-06,
+      "loss": 0.4173,
+      "step": 1050
+    },
+    {
+      "epoch": 3.1724653946876167,
+      "grad_norm": 0.7661079163483049,
+      "learning_rate": 5e-06,
+      "loss": 0.426,
+      "step": 1060
+    },
+    {
+      "epoch": 3.2023943135054247,
+      "grad_norm": 0.8204327830042761,
+      "learning_rate": 5e-06,
+      "loss": 0.4192,
+      "step": 1070
+    },
+    {
+      "epoch": 3.2323232323232323,
+      "grad_norm": 0.7418029541161775,
+      "learning_rate": 5e-06,
+      "loss": 0.4248,
+      "step": 1080
+    },
+    {
+      "epoch": 3.2622521511410403,
+      "grad_norm": 0.7688720105233667,
+      "learning_rate": 5e-06,
+      "loss": 0.4309,
+      "step": 1090
+    },
+    {
+      "epoch": 3.292181069958848,
+      "grad_norm": 0.783937291766432,
+      "learning_rate": 5e-06,
+      "loss": 0.4265,
+      "step": 1100
+    },
+    {
+      "epoch": 3.3221099887766554,
+      "grad_norm": 0.7522379239239885,
+      "learning_rate": 5e-06,
+      "loss": 0.4247,
+      "step": 1110
+    },
+    {
+      "epoch": 3.352038907594463,
+      "grad_norm": 0.7694503676828072,
+      "learning_rate": 5e-06,
+      "loss": 0.4247,
+      "step": 1120
+    },
+    {
+      "epoch": 3.381967826412271,
+      "grad_norm": 0.7524030817183468,
+      "learning_rate": 5e-06,
+      "loss": 0.427,
+      "step": 1130
+    },
+    {
+      "epoch": 3.4118967452300786,
+      "grad_norm": 0.7702225573698022,
+      "learning_rate": 5e-06,
+      "loss": 0.4296,
+      "step": 1140
+    },
+    {
+      "epoch": 3.441825664047886,
+      "grad_norm": 0.7621134864427016,
+      "learning_rate": 5e-06,
+      "loss": 0.4272,
+      "step": 1150
+    },
+    {
+      "epoch": 3.471754582865694,
+      "grad_norm": 0.7536787003193455,
+      "learning_rate": 5e-06,
+      "loss": 0.4298,
+      "step": 1160
+    },
+    {
+      "epoch": 3.5016835016835017,
+      "grad_norm": 0.9135172996253528,
+      "learning_rate": 5e-06,
+      "loss": 0.4299,
+      "step": 1170
+    },
+    {
+      "epoch": 3.5316124205013093,
+      "grad_norm": 0.7993783814738175,
+      "learning_rate": 5e-06,
+      "loss": 0.4299,
+      "step": 1180
+    },
+    {
+      "epoch": 3.561541339319117,
+      "grad_norm": 0.7454852579268769,
+      "learning_rate": 5e-06,
+      "loss": 0.436,
+      "step": 1190
+    },
+    {
+      "epoch": 3.591470258136925,
+      "grad_norm": 0.7472361919942885,
+      "learning_rate": 5e-06,
+      "loss": 0.4307,
+      "step": 1200
+    },
+    {
+      "epoch": 3.6213991769547325,
+      "grad_norm": 0.7709040310826254,
+      "learning_rate": 5e-06,
+      "loss": 0.435,
+      "step": 1210
+    },
+    {
+      "epoch": 3.6513280957725405,
+      "grad_norm": 0.9162628098304457,
+      "learning_rate": 5e-06,
+      "loss": 0.4366,
+      "step": 1220
+    },
+    {
+      "epoch": 3.681257014590348,
+      "grad_norm": 0.748944264236295,
+      "learning_rate": 5e-06,
+      "loss": 0.4328,
+      "step": 1230
+    },
+    {
+      "epoch": 3.7111859334081556,
+      "grad_norm": 0.7943321429757056,
+      "learning_rate": 5e-06,
+      "loss": 0.4423,
+      "step": 1240
+    },
+    {
+      "epoch": 3.741114852225963,
+      "grad_norm": 0.7732334849903745,
+      "learning_rate": 5e-06,
+      "loss": 0.4348,
+      "step": 1250
+    },
+    {
+      "epoch": 3.771043771043771,
+      "grad_norm": 0.9404932255120924,
+      "learning_rate": 5e-06,
+      "loss": 0.4341,
+      "step": 1260
+    },
+    {
+      "epoch": 3.8009726898615788,
+      "grad_norm": 0.836108708076803,
+      "learning_rate": 5e-06,
+      "loss": 0.4417,
+      "step": 1270
+    },
+    {
+      "epoch": 3.8309016086793863,
+      "grad_norm": 0.708913428348208,
+      "learning_rate": 5e-06,
+      "loss": 0.4385,
+      "step": 1280
+    },
+    {
+      "epoch": 3.8608305274971944,
+      "grad_norm": 0.7741293535971421,
+      "learning_rate": 5e-06,
+      "loss": 0.4389,
+      "step": 1290
+    },
+    {
+      "epoch": 3.890759446315002,
+      "grad_norm": 0.7445598509368653,
+      "learning_rate": 5e-06,
+      "loss": 0.4371,
+      "step": 1300
+    },
+    {
+      "epoch": 3.9206883651328095,
+      "grad_norm": 0.9179953662967183,
+      "learning_rate": 5e-06,
+      "loss": 0.4394,
+      "step": 1310
+    },
+    {
+      "epoch": 3.950617283950617,
+      "grad_norm": 0.7659677690542608,
+      "learning_rate": 5e-06,
+      "loss": 0.4351,
+      "step": 1320
+    },
+    {
+      "epoch": 3.980546202768425,
+      "grad_norm": 0.827226367347478,
+      "learning_rate": 5e-06,
+      "loss": 0.4386,
+      "step": 1330
+    },
+    {
+      "epoch": 3.9985035540591096,
+      "eval_loss": 0.6797953248023987,
+      "eval_runtime": 268.7833,
+      "eval_samples_per_second": 33.488,
+      "eval_steps_per_second": 0.525,
+      "step": 1336
+    },
+    {
+      "epoch": 3.9985035540591096,
+      "step": 1336,
+      "total_flos": 5090631865073664.0,
+      "train_loss": 0.5299089724075294,
+      "train_runtime": 62096.6704,
+      "train_samples_per_second": 11.016,
+      "train_steps_per_second": 0.022
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1336,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5090631865073664.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed