End of training

Browse files

Files changed (7) hide show

README.md +2 -1
all_results.json +12 -0
eval_results.json +7 -0
train_results.json +8 -0
trainer_state.json +1095 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ license: apache-2.0
 base_model: mistralai/Mistral-7B-v0.1
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: hp_ablations_mistral_epoch3
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 # hp_ablations_mistral_epoch3
-This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.1461

 base_model: mistralai/Mistral-7B-v0.1
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: hp_ablations_mistral_epoch3
 # hp_ablations_mistral_epoch3
+This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the mlfoundations-dev/oh-dcft-v3.1-gpt-4o-mini dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.1461

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 2.9964556962025317,
+    "eval_loss": 0.14610780775547028,
+    "eval_runtime": 253.4559,
+    "eval_samples_per_second": 52.49,
+    "eval_steps_per_second": 0.41,
+    "total_flos": 2476961326694400.0,
+    "train_loss": 0.47900067526718665,
+    "train_runtime": 42754.8995,
+    "train_samples_per_second": 17.735,
+    "train_steps_per_second": 0.035
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 2.9964556962025317,
+    "eval_loss": 0.14610780775547028,
+    "eval_runtime": 253.4559,
+    "eval_samples_per_second": 52.49,
+    "eval_steps_per_second": 0.41
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 2.9964556962025317,
+    "total_flos": 2476961326694400.0,
+    "train_loss": 0.47900067526718665,
+    "train_runtime": 42754.8995,
+    "train_samples_per_second": 17.735,
+    "train_steps_per_second": 0.035
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1095 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.9964556962025317,
+  "eval_steps": 500,
+  "global_step": 1479,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.020253164556962026,
+      "grad_norm": 2.881640763497059,
+      "learning_rate": 5e-06,
+      "loss": 0.7569,
+      "step": 10
+    },
+    {
+      "epoch": 0.04050632911392405,
+      "grad_norm": 2.0337939479231464,
+      "learning_rate": 5e-06,
+      "loss": 0.6507,
+      "step": 20
+    },
+    {
+      "epoch": 0.060759493670886074,
+      "grad_norm": 2.673785425051781,
+      "learning_rate": 5e-06,
+      "loss": 0.6295,
+      "step": 30
+    },
+    {
+      "epoch": 0.0810126582278481,
+      "grad_norm": 3.3956320905602704,
+      "learning_rate": 5e-06,
+      "loss": 0.615,
+      "step": 40
+    },
+    {
+      "epoch": 0.10126582278481013,
+      "grad_norm": 2.4989053137375143,
+      "learning_rate": 5e-06,
+      "loss": 0.6055,
+      "step": 50
+    },
+    {
+      "epoch": 0.12151898734177215,
+      "grad_norm": 1.8362600995228775,
+      "learning_rate": 5e-06,
+      "loss": 0.598,
+      "step": 60
+    },
+    {
+      "epoch": 0.14177215189873418,
+      "grad_norm": 1.6812600364217105,
+      "learning_rate": 5e-06,
+      "loss": 0.5919,
+      "step": 70
+    },
+    {
+      "epoch": 0.1620253164556962,
+      "grad_norm": 1.7204729721312917,
+      "learning_rate": 5e-06,
+      "loss": 0.5885,
+      "step": 80
+    },
+    {
+      "epoch": 0.18227848101265823,
+      "grad_norm": 1.5280049113720315,
+      "learning_rate": 5e-06,
+      "loss": 0.5878,
+      "step": 90
+    },
+    {
+      "epoch": 0.20253164556962025,
+      "grad_norm": 2.0183706809796145,
+      "learning_rate": 5e-06,
+      "loss": 0.5912,
+      "step": 100
+    },
+    {
+      "epoch": 0.22278481012658227,
+      "grad_norm": 1.3728348794112124,
+      "learning_rate": 5e-06,
+      "loss": 0.5834,
+      "step": 110
+    },
+    {
+      "epoch": 0.2430379746835443,
+      "grad_norm": 1.815358729258882,
+      "learning_rate": 5e-06,
+      "loss": 0.5806,
+      "step": 120
+    },
+    {
+      "epoch": 0.26329113924050634,
+      "grad_norm": 1.6815731840354737,
+      "learning_rate": 5e-06,
+      "loss": 0.5793,
+      "step": 130
+    },
+    {
+      "epoch": 0.28354430379746837,
+      "grad_norm": 1.225903486666645,
+      "learning_rate": 5e-06,
+      "loss": 0.5684,
+      "step": 140
+    },
+    {
+      "epoch": 0.3037974683544304,
+      "grad_norm": 1.5251383950821966,
+      "learning_rate": 5e-06,
+      "loss": 0.5792,
+      "step": 150
+    },
+    {
+      "epoch": 0.3240506329113924,
+      "grad_norm": 2.6329119810915365,
+      "learning_rate": 5e-06,
+      "loss": 0.5782,
+      "step": 160
+    },
+    {
+      "epoch": 0.34430379746835443,
+      "grad_norm": 1.7167868191479003,
+      "learning_rate": 5e-06,
+      "loss": 0.5764,
+      "step": 170
+    },
+    {
+      "epoch": 0.36455696202531646,
+      "grad_norm": 1.73285303403399,
+      "learning_rate": 5e-06,
+      "loss": 0.5716,
+      "step": 180
+    },
+    {
+      "epoch": 0.3848101265822785,
+      "grad_norm": 1.3746990652975846,
+      "learning_rate": 5e-06,
+      "loss": 0.5719,
+      "step": 190
+    },
+    {
+      "epoch": 0.4050632911392405,
+      "grad_norm": 1.4936398849590744,
+      "learning_rate": 5e-06,
+      "loss": 0.5679,
+      "step": 200
+    },
+    {
+      "epoch": 0.4253164556962025,
+      "grad_norm": 1.2921614371273473,
+      "learning_rate": 5e-06,
+      "loss": 0.5636,
+      "step": 210
+    },
+    {
+      "epoch": 0.44556962025316454,
+      "grad_norm": 1.442945179409374,
+      "learning_rate": 5e-06,
+      "loss": 0.5684,
+      "step": 220
+    },
+    {
+      "epoch": 0.46582278481012657,
+      "grad_norm": 1.4946631499646226,
+      "learning_rate": 5e-06,
+      "loss": 0.567,
+      "step": 230
+    },
+    {
+      "epoch": 0.4860759493670886,
+      "grad_norm": 1.2982882675858225,
+      "learning_rate": 5e-06,
+      "loss": 0.5633,
+      "step": 240
+    },
+    {
+      "epoch": 0.5063291139240507,
+      "grad_norm": 1.201139136446853,
+      "learning_rate": 5e-06,
+      "loss": 0.5672,
+      "step": 250
+    },
+    {
+      "epoch": 0.5265822784810127,
+      "grad_norm": 1.221681422450315,
+      "learning_rate": 5e-06,
+      "loss": 0.5541,
+      "step": 260
+    },
+    {
+      "epoch": 0.5468354430379747,
+      "grad_norm": 1.3764079520580204,
+      "learning_rate": 5e-06,
+      "loss": 0.5634,
+      "step": 270
+    },
+    {
+      "epoch": 0.5670886075949367,
+      "grad_norm": 1.2769215201401323,
+      "learning_rate": 5e-06,
+      "loss": 0.5585,
+      "step": 280
+    },
+    {
+      "epoch": 0.5873417721518988,
+      "grad_norm": 1.658435241290474,
+      "learning_rate": 5e-06,
+      "loss": 0.5589,
+      "step": 290
+    },
+    {
+      "epoch": 0.6075949367088608,
+      "grad_norm": 1.3208774206825835,
+      "learning_rate": 5e-06,
+      "loss": 0.5584,
+      "step": 300
+    },
+    {
+      "epoch": 0.6278481012658228,
+      "grad_norm": 1.3732535087662892,
+      "learning_rate": 5e-06,
+      "loss": 0.5641,
+      "step": 310
+    },
+    {
+      "epoch": 0.6481012658227848,
+      "grad_norm": 1.7757004131781444,
+      "learning_rate": 5e-06,
+      "loss": 0.5602,
+      "step": 320
+    },
+    {
+      "epoch": 0.6683544303797468,
+      "grad_norm": 1.4157657120348899,
+      "learning_rate": 5e-06,
+      "loss": 0.5598,
+      "step": 330
+    },
+    {
+      "epoch": 0.6886075949367089,
+      "grad_norm": 1.2554752448180828,
+      "learning_rate": 5e-06,
+      "loss": 0.5592,
+      "step": 340
+    },
+    {
+      "epoch": 0.7088607594936709,
+      "grad_norm": 1.4796331594217829,
+      "learning_rate": 5e-06,
+      "loss": 0.5606,
+      "step": 350
+    },
+    {
+      "epoch": 0.7291139240506329,
+      "grad_norm": 1.2444877557169038,
+      "learning_rate": 5e-06,
+      "loss": 0.5627,
+      "step": 360
+    },
+    {
+      "epoch": 0.7493670886075949,
+      "grad_norm": 1.3700404053887212,
+      "learning_rate": 5e-06,
+      "loss": 0.5553,
+      "step": 370
+    },
+    {
+      "epoch": 0.769620253164557,
+      "grad_norm": 1.399982865718864,
+      "learning_rate": 5e-06,
+      "loss": 0.5523,
+      "step": 380
+    },
+    {
+      "epoch": 0.789873417721519,
+      "grad_norm": 1.7626359294815905,
+      "learning_rate": 5e-06,
+      "loss": 0.5554,
+      "step": 390
+    },
+    {
+      "epoch": 0.810126582278481,
+      "grad_norm": 1.2003485909233378,
+      "learning_rate": 5e-06,
+      "loss": 0.5554,
+      "step": 400
+    },
+    {
+      "epoch": 0.830379746835443,
+      "grad_norm": 1.1749218008708648,
+      "learning_rate": 5e-06,
+      "loss": 0.5499,
+      "step": 410
+    },
+    {
+      "epoch": 0.850632911392405,
+      "grad_norm": 1.1170828766358432,
+      "learning_rate": 5e-06,
+      "loss": 0.5583,
+      "step": 420
+    },
+    {
+      "epoch": 0.8708860759493671,
+      "grad_norm": 1.1913684385210768,
+      "learning_rate": 5e-06,
+      "loss": 0.552,
+      "step": 430
+    },
+    {
+      "epoch": 0.8911392405063291,
+      "grad_norm": 1.158491457002269,
+      "learning_rate": 5e-06,
+      "loss": 0.5576,
+      "step": 440
+    },
+    {
+      "epoch": 0.9113924050632911,
+      "grad_norm": 1.2440014645948327,
+      "learning_rate": 5e-06,
+      "loss": 0.5486,
+      "step": 450
+    },
+    {
+      "epoch": 0.9316455696202531,
+      "grad_norm": 1.0624082894155582,
+      "learning_rate": 5e-06,
+      "loss": 0.553,
+      "step": 460
+    },
+    {
+      "epoch": 0.9518987341772152,
+      "grad_norm": 1.3891566012880086,
+      "learning_rate": 5e-06,
+      "loss": 0.5537,
+      "step": 470
+    },
+    {
+      "epoch": 0.9721518987341772,
+      "grad_norm": 1.2000412125972895,
+      "learning_rate": 5e-06,
+      "loss": 0.5511,
+      "step": 480
+    },
+    {
+      "epoch": 0.9924050632911392,
+      "grad_norm": 1.0973875984510393,
+      "learning_rate": 5e-06,
+      "loss": 0.5475,
+      "step": 490
+    },
+    {
+      "epoch": 0.9984810126582279,
+      "eval_loss": 0.13758791983127594,
+      "eval_runtime": 254.1278,
+      "eval_samples_per_second": 52.352,
+      "eval_steps_per_second": 0.409,
+      "step": 493
+    },
+    {
+      "epoch": 1.0131645569620253,
+      "grad_norm": 1.8117531834466138,
+      "learning_rate": 5e-06,
+      "loss": 0.5044,
+      "step": 500
+    },
+    {
+      "epoch": 1.0334177215189873,
+      "grad_norm": 1.3053921965302375,
+      "learning_rate": 5e-06,
+      "loss": 0.4698,
+      "step": 510
+    },
+    {
+      "epoch": 1.0536708860759494,
+      "grad_norm": 1.1687624258615774,
+      "learning_rate": 5e-06,
+      "loss": 0.4664,
+      "step": 520
+    },
+    {
+      "epoch": 1.0739240506329113,
+      "grad_norm": 1.1072989424160442,
+      "learning_rate": 5e-06,
+      "loss": 0.4678,
+      "step": 530
+    },
+    {
+      "epoch": 1.0941772151898734,
+      "grad_norm": 1.2175965077481026,
+      "learning_rate": 5e-06,
+      "loss": 0.4676,
+      "step": 540
+    },
+    {
+      "epoch": 1.1144303797468353,
+      "grad_norm": 1.2541248804984217,
+      "learning_rate": 5e-06,
+      "loss": 0.4679,
+      "step": 550
+    },
+    {
+      "epoch": 1.1346835443037975,
+      "grad_norm": 1.2263184938833407,
+      "learning_rate": 5e-06,
+      "loss": 0.4652,
+      "step": 560
+    },
+    {
+      "epoch": 1.1549367088607596,
+      "grad_norm": 1.2335018260186486,
+      "learning_rate": 5e-06,
+      "loss": 0.4741,
+      "step": 570
+    },
+    {
+      "epoch": 1.1751898734177215,
+      "grad_norm": 1.1651843980725467,
+      "learning_rate": 5e-06,
+      "loss": 0.4698,
+      "step": 580
+    },
+    {
+      "epoch": 1.1954430379746834,
+      "grad_norm": 1.3299683670971845,
+      "learning_rate": 5e-06,
+      "loss": 0.4648,
+      "step": 590
+    },
+    {
+      "epoch": 1.2156962025316456,
+      "grad_norm": 1.5406403193173543,
+      "learning_rate": 5e-06,
+      "loss": 0.4728,
+      "step": 600
+    },
+    {
+      "epoch": 1.2359493670886077,
+      "grad_norm": 1.3994137841579308,
+      "learning_rate": 5e-06,
+      "loss": 0.4672,
+      "step": 610
+    },
+    {
+      "epoch": 1.2562025316455696,
+      "grad_norm": 1.4687039924759018,
+      "learning_rate": 5e-06,
+      "loss": 0.4722,
+      "step": 620
+    },
+    {
+      "epoch": 1.2764556962025315,
+      "grad_norm": 1.1767001591548043,
+      "learning_rate": 5e-06,
+      "loss": 0.4707,
+      "step": 630
+    },
+    {
+      "epoch": 1.2967088607594937,
+      "grad_norm": 1.377209586857746,
+      "learning_rate": 5e-06,
+      "loss": 0.4703,
+      "step": 640
+    },
+    {
+      "epoch": 1.3169620253164558,
+      "grad_norm": 1.3280966356632122,
+      "learning_rate": 5e-06,
+      "loss": 0.4743,
+      "step": 650
+    },
+    {
+      "epoch": 1.3372151898734177,
+      "grad_norm": 1.172194594770593,
+      "learning_rate": 5e-06,
+      "loss": 0.4625,
+      "step": 660
+    },
+    {
+      "epoch": 1.3574683544303796,
+      "grad_norm": 1.1815616623629874,
+      "learning_rate": 5e-06,
+      "loss": 0.4772,
+      "step": 670
+    },
+    {
+      "epoch": 1.3777215189873417,
+      "grad_norm": 1.2788117947587072,
+      "learning_rate": 5e-06,
+      "loss": 0.4748,
+      "step": 680
+    },
+    {
+      "epoch": 1.3979746835443039,
+      "grad_norm": 1.2120662888757698,
+      "learning_rate": 5e-06,
+      "loss": 0.4737,
+      "step": 690
+    },
+    {
+      "epoch": 1.4182278481012658,
+      "grad_norm": 1.196686900725073,
+      "learning_rate": 5e-06,
+      "loss": 0.4706,
+      "step": 700
+    },
+    {
+      "epoch": 1.438481012658228,
+      "grad_norm": 1.198300916867019,
+      "learning_rate": 5e-06,
+      "loss": 0.4766,
+      "step": 710
+    },
+    {
+      "epoch": 1.4587341772151898,
+      "grad_norm": 1.2100463670783117,
+      "learning_rate": 5e-06,
+      "loss": 0.4764,
+      "step": 720
+    },
+    {
+      "epoch": 1.478987341772152,
+      "grad_norm": 1.1505710467148873,
+      "learning_rate": 5e-06,
+      "loss": 0.4788,
+      "step": 730
+    },
+    {
+      "epoch": 1.4992405063291139,
+      "grad_norm": 1.1514955759509187,
+      "learning_rate": 5e-06,
+      "loss": 0.4729,
+      "step": 740
+    },
+    {
+      "epoch": 1.5194936708860758,
+      "grad_norm": 1.2756859859117555,
+      "learning_rate": 5e-06,
+      "loss": 0.4752,
+      "step": 750
+    },
+    {
+      "epoch": 1.539746835443038,
+      "grad_norm": 1.1669815187630872,
+      "learning_rate": 5e-06,
+      "loss": 0.4751,
+      "step": 760
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 1.1950794033571506,
+      "learning_rate": 5e-06,
+      "loss": 0.4745,
+      "step": 770
+    },
+    {
+      "epoch": 1.5802531645569622,
+      "grad_norm": 1.2518137614638525,
+      "learning_rate": 5e-06,
+      "loss": 0.4744,
+      "step": 780
+    },
+    {
+      "epoch": 1.600506329113924,
+      "grad_norm": 1.164886747372712,
+      "learning_rate": 5e-06,
+      "loss": 0.4778,
+      "step": 790
+    },
+    {
+      "epoch": 1.620759493670886,
+      "grad_norm": 1.3963551349425976,
+      "learning_rate": 5e-06,
+      "loss": 0.4749,
+      "step": 800
+    },
+    {
+      "epoch": 1.6410126582278481,
+      "grad_norm": 1.2170674697519668,
+      "learning_rate": 5e-06,
+      "loss": 0.4668,
+      "step": 810
+    },
+    {
+      "epoch": 1.6612658227848103,
+      "grad_norm": 1.2362608113225955,
+      "learning_rate": 5e-06,
+      "loss": 0.4755,
+      "step": 820
+    },
+    {
+      "epoch": 1.6815189873417722,
+      "grad_norm": 1.186974044583489,
+      "learning_rate": 5e-06,
+      "loss": 0.4709,
+      "step": 830
+    },
+    {
+      "epoch": 1.701772151898734,
+      "grad_norm": 1.1120094436131531,
+      "learning_rate": 5e-06,
+      "loss": 0.473,
+      "step": 840
+    },
+    {
+      "epoch": 1.7220253164556962,
+      "grad_norm": 1.1801749965556498,
+      "learning_rate": 5e-06,
+      "loss": 0.4759,
+      "step": 850
+    },
+    {
+      "epoch": 1.7422784810126584,
+      "grad_norm": 1.1457854356860961,
+      "learning_rate": 5e-06,
+      "loss": 0.4751,
+      "step": 860
+    },
+    {
+      "epoch": 1.7625316455696203,
+      "grad_norm": 1.096573157780436,
+      "learning_rate": 5e-06,
+      "loss": 0.476,
+      "step": 870
+    },
+    {
+      "epoch": 1.7827848101265822,
+      "grad_norm": 1.1968883544722357,
+      "learning_rate": 5e-06,
+      "loss": 0.48,
+      "step": 880
+    },
+    {
+      "epoch": 1.8030379746835443,
+      "grad_norm": 1.3462695661737496,
+      "learning_rate": 5e-06,
+      "loss": 0.477,
+      "step": 890
+    },
+    {
+      "epoch": 1.8232911392405065,
+      "grad_norm": 1.2981459167417944,
+      "learning_rate": 5e-06,
+      "loss": 0.4753,
+      "step": 900
+    },
+    {
+      "epoch": 1.8435443037974684,
+      "grad_norm": 1.4277375961136556,
+      "learning_rate": 5e-06,
+      "loss": 0.4771,
+      "step": 910
+    },
+    {
+      "epoch": 1.8637974683544303,
+      "grad_norm": 1.4551182850778734,
+      "learning_rate": 5e-06,
+      "loss": 0.477,
+      "step": 920
+    },
+    {
+      "epoch": 1.8840506329113924,
+      "grad_norm": 1.3405721129672685,
+      "learning_rate": 5e-06,
+      "loss": 0.4726,
+      "step": 930
+    },
+    {
+      "epoch": 1.9043037974683545,
+      "grad_norm": 1.2634029223509275,
+      "learning_rate": 5e-06,
+      "loss": 0.4749,
+      "step": 940
+    },
+    {
+      "epoch": 1.9245569620253165,
+      "grad_norm": 1.3160409770893926,
+      "learning_rate": 5e-06,
+      "loss": 0.4776,
+      "step": 950
+    },
+    {
+      "epoch": 1.9448101265822784,
+      "grad_norm": 1.0750739382136794,
+      "learning_rate": 5e-06,
+      "loss": 0.4816,
+      "step": 960
+    },
+    {
+      "epoch": 1.9650632911392405,
+      "grad_norm": 1.215667249435148,
+      "learning_rate": 5e-06,
+      "loss": 0.4789,
+      "step": 970
+    },
+    {
+      "epoch": 1.9853164556962026,
+      "grad_norm": 1.1020704754247843,
+      "learning_rate": 5e-06,
+      "loss": 0.4805,
+      "step": 980
+    },
+    {
+      "epoch": 1.999493670886076,
+      "eval_loss": 0.13782347738742828,
+      "eval_runtime": 255.1699,
+      "eval_samples_per_second": 52.138,
+      "eval_steps_per_second": 0.408,
+      "step": 987
+    },
+    {
+      "epoch": 2.0060759493670886,
+      "grad_norm": 2.2320392744750532,
+      "learning_rate": 5e-06,
+      "loss": 0.4496,
+      "step": 990
+    },
+    {
+      "epoch": 2.0263291139240507,
+      "grad_norm": 1.6264522271278852,
+      "learning_rate": 5e-06,
+      "loss": 0.3864,
+      "step": 1000
+    },
+    {
+      "epoch": 2.046582278481013,
+      "grad_norm": 1.4022257575865094,
+      "learning_rate": 5e-06,
+      "loss": 0.3795,
+      "step": 1010
+    },
+    {
+      "epoch": 2.0668354430379745,
+      "grad_norm": 1.5383595004674644,
+      "learning_rate": 5e-06,
+      "loss": 0.377,
+      "step": 1020
+    },
+    {
+      "epoch": 2.0870886075949366,
+      "grad_norm": 1.3513330118475093,
+      "learning_rate": 5e-06,
+      "loss": 0.3767,
+      "step": 1030
+    },
+    {
+      "epoch": 2.1073417721518988,
+      "grad_norm": 1.3272367266437448,
+      "learning_rate": 5e-06,
+      "loss": 0.3739,
+      "step": 1040
+    },
+    {
+      "epoch": 2.127594936708861,
+      "grad_norm": 1.3073032398087094,
+      "learning_rate": 5e-06,
+      "loss": 0.3846,
+      "step": 1050
+    },
+    {
+      "epoch": 2.1478481012658226,
+      "grad_norm": 1.5607458954840778,
+      "learning_rate": 5e-06,
+      "loss": 0.3804,
+      "step": 1060
+    },
+    {
+      "epoch": 2.1681012658227847,
+      "grad_norm": 1.3955276924826612,
+      "learning_rate": 5e-06,
+      "loss": 0.382,
+      "step": 1070
+    },
+    {
+      "epoch": 2.188354430379747,
+      "grad_norm": 1.5667776972963814,
+      "learning_rate": 5e-06,
+      "loss": 0.3815,
+      "step": 1080
+    },
+    {
+      "epoch": 2.208607594936709,
+      "grad_norm": 1.4260082058645476,
+      "learning_rate": 5e-06,
+      "loss": 0.3787,
+      "step": 1090
+    },
+    {
+      "epoch": 2.2288607594936707,
+      "grad_norm": 1.5562857213046708,
+      "learning_rate": 5e-06,
+      "loss": 0.3876,
+      "step": 1100
+    },
+    {
+      "epoch": 2.249113924050633,
+      "grad_norm": 1.9917004448182953,
+      "learning_rate": 5e-06,
+      "loss": 0.3856,
+      "step": 1110
+    },
+    {
+      "epoch": 2.269367088607595,
+      "grad_norm": 2.388764780653022,
+      "learning_rate": 5e-06,
+      "loss": 0.3843,
+      "step": 1120
+    },
+    {
+      "epoch": 2.289620253164557,
+      "grad_norm": 1.974750270308368,
+      "learning_rate": 5e-06,
+      "loss": 0.3873,
+      "step": 1130
+    },
+    {
+      "epoch": 2.309873417721519,
+      "grad_norm": 1.447456780610406,
+      "learning_rate": 5e-06,
+      "loss": 0.381,
+      "step": 1140
+    },
+    {
+      "epoch": 2.330126582278481,
+      "grad_norm": 1.3927022295027611,
+      "learning_rate": 5e-06,
+      "loss": 0.3869,
+      "step": 1150
+    },
+    {
+      "epoch": 2.350379746835443,
+      "grad_norm": 1.215373362438277,
+      "learning_rate": 5e-06,
+      "loss": 0.3871,
+      "step": 1160
+    },
+    {
+      "epoch": 2.370632911392405,
+      "grad_norm": 1.4491184946598725,
+      "learning_rate": 5e-06,
+      "loss": 0.3888,
+      "step": 1170
+    },
+    {
+      "epoch": 2.390886075949367,
+      "grad_norm": 1.4189426863567351,
+      "learning_rate": 5e-06,
+      "loss": 0.3901,
+      "step": 1180
+    },
+    {
+      "epoch": 2.411139240506329,
+      "grad_norm": 1.3984787319231367,
+      "learning_rate": 5e-06,
+      "loss": 0.3874,
+      "step": 1190
+    },
+    {
+      "epoch": 2.431392405063291,
+      "grad_norm": 1.5285728590269927,
+      "learning_rate": 5e-06,
+      "loss": 0.3885,
+      "step": 1200
+    },
+    {
+      "epoch": 2.4516455696202533,
+      "grad_norm": 1.6526724638756385,
+      "learning_rate": 5e-06,
+      "loss": 0.3866,
+      "step": 1210
+    },
+    {
+      "epoch": 2.4718987341772154,
+      "grad_norm": 1.7189706408748655,
+      "learning_rate": 5e-06,
+      "loss": 0.3892,
+      "step": 1220
+    },
+    {
+      "epoch": 2.492151898734177,
+      "grad_norm": 1.3142654182229083,
+      "learning_rate": 5e-06,
+      "loss": 0.3914,
+      "step": 1230
+    },
+    {
+      "epoch": 2.512405063291139,
+      "grad_norm": 1.735979097692375,
+      "learning_rate": 5e-06,
+      "loss": 0.3895,
+      "step": 1240
+    },
+    {
+      "epoch": 2.5326582278481014,
+      "grad_norm": 1.8356865092298218,
+      "learning_rate": 5e-06,
+      "loss": 0.3876,
+      "step": 1250
+    },
+    {
+      "epoch": 2.552911392405063,
+      "grad_norm": 1.2680188524532179,
+      "learning_rate": 5e-06,
+      "loss": 0.3808,
+      "step": 1260
+    },
+    {
+      "epoch": 2.573164556962025,
+      "grad_norm": 1.4512168828739789,
+      "learning_rate": 5e-06,
+      "loss": 0.3891,
+      "step": 1270
+    },
+    {
+      "epoch": 2.5934177215189873,
+      "grad_norm": 1.3295995678864736,
+      "learning_rate": 5e-06,
+      "loss": 0.3921,
+      "step": 1280
+    },
+    {
+      "epoch": 2.6136708860759494,
+      "grad_norm": 1.3912435194150836,
+      "learning_rate": 5e-06,
+      "loss": 0.3958,
+      "step": 1290
+    },
+    {
+      "epoch": 2.6339240506329116,
+      "grad_norm": 1.3175096081971012,
+      "learning_rate": 5e-06,
+      "loss": 0.3915,
+      "step": 1300
+    },
+    {
+      "epoch": 2.6541772151898733,
+      "grad_norm": 1.5566817774133672,
+      "learning_rate": 5e-06,
+      "loss": 0.3917,
+      "step": 1310
+    },
+    {
+      "epoch": 2.6744303797468354,
+      "grad_norm": 1.3878720965690428,
+      "learning_rate": 5e-06,
+      "loss": 0.3945,
+      "step": 1320
+    },
+    {
+      "epoch": 2.6946835443037975,
+      "grad_norm": 1.3785817403015144,
+      "learning_rate": 5e-06,
+      "loss": 0.3953,
+      "step": 1330
+    },
+    {
+      "epoch": 2.714936708860759,
+      "grad_norm": 1.4183014258875806,
+      "learning_rate": 5e-06,
+      "loss": 0.3974,
+      "step": 1340
+    },
+    {
+      "epoch": 2.7351898734177214,
+      "grad_norm": 1.3491695029291941,
+      "learning_rate": 5e-06,
+      "loss": 0.3946,
+      "step": 1350
+    },
+    {
+      "epoch": 2.7554430379746835,
+      "grad_norm": 1.2428699126707001,
+      "learning_rate": 5e-06,
+      "loss": 0.3904,
+      "step": 1360
+    },
+    {
+      "epoch": 2.7756962025316456,
+      "grad_norm": 1.433548321717136,
+      "learning_rate": 5e-06,
+      "loss": 0.3973,
+      "step": 1370
+    },
+    {
+      "epoch": 2.7959493670886078,
+      "grad_norm": 1.5285229294068068,
+      "learning_rate": 5e-06,
+      "loss": 0.395,
+      "step": 1380
+    },
+    {
+      "epoch": 2.81620253164557,
+      "grad_norm": 1.3717745770912413,
+      "learning_rate": 5e-06,
+      "loss": 0.3975,
+      "step": 1390
+    },
+    {
+      "epoch": 2.8364556962025316,
+      "grad_norm": 1.6107209557365296,
+      "learning_rate": 5e-06,
+      "loss": 0.3919,
+      "step": 1400
+    },
+    {
+      "epoch": 2.8567088607594937,
+      "grad_norm": 1.4417849324011554,
+      "learning_rate": 5e-06,
+      "loss": 0.3977,
+      "step": 1410
+    },
+    {
+      "epoch": 2.876962025316456,
+      "grad_norm": 1.3890939860087244,
+      "learning_rate": 5e-06,
+      "loss": 0.3945,
+      "step": 1420
+    },
+    {
+      "epoch": 2.8972151898734175,
+      "grad_norm": 1.340958614241595,
+      "learning_rate": 5e-06,
+      "loss": 0.3944,
+      "step": 1430
+    },
+    {
+      "epoch": 2.9174683544303797,
+      "grad_norm": 1.6057483136323554,
+      "learning_rate": 5e-06,
+      "loss": 0.3974,
+      "step": 1440
+    },
+    {
+      "epoch": 2.937721518987342,
+      "grad_norm": 1.3480539514146224,
+      "learning_rate": 5e-06,
+      "loss": 0.4065,
+      "step": 1450
+    },
+    {
+      "epoch": 2.957974683544304,
+      "grad_norm": 1.4215727797263549,
+      "learning_rate": 5e-06,
+      "loss": 0.3949,
+      "step": 1460
+    },
+    {
+      "epoch": 2.978227848101266,
+      "grad_norm": 1.3718015666487833,
+      "learning_rate": 5e-06,
+      "loss": 0.3987,
+      "step": 1470
+    },
+    {
+      "epoch": 2.9964556962025317,
+      "eval_loss": 0.14610780775547028,
+      "eval_runtime": 253.7654,
+      "eval_samples_per_second": 52.426,
+      "eval_steps_per_second": 0.41,
+      "step": 1479
+    },
+    {
+      "epoch": 2.9964556962025317,
+      "step": 1479,
+      "total_flos": 2476961326694400.0,
+      "train_loss": 0.47900067526718665,
+      "train_runtime": 42754.8995,
+      "train_samples_per_second": 17.735,
+      "train_steps_per_second": 0.035
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1479,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2476961326694400.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed