End of training

Browse files

Files changed (7) hide show

README.md +2 -1
all_results.json +12 -0
eval_results.json +7 -0
train_results.json +8 -0
trainer_state.json +1270 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ license: llama3.1
 base_model: meta-llama/Meta-Llama-3.1-8B
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: oh_v1.3_slim_orca_x4
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 # oh_v1.3_slim_orca_x4
-This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.7230

 base_model: meta-llama/Meta-Llama-3.1-8B
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: oh_v1.3_slim_orca_x4
 # oh_v1.3_slim_orca_x4
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on the mlfoundations-dev/oh_v1.3_slim_orca_x4 dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.7230

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 2.999346832135859,
+    "eval_loss": 0.7229765057563782,
+    "eval_runtime": 388.6577,
+    "eval_samples_per_second": 39.804,
+    "eval_steps_per_second": 0.623,
+    "total_flos": 2884204756992000.0,
+    "train_loss": 0.6974312729536448,
+    "train_runtime": 56418.3982,
+    "train_samples_per_second": 15.629,
+    "train_steps_per_second": 0.031
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 2.999346832135859,
+    "eval_loss": 0.7229765057563782,
+    "eval_runtime": 388.6577,
+    "eval_samples_per_second": 39.804,
+    "eval_steps_per_second": 0.623
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 2.999346832135859,
+    "total_flos": 2884204756992000.0,
+    "train_loss": 0.6974312729536448,
+    "train_runtime": 56418.3982,
+    "train_samples_per_second": 15.629,
+    "train_steps_per_second": 0.031
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1270 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.999346832135859,
+  "eval_steps": 500,
+  "global_step": 1722,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.017417809710428913,
+      "grad_norm": 10.378825586435497,
+      "learning_rate": 5e-06,
+      "loss": 1.0838,
+      "step": 10
+    },
+    {
+      "epoch": 0.034835619420857826,
+      "grad_norm": 4.271546912407876,
+      "learning_rate": 5e-06,
+      "loss": 0.9373,
+      "step": 20
+    },
+    {
+      "epoch": 0.05225342913128674,
+      "grad_norm": 2.1390120781164823,
+      "learning_rate": 5e-06,
+      "loss": 0.8909,
+      "step": 30
+    },
+    {
+      "epoch": 0.06967123884171565,
+      "grad_norm": 1.4309168874054254,
+      "learning_rate": 5e-06,
+      "loss": 0.8639,
+      "step": 40
+    },
+    {
+      "epoch": 0.08708904855214457,
+      "grad_norm": 1.5406276876095664,
+      "learning_rate": 5e-06,
+      "loss": 0.8433,
+      "step": 50
+    },
+    {
+      "epoch": 0.10450685826257348,
+      "grad_norm": 1.2501465404617618,
+      "learning_rate": 5e-06,
+      "loss": 0.8247,
+      "step": 60
+    },
+    {
+      "epoch": 0.1219246679730024,
+      "grad_norm": 0.9239834135671998,
+      "learning_rate": 5e-06,
+      "loss": 0.8126,
+      "step": 70
+    },
+    {
+      "epoch": 0.1393424776834313,
+      "grad_norm": 0.8433941860985329,
+      "learning_rate": 5e-06,
+      "loss": 0.8046,
+      "step": 80
+    },
+    {
+      "epoch": 0.15676028739386022,
+      "grad_norm": 1.097647182335265,
+      "learning_rate": 5e-06,
+      "loss": 0.796,
+      "step": 90
+    },
+    {
+      "epoch": 0.17417809710428914,
+      "grad_norm": 1.0683416519694173,
+      "learning_rate": 5e-06,
+      "loss": 0.7903,
+      "step": 100
+    },
+    {
+      "epoch": 0.19159590681471805,
+      "grad_norm": 1.572717136046009,
+      "learning_rate": 5e-06,
+      "loss": 0.7846,
+      "step": 110
+    },
+    {
+      "epoch": 0.20901371652514697,
+      "grad_norm": 0.9249575894994505,
+      "learning_rate": 5e-06,
+      "loss": 0.7806,
+      "step": 120
+    },
+    {
+      "epoch": 0.2264315262355759,
+      "grad_norm": 0.8193084838390464,
+      "learning_rate": 5e-06,
+      "loss": 0.7783,
+      "step": 130
+    },
+    {
+      "epoch": 0.2438493359460048,
+      "grad_norm": 0.8732647330500742,
+      "learning_rate": 5e-06,
+      "loss": 0.7725,
+      "step": 140
+    },
+    {
+      "epoch": 0.2612671456564337,
+      "grad_norm": 0.7448079044745952,
+      "learning_rate": 5e-06,
+      "loss": 0.773,
+      "step": 150
+    },
+    {
+      "epoch": 0.2786849553668626,
+      "grad_norm": 0.9898862055097742,
+      "learning_rate": 5e-06,
+      "loss": 0.7695,
+      "step": 160
+    },
+    {
+      "epoch": 0.29610276507729155,
+      "grad_norm": 0.7272776397879028,
+      "learning_rate": 5e-06,
+      "loss": 0.7705,
+      "step": 170
+    },
+    {
+      "epoch": 0.31352057478772044,
+      "grad_norm": 1.0826222911505594,
+      "learning_rate": 5e-06,
+      "loss": 0.7737,
+      "step": 180
+    },
+    {
+      "epoch": 0.3309383844981494,
+      "grad_norm": 0.6748830770235253,
+      "learning_rate": 5e-06,
+      "loss": 0.7611,
+      "step": 190
+    },
+    {
+      "epoch": 0.3483561942085783,
+      "grad_norm": 0.7359625796839507,
+      "learning_rate": 5e-06,
+      "loss": 0.7609,
+      "step": 200
+    },
+    {
+      "epoch": 0.36577400391900716,
+      "grad_norm": 0.6962279114859433,
+      "learning_rate": 5e-06,
+      "loss": 0.756,
+      "step": 210
+    },
+    {
+      "epoch": 0.3831918136294361,
+      "grad_norm": 0.6255149151141138,
+      "learning_rate": 5e-06,
+      "loss": 0.7617,
+      "step": 220
+    },
+    {
+      "epoch": 0.400609623339865,
+      "grad_norm": 0.7052369366480614,
+      "learning_rate": 5e-06,
+      "loss": 0.7569,
+      "step": 230
+    },
+    {
+      "epoch": 0.41802743305029394,
+      "grad_norm": 0.6097155207889393,
+      "learning_rate": 5e-06,
+      "loss": 0.7546,
+      "step": 240
+    },
+    {
+      "epoch": 0.43544524276072283,
+      "grad_norm": 0.6836967619335058,
+      "learning_rate": 5e-06,
+      "loss": 0.7613,
+      "step": 250
+    },
+    {
+      "epoch": 0.4528630524711518,
+      "grad_norm": 0.5747669710959988,
+      "learning_rate": 5e-06,
+      "loss": 0.7568,
+      "step": 260
+    },
+    {
+      "epoch": 0.47028086218158066,
+      "grad_norm": 0.8293746415280457,
+      "learning_rate": 5e-06,
+      "loss": 0.755,
+      "step": 270
+    },
+    {
+      "epoch": 0.4876986718920096,
+      "grad_norm": 0.6240762658588678,
+      "learning_rate": 5e-06,
+      "loss": 0.7512,
+      "step": 280
+    },
+    {
+      "epoch": 0.5051164816024385,
+      "grad_norm": 0.604817000368227,
+      "learning_rate": 5e-06,
+      "loss": 0.7565,
+      "step": 290
+    },
+    {
+      "epoch": 0.5225342913128674,
+      "grad_norm": 0.7975782732234052,
+      "learning_rate": 5e-06,
+      "loss": 0.7536,
+      "step": 300
+    },
+    {
+      "epoch": 0.5399521010232963,
+      "grad_norm": 0.9097401207787359,
+      "learning_rate": 5e-06,
+      "loss": 0.7449,
+      "step": 310
+    },
+    {
+      "epoch": 0.5573699107337252,
+      "grad_norm": 0.5850865591811083,
+      "learning_rate": 5e-06,
+      "loss": 0.7476,
+      "step": 320
+    },
+    {
+      "epoch": 0.5747877204441542,
+      "grad_norm": 0.7135182489392279,
+      "learning_rate": 5e-06,
+      "loss": 0.7446,
+      "step": 330
+    },
+    {
+      "epoch": 0.5922055301545831,
+      "grad_norm": 0.7289623389467091,
+      "learning_rate": 5e-06,
+      "loss": 0.7461,
+      "step": 340
+    },
+    {
+      "epoch": 0.6096233398650119,
+      "grad_norm": 0.7209085909740518,
+      "learning_rate": 5e-06,
+      "loss": 0.7441,
+      "step": 350
+    },
+    {
+      "epoch": 0.6270411495754409,
+      "grad_norm": 0.7124009797840823,
+      "learning_rate": 5e-06,
+      "loss": 0.7426,
+      "step": 360
+    },
+    {
+      "epoch": 0.6444589592858698,
+      "grad_norm": 0.7812743047985142,
+      "learning_rate": 5e-06,
+      "loss": 0.7425,
+      "step": 370
+    },
+    {
+      "epoch": 0.6618767689962988,
+      "grad_norm": 0.9348863468860681,
+      "learning_rate": 5e-06,
+      "loss": 0.7426,
+      "step": 380
+    },
+    {
+      "epoch": 0.6792945787067276,
+      "grad_norm": 0.8887235387086234,
+      "learning_rate": 5e-06,
+      "loss": 0.7443,
+      "step": 390
+    },
+    {
+      "epoch": 0.6967123884171565,
+      "grad_norm": 0.5644550564287659,
+      "learning_rate": 5e-06,
+      "loss": 0.7462,
+      "step": 400
+    },
+    {
+      "epoch": 0.7141301981275855,
+      "grad_norm": 0.7133759479394518,
+      "learning_rate": 5e-06,
+      "loss": 0.7392,
+      "step": 410
+    },
+    {
+      "epoch": 0.7315480078380143,
+      "grad_norm": 0.6322425713640688,
+      "learning_rate": 5e-06,
+      "loss": 0.7438,
+      "step": 420
+    },
+    {
+      "epoch": 0.7489658175484433,
+      "grad_norm": 0.653356418355363,
+      "learning_rate": 5e-06,
+      "loss": 0.7353,
+      "step": 430
+    },
+    {
+      "epoch": 0.7663836272588722,
+      "grad_norm": 0.7172550980138845,
+      "learning_rate": 5e-06,
+      "loss": 0.7362,
+      "step": 440
+    },
+    {
+      "epoch": 0.7838014369693012,
+      "grad_norm": 0.5961758862117608,
+      "learning_rate": 5e-06,
+      "loss": 0.7374,
+      "step": 450
+    },
+    {
+      "epoch": 0.80121924667973,
+      "grad_norm": 0.682099366244341,
+      "learning_rate": 5e-06,
+      "loss": 0.7361,
+      "step": 460
+    },
+    {
+      "epoch": 0.8186370563901589,
+      "grad_norm": 0.6035807917106119,
+      "learning_rate": 5e-06,
+      "loss": 0.7394,
+      "step": 470
+    },
+    {
+      "epoch": 0.8360548661005879,
+      "grad_norm": 0.6420432430762913,
+      "learning_rate": 5e-06,
+      "loss": 0.7427,
+      "step": 480
+    },
+    {
+      "epoch": 0.8534726758110167,
+      "grad_norm": 0.6771119733640588,
+      "learning_rate": 5e-06,
+      "loss": 0.7423,
+      "step": 490
+    },
+    {
+      "epoch": 0.8708904855214457,
+      "grad_norm": 0.7126075005956054,
+      "learning_rate": 5e-06,
+      "loss": 0.7383,
+      "step": 500
+    },
+    {
+      "epoch": 0.8883082952318746,
+      "grad_norm": 0.7584360793951261,
+      "learning_rate": 5e-06,
+      "loss": 0.7321,
+      "step": 510
+    },
+    {
+      "epoch": 0.9057261049423035,
+      "grad_norm": 0.7439364499749611,
+      "learning_rate": 5e-06,
+      "loss": 0.7371,
+      "step": 520
+    },
+    {
+      "epoch": 0.9231439146527324,
+      "grad_norm": 0.6529063351558732,
+      "learning_rate": 5e-06,
+      "loss": 0.74,
+      "step": 530
+    },
+    {
+      "epoch": 0.9405617243631613,
+      "grad_norm": 0.6579238997317206,
+      "learning_rate": 5e-06,
+      "loss": 0.7339,
+      "step": 540
+    },
+    {
+      "epoch": 0.9579795340735903,
+      "grad_norm": 0.6529747723949795,
+      "learning_rate": 5e-06,
+      "loss": 0.7366,
+      "step": 550
+    },
+    {
+      "epoch": 0.9753973437840192,
+      "grad_norm": 0.7465822819939318,
+      "learning_rate": 5e-06,
+      "loss": 0.7329,
+      "step": 560
+    },
+    {
+      "epoch": 0.992815153494448,
+      "grad_norm": 0.6744574449718616,
+      "learning_rate": 5e-06,
+      "loss": 0.7336,
+      "step": 570
+    },
+    {
+      "epoch": 0.9997822773786197,
+      "eval_loss": 0.7317857146263123,
+      "eval_runtime": 402.0076,
+      "eval_samples_per_second": 38.482,
+      "eval_steps_per_second": 0.602,
+      "step": 574
+    },
+    {
+      "epoch": 1.010232963204877,
+      "grad_norm": 0.7094322700801018,
+      "learning_rate": 5e-06,
+      "loss": 0.763,
+      "step": 580
+    },
+    {
+      "epoch": 1.027650772915306,
+      "grad_norm": 0.619008567608486,
+      "learning_rate": 5e-06,
+      "loss": 0.6821,
+      "step": 590
+    },
+    {
+      "epoch": 1.0450685826257349,
+      "grad_norm": 0.7720080238431899,
+      "learning_rate": 5e-06,
+      "loss": 0.6814,
+      "step": 600
+    },
+    {
+      "epoch": 1.0624863923361638,
+      "grad_norm": 0.613074115463787,
+      "learning_rate": 5e-06,
+      "loss": 0.6871,
+      "step": 610
+    },
+    {
+      "epoch": 1.0799042020465925,
+      "grad_norm": 0.8529977128112323,
+      "learning_rate": 5e-06,
+      "loss": 0.6916,
+      "step": 620
+    },
+    {
+      "epoch": 1.0973220117570215,
+      "grad_norm": 0.6185916445154745,
+      "learning_rate": 5e-06,
+      "loss": 0.6852,
+      "step": 630
+    },
+    {
+      "epoch": 1.1147398214674504,
+      "grad_norm": 0.5960552407712587,
+      "learning_rate": 5e-06,
+      "loss": 0.6902,
+      "step": 640
+    },
+    {
+      "epoch": 1.1321576311778794,
+      "grad_norm": 0.6992739712457544,
+      "learning_rate": 5e-06,
+      "loss": 0.6838,
+      "step": 650
+    },
+    {
+      "epoch": 1.1495754408883083,
+      "grad_norm": 0.6896093508843089,
+      "learning_rate": 5e-06,
+      "loss": 0.6883,
+      "step": 660
+    },
+    {
+      "epoch": 1.1669932505987373,
+      "grad_norm": 0.7618446264021529,
+      "learning_rate": 5e-06,
+      "loss": 0.6866,
+      "step": 670
+    },
+    {
+      "epoch": 1.1844110603091662,
+      "grad_norm": 0.5993982811906263,
+      "learning_rate": 5e-06,
+      "loss": 0.6864,
+      "step": 680
+    },
+    {
+      "epoch": 1.201828870019595,
+      "grad_norm": 0.7774490221678223,
+      "learning_rate": 5e-06,
+      "loss": 0.6904,
+      "step": 690
+    },
+    {
+      "epoch": 1.2192466797300239,
+      "grad_norm": 0.7209322592758776,
+      "learning_rate": 5e-06,
+      "loss": 0.6843,
+      "step": 700
+    },
+    {
+      "epoch": 1.2366644894404528,
+      "grad_norm": 0.5933996592162579,
+      "learning_rate": 5e-06,
+      "loss": 0.6858,
+      "step": 710
+    },
+    {
+      "epoch": 1.2540822991508818,
+      "grad_norm": 0.603096793298619,
+      "learning_rate": 5e-06,
+      "loss": 0.6895,
+      "step": 720
+    },
+    {
+      "epoch": 1.2715001088613107,
+      "grad_norm": 0.6254264453133531,
+      "learning_rate": 5e-06,
+      "loss": 0.6848,
+      "step": 730
+    },
+    {
+      "epoch": 1.2889179185717397,
+      "grad_norm": 0.6255696330381725,
+      "learning_rate": 5e-06,
+      "loss": 0.6846,
+      "step": 740
+    },
+    {
+      "epoch": 1.3063357282821686,
+      "grad_norm": 0.5954700919121718,
+      "learning_rate": 5e-06,
+      "loss": 0.6857,
+      "step": 750
+    },
+    {
+      "epoch": 1.3237535379925975,
+      "grad_norm": 0.605017152001749,
+      "learning_rate": 5e-06,
+      "loss": 0.6833,
+      "step": 760
+    },
+    {
+      "epoch": 1.3411713477030263,
+      "grad_norm": 0.6374571812243335,
+      "learning_rate": 5e-06,
+      "loss": 0.6835,
+      "step": 770
+    },
+    {
+      "epoch": 1.3585891574134552,
+      "grad_norm": 0.7034685309687484,
+      "learning_rate": 5e-06,
+      "loss": 0.6862,
+      "step": 780
+    },
+    {
+      "epoch": 1.3760069671238842,
+      "grad_norm": 0.6478857838671078,
+      "learning_rate": 5e-06,
+      "loss": 0.6834,
+      "step": 790
+    },
+    {
+      "epoch": 1.393424776834313,
+      "grad_norm": 0.6774648068840645,
+      "learning_rate": 5e-06,
+      "loss": 0.6835,
+      "step": 800
+    },
+    {
+      "epoch": 1.410842586544742,
+      "grad_norm": 0.7999211706624636,
+      "learning_rate": 5e-06,
+      "loss": 0.6849,
+      "step": 810
+    },
+    {
+      "epoch": 1.428260396255171,
+      "grad_norm": 0.7093553520320318,
+      "learning_rate": 5e-06,
+      "loss": 0.6874,
+      "step": 820
+    },
+    {
+      "epoch": 1.4456782059655997,
+      "grad_norm": 0.6339150451411103,
+      "learning_rate": 5e-06,
+      "loss": 0.6817,
+      "step": 830
+    },
+    {
+      "epoch": 1.4630960156760286,
+      "grad_norm": 0.6378354921425449,
+      "learning_rate": 5e-06,
+      "loss": 0.687,
+      "step": 840
+    },
+    {
+      "epoch": 1.4805138253864576,
+      "grad_norm": 0.6060950779002315,
+      "learning_rate": 5e-06,
+      "loss": 0.6852,
+      "step": 850
+    },
+    {
+      "epoch": 1.4979316350968865,
+      "grad_norm": 0.7027258672686724,
+      "learning_rate": 5e-06,
+      "loss": 0.6866,
+      "step": 860
+    },
+    {
+      "epoch": 1.5153494448073155,
+      "grad_norm": 0.6241403475122037,
+      "learning_rate": 5e-06,
+      "loss": 0.6831,
+      "step": 870
+    },
+    {
+      "epoch": 1.5327672545177444,
+      "grad_norm": 0.7090161192507007,
+      "learning_rate": 5e-06,
+      "loss": 0.6827,
+      "step": 880
+    },
+    {
+      "epoch": 1.5501850642281734,
+      "grad_norm": 0.6858971984003965,
+      "learning_rate": 5e-06,
+      "loss": 0.6792,
+      "step": 890
+    },
+    {
+      "epoch": 1.5676028739386023,
+      "grad_norm": 0.6546584740097385,
+      "learning_rate": 5e-06,
+      "loss": 0.6806,
+      "step": 900
+    },
+    {
+      "epoch": 1.5850206836490313,
+      "grad_norm": 0.6020087571697199,
+      "learning_rate": 5e-06,
+      "loss": 0.6799,
+      "step": 910
+    },
+    {
+      "epoch": 1.6024384933594602,
+      "grad_norm": 0.6501182817858244,
+      "learning_rate": 5e-06,
+      "loss": 0.6857,
+      "step": 920
+    },
+    {
+      "epoch": 1.619856303069889,
+      "grad_norm": 0.584870256422628,
+      "learning_rate": 5e-06,
+      "loss": 0.6803,
+      "step": 930
+    },
+    {
+      "epoch": 1.6372741127803179,
+      "grad_norm": 0.6317587237334846,
+      "learning_rate": 5e-06,
+      "loss": 0.6855,
+      "step": 940
+    },
+    {
+      "epoch": 1.6546919224907468,
+      "grad_norm": 0.5912078658288651,
+      "learning_rate": 5e-06,
+      "loss": 0.6831,
+      "step": 950
+    },
+    {
+      "epoch": 1.6721097322011755,
+      "grad_norm": 0.6128368243065496,
+      "learning_rate": 5e-06,
+      "loss": 0.6808,
+      "step": 960
+    },
+    {
+      "epoch": 1.6895275419116045,
+      "grad_norm": 0.7946845578841991,
+      "learning_rate": 5e-06,
+      "loss": 0.6878,
+      "step": 970
+    },
+    {
+      "epoch": 1.7069453516220334,
+      "grad_norm": 0.6757515701163817,
+      "learning_rate": 5e-06,
+      "loss": 0.6854,
+      "step": 980
+    },
+    {
+      "epoch": 1.7243631613324624,
+      "grad_norm": 0.6571419210311429,
+      "learning_rate": 5e-06,
+      "loss": 0.6823,
+      "step": 990
+    },
+    {
+      "epoch": 1.7417809710428913,
+      "grad_norm": 0.7598031996788027,
+      "learning_rate": 5e-06,
+      "loss": 0.6797,
+      "step": 1000
+    },
+    {
+      "epoch": 1.7591987807533203,
+      "grad_norm": 0.6670752253306316,
+      "learning_rate": 5e-06,
+      "loss": 0.6842,
+      "step": 1010
+    },
+    {
+      "epoch": 1.7766165904637492,
+      "grad_norm": 0.6401962122300333,
+      "learning_rate": 5e-06,
+      "loss": 0.6812,
+      "step": 1020
+    },
+    {
+      "epoch": 1.7940344001741781,
+      "grad_norm": 0.7117175290423249,
+      "learning_rate": 5e-06,
+      "loss": 0.6824,
+      "step": 1030
+    },
+    {
+      "epoch": 1.811452209884607,
+      "grad_norm": 0.5800625845659623,
+      "learning_rate": 5e-06,
+      "loss": 0.6803,
+      "step": 1040
+    },
+    {
+      "epoch": 1.828870019595036,
+      "grad_norm": 0.7320040356270946,
+      "learning_rate": 5e-06,
+      "loss": 0.6803,
+      "step": 1050
+    },
+    {
+      "epoch": 1.846287829305465,
+      "grad_norm": 0.6283167554926344,
+      "learning_rate": 5e-06,
+      "loss": 0.6839,
+      "step": 1060
+    },
+    {
+      "epoch": 1.8637056390158937,
+      "grad_norm": 0.8371943589394946,
+      "learning_rate": 5e-06,
+      "loss": 0.6812,
+      "step": 1070
+    },
+    {
+      "epoch": 1.8811234487263226,
+      "grad_norm": 0.6257083963805145,
+      "learning_rate": 5e-06,
+      "loss": 0.6838,
+      "step": 1080
+    },
+    {
+      "epoch": 1.8985412584367516,
+      "grad_norm": 0.5986261297946587,
+      "learning_rate": 5e-06,
+      "loss": 0.6797,
+      "step": 1090
+    },
+    {
+      "epoch": 1.9159590681471805,
+      "grad_norm": 0.6170171924536151,
+      "learning_rate": 5e-06,
+      "loss": 0.6797,
+      "step": 1100
+    },
+    {
+      "epoch": 1.9333768778576093,
+      "grad_norm": 0.5700527826150651,
+      "learning_rate": 5e-06,
+      "loss": 0.681,
+      "step": 1110
+    },
+    {
+      "epoch": 1.9507946875680382,
+      "grad_norm": 0.7918614747451274,
+      "learning_rate": 5e-06,
+      "loss": 0.6836,
+      "step": 1120
+    },
+    {
+      "epoch": 1.9682124972784671,
+      "grad_norm": 0.6423488525843636,
+      "learning_rate": 5e-06,
+      "loss": 0.6841,
+      "step": 1130
+    },
+    {
+      "epoch": 1.985630306988896,
+      "grad_norm": 0.5727071414052627,
+      "learning_rate": 5e-06,
+      "loss": 0.6807,
+      "step": 1140
+    },
+    {
+      "epoch": 1.9995645547572392,
+      "eval_loss": 0.7203673124313354,
+      "eval_runtime": 394.6464,
+      "eval_samples_per_second": 39.2,
+      "eval_steps_per_second": 0.613,
+      "step": 1148
+    },
+    {
+      "epoch": 2.003048116699325,
+      "grad_norm": 0.9442473115477681,
+      "learning_rate": 5e-06,
+      "loss": 0.7309,
+      "step": 1150
+    },
+    {
+      "epoch": 2.020465926409754,
+      "grad_norm": 0.7483105880106439,
+      "learning_rate": 5e-06,
+      "loss": 0.6325,
+      "step": 1160
+    },
+    {
+      "epoch": 2.037883736120183,
+      "grad_norm": 0.7540713563935749,
+      "learning_rate": 5e-06,
+      "loss": 0.6307,
+      "step": 1170
+    },
+    {
+      "epoch": 2.055301545830612,
+      "grad_norm": 0.6724851440947269,
+      "learning_rate": 5e-06,
+      "loss": 0.6337,
+      "step": 1180
+    },
+    {
+      "epoch": 2.072719355541041,
+      "grad_norm": 0.8087552584080454,
+      "learning_rate": 5e-06,
+      "loss": 0.6291,
+      "step": 1190
+    },
+    {
+      "epoch": 2.0901371652514698,
+      "grad_norm": 0.5977695571055209,
+      "learning_rate": 5e-06,
+      "loss": 0.6303,
+      "step": 1200
+    },
+    {
+      "epoch": 2.1075549749618987,
+      "grad_norm": 0.7569591780057143,
+      "learning_rate": 5e-06,
+      "loss": 0.63,
+      "step": 1210
+    },
+    {
+      "epoch": 2.1249727846723276,
+      "grad_norm": 0.7224022820617728,
+      "learning_rate": 5e-06,
+      "loss": 0.6319,
+      "step": 1220
+    },
+    {
+      "epoch": 2.142390594382756,
+      "grad_norm": 0.9103041866730269,
+      "learning_rate": 5e-06,
+      "loss": 0.6338,
+      "step": 1230
+    },
+    {
+      "epoch": 2.159808404093185,
+      "grad_norm": 0.6573119068550747,
+      "learning_rate": 5e-06,
+      "loss": 0.6315,
+      "step": 1240
+    },
+    {
+      "epoch": 2.177226213803614,
+      "grad_norm": 0.7130793786888794,
+      "learning_rate": 5e-06,
+      "loss": 0.6307,
+      "step": 1250
+    },
+    {
+      "epoch": 2.194644023514043,
+      "grad_norm": 0.6787527348130123,
+      "learning_rate": 5e-06,
+      "loss": 0.635,
+      "step": 1260
+    },
+    {
+      "epoch": 2.212061833224472,
+      "grad_norm": 0.7583316967190475,
+      "learning_rate": 5e-06,
+      "loss": 0.6328,
+      "step": 1270
+    },
+    {
+      "epoch": 2.229479642934901,
+      "grad_norm": 0.9232698200191256,
+      "learning_rate": 5e-06,
+      "loss": 0.6392,
+      "step": 1280
+    },
+    {
+      "epoch": 2.24689745264533,
+      "grad_norm": 0.7252346361915288,
+      "learning_rate": 5e-06,
+      "loss": 0.6347,
+      "step": 1290
+    },
+    {
+      "epoch": 2.2643152623557588,
+      "grad_norm": 0.6362752959635848,
+      "learning_rate": 5e-06,
+      "loss": 0.6304,
+      "step": 1300
+    },
+    {
+      "epoch": 2.2817330720661877,
+      "grad_norm": 0.6085958821854244,
+      "learning_rate": 5e-06,
+      "loss": 0.6354,
+      "step": 1310
+    },
+    {
+      "epoch": 2.2991508817766166,
+      "grad_norm": 0.6106358089516802,
+      "learning_rate": 5e-06,
+      "loss": 0.6319,
+      "step": 1320
+    },
+    {
+      "epoch": 2.3165686914870456,
+      "grad_norm": 0.6704766246511351,
+      "learning_rate": 5e-06,
+      "loss": 0.6361,
+      "step": 1330
+    },
+    {
+      "epoch": 2.3339865011974745,
+      "grad_norm": 0.7134352783772538,
+      "learning_rate": 5e-06,
+      "loss": 0.6363,
+      "step": 1340
+    },
+    {
+      "epoch": 2.3514043109079035,
+      "grad_norm": 0.691476396805842,
+      "learning_rate": 5e-06,
+      "loss": 0.6408,
+      "step": 1350
+    },
+    {
+      "epoch": 2.3688221206183324,
+      "grad_norm": 0.6591953333036876,
+      "learning_rate": 5e-06,
+      "loss": 0.6333,
+      "step": 1360
+    },
+    {
+      "epoch": 2.3862399303287614,
+      "grad_norm": 0.6492399404200755,
+      "learning_rate": 5e-06,
+      "loss": 0.6328,
+      "step": 1370
+    },
+    {
+      "epoch": 2.40365774003919,
+      "grad_norm": 0.615428906926297,
+      "learning_rate": 5e-06,
+      "loss": 0.6413,
+      "step": 1380
+    },
+    {
+      "epoch": 2.421075549749619,
+      "grad_norm": 0.6178851722594154,
+      "learning_rate": 5e-06,
+      "loss": 0.6362,
+      "step": 1390
+    },
+    {
+      "epoch": 2.4384933594600477,
+      "grad_norm": 0.6630448901048148,
+      "learning_rate": 5e-06,
+      "loss": 0.6354,
+      "step": 1400
+    },
+    {
+      "epoch": 2.4559111691704767,
+      "grad_norm": 0.6761352769262333,
+      "learning_rate": 5e-06,
+      "loss": 0.6364,
+      "step": 1410
+    },
+    {
+      "epoch": 2.4733289788809056,
+      "grad_norm": 0.6840809680625406,
+      "learning_rate": 5e-06,
+      "loss": 0.6347,
+      "step": 1420
+    },
+    {
+      "epoch": 2.4907467885913346,
+      "grad_norm": 0.7752711880925182,
+      "learning_rate": 5e-06,
+      "loss": 0.6375,
+      "step": 1430
+    },
+    {
+      "epoch": 2.5081645983017635,
+      "grad_norm": 0.671961987869029,
+      "learning_rate": 5e-06,
+      "loss": 0.638,
+      "step": 1440
+    },
+    {
+      "epoch": 2.5255824080121925,
+      "grad_norm": 0.6066582748163826,
+      "learning_rate": 5e-06,
+      "loss": 0.6356,
+      "step": 1450
+    },
+    {
+      "epoch": 2.5430002177226214,
+      "grad_norm": 0.6494968644059873,
+      "learning_rate": 5e-06,
+      "loss": 0.64,
+      "step": 1460
+    },
+    {
+      "epoch": 2.5604180274330504,
+      "grad_norm": 0.6819270998889235,
+      "learning_rate": 5e-06,
+      "loss": 0.6392,
+      "step": 1470
+    },
+    {
+      "epoch": 2.5778358371434793,
+      "grad_norm": 0.6720899123226914,
+      "learning_rate": 5e-06,
+      "loss": 0.638,
+      "step": 1480
+    },
+    {
+      "epoch": 2.5952536468539082,
+      "grad_norm": 0.8865903590224419,
+      "learning_rate": 5e-06,
+      "loss": 0.6418,
+      "step": 1490
+    },
+    {
+      "epoch": 2.612671456564337,
+      "grad_norm": 0.8255525182739956,
+      "learning_rate": 5e-06,
+      "loss": 0.637,
+      "step": 1500
+    },
+    {
+      "epoch": 2.6300892662747657,
+      "grad_norm": 0.6926869297282812,
+      "learning_rate": 5e-06,
+      "loss": 0.6359,
+      "step": 1510
+    },
+    {
+      "epoch": 2.647507075985195,
+      "grad_norm": 0.7064032548106364,
+      "learning_rate": 5e-06,
+      "loss": 0.6372,
+      "step": 1520
+    },
+    {
+      "epoch": 2.6649248856956236,
+      "grad_norm": 0.6147984872224924,
+      "learning_rate": 5e-06,
+      "loss": 0.6393,
+      "step": 1530
+    },
+    {
+      "epoch": 2.6823426954060525,
+      "grad_norm": 0.6504099699536218,
+      "learning_rate": 5e-06,
+      "loss": 0.6399,
+      "step": 1540
+    },
+    {
+      "epoch": 2.6997605051164815,
+      "grad_norm": 0.563483068339733,
+      "learning_rate": 5e-06,
+      "loss": 0.6348,
+      "step": 1550
+    },
+    {
+      "epoch": 2.7171783148269104,
+      "grad_norm": 0.730022824759867,
+      "learning_rate": 5e-06,
+      "loss": 0.6358,
+      "step": 1560
+    },
+    {
+      "epoch": 2.7345961245373394,
+      "grad_norm": 0.7435338593643929,
+      "learning_rate": 5e-06,
+      "loss": 0.6416,
+      "step": 1570
+    },
+    {
+      "epoch": 2.7520139342477683,
+      "grad_norm": 0.7041374525178048,
+      "learning_rate": 5e-06,
+      "loss": 0.6312,
+      "step": 1580
+    },
+    {
+      "epoch": 2.7694317439581972,
+      "grad_norm": 0.6185370005773447,
+      "learning_rate": 5e-06,
+      "loss": 0.6369,
+      "step": 1590
+    },
+    {
+      "epoch": 2.786849553668626,
+      "grad_norm": 0.597751813516521,
+      "learning_rate": 5e-06,
+      "loss": 0.6376,
+      "step": 1600
+    },
+    {
+      "epoch": 2.804267363379055,
+      "grad_norm": 0.6525590591893353,
+      "learning_rate": 5e-06,
+      "loss": 0.638,
+      "step": 1610
+    },
+    {
+      "epoch": 2.821685173089484,
+      "grad_norm": 0.6520966089133831,
+      "learning_rate": 5e-06,
+      "loss": 0.6363,
+      "step": 1620
+    },
+    {
+      "epoch": 2.839102982799913,
+      "grad_norm": 0.6318597492523834,
+      "learning_rate": 5e-06,
+      "loss": 0.6392,
+      "step": 1630
+    },
+    {
+      "epoch": 2.856520792510342,
+      "grad_norm": 0.7325044927855683,
+      "learning_rate": 5e-06,
+      "loss": 0.6369,
+      "step": 1640
+    },
+    {
+      "epoch": 2.873938602220771,
+      "grad_norm": 0.6253335500365848,
+      "learning_rate": 5e-06,
+      "loss": 0.636,
+      "step": 1650
+    },
+    {
+      "epoch": 2.8913564119311994,
+      "grad_norm": 0.6704613482673505,
+      "learning_rate": 5e-06,
+      "loss": 0.6376,
+      "step": 1660
+    },
+    {
+      "epoch": 2.908774221641629,
+      "grad_norm": 0.7273411977567759,
+      "learning_rate": 5e-06,
+      "loss": 0.6386,
+      "step": 1670
+    },
+    {
+      "epoch": 2.9261920313520573,
+      "grad_norm": 0.6268248741184522,
+      "learning_rate": 5e-06,
+      "loss": 0.6378,
+      "step": 1680
+    },
+    {
+      "epoch": 2.9436098410624862,
+      "grad_norm": 0.5745167165482074,
+      "learning_rate": 5e-06,
+      "loss": 0.6392,
+      "step": 1690
+    },
+    {
+      "epoch": 2.961027650772915,
+      "grad_norm": 0.6046462612164668,
+      "learning_rate": 5e-06,
+      "loss": 0.642,
+      "step": 1700
+    },
+    {
+      "epoch": 2.978445460483344,
+      "grad_norm": 0.6397198084499147,
+      "learning_rate": 5e-06,
+      "loss": 0.6381,
+      "step": 1710
+    },
+    {
+      "epoch": 2.995863270193773,
+      "grad_norm": 0.7413825248918653,
+      "learning_rate": 5e-06,
+      "loss": 0.638,
+      "step": 1720
+    },
+    {
+      "epoch": 2.999346832135859,
+      "eval_loss": 0.7229765057563782,
+      "eval_runtime": 389.254,
+      "eval_samples_per_second": 39.743,
+      "eval_steps_per_second": 0.622,
+      "step": 1722
+    },
+    {
+      "epoch": 2.999346832135859,
+      "step": 1722,
+      "total_flos": 2884204756992000.0,
+      "train_loss": 0.6974312729536448,
+      "train_runtime": 56418.3982,
+      "train_samples_per_second": 15.629,
+      "train_steps_per_second": 0.031
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1722,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2884204756992000.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed