End of training

Browse files

Files changed (7) hide show

README.md +2 -1
all_results.json +12 -0
eval_results.json +7 -0
train_results.json +8 -0
trainer_state.json +766 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ license: gemma
 base_model: google/gemma-2-9b
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: hp_ablations_gemma_scheduler_linear_warmup0.05_dcftv1.2
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 # hp_ablations_gemma_scheduler_linear_warmup0.05_dcftv1.2
-This model is a fine-tuned version of [google/gemma-2-9b](https://huggingface.co/google/gemma-2-9b) on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.6192

 base_model: google/gemma-2-9b
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: hp_ablations_gemma_scheduler_linear_warmup0.05_dcftv1.2
 # hp_ablations_gemma_scheduler_linear_warmup0.05_dcftv1.2
+This model is a fine-tuned version of [google/gemma-2-9b](https://huggingface.co/google/gemma-2-9b) on the mlfoundations-dev/oh-dcft-v1.2_no-curation_gpt-4o-mini dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.6192

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 2.999438727782975,
+    "eval_loss": 0.6192284226417542,
+    "eval_runtime": 517.6737,
+    "eval_samples_per_second": 17.387,
+    "eval_steps_per_second": 0.545,
+    "total_flos": 3818092983484416.0,
+    "train_loss": 0.5742326915383101,
+    "train_runtime": 91112.7027,
+    "train_samples_per_second": 5.631,
+    "train_steps_per_second": 0.011
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 2.999438727782975,
+    "eval_loss": 0.6192284226417542,
+    "eval_runtime": 517.6737,
+    "eval_samples_per_second": 17.387,
+    "eval_steps_per_second": 0.545
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 2.999438727782975,
+    "total_flos": 3818092983484416.0,
+    "train_loss": 0.5742326915383101,
+    "train_runtime": 91112.7027,
+    "train_samples_per_second": 5.631,
+    "train_steps_per_second": 0.011
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,766 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.999438727782975,
+  "eval_steps": 500,
+  "global_step": 1002,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.029934518241347054,
+      "grad_norm": 3.1538438772448028,
+      "learning_rate": 9.80392156862745e-07,
+      "loss": 0.8925,
+      "step": 10
+    },
+    {
+      "epoch": 0.05986903648269411,
+      "grad_norm": 1.575527592601641,
+      "learning_rate": 1.96078431372549e-06,
+      "loss": 0.7728,
+      "step": 20
+    },
+    {
+      "epoch": 0.08980355472404115,
+      "grad_norm": 0.8485542019061536,
+      "learning_rate": 2.9411764705882355e-06,
+      "loss": 0.7132,
+      "step": 30
+    },
+    {
+      "epoch": 0.11973807296538821,
+      "grad_norm": 1.0458951495740654,
+      "learning_rate": 3.92156862745098e-06,
+      "loss": 0.6908,
+      "step": 40
+    },
+    {
+      "epoch": 0.14967259120673526,
+      "grad_norm": 1.0234177336265349,
+      "learning_rate": 4.901960784313726e-06,
+      "loss": 0.6732,
+      "step": 50
+    },
+    {
+      "epoch": 0.1796071094480823,
+      "grad_norm": 0.7664713859853483,
+      "learning_rate": 4.952681388012618e-06,
+      "loss": 0.6604,
+      "step": 60
+    },
+    {
+      "epoch": 0.20954162768942938,
+      "grad_norm": 0.7264862540234072,
+      "learning_rate": 4.900105152471083e-06,
+      "loss": 0.6545,
+      "step": 70
+    },
+    {
+      "epoch": 0.23947614593077643,
+      "grad_norm": 0.7165653090242867,
+      "learning_rate": 4.847528916929548e-06,
+      "loss": 0.6541,
+      "step": 80
+    },
+    {
+      "epoch": 0.2694106641721235,
+      "grad_norm": 1.0486759545711029,
+      "learning_rate": 4.7949526813880135e-06,
+      "loss": 0.6439,
+      "step": 90
+    },
+    {
+      "epoch": 0.2993451824134705,
+      "grad_norm": 0.8994568354813858,
+      "learning_rate": 4.742376445846478e-06,
+      "loss": 0.6445,
+      "step": 100
+    },
+    {
+      "epoch": 0.3292797006548176,
+      "grad_norm": 0.7593674300249386,
+      "learning_rate": 4.689800210304943e-06,
+      "loss": 0.6396,
+      "step": 110
+    },
+    {
+      "epoch": 0.3592142188961646,
+      "grad_norm": 0.85057187662535,
+      "learning_rate": 4.637223974763407e-06,
+      "loss": 0.6381,
+      "step": 120
+    },
+    {
+      "epoch": 0.3891487371375117,
+      "grad_norm": 0.7459111736147722,
+      "learning_rate": 4.5846477392218716e-06,
+      "loss": 0.6377,
+      "step": 130
+    },
+    {
+      "epoch": 0.41908325537885877,
+      "grad_norm": 0.8579170876941498,
+      "learning_rate": 4.532071503680337e-06,
+      "loss": 0.6312,
+      "step": 140
+    },
+    {
+      "epoch": 0.4490177736202058,
+      "grad_norm": 0.6478431663947087,
+      "learning_rate": 4.479495268138802e-06,
+      "loss": 0.6325,
+      "step": 150
+    },
+    {
+      "epoch": 0.47895229186155286,
+      "grad_norm": 0.6686480386236353,
+      "learning_rate": 4.426919032597267e-06,
+      "loss": 0.6311,
+      "step": 160
+    },
+    {
+      "epoch": 0.5088868101028999,
+      "grad_norm": 0.755316711464673,
+      "learning_rate": 4.374342797055731e-06,
+      "loss": 0.6252,
+      "step": 170
+    },
+    {
+      "epoch": 0.538821328344247,
+      "grad_norm": 0.6842648831780761,
+      "learning_rate": 4.321766561514196e-06,
+      "loss": 0.6262,
+      "step": 180
+    },
+    {
+      "epoch": 0.568755846585594,
+      "grad_norm": 0.7502100219405159,
+      "learning_rate": 4.269190325972661e-06,
+      "loss": 0.6239,
+      "step": 190
+    },
+    {
+      "epoch": 0.598690364826941,
+      "grad_norm": 0.758912914876769,
+      "learning_rate": 4.216614090431125e-06,
+      "loss": 0.6317,
+      "step": 200
+    },
+    {
+      "epoch": 0.6286248830682881,
+      "grad_norm": 0.6877162255531747,
+      "learning_rate": 4.16403785488959e-06,
+      "loss": 0.6254,
+      "step": 210
+    },
+    {
+      "epoch": 0.6585594013096352,
+      "grad_norm": 0.8083437539392957,
+      "learning_rate": 4.111461619348055e-06,
+      "loss": 0.6233,
+      "step": 220
+    },
+    {
+      "epoch": 0.6884939195509823,
+      "grad_norm": 0.6686871460745675,
+      "learning_rate": 4.05888538380652e-06,
+      "loss": 0.6257,
+      "step": 230
+    },
+    {
+      "epoch": 0.7184284377923292,
+      "grad_norm": 0.7093737962787516,
+      "learning_rate": 4.006309148264985e-06,
+      "loss": 0.6246,
+      "step": 240
+    },
+    {
+      "epoch": 0.7483629560336763,
+      "grad_norm": 0.6818933630417556,
+      "learning_rate": 3.953732912723449e-06,
+      "loss": 0.621,
+      "step": 250
+    },
+    {
+      "epoch": 0.7782974742750234,
+      "grad_norm": 0.6864274182716663,
+      "learning_rate": 3.901156677181914e-06,
+      "loss": 0.6195,
+      "step": 260
+    },
+    {
+      "epoch": 0.8082319925163705,
+      "grad_norm": 0.7434696163320667,
+      "learning_rate": 3.8485804416403785e-06,
+      "loss": 0.627,
+      "step": 270
+    },
+    {
+      "epoch": 0.8381665107577175,
+      "grad_norm": 0.7345088923624808,
+      "learning_rate": 3.796004206098844e-06,
+      "loss": 0.617,
+      "step": 280
+    },
+    {
+      "epoch": 0.8681010289990645,
+      "grad_norm": 0.680190168122027,
+      "learning_rate": 3.7434279705573083e-06,
+      "loss": 0.6136,
+      "step": 290
+    },
+    {
+      "epoch": 0.8980355472404116,
+      "grad_norm": 0.6319430608102439,
+      "learning_rate": 3.690851735015773e-06,
+      "loss": 0.6148,
+      "step": 300
+    },
+    {
+      "epoch": 0.9279700654817586,
+      "grad_norm": 0.6208261758938318,
+      "learning_rate": 3.638275499474238e-06,
+      "loss": 0.6133,
+      "step": 310
+    },
+    {
+      "epoch": 0.9579045837231057,
+      "grad_norm": 0.8614781021999184,
+      "learning_rate": 3.5856992639327025e-06,
+      "loss": 0.6154,
+      "step": 320
+    },
+    {
+      "epoch": 0.9878391019644528,
+      "grad_norm": 0.6068278814174504,
+      "learning_rate": 3.5331230283911676e-06,
+      "loss": 0.6083,
+      "step": 330
+    },
+    {
+      "epoch": 0.9998129092609915,
+      "eval_loss": 0.6192405819892883,
+      "eval_runtime": 517.7726,
+      "eval_samples_per_second": 17.384,
+      "eval_steps_per_second": 0.545,
+      "step": 334
+    },
+    {
+      "epoch": 1.0177736202057999,
+      "grad_norm": 0.9179458561043997,
+      "learning_rate": 3.4805467928496324e-06,
+      "loss": 0.6461,
+      "step": 340
+    },
+    {
+      "epoch": 1.047708138447147,
+      "grad_norm": 0.6570645082267376,
+      "learning_rate": 3.4279705573080967e-06,
+      "loss": 0.5611,
+      "step": 350
+    },
+    {
+      "epoch": 1.077642656688494,
+      "grad_norm": 0.8877383233190026,
+      "learning_rate": 3.375394321766562e-06,
+      "loss": 0.5566,
+      "step": 360
+    },
+    {
+      "epoch": 1.1075771749298409,
+      "grad_norm": 0.6805153568925695,
+      "learning_rate": 3.3228180862250265e-06,
+      "loss": 0.5562,
+      "step": 370
+    },
+    {
+      "epoch": 1.137511693171188,
+      "grad_norm": 0.654399495707004,
+      "learning_rate": 3.2702418506834917e-06,
+      "loss": 0.5563,
+      "step": 380
+    },
+    {
+      "epoch": 1.167446211412535,
+      "grad_norm": 0.651069659354449,
+      "learning_rate": 3.217665615141956e-06,
+      "loss": 0.5577,
+      "step": 390
+    },
+    {
+      "epoch": 1.197380729653882,
+      "grad_norm": 0.7592057979021696,
+      "learning_rate": 3.1650893796004207e-06,
+      "loss": 0.5578,
+      "step": 400
+    },
+    {
+      "epoch": 1.2273152478952292,
+      "grad_norm": 0.6954421618460875,
+      "learning_rate": 3.112513144058886e-06,
+      "loss": 0.5597,
+      "step": 410
+    },
+    {
+      "epoch": 1.2572497661365762,
+      "grad_norm": 0.6926921804694697,
+      "learning_rate": 3.05993690851735e-06,
+      "loss": 0.5639,
+      "step": 420
+    },
+    {
+      "epoch": 1.2871842843779233,
+      "grad_norm": 0.653386716731147,
+      "learning_rate": 3.0073606729758153e-06,
+      "loss": 0.5578,
+      "step": 430
+    },
+    {
+      "epoch": 1.3171188026192704,
+      "grad_norm": 0.6395515449791989,
+      "learning_rate": 2.95478443743428e-06,
+      "loss": 0.5594,
+      "step": 440
+    },
+    {
+      "epoch": 1.3470533208606175,
+      "grad_norm": 0.6566985553116162,
+      "learning_rate": 2.902208201892745e-06,
+      "loss": 0.5533,
+      "step": 450
+    },
+    {
+      "epoch": 1.3769878391019645,
+      "grad_norm": 0.6848764013482682,
+      "learning_rate": 2.8496319663512094e-06,
+      "loss": 0.5638,
+      "step": 460
+    },
+    {
+      "epoch": 1.4069223573433116,
+      "grad_norm": 0.6781613015124301,
+      "learning_rate": 2.797055730809674e-06,
+      "loss": 0.5622,
+      "step": 470
+    },
+    {
+      "epoch": 1.4368568755846587,
+      "grad_norm": 0.659537106249896,
+      "learning_rate": 2.7444794952681393e-06,
+      "loss": 0.5642,
+      "step": 480
+    },
+    {
+      "epoch": 1.4667913938260055,
+      "grad_norm": 0.6336243524599241,
+      "learning_rate": 2.6919032597266036e-06,
+      "loss": 0.5628,
+      "step": 490
+    },
+    {
+      "epoch": 1.4967259120673526,
+      "grad_norm": 0.6286765901414704,
+      "learning_rate": 2.6393270241850687e-06,
+      "loss": 0.5591,
+      "step": 500
+    },
+    {
+      "epoch": 1.5266604303086997,
+      "grad_norm": 0.6264368854902406,
+      "learning_rate": 2.5867507886435334e-06,
+      "loss": 0.5619,
+      "step": 510
+    },
+    {
+      "epoch": 1.5565949485500468,
+      "grad_norm": 0.6193837921364219,
+      "learning_rate": 2.5341745531019977e-06,
+      "loss": 0.5596,
+      "step": 520
+    },
+    {
+      "epoch": 1.5865294667913938,
+      "grad_norm": 0.649529341936427,
+      "learning_rate": 2.481598317560463e-06,
+      "loss": 0.5601,
+      "step": 530
+    },
+    {
+      "epoch": 1.616463985032741,
+      "grad_norm": 0.6307420170494844,
+      "learning_rate": 2.4290220820189276e-06,
+      "loss": 0.5632,
+      "step": 540
+    },
+    {
+      "epoch": 1.646398503274088,
+      "grad_norm": 0.6814211286136553,
+      "learning_rate": 2.3764458464773923e-06,
+      "loss": 0.5624,
+      "step": 550
+    },
+    {
+      "epoch": 1.6763330215154348,
+      "grad_norm": 0.6175587854440984,
+      "learning_rate": 2.323869610935857e-06,
+      "loss": 0.5528,
+      "step": 560
+    },
+    {
+      "epoch": 1.706267539756782,
+      "grad_norm": 0.651263673383139,
+      "learning_rate": 2.271293375394322e-06,
+      "loss": 0.5545,
+      "step": 570
+    },
+    {
+      "epoch": 1.736202057998129,
+      "grad_norm": 0.6674250126277982,
+      "learning_rate": 2.218717139852787e-06,
+      "loss": 0.5549,
+      "step": 580
+    },
+    {
+      "epoch": 1.766136576239476,
+      "grad_norm": 0.6415996062389275,
+      "learning_rate": 2.1661409043112516e-06,
+      "loss": 0.5581,
+      "step": 590
+    },
+    {
+      "epoch": 1.7960710944808231,
+      "grad_norm": 0.6687198082888665,
+      "learning_rate": 2.1135646687697163e-06,
+      "loss": 0.5586,
+      "step": 600
+    },
+    {
+      "epoch": 1.8260056127221702,
+      "grad_norm": 0.6359899825644427,
+      "learning_rate": 2.060988433228181e-06,
+      "loss": 0.5563,
+      "step": 610
+    },
+    {
+      "epoch": 1.8559401309635173,
+      "grad_norm": 0.6105850392708778,
+      "learning_rate": 2.0084121976866458e-06,
+      "loss": 0.5539,
+      "step": 620
+    },
+    {
+      "epoch": 1.8858746492048644,
+      "grad_norm": 0.6295913381883963,
+      "learning_rate": 1.9558359621451105e-06,
+      "loss": 0.5558,
+      "step": 630
+    },
+    {
+      "epoch": 1.9158091674462114,
+      "grad_norm": 0.6153339447448841,
+      "learning_rate": 1.9032597266035754e-06,
+      "loss": 0.5573,
+      "step": 640
+    },
+    {
+      "epoch": 1.9457436856875585,
+      "grad_norm": 0.6441057540051247,
+      "learning_rate": 1.85068349106204e-06,
+      "loss": 0.5551,
+      "step": 650
+    },
+    {
+      "epoch": 1.9756782039289056,
+      "grad_norm": 0.6140027788031359,
+      "learning_rate": 1.7981072555205049e-06,
+      "loss": 0.5599,
+      "step": 660
+    },
+    {
+      "epoch": 1.999625818521983,
+      "eval_loss": 0.6120015382766724,
+      "eval_runtime": 518.2071,
+      "eval_samples_per_second": 17.37,
+      "eval_steps_per_second": 0.544,
+      "step": 668
+    },
+    {
+      "epoch": 2.0056127221702527,
+      "grad_norm": 1.1050858010777171,
+      "learning_rate": 1.7455310199789696e-06,
+      "loss": 0.6063,
+      "step": 670
+    },
+    {
+      "epoch": 2.0355472404115997,
+      "grad_norm": 0.8550476858553763,
+      "learning_rate": 1.6929547844374345e-06,
+      "loss": 0.5113,
+      "step": 680
+    },
+    {
+      "epoch": 2.065481758652947,
+      "grad_norm": 0.666715478558051,
+      "learning_rate": 1.6403785488958992e-06,
+      "loss": 0.5086,
+      "step": 690
+    },
+    {
+      "epoch": 2.095416276894294,
+      "grad_norm": 0.6875612083161527,
+      "learning_rate": 1.5878023133543642e-06,
+      "loss": 0.5125,
+      "step": 700
+    },
+    {
+      "epoch": 2.125350795135641,
+      "grad_norm": 0.6341626377492836,
+      "learning_rate": 1.5352260778128287e-06,
+      "loss": 0.512,
+      "step": 710
+    },
+    {
+      "epoch": 2.155285313376988,
+      "grad_norm": 0.6302944171010374,
+      "learning_rate": 1.4826498422712934e-06,
+      "loss": 0.5149,
+      "step": 720
+    },
+    {
+      "epoch": 2.185219831618335,
+      "grad_norm": 0.6538654908190926,
+      "learning_rate": 1.4300736067297583e-06,
+      "loss": 0.514,
+      "step": 730
+    },
+    {
+      "epoch": 2.2151543498596817,
+      "grad_norm": 0.6263998927581799,
+      "learning_rate": 1.377497371188223e-06,
+      "loss": 0.5141,
+      "step": 740
+    },
+    {
+      "epoch": 2.245088868101029,
+      "grad_norm": 0.6455376141225925,
+      "learning_rate": 1.324921135646688e-06,
+      "loss": 0.5156,
+      "step": 750
+    },
+    {
+      "epoch": 2.275023386342376,
+      "grad_norm": 0.6183585553749799,
+      "learning_rate": 1.2723449001051527e-06,
+      "loss": 0.5101,
+      "step": 760
+    },
+    {
+      "epoch": 2.304957904583723,
+      "grad_norm": 0.64329317604967,
+      "learning_rate": 1.2197686645636174e-06,
+      "loss": 0.5157,
+      "step": 770
+    },
+    {
+      "epoch": 2.33489242282507,
+      "grad_norm": 0.6405861750192782,
+      "learning_rate": 1.1671924290220821e-06,
+      "loss": 0.5099,
+      "step": 780
+    },
+    {
+      "epoch": 2.364826941066417,
+      "grad_norm": 0.6198685520017336,
+      "learning_rate": 1.1146161934805468e-06,
+      "loss": 0.511,
+      "step": 790
+    },
+    {
+      "epoch": 2.394761459307764,
+      "grad_norm": 0.6935047782557408,
+      "learning_rate": 1.0620399579390118e-06,
+      "loss": 0.5142,
+      "step": 800
+    },
+    {
+      "epoch": 2.4246959775491113,
+      "grad_norm": 0.6412834801494394,
+      "learning_rate": 1.0094637223974763e-06,
+      "loss": 0.5204,
+      "step": 810
+    },
+    {
+      "epoch": 2.4546304957904583,
+      "grad_norm": 0.6370264615292399,
+      "learning_rate": 9.568874868559412e-07,
+      "loss": 0.5167,
+      "step": 820
+    },
+    {
+      "epoch": 2.4845650140318054,
+      "grad_norm": 0.6549640678553555,
+      "learning_rate": 9.04311251314406e-07,
+      "loss": 0.5155,
+      "step": 830
+    },
+    {
+      "epoch": 2.5144995322731525,
+      "grad_norm": 0.6303838732411701,
+      "learning_rate": 8.517350157728707e-07,
+      "loss": 0.5125,
+      "step": 840
+    },
+    {
+      "epoch": 2.5444340505144996,
+      "grad_norm": 0.6252498072909572,
+      "learning_rate": 7.991587802313355e-07,
+      "loss": 0.5159,
+      "step": 850
+    },
+    {
+      "epoch": 2.5743685687558466,
+      "grad_norm": 0.619762570689399,
+      "learning_rate": 7.465825446898002e-07,
+      "loss": 0.5133,
+      "step": 860
+    },
+    {
+      "epoch": 2.6043030869971937,
+      "grad_norm": 0.621367841271269,
+      "learning_rate": 6.94006309148265e-07,
+      "loss": 0.51,
+      "step": 870
+    },
+    {
+      "epoch": 2.634237605238541,
+      "grad_norm": 0.6277371890683754,
+      "learning_rate": 6.414300736067299e-07,
+      "loss": 0.5087,
+      "step": 880
+    },
+    {
+      "epoch": 2.664172123479888,
+      "grad_norm": 0.6230297769696992,
+      "learning_rate": 5.888538380651946e-07,
+      "loss": 0.5143,
+      "step": 890
+    },
+    {
+      "epoch": 2.694106641721235,
+      "grad_norm": 0.6131791711055301,
+      "learning_rate": 5.362776025236594e-07,
+      "loss": 0.517,
+      "step": 900
+    },
+    {
+      "epoch": 2.724041159962582,
+      "grad_norm": 0.6009379746100512,
+      "learning_rate": 4.837013669821241e-07,
+      "loss": 0.5094,
+      "step": 910
+    },
+    {
+      "epoch": 2.753975678203929,
+      "grad_norm": 0.5945695321680531,
+      "learning_rate": 4.311251314405889e-07,
+      "loss": 0.51,
+      "step": 920
+    },
+    {
+      "epoch": 2.7839101964452757,
+      "grad_norm": 0.6115970196704695,
+      "learning_rate": 3.7854889589905366e-07,
+      "loss": 0.5159,
+      "step": 930
+    },
+    {
+      "epoch": 2.8138447146866232,
+      "grad_norm": 0.6047728761964349,
+      "learning_rate": 3.2597266035751843e-07,
+      "loss": 0.514,
+      "step": 940
+    },
+    {
+      "epoch": 2.84377923292797,
+      "grad_norm": 0.6133751020570728,
+      "learning_rate": 2.733964248159832e-07,
+      "loss": 0.5127,
+      "step": 950
+    },
+    {
+      "epoch": 2.8737137511693174,
+      "grad_norm": 0.592450988034991,
+      "learning_rate": 2.2082018927444798e-07,
+      "loss": 0.5148,
+      "step": 960
+    },
+    {
+      "epoch": 2.903648269410664,
+      "grad_norm": 0.6009390030111875,
+      "learning_rate": 1.6824395373291272e-07,
+      "loss": 0.5094,
+      "step": 970
+    },
+    {
+      "epoch": 2.933582787652011,
+      "grad_norm": 0.6175880134916168,
+      "learning_rate": 1.156677181913775e-07,
+      "loss": 0.5147,
+      "step": 980
+    },
+    {
+      "epoch": 2.963517305893358,
+      "grad_norm": 0.6070880758663311,
+      "learning_rate": 6.309148264984227e-08,
+      "loss": 0.5163,
+      "step": 990
+    },
+    {
+      "epoch": 2.9934518241347052,
+      "grad_norm": 0.5979303241818252,
+      "learning_rate": 1.0515247108307045e-08,
+      "loss": 0.5167,
+      "step": 1000
+    },
+    {
+      "epoch": 2.999438727782975,
+      "eval_loss": 0.6192284226417542,
+      "eval_runtime": 516.8713,
+      "eval_samples_per_second": 17.414,
+      "eval_steps_per_second": 0.546,
+      "step": 1002
+    },
+    {
+      "epoch": 2.999438727782975,
+      "step": 1002,
+      "total_flos": 3818092983484416.0,
+      "train_loss": 0.5742326915383101,
+      "train_runtime": 91112.7027,
+      "train_samples_per_second": 5.631,
+      "train_steps_per_second": 0.011
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1002,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3818092983484416.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed