End of training

Browse files

Files changed (7) hide show

README.md +2 -1
all_results.json +12 -0
eval_results.json +7 -0
train_results.json +8 -0
trainer_state.json +766 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ license: gemma
 base_model: google/gemma-2-9b
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: hp_ablations_gemma_lr1e-5_dcftv1.2
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 # hp_ablations_gemma_lr1e-5_dcftv1.2
-This model is a fine-tuned version of [google/gemma-2-9b](https://huggingface.co/google/gemma-2-9b) on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.6808

 base_model: google/gemma-2-9b
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: hp_ablations_gemma_lr1e-5_dcftv1.2
 # hp_ablations_gemma_lr1e-5_dcftv1.2
+This model is a fine-tuned version of [google/gemma-2-9b](https://huggingface.co/google/gemma-2-9b) on the mlfoundations-dev/oh-dcft-v1.2_no-curation_gpt-4o-mini dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.6808

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 2.999438727782975,
+    "eval_loss": 0.6807616353034973,
+    "eval_runtime": 516.3443,
+    "eval_samples_per_second": 17.432,
+    "eval_steps_per_second": 0.546,
+    "total_flos": 3818092983484416.0,
+    "train_loss": 0.5373929247051894,
+    "train_runtime": 90253.1826,
+    "train_samples_per_second": 5.684,
+    "train_steps_per_second": 0.011
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 2.999438727782975,
+    "eval_loss": 0.6807616353034973,
+    "eval_runtime": 516.3443,
+    "eval_samples_per_second": 17.432,
+    "eval_steps_per_second": 0.546
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 2.999438727782975,
+    "total_flos": 3818092983484416.0,
+    "train_loss": 0.5373929247051894,
+    "train_runtime": 90253.1826,
+    "train_samples_per_second": 5.684,
+    "train_steps_per_second": 0.011
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,766 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.999438727782975,
+  "eval_steps": 500,
+  "global_step": 1002,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.029934518241347054,
+      "grad_norm": 1.436149184708336,
+      "learning_rate": 1e-05,
+      "loss": 0.8156,
+      "step": 10
+    },
+    {
+      "epoch": 0.05986903648269411,
+      "grad_norm": 0.8470104349075924,
+      "learning_rate": 1e-05,
+      "loss": 0.6962,
+      "step": 20
+    },
+    {
+      "epoch": 0.08980355472404115,
+      "grad_norm": 0.8064789531015041,
+      "learning_rate": 1e-05,
+      "loss": 0.6701,
+      "step": 30
+    },
+    {
+      "epoch": 0.11973807296538821,
+      "grad_norm": 0.8844517736535112,
+      "learning_rate": 1e-05,
+      "loss": 0.6663,
+      "step": 40
+    },
+    {
+      "epoch": 0.14967259120673526,
+      "grad_norm": 0.8552170926578669,
+      "learning_rate": 1e-05,
+      "loss": 0.6582,
+      "step": 50
+    },
+    {
+      "epoch": 0.1796071094480823,
+      "grad_norm": 0.691474609618276,
+      "learning_rate": 1e-05,
+      "loss": 0.6501,
+      "step": 60
+    },
+    {
+      "epoch": 0.20954162768942938,
+      "grad_norm": 0.8728851481736056,
+      "learning_rate": 1e-05,
+      "loss": 0.6478,
+      "step": 70
+    },
+    {
+      "epoch": 0.23947614593077643,
+      "grad_norm": 0.6511950609669344,
+      "learning_rate": 1e-05,
+      "loss": 0.6486,
+      "step": 80
+    },
+    {
+      "epoch": 0.2694106641721235,
+      "grad_norm": 0.7208148861851342,
+      "learning_rate": 1e-05,
+      "loss": 0.64,
+      "step": 90
+    },
+    {
+      "epoch": 0.2993451824134705,
+      "grad_norm": 1.9733421194362517,
+      "learning_rate": 1e-05,
+      "loss": 0.6415,
+      "step": 100
+    },
+    {
+      "epoch": 0.3292797006548176,
+      "grad_norm": 0.7169805653885655,
+      "learning_rate": 1e-05,
+      "loss": 0.6379,
+      "step": 110
+    },
+    {
+      "epoch": 0.3592142188961646,
+      "grad_norm": 0.6681799138772999,
+      "learning_rate": 1e-05,
+      "loss": 0.6365,
+      "step": 120
+    },
+    {
+      "epoch": 0.3891487371375117,
+      "grad_norm": 0.7595098630906292,
+      "learning_rate": 1e-05,
+      "loss": 0.6369,
+      "step": 130
+    },
+    {
+      "epoch": 0.41908325537885877,
+      "grad_norm": 0.8845411450560166,
+      "learning_rate": 1e-05,
+      "loss": 0.6307,
+      "step": 140
+    },
+    {
+      "epoch": 0.4490177736202058,
+      "grad_norm": 0.7841946541102748,
+      "learning_rate": 1e-05,
+      "loss": 0.6328,
+      "step": 150
+    },
+    {
+      "epoch": 0.47895229186155286,
+      "grad_norm": 0.6767441174281267,
+      "learning_rate": 1e-05,
+      "loss": 0.6315,
+      "step": 160
+    },
+    {
+      "epoch": 0.5088868101028999,
+      "grad_norm": 0.801569153817463,
+      "learning_rate": 1e-05,
+      "loss": 0.6259,
+      "step": 170
+    },
+    {
+      "epoch": 0.538821328344247,
+      "grad_norm": 0.9293115426977383,
+      "learning_rate": 1e-05,
+      "loss": 0.6276,
+      "step": 180
+    },
+    {
+      "epoch": 0.568755846585594,
+      "grad_norm": 0.7156053774180291,
+      "learning_rate": 1e-05,
+      "loss": 0.6254,
+      "step": 190
+    },
+    {
+      "epoch": 0.598690364826941,
+      "grad_norm": 0.7139841785093624,
+      "learning_rate": 1e-05,
+      "loss": 0.6332,
+      "step": 200
+    },
+    {
+      "epoch": 0.6286248830682881,
+      "grad_norm": 0.6793268190386273,
+      "learning_rate": 1e-05,
+      "loss": 0.627,
+      "step": 210
+    },
+    {
+      "epoch": 0.6585594013096352,
+      "grad_norm": 0.7523957272675231,
+      "learning_rate": 1e-05,
+      "loss": 0.625,
+      "step": 220
+    },
+    {
+      "epoch": 0.6884939195509823,
+      "grad_norm": 0.6701510393609906,
+      "learning_rate": 1e-05,
+      "loss": 0.6278,
+      "step": 230
+    },
+    {
+      "epoch": 0.7184284377923292,
+      "grad_norm": 0.6411943217226495,
+      "learning_rate": 1e-05,
+      "loss": 0.6266,
+      "step": 240
+    },
+    {
+      "epoch": 0.7483629560336763,
+      "grad_norm": 0.6077476903677665,
+      "learning_rate": 1e-05,
+      "loss": 0.6233,
+      "step": 250
+    },
+    {
+      "epoch": 0.7782974742750234,
+      "grad_norm": 0.6054849756074197,
+      "learning_rate": 1e-05,
+      "loss": 0.6218,
+      "step": 260
+    },
+    {
+      "epoch": 0.8082319925163705,
+      "grad_norm": 0.6102207934997144,
+      "learning_rate": 1e-05,
+      "loss": 0.6296,
+      "step": 270
+    },
+    {
+      "epoch": 0.8381665107577175,
+      "grad_norm": 0.6341903498794719,
+      "learning_rate": 1e-05,
+      "loss": 0.6197,
+      "step": 280
+    },
+    {
+      "epoch": 0.8681010289990645,
+      "grad_norm": 0.5892798874353478,
+      "learning_rate": 1e-05,
+      "loss": 0.6164,
+      "step": 290
+    },
+    {
+      "epoch": 0.8980355472404116,
+      "grad_norm": 0.6763132723092944,
+      "learning_rate": 1e-05,
+      "loss": 0.6178,
+      "step": 300
+    },
+    {
+      "epoch": 0.9279700654817586,
+      "grad_norm": 0.6379477777187238,
+      "learning_rate": 1e-05,
+      "loss": 0.6164,
+      "step": 310
+    },
+    {
+      "epoch": 0.9579045837231057,
+      "grad_norm": 0.6164591042123551,
+      "learning_rate": 1e-05,
+      "loss": 0.6185,
+      "step": 320
+    },
+    {
+      "epoch": 0.9878391019644528,
+      "grad_norm": 0.589103130850815,
+      "learning_rate": 1e-05,
+      "loss": 0.6114,
+      "step": 330
+    },
+    {
+      "epoch": 0.9998129092609915,
+      "eval_loss": 0.6230862736701965,
+      "eval_runtime": 514.5688,
+      "eval_samples_per_second": 17.492,
+      "eval_steps_per_second": 0.548,
+      "step": 334
+    },
+    {
+      "epoch": 1.0177736202057999,
+      "grad_norm": 1.0789148554995527,
+      "learning_rate": 1e-05,
+      "loss": 0.622,
+      "step": 340
+    },
+    {
+      "epoch": 1.047708138447147,
+      "grad_norm": 0.879489488383757,
+      "learning_rate": 1e-05,
+      "loss": 0.5225,
+      "step": 350
+    },
+    {
+      "epoch": 1.077642656688494,
+      "grad_norm": 0.7163524008199746,
+      "learning_rate": 1e-05,
+      "loss": 0.518,
+      "step": 360
+    },
+    {
+      "epoch": 1.1075771749298409,
+      "grad_norm": 0.6189313448636057,
+      "learning_rate": 1e-05,
+      "loss": 0.5189,
+      "step": 370
+    },
+    {
+      "epoch": 1.137511693171188,
+      "grad_norm": 0.6622745304027244,
+      "learning_rate": 1e-05,
+      "loss": 0.5191,
+      "step": 380
+    },
+    {
+      "epoch": 1.167446211412535,
+      "grad_norm": 0.5722248842800408,
+      "learning_rate": 1e-05,
+      "loss": 0.5218,
+      "step": 390
+    },
+    {
+      "epoch": 1.197380729653882,
+      "grad_norm": 0.6571926004643104,
+      "learning_rate": 1e-05,
+      "loss": 0.5226,
+      "step": 400
+    },
+    {
+      "epoch": 1.2273152478952292,
+      "grad_norm": 0.657190925809655,
+      "learning_rate": 1e-05,
+      "loss": 0.5257,
+      "step": 410
+    },
+    {
+      "epoch": 1.2572497661365762,
+      "grad_norm": 0.6575555769873754,
+      "learning_rate": 1e-05,
+      "loss": 0.5296,
+      "step": 420
+    },
+    {
+      "epoch": 1.2871842843779233,
+      "grad_norm": 0.7072358321643488,
+      "learning_rate": 1e-05,
+      "loss": 0.5252,
+      "step": 430
+    },
+    {
+      "epoch": 1.3171188026192704,
+      "grad_norm": 0.5933792087399892,
+      "learning_rate": 1e-05,
+      "loss": 0.5276,
+      "step": 440
+    },
+    {
+      "epoch": 1.3470533208606175,
+      "grad_norm": 0.5986923641370627,
+      "learning_rate": 1e-05,
+      "loss": 0.5225,
+      "step": 450
+    },
+    {
+      "epoch": 1.3769878391019645,
+      "grad_norm": 0.6200542797386817,
+      "learning_rate": 1e-05,
+      "loss": 0.5331,
+      "step": 460
+    },
+    {
+      "epoch": 1.4069223573433116,
+      "grad_norm": 0.6417386277326463,
+      "learning_rate": 1e-05,
+      "loss": 0.5326,
+      "step": 470
+    },
+    {
+      "epoch": 1.4368568755846587,
+      "grad_norm": 0.7015930470443179,
+      "learning_rate": 1e-05,
+      "loss": 0.535,
+      "step": 480
+    },
+    {
+      "epoch": 1.4667913938260055,
+      "grad_norm": 0.594308805207007,
+      "learning_rate": 1e-05,
+      "loss": 0.5347,
+      "step": 490
+    },
+    {
+      "epoch": 1.4967259120673526,
+      "grad_norm": 0.5553957558260186,
+      "learning_rate": 1e-05,
+      "loss": 0.531,
+      "step": 500
+    },
+    {
+      "epoch": 1.5266604303086997,
+      "grad_norm": 0.7685143591600733,
+      "learning_rate": 1e-05,
+      "loss": 0.5341,
+      "step": 510
+    },
+    {
+      "epoch": 1.5565949485500468,
+      "grad_norm": 0.6448415753485363,
+      "learning_rate": 1e-05,
+      "loss": 0.5327,
+      "step": 520
+    },
+    {
+      "epoch": 1.5865294667913938,
+      "grad_norm": 0.6075412691359406,
+      "learning_rate": 1e-05,
+      "loss": 0.5336,
+      "step": 530
+    },
+    {
+      "epoch": 1.616463985032741,
+      "grad_norm": 0.5763107752055747,
+      "learning_rate": 1e-05,
+      "loss": 0.5378,
+      "step": 540
+    },
+    {
+      "epoch": 1.646398503274088,
+      "grad_norm": 0.5603858307568755,
+      "learning_rate": 1e-05,
+      "loss": 0.5371,
+      "step": 550
+    },
+    {
+      "epoch": 1.6763330215154348,
+      "grad_norm": 0.6006913668733296,
+      "learning_rate": 1e-05,
+      "loss": 0.5284,
+      "step": 560
+    },
+    {
+      "epoch": 1.706267539756782,
+      "grad_norm": 0.6177631350868539,
+      "learning_rate": 1e-05,
+      "loss": 0.5302,
+      "step": 570
+    },
+    {
+      "epoch": 1.736202057998129,
+      "grad_norm": 0.585179723046756,
+      "learning_rate": 1e-05,
+      "loss": 0.5318,
+      "step": 580
+    },
+    {
+      "epoch": 1.766136576239476,
+      "grad_norm": 0.6080979528118828,
+      "learning_rate": 1e-05,
+      "loss": 0.5351,
+      "step": 590
+    },
+    {
+      "epoch": 1.7960710944808231,
+      "grad_norm": 0.6217675467464355,
+      "learning_rate": 1e-05,
+      "loss": 0.5355,
+      "step": 600
+    },
+    {
+      "epoch": 1.8260056127221702,
+      "grad_norm": 0.5994360959232031,
+      "learning_rate": 1e-05,
+      "loss": 0.5338,
+      "step": 610
+    },
+    {
+      "epoch": 1.8559401309635173,
+      "grad_norm": 0.6183853502032267,
+      "learning_rate": 1e-05,
+      "loss": 0.5323,
+      "step": 620
+    },
+    {
+      "epoch": 1.8858746492048644,
+      "grad_norm": 0.6010641587842128,
+      "learning_rate": 1e-05,
+      "loss": 0.5339,
+      "step": 630
+    },
+    {
+      "epoch": 1.9158091674462114,
+      "grad_norm": 0.6165868033059826,
+      "learning_rate": 1e-05,
+      "loss": 0.5355,
+      "step": 640
+    },
+    {
+      "epoch": 1.9457436856875585,
+      "grad_norm": 0.5906764886461219,
+      "learning_rate": 1e-05,
+      "loss": 0.534,
+      "step": 650
+    },
+    {
+      "epoch": 1.9756782039289056,
+      "grad_norm": 0.6167441736977045,
+      "learning_rate": 1e-05,
+      "loss": 0.5387,
+      "step": 660
+    },
+    {
+      "epoch": 1.999625818521983,
+      "eval_loss": 0.6286783218383789,
+      "eval_runtime": 512.2643,
+      "eval_samples_per_second": 17.571,
+      "eval_steps_per_second": 0.55,
+      "step": 668
+    },
+    {
+      "epoch": 2.0056127221702527,
+      "grad_norm": 1.1149683186404131,
+      "learning_rate": 1e-05,
+      "loss": 0.5713,
+      "step": 670
+    },
+    {
+      "epoch": 2.0355472404115997,
+      "grad_norm": 0.8375303981489991,
+      "learning_rate": 1e-05,
+      "loss": 0.4237,
+      "step": 680
+    },
+    {
+      "epoch": 2.065481758652947,
+      "grad_norm": 0.7496290030129303,
+      "learning_rate": 1e-05,
+      "loss": 0.4183,
+      "step": 690
+    },
+    {
+      "epoch": 2.095416276894294,
+      "grad_norm": 0.7316475667126104,
+      "learning_rate": 1e-05,
+      "loss": 0.4221,
+      "step": 700
+    },
+    {
+      "epoch": 2.125350795135641,
+      "grad_norm": 0.6966311084358239,
+      "learning_rate": 1e-05,
+      "loss": 0.4226,
+      "step": 710
+    },
+    {
+      "epoch": 2.155285313376988,
+      "grad_norm": 0.7496413293022232,
+      "learning_rate": 1e-05,
+      "loss": 0.4267,
+      "step": 720
+    },
+    {
+      "epoch": 2.185219831618335,
+      "grad_norm": 0.6748769928099622,
+      "learning_rate": 1e-05,
+      "loss": 0.4272,
+      "step": 730
+    },
+    {
+      "epoch": 2.2151543498596817,
+      "grad_norm": 0.6647730732308946,
+      "learning_rate": 1e-05,
+      "loss": 0.4288,
+      "step": 740
+    },
+    {
+      "epoch": 2.245088868101029,
+      "grad_norm": 0.675232110980268,
+      "learning_rate": 1e-05,
+      "loss": 0.4314,
+      "step": 750
+    },
+    {
+      "epoch": 2.275023386342376,
+      "grad_norm": 0.6499875867621265,
+      "learning_rate": 1e-05,
+      "loss": 0.4288,
+      "step": 760
+    },
+    {
+      "epoch": 2.304957904583723,
+      "grad_norm": 0.7041249541616547,
+      "learning_rate": 1e-05,
+      "loss": 0.4331,
+      "step": 770
+    },
+    {
+      "epoch": 2.33489242282507,
+      "grad_norm": 0.7194561636469382,
+      "learning_rate": 1e-05,
+      "loss": 0.4312,
+      "step": 780
+    },
+    {
+      "epoch": 2.364826941066417,
+      "grad_norm": 0.6310368955810708,
+      "learning_rate": 1e-05,
+      "loss": 0.434,
+      "step": 790
+    },
+    {
+      "epoch": 2.394761459307764,
+      "grad_norm": 0.6480786152897183,
+      "learning_rate": 1e-05,
+      "loss": 0.4359,
+      "step": 800
+    },
+    {
+      "epoch": 2.4246959775491113,
+      "grad_norm": 0.6854347531850187,
+      "learning_rate": 1e-05,
+      "loss": 0.4425,
+      "step": 810
+    },
+    {
+      "epoch": 2.4546304957904583,
+      "grad_norm": 0.6843667989668157,
+      "learning_rate": 1e-05,
+      "loss": 0.4413,
+      "step": 820
+    },
+    {
+      "epoch": 2.4845650140318054,
+      "grad_norm": 0.7172949284852687,
+      "learning_rate": 1e-05,
+      "loss": 0.4417,
+      "step": 830
+    },
+    {
+      "epoch": 2.5144995322731525,
+      "grad_norm": 0.6539869554188668,
+      "learning_rate": 1e-05,
+      "loss": 0.44,
+      "step": 840
+    },
+    {
+      "epoch": 2.5444340505144996,
+      "grad_norm": 0.6171725666216646,
+      "learning_rate": 1e-05,
+      "loss": 0.4438,
+      "step": 850
+    },
+    {
+      "epoch": 2.5743685687558466,
+      "grad_norm": 0.6760996835540183,
+      "learning_rate": 1e-05,
+      "loss": 0.4415,
+      "step": 860
+    },
+    {
+      "epoch": 2.6043030869971937,
+      "grad_norm": 0.6271049243359499,
+      "learning_rate": 1e-05,
+      "loss": 0.4412,
+      "step": 870
+    },
+    {
+      "epoch": 2.634237605238541,
+      "grad_norm": 0.6441469956566896,
+      "learning_rate": 1e-05,
+      "loss": 0.4383,
+      "step": 880
+    },
+    {
+      "epoch": 2.664172123479888,
+      "grad_norm": 0.6367328036366604,
+      "learning_rate": 1e-05,
+      "loss": 0.4447,
+      "step": 890
+    },
+    {
+      "epoch": 2.694106641721235,
+      "grad_norm": 0.617152483286665,
+      "learning_rate": 1e-05,
+      "loss": 0.4472,
+      "step": 900
+    },
+    {
+      "epoch": 2.724041159962582,
+      "grad_norm": 0.6454438558992355,
+      "learning_rate": 1e-05,
+      "loss": 0.4418,
+      "step": 910
+    },
+    {
+      "epoch": 2.753975678203929,
+      "grad_norm": 0.6429990332626118,
+      "learning_rate": 1e-05,
+      "loss": 0.4435,
+      "step": 920
+    },
+    {
+      "epoch": 2.7839101964452757,
+      "grad_norm": 0.6205622067794777,
+      "learning_rate": 1e-05,
+      "loss": 0.449,
+      "step": 930
+    },
+    {
+      "epoch": 2.8138447146866232,
+      "grad_norm": 0.646284347738926,
+      "learning_rate": 1e-05,
+      "loss": 0.4462,
+      "step": 940
+    },
+    {
+      "epoch": 2.84377923292797,
+      "grad_norm": 0.7300304825605007,
+      "learning_rate": 1e-05,
+      "loss": 0.4475,
+      "step": 950
+    },
+    {
+      "epoch": 2.8737137511693174,
+      "grad_norm": 0.6722791952085628,
+      "learning_rate": 1e-05,
+      "loss": 0.449,
+      "step": 960
+    },
+    {
+      "epoch": 2.903648269410664,
+      "grad_norm": 0.6859554305302392,
+      "learning_rate": 1e-05,
+      "loss": 0.4457,
+      "step": 970
+    },
+    {
+      "epoch": 2.933582787652011,
+      "grad_norm": 0.660936677648392,
+      "learning_rate": 1e-05,
+      "loss": 0.4509,
+      "step": 980
+    },
+    {
+      "epoch": 2.963517305893358,
+      "grad_norm": 0.6479374316532951,
+      "learning_rate": 1e-05,
+      "loss": 0.4526,
+      "step": 990
+    },
+    {
+      "epoch": 2.9934518241347052,
+      "grad_norm": 0.646996698416053,
+      "learning_rate": 1e-05,
+      "loss": 0.4544,
+      "step": 1000
+    },
+    {
+      "epoch": 2.999438727782975,
+      "eval_loss": 0.6807616353034973,
+      "eval_runtime": 515.6952,
+      "eval_samples_per_second": 17.454,
+      "eval_steps_per_second": 0.547,
+      "step": 1002
+    },
+    {
+      "epoch": 2.999438727782975,
+      "step": 1002,
+      "total_flos": 3818092983484416.0,
+      "train_loss": 0.5373929247051894,
+      "train_runtime": 90253.1826,
+      "train_samples_per_second": 5.684,
+      "train_steps_per_second": 0.011
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1002,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3818092983484416.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed