gsmyrnis committed
Commit 45320bf · verified · 1 Parent(s): d32534d

End of training

Files changed (5):
  1. README.md +2 -1
  2. all_results.json +8 -0
  3. train_results.json +8 -0
  4. trainer_state.json +1260 -0
  5. training_loss.png +0 -0
README.md CHANGED
@@ -4,6 +4,7 @@ license: apache-2.0
 base_model: Qwen/Qwen2.5-7B-Instruct
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: llama3-1_8b_multiple_samples_majority_consensus_numina_aime
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 # llama3-1_8b_multiple_samples_majority_consensus_numina_aime
 
-This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) on an unknown dataset.
+This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) on the mlfoundations-dev/multiple_samples_majority_consensus_numina_aime dataset.
 
 ## Model description
 
all_results.json ADDED
@@ -0,0 +1,8 @@
+{
+    "epoch": 2.965909090909091,
+    "total_flos": 156886855483392.0,
+    "train_loss": 0.7380151625337272,
+    "train_runtime": 2697.8392,
+    "train_samples_per_second": 6.228,
+    "train_steps_per_second": 0.064
+}
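The reported throughput fields are all derived from the same run, so they can be cross-checked against each other. A quick sketch (not part of the commit; variable names are just for illustration) showing how the aggregate numbers relate:

```python
# Sanity-check of the figures reported in all_results.json above:
# throughput * runtime should recover the totals the trainer actually ran.
train_runtime = 2697.8392       # seconds
samples_per_second = 6.228
steps_per_second = 0.064

total_samples = samples_per_second * train_runtime
total_steps = steps_per_second * train_runtime

# ~16802 samples seen over ~2.97 epochs, and ~173 optimizer steps,
# consistent with the global_step of 174 recorded in trainer_state.json.
print(round(total_samples), round(total_steps))
```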
train_results.json ADDED
@@ -0,0 +1,8 @@
+{
+    "epoch": 2.965909090909091,
+    "total_flos": 156886855483392.0,
+    "train_loss": 0.7380151625337272,
+    "train_runtime": 2697.8392,
+    "train_samples_per_second": 6.228,
+    "train_steps_per_second": 0.064
+}
trainer_state.json ADDED
@@ -0,0 +1,1260 @@
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.965909090909091,
  "eval_steps": 500,
  "global_step": 174,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.017045454545454544, "grad_norm": 7.22884205408755, "learning_rate": 5.555555555555555e-07, "loss": 1.1159, "step": 1},
    {"epoch": 0.03409090909090909, "grad_norm": 6.86377349527473, "learning_rate": 1.111111111111111e-06, "loss": 1.0455, "step": 2},
    {"epoch": 0.05113636363636364, "grad_norm": 7.199961151138557, "learning_rate": 1.6666666666666667e-06, "loss": 1.1275, "step": 3},
    {"epoch": 0.06818181818181818, "grad_norm": 7.268274547750973, "learning_rate": 2.222222222222222e-06, "loss": 1.1096, "step": 4},
    {"epoch": 0.08522727272727272, "grad_norm": 6.311407588180345, "learning_rate": 2.7777777777777783e-06, "loss": 1.0481, "step": 5},
    {"epoch": 0.10227272727272728, "grad_norm": 5.325136507507456, "learning_rate": 3.3333333333333333e-06, "loss": 1.0745, "step": 6},
    {"epoch": 0.11931818181818182, "grad_norm": 3.204819984360326, "learning_rate": 3.88888888888889e-06, "loss": 0.9787, "step": 7},
    {"epoch": 0.13636363636363635, "grad_norm": 2.6398476108217075, "learning_rate": 4.444444444444444e-06, "loss": 0.9367, "step": 8},
    {"epoch": 0.1534090909090909, "grad_norm": 2.5518759893438645, "learning_rate": 5e-06, "loss": 0.9116, "step": 9},
    {"epoch": 0.17045454545454544, "grad_norm": 5.223827384885404, "learning_rate": 5.555555555555557e-06, "loss": 0.9835, "step": 10},
    {"epoch": 0.1875, "grad_norm": 4.47133774615716, "learning_rate": 6.111111111111112e-06, "loss": 0.9244, "step": 11},
    {"epoch": 0.20454545454545456, "grad_norm": 4.277259377499211, "learning_rate": 6.666666666666667e-06, "loss": 0.9496, "step": 12},
    {"epoch": 0.2215909090909091, "grad_norm": 3.526152904409924, "learning_rate": 7.222222222222223e-06, "loss": 0.8649, "step": 13},
    {"epoch": 0.23863636363636365, "grad_norm": 3.259182755810135, "learning_rate": 7.77777777777778e-06, "loss": 0.882, "step": 14},
    {"epoch": 0.2556818181818182, "grad_norm": 2.657169823270014, "learning_rate": 8.333333333333334e-06, "loss": 0.9532, "step": 15},
    {"epoch": 0.2727272727272727, "grad_norm": 2.2469055281791768, "learning_rate": 8.888888888888888e-06, "loss": 0.8686, "step": 16},
    {"epoch": 0.2897727272727273, "grad_norm": 2.2961013306684492, "learning_rate": 9.444444444444445e-06, "loss": 0.9136, "step": 17},
    {"epoch": 0.3068181818181818, "grad_norm": 2.0321977080614575, "learning_rate": 1e-05, "loss": 0.8197, "step": 18},
    {"epoch": 0.32386363636363635, "grad_norm": 1.5442837234976816, "learning_rate": 9.998986144924253e-06, "loss": 0.7929, "step": 19},
    {"epoch": 0.3409090909090909, "grad_norm": 1.3959797656829187, "learning_rate": 9.995944990857848e-06, "loss": 0.746, "step": 20},
    {"epoch": 0.35795454545454547, "grad_norm": 1.6413702216042307, "learning_rate": 9.990877771116588e-06, "loss": 0.8195, "step": 21},
    {"epoch": 0.375, "grad_norm": 1.1455825686481986, "learning_rate": 9.983786540671052e-06, "loss": 0.7925, "step": 22},
    {"epoch": 0.39204545454545453, "grad_norm": 1.1413214191419678, "learning_rate": 9.974674175313228e-06, "loss": 0.8086, "step": 23},
    {"epoch": 0.4090909090909091, "grad_norm": 1.1533147121145144, "learning_rate": 9.96354437049027e-06, "loss": 0.8001, "step": 24},
    {"epoch": 0.42613636363636365, "grad_norm": 0.9882676326329094, "learning_rate": 9.950401639805822e-06, "loss": 0.868, "step": 25},
    {"epoch": 0.4431818181818182, "grad_norm": 1.028373201431852, "learning_rate": 9.935251313189564e-06, "loss": 0.7855, "step": 26},
    {"epoch": 0.4602272727272727, "grad_norm": 1.049228042355448, "learning_rate": 9.91809953473572e-06, "loss": 0.7744, "step": 27},
    {"epoch": 0.4772727272727273, "grad_norm": 0.8404995559488506, "learning_rate": 9.89895326021134e-06, "loss": 0.7396, "step": 28},
    {"epoch": 0.4943181818181818, "grad_norm": 0.9078468037297426, "learning_rate": 9.87782025423547e-06, "loss": 0.8565, "step": 29},
    {"epoch": 0.5113636363636364, "grad_norm": 0.8508053415926495, "learning_rate": 9.854709087130261e-06, "loss": 0.729, "step": 30},
    {"epoch": 0.5284090909090909, "grad_norm": 0.944676094285407, "learning_rate": 9.829629131445342e-06, "loss": 0.7635, "step": 31},
    {"epoch": 0.5454545454545454, "grad_norm": 0.7058532649010517, "learning_rate": 9.802590558156863e-06, "loss": 0.7878, "step": 32},
    {"epoch": 0.5625, "grad_norm": 0.8621390038765169, "learning_rate": 9.77360433254273e-06, "loss": 0.8099, "step": 33},
    {"epoch": 0.5795454545454546, "grad_norm": 0.8073345495598823, "learning_rate": 9.742682209735727e-06, "loss": 0.7863, "step": 34},
    {"epoch": 0.5965909090909091, "grad_norm": 0.6669987952697678, "learning_rate": 9.709836729956326e-06, "loss": 0.7973, "step": 35},
    {"epoch": 0.6136363636363636, "grad_norm": 0.8486755092214402, "learning_rate": 9.675081213427076e-06, "loss": 0.7459, "step": 36},
    {"epoch": 0.6306818181818182, "grad_norm": 0.7143187741160664, "learning_rate": 9.638429754970715e-06, "loss": 0.7543, "step": 37},
    {"epoch": 0.6477272727272727, "grad_norm": 0.6848079020913995, "learning_rate": 9.599897218294122e-06, "loss": 0.8018, "step": 38},
    {"epoch": 0.6647727272727273, "grad_norm": 0.853749579569145, "learning_rate": 9.55949922996045e-06, "loss": 0.7145, "step": 39},
    {"epoch": 0.6818181818181818, "grad_norm": 0.68372338772216, "learning_rate": 9.517252173051912e-06, "loss": 0.764, "step": 40},
    {"epoch": 0.6988636363636364, "grad_norm": 0.6422340925591769, "learning_rate": 9.473173180525737e-06, "loss": 0.7332, "step": 41},
    {"epoch": 0.7159090909090909, "grad_norm": 0.7261628443441004, "learning_rate": 9.427280128266049e-06, "loss": 0.7428, "step": 42},
    {"epoch": 0.7329545454545454, "grad_norm": 0.7702313145051908, "learning_rate": 9.37959162783444e-06, "loss": 0.7859, "step": 43},
    {"epoch": 0.75, "grad_norm": 0.6747045160311267, "learning_rate": 9.330127018922195e-06, "loss": 0.824, "step": 44},
    {"epoch": 0.7670454545454546, "grad_norm": 0.7223692043597745, "learning_rate": 9.278906361507238e-06, "loss": 0.7605, "step": 45},
    {"epoch": 0.7840909090909091, "grad_norm": 0.578610691292999, "learning_rate": 9.225950427718974e-06, "loss": 0.7546, "step": 46},
    {"epoch": 0.8011363636363636, "grad_norm": 0.7399134643017758, "learning_rate": 9.171280693414307e-06, "loss": 0.7068, "step": 47},
    {"epoch": 0.8181818181818182, "grad_norm": 0.6651749348569863, "learning_rate": 9.114919329468283e-06, "loss": 0.7707, "step": 48},
    {"epoch": 0.8352272727272727, "grad_norm": 0.6681727962907063, "learning_rate": 9.056889192782865e-06, "loss": 0.7423, "step": 49},
    {"epoch": 0.8522727272727273, "grad_norm": 0.5989175339145305, "learning_rate": 8.997213817017508e-06, "loss": 0.7367, "step": 50},
    {"epoch": 0.8693181818181818, "grad_norm": 0.6415631156829461, "learning_rate": 8.935917403045251e-06, "loss": 0.7742, "step": 51},
    {"epoch": 0.8863636363636364, "grad_norm": 0.7053287962999208, "learning_rate": 8.873024809138272e-06, "loss": 0.8004, "step": 52},
    {"epoch": 0.9034090909090909, "grad_norm": 0.6616877102090415, "learning_rate": 8.808561540886796e-06, "loss": 0.8331, "step": 53},
    {"epoch": 0.9204545454545454, "grad_norm": 0.6686807030220209, "learning_rate": 8.742553740855507e-06, "loss": 0.7622, "step": 54},
    {"epoch": 0.9375, "grad_norm": 0.6207828741528636, "learning_rate": 8.675028177981643e-06, "loss": 0.7329, "step": 55},
    {"epoch": 0.9545454545454546, "grad_norm": 0.5650546394483587, "learning_rate": 8.606012236719073e-06, "loss": 0.7638, "step": 56},
    {"epoch": 0.9715909090909091, "grad_norm": 0.6124784723851092, "learning_rate": 8.535533905932739e-06, "loss": 0.7341, "step": 57},
    {"epoch": 0.9886363636363636, "grad_norm": 0.6052340709182291, "learning_rate": 8.463621767547998e-06, "loss": 0.7457, "step": 58},
    {"epoch": 1.0056818181818181, "grad_norm": 0.8052906258453199, "learning_rate": 8.390304984959455e-06, "loss": 0.939, "step": 59},
    {"epoch": 1.0227272727272727, "grad_norm": 0.6376926058729759, "learning_rate": 8.315613291203977e-06, "loss": 0.7367, "step": 60},
    {"epoch": 1.0397727272727273, "grad_norm": 0.6369503379591582, "learning_rate": 8.239576976902694e-06, "loss": 0.7449, "step": 61},
    {"epoch": 1.0568181818181819, "grad_norm": 0.6969242917426808, "learning_rate": 8.162226877976886e-06, "loss": 0.7259, "step": 62},
    {"epoch": 1.0738636363636365, "grad_norm": 0.6530544217263711, "learning_rate": 8.083594363142717e-06, "loss": 0.646, "step": 63},
    {"epoch": 1.0909090909090908, "grad_norm": 0.5900438510967748, "learning_rate": 8.003711321189895e-06, "loss": 0.6842, "step": 64},
    {"epoch": 1.1079545454545454, "grad_norm": 0.6284392353104462, "learning_rate": 7.922610148049445e-06, "loss": 0.8002, "step": 65},
    {"epoch": 1.125, "grad_norm": 0.6494425439689134, "learning_rate": 7.84032373365578e-06, "loss": 0.638, "step": 66},
    {"epoch": 1.1420454545454546, "grad_norm": 0.5499044253465919, "learning_rate": 7.75688544860846e-06, "loss": 0.6289, "step": 67},
    {"epoch": 1.1590909090909092, "grad_norm": 0.6392668714240907, "learning_rate": 7.672329130639007e-06, "loss": 0.7572, "step": 68},
    {"epoch": 1.1761363636363638, "grad_norm": 0.673210138953056, "learning_rate": 7.586689070888284e-06, "loss": 0.6524, "step": 69},
    {"epoch": 1.1931818181818181, "grad_norm": 0.6410447658297229, "learning_rate": 7.500000000000001e-06, "loss": 0.7362, "step": 70},
    {"epoch": 1.2102272727272727, "grad_norm": 0.6785817657314511, "learning_rate": 7.412297074035968e-06, "loss": 0.7131, "step": 71},
    {"epoch": 1.2272727272727273, "grad_norm": 0.861453272486042, "learning_rate": 7.323615860218844e-06, "loss": 0.7169, "step": 72},
    {"epoch": 1.2443181818181819, "grad_norm": 0.611947189690942, "learning_rate": 7.2339923225081296e-06, "loss": 0.7185, "step": 73},
    {"epoch": 1.2613636363636362, "grad_norm": 0.6501253363478586, "learning_rate": 7.143462807015271e-06, "loss": 0.6616, "step": 74},
    {"epoch": 1.2784090909090908, "grad_norm": 0.715695504058172, "learning_rate": 7.052064027263785e-06, "loss": 0.7005, "step": 75},
    {"epoch": 1.2954545454545454, "grad_norm": 0.7073752700474762, "learning_rate": 6.959833049300376e-06, "loss": 0.7879, "step": 76},
    {"epoch": 1.3125, "grad_norm": 0.49068564720807656, "learning_rate": 6.8668072766631054e-06, "loss": 0.6314, "step": 77},
    {"epoch": 1.3295454545454546, "grad_norm": 0.7803765124238339, "learning_rate": 6.773024435212678e-06, "loss": 0.7851, "step": 78},
    {"epoch": 1.3465909090909092, "grad_norm": 0.5942335579737438, "learning_rate": 6.678522557833025e-06, "loss": 0.689, "step": 79},
    {"epoch": 1.3636363636363638, "grad_norm": 0.5176233208142944, "learning_rate": 6.583339969007364e-06, "loss": 0.6542, "step": 80},
    {"epoch": 1.3806818181818181, "grad_norm": 0.607753727654389, "learning_rate": 6.487515269276015e-06, "loss": 0.7534, "step": 81},
    {"epoch": 1.3977272727272727, "grad_norm": 0.5409054512847689, "learning_rate": 6.391087319582264e-06, "loss": 0.6305, "step": 82},
    {"epoch": 1.4147727272727273, "grad_norm": 0.5071385453642557, "learning_rate": 6.294095225512604e-06, "loss": 0.7589, "step": 83},
    {"epoch": 1.4318181818181819, "grad_norm": 0.5143547906358554, "learning_rate": 6.1965783214377895e-06, "loss": 0.7914, "step": 84},
    {"epoch": 1.4488636363636362, "grad_norm": 0.5071795878844763, "learning_rate": 6.0985761545610865e-06, "loss": 0.6643, "step": 85},
    {"epoch": 1.4659090909090908, "grad_norm": 0.5505402808974502, "learning_rate": 6.000128468880223e-06, "loss": 0.7187, "step": 86},
    {"epoch": 1.4829545454545454, "grad_norm": 0.5577932417601431, "learning_rate": 5.90127518906953e-06, "loss": 0.6295, "step": 87},
    {"epoch": 1.5, "grad_norm": 0.6475715595827016, "learning_rate": 5.8020564042888015e-06, "loss": 0.7056, "step": 88},
    {"epoch": 1.5170454545454546, "grad_norm": 0.5835532211565827, "learning_rate": 5.7025123519254644e-06, "loss": 0.7256, "step": 89},
    {"epoch": 1.5340909090909092, "grad_norm": 0.5415921745872375, "learning_rate": 5.6026834012766155e-06, "loss": 0.7471, "step": 90},
    {"epoch": 1.5511363636363638, "grad_norm": 0.5833270165675741, "learning_rate": 5.502610037177586e-06, "loss": 0.7633, "step": 91},
    {"epoch": 1.5681818181818183, "grad_norm": 0.560199988553917, "learning_rate": 5.402332843583631e-06, "loss": 0.6595, "step": 92},
    {"epoch": 1.5852272727272727, "grad_norm": 0.4939855321494504, "learning_rate": 5.301892487111431e-06, "loss": 0.6958, "step": 93},
    {"epoch": 1.6022727272727273, "grad_norm": 0.5378643220876148, "learning_rate": 5.201329700547077e-06, "loss": 0.6934, "step": 94},
    {"epoch": 1.6193181818181817, "grad_norm": 0.5420525607884883, "learning_rate": 5.100685266327202e-06, "loss": 0.6365, "step": 95},
    {"epoch": 1.6363636363636362, "grad_norm": 0.4520550820837634, "learning_rate": 5e-06, "loss": 0.6338, "step": 96},
    {"epoch": 1.6534090909090908, "grad_norm": 0.5117553753106799, "learning_rate": 4.8993147336728e-06, "loss": 0.7532, "step": 97},
    {"epoch": 1.6704545454545454, "grad_norm": 0.5106162671673076, "learning_rate": 4.798670299452926e-06, "loss": 0.7011, "step": 98},
    {"epoch": 1.6875, "grad_norm": 0.47075334035417565, "learning_rate": 4.69810751288857e-06, "loss": 0.6379, "step": 99},
    {"epoch": 1.7045454545454546, "grad_norm": 0.5343852849626032, "learning_rate": 4.597667156416371e-06, "loss": 0.7058, "step": 100},
    {"epoch": 1.7215909090909092, "grad_norm": 0.5025252513851014, "learning_rate": 4.497389962822416e-06, "loss": 0.7518, "step": 101},
    {"epoch": 1.7386363636363638, "grad_norm": 0.4973905372002931, "learning_rate": 4.397316598723385e-06, "loss": 0.7153, "step": 102},
    {"epoch": 1.7556818181818183, "grad_norm": 0.46247569289594587, "learning_rate": 4.297487648074538e-06, "loss": 0.6349, "step": 103},
    {"epoch": 1.7727272727272727, "grad_norm": 0.48340291588569545, "learning_rate": 4.1979435957111984e-06, "loss": 0.7174, "step": 104},
    {"epoch": 1.7897727272727273, "grad_norm": 0.45562405643253756, "learning_rate": 4.098724810930472e-06, "loss": 0.6847, "step": 105},
    {"epoch": 1.8068181818181817, "grad_norm": 0.47781922446221436, "learning_rate": 3.999871531119779e-06, "loss": 0.7162, "step": 106},
    {"epoch": 1.8238636363636362, "grad_norm": 0.4315069745006211, "learning_rate": 3.901423845438916e-06, "loss": 0.6691, "step": 107},
    {"epoch": 1.8409090909090908, "grad_norm": 0.47883228124204524, "learning_rate": 3.803421678562213e-06, "loss": 0.7257, "step": 108},
    {"epoch": 1.8579545454545454, "grad_norm": 0.5320184023237785, "learning_rate": 3.705904774487396e-06, "loss": 0.7532, "step": 109},
    {"epoch": 1.875, "grad_norm": 0.4269725257899147, "learning_rate": 3.6089126804177373e-06, "loss": 0.6863, "step": 110},
    {"epoch": 1.8920454545454546, "grad_norm": 0.4583005565763506, "learning_rate": 3.5124847307239863e-06, "loss": 0.7502, "step": 111},
    {"epoch": 1.9090909090909092, "grad_norm": 0.4486844352174395, "learning_rate": 3.416660030992639e-06, "loss": 0.6897, "step": 112},
    {"epoch": 1.9261363636363638, "grad_norm": 0.49053197217486366, "learning_rate": 3.3214774421669777e-06, "loss": 0.698, "step": 113},
    {"epoch": 1.9431818181818183, "grad_norm": 0.5128637973645126, "learning_rate": 3.226975564787322e-06, "loss": 0.6907, "step": 114},
    {"epoch": 1.9602272727272727, "grad_norm": 0.478047879056811, "learning_rate": 3.1331927233368954e-06, "loss": 0.7661, "step": 115},
    {"epoch": 1.9772727272727273, "grad_norm": 0.4387651831852447, "learning_rate": 3.040166950699626e-06, "loss": 0.6683, "step": 116},
    {"epoch": 1.9943181818181817, "grad_norm": 0.5998562328424022, "learning_rate": 2.947935972736217e-06, "loss": 0.8983, "step": 117},
    {"epoch": 2.0113636363636362, "grad_norm": 0.5630847402650452, "learning_rate": 2.8565371929847286e-06, "loss": 0.6758, "step": 118},
    {"epoch": 2.028409090909091, "grad_norm": 0.4489743674974812, "learning_rate": 2.766007677491871e-06, "loss": 0.6843, "step": 119},
    {"epoch": 2.0454545454545454, "grad_norm": 0.4457573392562645, "learning_rate": 2.6763841397811576e-06, "loss": 0.6327, "step": 120},
    {"epoch": 2.0625, "grad_norm": 0.5634916203600439, "learning_rate": 2.587702925964034e-06, "loss": 0.6627, "step": 121},
    {"epoch": 2.0795454545454546, "grad_norm": 0.4707269296382247, "learning_rate": 2.5000000000000015e-06, "loss": 0.7272, "step": 122},
    {"epoch": 2.096590909090909, "grad_norm": 0.443373403354557, "learning_rate": 2.4133109291117156e-06, "loss": 0.6195, "step": 123},
    {"epoch": 2.1136363636363638, "grad_norm": 0.46908613730040055, "learning_rate": 2.3276708693609947e-06, "loss": 0.5704, "step": 124},
    {"epoch": 2.1306818181818183, "grad_norm": 0.4105495278728908, "learning_rate": 2.243114551391542e-06, "loss": 0.711, "step": 125},
    {"epoch": 2.147727272727273, "grad_norm": 0.537311901769593, "learning_rate": 2.159676266344222e-06, "loss": 0.7541, "step": 126},
    {"epoch": 2.164772727272727, "grad_norm": 0.5468018420551715, "learning_rate": 2.077389851950557e-06, "loss": 0.5964, "step": 127},
    {"epoch": 2.1818181818181817, "grad_norm": 0.5124295355078099, "learning_rate": 1.996288678810105e-06, "loss": 0.6619, "step": 128},
    {"epoch": 2.1988636363636362, "grad_norm": 0.4715818069175043, "learning_rate": 1.9164056368572847e-06, "loss": 0.6833, "step": 129},
    {"epoch": 2.215909090909091, "grad_norm": 0.48951497017695567, "learning_rate": 1.8377731220231144e-06, "loss": 0.6855, "step": 130},
    {"epoch": 2.2329545454545454, "grad_norm": 0.42265291988632153, "learning_rate": 1.7604230230973068e-06, "loss": 0.6424, "step": 131},
    {"epoch": 2.25, "grad_norm": 0.4650605098758252, "learning_rate": 1.6843867087960252e-06, "loss": 0.6231, "step": 132},
    {"epoch": 2.2670454545454546, "grad_norm": 0.4143527777268036, "learning_rate": 1.6096950150405454e-06, "loss": 0.5598, "step": 133},
    {"epoch": 2.284090909090909, "grad_norm": 0.44107050291230937, "learning_rate": 1.5363782324520033e-06, "loss": 0.7221, "step": 134},
    {"epoch": 2.3011363636363638, "grad_norm": 0.445778553889928, "learning_rate": 1.4644660940672628e-06, "loss": 0.6413, "step": 135},
    {"epoch": 2.3181818181818183, "grad_norm": 0.4187405257211714, "learning_rate": 1.3939877632809279e-06, "loss": 0.6992, "step": 136},
    {"epoch": 2.3352272727272725, "grad_norm": 0.4350630455961759, "learning_rate": 1.3249718220183583e-06, "loss": 0.6878, "step": 137},
    {"epoch": 2.3522727272727275, "grad_norm": 0.393861708986206, "learning_rate": 1.257446259144494e-06, "loss": 0.6385, "step": 138},
    {"epoch": 2.3693181818181817, "grad_norm": 0.42310655150402887, "learning_rate": 1.1914384591132045e-06, "loss": 0.7421, "step": 139},
    {"epoch": 2.3863636363636362, "grad_norm": 0.40172644758715986, "learning_rate": 1.1269751908617277e-06, "loss": 0.647, "step": 140},
    {"epoch": 2.403409090909091, "grad_norm": 0.4418275610345151, "learning_rate": 1.0640825969547498e-06, "loss": 0.6702, "step": 141},
    {"epoch": 2.4204545454545454, "grad_norm": 0.3802094194842585, "learning_rate": 1.0027861829824953e-06, "loss": 0.5888, "step": 142},
    {"epoch": 2.4375, "grad_norm": 0.39698478453376557, "learning_rate": 9.431108072171346e-07, "loss": 0.7405, "step": 143},
    {"epoch": 2.4545454545454546, "grad_norm": 0.4302235176734781, "learning_rate": 8.850806705317183e-07, "loss": 0.6819, "step": 144},
    {"epoch": 2.471590909090909, "grad_norm": 0.4055826304794628, "learning_rate": 8.287193065856936e-07, "loss": 0.6332, "step": 145},
    {"epoch": 2.4886363636363638, "grad_norm": 0.4307108144646676, "learning_rate": 7.740495722810271e-07, "loss": 0.676, "step": 146},
    {"epoch": 2.5056818181818183, "grad_norm": 0.4295524075776934, "learning_rate": 7.210936384927631e-07, "loss": 0.7018, "step": 147},
    {"epoch": 2.5227272727272725, "grad_norm": 0.3860805358590195, "learning_rate": 6.698729810778065e-07, "loss": 0.6894, "step": 148},
    {"epoch": 2.5397727272727275, "grad_norm": 0.4334053053827782, "learning_rate": 6.204083721655607e-07, "loss": 0.645, "step": 149},
    {"epoch": 2.5568181818181817, "grad_norm": 0.4287547792874842, "learning_rate": 5.727198717339511e-07, "loss": 0.7111, "step": 150},
    {"epoch": 2.5738636363636362, "grad_norm": 0.3887625011714418, "learning_rate": 5.268268194742638e-07, "loss": 0.7216, "step": 151},
    {"epoch": 2.590909090909091, "grad_norm": 0.3798899338475208, "learning_rate": 4.827478269480895e-07, "loss": 0.621, "step": 152},
    {"epoch": 2.6079545454545454, "grad_norm": 0.45053111754260017, "learning_rate": 4.405007700395497e-07, "loss": 0.7521, "step": 153},
    {"epoch": 2.625, "grad_norm": 0.377645018921979, "learning_rate": 4.001027817058789e-07, "loss": 0.6145, "step": 154},
    {"epoch": 2.6420454545454546, "grad_norm": 0.3776664317185743, "learning_rate": 3.615702450292857e-07, "loss": 0.6278, "step": 155},
    {"epoch": 2.659090909090909, "grad_norm": 0.4305890271008056, "learning_rate": 3.2491878657292643e-07, "loss": 0.7472, "step": 156},
    {"epoch": 2.6761363636363638, "grad_norm": 0.40843066342237466, "learning_rate": 2.901632700436757e-07, "loss": 0.6258, "step": 157},
    {"epoch": 2.6931818181818183, "grad_norm": 0.3956427977171968, "learning_rate": 2.573177902642726e-07, "loss": 0.6839, "step": 158},
    {"epoch": 2.7102272727272725, "grad_norm": 0.41431945926782104, "learning_rate": 2.2639566745727203e-07, "loss": 0.6734, "step": 159},
    {"epoch": 2.7272727272727275, "grad_norm": 0.3818344903770187, "learning_rate": 1.9740944184313882e-07, "loss": 0.6367, "step": 160},
    {"epoch": 2.7443181818181817, "grad_norm": 0.38077075132026156, "learning_rate": 1.7037086855465902e-07, "loss": 0.632, "step": 161},
    {"epoch": 2.7613636363636362, "grad_norm": 0.39006091928915276, "learning_rate": 1.4529091286973994e-07, "loss": 0.6924, "step": 162},
    {"epoch": 2.778409090909091, "grad_norm": 0.38840487543638486, "learning_rate": 1.2217974576453072e-07, "loss": 0.7466, "step": 163},
    {"epoch": 2.7954545454545454, "grad_norm": 0.45834263911645595, "learning_rate": 1.0104673978866164e-07, "loss": 0.6057, "step": 164},
    {"epoch": 2.8125, "grad_norm": 0.38210612410412104, "learning_rate": 8.190046526428241e-08, "loss": 0.663, "step": 165},
    {"epoch": 2.8295454545454546, "grad_norm": 0.38478210491106063, "learning_rate": 6.474868681043578e-08, "loss": 0.6762, "step": 166},
    {"epoch": 2.846590909090909, "grad_norm": 0.3720363705810689, "learning_rate": 4.959836019417963e-08,
1177
+ "loss": 0.6105,
1178
+ "step": 167
1179
+ },
1180
+ {
1181
+ "epoch": 2.8636363636363638,
1182
+ "grad_norm": 0.4440736370045247,
1183
+ "learning_rate": 3.645562950973014e-08,
1184
+ "loss": 0.7567,
1185
+ "step": 168
1186
+ },
1187
+ {
1188
+ "epoch": 2.8806818181818183,
1189
+ "grad_norm": 0.381304638800026,
1190
+ "learning_rate": 2.5325824686772138e-08,
1191
+ "loss": 0.6174,
1192
+ "step": 169
1193
+ },
1194
+ {
1195
+ "epoch": 2.8977272727272725,
1196
+ "grad_norm": 0.4156474533014312,
1197
+ "learning_rate": 1.6213459328950355e-08,
1198
+ "loss": 0.5918,
1199
+ "step": 170
1200
+ },
1201
+ {
1202
+ "epoch": 2.9147727272727275,
1203
+ "grad_norm": 0.4459243063651079,
1204
+ "learning_rate": 9.12222888341252e-09,
1205
+ "loss": 0.693,
1206
+ "step": 171
1207
+ },
1208
+ {
1209
+ "epoch": 2.9318181818181817,
1210
+ "grad_norm": 0.3705961074102094,
1211
+ "learning_rate": 4.055009142152066e-09,
1212
+ "loss": 0.6132,
1213
+ "step": 172
1214
+ },
1215
+ {
1216
+ "epoch": 2.9488636363636362,
1217
+ "grad_norm": 0.3768879535202281,
1218
+ "learning_rate": 1.0138550757493592e-09,
1219
+ "loss": 0.6515,
1220
+ "step": 173
1221
+ },
1222
+ {
1223
+ "epoch": 2.965909090909091,
1224
+ "grad_norm": 0.39814357318394317,
1225
+ "learning_rate": 0.0,
1226
+ "loss": 0.7362,
1227
+ "step": 174
1228
+ },
1229
+ {
1230
+ "epoch": 2.965909090909091,
1231
+ "step": 174,
1232
+ "total_flos": 156886855483392.0,
1233
+ "train_loss": 0.7380151625337272,
1234
+ "train_runtime": 2697.8392,
1235
+ "train_samples_per_second": 6.228,
1236
+ "train_steps_per_second": 0.064
1237
+ }
1238
+ ],
1239
+ "logging_steps": 1,
1240
+ "max_steps": 174,
1241
+ "num_input_tokens_seen": 0,
1242
+ "num_train_epochs": 3,
1243
+ "save_steps": 500,
1244
+ "stateful_callbacks": {
1245
+ "TrainerControl": {
1246
+ "args": {
1247
+ "should_epoch_stop": false,
1248
+ "should_evaluate": false,
1249
+ "should_log": false,
1250
+ "should_save": true,
1251
+ "should_training_stop": true
1252
+ },
1253
+ "attributes": {}
1254
+ }
1255
+ },
1256
+ "total_flos": 156886855483392.0,
1257
+ "train_batch_size": 1,
1258
+ "trial_name": null,
1259
+ "trial_params": null
1260
+ }
training_loss.png ADDED