Training in progress, step 50, checkpoint

Browse files

Files changed (8) hide show

last-checkpoint/README.md +3 -3
last-checkpoint/adapter_config.json +9 -9
last-checkpoint/adapter_model.safetensors +1 -1
last-checkpoint/optimizer.pt +2 -2
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +209 -1641
last-checkpoint/training_args.bin +2 -2

last-checkpoint/README.md CHANGED Viewed

@@ -1,9 +1,9 @@
 ---
-base_model: unsloth/Nemotron-3-Nano-30B-A3B
 library_name: peft
 pipeline_tag: text-generation
 tags:
-- base_model:adapter:unsloth/Nemotron-3-Nano-30B-A3B
 - lora
 - sft
 - transformers
@@ -207,4 +207,4 @@ Carbon emissions can be estimated using the [Machine Learning Impact calculator]
 [More Information Needed]
 ### Framework versions
-- PEFT 0.18.0

 ---
+base_model: owenisas/nemotron-3-nano-reasoning
 library_name: peft
 pipeline_tag: text-generation
 tags:
+- base_model:adapter:owenisas/nemotron-3-nano-reasoning
 - lora
 - sft
 - transformers
 [More Information Needed]
 ### Framework versions
+- PEFT 0.18.1

last-checkpoint/adapter_config.json CHANGED Viewed

@@ -4,10 +4,10 @@
   "arrow_config": null,
   "auto_mapping": {
     "base_model_class": "NemotronHForCausalLM",
-    "parent_library": "transformers_modules.unsloth.Nemotron-3-Nano-30B-A3B.b93ba8494bf95b9e5dd7aed6b5d07517db195743.modeling_nemotron_h",
     "unsloth_fixed": true
   },
-  "base_model_name_or_path": "unsloth/Nemotron-3-Nano-30B-A3B",
   "bias": "none",
   "corda_config": null,
   "ensure_weight_tying": false,
@@ -27,21 +27,21 @@
   "megatron_core": "megatron.core",
   "modules_to_save": null,
   "peft_type": "LORA",
-  "peft_version": "0.18.0",
   "qalora_group_size": 16,
   "r": 32,
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
     "k_proj",
-    "down_proj",
-    "gate_proj",
     "out_proj",
-    "o_proj",
-    "q_proj",
     "up_proj",
-    "in_proj",
-    "v_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",

   "arrow_config": null,
   "auto_mapping": {
     "base_model_class": "NemotronHForCausalLM",
+    "parent_library": "transformers_modules.owenisas.nemotron-3-nano-reasoning.c06798b01704b3d322954056e8de8bf6cae11e38.modeling_nemotron_h",
     "unsloth_fixed": true
   },
+  "base_model_name_or_path": "owenisas/nemotron-3-nano-reasoning",
   "bias": "none",
   "corda_config": null,
   "ensure_weight_tying": false,
   "megatron_core": "megatron.core",
   "modules_to_save": null,
   "peft_type": "LORA",
+  "peft_version": "0.18.1",
   "qalora_group_size": 16,
   "r": 32,
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
+    "in_proj",
+    "q_proj",
     "k_proj",
     "out_proj",
     "up_proj",
+    "down_proj",
+    "gate_proj",
+    "v_proj",
+    "o_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",

last-checkpoint/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5c2a5ff429f650539cd5c6ad9ea7f9569fd24863056cad28726290ed985d9fea
 size 3537299144

 version https://git-lfs.github.com/spec/v1
+oid sha256:1c302dc1f7d4b868ed2fec7fb599c56ab89a9be3b061d10a09c33f91bc884118
 size 3537299144

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:85bb5e2364254f0b84ca558a536ce2983868014e01a90e171fbe557dd01d62f6
-size 1830175435

 version https://git-lfs.github.com/spec/v1
+oid sha256:c82c2ea846bada76c6987cfb10fc7217cfd00b4b82d0021a138e9add209aaec9
+size 1798933287

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d895ccae2b55d4ea213653ca4a80d00de131463e105716eab1b7022906f260bf
 size 14645

 version https://git-lfs.github.com/spec/v1
+oid sha256:9ccb8eeb935749fc43744e0a5eeacdf6f0f10253be15266a497cbca0ffaa2573
 size 14645

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5b7f2a236446ef1e40ceb20dfad68baf17d74c3d4a45e7640820b9ddfc1c6c59
 size 1465

 version https://git-lfs.github.com/spec/v1
+oid sha256:83429aff07094f43f6ae84f250d5d91c95fca2dfaf4ecddce133674cbbfe1442
 size 1465

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -2,1806 +2,374 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.9389671361502347,
   "eval_steps": 50,
-  "global_step": 250,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.003755868544600939,
-      "grad_norm": 0.7058172225952148,
       "learning_rate": 0.0,
-      "loss": 4.8421,
       "step": 1
     },
     {
-      "epoch": 0.007511737089201878,
-      "grad_norm": 0.6402199268341064,
-      "learning_rate": 4e-05,
-      "loss": 4.9285,
       "step": 2
     },
     {
-      "epoch": 0.011267605633802818,
-      "grad_norm": 0.71592777967453,
-      "learning_rate": 8e-05,
-      "loss": 5.0171,
       "step": 3
     },
     {
-      "epoch": 0.015023474178403756,
-      "grad_norm": 0.5461985468864441,
-      "learning_rate": 0.00012,
-      "loss": 4.1572,
       "step": 4
     },
     {
-      "epoch": 0.018779342723004695,
-      "grad_norm": 0.6180109977722168,
-      "learning_rate": 0.00016,
-      "loss": 4.901,
       "step": 5
     },
     {
-      "epoch": 0.022535211267605635,
-      "grad_norm": 0.6709616184234619,
-      "learning_rate": 0.0002,
-      "loss": 4.7065,
       "step": 6
     },
     {
-      "epoch": 0.02629107981220657,
-      "grad_norm": 0.7794731855392456,
-      "learning_rate": 0.00019999281110792807,
-      "loss": 3.947,
       "step": 7
     },
     {
-      "epoch": 0.03004694835680751,
-      "grad_norm": 0.7531212568283081,
-      "learning_rate": 0.0001999712454653157,
-      "loss": 4.6737,
       "step": 8
     },
     {
-      "epoch": 0.03380281690140845,
-      "grad_norm": 0.7358666062355042,
-      "learning_rate": 0.00019993530617282436,
-      "loss": 5.2637,
       "step": 9
     },
     {
-      "epoch": 0.03755868544600939,
-      "grad_norm": 0.676575243473053,
-      "learning_rate": 0.00019988499839772804,
-      "loss": 5.189,
       "step": 10
     },
     {
-      "epoch": 0.04131455399061033,
-      "grad_norm": 0.6564130187034607,
-      "learning_rate": 0.00019982032937316998,
-      "loss": 4.7153,
       "step": 11
     },
     {
-      "epoch": 0.04507042253521127,
-      "grad_norm": 0.5907655954360962,
-      "learning_rate": 0.000199741308397123,
-      "loss": 4.5261,
       "step": 12
     },
     {
-      "epoch": 0.048826291079812206,
-      "grad_norm": 0.5551841855049133,
-      "learning_rate": 0.0001996479468310524,
-      "loss": 4.1933,
       "step": 13
     },
     {
-      "epoch": 0.05258215962441314,
-      "grad_norm": 0.5221297740936279,
-      "learning_rate": 0.00019954025809828266,
-      "loss": 4.1989,
       "step": 14
     },
     {
-      "epoch": 0.056338028169014086,
-      "grad_norm": 0.6120548844337463,
-      "learning_rate": 0.0001994182576820673,
-      "loss": 4.756,
       "step": 15
     },
     {
-      "epoch": 0.06009389671361502,
-      "grad_norm": 0.5957234501838684,
-      "learning_rate": 0.00019928196312336285,
-      "loss": 5.0855,
       "step": 16
     },
     {
-      "epoch": 0.06384976525821597,
-      "grad_norm": 0.47647541761398315,
-      "learning_rate": 0.00019913139401830674,
-      "loss": 4.0117,
       "step": 17
     },
     {
-      "epoch": 0.0676056338028169,
-      "grad_norm": 0.4548296630382538,
-      "learning_rate": 0.0001989665720153999,
-      "loss": 3.7509,
       "step": 18
     },
     {
-      "epoch": 0.07136150234741784,
-      "grad_norm": 0.561380922794342,
-      "learning_rate": 0.0001987875208123941,
-      "loss": 4.7493,
       "step": 19
     },
     {
-      "epoch": 0.07511737089201878,
-      "grad_norm": 0.5130324959754944,
-      "learning_rate": 0.00019859426615288488,
-      "loss": 4.0675,
       "step": 20
     },
     {
-      "epoch": 0.07887323943661972,
-      "grad_norm": 0.509790301322937,
-      "learning_rate": 0.00019838683582260993,
-      "loss": 4.1663,
       "step": 21
     },
     {
-      "epoch": 0.08262910798122065,
-      "grad_norm": 0.49768951535224915,
-      "learning_rate": 0.00019816525964545448,
-      "loss": 4.2581,
       "step": 22
     },
     {
-      "epoch": 0.0863849765258216,
-      "grad_norm": 0.4733114242553711,
-      "learning_rate": 0.00019792956947916292,
-      "loss": 4.2086,
       "step": 23
     },
     {
-      "epoch": 0.09014084507042254,
-      "grad_norm": 0.5826513767242432,
-      "learning_rate": 0.00019767979921075866,
-      "loss": 4.9181,
       "step": 24
     },
     {
-      "epoch": 0.09389671361502347,
-      "grad_norm": 0.5314785242080688,
-      "learning_rate": 0.00019741598475167175,
-      "loss": 4.2111,
       "step": 25
     },
     {
-      "epoch": 0.09765258215962441,
-      "grad_norm": 0.5262457728385925,
-      "learning_rate": 0.0001971381640325756,
-      "loss": 3.9379,
       "step": 26
     },
     {
-      "epoch": 0.10140845070422536,
-      "grad_norm": 0.4913003742694855,
-      "learning_rate": 0.00019684637699793358,
-      "loss": 3.9544,
       "step": 27
     },
     {
-      "epoch": 0.10516431924882629,
-      "grad_norm": 0.5091902613639832,
-      "learning_rate": 0.00019654066560025567,
-      "loss": 4.4432,
       "step": 28
     },
     {
-      "epoch": 0.10892018779342723,
-      "grad_norm": 0.474369615316391,
-      "learning_rate": 0.00019622107379406667,
-      "loss": 3.8418,
       "step": 29
     },
     {
-      "epoch": 0.11267605633802817,
-      "grad_norm": 0.5503483414649963,
-      "learning_rate": 0.00019588764752958668,
-      "loss": 4.5962,
       "step": 30
     },
     {
-      "epoch": 0.11643192488262911,
-      "grad_norm": 0.48790082335472107,
-      "learning_rate": 0.0001955404347461243,
-      "loss": 4.1689,
       "step": 31
     },
     {
-      "epoch": 0.12018779342723004,
-      "grad_norm": 0.5871917605400085,
-      "learning_rate": 0.000195179485365184,
-      "loss": 4.1855,
       "step": 32
     },
     {
-      "epoch": 0.12394366197183099,
-      "grad_norm": 0.5197378993034363,
-      "learning_rate": 0.00019480485128328868,
-      "loss": 4.2648,
       "step": 33
     },
     {
-      "epoch": 0.12769953051643193,
-      "grad_norm": 0.5159541964530945,
-      "learning_rate": 0.00019441658636451794,
-      "loss": 4.4084,
       "step": 34
     },
     {
-      "epoch": 0.13145539906103287,
-      "grad_norm": 0.49587559700012207,
-      "learning_rate": 0.0001940147464327637,
-      "loss": 4.0163,
       "step": 35
     },
     {
-      "epoch": 0.1352112676056338,
-      "grad_norm": 0.46864601969718933,
-      "learning_rate": 0.000193599389263704,
-      "loss": 3.7834,
       "step": 36
     },
     {
-      "epoch": 0.13896713615023473,
-      "grad_norm": 0.4787595272064209,
-      "learning_rate": 0.000193170574576496,
-      "loss": 3.976,
       "step": 37
     },
     {
-      "epoch": 0.14272300469483568,
-      "grad_norm": 0.5712424516677856,
-      "learning_rate": 0.0001927283640251898,
-      "loss": 4.2897,
       "step": 38
     },
     {
-      "epoch": 0.14647887323943662,
-      "grad_norm": 0.46865108609199524,
-      "learning_rate": 0.00019227282118986394,
-      "loss": 3.9205,
       "step": 39
     },
     {
-      "epoch": 0.15023474178403756,
-      "grad_norm": 0.5147837400436401,
-      "learning_rate": 0.00019180401156748396,
-      "loss": 3.8292,
       "step": 40
     },
     {
-      "epoch": 0.1539906103286385,
-      "grad_norm": 0.5160613656044006,
-      "learning_rate": 0.0001913220025624854,
-      "loss": 4.3181,
       "step": 41
     },
     {
-      "epoch": 0.15774647887323945,
-      "grad_norm": 0.5188874006271362,
-      "learning_rate": 0.00019082686347708254,
-      "loss": 4.1479,
       "step": 42
     },
     {
-      "epoch": 0.16150234741784036,
-      "grad_norm": 0.5262385606765747,
-      "learning_rate": 0.00019031866550130438,
-      "loss": 4.3483,
       "step": 43
     },
     {
-      "epoch": 0.1652582159624413,
-      "grad_norm": 0.5749176144599915,
-      "learning_rate": 0.0001897974817027588,
-      "loss": 4.2619,
       "step": 44
     },
     {
-      "epoch": 0.16901408450704225,
-      "grad_norm": 0.515292227268219,
-      "learning_rate": 0.00018926338701612738,
-      "loss": 3.9171,
       "step": 45
     },
     {
-      "epoch": 0.1727699530516432,
-      "grad_norm": 0.5622981190681458,
-      "learning_rate": 0.00018871645823239128,
-      "loss": 4.3514,
       "step": 46
     },
     {
-      "epoch": 0.17652582159624414,
-      "grad_norm": 0.5402107238769531,
-      "learning_rate": 0.00018815677398779048,
-      "loss": 4.1302,
       "step": 47
     },
     {
-      "epoch": 0.18028169014084508,
-      "grad_norm": 0.4933449625968933,
-      "learning_rate": 0.00018758441475251754,
-      "loss": 3.7445,
       "step": 48
     },
     {
-      "epoch": 0.18403755868544602,
-      "grad_norm": 0.5452999472618103,
-      "learning_rate": 0.0001869994628191478,
-      "loss": 4.3435,
       "step": 49
     },
     {
-      "epoch": 0.18779342723004694,
-      "grad_norm": 0.4758988916873932,
-      "learning_rate": 0.00018640200229080763,
-      "loss": 3.8278,
       "step": 50
     },
     {
-      "epoch": 0.18779342723004694,
-      "eval_loss": 0.5256218314170837,
-      "eval_runtime": 368.5627,
-      "eval_samples_per_second": 2.569,
-      "eval_steps_per_second": 0.643,
       "step": 50
-    },
-    {
-      "epoch": 0.19154929577464788,
-      "grad_norm": 0.5213260650634766,
-      "learning_rate": 0.00018579211906908215,
-      "loss": 3.6026,
-      "step": 51
-    },
-    {
-      "epoch": 0.19530516431924883,
-      "grad_norm": 0.4835609495639801,
-      "learning_rate": 0.00018516990084166442,
-      "loss": 4.178,
-      "step": 52
-    },
-    {
-      "epoch": 0.19906103286384977,
-      "grad_norm": 0.6534969210624695,
-      "learning_rate": 0.0001845354370697482,
-      "loss": 4.6021,
-      "step": 53
-    },
-    {
-      "epoch": 0.2028169014084507,
-      "grad_norm": 0.5575639605522156,
-      "learning_rate": 0.000183888818975165,
-      "loss": 4.2967,
-      "step": 54
-    },
-    {
-      "epoch": 0.20657276995305165,
-      "grad_norm": 0.5025990009307861,
-      "learning_rate": 0.00018323013952726875,
-      "loss": 3.9018,
-      "step": 55
-    },
-    {
-      "epoch": 0.21032863849765257,
-      "grad_norm": 0.5214390754699707,
-      "learning_rate": 0.00018255949342956863,
-      "loss": 4.2738,
-      "step": 56
-    },
-    {
-      "epoch": 0.2140845070422535,
-      "grad_norm": 0.46630731225013733,
-      "learning_rate": 0.00018187697710611298,
-      "loss": 3.6578,
-      "step": 57
-    },
-    {
-      "epoch": 0.21784037558685446,
-      "grad_norm": 0.47853055596351624,
-      "learning_rate": 0.00018118268868762546,
-      "loss": 3.8583,
-      "step": 58
-    },
-    {
-      "epoch": 0.2215962441314554,
-      "grad_norm": 0.6015180349349976,
-      "learning_rate": 0.00018047672799739628,
-      "loss": 4.4667,
-      "step": 59
-    },
-    {
-      "epoch": 0.22535211267605634,
-      "grad_norm": 0.6465438604354858,
-      "learning_rate": 0.0001797591965369296,
-      "loss": 4.6973,
-      "step": 60
-    },
-    {
-      "epoch": 0.2291079812206573,
-      "grad_norm": 0.6278983950614929,
-      "learning_rate": 0.00017903019747134998,
-      "loss": 4.3331,
-      "step": 61
-    },
-    {
-      "epoch": 0.23286384976525823,
-      "grad_norm": 0.5308983325958252,
-      "learning_rate": 0.00017828983561456941,
-      "loss": 4.1813,
-      "step": 62
-    },
-    {
-      "epoch": 0.23661971830985915,
-      "grad_norm": 0.5170332789421082,
-      "learning_rate": 0.00017753821741421769,
-      "loss": 3.9088,
-      "step": 63
-    },
-    {
-      "epoch": 0.2403755868544601,
-      "grad_norm": 0.560627818107605,
-      "learning_rate": 0.00017677545093633713,
-      "loss": 4.1723,
-      "step": 64
-    },
-    {
-      "epoch": 0.24413145539906103,
-      "grad_norm": 0.6482558846473694,
-      "learning_rate": 0.00017600164584984546,
-      "loss": 4.4782,
-      "step": 65
-    },
-    {
-      "epoch": 0.24788732394366197,
-      "grad_norm": 0.55652916431427,
-      "learning_rate": 0.00017521691341076774,
-      "loss": 4.0827,
-      "step": 66
-    },
-    {
-      "epoch": 0.2516431924882629,
-      "grad_norm": 0.5317822098731995,
-      "learning_rate": 0.00017442136644624015,
-      "loss": 3.8659,
-      "step": 67
-    },
-    {
-      "epoch": 0.25539906103286386,
-      "grad_norm": 0.5009008049964905,
-      "learning_rate": 0.00017361511933828801,
-      "loss": 3.9116,
-      "step": 68
-    },
-    {
-      "epoch": 0.2591549295774648,
-      "grad_norm": 0.4998956620693207,
-      "learning_rate": 0.00017279828800738017,
-      "loss": 3.6624,
-      "step": 69
-    },
-    {
-      "epoch": 0.26291079812206575,
-      "grad_norm": 0.6255429983139038,
-      "learning_rate": 0.00017197098989576222,
-      "loss": 4.8328,
-      "step": 70
-    },
-    {
-      "epoch": 0.26666666666666666,
-      "grad_norm": 0.4984213411808014,
-      "learning_rate": 0.00017113334395057087,
-      "loss": 3.8159,
-      "step": 71
-    },
-    {
-      "epoch": 0.2704225352112676,
-      "grad_norm": 0.6359574794769287,
-      "learning_rate": 0.000170285470606732,
-      "loss": 4.6995,
-      "step": 72
-    },
-    {
-      "epoch": 0.27417840375586855,
-      "grad_norm": 0.4945514500141144,
-      "learning_rate": 0.0001694274917696448,
-      "loss": 3.6875,
-      "step": 73
-    },
-    {
-      "epoch": 0.27793427230046946,
-      "grad_norm": 0.5279414057731628,
-      "learning_rate": 0.00016855953079765448,
-      "loss": 3.8174,
-      "step": 74
-    },
-    {
-      "epoch": 0.28169014084507044,
-      "grad_norm": 0.6026564240455627,
-      "learning_rate": 0.00016768171248431602,
-      "loss": 4.3851,
-      "step": 75
-    },
-    {
-      "epoch": 0.28544600938967135,
-      "grad_norm": 0.5192970633506775,
-      "learning_rate": 0.0001667941630404517,
-      "loss": 3.7024,
-      "step": 76
-    },
-    {
-      "epoch": 0.2892018779342723,
-      "grad_norm": 0.48926350474357605,
-      "learning_rate": 0.00016589701007600476,
-      "loss": 4.0638,
-      "step": 77
-    },
-    {
-      "epoch": 0.29295774647887324,
-      "grad_norm": 0.6420406699180603,
-      "learning_rate": 0.0001649903825816918,
-      "loss": 4.751,
-      "step": 78
-    },
-    {
-      "epoch": 0.29671361502347415,
-      "grad_norm": 0.5534292459487915,
-      "learning_rate": 0.00016407441091045706,
-      "loss": 4.0464,
-      "step": 79
-    },
-    {
-      "epoch": 0.3004694835680751,
-      "grad_norm": 0.5345770120620728,
-      "learning_rate": 0.0001631492267587301,
-      "loss": 4.0831,
-      "step": 80
-    },
-    {
-      "epoch": 0.30422535211267604,
-      "grad_norm": 0.4912847876548767,
-      "learning_rate": 0.0001622149631474913,
-      "loss": 3.537,
-      "step": 81
-    },
-    {
-      "epoch": 0.307981220657277,
-      "grad_norm": 0.5144412517547607,
-      "learning_rate": 0.00016127175440314596,
-      "loss": 4.0086,
-      "step": 82
-    },
-    {
-      "epoch": 0.3117370892018779,
-      "grad_norm": 0.4543435573577881,
-      "learning_rate": 0.0001603197361382114,
-      "loss": 3.5774,
-      "step": 83
-    },
-    {
-      "epoch": 0.3154929577464789,
-      "grad_norm": 0.5636236667633057,
-      "learning_rate": 0.0001593590452318187,
-      "loss": 4.0208,
-      "step": 84
-    },
-    {
-      "epoch": 0.3192488262910798,
-      "grad_norm": 0.47736960649490356,
-      "learning_rate": 0.00015838981981003273,
-      "loss": 3.7864,
-      "step": 85
-    },
-    {
-      "epoch": 0.32300469483568073,
-      "grad_norm": 0.5701449513435364,
-      "learning_rate": 0.00015741219922599253,
-      "loss": 4.2106,
-      "step": 86
-    },
-    {
-      "epoch": 0.3267605633802817,
-      "grad_norm": 0.4872874617576599,
-      "learning_rate": 0.00015642632403987535,
-      "loss": 3.5756,
-      "step": 87
-    },
-    {
-      "epoch": 0.3305164319248826,
-      "grad_norm": 0.49332892894744873,
-      "learning_rate": 0.00015543233599868742,
-      "loss": 4.0097,
-      "step": 88
-    },
-    {
-      "epoch": 0.3342723004694836,
-      "grad_norm": 0.5447995662689209,
-      "learning_rate": 0.0001544303780158837,
-      "loss": 4.0947,
-      "step": 89
-    },
-    {
-      "epoch": 0.3380281690140845,
-      "grad_norm": 0.46900174021720886,
-      "learning_rate": 0.0001534205941508202,
-      "loss": 3.8602,
-      "step": 90
-    },
-    {
-      "epoch": 0.34178403755868547,
-      "grad_norm": 0.5882900953292847,
-      "learning_rate": 0.00015240312958804132,
-      "loss": 4.2984,
-      "step": 91
-    },
-    {
-      "epoch": 0.3455399061032864,
-      "grad_norm": 0.5628681182861328,
-      "learning_rate": 0.00015137813061640563,
-      "loss": 4.0235,
-      "step": 92
-    },
-    {
-      "epoch": 0.3492957746478873,
-      "grad_norm": 0.5416899919509888,
-      "learning_rate": 0.00015034574460805279,
-      "loss": 4.003,
-      "step": 93
-    },
-    {
-      "epoch": 0.3530516431924883,
-      "grad_norm": 0.5500250458717346,
-      "learning_rate": 0.00014930611999721457,
-      "loss": 3.8399,
-      "step": 94
-    },
-    {
-      "epoch": 0.3568075117370892,
-      "grad_norm": 0.5311074256896973,
-      "learning_rate": 0.00014825940625887342,
-      "loss": 3.7204,
-      "step": 95
-    },
-    {
-      "epoch": 0.36056338028169016,
-      "grad_norm": 0.5872887969017029,
-      "learning_rate": 0.00014720575388727132,
-      "loss": 4.2717,
-      "step": 96
-    },
-    {
-      "epoch": 0.3643192488262911,
-      "grad_norm": 0.5307738184928894,
-      "learning_rate": 0.0001461453143742718,
-      "loss": 4.0727,
-      "step": 97
-    },
-    {
-      "epoch": 0.36807511737089205,
-      "grad_norm": 0.5269085168838501,
-      "learning_rate": 0.00014507824018757906,
-      "loss": 3.703,
-      "step": 98
-    },
-    {
-      "epoch": 0.37183098591549296,
-      "grad_norm": 0.5249464511871338,
-      "learning_rate": 0.0001440046847488163,
-      "loss": 4.1674,
-      "step": 99
-    },
-    {
-      "epoch": 0.3755868544600939,
-      "grad_norm": 0.5709621906280518,
-      "learning_rate": 0.00014292480241146716,
-      "loss": 4.2529,
-      "step": 100
-    },
-    {
-      "epoch": 0.3755868544600939,
-      "eval_loss": 0.5099202990531921,
-      "eval_runtime": 361.6043,
-      "eval_samples_per_second": 2.619,
-      "eval_steps_per_second": 0.655,
-      "step": 100
-    },
-    {
-      "epoch": 0.37934272300469485,
-      "grad_norm": 0.5334945321083069,
-      "learning_rate": 0.00014183874843868313,
-      "loss": 4.042,
-      "step": 101
-    },
-    {
-      "epoch": 0.38309859154929576,
-      "grad_norm": 0.5117276310920715,
-      "learning_rate": 0.0001407466789809601,
-      "loss": 3.7999,
-      "step": 102
-    },
-    {
-      "epoch": 0.38685446009389673,
-      "grad_norm": 0.527893602848053,
-      "learning_rate": 0.0001396487510536874,
-      "loss": 3.8513,
-      "step": 103
-    },
-    {
-      "epoch": 0.39061032863849765,
-      "grad_norm": 0.49835067987442017,
-      "learning_rate": 0.00013854512251457247,
-      "loss": 3.8276,
-      "step": 104
-    },
-    {
-      "epoch": 0.39436619718309857,
-      "grad_norm": 0.6509612202644348,
-      "learning_rate": 0.0001374359520409444,
-      "loss": 4.2759,
-      "step": 105
-    },
-    {
-      "epoch": 0.39812206572769954,
-      "grad_norm": 0.4912853538990021,
-      "learning_rate": 0.0001363213991069397,
-      "loss": 3.5757,
-      "step": 106
-    },
-    {
-      "epoch": 0.40187793427230045,
-      "grad_norm": 0.5202915668487549,
-      "learning_rate": 0.00013520162396057342,
-      "loss": 4.0784,
-      "step": 107
-    },
-    {
-      "epoch": 0.4056338028169014,
-      "grad_norm": 0.5149546265602112,
-      "learning_rate": 0.00013407678760069891,
-      "loss": 3.7496,
-      "step": 108
-    },
-    {
-      "epoch": 0.40938967136150234,
-      "grad_norm": 0.5628048777580261,
-      "learning_rate": 0.00013294705175386003,
-      "loss": 4.3535,
-      "step": 109
-    },
-    {
-      "epoch": 0.4131455399061033,
-      "grad_norm": 0.5484233498573303,
-      "learning_rate": 0.00013181257885103818,
-      "loss": 3.9337,
-      "step": 110
-    },
-    {
-      "epoch": 0.4169014084507042,
-      "grad_norm": 0.5825195908546448,
-      "learning_rate": 0.00013067353200429857,
-      "loss": 4.0801,
-      "step": 111
-    },
-    {
-      "epoch": 0.42065727699530514,
-      "grad_norm": 0.5359321236610413,
-      "learning_rate": 0.00012953007498333808,
-      "loss": 4.1705,
-      "step": 112
-    },
-    {
-      "epoch": 0.4244131455399061,
-      "grad_norm": 0.6541722416877747,
-      "learning_rate": 0.00012838237219193896,
-      "loss": 4.6486,
-      "step": 113
-    },
-    {
-      "epoch": 0.428169014084507,
-      "grad_norm": 0.6243426203727722,
-      "learning_rate": 0.00012723058864433118,
-      "loss": 4.1711,
-      "step": 114
-    },
-    {
-      "epoch": 0.431924882629108,
-      "grad_norm": 0.585813581943512,
-      "learning_rate": 0.00012607488994146704,
-      "loss": 4.0612,
-      "step": 115
-    },
-    {
-      "epoch": 0.4356807511737089,
-      "grad_norm": 0.5607698559761047,
-      "learning_rate": 0.00012491544224721136,
-      "loss": 4.0229,
-      "step": 116
-    },
-    {
-      "epoch": 0.4394366197183099,
-      "grad_norm": 0.519707441329956,
-      "learning_rate": 0.00012375241226445088,
-      "loss": 4.0728,
-      "step": 117
-    },
-    {
-      "epoch": 0.4431924882629108,
-      "grad_norm": 0.44573110342025757,
-      "learning_rate": 0.00012258596721112608,
-      "loss": 3.5927,
-      "step": 118
-    },
-    {
-      "epoch": 0.4469483568075117,
-      "grad_norm": 0.527217447757721,
-      "learning_rate": 0.00012141627479618885,
-      "loss": 4.0032,
-      "step": 119
-    },
-    {
-      "epoch": 0.4507042253521127,
-      "grad_norm": 0.5384316444396973,
-      "learning_rate": 0.00012024350319548976,
-      "loss": 3.8763,
-      "step": 120
-    },
-    {
-      "epoch": 0.4544600938967136,
-      "grad_norm": 0.5317234992980957,
-      "learning_rate": 0.00011906782102759808,
-      "loss": 4.1505,
-      "step": 121
-    },
-    {
-      "epoch": 0.4582159624413146,
-      "grad_norm": 0.6159288287162781,
-      "learning_rate": 0.0001178893973295581,
-      "loss": 4.286,
-      "step": 122
-    },
-    {
-      "epoch": 0.4619718309859155,
-      "grad_norm": 0.5571346282958984,
-      "learning_rate": 0.00011670840153258547,
-      "loss": 3.8812,
-      "step": 123
-    },
-    {
-      "epoch": 0.46572769953051646,
-      "grad_norm": 0.5357509851455688,
-      "learning_rate": 0.00011552500343770658,
-      "loss": 3.9433,
-      "step": 124
-    },
-    {
-      "epoch": 0.4694835680751174,
-      "grad_norm": 0.5376076698303223,
-      "learning_rate": 0.00011433937319134511,
-      "loss": 3.6673,
-      "step": 125
-    },
-    {
-      "epoch": 0.4732394366197183,
-      "grad_norm": 0.5321595072746277,
-      "learning_rate": 0.00011315168126085857,
-      "loss": 3.6064,
-      "step": 126
-    },
-    {
-      "epoch": 0.47699530516431926,
-      "grad_norm": 0.5370662808418274,
-      "learning_rate": 0.00011196209841002909,
-      "loss": 3.7798,
-      "step": 127
-    },
-    {
-      "epoch": 0.4807511737089202,
-      "grad_norm": 0.47796905040740967,
-      "learning_rate": 0.00011077079567451111,
-      "loss": 3.4657,
-      "step": 128
-    },
-    {
-      "epoch": 0.48450704225352115,
-      "grad_norm": 0.5204142332077026,
-      "learning_rate": 0.00010957794433724051,
-      "loss": 3.6982,
-      "step": 129
-    },
-    {
-      "epoch": 0.48826291079812206,
-      "grad_norm": 0.5429206490516663,
-      "learning_rate": 0.00010838371590380765,
-      "loss": 3.9558,
-      "step": 130
-    },
-    {
-      "epoch": 0.492018779342723,
-      "grad_norm": 0.5927493572235107,
-      "learning_rate": 0.00010718828207779894,
-      "loss": 4.3142,
-      "step": 131
-    },
-    {
-      "epoch": 0.49577464788732395,
-      "grad_norm": 0.5089852213859558,
-      "learning_rate": 0.0001059918147361094,
-      "loss": 4.186,
-      "step": 132
-    },
-    {
-      "epoch": 0.49953051643192486,
-      "grad_norm": 0.5470064282417297,
-      "learning_rate": 0.00010479448590423082,
-      "loss": 3.5952,
-      "step": 133
-    },
-    {
-      "epoch": 0.5032863849765258,
-      "grad_norm": 0.5791990160942078,
-      "learning_rate": 0.00010359646773151814,
-      "loss": 3.9606,
-      "step": 134
-    },
-    {
-      "epoch": 0.5070422535211268,
-      "grad_norm": 0.6007618308067322,
-      "learning_rate": 0.00010239793246643819,
-      "loss": 4.0543,
-      "step": 135
-    },
-    {
-      "epoch": 0.5107981220657277,
-      "grad_norm": 0.5693303942680359,
-      "learning_rate": 0.00010119905243180432,
-      "loss": 4.4878,
-      "step": 136
-    },
-    {
-      "epoch": 0.5145539906103287,
-      "grad_norm": 0.5487916469573975,
-      "learning_rate": 0.0001,
-      "loss": 3.8789,
-      "step": 137
-    },
-    {
-      "epoch": 0.5183098591549296,
-      "grad_norm": 0.5099664330482483,
-      "learning_rate": 9.880094756819572e-05,
-      "loss": 3.9367,
-      "step": 138
-    },
-    {
-      "epoch": 0.5220657276995305,
-      "grad_norm": 0.5488762259483337,
-      "learning_rate": 9.760206753356184e-05,
-      "loss": 3.8747,
-      "step": 139
-    },
-    {
-      "epoch": 0.5258215962441315,
-      "grad_norm": 0.6004945039749146,
-      "learning_rate": 9.64035322684819e-05,
-      "loss": 4.6112,
-      "step": 140
-    },
-    {
-      "epoch": 0.5295774647887324,
-      "grad_norm": 0.4959471821784973,
-      "learning_rate": 9.520551409576919e-05,
-      "loss": 3.5334,
-      "step": 141
-    },
-    {
-      "epoch": 0.5333333333333333,
-      "grad_norm": 0.5801951885223389,
-      "learning_rate": 9.400818526389063e-05,
-      "loss": 4.0366,
-      "step": 142
-    },
-    {
-      "epoch": 0.5370892018779343,
-      "grad_norm": 0.6023839116096497,
-      "learning_rate": 9.281171792220107e-05,
-      "loss": 4.0858,
-      "step": 143
-    },
-    {
-      "epoch": 0.5408450704225352,
-      "grad_norm": 0.5538918972015381,
-      "learning_rate": 9.161628409619236e-05,
-      "loss": 3.8907,
-      "step": 144
-    },
-    {
-      "epoch": 0.5446009389671361,
-      "grad_norm": 0.5293076038360596,
-      "learning_rate": 9.042205566275951e-05,
-      "loss": 3.6444,
-      "step": 145
-    },
-    {
-      "epoch": 0.5483568075117371,
-      "grad_norm": 0.637414276599884,
-      "learning_rate": 8.92292043254889e-05,
-      "loss": 4.5763,
-      "step": 146
-    },
-    {
-      "epoch": 0.5521126760563381,
-      "grad_norm": 0.5648563504219055,
-      "learning_rate": 8.803790158997095e-05,
-      "loss": 4.3616,
-      "step": 147
-    },
-    {
-      "epoch": 0.5558685446009389,
-      "grad_norm": 0.5116698741912842,
-      "learning_rate": 8.684831873914145e-05,
-      "loss": 3.7008,
-      "step": 148
-    },
-    {
-      "epoch": 0.5596244131455399,
-      "grad_norm": 0.6268083453178406,
-      "learning_rate": 8.566062680865494e-05,
-      "loss": 4.224,
-      "step": 149
-    },
-    {
-      "epoch": 0.5633802816901409,
-      "grad_norm": 0.47161865234375,
-      "learning_rate": 8.447499656229344e-05,
-      "loss": 3.5188,
-      "step": 150
-    },
-    {
-      "epoch": 0.5633802816901409,
-      "eval_loss": 0.5007660388946533,
-      "eval_runtime": 373.0551,
-      "eval_samples_per_second": 2.538,
-      "eval_steps_per_second": 0.635,
-      "step": 150
-    },
-    {
-      "epoch": 0.5671361502347417,
-      "grad_norm": 0.5017052292823792,
-      "learning_rate": 8.329159846741457e-05,
-      "loss": 3.7126,
-      "step": 151
-    },
-    {
-      "epoch": 0.5708920187793427,
-      "grad_norm": 0.6328552961349487,
-      "learning_rate": 8.211060267044191e-05,
-      "loss": 4.3953,
-      "step": 152
-    },
-    {
-      "epoch": 0.5746478873239437,
-      "grad_norm": 0.6981328725814819,
-      "learning_rate": 8.093217897240195e-05,
-      "loss": 4.7259,
-      "step": 153
-    },
-    {
-      "epoch": 0.5784037558685446,
-      "grad_norm": 0.5479520559310913,
-      "learning_rate": 7.975649680451024e-05,
-      "loss": 3.9748,
-      "step": 154
-    },
-    {
-      "epoch": 0.5821596244131455,
-      "grad_norm": 0.5974417328834534,
-      "learning_rate": 7.858372520381119e-05,
-      "loss": 4.0999,
-      "step": 155
-    },
-    {
-      "epoch": 0.5859154929577465,
-      "grad_norm": 0.5800752639770508,
-      "learning_rate": 7.741403278887397e-05,
-      "loss": 4.3825,
-      "step": 156
-    },
-    {
-      "epoch": 0.5896713615023474,
-      "grad_norm": 0.5555791258811951,
-      "learning_rate": 7.624758773554914e-05,
-      "loss": 4.0412,
-      "step": 157
-    },
-    {
-      "epoch": 0.5934272300469483,
-      "grad_norm": 0.5661817789077759,
-      "learning_rate": 7.508455775278867e-05,
-      "loss": 3.8501,
-      "step": 158
-    },
-    {
-      "epoch": 0.5971830985915493,
-      "grad_norm": 0.5640938878059387,
-      "learning_rate": 7.392511005853297e-05,
-      "loss": 4.0421,
-      "step": 159
-    },
-    {
-      "epoch": 0.6009389671361502,
-      "grad_norm": 0.5214723944664001,
-      "learning_rate": 7.276941135566884e-05,
-      "loss": 3.6713,
-      "step": 160
-    },
-    {
-      "epoch": 0.6046948356807512,
-      "grad_norm": 0.6109952926635742,
-      "learning_rate": 7.161762780806103e-05,
-      "loss": 4.1644,
-      "step": 161
-    },
-    {
-      "epoch": 0.6084507042253521,
-      "grad_norm": 0.4427598714828491,
-      "learning_rate": 7.046992501666195e-05,
-      "loss": 3.3083,
-      "step": 162
-    },
-    {
-      "epoch": 0.612206572769953,
-      "grad_norm": 0.593859076499939,
-      "learning_rate": 6.932646799570144e-05,
-      "loss": 4.0755,
-      "step": 163
-    },
-    {
-      "epoch": 0.615962441314554,
-      "grad_norm": 0.5258002281188965,
-      "learning_rate": 6.818742114896184e-05,
-      "loss": 3.9657,
-      "step": 164
-    },
-    {
-      "epoch": 0.6197183098591549,
-      "grad_norm": 0.6475521922111511,
-      "learning_rate": 6.705294824614004e-05,
-      "loss": 4.8808,
-      "step": 165
-    },
-    {
-      "epoch": 0.6234741784037559,
-      "grad_norm": 0.45996585488319397,
-      "learning_rate": 6.592321239930112e-05,
-      "loss": 3.2131,
-      "step": 166
-    },
-    {
-      "epoch": 0.6272300469483568,
-      "grad_norm": 0.505383312702179,
-      "learning_rate": 6.479837603942665e-05,
-      "loss": 3.8211,
-      "step": 167
-    },
-    {
-      "epoch": 0.6309859154929578,
-      "grad_norm": 0.5987924933433533,
-      "learning_rate": 6.367860089306028e-05,
-      "loss": 4.073,
-      "step": 168
-    },
-    {
-      "epoch": 0.6347417840375587,
-      "grad_norm": 0.5297787189483643,
-      "learning_rate": 6.256404795905561e-05,
-      "loss": 3.8255,
-      "step": 169
-    },
-    {
-      "epoch": 0.6384976525821596,
-      "grad_norm": 0.5636645555496216,
-      "learning_rate": 6.145487748542753e-05,
-      "loss": 4.4,
-      "step": 170
-    },
-    {
-      "epoch": 0.6422535211267606,
-      "grad_norm": 0.584668755531311,
-      "learning_rate": 6.035124894631263e-05,
-      "loss": 3.9181,
-      "step": 171
-    },
-    {
-      "epoch": 0.6460093896713615,
-      "grad_norm": 0.6228652000427246,
-      "learning_rate": 5.925332101903994e-05,
-      "loss": 4.1693,
-      "step": 172
-    },
-    {
-      "epoch": 0.6497652582159624,
-      "grad_norm": 0.6707691550254822,
-      "learning_rate": 5.816125156131691e-05,
-      "loss": 4.8515,
-      "step": 173
-    },
-    {
-      "epoch": 0.6535211267605634,
-      "grad_norm": 0.5788469314575195,
-      "learning_rate": 5.707519758853288e-05,
-      "loss": 3.9823,
-      "step": 174
-    },
-    {
-      "epoch": 0.6572769953051644,
-      "grad_norm": 0.5524411201477051,
-      "learning_rate": 5.5995315251183734e-05,
-      "loss": 3.8754,
-      "step": 175
-    },
-    {
-      "epoch": 0.6610328638497652,
-      "grad_norm": 0.5607343912124634,
-      "learning_rate": 5.492175981242097e-05,
-      "loss": 4.0384,
-      "step": 176
-    },
-    {
-      "epoch": 0.6647887323943662,
-      "grad_norm": 0.5071132183074951,
-      "learning_rate": 5.385468562572823e-05,
-      "loss": 3.4805,
-      "step": 177
-    },
-    {
-      "epoch": 0.6685446009389672,
-      "grad_norm": 0.5974758267402649,
-      "learning_rate": 5.279424611272873e-05,
-      "loss": 4.2209,
-      "step": 178
-    },
-    {
-      "epoch": 0.672300469483568,
-      "grad_norm": 0.48757869005203247,
-      "learning_rate": 5.174059374112657e-05,
-      "loss": 3.6575,
-      "step": 179
-    },
-    {
-      "epoch": 0.676056338028169,
-      "grad_norm": 0.6194841861724854,
-      "learning_rate": 5.0693880002785456e-05,
-      "loss": 4.5966,
-      "step": 180
-    },
-    {
-      "epoch": 0.67981220657277,
-      "grad_norm": 0.5687793493270874,
-      "learning_rate": 4.965425539194726e-05,
-      "loss": 3.8884,
-      "step": 181
-    },
-    {
-      "epoch": 0.6835680751173709,
-      "grad_norm": 0.5817456245422363,
-      "learning_rate": 4.8621869383594406e-05,
-      "loss": 4.2781,
-      "step": 182
-    },
-    {
-      "epoch": 0.6873239436619718,
-      "grad_norm": 0.4698617458343506,
-      "learning_rate": 4.759687041195874e-05,
-      "loss": 3.5443,
-      "step": 183
-    },
-    {
-      "epoch": 0.6910798122065728,
-      "grad_norm": 0.5563104748725891,
-      "learning_rate": 4.657940584917983e-05,
-      "loss": 3.8302,
-      "step": 184
-    },
-    {
-      "epoch": 0.6948356807511737,
-      "grad_norm": 0.5313715934753418,
-      "learning_rate": 4.556962198411631e-05,
-      "loss": 3.7517,
-      "step": 185
-    },
-    {
-      "epoch": 0.6985915492957746,
-      "grad_norm": 0.47465792298316956,
-      "learning_rate": 4.45676640013126e-05,
-      "loss": 3.4732,
-      "step": 186
-    },
-    {
-      "epoch": 0.7023474178403756,
-      "grad_norm": 0.49628129601478577,
-      "learning_rate": 4.3573675960124684e-05,
-      "loss": 3.7668,
-      "step": 187
-    },
-    {
-      "epoch": 0.7061032863849765,
-      "grad_norm": 0.5268252491950989,
-      "learning_rate": 4.258780077400748e-05,
-      "loss": 3.5747,
-      "step": 188
-    },
-    {
-      "epoch": 0.7098591549295775,
-      "grad_norm": 0.6187554001808167,
-      "learning_rate": 4.161018018996727e-05,
-      "loss": 4.3695,
-      "step": 189
-    },
-    {
-      "epoch": 0.7136150234741784,
-      "grad_norm": 0.5253807902336121,
-      "learning_rate": 4.064095476818133e-05,
-      "loss": 3.8376,
-      "step": 190
-    },
-    {
-      "epoch": 0.7173708920187793,
-      "grad_norm": 0.5611537098884583,
-      "learning_rate": 3.968026386178867e-05,
-      "loss": 3.8718,
-      "step": 191
-    },
-    {
-      "epoch": 0.7211267605633803,
-      "grad_norm": 0.6236064434051514,
-      "learning_rate": 3.87282455968541e-05,
-      "loss": 4.4724,
-      "step": 192
-    },
-    {
-      "epoch": 0.7248826291079812,
-      "grad_norm": 0.4799625277519226,
-      "learning_rate": 3.778503685250873e-05,
-      "loss": 3.6452,
-      "step": 193
-    },
-    {
-      "epoch": 0.7286384976525822,
-      "grad_norm": 0.5699834227561951,
-      "learning_rate": 3.685077324126992e-05,
-      "loss": 3.9373,
-      "step": 194
-    },
-    {
-      "epoch": 0.7323943661971831,
-      "grad_norm": 0.49022650718688965,
-      "learning_rate": 3.592558908954295e-05,
-      "loss": 3.3991,
-      "step": 195
-    },
-    {
-      "epoch": 0.7361502347417841,
-      "grad_norm": 0.5775969624519348,
-      "learning_rate": 3.500961741830821e-05,
-      "loss": 4.2728,
-      "step": 196
-    },
-    {
-      "epoch": 0.739906103286385,
-      "grad_norm": 0.5632807612419128,
-      "learning_rate": 3.410298992399524e-05,
-      "loss": 4.1647,
-      "step": 197
-    },
-    {
-      "epoch": 0.7436619718309859,
-      "grad_norm": 0.4752277135848999,
-      "learning_rate": 3.3205836959548296e-05,
-      "loss": 3.3707,
-      "step": 198
-    },
-    {
-      "epoch": 0.7474178403755869,
-      "grad_norm": 0.5167598724365234,
-      "learning_rate": 3.231828751568401e-05,
-      "loss": 3.6365,
-      "step": 199
-    },
-    {
-      "epoch": 0.7511737089201878,
-      "grad_norm": 0.5540789365768433,
-      "learning_rate": 3.144046920234553e-05,
-      "loss": 3.8104,
-      "step": 200
-    },
-    {
-      "epoch": 0.7511737089201878,
-      "eval_loss": 0.4954932928085327,
-      "eval_runtime": 367.7455,
-      "eval_samples_per_second": 2.575,
-      "eval_steps_per_second": 0.644,
-      "step": 200
-    },
-    {
-      "epoch": 0.7549295774647887,
-      "grad_norm": 0.4975653886795044,
-      "learning_rate": 3.0572508230355246e-05,
-      "loss": 3.763,
-      "step": 201
-    },
-    {
-      "epoch": 0.7586854460093897,
-      "grad_norm": 0.5943359136581421,
-      "learning_rate": 2.971452939326802e-05,
-      "loss": 4.1011,
-      "step": 202
-    },
-    {
-      "epoch": 0.7624413145539906,
-      "grad_norm": 0.5947958827018738,
-      "learning_rate": 2.8866656049429162e-05,
-      "loss": 3.837,
-      "step": 203
-    },
-    {
-      "epoch": 0.7661971830985915,
-      "grad_norm": 0.55486661195755,
-      "learning_rate": 2.8029010104237785e-05,
-      "loss": 3.773,
-      "step": 204
-    },
-    {
-      "epoch": 0.7699530516431925,
-      "grad_norm": 0.6001894474029541,
-      "learning_rate": 2.720171199261987e-05,
-      "loss": 4.1092,
-      "step": 205
-    },
-    {
-      "epoch": 0.7737089201877935,
-      "grad_norm": 0.611171305179596,
-      "learning_rate": 2.638488066171201e-05,
-      "loss": 4.2872,
-      "step": 206
-    },
-    {
-      "epoch": 0.7774647887323943,
-      "grad_norm": 0.5929466485977173,
-      "learning_rate": 2.5578633553759878e-05,
-      "loss": 4.0139,
-      "step": 207
-    },
-    {
-      "epoch": 0.7812206572769953,
-      "grad_norm": 0.5859886407852173,
-      "learning_rate": 2.4783086589232295e-05,
-      "loss": 3.9495,
-      "step": 208
-    },
-    {
-      "epoch": 0.7849765258215963,
-      "grad_norm": 0.5463722348213196,
-      "learning_rate": 2.3998354150154555e-05,
-      "loss": 3.7008,
-      "step": 209
-    },
-    {
-      "epoch": 0.7887323943661971,
-      "grad_norm": 0.5370416045188904,
-      "learning_rate": 2.3224549063662927e-05,
-      "loss": 3.9123,
-      "step": 210
-    },
-    {
-      "epoch": 0.7924882629107981,
-      "grad_norm": 0.5654124021530151,
-      "learning_rate": 2.246178258578234e-05,
-      "loss": 3.816,
-      "step": 211
-    },
-    {
-      "epoch": 0.7962441314553991,
-      "grad_norm": 0.5404929518699646,
-      "learning_rate": 2.171016438543059e-05,
-      "loss": 3.943,
-      "step": 212
-    },
-    {
-      "epoch": 0.8,
-      "grad_norm": 0.5264220237731934,
-      "learning_rate": 2.096980252865005e-05,
-      "loss": 3.8148,
-      "step": 213
-    },
-    {
-      "epoch": 0.8037558685446009,
-      "grad_norm": 0.5364089012145996,
-      "learning_rate": 2.0240803463070425e-05,
-      "loss": 4.0956,
-      "step": 214
-    },
-    {
-      "epoch": 0.8075117370892019,
-      "grad_norm": 0.49832502007484436,
-      "learning_rate": 1.9523272002603742e-05,
-      "loss": 3.5919,
-      "step": 215
-    },
-    {
-      "epoch": 0.8112676056338028,
-      "grad_norm": 0.5661212205886841,
-      "learning_rate": 1.8817311312374564e-05,
-      "loss": 3.9309,
-      "step": 216
-    },
-    {
-      "epoch": 0.8150234741784037,
-      "grad_norm": 0.6174516677856445,
-      "learning_rate": 1.8123022893887065e-05,
-      "loss": 4.4702,
-      "step": 217
-    },
-    {
-      "epoch": 0.8187793427230047,
-      "grad_norm": 0.5399917364120483,
-      "learning_rate": 1.744050657043137e-05,
-      "loss": 3.8469,
-      "step": 218
-    },
-    {
-      "epoch": 0.8225352112676056,
-      "grad_norm": 0.48354753851890564,
-      "learning_rate": 1.6769860472731257e-05,
-      "loss": 3.5587,
-      "step": 219
-    },
-    {
-      "epoch": 0.8262910798122066,
-      "grad_norm": 0.5603431463241577,
-      "learning_rate": 1.6111181024835e-05,
-      "loss": 4.3805,
-      "step": 220
-    },
-    {
-      "epoch": 0.8300469483568075,
-      "grad_norm": 0.5792990326881409,
-      "learning_rate": 1.5464562930251814e-05,
-      "loss": 4.2204,
-      "step": 221
-    },
-    {
-      "epoch": 0.8338028169014085,
-      "grad_norm": 0.5376021862030029,
-      "learning_rate": 1.4830099158335563e-05,
-      "loss": 3.8365,
-      "step": 222
-    },
-    {
-      "epoch": 0.8375586854460094,
-      "grad_norm": 0.5793043971061707,
-      "learning_rate": 1.4207880930917871e-05,
-      "loss": 4.064,
-      "step": 223
-    },
-    {
-      "epoch": 0.8413145539906103,
-      "grad_norm": 0.5597378611564636,
-      "learning_rate": 1.3597997709192378e-05,
-      "loss": 3.8224,
-      "step": 224
-    },
-    {
-      "epoch": 0.8450704225352113,
-      "grad_norm": 0.5336353182792664,
-      "learning_rate": 1.3000537180852212e-05,
-      "loss": 3.7203,
-      "step": 225
-    },
-    {
-      "epoch": 0.8488262910798122,
-      "grad_norm": 0.640953004360199,
-      "learning_rate": 1.2415585247482498e-05,
-      "loss": 4.3212,
-      "step": 226
-    },
-    {
-      "epoch": 0.8525821596244132,
-      "grad_norm": 0.45982062816619873,
-      "learning_rate": 1.1843226012209529e-05,
-      "loss": 3.6229,
-      "step": 227
-    },
-    {
-      "epoch": 0.856338028169014,
-      "grad_norm": 0.5055301189422607,
-      "learning_rate": 1.128354176760873e-05,
-      "loss": 3.6906,
-      "step": 228
-    },
-    {
-      "epoch": 0.860093896713615,
-      "grad_norm": 0.4451459050178528,
-      "learning_rate": 1.073661298387265e-05,
-      "loss": 3.3596,
-      "step": 229
-    },
-    {
-      "epoch": 0.863849765258216,
-      "grad_norm": 0.6167091727256775,
-      "learning_rate": 1.0202518297241237e-05,
-      "loss": 4.6817,
-      "step": 230
-    },
-    {
-      "epoch": 0.8676056338028169,
-      "grad_norm": 0.5457577705383301,
-      "learning_rate": 9.681334498695648e-06,
-      "loss": 4.2546,
-      "step": 231
-    },
-    {
-      "epoch": 0.8713615023474178,
-      "grad_norm": 0.49405384063720703,
-      "learning_rate": 9.173136522917457e-06,
-      "loss": 3.7713,
-      "step": 232
-    },
-    {
-      "epoch": 0.8751173708920188,
-      "grad_norm": 0.5279140472412109,
-      "learning_rate": 8.677997437514629e-06,
-      "loss": 3.7468,
-      "step": 233
-    },
-    {
-      "epoch": 0.8788732394366198,
-      "grad_norm": 0.5161781311035156,
-      "learning_rate": 8.195988432516078e-06,
-      "loss": 4.2746,
-      "step": 234
-    },
-    {
-      "epoch": 0.8826291079812206,
-      "grad_norm": 0.5855900049209595,
-      "learning_rate": 7.727178810136093e-06,
-      "loss": 4.1113,
-      "step": 235
-    },
-    {
-      "epoch": 0.8863849765258216,
-      "grad_norm": 0.4686482548713684,
-      "learning_rate": 7.27163597481022e-06,
-      "loss": 3.3821,
-      "step": 236
-    },
-    {
-      "epoch": 0.8901408450704226,
-      "grad_norm": 0.5629131197929382,
-      "learning_rate": 6.829425423504021e-06,
-      "loss": 4.1901,
-      "step": 237
-    },
-    {
-      "epoch": 0.8938967136150234,
-      "grad_norm": 0.5782991647720337,
-      "learning_rate": 6.4006107362960195e-06,
-      "loss": 4.3302,
-      "step": 238
-    },
-    {
-      "epoch": 0.8976525821596244,
-      "grad_norm": 0.5707590579986572,
-      "learning_rate": 5.985253567236304e-06,
-      "loss": 3.9955,
-      "step": 239
-    },
-    {
-      "epoch": 0.9014084507042254,
-      "grad_norm": 0.4625610411167145,
-      "learning_rate": 5.583413635482082e-06,
-      "loss": 3.5662,
-      "step": 240
-    },
-    {
-      "epoch": 0.9051643192488263,
-      "grad_norm": 0.6621753573417664,
-      "learning_rate": 5.19514871671134e-06,
-      "loss": 4.5634,
-      "step": 241
-    },
-    {
-      "epoch": 0.9089201877934272,
-      "grad_norm": 0.4976242482662201,
-      "learning_rate": 4.82051463481602e-06,
-      "loss": 3.5897,
-      "step": 242
-    },
-    {
-      "epoch": 0.9126760563380282,
-      "grad_norm": 0.51161789894104,
-      "learning_rate": 4.45956525387573e-06,
-      "loss": 3.6594,
-      "step": 243
-    },
-    {
-      "epoch": 0.9164319248826291,
-      "grad_norm": 0.5785262584686279,
-      "learning_rate": 4.112352470413328e-06,
-      "loss": 4.031,
-      "step": 244
-    },
-    {
-      "epoch": 0.92018779342723,
-      "grad_norm": 0.5122177004814148,
-      "learning_rate": 3.778926205933342e-06,
-      "loss": 3.6733,
-      "step": 245
-    },
-    {
-      "epoch": 0.923943661971831,
-      "grad_norm": 0.5668466687202454,
-      "learning_rate": 3.459334399744374e-06,
-      "loss": 3.8761,
-      "step": 246
-    },
-    {
-      "epoch": 0.927699530516432,
-      "grad_norm": 0.5304160714149475,
-      "learning_rate": 3.1536230020664417e-06,
-      "loss": 3.3638,
-      "step": 247
-    },
-    {
-      "epoch": 0.9314553990610329,
-      "grad_norm": 0.5929594039916992,
-      "learning_rate": 2.861835967424409e-06,
-      "loss": 4.1158,
-      "step": 248
-    },
-    {
-      "epoch": 0.9352112676056338,
-      "grad_norm": 0.5661305785179138,
-      "learning_rate": 2.5840152483282752e-06,
-      "loss": 3.8846,
-      "step": 249
-    },
-    {
-      "epoch": 0.9389671361502347,
-      "grad_norm": 0.5555335879325867,
-      "learning_rate": 2.3202007892413447e-06,
-      "loss": 3.9409,
-      "step": 250
-    },
-    {
-      "epoch": 0.9389671361502347,
-      "eval_loss": 0.4938514232635498,
-      "eval_runtime": 365.2814,
-      "eval_samples_per_second": 2.593,
-      "eval_steps_per_second": 0.649,
-      "step": 250
     }
   ],
   "logging_steps": 1,
-  "max_steps": 267,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 50,
@@ -1817,8 +385,8 @@
       "attributes": {}
     }
   },
-  "total_flos": 3.688311494350195e+18,
-  "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null
 }

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.3454231433506045,
   "eval_steps": 50,
+  "global_step": 50,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.0069084628670120895,
+      "grad_norm": 0.25255295634269714,
       "learning_rate": 0.0,
+      "loss": 1.9819,
       "step": 1
     },
     {
+      "epoch": 0.013816925734024179,
+      "grad_norm": 0.26424452662467957,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": 2.0092,
       "step": 2
     },
     {
+      "epoch": 0.02072538860103627,
+      "grad_norm": 0.2828506827354431,
+      "learning_rate": 4.000000000000001e-06,
+      "loss": 2.0253,
       "step": 3
     },
     {
+      "epoch": 0.027633851468048358,
+      "grad_norm": 0.2540068030357361,
+      "learning_rate": 6e-06,
+      "loss": 1.9012,
       "step": 4
     },
     {
+      "epoch": 0.03454231433506045,
+      "grad_norm": 0.22753603756427765,
+      "learning_rate": 8.000000000000001e-06,
+      "loss": 1.8407,
       "step": 5
     },
     {
+      "epoch": 0.04145077720207254,
+      "grad_norm": 0.279053270816803,
+      "learning_rate": 1e-05,
+      "loss": 2.012,
       "step": 6
     },
     {
+      "epoch": 0.04835924006908463,
+      "grad_norm": 0.25448864698410034,
+      "learning_rate": 9.998741174712534e-06,
+      "loss": 1.9668,
       "step": 7
     },
     {
+      "epoch": 0.055267702936096716,
+      "grad_norm": 0.25880831480026245,
+      "learning_rate": 9.994965332706574e-06,
+      "loss": 2.1026,
       "step": 8
     },
     {
+      "epoch": 0.06217616580310881,
+      "grad_norm": 0.2563261389732361,
+      "learning_rate": 9.98867437523228e-06,
+      "loss": 1.8686,
       "step": 9
     },
     {
+      "epoch": 0.0690846286701209,
+      "grad_norm": 0.22925390303134918,
+      "learning_rate": 9.979871469976197e-06,
+      "loss": 1.8824,
       "step": 10
     },
     {
+      "epoch": 0.07599309153713299,
+      "grad_norm": 0.22950085997581482,
+      "learning_rate": 9.968561049466214e-06,
+      "loss": 1.7878,
       "step": 11
     },
     {
+      "epoch": 0.08290155440414508,
+      "grad_norm": 0.2901078462600708,
+      "learning_rate": 9.954748808839675e-06,
+      "loss": 2.2594,
       "step": 12
     },
     {
+      "epoch": 0.08981001727115717,
+      "grad_norm": 0.24290603399276733,
+      "learning_rate": 9.938441702975689e-06,
+      "loss": 1.8975,
       "step": 13
     },
     {
+      "epoch": 0.09671848013816926,
+      "grad_norm": 0.27432599663734436,
+      "learning_rate": 9.91964794299315e-06,
+      "loss": 2.3578,
       "step": 14
     },
     {
+      "epoch": 0.10362694300518134,
+      "grad_norm": 0.23735301196575165,
+      "learning_rate": 9.898376992116179e-06,
+      "loss": 1.8863,
       "step": 15
     },
     {
+      "epoch": 0.11053540587219343,
+      "grad_norm": 0.22492671012878418,
+      "learning_rate": 9.874639560909118e-06,
+      "loss": 1.8989,
       "step": 16
     },
     {
+      "epoch": 0.11744386873920552,
+      "grad_norm": 0.21187926828861237,
+      "learning_rate": 9.848447601883436e-06,
+      "loss": 1.9164,
       "step": 17
     },
     {
+      "epoch": 0.12435233160621761,
+      "grad_norm": 0.23231491446495056,
+      "learning_rate": 9.819814303479268e-06,
+      "loss": 1.8666,
       "step": 18
     },
     {
+      "epoch": 0.13126079447322972,
+      "grad_norm": 0.2294367104768753,
+      "learning_rate": 9.788754083424654e-06,
+      "loss": 1.885,
       "step": 19
     },
     {
+      "epoch": 0.1381692573402418,
+      "grad_norm": 0.23653824627399445,
+      "learning_rate": 9.755282581475769e-06,
+      "loss": 1.9521,
       "step": 20
     },
     {
+      "epoch": 0.14507772020725387,
+      "grad_norm": 0.2559220790863037,
+      "learning_rate": 9.719416651541839e-06,
+      "loss": 2.0136,
       "step": 21
     },
     {
+      "epoch": 0.15198618307426598,
+      "grad_norm": 0.24212689697742462,
+      "learning_rate": 9.681174353198687e-06,
+      "loss": 2.0389,
       "step": 22
     },
     {
+      "epoch": 0.15889464594127806,
+      "grad_norm": 0.21532879769802094,
+      "learning_rate": 9.640574942595195e-06,
+      "loss": 1.7763,
       "step": 23
     },
     {
+      "epoch": 0.16580310880829016,
+      "grad_norm": 0.25662222504615784,
+      "learning_rate": 9.597638862757255e-06,
+      "loss": 2.1307,
       "step": 24
     },
     {
+      "epoch": 0.17271157167530224,
+      "grad_norm": 0.2255883365869522,
+      "learning_rate": 9.552387733294081e-06,
+      "loss": 1.9851,
       "step": 25
     },
     {
+      "epoch": 0.17962003454231434,
+      "grad_norm": 0.2256392389535904,
+      "learning_rate": 9.504844339512096e-06,
+      "loss": 1.8682,
       "step": 26
     },
     {
+      "epoch": 0.18652849740932642,
+      "grad_norm": 0.2532212436199188,
+      "learning_rate": 9.45503262094184e-06,
+      "loss": 2.0718,
       "step": 27
     },
     {
+      "epoch": 0.19343696027633853,
+      "grad_norm": 0.2326337695121765,
+      "learning_rate": 9.40297765928369e-06,
+      "loss": 1.7779,
       "step": 28
     },
     {
+      "epoch": 0.2003454231433506,
+      "grad_norm": 0.2295856773853302,
+      "learning_rate": 9.348705665778479e-06,
+      "loss": 2.1006,
       "step": 29
     },
     {
+      "epoch": 0.20725388601036268,
+      "grad_norm": 0.2527850270271301,
+      "learning_rate": 9.292243968009332e-06,
+      "loss": 2.2097,
       "step": 30
     },
     {
+      "epoch": 0.2141623488773748,
+      "grad_norm": 0.22618888318538666,
+      "learning_rate": 9.233620996141421e-06,
+      "loss": 1.7951,
       "step": 31
     },
     {
+      "epoch": 0.22107081174438686,
+      "grad_norm": 0.2514853775501251,
+      "learning_rate": 9.172866268606514e-06,
+      "loss": 2.2897,
       "step": 32
     },
     {
+      "epoch": 0.22797927461139897,
+      "grad_norm": 0.2353752851486206,
+      "learning_rate": 9.110010377239552e-06,
+      "loss": 1.8954,
       "step": 33
     },
     {
+      "epoch": 0.23488773747841105,
+      "grad_norm": 0.2222089171409607,
+      "learning_rate": 9.045084971874738e-06,
+      "loss": 1.8893,
       "step": 34
     },
     {
+      "epoch": 0.24179620034542315,
+      "grad_norm": 0.2845269739627838,
+      "learning_rate": 8.978122744408905e-06,
+      "loss": 2.1425,
       "step": 35
     },
     {
+      "epoch": 0.24870466321243523,
+      "grad_norm": 0.2125595360994339,
+      "learning_rate": 8.90915741234015e-06,
+      "loss": 1.8447,
       "step": 36
     },
     {
+      "epoch": 0.2556131260794473,
+      "grad_norm": 0.23252736032009125,
+      "learning_rate": 8.838223701790057e-06,
+      "loss": 1.8057,
       "step": 37
     },
     {
+      "epoch": 0.26252158894645944,
+      "grad_norm": 0.22627419233322144,
+      "learning_rate": 8.765357330018056e-06,
+      "loss": 1.9897,
       "step": 38
     },
     {
+      "epoch": 0.2694300518134715,
+      "grad_norm": 0.22737424075603485,
+      "learning_rate": 8.690594987436705e-06,
+      "loss": 1.8672,
       "step": 39
     },
     {
+      "epoch": 0.2763385146804836,
+      "grad_norm": 0.25408855080604553,
+      "learning_rate": 8.613974319136959e-06,
+      "loss": 2.0794,
       "step": 40
     },
     {
+      "epoch": 0.28324697754749567,
+      "grad_norm": 0.2922523319721222,
+      "learning_rate": 8.535533905932739e-06,
+      "loss": 2.4457,
       "step": 41
     },
     {
+      "epoch": 0.29015544041450775,
+      "grad_norm": 0.23074638843536377,
+      "learning_rate": 8.455313244934324e-06,
+      "loss": 1.8467,
       "step": 42
     },
     {
+      "epoch": 0.2970639032815199,
+      "grad_norm": 0.21250127255916595,
+      "learning_rate": 8.373352729660373e-06,
+      "loss": 1.7373,
       "step": 43
     },
     {
+      "epoch": 0.30397236614853196,
+      "grad_norm": 0.2267821580171585,
+      "learning_rate": 8.289693629698564e-06,
+      "loss": 1.8409,
       "step": 44
     },
     {
+      "epoch": 0.31088082901554404,
+      "grad_norm": 0.23144274950027466,
+      "learning_rate": 8.204378069925121e-06,
+      "loss": 1.8812,
       "step": 45
     },
     {
+      "epoch": 0.3177892918825561,
+      "grad_norm": 0.245137557387352,
+      "learning_rate": 8.117449009293668e-06,
+      "loss": 1.9027,
       "step": 46
     },
     {
+      "epoch": 0.32469775474956825,
+      "grad_norm": 0.27354151010513306,
+      "learning_rate": 8.0289502192041e-06,
+      "loss": 2.2673,
       "step": 47
     },
     {
+      "epoch": 0.3316062176165803,
+      "grad_norm": 0.23882536590099335,
+      "learning_rate": 7.938926261462366e-06,
+      "loss": 1.9599,
       "step": 48
     },
     {
+      "epoch": 0.3385146804835924,
+      "grad_norm": 0.25785377621650696,
+      "learning_rate": 7.84742246584226e-06,
+      "loss": 2.1052,
       "step": 49
     },
     {
+      "epoch": 0.3454231433506045,
+      "grad_norm": 0.2514020502567291,
+      "learning_rate": 7.754484907260513e-06,
+      "loss": 2.0549,
       "step": 50
     },
     {
+      "epoch": 0.3454231433506045,
+      "eval_loss": 0.5051040649414062,
+      "eval_runtime": 212.9606,
+      "eval_samples_per_second": 2.414,
+      "eval_steps_per_second": 0.606,
       "step": 50
     }
   ],
   "logging_steps": 1,
+  "max_steps": 145,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 50,
       "attributes": {}
     }
   },
+  "total_flos": 1.0220481364790016e+18,
+  "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null
 }

last-checkpoint/training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:908a527816bc09fbe07f310e4f80e352792f6417bc1abafa58d2254bddc3d1db
-size 6225

 version https://git-lfs.github.com/spec/v1
+oid sha256:702294f2ba6032bb021dc32bbc9dbd5ee8f2ef55f4eb6c78b41cc0994567f4e2
+size 6289