Model save

Browse files

Files changed (4) hide show

README.md +69 -0
all_results.json +9 -0
train_results.json +9 -0
trainer_state.json +1380 -0

README.md ADDED Viewed

	@@ -0,0 +1,69 @@

+---
+license: llama3
+library_name: peft
+tags:
+- trl
+- sft
+- generated_from_trainer
+base_model: meta-llama/Meta-Llama-3-8B
+datasets:
+- generator
+model-index:
+- name: downstream-7b
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# downstream-7b
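+This model is a PEFT adapter fine-tuned with TRL supervised fine-tuning (SFT) from [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the `generator` dataset. Note that, despite the `7b` suffix in its name, the base model has 8B parameters.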
+It achieves the following results on the evaluation set:
+- Loss: 1.0792
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0002
+- train_batch_size: 4
+- eval_batch_size: 1
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 4
+- gradient_accumulation_steps: 4
+- total_train_batch_size: 64
+- total_eval_batch_size: 4
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.1
+- num_epochs: 1
+### Training results
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 1.0709        | 0.9992 | 949  | 1.0792          |
+### Framework versions
+- PEFT 0.11.1
+- Transformers 4.43.4
+- Pytorch 2.3.1+cu121
+- Datasets 2.19.1
+- Tokenizers 0.19.1

all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 0.9992103185048697,
+    "total_flos": 1959448100732928.0,
+    "train_loss": 1.0930153100081064,
+    "train_runtime": 22340.3866,
+    "train_samples": 103932,
+    "train_samples_per_second": 2.72,
+    "train_steps_per_second": 0.042
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 0.9992103185048697,
+    "total_flos": 1959448100732928.0,
+    "train_loss": 1.0930153100081064,
+    "train_runtime": 22340.3866,
+    "train_samples": 103932,
+    "train_samples_per_second": 2.72,
+    "train_steps_per_second": 0.042
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1380 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9992103185048697,
+  "eval_steps": 500,
+  "global_step": 949,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0010529086601737299,
+      "grad_norm": 4.026114469450001,
+      "learning_rate": 2.105263157894737e-06,
+      "loss": 1.3755,
+      "step": 1
+    },
+    {
+      "epoch": 0.0052645433008686494,
+      "grad_norm": 1.1251945428540173,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.3102,
+      "step": 5
+    },
+    {
+      "epoch": 0.010529086601737299,
+      "grad_norm": 0.563945023031292,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.2626,
+      "step": 10
+    },
+    {
+      "epoch": 0.01579362990260595,
+      "grad_norm": 0.4336837922055097,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.2133,
+      "step": 15
+    },
+    {
+      "epoch": 0.021058173203474598,
+      "grad_norm": 0.33707427690007363,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.1636,
+      "step": 20
+    },
+    {
+      "epoch": 0.026322716504343247,
+      "grad_norm": 0.2412443925117212,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.1845,
+      "step": 25
+    },
+    {
+      "epoch": 0.0315872598052119,
+      "grad_norm": 0.2414606698915264,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 1.1401,
+      "step": 30
+    },
+    {
+      "epoch": 0.03685180310608055,
+      "grad_norm": 0.22628642837844729,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 1.1591,
+      "step": 35
+    },
+    {
+      "epoch": 0.042116346406949196,
+      "grad_norm": 0.2208133931977146,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 1.1397,
+      "step": 40
+    },
+    {
+      "epoch": 0.04738088970781785,
+      "grad_norm": 0.21905054641153135,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 1.1295,
+      "step": 45
+    },
+    {
+      "epoch": 0.052645433008686494,
+      "grad_norm": 0.1951061692287286,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 1.141,
+      "step": 50
+    },
+    {
+      "epoch": 0.05790997630955515,
+      "grad_norm": 0.18792729468212402,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 1.121,
+      "step": 55
+    },
+    {
+      "epoch": 0.0631745196104238,
+      "grad_norm": 0.189919407852987,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 1.1347,
+      "step": 60
+    },
+    {
+      "epoch": 0.06843906291129244,
+      "grad_norm": 0.18793881851552416,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 1.0961,
+      "step": 65
+    },
+    {
+      "epoch": 0.0737036062121611,
+      "grad_norm": 0.18263188517758086,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 1.0937,
+      "step": 70
+    },
+    {
+      "epoch": 0.07896814951302975,
+      "grad_norm": 0.18520098125405152,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 1.1419,
+      "step": 75
+    },
+    {
+      "epoch": 0.08423269281389839,
+      "grad_norm": 0.18675524775004465,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 1.1094,
+      "step": 80
+    },
+    {
+      "epoch": 0.08949723611476705,
+      "grad_norm": 0.18469057661828525,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 1.0952,
+      "step": 85
+    },
+    {
+      "epoch": 0.0947617794156357,
+      "grad_norm": 0.17860571701450936,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 1.1035,
+      "step": 90
+    },
+    {
+      "epoch": 0.10002632271650434,
+      "grad_norm": 0.2032976356381528,
+      "learning_rate": 0.0002,
+      "loss": 1.1329,
+      "step": 95
+    },
+    {
+      "epoch": 0.10529086601737299,
+      "grad_norm": 0.18932762375677964,
+      "learning_rate": 0.0001999830846194422,
+      "loss": 1.0902,
+      "step": 100
+    },
+    {
+      "epoch": 0.11055540931824165,
+      "grad_norm": 0.17823414518835126,
+      "learning_rate": 0.00019993234420037073,
+      "loss": 1.0951,
+      "step": 105
+    },
+    {
+      "epoch": 0.1158199526191103,
+      "grad_norm": 0.19033211796864122,
+      "learning_rate": 0.00019984779590865556,
+      "loss": 1.11,
+      "step": 110
+    },
+    {
+      "epoch": 0.12108449591997894,
+      "grad_norm": 0.1781090004150184,
+      "learning_rate": 0.0001997294683476273,
+      "loss": 1.1216,
+      "step": 115
+    },
+    {
+      "epoch": 0.1263490392208476,
+      "grad_norm": 0.20142566240295628,
+      "learning_rate": 0.0001995774015484005,
+      "loss": 1.088,
+      "step": 120
+    },
+    {
+      "epoch": 0.13161358252171623,
+      "grad_norm": 0.16738672746077932,
+      "learning_rate": 0.00019939164695633067,
+      "loss": 1.1069,
+      "step": 125
+    },
+    {
+      "epoch": 0.13687812582258488,
+      "grad_norm": 0.17141306079033702,
+      "learning_rate": 0.00019917226741361015,
+      "loss": 1.1178,
+      "step": 130
+    },
+    {
+      "epoch": 0.14214266912345355,
+      "grad_norm": 0.18242919862111662,
+      "learning_rate": 0.00019891933713800798,
+      "loss": 1.115,
+      "step": 135
+    },
+    {
+      "epoch": 0.1474072124243222,
+      "grad_norm": 0.18858703761293544,
+      "learning_rate": 0.00019863294169776148,
+      "loss": 1.092,
+      "step": 140
+    },
+    {
+      "epoch": 0.15267175572519084,
+      "grad_norm": 0.1851910906506613,
+      "learning_rate": 0.00019831317798262786,
+      "loss": 1.1015,
+      "step": 145
+    },
+    {
+      "epoch": 0.1579362990260595,
+      "grad_norm": 0.17061718616532065,
+      "learning_rate": 0.00019796015417110577,
+      "loss": 1.0834,
+      "step": 150
+    },
+    {
+      "epoch": 0.16320084232692814,
+      "grad_norm": 0.19083175263550564,
+      "learning_rate": 0.0001975739896938375,
+      "loss": 1.0915,
+      "step": 155
+    },
+    {
+      "epoch": 0.16846538562779678,
+      "grad_norm": 0.17041981222039004,
+      "learning_rate": 0.00019715481519320496,
+      "loss": 1.1045,
+      "step": 160
+    },
+    {
+      "epoch": 0.17372992892866543,
+      "grad_norm": 0.17539080495334333,
+      "learning_rate": 0.00019670277247913205,
+      "loss": 1.0822,
+      "step": 165
+    },
+    {
+      "epoch": 0.1789944722295341,
+      "grad_norm": 0.16847918243353582,
+      "learning_rate": 0.00019621801448110952,
+      "loss": 1.1113,
+      "step": 170
+    },
+    {
+      "epoch": 0.18425901553040275,
+      "grad_norm": 0.16577520965121645,
+      "learning_rate": 0.00019570070519645767,
+      "loss": 1.0726,
+      "step": 175
+    },
+    {
+      "epoch": 0.1895235588312714,
+      "grad_norm": 0.17216940817918563,
+      "learning_rate": 0.00019515101963484485,
+      "loss": 1.1214,
+      "step": 180
+    },
+    {
+      "epoch": 0.19478810213214004,
+      "grad_norm": 0.16717603329959776,
+      "learning_rate": 0.00019456914375908023,
+      "loss": 1.0749,
+      "step": 185
+    },
+    {
+      "epoch": 0.20005264543300869,
+      "grad_norm": 0.16743311436795275,
+      "learning_rate": 0.0001939552744222014,
+      "loss": 1.0856,
+      "step": 190
+    },
+    {
+      "epoch": 0.20531718873387733,
+      "grad_norm": 0.16628473925396028,
+      "learning_rate": 0.00019330961930087725,
+      "loss": 1.1088,
+      "step": 195
+    },
+    {
+      "epoch": 0.21058173203474598,
+      "grad_norm": 0.1735673710306468,
+      "learning_rate": 0.00019263239682514952,
+      "loss": 1.094,
+      "step": 200
+    },
+    {
+      "epoch": 0.21584627533561462,
+      "grad_norm": 0.16463208491106188,
+      "learning_rate": 0.00019192383610453618,
+      "loss": 1.1191,
+      "step": 205
+    },
+    {
+      "epoch": 0.2211108186364833,
+      "grad_norm": 0.1697201646634803,
+      "learning_rate": 0.00019118417685052194,
+      "loss": 1.1188,
+      "step": 210
+    },
+    {
+      "epoch": 0.22637536193735194,
+      "grad_norm": 0.15930348674889006,
+      "learning_rate": 0.00019041366929546219,
+      "loss": 1.1132,
+      "step": 215
+    },
+    {
+      "epoch": 0.2316399052382206,
+      "grad_norm": 0.16154587528605638,
+      "learning_rate": 0.0001896125741079272,
+      "loss": 1.1029,
+      "step": 220
+    },
+    {
+      "epoch": 0.23690444853908924,
+      "grad_norm": 0.1689593754891321,
+      "learning_rate": 0.00018878116230451613,
+      "loss": 1.1196,
+      "step": 225
+    },
+    {
+      "epoch": 0.24216899183995788,
+      "grad_norm": 0.158620581331537,
+      "learning_rate": 0.0001879197151581702,
+      "loss": 1.0786,
+      "step": 230
+    },
+    {
+      "epoch": 0.24743353514082653,
+      "grad_norm": 0.1591976970503649,
+      "learning_rate": 0.00018702852410301554,
+      "loss": 1.0861,
+      "step": 235
+    },
+    {
+      "epoch": 0.2526980784416952,
+      "grad_norm": 0.16271741933565712,
+      "learning_rate": 0.00018610789063576913,
+      "loss": 1.077,
+      "step": 240
+    },
+    {
+      "epoch": 0.25796262174256385,
+      "grad_norm": 0.16691583214883504,
+      "learning_rate": 0.00018515812621373997,
+      "loss": 1.0931,
+      "step": 245
+    },
+    {
+      "epoch": 0.26322716504343247,
+      "grad_norm": 0.15795453416798677,
+      "learning_rate": 0.00018417955214946092,
+      "loss": 1.0929,
+      "step": 250
+    },
+    {
+      "epoch": 0.26849170834430114,
+      "grad_norm": 0.15734119895920037,
+      "learning_rate": 0.00018317249950198597,
+      "loss": 1.086,
+      "step": 255
+    },
+    {
+      "epoch": 0.27375625164516976,
+      "grad_norm": 0.15815121783491273,
+      "learning_rate": 0.0001821373089648906,
+      "loss": 1.1142,
+      "step": 260
+    },
+    {
+      "epoch": 0.27902079494603843,
+      "grad_norm": 0.15790684873329372,
+      "learning_rate": 0.00018107433075101252,
+      "loss": 1.0907,
+      "step": 265
+    },
+    {
+      "epoch": 0.2842853382469071,
+      "grad_norm": 0.1603612919235376,
+      "learning_rate": 0.00017998392447397197,
+      "loss": 1.103,
+      "step": 270
+    },
+    {
+      "epoch": 0.2895498815477757,
+      "grad_norm": 0.1935643212000403,
+      "learning_rate": 0.00017886645902651167,
+      "loss": 1.1207,
+      "step": 275
+    },
+    {
+      "epoch": 0.2948144248486444,
+      "grad_norm": 0.16197395404790052,
+      "learning_rate": 0.0001777223124556978,
+      "loss": 1.1036,
+      "step": 280
+    },
+    {
+      "epoch": 0.300078968149513,
+      "grad_norm": 0.16503760296000086,
+      "learning_rate": 0.00017655187183502344,
+      "loss": 1.0647,
+      "step": 285
+    },
+    {
+      "epoch": 0.3053435114503817,
+      "grad_norm": 0.1772283442967409,
+      "learning_rate": 0.00017535553313345904,
+      "loss": 1.1075,
+      "step": 290
+    },
+    {
+      "epoch": 0.3106080547512503,
+      "grad_norm": 0.16282645295325013,
+      "learning_rate": 0.00017413370108149286,
+      "loss": 1.1094,
+      "step": 295
+    },
+    {
+      "epoch": 0.315872598052119,
+      "grad_norm": 0.15561402718068354,
+      "learning_rate": 0.00017288678903420762,
+      "loss": 1.0776,
+      "step": 300
+    },
+    {
+      "epoch": 0.32113714135298765,
+      "grad_norm": 0.15594031474920508,
+      "learning_rate": 0.00017161521883143934,
+      "loss": 1.1078,
+      "step": 305
+    },
+    {
+      "epoch": 0.32640168465385627,
+      "grad_norm": 0.1570395383591175,
+      "learning_rate": 0.00017031942065506576,
+      "loss": 1.1124,
+      "step": 310
+    },
+    {
+      "epoch": 0.33166622795472495,
+      "grad_norm": 0.15773152315944608,
+      "learning_rate": 0.00016899983288347248,
+      "loss": 1.0913,
+      "step": 315
+    },
+    {
+      "epoch": 0.33693077125559356,
+      "grad_norm": 0.15310806808595664,
+      "learning_rate": 0.00016765690194324616,
+      "loss": 1.0845,
+      "step": 320
+    },
+    {
+      "epoch": 0.34219531455646224,
+      "grad_norm": 0.16384678369715433,
+      "learning_rate": 0.00016629108215814525,
+      "loss": 1.1173,
+      "step": 325
+    },
+    {
+      "epoch": 0.34745985785733086,
+      "grad_norm": 0.165818325184464,
+      "learning_rate": 0.00016490283559539838,
+      "loss": 1.1014,
+      "step": 330
+    },
+    {
+      "epoch": 0.35272440115819953,
+      "grad_norm": 0.15456003221800826,
+      "learning_rate": 0.000163492631909384,
+      "loss": 1.0915,
+      "step": 335
+    },
+    {
+      "epoch": 0.3579889444590682,
+      "grad_norm": 0.16059867173233644,
+      "learning_rate": 0.00016206094818274229,
+      "loss": 1.0969,
+      "step": 340
+    },
+    {
+      "epoch": 0.3632534877599368,
+      "grad_norm": 0.17415674474557066,
+      "learning_rate": 0.00016060826876497478,
+      "loss": 1.1145,
+      "step": 345
+    },
+    {
+      "epoch": 0.3685180310608055,
+      "grad_norm": 0.16440403677512835,
+      "learning_rate": 0.0001591350851085851,
+      "loss": 1.0683,
+      "step": 350
+    },
+    {
+      "epoch": 0.3737825743616741,
+      "grad_norm": 0.15901493438320982,
+      "learning_rate": 0.00015764189560281677,
+      "loss": 1.1199,
+      "step": 355
+    },
+    {
+      "epoch": 0.3790471176625428,
+      "grad_norm": 0.15988293404570103,
+      "learning_rate": 0.00015612920540504453,
+      "loss": 1.0709,
+      "step": 360
+    },
+    {
+      "epoch": 0.3843116609634114,
+      "grad_norm": 0.1616109424204681,
+      "learning_rate": 0.00015459752626987563,
+      "loss": 1.1027,
+      "step": 365
+    },
+    {
+      "epoch": 0.3895762042642801,
+      "grad_norm": 0.1513607201651111,
+      "learning_rate": 0.00015304737637601926,
+      "loss": 1.0956,
+      "step": 370
+    },
+    {
+      "epoch": 0.3948407475651487,
+      "grad_norm": 0.15452619863423803,
+      "learning_rate": 0.0001514792801509831,
+      "loss": 1.0952,
+      "step": 375
+    },
+    {
+      "epoch": 0.40010529086601737,
+      "grad_norm": 0.15418975657555584,
+      "learning_rate": 0.00014989376809365493,
+      "loss": 1.0934,
+      "step": 380
+    },
+    {
+      "epoch": 0.40536983416688604,
+      "grad_norm": 0.15158447263390024,
+      "learning_rate": 0.00014829137659483143,
+      "loss": 1.0981,
+      "step": 385
+    },
+    {
+      "epoch": 0.41063437746775466,
+      "grad_norm": 0.15420702474431047,
+      "learning_rate": 0.0001466726477557527,
+      "loss": 1.1013,
+      "step": 390
+    },
+    {
+      "epoch": 0.41589892076862334,
+      "grad_norm": 0.1513401762569788,
+      "learning_rate": 0.00014503812920470534,
+      "loss": 1.1128,
+      "step": 395
+    },
+    {
+      "epoch": 0.42116346406949196,
+      "grad_norm": 0.1759021276212348,
+      "learning_rate": 0.00014338837391175582,
+      "loss": 1.0793,
+      "step": 400
+    },
+    {
+      "epoch": 0.42642800737036063,
+      "grad_norm": 0.15639002655528358,
+      "learning_rate": 0.00014172394000167623,
+      "loss": 1.1126,
+      "step": 405
+    },
+    {
+      "epoch": 0.43169255067122925,
+      "grad_norm": 0.1558922751326013,
+      "learning_rate": 0.00014004539056512667,
+      "loss": 1.0864,
+      "step": 410
+    },
+    {
+      "epoch": 0.4369570939720979,
+      "grad_norm": 0.15449223519766864,
+      "learning_rate": 0.00013835329346815716,
+      "loss": 1.1161,
+      "step": 415
+    },
+    {
+      "epoch": 0.4422216372729666,
+      "grad_norm": 0.15398779214531882,
+      "learning_rate": 0.0001366482211600945,
+      "loss": 1.113,
+      "step": 420
+    },
+    {
+      "epoch": 0.4474861805738352,
+      "grad_norm": 0.15902962443654645,
+      "learning_rate": 0.000134930750479878,
+      "loss": 1.0783,
+      "step": 425
+    },
+    {
+      "epoch": 0.4527507238747039,
+      "grad_norm": 0.15614703146804315,
+      "learning_rate": 0.00013320146246091074,
+      "loss": 1.0891,
+      "step": 430
+    },
+    {
+      "epoch": 0.4580152671755725,
+      "grad_norm": 0.151735228198923,
+      "learning_rate": 0.00013146094213449148,
+      "loss": 1.1006,
+      "step": 435
+    },
+    {
+      "epoch": 0.4632798104764412,
+      "grad_norm": 0.1633743946888902,
+      "learning_rate": 0.00012970977833189393,
+      "loss": 1.0717,
+      "step": 440
+    },
+    {
+      "epoch": 0.4685443537773098,
+      "grad_norm": 0.16534257355481496,
+      "learning_rate": 0.00012794856348516095,
+      "loss": 1.0778,
+      "step": 445
+    },
+    {
+      "epoch": 0.47380889707817847,
+      "grad_norm": 0.1856142828881669,
+      "learning_rate": 0.00012617789342668004,
+      "loss": 1.0859,
+      "step": 450
+    },
+    {
+      "epoch": 0.47907344037904714,
+      "grad_norm": 0.15229515578033356,
+      "learning_rate": 0.00012439836718760886,
+      "loss": 1.0761,
+      "step": 455
+    },
+    {
+      "epoch": 0.48433798367991576,
+      "grad_norm": 0.15984985984562605,
+      "learning_rate": 0.00012261058679521834,
+      "loss": 1.0926,
+      "step": 460
+    },
+    {
+      "epoch": 0.48960252698078444,
+      "grad_norm": 0.14896040772758903,
+      "learning_rate": 0.00012081515706922227,
+      "loss": 1.0834,
+      "step": 465
+    },
+    {
+      "epoch": 0.49486707028165305,
+      "grad_norm": 0.1514924492192347,
+      "learning_rate": 0.00011901268541716224,
+      "loss": 1.0885,
+      "step": 470
+    },
+    {
+      "epoch": 0.5001316135825217,
+      "grad_norm": 0.1513889418015892,
+      "learning_rate": 0.00011720378162891708,
+      "loss": 1.1001,
+      "step": 475
+    },
+    {
+      "epoch": 0.5053961568833903,
+      "grad_norm": 0.15159825336613816,
+      "learning_rate": 0.0001153890576704062,
+      "loss": 1.1082,
+      "step": 480
+    },
+    {
+      "epoch": 0.510660700184259,
+      "grad_norm": 0.15427722774659086,
+      "learning_rate": 0.00011356912747655685,
+      "loss": 1.0843,
+      "step": 485
+    },
+    {
+      "epoch": 0.5159252434851277,
+      "grad_norm": 0.14639500931900093,
+      "learning_rate": 0.00011174460674360549,
+      "loss": 1.1058,
+      "step": 490
+    },
+    {
+      "epoch": 0.5211897867859964,
+      "grad_norm": 0.15320269723203808,
+      "learning_rate": 0.00010991611272080269,
+      "loss": 1.1125,
+      "step": 495
+    },
+    {
+      "epoch": 0.5264543300868649,
+      "grad_norm": 0.15092814943890553,
+      "learning_rate": 0.00010808426400159338,
+      "loss": 1.0898,
+      "step": 500
+    },
+    {
+      "epoch": 0.5317188733877336,
+      "grad_norm": 0.14712598563479434,
+      "learning_rate": 0.00010624968031434173,
+      "loss": 1.0975,
+      "step": 505
+    },
+    {
+      "epoch": 0.5369834166886023,
+      "grad_norm": 0.1506174008648404,
+      "learning_rate": 0.00010441298231267242,
+      "loss": 1.0789,
+      "step": 510
+    },
+    {
+      "epoch": 0.542247959989471,
+      "grad_norm": 0.14915164476738402,
+      "learning_rate": 0.00010257479136549889,
+      "loss": 1.088,
+      "step": 515
+    },
+    {
+      "epoch": 0.5475125032903395,
+      "grad_norm": 0.14933216158522156,
+      "learning_rate": 0.00010073572934680919,
+      "loss": 1.1012,
+      "step": 520
+    },
+    {
+      "epoch": 0.5527770465912082,
+      "grad_norm": 0.1623395783916047,
+      "learning_rate": 9.889641842528178e-05,
+      "loss": 1.0992,
+      "step": 525
+    },
+    {
+      "epoch": 0.5580415898920769,
+      "grad_norm": 0.15524883773019818,
+      "learning_rate": 9.70574808538006e-05,
+      "loss": 1.0558,
+      "step": 530
+    },
+    {
+      "epoch": 0.5633061331929455,
+      "grad_norm": 0.14879516385003932,
+      "learning_rate": 9.521953875894257e-05,
+      "loss": 1.0634,
+      "step": 535
+    },
+    {
+      "epoch": 0.5685706764938142,
+      "grad_norm": 0.14856407933911947,
+      "learning_rate": 9.338321393050719e-05,
+      "loss": 1.0513,
+      "step": 540
+    },
+    {
+      "epoch": 0.5738352197946828,
+      "grad_norm": 0.1514919636398635,
+      "learning_rate": 9.154912761116056e-05,
+      "loss": 1.0899,
+      "step": 545
+    },
+    {
+      "epoch": 0.5790997630955514,
+      "grad_norm": 0.15005939408454377,
+      "learning_rate": 8.971790028626395e-05,
+      "loss": 1.09,
+      "step": 550
+    },
+    {
+      "epoch": 0.5843643063964201,
+      "grad_norm": 0.1541140355049706,
+      "learning_rate": 8.789015147395919e-05,
+      "loss": 1.072,
+      "step": 555
+    },
+    {
+      "epoch": 0.5896288496972888,
+      "grad_norm": 0.14756189100480177,
+      "learning_rate": 8.606649951558073e-05,
+      "loss": 1.0548,
+      "step": 560
+    },
+    {
+      "epoch": 0.5948933929981574,
+      "grad_norm": 0.14468591274130843,
+      "learning_rate": 8.424756136646623e-05,
+      "loss": 1.056,
+      "step": 565
+    },
+    {
+      "epoch": 0.600157936299026,
+      "grad_norm": 0.1510683202100121,
+      "learning_rate": 8.243395238723571e-05,
+      "loss": 1.0999,
+      "step": 570
+    },
+    {
+      "epoch": 0.6054224795998947,
+      "grad_norm": 0.14942489035639112,
+      "learning_rate": 8.062628613561051e-05,
+      "loss": 1.08,
+      "step": 575
+    },
+    {
+      "epoch": 0.6106870229007634,
+      "grad_norm": 0.14792710995590722,
+      "learning_rate": 7.8825174158842e-05,
+      "loss": 1.0916,
+      "step": 580
+    },
+    {
+      "epoch": 0.615951566201632,
+      "grad_norm": 0.14543568608581728,
+      "learning_rate": 7.703122578682046e-05,
+      "loss": 1.061,
+      "step": 585
+    },
+    {
+      "epoch": 0.6212161095025006,
+      "grad_norm": 0.14792849899325772,
+      "learning_rate": 7.524504792593419e-05,
+      "loss": 1.1101,
+      "step": 590
+    },
+    {
+      "epoch": 0.6264806528033693,
+      "grad_norm": 0.14574924924348462,
+      "learning_rate": 7.346724485374837e-05,
+      "loss": 1.0687,
+      "step": 595
+    },
+    {
+      "epoch": 0.631745196104238,
+      "grad_norm": 0.1434166906369258,
+      "learning_rate": 7.169841801457347e-05,
+      "loss": 1.0825,
+      "step": 600
+    },
+    {
+      "epoch": 0.6370097394051066,
+      "grad_norm": 0.14254720323207454,
+      "learning_rate": 6.993916581599202e-05,
+      "loss": 1.0896,
+      "step": 605
+    },
+    {
+      "epoch": 0.6422742827059753,
+      "grad_norm": 0.14534591022474969,
+      "learning_rate": 6.819008342641273e-05,
+      "loss": 1.0805,
+      "step": 610
+    },
+    {
+      "epoch": 0.6475388260068439,
+      "grad_norm": 0.1471482502229213,
+      "learning_rate": 6.645176257372055e-05,
+      "loss": 1.0933,
+      "step": 615
+    },
+    {
+      "epoch": 0.6528033693077125,
+      "grad_norm": 0.14967562406928056,
+      "learning_rate": 6.472479134509052e-05,
+      "loss": 1.0987,
+      "step": 620
+    },
+    {
+      "epoch": 0.6580679126085812,
+      "grad_norm": 0.14756218985788289,
+      "learning_rate": 6.300975398803362e-05,
+      "loss": 1.0862,
+      "step": 625
+    },
+    {
+      "epoch": 0.6633324559094499,
+      "grad_norm": 0.14358810278632364,
+      "learning_rate": 6.130723071274107e-05,
+      "loss": 1.0736,
+      "step": 630
+    },
+    {
+      "epoch": 0.6685969992103185,
+      "grad_norm": 0.14508119820046267,
+      "learning_rate": 5.961779749579516e-05,
+      "loss": 1.077,
+      "step": 635
+    },
+    {
+      "epoch": 0.6738615425111871,
+      "grad_norm": 0.14868475648668983,
+      "learning_rate": 5.794202588531166e-05,
+      "loss": 1.0921,
+      "step": 640
+    },
+    {
+      "epoch": 0.6791260858120558,
+      "grad_norm": 0.14136660751737096,
+      "learning_rate": 5.628048280758096e-05,
+      "loss": 1.0967,
+      "step": 645
+    },
+    {
+      "epoch": 0.6843906291129245,
+      "grad_norm": 0.14429824406995242,
+      "learning_rate": 5.4633730375272594e-05,
+      "loss": 1.094,
+      "step": 650
+    },
+    {
+      "epoch": 0.6896551724137931,
+      "grad_norm": 0.1435583500936634,
+      "learning_rate": 5.300232569726804e-05,
+      "loss": 1.0796,
+      "step": 655
+    },
+    {
+      "epoch": 0.6949197157146617,
+      "grad_norm": 0.14917594264214823,
+      "learning_rate": 5.13868206901867e-05,
+      "loss": 1.0813,
+      "step": 660
+    },
+    {
+      "epoch": 0.7001842590155304,
+      "grad_norm": 0.14484547003342338,
+      "learning_rate": 4.9787761891668397e-05,
+      "loss": 1.0833,
+      "step": 665
+    },
+    {
+      "epoch": 0.7054488023163991,
+      "grad_norm": 0.14125281408090304,
+      "learning_rate": 4.820569027547533e-05,
+      "loss": 1.0813,
+      "step": 670
+    },
+    {
+      "epoch": 0.7107133456172677,
+      "grad_norm": 0.1408995053360923,
+      "learning_rate": 4.6641141068476666e-05,
+      "loss": 1.0752,
+      "step": 675
+    },
+    {
+      "epoch": 0.7159778889181364,
+      "grad_norm": 0.1414179653044325,
+      "learning_rate": 4.5094643569577186e-05,
+      "loss": 1.054,
+      "step": 680
+    },
+    {
+      "epoch": 0.721242432219005,
+      "grad_norm": 0.14582058548503438,
+      "learning_rate": 4.356672097065134e-05,
+      "loss": 1.1048,
+      "step": 685
+    },
+    {
+      "epoch": 0.7265069755198736,
+      "grad_norm": 0.14009606861616825,
+      "learning_rate": 4.205789017954364e-05,
+      "loss": 1.0683,
+      "step": 690
+    },
+    {
+      "epoch": 0.7317715188207423,
+      "grad_norm": 0.14586506040118713,
+      "learning_rate": 4.056866164519465e-05,
+      "loss": 1.0728,
+      "step": 695
+    },
+    {
+      "epoch": 0.737036062121611,
+      "grad_norm": 0.14168474565307407,
+      "learning_rate": 3.909953918495234e-05,
+      "loss": 1.0476,
+      "step": 700
+    },
+    {
+      "epoch": 0.7423006054224796,
+      "grad_norm": 0.14476382479542646,
+      "learning_rate": 3.7651019814126654e-05,
+      "loss": 1.05,
+      "step": 705
+    },
+    {
+      "epoch": 0.7475651487233482,
+      "grad_norm": 0.14528550784733454,
+      "learning_rate": 3.622359357784569e-05,
+      "loss": 1.0611,
+      "step": 710
+    },
+    {
+      "epoch": 0.7528296920242169,
+      "grad_norm": 0.14781069746763306,
+      "learning_rate": 3.481774338526954e-05,
+      "loss": 1.0952,
+      "step": 715
+    },
+    {
+      "epoch": 0.7580942353250856,
+      "grad_norm": 0.15618197530507127,
+      "learning_rate": 3.343394484621855e-05,
+      "loss": 1.0836,
+      "step": 720
+    },
+    {
+      "epoch": 0.7633587786259542,
+      "grad_norm": 0.22087793925041818,
+      "learning_rate": 3.207266611027069e-05,
+      "loss": 1.0727,
+      "step": 725
+    },
+    {
+      "epoch": 0.7686233219268228,
+      "grad_norm": 0.14674869869141435,
+      "learning_rate": 3.0734367708383294e-05,
+      "loss": 1.0712,
+      "step": 730
+    },
+    {
+      "epoch": 0.7738878652276915,
+      "grad_norm": 0.14673826341334423,
+      "learning_rate": 2.9419502397091713e-05,
+      "loss": 1.0852,
+      "step": 735
+    },
+    {
+      "epoch": 0.7791524085285602,
+      "grad_norm": 0.1426087824509766,
+      "learning_rate": 2.812851500533843e-05,
+      "loss": 1.0604,
+      "step": 740
+    },
+    {
+      "epoch": 0.7844169518294288,
+      "grad_norm": 0.1446320144127932,
+      "learning_rate": 2.6861842283983953e-05,
+      "loss": 1.0537,
+      "step": 745
+    },
+    {
+      "epoch": 0.7896814951302974,
+      "grad_norm": 0.14326111319394175,
+      "learning_rate": 2.5619912758050725e-05,
+      "loss": 1.0942,
+      "step": 750
+    },
+    {
+      "epoch": 0.7949460384311661,
+      "grad_norm": 0.14149919988871043,
+      "learning_rate": 2.4403146581749925e-05,
+      "loss": 1.0578,
+      "step": 755
+    },
+    {
+      "epoch": 0.8002105817320347,
+      "grad_norm": 0.14034086298796508,
+      "learning_rate": 2.3211955396340002e-05,
+      "loss": 1.0808,
+      "step": 760
+    },
+    {
+      "epoch": 0.8054751250329034,
+      "grad_norm": 0.1433790314655123,
+      "learning_rate": 2.204674219086531e-05,
+      "loss": 1.0906,
+      "step": 765
+    },
+    {
+      "epoch": 0.8107396683337721,
+      "grad_norm": 0.138618618401559,
+      "learning_rate": 2.090790116582191e-05,
+      "loss": 1.0559,
+      "step": 770
+    },
+    {
+      "epoch": 0.8160042116346407,
+      "grad_norm": 0.1429827381187093,
+      "learning_rate": 1.9795817599796418e-05,
+      "loss": 1.0792,
+      "step": 775
+    },
+    {
+      "epoch": 0.8212687549355093,
+      "grad_norm": 0.14200271718072968,
+      "learning_rate": 1.871086771912348e-05,
+      "loss": 1.0702,
+      "step": 780
+    },
+    {
+      "epoch": 0.826533298236378,
+      "grad_norm": 0.1429932480295589,
+      "learning_rate": 1.7653418570605475e-05,
+      "loss": 1.0715,
+      "step": 785
+    },
+    {
+      "epoch": 0.8317978415372467,
+      "grad_norm": 0.14431467515210814,
+      "learning_rate": 1.6623827897337762e-05,
+      "loss": 1.0713,
+      "step": 790
+    },
+    {
+      "epoch": 0.8370623848381153,
+      "grad_norm": 0.15238820455432608,
+      "learning_rate": 1.562244401768144e-05,
+      "loss": 1.0824,
+      "step": 795
+    },
+    {
+      "epoch": 0.8423269281389839,
+      "grad_norm": 0.14830242766673976,
+      "learning_rate": 1.4649605707424707e-05,
+      "loss": 1.0787,
+      "step": 800
+    },
+    {
+      "epoch": 0.8475914714398526,
+      "grad_norm": 0.14468170557092047,
+      "learning_rate": 1.3705642085172366e-05,
+      "loss": 1.0737,
+      "step": 805
+    },
+    {
+      "epoch": 0.8528560147407213,
+      "grad_norm": 0.14674968769736463,
+      "learning_rate": 1.2790872501002472e-05,
+      "loss": 1.0577,
+      "step": 810
+    },
+    {
+      "epoch": 0.8581205580415899,
+      "grad_norm": 0.14311627432536864,
+      "learning_rate": 1.1905606428427774e-05,
+      "loss": 1.0692,
+      "step": 815
+    },
+    {
+      "epoch": 0.8633851013424585,
+      "grad_norm": 0.14558376197107287,
+      "learning_rate": 1.105014335969855e-05,
+      "loss": 1.0934,
+      "step": 820
+    },
+    {
+      "epoch": 0.8686496446433272,
+      "grad_norm": 0.14414555681497093,
+      "learning_rate": 1.0224772704482033e-05,
+      "loss": 1.0875,
+      "step": 825
+    },
+    {
+      "epoch": 0.8739141879441958,
+      "grad_norm": 0.1399627142514978,
+      "learning_rate": 9.429773691952858e-06,
+      "loss": 1.082,
+      "step": 830
+    },
+    {
+      "epoch": 0.8791787312450645,
+      "grad_norm": 0.1392001373823857,
+      "learning_rate": 8.665415276327871e-06,
+      "loss": 1.0573,
+      "step": 835
+    },
+    {
+      "epoch": 0.8844432745459332,
+      "grad_norm": 0.13993969105859186,
+      "learning_rate": 7.931956045876688e-06,
+      "loss": 1.0448,
+      "step": 840
+    },
+    {
+      "epoch": 0.8897078178468018,
+      "grad_norm": 0.16741517197447736,
+      "learning_rate": 7.229644135439473e-06,
+      "loss": 1.104,
+      "step": 845
+    },
+    {
+      "epoch": 0.8949723611476704,
+      "grad_norm": 0.14123729142229655,
+      "learning_rate": 6.558717142480919e-06,
+      "loss": 1.0808,
+      "step": 850
+    },
+    {
+      "epoch": 0.9002369044485391,
+      "grad_norm": 0.1424278055064695,
+      "learning_rate": 5.919402046709288e-06,
+      "loss": 1.0709,
+      "step": 855
+    },
+    {
+      "epoch": 0.9055014477494078,
+      "grad_norm": 0.13993993967003346,
+      "learning_rate": 5.311915133287415e-06,
+      "loss": 1.0941,
+      "step": 860
+    },
+    {
+      "epoch": 0.9107659910502763,
+      "grad_norm": 0.14557850289664284,
+      "learning_rate": 4.7364619196617495e-06,
+      "loss": 1.0492,
+      "step": 865
+    },
+    {
+      "epoch": 0.916030534351145,
+      "grad_norm": 0.1450177459066908,
+      "learning_rate": 4.193237086034351e-06,
+      "loss": 1.0972,
+      "step": 870
+    },
+    {
+      "epoch": 0.9212950776520137,
+      "grad_norm": 0.1570091074884799,
+      "learning_rate": 3.6824244095010065e-06,
+      "loss": 1.0695,
+      "step": 875
+    },
+    {
+      "epoch": 0.9265596209528824,
+      "grad_norm": 0.14097561405495265,
+      "learning_rate": 3.2041967018780707e-06,
+      "loss": 1.0948,
+      "step": 880
+    },
+    {
+      "epoch": 0.931824164253751,
+      "grad_norm": 0.1420984285291773,
+      "learning_rate": 2.7587157512388718e-06,
+      "loss": 1.0573,
+      "step": 885
+    },
+    {
+      "epoch": 0.9370887075546196,
+      "grad_norm": 0.1545471738706476,
+      "learning_rate": 2.346132267179646e-06,
+      "loss": 1.0786,
+      "step": 890
+    },
+    {
+      "epoch": 0.9423532508554883,
+      "grad_norm": 0.14481364480205125,
+      "learning_rate": 1.9665858298333005e-06,
+      "loss": 1.0939,
+      "step": 895
+    },
+    {
+      "epoch": 0.9476177941563569,
+      "grad_norm": 0.1446556897144525,
+      "learning_rate": 1.6202048426483651e-06,
+      "loss": 1.0752,
+      "step": 900
+    },
+    {
+      "epoch": 0.9528823374572256,
+      "grad_norm": 0.13840641658264988,
+      "learning_rate": 1.3071064889491724e-06,
+      "loss": 1.0757,
+      "step": 905
+    },
+    {
+      "epoch": 0.9581468807580943,
+      "grad_norm": 0.1405867091258211,
+      "learning_rate": 1.0273966922918155e-06,
+      "loss": 1.0886,
+      "step": 910
+    },
+    {
+      "epoch": 0.9634114240589629,
+      "grad_norm": 0.15143973079201015,
+      "learning_rate": 7.81170080629412e-07,
+      "loss": 1.0337,
+      "step": 915
+    },
+    {
+      "epoch": 0.9686759673598315,
+      "grad_norm": 0.15113893856195346,
+      "learning_rate": 5.68509954298757e-07,
+      "loss": 1.099,
+      "step": 920
+    },
+    {
+      "epoch": 0.9739405106607002,
+      "grad_norm": 0.1436446854214333,
+      "learning_rate": 3.8948825783918784e-07,
+      "loss": 1.0595,
+      "step": 925
+    },
+    {
+      "epoch": 0.9792050539615689,
+      "grad_norm": 0.14373165990559605,
+      "learning_rate": 2.4416555565318635e-07,
+      "loss": 1.0815,
+      "step": 930
+    },
+    {
+      "epoch": 0.9844695972624374,
+      "grad_norm": 0.14233020784379563,
+      "learning_rate": 1.3259101151694708e-07,
+      "loss": 1.0569,
+      "step": 935
+    },
+    {
+      "epoch": 0.9897341405633061,
+      "grad_norm": 0.13823967108377017,
+      "learning_rate": 5.480237194799287e-08,
+      "loss": 1.0689,
+      "step": 940
+    },
+    {
+      "epoch": 0.9949986838641748,
+      "grad_norm": 0.1431568671824589,
+      "learning_rate": 1.0825953435122938e-08,
+      "loss": 1.0709,
+      "step": 945
+    },
+    {
+      "epoch": 0.9992103185048697,
+      "eval_loss": 1.07915198802948,
+      "eval_runtime": 3821.2872,
+      "eval_samples_per_second": 3.522,
+      "eval_steps_per_second": 0.881,
+      "step": 949
+    },
+    {
+      "epoch": 0.9992103185048697,
+      "step": 949,
+      "total_flos": 1959448100732928.0,
+      "train_loss": 1.0930153100081064,
+      "train_runtime": 22340.3866,
+      "train_samples_per_second": 2.72,
+      "train_steps_per_second": 0.042
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 949,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1959448100732928.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}