Model save

Browse files

Files changed (5) hide show

README.md +69 -0
all_results.json +9 -0
runs/Aug23_04-22-51_COE-CS-sv003/events.out.tfevents.1724387248.COE-CS-sv003.2786887.0 +2 -2
train_results.json +9 -0
trainer_state.json +1310 -0

README.md ADDED Viewed

	@@ -0,0 +1,69 @@

+---
+license: llama3
+library_name: peft
+tags:
+- trl
+- sft
+- generated_from_trainer
+base_model: meta-llama/Meta-Llama-3-8B
+datasets:
+- generator
+model-index:
+- name: sanity_syntax_20p_100k
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# sanity_syntax_20p_100k
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the generator dataset.
+It achieves the following results on the evaluation set:
+- Loss: 1.1748
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0002
+- train_batch_size: 4
+- eval_batch_size: 4
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 4
+- gradient_accumulation_steps: 4
+- total_train_batch_size: 64
+- total_eval_batch_size: 16
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.1
+- num_epochs: 1
+### Training results
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 1.2359        | 0.9992 | 895  | 1.1748          |
+### Framework versions
+- PEFT 0.11.1
+- Transformers 4.43.4
+- Pytorch 2.3.1+cu121
+- Datasets 2.19.1
+- Tokenizers 0.19.1

all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 0.999162712810494,
+    "total_flos": 1.1254972268150784e+16,
+    "train_loss": 1.2433469767011078,
+    "train_runtime": 20318.3129,
+    "train_samples": 99914,
+    "train_samples_per_second": 2.821,
+    "train_steps_per_second": 0.044
+}

runs/Aug23_04-22-51_COE-CS-sv003/events.out.tfevents.1724387248.COE-CS-sv003.2786887.0 CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cb033a66841284ecbc39d1046b14c0e7acfbdd3337a9234b991bb7c7230650c1
-size 43103

 version https://git-lfs.github.com/spec/v1
+oid sha256:8fb463d1f832330e6f98d93125078df65300115d463e1f933aa1cefaa31615e9
+size 43728

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 0.999162712810494,
+    "total_flos": 1.1254972268150784e+16,
+    "train_loss": 1.2433469767011078,
+    "train_runtime": 20318.3129,
+    "train_samples": 99914,
+    "train_samples_per_second": 2.821,
+    "train_steps_per_second": 0.044
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1310 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.999162712810494,
+  "eval_steps": 500,
+  "global_step": 895,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0011163829193413341,
+      "grad_norm": 0.3974737008844776,
+      "learning_rate": 2.2222222222222225e-06,
+      "loss": 1.607,
+      "step": 1
+    },
+    {
+      "epoch": 0.0055819145967066705,
+      "grad_norm": 0.4252789938746273,
+      "learning_rate": 1.1111111111111112e-05,
+      "loss": 1.5942,
+      "step": 5
+    },
+    {
+      "epoch": 0.011163829193413341,
+      "grad_norm": 0.4658525758416883,
+      "learning_rate": 2.2222222222222223e-05,
+      "loss": 1.5877,
+      "step": 10
+    },
+    {
+      "epoch": 0.01674574379012001,
+      "grad_norm": 0.27282017063503095,
+      "learning_rate": 3.3333333333333335e-05,
+      "loss": 1.5695,
+      "step": 15
+    },
+    {
+      "epoch": 0.022327658386826682,
+      "grad_norm": 0.24165395076839943,
+      "learning_rate": 4.4444444444444447e-05,
+      "loss": 1.558,
+      "step": 20
+    },
+    {
+      "epoch": 0.027909572983533353,
+      "grad_norm": 0.1767403193777301,
+      "learning_rate": 5.555555555555556e-05,
+      "loss": 1.4678,
+      "step": 25
+    },
+    {
+      "epoch": 0.03349148758024002,
+      "grad_norm": 0.16356442786177314,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 1.467,
+      "step": 30
+    },
+    {
+      "epoch": 0.039073402176946694,
+      "grad_norm": 0.15556520577978836,
+      "learning_rate": 7.777777777777778e-05,
+      "loss": 1.429,
+      "step": 35
+    },
+    {
+      "epoch": 0.044655316773653364,
+      "grad_norm": 0.1263609432879071,
+      "learning_rate": 8.888888888888889e-05,
+      "loss": 1.4253,
+      "step": 40
+    },
+    {
+      "epoch": 0.050237231370360035,
+      "grad_norm": 0.1696978939183065,
+      "learning_rate": 0.0001,
+      "loss": 1.3895,
+      "step": 45
+    },
+    {
+      "epoch": 0.055819145967066705,
+      "grad_norm": 0.10830406775154863,
+      "learning_rate": 0.00011111111111111112,
+      "loss": 1.3645,
+      "step": 50
+    },
+    {
+      "epoch": 0.061401060563773376,
+      "grad_norm": 0.08414898733986972,
+      "learning_rate": 0.00012222222222222224,
+      "loss": 1.3082,
+      "step": 55
+    },
+    {
+      "epoch": 0.06698297516048005,
+      "grad_norm": 0.07973185533121883,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 1.2962,
+      "step": 60
+    },
+    {
+      "epoch": 0.07256488975718671,
+      "grad_norm": 0.09811845100733502,
+      "learning_rate": 0.00014444444444444444,
+      "loss": 1.3061,
+      "step": 65
+    },
+    {
+      "epoch": 0.07814680435389339,
+      "grad_norm": 0.08298371354138047,
+      "learning_rate": 0.00015555555555555556,
+      "loss": 1.3017,
+      "step": 70
+    },
+    {
+      "epoch": 0.08372871895060005,
+      "grad_norm": 0.07510078793315819,
+      "learning_rate": 0.0001666666666666667,
+      "loss": 1.2989,
+      "step": 75
+    },
+    {
+      "epoch": 0.08931063354730673,
+      "grad_norm": 0.07085309149624731,
+      "learning_rate": 0.00017777777777777779,
+      "loss": 1.2787,
+      "step": 80
+    },
+    {
+      "epoch": 0.09489254814401339,
+      "grad_norm": 0.09400917029194135,
+      "learning_rate": 0.00018888888888888888,
+      "loss": 1.2843,
+      "step": 85
+    },
+    {
+      "epoch": 0.10047446274072007,
+      "grad_norm": 0.09230059652672952,
+      "learning_rate": 0.0002,
+      "loss": 1.262,
+      "step": 90
+    },
+    {
+      "epoch": 0.10605637733742673,
+      "grad_norm": 0.10009657676945562,
+      "learning_rate": 0.00019998096274980728,
+      "loss": 1.2821,
+      "step": 95
+    },
+    {
+      "epoch": 0.11163829193413341,
+      "grad_norm": 0.12201167887174731,
+      "learning_rate": 0.000199923858247567,
+      "loss": 1.2668,
+      "step": 100
+    },
+    {
+      "epoch": 0.11722020653084007,
+      "grad_norm": 0.09628889966493127,
+      "learning_rate": 0.00019982870823553308,
+      "loss": 1.2503,
+      "step": 105
+    },
+    {
+      "epoch": 0.12280212112754675,
+      "grad_norm": 0.10028621820088561,
+      "learning_rate": 0.00019969554894159723,
+      "loss": 1.2632,
+      "step": 110
+    },
+    {
+      "epoch": 0.12838403572425341,
+      "grad_norm": 0.08593461106683208,
+      "learning_rate": 0.00019952443106549533,
+      "loss": 1.2396,
+      "step": 115
+    },
+    {
+      "epoch": 0.1339659503209601,
+      "grad_norm": 0.08827739693201113,
+      "learning_rate": 0.00019931541975950378,
+      "loss": 1.2784,
+      "step": 120
+    },
+    {
+      "epoch": 0.13954786491766677,
+      "grad_norm": 0.0911508607290428,
+      "learning_rate": 0.00019906859460363307,
+      "loss": 1.2689,
+      "step": 125
+    },
+    {
+      "epoch": 0.14512977951437342,
+      "grad_norm": 0.12157025851983183,
+      "learning_rate": 0.00019878404957532814,
+      "loss": 1.2563,
+      "step": 130
+    },
+    {
+      "epoch": 0.1507116941110801,
+      "grad_norm": 0.10772740664174668,
+      "learning_rate": 0.0001984618930136869,
+      "loss": 1.2853,
+      "step": 135
+    },
+    {
+      "epoch": 0.15629360870778677,
+      "grad_norm": 0.09940063564218579,
+      "learning_rate": 0.00019810224757821064,
+      "loss": 1.241,
+      "step": 140
+    },
+    {
+      "epoch": 0.16187552330449345,
+      "grad_norm": 0.09118466185918958,
+      "learning_rate": 0.00019770525020210204,
+      "loss": 1.2746,
+      "step": 145
+    },
+    {
+      "epoch": 0.1674574379012001,
+      "grad_norm": 0.09674538853934604,
+      "learning_rate": 0.0001972710520401287,
+      "loss": 1.2561,
+      "step": 150
+    },
+    {
+      "epoch": 0.17303935249790678,
+      "grad_norm": 0.1126652956332537,
+      "learning_rate": 0.0001967998184110713,
+      "loss": 1.257,
+      "step": 155
+    },
+    {
+      "epoch": 0.17862126709461346,
+      "grad_norm": 0.0869341846350413,
+      "learning_rate": 0.00019629172873477995,
+      "loss": 1.2529,
+      "step": 160
+    },
+    {
+      "epoch": 0.18420318169132013,
+      "grad_norm": 0.09888626799953022,
+      "learning_rate": 0.00019574697646386027,
+      "loss": 1.244,
+      "step": 165
+    },
+    {
+      "epoch": 0.18978509628802678,
+      "grad_norm": 0.09785278620381999,
+      "learning_rate": 0.0001951657690100178,
+      "loss": 1.2334,
+      "step": 170
+    },
+    {
+      "epoch": 0.19536701088473346,
+      "grad_norm": 0.07378537831469305,
+      "learning_rate": 0.0001945483276650868,
+      "loss": 1.2415,
+      "step": 175
+    },
+    {
+      "epoch": 0.20094892548144014,
+      "grad_norm": 0.08814263560160436,
+      "learning_rate": 0.0001938948875167745,
+      "loss": 1.2512,
+      "step": 180
+    },
+    {
+      "epoch": 0.20653084007814682,
+      "grad_norm": 0.09775538276417937,
+      "learning_rate": 0.00019320569735915271,
+      "loss": 1.2213,
+      "step": 185
+    },
+    {
+      "epoch": 0.21211275467485347,
+      "grad_norm": 0.09538626874304115,
+      "learning_rate": 0.00019248101959793066,
+      "loss": 1.2354,
+      "step": 190
+    },
+    {
+      "epoch": 0.21769466927156014,
+      "grad_norm": 0.08332625788355251,
+      "learning_rate": 0.00019172113015054532,
+      "loss": 1.2444,
+      "step": 195
+    },
+    {
+      "epoch": 0.22327658386826682,
+      "grad_norm": 0.08309090570657847,
+      "learning_rate": 0.00019092631834110723,
+      "loss": 1.2316,
+      "step": 200
+    },
+    {
+      "epoch": 0.2288584984649735,
+      "grad_norm": 0.09054323693110126,
+      "learning_rate": 0.0001900968867902419,
+      "loss": 1.27,
+      "step": 205
+    },
+    {
+      "epoch": 0.23444041306168015,
+      "grad_norm": 0.08549436898181585,
+      "learning_rate": 0.00018923315129986835,
+      "loss": 1.2348,
+      "step": 210
+    },
+    {
+      "epoch": 0.24002232765838682,
+      "grad_norm": 0.086610993256363,
+      "learning_rate": 0.00018833544073295917,
+      "loss": 1.2461,
+      "step": 215
+    },
+    {
+      "epoch": 0.2456042422550935,
+      "grad_norm": 0.08146109722648563,
+      "learning_rate": 0.00018740409688832764,
+      "loss": 1.2431,
+      "step": 220
+    },
+    {
+      "epoch": 0.2511861568518002,
+      "grad_norm": 0.08232534290451142,
+      "learning_rate": 0.00018643947437048944,
+      "loss": 1.2408,
+      "step": 225
+    },
+    {
+      "epoch": 0.25676807144850683,
+      "grad_norm": 0.08507739560575232,
+      "learning_rate": 0.00018544194045464886,
+      "loss": 1.243,
+      "step": 230
+    },
+    {
+      "epoch": 0.26234998604521353,
+      "grad_norm": 0.09782665661618925,
+      "learning_rate": 0.00018441187494686053,
+      "loss": 1.2426,
+      "step": 235
+    },
+    {
+      "epoch": 0.2679319006419202,
+      "grad_norm": 0.0809973818897895,
+      "learning_rate": 0.0001833496700394202,
+      "loss": 1.2345,
+      "step": 240
+    },
+    {
+      "epoch": 0.27351381523862683,
+      "grad_norm": 0.09269081567542259,
+      "learning_rate": 0.00018225573016153945,
+      "loss": 1.2343,
+      "step": 245
+    },
+    {
+      "epoch": 0.27909572983533354,
+      "grad_norm": 0.09671785308848269,
+      "learning_rate": 0.00018113047182536127,
+      "loss": 1.2327,
+      "step": 250
+    },
+    {
+      "epoch": 0.2846776444320402,
+      "grad_norm": 0.0906432644454991,
+      "learning_rate": 0.00017997432346737524,
+      "loss": 1.2532,
+      "step": 255
+    },
+    {
+      "epoch": 0.29025955902874684,
+      "grad_norm": 0.08371586611488784,
+      "learning_rate": 0.00017878772528529232,
+      "loss": 1.2384,
+      "step": 260
+    },
+    {
+      "epoch": 0.29584147362545354,
+      "grad_norm": 0.08640773776491195,
+      "learning_rate": 0.000177571129070442,
+      "loss": 1.2193,
+      "step": 265
+    },
+    {
+      "epoch": 0.3014233882221602,
+      "grad_norm": 0.08164649256677078,
+      "learning_rate": 0.00017632499803575474,
+      "loss": 1.2327,
+      "step": 270
+    },
+    {
+      "epoch": 0.3070053028188669,
+      "grad_norm": 0.09156690890905773,
+      "learning_rate": 0.00017504980663939613,
+      "loss": 1.2534,
+      "step": 275
+    },
+    {
+      "epoch": 0.31258721741557355,
+      "grad_norm": 0.08393163680296412,
+      "learning_rate": 0.00017374604040411935,
+      "loss": 1.2411,
+      "step": 280
+    },
+    {
+      "epoch": 0.3181691320122802,
+      "grad_norm": 0.08340859881557235,
+      "learning_rate": 0.00017241419573240462,
+      "loss": 1.2398,
+      "step": 285
+    },
+    {
+      "epoch": 0.3237510466089869,
+      "grad_norm": 0.08622506272483123,
+      "learning_rate": 0.00017105477971745666,
+      "loss": 1.2321,
+      "step": 290
+    },
+    {
+      "epoch": 0.32933296120569355,
+      "grad_norm": 0.08338497396964428,
+      "learning_rate": 0.00016966830995013133,
+      "loss": 1.2453,
+      "step": 295
+    },
+    {
+      "epoch": 0.3349148758024002,
+      "grad_norm": 0.08718794446584939,
+      "learning_rate": 0.00016825531432186543,
+      "loss": 1.2134,
+      "step": 300
+    },
+    {
+      "epoch": 0.3404967903991069,
+      "grad_norm": 0.09158015865602193,
+      "learning_rate": 0.00016681633082368498,
+      "loss": 1.223,
+      "step": 305
+    },
+    {
+      "epoch": 0.34607870499581356,
+      "grad_norm": 0.08768121171152027,
+      "learning_rate": 0.0001653519073413675,
+      "loss": 1.235,
+      "step": 310
+    },
+    {
+      "epoch": 0.3516606195925202,
+      "grad_norm": 0.08907125432704804,
+      "learning_rate": 0.00016386260144683745,
+      "loss": 1.2169,
+      "step": 315
+    },
+    {
+      "epoch": 0.3572425341892269,
+      "grad_norm": 0.08767993008424768,
+      "learning_rate": 0.00016234898018587337,
+      "loss": 1.2435,
+      "step": 320
+    },
+    {
+      "epoch": 0.36282444878593356,
+      "grad_norm": 0.08991663909567185,
+      "learning_rate": 0.00016081161986220807,
+      "loss": 1.2371,
+      "step": 325
+    },
+    {
+      "epoch": 0.36840636338264027,
+      "grad_norm": 0.07876061570647706,
+      "learning_rate": 0.00015925110581810394,
+      "loss": 1.2118,
+      "step": 330
+    },
+    {
+      "epoch": 0.3739882779793469,
+      "grad_norm": 0.09088539514665886,
+      "learning_rate": 0.00015766803221148673,
+      "loss": 1.2333,
+      "step": 335
+    },
+    {
+      "epoch": 0.37957019257605357,
+      "grad_norm": 0.09371191064756335,
+      "learning_rate": 0.00015606300178972287,
+      "loss": 1.2192,
+      "step": 340
+    },
+    {
+      "epoch": 0.38515210717276027,
+      "grad_norm": 0.0988524027231739,
+      "learning_rate": 0.00015443662566012645,
+      "loss": 1.2201,
+      "step": 345
+    },
+    {
+      "epoch": 0.3907340217694669,
+      "grad_norm": 0.08068655015289312,
+      "learning_rate": 0.00015278952305728324,
+      "loss": 1.2312,
+      "step": 350
+    },
+    {
+      "epoch": 0.39631593636617357,
+      "grad_norm": 0.08530580419429784,
+      "learning_rate": 0.00015112232110728015,
+      "loss": 1.2103,
+      "step": 355
+    },
+    {
+      "epoch": 0.4018978509628803,
+      "grad_norm": 0.0832856621155852,
+      "learning_rate": 0.00014943565458893,
+      "loss": 1.2049,
+      "step": 360
+    },
+    {
+      "epoch": 0.4074797655595869,
+      "grad_norm": 0.10112900442930213,
+      "learning_rate": 0.00014773016569208283,
+      "loss": 1.2381,
+      "step": 365
+    },
+    {
+      "epoch": 0.41306168015629363,
+      "grad_norm": 0.08250019530921109,
+      "learning_rate": 0.00014600650377311522,
+      "loss": 1.2185,
+      "step": 370
+    },
+    {
+      "epoch": 0.4186435947530003,
+      "grad_norm": 0.0987578329954232,
+      "learning_rate": 0.0001442653251076912,
+      "loss": 1.2222,
+      "step": 375
+    },
+    {
+      "epoch": 0.42422550934970693,
+      "grad_norm": 0.08530899013880136,
+      "learning_rate": 0.00014250729264088843,
+      "loss": 1.2556,
+      "step": 380
+    },
+    {
+      "epoch": 0.42980742394641364,
+      "grad_norm": 0.10267562745822716,
+      "learning_rate": 0.00014073307573478526,
+      "loss": 1.2146,
+      "step": 385
+    },
+    {
+      "epoch": 0.4353893385431203,
+      "grad_norm": 0.09189285950155643,
+      "learning_rate": 0.00013894334991360448,
+      "loss": 1.2206,
+      "step": 390
+    },
+    {
+      "epoch": 0.44097125313982694,
+      "grad_norm": 0.08370196846674145,
+      "learning_rate": 0.00013713879660651068,
+      "loss": 1.2076,
+      "step": 395
+    },
+    {
+      "epoch": 0.44655316773653364,
+      "grad_norm": 0.08423557906306067,
+      "learning_rate": 0.0001353201028881598,
+      "loss": 1.2223,
+      "step": 400
+    },
+    {
+      "epoch": 0.4521350823332403,
+      "grad_norm": 0.08292081122541138,
+      "learning_rate": 0.00013348796121709862,
+      "loss": 1.2294,
+      "step": 405
+    },
+    {
+      "epoch": 0.457716996929947,
+      "grad_norm": 0.08767079524531268,
+      "learning_rate": 0.00013164306917211476,
+      "loss": 1.2229,
+      "step": 410
+    },
+    {
+      "epoch": 0.46329891152665365,
+      "grad_norm": 0.0865942463810843,
+      "learning_rate": 0.000129786129186637,
+      "loss": 1.2163,
+      "step": 415
+    },
+    {
+      "epoch": 0.4688808261233603,
+      "grad_norm": 0.08101515714055764,
+      "learning_rate": 0.00012791784828128724,
+      "loss": 1.2337,
+      "step": 420
+    },
+    {
+      "epoch": 0.474462740720067,
+      "grad_norm": 0.09009147490161429,
+      "learning_rate": 0.00012603893779468604,
+      "loss": 1.2148,
+      "step": 425
+    },
+    {
+      "epoch": 0.48004465531677365,
+      "grad_norm": 0.08757351279515291,
+      "learning_rate": 0.0001241501131126138,
+      "loss": 1.2056,
+      "step": 430
+    },
+    {
+      "epoch": 0.4856265699134803,
+      "grad_norm": 0.08418609867162384,
+      "learning_rate": 0.00012225209339563145,
+      "loss": 1.2419,
+      "step": 435
+    },
+    {
+      "epoch": 0.491208484510187,
+      "grad_norm": 0.08790367723325618,
+      "learning_rate": 0.0001203456013052634,
+      "loss": 1.2115,
+      "step": 440
+    },
+    {
+      "epoch": 0.49679039910689365,
+      "grad_norm": 0.08071789319204539,
+      "learning_rate": 0.00011843136272884794,
+      "loss": 1.2072,
+      "step": 445
+    },
+    {
+      "epoch": 0.5023723137036004,
+      "grad_norm": 0.0879278395825441,
+      "learning_rate": 0.00011651010650315923,
+      "loss": 1.2194,
+      "step": 450
+    },
+    {
+      "epoch": 0.507954228300307,
+      "grad_norm": 0.08506166782358492,
+      "learning_rate": 0.00011458256413690633,
+      "loss": 1.2077,
+      "step": 455
+    },
+    {
+      "epoch": 0.5135361428970137,
+      "grad_norm": 0.08984730610411729,
+      "learning_rate": 0.00011264946953221496,
+      "loss": 1.2484,
+      "step": 460
+    },
+    {
+      "epoch": 0.5191180574937203,
+      "grad_norm": 0.2978083078661545,
+      "learning_rate": 0.00011071155870519777,
+      "loss": 1.2491,
+      "step": 465
+    },
+    {
+      "epoch": 0.5246999720904271,
+      "grad_norm": 0.08504227931172395,
+      "learning_rate": 0.00010876956950572006,
+      "loss": 1.2268,
+      "step": 470
+    },
+    {
+      "epoch": 0.5302818866871337,
+      "grad_norm": 0.08620167875904892,
+      "learning_rate": 0.0001068242413364671,
+      "loss": 1.2252,
+      "step": 475
+    },
+    {
+      "epoch": 0.5358638012838404,
+      "grad_norm": 0.08669957736640198,
+      "learning_rate": 0.00010487631487142017,
+      "loss": 1.217,
+      "step": 480
+    },
+    {
+      "epoch": 0.541445715880547,
+      "grad_norm": 0.08577871896034497,
+      "learning_rate": 0.00010292653177384876,
+      "loss": 1.2169,
+      "step": 485
+    },
+    {
+      "epoch": 0.5470276304772537,
+      "grad_norm": 0.08417260057895289,
+      "learning_rate": 0.00010097563441392581,
+      "loss": 1.2354,
+      "step": 490
+    },
+    {
+      "epoch": 0.5526095450739603,
+      "grad_norm": 0.08676422431924583,
+      "learning_rate": 9.90243655860742e-05,
+      "loss": 1.2039,
+      "step": 495
+    },
+    {
+      "epoch": 0.5581914596706671,
+      "grad_norm": 0.09103906295111437,
+      "learning_rate": 9.707346822615128e-05,
+      "loss": 1.2194,
+      "step": 500
+    },
+    {
+      "epoch": 0.5637733742673737,
+      "grad_norm": 0.08594537537719427,
+      "learning_rate": 9.512368512857984e-05,
+      "loss": 1.1949,
+      "step": 505
+    },
+    {
+      "epoch": 0.5693552888640804,
+      "grad_norm": 0.08392759057088481,
+      "learning_rate": 9.317575866353292e-05,
+      "loss": 1.2196,
+      "step": 510
+    },
+    {
+      "epoch": 0.574937203460787,
+      "grad_norm": 0.08201912454761111,
+      "learning_rate": 9.123043049427995e-05,
+      "loss": 1.2131,
+      "step": 515
+    },
+    {
+      "epoch": 0.5805191180574937,
+      "grad_norm": 0.08925291750313868,
+      "learning_rate": 8.928844129480227e-05,
+      "loss": 1.2369,
+      "step": 520
+    },
+    {
+      "epoch": 0.5861010326542004,
+      "grad_norm": 0.08954980070951671,
+      "learning_rate": 8.735053046778506e-05,
+      "loss": 1.2175,
+      "step": 525
+    },
+    {
+      "epoch": 0.5916829472509071,
+      "grad_norm": 0.08574100993825345,
+      "learning_rate": 8.541743586309365e-05,
+      "loss": 1.2166,
+      "step": 530
+    },
+    {
+      "epoch": 0.5972648618476137,
+      "grad_norm": 0.08840883290578404,
+      "learning_rate": 8.348989349684076e-05,
+      "loss": 1.2271,
+      "step": 535
+    },
+    {
+      "epoch": 0.6028467764443204,
+      "grad_norm": 0.08443946017557556,
+      "learning_rate": 8.156863727115211e-05,
+      "loss": 1.2329,
+      "step": 540
+    },
+    {
+      "epoch": 0.608428691041027,
+      "grad_norm": 0.0902640782545258,
+      "learning_rate": 7.965439869473664e-05,
+      "loss": 1.2253,
+      "step": 545
+    },
+    {
+      "epoch": 0.6140106056377338,
+      "grad_norm": 0.08988630625422679,
+      "learning_rate": 7.774790660436858e-05,
+      "loss": 1.1785,
+      "step": 550
+    },
+    {
+      "epoch": 0.6195925202344404,
+      "grad_norm": 0.08134808753957644,
+      "learning_rate": 7.584988688738622e-05,
+      "loss": 1.2261,
+      "step": 555
+    },
+    {
+      "epoch": 0.6251744348311471,
+      "grad_norm": 0.08768193779762151,
+      "learning_rate": 7.396106220531398e-05,
+      "loss": 1.2463,
+      "step": 560
+    },
+    {
+      "epoch": 0.6307563494278537,
+      "grad_norm": 0.0885816930556393,
+      "learning_rate": 7.208215171871277e-05,
+      "loss": 1.2141,
+      "step": 565
+    },
+    {
+      "epoch": 0.6363382640245604,
+      "grad_norm": 0.08553683878588977,
+      "learning_rate": 7.021387081336301e-05,
+      "loss": 1.2026,
+      "step": 570
+    },
+    {
+      "epoch": 0.641920178621267,
+      "grad_norm": 0.09505838067263224,
+      "learning_rate": 6.835693082788525e-05,
+      "loss": 1.2168,
+      "step": 575
+    },
+    {
+      "epoch": 0.6475020932179738,
+      "grad_norm": 0.08769224685329463,
+      "learning_rate": 6.651203878290139e-05,
+      "loss": 1.2493,
+      "step": 580
+    },
+    {
+      "epoch": 0.6530840078146805,
+      "grad_norm": 0.07990213288377576,
+      "learning_rate": 6.46798971118402e-05,
+      "loss": 1.2308,
+      "step": 585
+    },
+    {
+      "epoch": 0.6586659224113871,
+      "grad_norm": 0.08133261350163556,
+      "learning_rate": 6.286120339348935e-05,
+      "loss": 1.2014,
+      "step": 590
+    },
+    {
+      "epoch": 0.6642478370080938,
+      "grad_norm": 0.09363089434544866,
+      "learning_rate": 6.105665008639557e-05,
+      "loss": 1.2238,
+      "step": 595
+    },
+    {
+      "epoch": 0.6698297516048004,
+      "grad_norm": 0.07910287951552411,
+      "learning_rate": 5.926692426521474e-05,
+      "loss": 1.2473,
+      "step": 600
+    },
+    {
+      "epoch": 0.6754116662015072,
+      "grad_norm": 0.0801209902764544,
+      "learning_rate": 5.749270735911158e-05,
+      "loss": 1.1975,
+      "step": 605
+    },
+    {
+      "epoch": 0.6809935807982138,
+      "grad_norm": 0.08087293360533905,
+      "learning_rate": 5.573467489230879e-05,
+      "loss": 1.1966,
+      "step": 610
+    },
+    {
+      "epoch": 0.6865754953949205,
+      "grad_norm": 0.08220997258417966,
+      "learning_rate": 5.399349622688479e-05,
+      "loss": 1.2345,
+      "step": 615
+    },
+    {
+      "epoch": 0.6921574099916271,
+      "grad_norm": 0.0825575277760057,
+      "learning_rate": 5.226983430791722e-05,
+      "loss": 1.2289,
+      "step": 620
+    },
+    {
+      "epoch": 0.6977393245883338,
+      "grad_norm": 0.08305460425818378,
+      "learning_rate": 5.0564345411070025e-05,
+      "loss": 1.204,
+      "step": 625
+    },
+    {
+      "epoch": 0.7033212391850404,
+      "grad_norm": 0.08011105262542664,
+      "learning_rate": 4.8877678892719866e-05,
+      "loss": 1.1946,
+      "step": 630
+    },
+    {
+      "epoch": 0.7089031537817472,
+      "grad_norm": 0.08686069747720479,
+      "learning_rate": 4.721047694271676e-05,
+      "loss": 1.2,
+      "step": 635
+    },
+    {
+      "epoch": 0.7144850683784538,
+      "grad_norm": 0.08537977661965272,
+      "learning_rate": 4.556337433987359e-05,
+      "loss": 1.2054,
+      "step": 640
+    },
+    {
+      "epoch": 0.7200669829751605,
+      "grad_norm": 0.08857193949478791,
+      "learning_rate": 4.393699821027716e-05,
+      "loss": 1.1988,
+      "step": 645
+    },
+    {
+      "epoch": 0.7256488975718671,
+      "grad_norm": 0.09608004999262602,
+      "learning_rate": 4.2331967788513295e-05,
+      "loss": 1.2226,
+      "step": 650
+    },
+    {
+      "epoch": 0.7312308121685738,
+      "grad_norm": 0.08235757922811432,
+      "learning_rate": 4.074889418189608e-05,
+      "loss": 1.2202,
+      "step": 655
+    },
+    {
+      "epoch": 0.7368127267652805,
+      "grad_norm": 0.08660069823512372,
+      "learning_rate": 3.9188380137791936e-05,
+      "loss": 1.215,
+      "step": 660
+    },
+    {
+      "epoch": 0.7423946413619872,
+      "grad_norm": 0.08090639704744831,
+      "learning_rate": 3.7651019814126654e-05,
+      "loss": 1.2255,
+      "step": 665
+    },
+    {
+      "epoch": 0.7479765559586938,
+      "grad_norm": 0.08082821477995833,
+      "learning_rate": 3.613739855316257e-05,
+      "loss": 1.2176,
+      "step": 670
+    },
+    {
+      "epoch": 0.7535584705554005,
+      "grad_norm": 0.08469395080984878,
+      "learning_rate": 3.46480926586325e-05,
+      "loss": 1.2275,
+      "step": 675
+    },
+    {
+      "epoch": 0.7591403851521071,
+      "grad_norm": 0.0871555466504494,
+      "learning_rate": 3.3183669176315045e-05,
+      "loss": 1.2351,
+      "step": 680
+    },
+    {
+      "epoch": 0.7647222997488139,
+      "grad_norm": 0.08170223557553191,
+      "learning_rate": 3.174468567813461e-05,
+      "loss": 1.2074,
+      "step": 685
+    },
+    {
+      "epoch": 0.7703042143455205,
+      "grad_norm": 0.0838318843856818,
+      "learning_rate": 3.033169004986873e-05,
+      "loss": 1.2396,
+      "step": 690
+    },
+    {
+      "epoch": 0.7758861289422272,
+      "grad_norm": 0.08831381148889993,
+      "learning_rate": 2.894522028254334e-05,
+      "loss": 1.1947,
+      "step": 695
+    },
+    {
+      "epoch": 0.7814680435389338,
+      "grad_norm": 0.08158536981215994,
+      "learning_rate": 2.7585804267595384e-05,
+      "loss": 1.208,
+      "step": 700
+    },
+    {
+      "epoch": 0.7870499581356405,
+      "grad_norm": 0.08116519613000232,
+      "learning_rate": 2.6253959595880673e-05,
+      "loss": 1.2191,
+      "step": 705
+    },
+    {
+      "epoch": 0.7926318727323471,
+      "grad_norm": 0.08294169676184929,
+      "learning_rate": 2.495019336060387e-05,
+      "loss": 1.195,
+      "step": 710
+    },
+    {
+      "epoch": 0.7982137873290539,
+      "grad_norm": 0.08406756837278591,
+      "learning_rate": 2.367500196424529e-05,
+      "loss": 1.2203,
+      "step": 715
+    },
+    {
+      "epoch": 0.8037957019257606,
+      "grad_norm": 0.08211403607563178,
+      "learning_rate": 2.242887092955801e-05,
+      "loss": 1.2041,
+      "step": 720
+    },
+    {
+      "epoch": 0.8093776165224672,
+      "grad_norm": 0.07980978787138238,
+      "learning_rate": 2.121227471470768e-05,
+      "loss": 1.2394,
+      "step": 725
+    },
+    {
+      "epoch": 0.8149595311191739,
+      "grad_norm": 0.08416184610807921,
+      "learning_rate": 2.002567653262479e-05,
+      "loss": 1.2228,
+      "step": 730
+    },
+    {
+      "epoch": 0.8205414457158805,
+      "grad_norm": 0.08256062792318115,
+      "learning_rate": 1.8869528174638752e-05,
+      "loss": 1.203,
+      "step": 735
+    },
+    {
+      "epoch": 0.8261233603125873,
+      "grad_norm": 0.09043351264554417,
+      "learning_rate": 1.774426983846058e-05,
+      "loss": 1.2275,
+      "step": 740
+    },
+    {
+      "epoch": 0.8317052749092939,
+      "grad_norm": 0.08486147964302236,
+      "learning_rate": 1.6650329960579792e-05,
+      "loss": 1.2208,
+      "step": 745
+    },
+    {
+      "epoch": 0.8372871895060006,
+      "grad_norm": 0.0935945466460169,
+      "learning_rate": 1.5588125053139468e-05,
+      "loss": 1.2131,
+      "step": 750
+    },
+    {
+      "epoch": 0.8428691041027072,
+      "grad_norm": 0.08282716353976063,
+      "learning_rate": 1.4558059545351143e-05,
+      "loss": 1.2284,
+      "step": 755
+    },
+    {
+      "epoch": 0.8484510186994139,
+      "grad_norm": 0.08286515378820142,
+      "learning_rate": 1.3560525629510568e-05,
+      "loss": 1.2086,
+      "step": 760
+    },
+    {
+      "epoch": 0.8540329332961206,
+      "grad_norm": 0.08295259360853054,
+      "learning_rate": 1.259590311167238e-05,
+      "loss": 1.2061,
+      "step": 765
+    },
+    {
+      "epoch": 0.8596148478928273,
+      "grad_norm": 0.08358389042910293,
+      "learning_rate": 1.166455926704082e-05,
+      "loss": 1.222,
+      "step": 770
+    },
+    {
+      "epoch": 0.8651967624895339,
+      "grad_norm": 0.08388863476839661,
+      "learning_rate": 1.0766848700131648e-05,
+      "loss": 1.2143,
+      "step": 775
+    },
+    {
+      "epoch": 0.8707786770862406,
+      "grad_norm": 0.08277339984932784,
+      "learning_rate": 9.903113209758096e-06,
+      "loss": 1.2192,
+      "step": 780
+    },
+    {
+      "epoch": 0.8763605916829472,
+      "grad_norm": 0.08938310164317657,
+      "learning_rate": 9.073681658892775e-06,
+      "loss": 1.2191,
+      "step": 785
+    },
+    {
+      "epoch": 0.8819425062796539,
+      "grad_norm": 0.07910593096708422,
+      "learning_rate": 8.278869849454718e-06,
+      "loss": 1.2269,
+      "step": 790
+    },
+    {
+      "epoch": 0.8875244208763606,
+      "grad_norm": 0.08295037453317607,
+      "learning_rate": 7.5189804020693536e-06,
+      "loss": 1.2021,
+      "step": 795
+    },
+    {
+      "epoch": 0.8931063354730673,
+      "grad_norm": 0.08199446080472911,
+      "learning_rate": 6.794302640847294e-06,
+      "loss": 1.1961,
+      "step": 800
+    },
+    {
+      "epoch": 0.8986882500697739,
+      "grad_norm": 0.08481342663212112,
+      "learning_rate": 6.1051124832254944e-06,
+      "loss": 1.2069,
+      "step": 805
+    },
+    {
+      "epoch": 0.9042701646664806,
+      "grad_norm": 0.08217551850800063,
+      "learning_rate": 5.451672334913216e-06,
+      "loss": 1.2055,
+      "step": 810
+    },
+    {
+      "epoch": 0.9098520792631872,
+      "grad_norm": 0.08322503504827561,
+      "learning_rate": 4.834230989982213e-06,
+      "loss": 1.2156,
+      "step": 815
+    },
+    {
+      "epoch": 0.915433993859894,
+      "grad_norm": 0.08125961805104615,
+      "learning_rate": 4.253023536139733e-06,
+      "loss": 1.2005,
+      "step": 820
+    },
+    {
+      "epoch": 0.9210159084566006,
+      "grad_norm": 0.09037682759604541,
+      "learning_rate": 3.7082712652200867e-06,
+      "loss": 1.2079,
+      "step": 825
+    },
+    {
+      "epoch": 0.9265978230533073,
+      "grad_norm": 0.08711894287392291,
+      "learning_rate": 3.2001815889286856e-06,
+      "loss": 1.232,
+      "step": 830
+    },
+    {
+      "epoch": 0.9321797376500139,
+      "grad_norm": 0.08367132801462379,
+      "learning_rate": 2.728947959871353e-06,
+      "loss": 1.1858,
+      "step": 835
+    },
+    {
+      "epoch": 0.9377616522467206,
+      "grad_norm": 0.0809801248589102,
+      "learning_rate": 2.294749797897955e-06,
+      "loss": 1.1871,
+      "step": 840
+    },
+    {
+      "epoch": 0.9433435668434274,
+      "grad_norm": 0.08412969109149288,
+      "learning_rate": 1.8977524217893783e-06,
+      "loss": 1.2248,
+      "step": 845
+    },
+    {
+      "epoch": 0.948925481440134,
+      "grad_norm": 0.08014128153610968,
+      "learning_rate": 1.5381069863131037e-06,
+      "loss": 1.2312,
+      "step": 850
+    },
+    {
+      "epoch": 0.9545073960368406,
+      "grad_norm": 0.08040835492341503,
+      "learning_rate": 1.2159504246718522e-06,
+      "loss": 1.2213,
+      "step": 855
+    },
+    {
+      "epoch": 0.9600893106335473,
+      "grad_norm": 0.08170226749481643,
+      "learning_rate": 9.314053963669245e-07,
+      "loss": 1.2114,
+      "step": 860
+    },
+    {
+      "epoch": 0.965671225230254,
+      "grad_norm": 0.08123838559159317,
+      "learning_rate": 6.845802404962243e-07,
+      "loss": 1.2455,
+      "step": 865
+    },
+    {
+      "epoch": 0.9712531398269606,
+      "grad_norm": 0.08532355248950987,
+      "learning_rate": 4.7556893450466653e-07,
+      "loss": 1.2017,
+      "step": 870
+    },
+    {
+      "epoch": 0.9768350544236674,
+      "grad_norm": 0.07935413274906811,
+      "learning_rate": 3.044510584027771e-07,
+      "loss": 1.203,
+      "step": 875
+    },
+    {
+      "epoch": 0.982416969020374,
+      "grad_norm": 0.07922680701516337,
+      "learning_rate": 1.7129176446692984e-07,
+      "loss": 1.1993,
+      "step": 880
+    },
+    {
+      "epoch": 0.9879988836170807,
+      "grad_norm": 0.08007277288266887,
+      "learning_rate": 7.614175243301213e-08,
+      "loss": 1.221,
+      "step": 885
+    },
+    {
+      "epoch": 0.9935807982137873,
+      "grad_norm": 0.08190648675567455,
+      "learning_rate": 1.9037250192732726e-08,
+      "loss": 1.2245,
+      "step": 890
+    },
+    {
+      "epoch": 0.999162712810494,
+      "grad_norm": 0.07884795604109555,
+      "learning_rate": 0.0,
+      "loss": 1.2359,
+      "step": 895
+    },
+    {
+      "epoch": 0.999162712810494,
+      "eval_loss": 1.1748292446136475,
+      "eval_runtime": 1569.4225,
+      "eval_samples_per_second": 8.524,
+      "eval_steps_per_second": 0.533,
+      "step": 895
+    },
+    {
+      "epoch": 0.999162712810494,
+      "step": 895,
+      "total_flos": 1.1254972268150784e+16,
+      "train_loss": 1.2433469767011078,
+      "train_runtime": 20318.3129,
+      "train_samples_per_second": 2.821,
+      "train_steps_per_second": 0.044
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 895,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 25,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.1254972268150784e+16,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}