Model save

Browse files

Files changed (7) hide show

README.md +57 -0
all_results.json +9 -0
generation_config.json +9 -0
model-00001-of-00002.safetensors +1 -1
model-00002-of-00002.safetensors +1 -1
train_results.json +9 -0
trainer_state.json +787 -0

README.md ADDED Viewed

	@@ -0,0 +1,57 @@

+---
+base_model: meta-llama/Llama-3.2-3B
+library_name: transformers
+model_name: llama3.2-3b-sft-full
+tags:
+- generated_from_trainer
+- trl
+- sft
+license: llama3.2
+---
+# Model Card for llama3.2-3b-sft-full
+This model is a fine-tuned version of [meta-llama/Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+## Quick start
+```python
+from transformers import pipeline
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="obiwit/llama3.2-3b-sft-full", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+## Training procedure
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/bborges/L3-8B_preferences/runs/pnv8ca52)
+This model was trained with SFT.
+### Framework versions
+- TRL: 0.12.2
+- Transformers: 4.46.3
+- PyTorch: 2.1.2+cu121
+- Datasets: 3.1.0
+- Tokenizers: 0.20.3
+## Citations
+Cite TRL as:
+```bibtex
+@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```

all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 10.0,
+    "total_flos": 651355949629440.0,
+    "train_loss": 0.5446855222952038,
+    "train_runtime": 13348.3684,
+    "train_samples": 164653,
+    "train_samples_per_second": 45.308,
+    "train_steps_per_second": 0.354
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 128000,
+  "do_sample": true,
+  "eos_token_id": 128001,
+  "temperature": 0.6,
+  "top_p": 0.9,
+  "transformers_version": "4.46.3"
+}

model-00001-of-00002.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bc37e2c0c88c7ab1e7d84e513aabaf1179db7673211cef2d4b19ecb0280fd399
 size 4965799096

 version https://git-lfs.github.com/spec/v1
+oid sha256:33192c33081e847991209ca36277a2f010e5cb1f7a30aad3eb728000f95310c0
 size 4965799096

model-00002-of-00002.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cf18ccbefc1b0641309740913aa3e69349ad7392071acf0999db4afaf4c96f55
 size 1459729952

 version https://git-lfs.github.com/spec/v1
+oid sha256:b42807ccde702bb0aa3f1c056176b4950e29415d2c26be01fc356288c0794c09
 size 1459729952

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 10.0,
+    "total_flos": 651355949629440.0,
+    "train_loss": 0.5446855222952038,
+    "train_runtime": 13348.3684,
+    "train_samples": 164653,
+    "train_samples_per_second": 45.308,
+    "train_steps_per_second": 0.354
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,787 @@

+{
+  "best_metric": 1.2029809951782227,
+  "best_model_checkpoint": "models/llama3.2-3b-sft-full/checkpoint-946",
+  "epoch": 10.0,
+  "eval_steps": 500,
+  "global_step": 4730,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0021141649048625794,
+      "grad_norm": 12.750687916050717,
+      "learning_rate": 4.228329809725159e-08,
+      "loss": 2.0313,
+      "step": 1
+    },
+    {
+      "epoch": 0.10570824524312897,
+      "grad_norm": 1.6188098557526023,
+      "learning_rate": 2.1141649048625796e-06,
+      "loss": 1.8564,
+      "step": 50
+    },
+    {
+      "epoch": 0.21141649048625794,
+      "grad_norm": 1.3518202018434258,
+      "learning_rate": 4.228329809725159e-06,
+      "loss": 1.5595,
+      "step": 100
+    },
+    {
+      "epoch": 0.3171247357293869,
+      "grad_norm": 1.0657590959696566,
+      "learning_rate": 6.342494714587738e-06,
+      "loss": 1.4551,
+      "step": 150
+    },
+    {
+      "epoch": 0.42283298097251587,
+      "grad_norm": 0.8592382874171366,
+      "learning_rate": 8.456659619450318e-06,
+      "loss": 1.3854,
+      "step": 200
+    },
+    {
+      "epoch": 0.5285412262156448,
+      "grad_norm": 0.9009389463159452,
+      "learning_rate": 1.0570824524312897e-05,
+      "loss": 1.3425,
+      "step": 250
+    },
+    {
+      "epoch": 0.6342494714587738,
+      "grad_norm": 1.1412749380165943,
+      "learning_rate": 1.2684989429175477e-05,
+      "loss": 1.3006,
+      "step": 300
+    },
+    {
+      "epoch": 0.7399577167019028,
+      "grad_norm": 0.7983643996119608,
+      "learning_rate": 1.4799154334038057e-05,
+      "loss": 1.2744,
+      "step": 350
+    },
+    {
+      "epoch": 0.8456659619450317,
+      "grad_norm": 0.8244042644021615,
+      "learning_rate": 1.6913319238900637e-05,
+      "loss": 1.2548,
+      "step": 400
+    },
+    {
+      "epoch": 0.9513742071881607,
+      "grad_norm": 0.8216465876347789,
+      "learning_rate": 1.9027484143763216e-05,
+      "loss": 1.2404,
+      "step": 450
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.2400952577590942,
+      "eval_runtime": 18.7515,
+      "eval_samples_per_second": 180.785,
+      "eval_steps_per_second": 1.44,
+      "step": 473
+    },
+    {
+      "epoch": 1.0570824524312896,
+      "grad_norm": 0.8252930310615743,
+      "learning_rate": 1.9998014930992976e-05,
+      "loss": 1.1705,
+      "step": 500
+    },
+    {
+      "epoch": 1.1627906976744187,
+      "grad_norm": 0.8757450644495188,
+      "learning_rate": 1.9983859126509827e-05,
+      "loss": 1.12,
+      "step": 550
+    },
+    {
+      "epoch": 1.2684989429175475,
+      "grad_norm": 0.7949433367488117,
+      "learning_rate": 1.9956111376617147e-05,
+      "loss": 1.1182,
+      "step": 600
+    },
+    {
+      "epoch": 1.3742071881606766,
+      "grad_norm": 0.8258557786713708,
+      "learning_rate": 1.9914809456878166e-05,
+      "loss": 1.1182,
+      "step": 650
+    },
+    {
+      "epoch": 1.4799154334038054,
+      "grad_norm": 0.7933398639276036,
+      "learning_rate": 1.9860009595393798e-05,
+      "loss": 1.1038,
+      "step": 700
+    },
+    {
+      "epoch": 1.5856236786469344,
+      "grad_norm": 0.841899074998612,
+      "learning_rate": 1.9791786396254126e-05,
+      "loss": 1.1021,
+      "step": 750
+    },
+    {
+      "epoch": 1.6913319238900635,
+      "grad_norm": 0.7846336500672028,
+      "learning_rate": 1.971023273797303e-05,
+      "loss": 1.1144,
+      "step": 800
+    },
+    {
+      "epoch": 1.7970401691331923,
+      "grad_norm": 0.8229424691925108,
+      "learning_rate": 1.961545964704409e-05,
+      "loss": 1.1026,
+      "step": 850
+    },
+    {
+      "epoch": 1.9027484143763214,
+      "grad_norm": 0.7508672417042123,
+      "learning_rate": 1.950759614679005e-05,
+      "loss": 1.0999,
+      "step": 900
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.2029809951782227,
+      "eval_runtime": 18.6697,
+      "eval_samples_per_second": 181.578,
+      "eval_steps_per_second": 1.446,
+      "step": 946
+    },
+    {
+      "epoch": 2.0084566596194504,
+      "grad_norm": 1.1731388344496063,
+      "learning_rate": 1.9386789081711465e-05,
+      "loss": 1.0829,
+      "step": 950
+    },
+    {
+      "epoch": 2.1141649048625792,
+      "grad_norm": 0.8445527906517972,
+      "learning_rate": 1.9253202917573813e-05,
+      "loss": 0.8792,
+      "step": 1000
+    },
+    {
+      "epoch": 2.219873150105708,
+      "grad_norm": 0.8760066547012747,
+      "learning_rate": 1.910701951750511e-05,
+      "loss": 0.8858,
+      "step": 1050
+    },
+    {
+      "epoch": 2.3255813953488373,
+      "grad_norm": 0.8556944230458313,
+      "learning_rate": 1.894843789440892e-05,
+      "loss": 0.8963,
+      "step": 1100
+    },
+    {
+      "epoch": 2.431289640591966,
+      "grad_norm": 0.9640553709381205,
+      "learning_rate": 1.8777673940029783e-05,
+      "loss": 0.8986,
+      "step": 1150
+    },
+    {
+      "epoch": 2.536997885835095,
+      "grad_norm": 0.8627140770648074,
+      "learning_rate": 1.8594960131039935e-05,
+      "loss": 0.8938,
+      "step": 1200
+    },
+    {
+      "epoch": 2.6427061310782243,
+      "grad_norm": 0.9535715556479282,
+      "learning_rate": 1.8400545212547452e-05,
+      "loss": 0.8955,
+      "step": 1250
+    },
+    {
+      "epoch": 2.748414376321353,
+      "grad_norm": 0.8552691365971952,
+      "learning_rate": 1.819469385945664e-05,
+      "loss": 0.9014,
+      "step": 1300
+    },
+    {
+      "epoch": 2.854122621564482,
+      "grad_norm": 0.8665945027227095,
+      "learning_rate": 1.7977686316141757e-05,
+      "loss": 0.8947,
+      "step": 1350
+    },
+    {
+      "epoch": 2.9598308668076108,
+      "grad_norm": 0.9619834334611845,
+      "learning_rate": 1.7749818014924612e-05,
+      "loss": 0.9009,
+      "step": 1400
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 1.2264167070388794,
+      "eval_runtime": 18.611,
+      "eval_samples_per_second": 182.15,
+      "eval_steps_per_second": 1.451,
+      "step": 1419
+    },
+    {
+      "epoch": 3.06553911205074,
+      "grad_norm": 1.0859362155871017,
+      "learning_rate": 1.7511399173875326e-05,
+      "loss": 0.7572,
+      "step": 1450
+    },
+    {
+      "epoch": 3.171247357293869,
+      "grad_norm": 4.277266247206096,
+      "learning_rate": 1.7262754374483997e-05,
+      "loss": 0.667,
+      "step": 1500
+    },
+    {
+      "epoch": 3.276955602536998,
+      "grad_norm": 1.0657798474808275,
+      "learning_rate": 1.7004222119778044e-05,
+      "loss": 0.6762,
+      "step": 1550
+    },
+    {
+      "epoch": 3.382663847780127,
+      "grad_norm": 1.1532443588160448,
+      "learning_rate": 1.673615437348693e-05,
+      "loss": 0.6818,
+      "step": 1600
+    },
+    {
+      "epoch": 3.488372093023256,
+      "grad_norm": 1.027155281075861,
+      "learning_rate": 1.6458916080881566e-05,
+      "loss": 0.6855,
+      "step": 1650
+    },
+    {
+      "epoch": 3.5940803382663846,
+      "grad_norm": 1.1083716056629094,
+      "learning_rate": 1.6172884671940753e-05,
+      "loss": 0.6812,
+      "step": 1700
+    },
+    {
+      "epoch": 3.699788583509514,
+      "grad_norm": 1.0062825632251786,
+      "learning_rate": 1.587844954752106e-05,
+      "loss": 0.6883,
+      "step": 1750
+    },
+    {
+      "epoch": 3.8054968287526427,
+      "grad_norm": 1.07000970987697,
+      "learning_rate": 1.557601154922964e-05,
+      "loss": 0.6893,
+      "step": 1800
+    },
+    {
+      "epoch": 3.9112050739957716,
+      "grad_norm": 1.0384275987977813,
+      "learning_rate": 1.5265982413721662e-05,
+      "loss": 0.6876,
+      "step": 1850
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 1.2967802286148071,
+      "eval_runtime": 18.6163,
+      "eval_samples_per_second": 182.098,
+      "eval_steps_per_second": 1.45,
+      "step": 1892
+    },
+    {
+      "epoch": 4.016913319238901,
+      "grad_norm": 1.3109377397084605,
+      "learning_rate": 1.494878421216539e-05,
+      "loss": 0.6596,
+      "step": 1900
+    },
+    {
+      "epoch": 4.12262156448203,
+      "grad_norm": 1.2688929301953065,
+      "learning_rate": 1.4624848775637845e-05,
+      "loss": 0.4741,
+      "step": 1950
+    },
+    {
+      "epoch": 4.2283298097251585,
+      "grad_norm": 1.1894927227873353,
+      "learning_rate": 1.4294617107233504e-05,
+      "loss": 0.4744,
+      "step": 2000
+    },
+    {
+      "epoch": 4.334038054968287,
+      "grad_norm": 1.2032936700401937,
+      "learning_rate": 1.3958538781686194e-05,
+      "loss": 0.4805,
+      "step": 2050
+    },
+    {
+      "epoch": 4.439746300211416,
+      "grad_norm": 1.3185879928970223,
+      "learning_rate": 1.3617071333321705e-05,
+      "loss": 0.4826,
+      "step": 2100
+    },
+    {
+      "epoch": 4.545454545454545,
+      "grad_norm": 1.209922790479991,
+      "learning_rate": 1.3270679633174219e-05,
+      "loss": 0.4839,
+      "step": 2150
+    },
+    {
+      "epoch": 4.651162790697675,
+      "grad_norm": 1.242791543340292,
+      "learning_rate": 1.2919835256114639e-05,
+      "loss": 0.4885,
+      "step": 2200
+    },
+    {
+      "epoch": 4.7568710359408035,
+      "grad_norm": 1.201986398108636,
+      "learning_rate": 1.2565015838852364e-05,
+      "loss": 0.4871,
+      "step": 2250
+    },
+    {
+      "epoch": 4.862579281183932,
+      "grad_norm": 1.2360526015797055,
+      "learning_rate": 1.2206704429684504e-05,
+      "loss": 0.4846,
+      "step": 2300
+    },
+    {
+      "epoch": 4.968287526427061,
+      "grad_norm": 1.1786633423072006,
+      "learning_rate": 1.1845388830877826e-05,
+      "loss": 0.4893,
+      "step": 2350
+    },
+    {
+      "epoch": 5.0,
+      "eval_loss": 1.4134972095489502,
+      "eval_runtime": 18.68,
+      "eval_samples_per_second": 181.478,
+      "eval_steps_per_second": 1.445,
+      "step": 2365
+    },
+    {
+      "epoch": 5.07399577167019,
+      "grad_norm": 1.2309864025937378,
+      "learning_rate": 1.1481560934578686e-05,
+      "loss": 0.3702,
+      "step": 2400
+    },
+    {
+      "epoch": 5.179704016913319,
+      "grad_norm": 1.2786650363864003,
+      "learning_rate": 1.1115716053155003e-05,
+      "loss": 0.3218,
+      "step": 2450
+    },
+    {
+      "epoch": 5.2854122621564485,
+      "grad_norm": 1.2330334913778382,
+      "learning_rate": 1.0748352244882008e-05,
+      "loss": 0.3265,
+      "step": 2500
+    },
+    {
+      "epoch": 5.391120507399577,
+      "grad_norm": 1.2643477957547729,
+      "learning_rate": 1.0379969635889705e-05,
+      "loss": 0.3295,
+      "step": 2550
+    },
+    {
+      "epoch": 5.496828752642706,
+      "grad_norm": 1.281027075874609,
+      "learning_rate": 1.0011069739295196e-05,
+      "loss": 0.3263,
+      "step": 2600
+    },
+    {
+      "epoch": 5.602536997885835,
+      "grad_norm": 1.22994140794609,
+      "learning_rate": 9.642154772446763e-06,
+      "loss": 0.3289,
+      "step": 2650
+    },
+    {
+      "epoch": 5.708245243128964,
+      "grad_norm": 1.3028039578218702,
+      "learning_rate": 9.273726973209203e-06,
+      "loss": 0.3284,
+      "step": 2700
+    },
+    {
+      "epoch": 5.813953488372093,
+      "grad_norm": 1.2532248232758287,
+      "learning_rate": 8.906287916221259e-06,
+      "loss": 0.3308,
+      "step": 2750
+    },
+    {
+      "epoch": 5.9196617336152215,
+      "grad_norm": 1.3100012552937503,
+      "learning_rate": 8.540337830055927e-06,
+      "loss": 0.3315,
+      "step": 2800
+    },
+    {
+      "epoch": 6.0,
+      "eval_loss": 1.5860239267349243,
+      "eval_runtime": 18.5094,
+      "eval_samples_per_second": 183.151,
+      "eval_steps_per_second": 1.459,
+      "step": 2838
+    },
+    {
+      "epoch": 6.025369978858351,
+      "grad_norm": 1.4354000732242087,
+      "learning_rate": 8.176374916213325e-06,
+      "loss": 0.3032,
+      "step": 2850
+    },
+    {
+      "epoch": 6.13107822410148,
+      "grad_norm": 1.1648180451771042,
+      "learning_rate": 7.814894670873171e-06,
+      "loss": 0.2141,
+      "step": 2900
+    },
+    {
+      "epoch": 6.236786469344609,
+      "grad_norm": 1.1884819713548493,
+      "learning_rate": 7.456389210330289e-06,
+      "loss": 0.2153,
+      "step": 2950
+    },
+    {
+      "epoch": 6.342494714587738,
+      "grad_norm": 1.2643468298433007,
+      "learning_rate": 7.101346601031416e-06,
+      "loss": 0.2153,
+      "step": 3000
+    },
+    {
+      "epoch": 6.4482029598308666,
+      "grad_norm": 1.2064695562383054,
+      "learning_rate": 6.750250195125509e-06,
+      "loss": 0.2188,
+      "step": 3050
+    },
+    {
+      "epoch": 6.553911205073996,
+      "grad_norm": 1.2016268195975075,
+      "learning_rate": 6.4035779724320115e-06,
+      "loss": 0.2171,
+      "step": 3100
+    },
+    {
+      "epoch": 6.659619450317125,
+      "grad_norm": 1.2411099570397714,
+      "learning_rate": 6.061801889722952e-06,
+      "loss": 0.2162,
+      "step": 3150
+    },
+    {
+      "epoch": 6.765327695560254,
+      "grad_norm": 1.2392777259407999,
+      "learning_rate": 5.725387238204831e-06,
+      "loss": 0.215,
+      "step": 3200
+    },
+    {
+      "epoch": 6.871035940803383,
+      "grad_norm": 1.1740909434133877,
+      "learning_rate": 5.394792010074907e-06,
+      "loss": 0.2166,
+      "step": 3250
+    },
+    {
+      "epoch": 6.976744186046512,
+      "grad_norm": 1.2762193770401755,
+      "learning_rate": 5.0704662750142875e-06,
+      "loss": 0.2165,
+      "step": 3300
+    },
+    {
+      "epoch": 7.0,
+      "eval_loss": 1.802822470664978,
+      "eval_runtime": 18.5091,
+      "eval_samples_per_second": 183.153,
+      "eval_steps_per_second": 1.459,
+      "step": 3311
+    },
+    {
+      "epoch": 7.08245243128964,
+      "grad_norm": 1.0526386037599447,
+      "learning_rate": 4.752851567466723e-06,
+      "loss": 0.1594,
+      "step": 3350
+    },
+    {
+      "epoch": 7.188160676532769,
+      "grad_norm": 1.147178748636373,
+      "learning_rate": 4.442380285537152e-06,
+      "loss": 0.1424,
+      "step": 3400
+    },
+    {
+      "epoch": 7.293868921775898,
+      "grad_norm": 1.0933846169501016,
+      "learning_rate": 4.1394751023283985e-06,
+      "loss": 0.1429,
+      "step": 3450
+    },
+    {
+      "epoch": 7.399577167019028,
+      "grad_norm": 1.1027491437777261,
+      "learning_rate": 3.844548390517433e-06,
+      "loss": 0.1446,
+      "step": 3500
+    },
+    {
+      "epoch": 7.505285412262157,
+      "grad_norm": 1.17228307610312,
+      "learning_rate": 3.5580016609545076e-06,
+      "loss": 0.1472,
+      "step": 3550
+    },
+    {
+      "epoch": 7.6109936575052854,
+      "grad_norm": 1.1469564201071871,
+      "learning_rate": 3.280225016049543e-06,
+      "loss": 0.1444,
+      "step": 3600
+    },
+    {
+      "epoch": 7.716701902748414,
+      "grad_norm": 1.1481692114896938,
+      "learning_rate": 3.011596618689825e-06,
+      "loss": 0.1442,
+      "step": 3650
+    },
+    {
+      "epoch": 7.822410147991543,
+      "grad_norm": 1.0576784331195976,
+      "learning_rate": 2.7524821774121158e-06,
+      "loss": 0.1432,
+      "step": 3700
+    },
+    {
+      "epoch": 7.928118393234672,
+      "grad_norm": 1.017768412790375,
+      "learning_rate": 2.5032344485299886e-06,
+      "loss": 0.1444,
+      "step": 3750
+    },
+    {
+      "epoch": 8.0,
+      "eval_loss": 2.0527853965759277,
+      "eval_runtime": 18.5305,
+      "eval_samples_per_second": 182.942,
+      "eval_steps_per_second": 1.457,
+      "step": 3784
+    },
+    {
+      "epoch": 8.033826638477802,
+      "grad_norm": 0.8182949486515412,
+      "learning_rate": 2.2641927558942135e-06,
+      "loss": 0.1304,
+      "step": 3800
+    },
+    {
+      "epoch": 8.13953488372093,
+      "grad_norm": 0.8622437048349989,
+      "learning_rate": 2.0356825289400185e-06,
+      "loss": 0.1036,
+      "step": 3850
+    },
+    {
+      "epoch": 8.24524312896406,
+      "grad_norm": 0.7861265978157033,
+      "learning_rate": 1.818014859650068e-06,
+      "loss": 0.1028,
+      "step": 3900
+    },
+    {
+      "epoch": 8.350951374207188,
+      "grad_norm": 0.8194211537002656,
+      "learning_rate": 1.6114860790363351e-06,
+      "loss": 0.1032,
+      "step": 3950
+    },
+    {
+      "epoch": 8.456659619450317,
+      "grad_norm": 0.8182271224247505,
+      "learning_rate": 1.4163773537174653e-06,
+      "loss": 0.1032,
+      "step": 4000
+    },
+    {
+      "epoch": 8.562367864693446,
+      "grad_norm": 0.8307023639288861,
+      "learning_rate": 1.2329543031407953e-06,
+      "loss": 0.1036,
+      "step": 4050
+    },
+    {
+      "epoch": 8.668076109936575,
+      "grad_norm": 0.8784936190909071,
+      "learning_rate": 1.0614666379701732e-06,
+      "loss": 0.1023,
+      "step": 4100
+    },
+    {
+      "epoch": 8.773784355179703,
+      "grad_norm": 0.81990506328911,
+      "learning_rate": 9.021478201318846e-07,
+      "loss": 0.1036,
+      "step": 4150
+    },
+    {
+      "epoch": 8.879492600422832,
+      "grad_norm": 1.2592556041877936,
+      "learning_rate": 7.552147449814573e-07,
+      "loss": 0.1044,
+      "step": 4200
+    },
+    {
+      "epoch": 8.985200845665961,
+      "grad_norm": 0.7975519834622627,
+      "learning_rate": 6.20867446024066e-07,
+      "loss": 0.1032,
+      "step": 4250
+    },
+    {
+      "epoch": 9.0,
+      "eval_loss": 2.279863119125366,
+      "eval_runtime": 18.5289,
+      "eval_samples_per_second": 182.957,
+      "eval_steps_per_second": 1.457,
+      "step": 4257
+    },
+    {
+      "epoch": 9.090909090909092,
+      "grad_norm": 0.7603748072458637,
+      "learning_rate": 4.992888225905467e-07,
+      "loss": 0.0893,
+      "step": 4300
+    },
+    {
+      "epoch": 9.19661733615222,
+      "grad_norm": 0.7495366875123604,
+      "learning_rate": 3.906443908397017e-07,
+      "loss": 0.0873,
+      "step": 4350
+    },
+    {
+      "epoch": 9.30232558139535,
+      "grad_norm": 0.7691079659468273,
+      "learning_rate": 2.9508205842594727e-07,
+      "loss": 0.0871,
+      "step": 4400
+    },
+    {
+      "epoch": 9.408033826638478,
+      "grad_norm": 0.6348077284451538,
+      "learning_rate": 2.127319231390168e-07,
+      "loss": 0.0877,
+      "step": 4450
+    },
+    {
+      "epoch": 9.513742071881607,
+      "grad_norm": 0.6760109276931042,
+      "learning_rate": 1.4370609578987948e-07,
+      "loss": 0.0875,
+      "step": 4500
+    },
+    {
+      "epoch": 9.619450317124736,
+      "grad_norm": 0.6725906544777629,
+      "learning_rate": 8.809854758399017e-08,
+      "loss": 0.087,
+      "step": 4550
+    },
+    {
+      "epoch": 9.725158562367865,
+      "grad_norm": 0.6525914332864488,
+      "learning_rate": 4.5984982189636316e-08,
+      "loss": 0.0865,
+      "step": 4600
+    },
+    {
+      "epoch": 9.830866807610994,
+      "grad_norm": 0.6640786754191882,
+      "learning_rate": 1.7422732675583808e-08,
+      "loss": 0.0888,
+      "step": 4650
+    },
+    {
+      "epoch": 9.936575052854122,
+      "grad_norm": 0.6669826671643079,
+      "learning_rate": 2.4506834582960213e-09,
+      "loss": 0.0879,
+      "step": 4700
+    },
+    {
+      "epoch": 10.0,
+      "eval_loss": 2.3936548233032227,
+      "eval_runtime": 18.4489,
+      "eval_samples_per_second": 183.751,
+      "eval_steps_per_second": 1.464,
+      "step": 4730
+    },
+    {
+      "epoch": 10.0,
+      "step": 4730,
+      "total_flos": 651355949629440.0,
+      "train_loss": 0.5446855222952038,
+      "train_runtime": 13348.3684,
+      "train_samples_per_second": 45.308,
+      "train_steps_per_second": 0.354
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 4730,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 651355949629440.0,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}