Upload folder using huggingface_hub

Browse files

Files changed (13) hide show

49_128_e3_3e-5/README.md +63 -0
49_128_e3_3e-5/adapter_config.json +39 -0
49_128_e3_3e-5/adapter_model.safetensors +3 -0
49_128_e3_3e-5/all_results.json +9 -0
49_128_e3_3e-5/config.json +32 -0
49_128_e3_3e-5/merges.txt +0 -0
49_128_e3_3e-5/special_tokens_map.json +45 -0
49_128_e3_3e-5/tokenizer.json +0 -0
49_128_e3_3e-5/tokenizer_config.json +188 -0
49_128_e3_3e-5/train_results.json +9 -0
49_128_e3_3e-5/trainer_state.json +1604 -0
49_128_e3_3e-5/training_args.bin +3 -0
49_128_e3_3e-5/vocab.json +0 -0

49_128_e3_3e-5/README.md ADDED Viewed

	@@ -0,0 +1,63 @@

+---
+library_name: peft
+license: apache-2.0
+base_model: ibm-granite/granite-3.3-8b-base
+tags:
+- alignment-handbook
+- generated_from_trainer
+datasets:
+- data/knowledge_lora_training_data_2000
+model-index:
+- name: 49_128_e3_3e-5
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# 49_128_e3_3e-5
+This model is a fine-tuned version of [ibm-granite/granite-3.3-8b-base](https://huggingface.co/ibm-granite/granite-3.3-8b-base) on the data/knowledge_lora_training_data_2000 dataset.
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 3e-05
+- train_batch_size: 2
+- eval_batch_size: 8
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 8
+- gradient_accumulation_steps: 2
+- total_train_batch_size: 32
+- total_eval_batch_size: 64
+- optimizer: adamw_torch with betas=(0.9,0.95), epsilon=1e-08, and no additional optimizer arguments
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.05
+- num_epochs: 3.0
+### Training results
+### Framework versions
+- PEFT 0.15.2
+- Transformers 4.52.4
+- Pytorch 2.7.0+cu126
+- Datasets 3.6.0
+- Tokenizers 0.21.2

49_128_e3_3e-5/adapter_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "ibm-granite/granite-3.3-8b-base",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "q_proj",
+    "gate_proj",
+    "k_proj",
+    "o_proj",
+    "up_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}

49_128_e3_3e-5/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0a739c1de2b7a8cbbf00ddeb310b7588498e5327213360541f505523f92c69ce
+size 791751704

49_128_e3_3e-5/all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 1.5875073592094884e+18,
+    "train_loss": 0.5184633527551928,
+    "train_runtime": 742.8841,
+    "train_samples": 11877,
+    "train_samples_per_second": 47.963,
+    "train_steps_per_second": 1.502
+}

49_128_e3_3e-5/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "architectures": [
+    "GraniteForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0,
+  "attention_multiplier": 0.0078125,
+  "bos_token_id": 0,
+  "embedding_multiplier": 12.0,
+  "eos_token_id": 0,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12800,
+  "logits_scaling": 16.0,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "granite",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "pad_token_id": 0,
+  "residual_multiplier": 0.22,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.4",
+  "use_cache": true,
+  "vocab_size": 49152
+}

49_128_e3_3e-5/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

49_128_e3_3e-5/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<fim_prefix>",
+    "<fim_middle>",
+    "<fim_suffix>",
+    "<fim_pad>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<empty_output>",
+    "<commit_before>",
+    "<commit_msg>",
+    "<commit_after>",
+    "<reponame>"
+  ],
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<reponame>",
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

49_128_e3_3e-5/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

49_128_e3_3e-5/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,188 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<fim_prefix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<fim_middle>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<fim_suffix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<fim_pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<filename>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<gh_stars>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<commit_before>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<commit_msg>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "17": {
+      "content": "<commit_after>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "18": {
+      "content": "<reponame>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<fim_prefix>",
+    "<fim_middle>",
+    "<fim_suffix>",
+    "<fim_pad>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<empty_output>",
+    "<commit_before>",
+    "<commit_msg>",
+    "<commit_after>",
+    "<reponame>"
+  ],
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "pad_token": "<reponame>",
+  "padding_side": "left",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>",
+  "vocab_size": 49152
+}

49_128_e3_3e-5/train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 1.5875073592094884e+18,
+    "train_loss": 0.5184633527551928,
+    "train_runtime": 742.8841,
+    "train_samples": 11877,
+    "train_samples_per_second": 47.963,
+    "train_steps_per_second": 1.502
+}

49_128_e3_3e-5/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1604 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 1116,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.013458950201884253,
+      "grad_norm": 0.9882215261459351,
+      "learning_rate": 2.1428571428571427e-06,
+      "loss": 1.2639,
+      "step": 5
+    },
+    {
+      "epoch": 0.026917900403768506,
+      "grad_norm": 0.940019965171814,
+      "learning_rate": 4.821428571428572e-06,
+      "loss": 1.2334,
+      "step": 10
+    },
+    {
+      "epoch": 0.040376850605652756,
+      "grad_norm": 0.6706891059875488,
+      "learning_rate": 7.5e-06,
+      "loss": 1.2291,
+      "step": 15
+    },
+    {
+      "epoch": 0.05383580080753701,
+      "grad_norm": 0.7410071492195129,
+      "learning_rate": 1.0178571428571429e-05,
+      "loss": 1.2068,
+      "step": 20
+    },
+    {
+      "epoch": 0.06729475100942127,
+      "grad_norm": 1.317247748374939,
+      "learning_rate": 1.2857142857142857e-05,
+      "loss": 1.209,
+      "step": 25
+    },
+    {
+      "epoch": 0.08075370121130551,
+      "grad_norm": 0.6265580654144287,
+      "learning_rate": 1.553571428571429e-05,
+      "loss": 1.1986,
+      "step": 30
+    },
+    {
+      "epoch": 0.09421265141318977,
+      "grad_norm": 0.6021242141723633,
+      "learning_rate": 1.8214285714285712e-05,
+      "loss": 1.1809,
+      "step": 35
+    },
+    {
+      "epoch": 0.10767160161507403,
+      "grad_norm": 0.5548004508018494,
+      "learning_rate": 2.089285714285714e-05,
+      "loss": 1.1811,
+      "step": 40
+    },
+    {
+      "epoch": 0.12113055181695828,
+      "grad_norm": 0.5183567404747009,
+      "learning_rate": 2.357142857142857e-05,
+      "loss": 1.1822,
+      "step": 45
+    },
+    {
+      "epoch": 0.13458950201884254,
+      "grad_norm": 0.5572848320007324,
+      "learning_rate": 2.625e-05,
+      "loss": 1.1761,
+      "step": 50
+    },
+    {
+      "epoch": 0.1480484522207268,
+      "grad_norm": 0.5353060364723206,
+      "learning_rate": 2.892857142857143e-05,
+      "loss": 1.124,
+      "step": 55
+    },
+    {
+      "epoch": 0.16150740242261102,
+      "grad_norm": 0.6251804828643799,
+      "learning_rate": 2.9999407089793328e-05,
+      "loss": 1.1083,
+      "step": 60
+    },
+    {
+      "epoch": 0.17496635262449528,
+      "grad_norm": 0.6196370124816895,
+      "learning_rate": 2.99957839193835e-05,
+      "loss": 1.0978,
+      "step": 65
+    },
+    {
+      "epoch": 0.18842530282637954,
+      "grad_norm": 0.4450572431087494,
+      "learning_rate": 2.9988867767784412e-05,
+      "loss": 1.0598,
+      "step": 70
+    },
+    {
+      "epoch": 0.2018842530282638,
+      "grad_norm": 0.6395757794380188,
+      "learning_rate": 2.997866015374007e-05,
+      "loss": 1.125,
+      "step": 75
+    },
+    {
+      "epoch": 0.21534320323014805,
+      "grad_norm": 0.5264338254928589,
+      "learning_rate": 2.996516331877925e-05,
+      "loss": 1.0484,
+      "step": 80
+    },
+    {
+      "epoch": 0.2288021534320323,
+      "grad_norm": 0.5416696667671204,
+      "learning_rate": 2.994838022672324e-05,
+      "loss": 1.0364,
+      "step": 85
+    },
+    {
+      "epoch": 0.24226110363391656,
+      "grad_norm": 0.4816514253616333,
+      "learning_rate": 2.9928314563035015e-05,
+      "loss": 1.0884,
+      "step": 90
+    },
+    {
+      "epoch": 0.2557200538358008,
+      "grad_norm": 0.8453904986381531,
+      "learning_rate": 2.9904970734009933e-05,
+      "loss": 1.0505,
+      "step": 95
+    },
+    {
+      "epoch": 0.2691790040376851,
+      "grad_norm": 0.5211928486824036,
+      "learning_rate": 2.9878353865808144e-05,
+      "loss": 0.9872,
+      "step": 100
+    },
+    {
+      "epoch": 0.28263795423956933,
+      "grad_norm": 0.5633994936943054,
+      "learning_rate": 2.9848469803328896e-05,
+      "loss": 1.0204,
+      "step": 105
+    },
+    {
+      "epoch": 0.2960969044414536,
+      "grad_norm": 0.5428367853164673,
+      "learning_rate": 2.981532510892707e-05,
+      "loss": 0.9901,
+      "step": 110
+    },
+    {
+      "epoch": 0.30955585464333785,
+      "grad_norm": 0.5815945863723755,
+      "learning_rate": 2.9778927060972075e-05,
+      "loss": 1.0007,
+      "step": 115
+    },
+    {
+      "epoch": 0.32301480484522205,
+      "grad_norm": 0.5644654035568237,
+      "learning_rate": 2.9739283652249625e-05,
+      "loss": 0.9891,
+      "step": 120
+    },
+    {
+      "epoch": 0.3364737550471063,
+      "grad_norm": 0.633641242980957,
+      "learning_rate": 2.9696403588206517e-05,
+      "loss": 0.9361,
+      "step": 125
+    },
+    {
+      "epoch": 0.34993270524899056,
+      "grad_norm": 0.6438229084014893,
+      "learning_rate": 2.9650296285038996e-05,
+      "loss": 0.9631,
+      "step": 130
+    },
+    {
+      "epoch": 0.3633916554508748,
+      "grad_norm": 0.6100443601608276,
+      "learning_rate": 2.9600971867625027e-05,
+      "loss": 0.9458,
+      "step": 135
+    },
+    {
+      "epoch": 0.3768506056527591,
+      "grad_norm": 0.6567280292510986,
+      "learning_rate": 2.95484411673009e-05,
+      "loss": 0.9037,
+      "step": 140
+    },
+    {
+      "epoch": 0.39030955585464333,
+      "grad_norm": 0.6098927855491638,
+      "learning_rate": 2.9492715719482776e-05,
+      "loss": 0.9394,
+      "step": 145
+    },
+    {
+      "epoch": 0.4037685060565276,
+      "grad_norm": 0.6474848389625549,
+      "learning_rate": 2.9433807761133542e-05,
+      "loss": 0.9386,
+      "step": 150
+    },
+    {
+      "epoch": 0.41722745625841184,
+      "grad_norm": 0.7610502243041992,
+      "learning_rate": 2.9371730228075684e-05,
+      "loss": 0.8807,
+      "step": 155
+    },
+    {
+      "epoch": 0.4306864064602961,
+      "grad_norm": 0.7775782346725464,
+      "learning_rate": 2.9306496752150638e-05,
+      "loss": 0.8372,
+      "step": 160
+    },
+    {
+      "epoch": 0.44414535666218036,
+      "grad_norm": 0.7716594934463501,
+      "learning_rate": 2.9238121658225337e-05,
+      "loss": 0.8837,
+      "step": 165
+    },
+    {
+      "epoch": 0.4576043068640646,
+      "grad_norm": 0.6561233401298523,
+      "learning_rate": 2.9166619961046545e-05,
+      "loss": 0.8971,
+      "step": 170
+    },
+    {
+      "epoch": 0.47106325706594887,
+      "grad_norm": 0.698351263999939,
+      "learning_rate": 2.909200736194372e-05,
+      "loss": 0.8761,
+      "step": 175
+    },
+    {
+      "epoch": 0.4845222072678331,
+      "grad_norm": 0.6881304383277893,
+      "learning_rate": 2.9014300245381095e-05,
+      "loss": 0.834,
+      "step": 180
+    },
+    {
+      "epoch": 0.4979811574697174,
+      "grad_norm": 0.726060152053833,
+      "learning_rate": 2.8933515675359766e-05,
+      "loss": 0.9004,
+      "step": 185
+    },
+    {
+      "epoch": 0.5114401076716016,
+      "grad_norm": 0.649829626083374,
+      "learning_rate": 2.8849671391670518e-05,
+      "loss": 0.837,
+      "step": 190
+    },
+    {
+      "epoch": 0.5248990578734859,
+      "grad_norm": 0.7975636124610901,
+      "learning_rate": 2.8762785805998295e-05,
+      "loss": 0.8313,
+      "step": 195
+    },
+    {
+      "epoch": 0.5383580080753702,
+      "grad_norm": 0.855278730392456,
+      "learning_rate": 2.86728779978791e-05,
+      "loss": 0.8254,
+      "step": 200
+    },
+    {
+      "epoch": 0.5518169582772544,
+      "grad_norm": 0.829576313495636,
+      "learning_rate": 2.8579967710510264e-05,
+      "loss": 0.8876,
+      "step": 205
+    },
+    {
+      "epoch": 0.5652759084791387,
+      "grad_norm": 0.7353431582450867,
+      "learning_rate": 2.8484075346414936e-05,
+      "loss": 0.7699,
+      "step": 210
+    },
+    {
+      "epoch": 0.5787348586810229,
+      "grad_norm": 0.8122780323028564,
+      "learning_rate": 2.838522196296182e-05,
+      "loss": 0.8452,
+      "step": 215
+    },
+    {
+      "epoch": 0.5921938088829072,
+      "grad_norm": 0.7391046285629272,
+      "learning_rate": 2.8283429267741138e-05,
+      "loss": 0.7429,
+      "step": 220
+    },
+    {
+      "epoch": 0.6056527590847914,
+      "grad_norm": 1.0399044752120972,
+      "learning_rate": 2.8178719613797747e-05,
+      "loss": 0.7926,
+      "step": 225
+    },
+    {
+      "epoch": 0.6191117092866757,
+      "grad_norm": 0.8288000226020813,
+      "learning_rate": 2.807111599472254e-05,
+      "loss": 0.765,
+      "step": 230
+    },
+    {
+      "epoch": 0.6325706594885598,
+      "grad_norm": 0.9703077673912048,
+      "learning_rate": 2.7960642039603235e-05,
+      "loss": 0.7335,
+      "step": 235
+    },
+    {
+      "epoch": 0.6460296096904441,
+      "grad_norm": 0.9160774946212769,
+      "learning_rate": 2.7847322007835546e-05,
+      "loss": 0.782,
+      "step": 240
+    },
+    {
+      "epoch": 0.6594885598923284,
+      "grad_norm": 0.9156641960144043,
+      "learning_rate": 2.773118078379597e-05,
+      "loss": 0.7643,
+      "step": 245
+    },
+    {
+      "epoch": 0.6729475100942126,
+      "grad_norm": 0.8600050210952759,
+      "learning_rate": 2.7612243871377342e-05,
+      "loss": 0.7607,
+      "step": 250
+    },
+    {
+      "epoch": 0.6864064602960969,
+      "grad_norm": 0.7924662828445435,
+      "learning_rate": 2.749053738838834e-05,
+      "loss": 0.7422,
+      "step": 255
+    },
+    {
+      "epoch": 0.6998654104979811,
+      "grad_norm": 0.799248993396759,
+      "learning_rate": 2.7366088060818154e-05,
+      "loss": 0.7405,
+      "step": 260
+    },
+    {
+      "epoch": 0.7133243606998654,
+      "grad_norm": 0.7867820858955383,
+      "learning_rate": 2.7238923216967666e-05,
+      "loss": 0.7193,
+      "step": 265
+    },
+    {
+      "epoch": 0.7267833109017496,
+      "grad_norm": 0.8833760619163513,
+      "learning_rate": 2.7109070781448283e-05,
+      "loss": 0.7143,
+      "step": 270
+    },
+    {
+      "epoch": 0.7402422611036339,
+      "grad_norm": 0.9755755066871643,
+      "learning_rate": 2.6976559269049875e-05,
+      "loss": 0.737,
+      "step": 275
+    },
+    {
+      "epoch": 0.7537012113055181,
+      "grad_norm": 0.9140403270721436,
+      "learning_rate": 2.6841417778479132e-05,
+      "loss": 0.6613,
+      "step": 280
+    },
+    {
+      "epoch": 0.7671601615074024,
+      "grad_norm": 1.0706220865249634,
+      "learning_rate": 2.670367598596963e-05,
+      "loss": 0.6931,
+      "step": 285
+    },
+    {
+      "epoch": 0.7806191117092867,
+      "grad_norm": 0.895652174949646,
+      "learning_rate": 2.6563364138765137e-05,
+      "loss": 0.6678,
+      "step": 290
+    },
+    {
+      "epoch": 0.7940780619111709,
+      "grad_norm": 0.9422807097434998,
+      "learning_rate": 2.6420513048477503e-05,
+      "loss": 0.7114,
+      "step": 295
+    },
+    {
+      "epoch": 0.8075370121130552,
+      "grad_norm": 0.9548665285110474,
+      "learning_rate": 2.6275154084320622e-05,
+      "loss": 0.6963,
+      "step": 300
+    },
+    {
+      "epoch": 0.8209959623149394,
+      "grad_norm": 0.8600846529006958,
+      "learning_rate": 2.6127319166221923e-05,
+      "loss": 0.6485,
+      "step": 305
+    },
+    {
+      "epoch": 0.8344549125168237,
+      "grad_norm": 0.8546967506408691,
+      "learning_rate": 2.5977040757812997e-05,
+      "loss": 0.6661,
+      "step": 310
+    },
+    {
+      "epoch": 0.847913862718708,
+      "grad_norm": 0.9586823582649231,
+      "learning_rate": 2.5824351859300748e-05,
+      "loss": 0.6856,
+      "step": 315
+    },
+    {
+      "epoch": 0.8613728129205922,
+      "grad_norm": 1.010869026184082,
+      "learning_rate": 2.5669286000220757e-05,
+      "loss": 0.701,
+      "step": 320
+    },
+    {
+      "epoch": 0.8748317631224765,
+      "grad_norm": 1.1527146100997925,
+      "learning_rate": 2.551187723207443e-05,
+      "loss": 0.6431,
+      "step": 325
+    },
+    {
+      "epoch": 0.8882907133243607,
+      "grad_norm": 0.948448657989502,
+      "learning_rate": 2.5352160120851464e-05,
+      "loss": 0.6379,
+      "step": 330
+    },
+    {
+      "epoch": 0.901749663526245,
+      "grad_norm": 1.0279428958892822,
+      "learning_rate": 2.519016973943939e-05,
+      "loss": 0.6278,
+      "step": 335
+    },
+    {
+      "epoch": 0.9152086137281292,
+      "grad_norm": 0.938048243522644,
+      "learning_rate": 2.5025941659921783e-05,
+      "loss": 0.6158,
+      "step": 340
+    },
+    {
+      "epoch": 0.9286675639300135,
+      "grad_norm": 0.951744794845581,
+      "learning_rate": 2.485951194576685e-05,
+      "loss": 0.6429,
+      "step": 345
+    },
+    {
+      "epoch": 0.9421265141318977,
+      "grad_norm": 0.9467681646347046,
+      "learning_rate": 2.469091714390811e-05,
+      "loss": 0.6459,
+      "step": 350
+    },
+    {
+      "epoch": 0.955585464333782,
+      "grad_norm": 1.0841658115386963,
+      "learning_rate": 2.4520194276718938e-05,
+      "loss": 0.6094,
+      "step": 355
+    },
+    {
+      "epoch": 0.9690444145356663,
+      "grad_norm": 0.8894978165626526,
+      "learning_rate": 2.4347380833882634e-05,
+      "loss": 0.623,
+      "step": 360
+    },
+    {
+      "epoch": 0.9825033647375505,
+      "grad_norm": 0.9439630508422852,
+      "learning_rate": 2.417251476415998e-05,
+      "loss": 0.6406,
+      "step": 365
+    },
+    {
+      "epoch": 0.9959623149394348,
+      "grad_norm": 0.9258154630661011,
+      "learning_rate": 2.3995634467055882e-05,
+      "loss": 0.5976,
+      "step": 370
+    },
+    {
+      "epoch": 1.0080753701211305,
+      "grad_norm": 1.1117361783981323,
+      "learning_rate": 2.3816778784387097e-05,
+      "loss": 0.5804,
+      "step": 375
+    },
+    {
+      "epoch": 1.0215343203230147,
+      "grad_norm": 0.9733107089996338,
+      "learning_rate": 2.363598699175281e-05,
+      "loss": 0.5061,
+      "step": 380
+    },
+    {
+      "epoch": 1.034993270524899,
+      "grad_norm": 0.9482859969139099,
+      "learning_rate": 2.3453298789909935e-05,
+      "loss": 0.523,
+      "step": 385
+    },
+    {
+      "epoch": 1.0484522207267832,
+      "grad_norm": 1.136534333229065,
+      "learning_rate": 2.3268754296055122e-05,
+      "loss": 0.5518,
+      "step": 390
+    },
+    {
+      "epoch": 1.0619111709286675,
+      "grad_norm": 1.026702880859375,
+      "learning_rate": 2.3082394035015212e-05,
+      "loss": 0.5276,
+      "step": 395
+    },
+    {
+      "epoch": 1.0753701211305517,
+      "grad_norm": 1.0939162969589233,
+      "learning_rate": 2.2894258930348284e-05,
+      "loss": 0.5324,
+      "step": 400
+    },
+    {
+      "epoch": 1.088829071332436,
+      "grad_norm": 0.9696338176727295,
+      "learning_rate": 2.2704390295357056e-05,
+      "loss": 0.5136,
+      "step": 405
+    },
+    {
+      "epoch": 1.1022880215343203,
+      "grad_norm": 0.9916676878929138,
+      "learning_rate": 2.2512829824016773e-05,
+      "loss": 0.5235,
+      "step": 410
+    },
+    {
+      "epoch": 1.1157469717362045,
+      "grad_norm": 1.096625804901123,
+      "learning_rate": 2.2319619581819458e-05,
+      "loss": 0.4873,
+      "step": 415
+    },
+    {
+      "epoch": 1.1292059219380888,
+      "grad_norm": 1.1451526880264282,
+      "learning_rate": 2.2124801996536575e-05,
+      "loss": 0.4765,
+      "step": 420
+    },
+    {
+      "epoch": 1.142664872139973,
+      "grad_norm": 0.9332502484321594,
+      "learning_rate": 2.1928419848902163e-05,
+      "loss": 0.4786,
+      "step": 425
+    },
+    {
+      "epoch": 1.1561238223418573,
+      "grad_norm": 1.0080773830413818,
+      "learning_rate": 2.1730516263218472e-05,
+      "loss": 0.4909,
+      "step": 430
+    },
+    {
+      "epoch": 1.1695827725437415,
+      "grad_norm": 1.092008113861084,
+      "learning_rate": 2.1531134697886103e-05,
+      "loss": 0.4696,
+      "step": 435
+    },
+    {
+      "epoch": 1.1830417227456258,
+      "grad_norm": 1.1402854919433594,
+      "learning_rate": 2.133031893586083e-05,
+      "loss": 0.4854,
+      "step": 440
+    },
+    {
+      "epoch": 1.19650067294751,
+      "grad_norm": 1.10186767578125,
+      "learning_rate": 2.1128113075039124e-05,
+      "loss": 0.4958,
+      "step": 445
+    },
+    {
+      "epoch": 1.2099596231493943,
+      "grad_norm": 1.1457712650299072,
+      "learning_rate": 2.092456151857455e-05,
+      "loss": 0.4801,
+      "step": 450
+    },
+    {
+      "epoch": 1.2234185733512786,
+      "grad_norm": 0.9913672804832458,
+      "learning_rate": 2.0719708965127073e-05,
+      "loss": 0.5022,
+      "step": 455
+    },
+    {
+      "epoch": 1.2368775235531628,
+      "grad_norm": 1.1649874448776245,
+      "learning_rate": 2.0513600399047545e-05,
+      "loss": 0.4768,
+      "step": 460
+    },
+    {
+      "epoch": 1.250336473755047,
+      "grad_norm": 0.9590378999710083,
+      "learning_rate": 2.0306281080499413e-05,
+      "loss": 0.5171,
+      "step": 465
+    },
+    {
+      "epoch": 1.2637954239569313,
+      "grad_norm": 1.1455888748168945,
+      "learning_rate": 2.009779653551983e-05,
+      "loss": 0.4673,
+      "step": 470
+    },
+    {
+      "epoch": 1.2772543741588156,
+      "grad_norm": 1.0884604454040527,
+      "learning_rate": 1.9888192546022488e-05,
+      "loss": 0.4416,
+      "step": 475
+    },
+    {
+      "epoch": 1.2907133243606999,
+      "grad_norm": 1.0540392398834229,
+      "learning_rate": 1.9677515139744126e-05,
+      "loss": 0.5134,
+      "step": 480
+    },
+    {
+      "epoch": 1.304172274562584,
+      "grad_norm": 0.9360066056251526,
+      "learning_rate": 1.946581058013717e-05,
+      "loss": 0.4749,
+      "step": 485
+    },
+    {
+      "epoch": 1.3176312247644684,
+      "grad_norm": 1.0760797262191772,
+      "learning_rate": 1.9253125356210547e-05,
+      "loss": 0.461,
+      "step": 490
+    },
+    {
+      "epoch": 1.3310901749663526,
+      "grad_norm": 1.1534333229064941,
+      "learning_rate": 1.903950617232098e-05,
+      "loss": 0.4503,
+      "step": 495
+    },
+    {
+      "epoch": 1.3445491251682369,
+      "grad_norm": 1.110041618347168,
+      "learning_rate": 1.8824999937917025e-05,
+      "loss": 0.4331,
+      "step": 500
+    },
+    {
+      "epoch": 1.3580080753701211,
+      "grad_norm": 1.0444817543029785,
+      "learning_rate": 1.8609653757238026e-05,
+      "loss": 0.4292,
+      "step": 505
+    },
+    {
+      "epoch": 1.3714670255720054,
+      "grad_norm": 1.058761715888977,
+      "learning_rate": 1.8393514918970315e-05,
+      "loss": 0.4265,
+      "step": 510
+    },
+    {
+      "epoch": 1.3849259757738897,
+      "grad_norm": 1.1931772232055664,
+      "learning_rate": 1.8176630885862927e-05,
+      "loss": 0.452,
+      "step": 515
+    },
+    {
+      "epoch": 1.398384925975774,
+      "grad_norm": 1.1567202806472778,
+      "learning_rate": 1.7959049284305056e-05,
+      "loss": 0.4471,
+      "step": 520
+    },
+    {
+      "epoch": 1.4118438761776582,
+      "grad_norm": 1.1528855562210083,
+      "learning_rate": 1.7740817893867613e-05,
+      "loss": 0.4357,
+      "step": 525
+    },
+    {
+      "epoch": 1.4253028263795424,
+      "grad_norm": 1.1575230360031128,
+      "learning_rate": 1.752198463681111e-05,
+      "loss": 0.4249,
+      "step": 530
+    },
+    {
+      "epoch": 1.4387617765814267,
+      "grad_norm": 1.087706208229065,
+      "learning_rate": 1.7302597567562237e-05,
+      "loss": 0.4249,
+      "step": 535
+    },
+    {
+      "epoch": 1.452220726783311,
+      "grad_norm": 1.1799367666244507,
+      "learning_rate": 1.7082704862161422e-05,
+      "loss": 0.4086,
+      "step": 540
+    },
+    {
+      "epoch": 1.4656796769851952,
+      "grad_norm": 1.1974951028823853,
+      "learning_rate": 1.6862354807683643e-05,
+      "loss": 0.4195,
+      "step": 545
+    },
+    {
+      "epoch": 1.4791386271870794,
+      "grad_norm": 1.0563615560531616,
+      "learning_rate": 1.66415957916349e-05,
+      "loss": 0.4491,
+      "step": 550
+    },
+    {
+      "epoch": 1.4925975773889637,
+      "grad_norm": 1.0967886447906494,
+      "learning_rate": 1.642047629132663e-05,
+      "loss": 0.4067,
+      "step": 555
+    },
+    {
+      "epoch": 1.506056527590848,
+      "grad_norm": 1.0966241359710693,
+      "learning_rate": 1.619904486323037e-05,
+      "loss": 0.4362,
+      "step": 560
+    },
+    {
+      "epoch": 1.5195154777927322,
+      "grad_norm": 1.0399625301361084,
+      "learning_rate": 1.597735013231507e-05,
+      "loss": 0.4074,
+      "step": 565
+    },
+    {
+      "epoch": 1.5329744279946165,
+      "grad_norm": 0.995629072189331,
+      "learning_rate": 1.5755440781369345e-05,
+      "loss": 0.4502,
+      "step": 570
+    },
+    {
+      "epoch": 1.5464333781965007,
+      "grad_norm": 1.1102111339569092,
+      "learning_rate": 1.5533365540311038e-05,
+      "loss": 0.3681,
+      "step": 575
+    },
+    {
+      "epoch": 1.559892328398385,
+      "grad_norm": 1.2387511730194092,
+      "learning_rate": 1.531117317548643e-05,
+      "loss": 0.3995,
+      "step": 580
+    },
+    {
+      "epoch": 1.5733512786002692,
+      "grad_norm": 1.2061550617218018,
+      "learning_rate": 1.5088912478961458e-05,
+      "loss": 0.3832,
+      "step": 585
+    },
+    {
+      "epoch": 1.5868102288021535,
+      "grad_norm": 1.124131202697754,
+      "learning_rate": 1.4866632257807278e-05,
+      "loss": 0.3715,
+      "step": 590
+    },
+    {
+      "epoch": 1.6002691790040378,
+      "grad_norm": 1.1609008312225342,
+      "learning_rate": 1.4644381323382539e-05,
+      "loss": 0.4226,
+      "step": 595
+    },
+    {
+      "epoch": 1.613728129205922,
+      "grad_norm": 1.1971077919006348,
+      "learning_rate": 1.4422208480614684e-05,
+      "loss": 0.3835,
+      "step": 600
+    },
+    {
+      "epoch": 1.6271870794078063,
+      "grad_norm": 1.0978761911392212,
+      "learning_rate": 1.4200162517282736e-05,
+      "loss": 0.3972,
+      "step": 605
+    },
+    {
+      "epoch": 1.6406460296096905,
+      "grad_norm": 1.0205671787261963,
+      "learning_rate": 1.3978292193303768e-05,
+      "loss": 0.3835,
+      "step": 610
+    },
+    {
+      "epoch": 1.6541049798115748,
+      "grad_norm": 1.137688398361206,
+      "learning_rate": 1.3756646230025555e-05,
+      "loss": 0.4288,
+      "step": 615
+    },
+    {
+      "epoch": 1.667563930013459,
+      "grad_norm": 1.1691123247146606,
+      "learning_rate": 1.3535273299527622e-05,
+      "loss": 0.3641,
+      "step": 620
+    },
+    {
+      "epoch": 1.6810228802153433,
+      "grad_norm": 1.6706739664077759,
+      "learning_rate": 1.3314222013933218e-05,
+      "loss": 0.4099,
+      "step": 625
+    },
+    {
+      "epoch": 1.6944818304172276,
+      "grad_norm": 1.0793585777282715,
+      "learning_rate": 1.3093540914734351e-05,
+      "loss": 0.4181,
+      "step": 630
+    },
+    {
+      "epoch": 1.7079407806191118,
+      "grad_norm": 1.3619318008422852,
+      "learning_rate": 1.2873278462132401e-05,
+      "loss": 0.3841,
+      "step": 635
+    },
+    {
+      "epoch": 1.721399730820996,
+      "grad_norm": 1.095859169960022,
+      "learning_rate": 1.2653483024396535e-05,
+      "loss": 0.427,
+      "step": 640
+    },
+    {
+      "epoch": 1.7348586810228803,
+      "grad_norm": 1.0056387186050415,
+      "learning_rate": 1.2434202867242372e-05,
+      "loss": 0.3585,
+      "step": 645
+    },
+    {
+      "epoch": 1.7483176312247646,
+      "grad_norm": 1.1625523567199707,
+      "learning_rate": 1.221548614323308e-05,
+      "loss": 0.3616,
+      "step": 650
+    },
+    {
+      "epoch": 1.7617765814266488,
+      "grad_norm": 1.359569787979126,
+      "learning_rate": 1.1997380881205431e-05,
+      "loss": 0.333,
+      "step": 655
+    },
+    {
+      "epoch": 1.775235531628533,
+      "grad_norm": 1.1408214569091797,
+      "learning_rate": 1.1779934975722919e-05,
+      "loss": 0.3538,
+      "step": 660
+    },
+    {
+      "epoch": 1.7886944818304173,
+      "grad_norm": 1.2193180322647095,
+      "learning_rate": 1.1563196176558436e-05,
+      "loss": 0.3879,
+      "step": 665
+    },
+    {
+      "epoch": 1.8021534320323016,
+      "grad_norm": 1.2086423635482788,
+      "learning_rate": 1.13472120782087e-05,
+      "loss": 0.3682,
+      "step": 670
+    },
+    {
+      "epoch": 1.8156123822341859,
+      "grad_norm": 0.997719943523407,
+      "learning_rate": 1.1132030109442823e-05,
+      "loss": 0.3839,
+      "step": 675
+    },
+    {
+      "epoch": 1.8290713324360701,
+      "grad_norm": 1.1687936782836914,
+      "learning_rate": 1.0917697522887217e-05,
+      "loss": 0.3768,
+      "step": 680
+    },
+    {
+      "epoch": 1.8425302826379544,
+      "grad_norm": 1.1776505708694458,
+      "learning_rate": 1.0704261384649242e-05,
+      "loss": 0.346,
+      "step": 685
+    },
+    {
+      "epoch": 1.8559892328398386,
+      "grad_norm": 1.044433355331421,
+      "learning_rate": 1.0491768563981747e-05,
+      "loss": 0.3792,
+      "step": 690
+    },
+    {
+      "epoch": 1.8694481830417229,
+      "grad_norm": 0.9955368638038635,
+      "learning_rate": 1.0280265722990908e-05,
+      "loss": 0.3246,
+      "step": 695
+    },
+    {
+      "epoch": 1.8829071332436071,
+      "grad_norm": 1.0968743562698364,
+      "learning_rate": 1.0069799306389485e-05,
+      "loss": 0.3595,
+      "step": 700
+    },
+    {
+      "epoch": 1.8963660834454914,
+      "grad_norm": 1.1665064096450806,
+      "learning_rate": 9.860415531297881e-06,
+      "loss": 0.3774,
+      "step": 705
+    },
+    {
+      "epoch": 1.9098250336473757,
+      "grad_norm": 1.1354666948318481,
+      "learning_rate": 9.652160377095124e-06,
+      "loss": 0.3582,
+      "step": 710
+    },
+    {
+      "epoch": 1.92328398384926,
+      "grad_norm": 1.1390712261199951,
+      "learning_rate": 9.445079575322101e-06,
+      "loss": 0.3594,
+      "step": 715
+    },
+    {
+      "epoch": 1.9367429340511442,
+      "grad_norm": 1.1198657751083374,
+      "learning_rate": 9.239218599639171e-06,
+      "loss": 0.3213,
+      "step": 720
+    },
+    {
+      "epoch": 1.9502018842530284,
+      "grad_norm": 1.2458171844482422,
+      "learning_rate": 9.03462265584046e-06,
+      "loss": 0.3293,
+      "step": 725
+    },
+    {
+      "epoch": 1.9636608344549125,
+      "grad_norm": 1.0921350717544556,
+      "learning_rate": 8.831336671926924e-06,
+      "loss": 0.3505,
+      "step": 730
+    },
+    {
+      "epoch": 1.9771197846567967,
+      "grad_norm": 1.1428606510162354,
+      "learning_rate": 8.629405288240461e-06,
+      "loss": 0.3357,
+      "step": 735
+    },
+    {
+      "epoch": 1.990578734858681,
+      "grad_norm": 1.0794802904129028,
+      "learning_rate": 8.428872847661139e-06,
+      "loss": 0.3505,
+      "step": 740
+    },
+    {
+      "epoch": 2.0026917900403767,
+      "grad_norm": 1.1085920333862305,
+      "learning_rate": 8.229783385869807e-06,
+      "loss": 0.2998,
+      "step": 745
+    },
+    {
+      "epoch": 2.016150740242261,
+      "grad_norm": 1.1450228691101074,
+      "learning_rate": 8.03218062167811e-06,
+      "loss": 0.2814,
+      "step": 750
+    },
+    {
+      "epoch": 2.029609690444145,
+      "grad_norm": 1.1784065961837769,
+      "learning_rate": 7.836107947428115e-06,
+      "loss": 0.2864,
+      "step": 755
+    },
+    {
+      "epoch": 2.0430686406460294,
+      "grad_norm": 1.024545431137085,
+      "learning_rate": 7.641608419463621e-06,
+      "loss": 0.2633,
+      "step": 760
+    },
+    {
+      "epoch": 2.0565275908479137,
+      "grad_norm": 1.183801531791687,
+      "learning_rate": 7.448724748675273e-06,
+      "loss": 0.2903,
+      "step": 765
+    },
+    {
+      "epoch": 2.069986541049798,
+      "grad_norm": 1.1893815994262695,
+      "learning_rate": 7.257499291121473e-06,
+      "loss": 0.2706,
+      "step": 770
+    },
+    {
+      "epoch": 2.083445491251682,
+      "grad_norm": 1.226550579071045,
+      "learning_rate": 7.0679740387272896e-06,
+      "loss": 0.2937,
+      "step": 775
+    },
+    {
+      "epoch": 2.0969044414535665,
+      "grad_norm": 1.2340624332427979,
+      "learning_rate": 6.880190610063272e-06,
+      "loss": 0.2722,
+      "step": 780
+    },
+    {
+      "epoch": 2.1103633916554507,
+      "grad_norm": 1.2975499629974365,
+      "learning_rate": 6.694190241206277e-06,
+      "loss": 0.258,
+      "step": 785
+    },
+    {
+      "epoch": 2.123822341857335,
+      "grad_norm": 1.0203527212142944,
+      "learning_rate": 6.510013776684281e-06,
+      "loss": 0.2509,
+      "step": 790
+    },
+    {
+      "epoch": 2.1372812920592192,
+      "grad_norm": 1.3438855409622192,
+      "learning_rate": 6.327701660507191e-06,
+      "loss": 0.2859,
+      "step": 795
+    },
+    {
+      "epoch": 2.1507402422611035,
+      "grad_norm": 1.202867865562439,
+      "learning_rate": 6.147293927285537e-06,
+      "loss": 0.264,
+      "step": 800
+    },
+    {
+      "epoch": 2.1641991924629878,
+      "grad_norm": 1.0623880624771118,
+      "learning_rate": 5.968830193439195e-06,
+      "loss": 0.3156,
+      "step": 805
+    },
+    {
+      "epoch": 2.177658142664872,
+      "grad_norm": 1.0969281196594238,
+      "learning_rate": 5.7923496484978115e-06,
+      "loss": 0.2448,
+      "step": 810
+    },
+    {
+      "epoch": 2.1911170928667563,
+      "grad_norm": 1.1712204217910767,
+      "learning_rate": 5.617891046495046e-06,
+      "loss": 0.2714,
+      "step": 815
+    },
+    {
+      "epoch": 2.2045760430686405,
+      "grad_norm": 1.1462839841842651,
+      "learning_rate": 5.44549269745842e-06,
+      "loss": 0.2491,
+      "step": 820
+    },
+    {
+      "epoch": 2.218034993270525,
+      "grad_norm": 1.1571354866027832,
+      "learning_rate": 5.275192458996682e-06,
+      "loss": 0.2679,
+      "step": 825
+    },
+    {
+      "epoch": 2.231493943472409,
+      "grad_norm": 1.1075127124786377,
+      "learning_rate": 5.1070277279864926e-06,
+      "loss": 0.2781,
+      "step": 830
+    },
+    {
+      "epoch": 2.2449528936742933,
+      "grad_norm": 1.0444560050964355,
+      "learning_rate": 4.941035432360333e-06,
+      "loss": 0.2483,
+      "step": 835
+    },
+    {
+      "epoch": 2.2584118438761775,
+      "grad_norm": 1.3500826358795166,
+      "learning_rate": 4.777252022997327e-06,
+      "loss": 0.2628,
+      "step": 840
+    },
+    {
+      "epoch": 2.271870794078062,
+      "grad_norm": 1.1868863105773926,
+      "learning_rate": 4.615713465718867e-06,
+      "loss": 0.2592,
+      "step": 845
+    },
+    {
+      "epoch": 2.285329744279946,
+      "grad_norm": 1.0396500825881958,
+      "learning_rate": 4.456455233390751e-06,
+      "loss": 0.2534,
+      "step": 850
+    },
+    {
+      "epoch": 2.2987886944818303,
+      "grad_norm": 1.1607229709625244,
+      "learning_rate": 4.299512298133546e-06,
+      "loss": 0.2782,
+      "step": 855
+    },
+    {
+      "epoch": 2.3122476446837146,
+      "grad_norm": 1.3405728340148926,
+      "learning_rate": 4.144919123642947e-06,
+      "loss": 0.2784,
+      "step": 860
+    },
+    {
+      "epoch": 2.325706594885599,
+      "grad_norm": 1.042685627937317,
+      "learning_rate": 3.992709657621739e-06,
+      "loss": 0.248,
+      "step": 865
+    },
+    {
+      "epoch": 2.339165545087483,
+      "grad_norm": 1.0927951335906982,
+      "learning_rate": 3.842917324325107e-06,
+      "loss": 0.2642,
+      "step": 870
+    },
+    {
+      "epoch": 2.3526244952893673,
+      "grad_norm": 1.212036371231079,
+      "learning_rate": 3.6955750172208763e-06,
+      "loss": 0.2477,
+      "step": 875
+    },
+    {
+      "epoch": 2.3660834454912516,
+      "grad_norm": 1.1431571245193481,
+      "learning_rate": 3.550715091766307e-06,
+      "loss": 0.2751,
+      "step": 880
+    },
+    {
+      "epoch": 2.379542395693136,
+      "grad_norm": 1.0427595376968384,
+      "learning_rate": 3.4083693583030306e-06,
+      "loss": 0.2686,
+      "step": 885
+    },
+    {
+      "epoch": 2.39300134589502,
+      "grad_norm": 1.1862140893936157,
+      "learning_rate": 3.268569075071722e-06,
+      "loss": 0.2837,
+      "step": 890
+    },
+    {
+      "epoch": 2.4064602960969044,
+      "grad_norm": 1.0857174396514893,
+      "learning_rate": 3.131344941347937e-06,
+      "loss": 0.2632,
+      "step": 895
+    },
+    {
+      "epoch": 2.4199192462987886,
+      "grad_norm": 1.2340177297592163,
+      "learning_rate": 2.996727090700794e-06,
+      "loss": 0.2635,
+      "step": 900
+    },
+    {
+      "epoch": 2.433378196500673,
+      "grad_norm": 1.1067768335342407,
+      "learning_rate": 2.86474508437579e-06,
+      "loss": 0.2617,
+      "step": 905
+    },
+    {
+      "epoch": 2.446837146702557,
+      "grad_norm": 1.1215053796768188,
+      "learning_rate": 2.7354279048033524e-06,
+      "loss": 0.2606,
+      "step": 910
+    },
+    {
+      "epoch": 2.4602960969044414,
+      "grad_norm": 1.1256719827651978,
+      "learning_rate": 2.6088039492344707e-06,
+      "loss": 0.2487,
+      "step": 915
+    },
+    {
+      "epoch": 2.4737550471063257,
+      "grad_norm": 1.2089550495147705,
+      "learning_rate": 2.4849010235048513e-06,
+      "loss": 0.2404,
+      "step": 920
+    },
+    {
+      "epoch": 2.48721399730821,
+      "grad_norm": 1.1536446809768677,
+      "learning_rate": 2.3637463359288914e-06,
+      "loss": 0.2633,
+      "step": 925
+    },
+    {
+      "epoch": 2.500672947510094,
+      "grad_norm": 1.0565274953842163,
+      "learning_rate": 2.2453664913249505e-06,
+      "loss": 0.2311,
+      "step": 930
+    },
+    {
+      "epoch": 2.5141318977119784,
+      "grad_norm": 1.1993619203567505,
+      "learning_rate": 2.1297874851730575e-06,
+      "loss": 0.274,
+      "step": 935
+    },
+    {
+      "epoch": 2.5275908479138627,
+      "grad_norm": 1.0851179361343384,
+      "learning_rate": 2.017034697906489e-06,
+      "loss": 0.2326,
+      "step": 940
+    },
+    {
+      "epoch": 2.541049798115747,
+      "grad_norm": 1.1467525959014893,
+      "learning_rate": 1.9071328893383667e-06,
+      "loss": 0.242,
+      "step": 945
+    },
+    {
+      "epoch": 2.554508748317631,
+      "grad_norm": 1.3856874704360962,
+      "learning_rate": 1.8001061932245654e-06,
+      "loss": 0.2445,
+      "step": 950
+    },
+    {
+      "epoch": 2.5679676985195155,
+      "grad_norm": 1.1563737392425537,
+      "learning_rate": 1.6959781119640894e-06,
+      "loss": 0.2311,
+      "step": 955
+    },
+    {
+      "epoch": 2.5814266487213997,
+      "grad_norm": 1.0257714986801147,
+      "learning_rate": 1.594771511438095e-06,
+      "loss": 0.2504,
+      "step": 960
+    },
+    {
+      "epoch": 2.594885598923284,
+      "grad_norm": 1.142681360244751,
+      "learning_rate": 1.4965086159886694e-06,
+      "loss": 0.2349,
+      "step": 965
+    },
+    {
+      "epoch": 2.608344549125168,
+      "grad_norm": 1.1093924045562744,
+      "learning_rate": 1.401211003538519e-06,
+      "loss": 0.2381,
+      "step": 970
+    },
+    {
+      "epoch": 2.6218034993270525,
+      "grad_norm": 1.2210307121276855,
+      "learning_rate": 1.308899600852585e-06,
+      "loss": 0.2722,
+      "step": 975
+    },
+    {
+      "epoch": 2.6352624495289367,
+      "grad_norm": 1.1265455484390259,
+      "learning_rate": 1.2195946789426531e-06,
+      "loss": 0.2578,
+      "step": 980
+    },
+    {
+      "epoch": 2.648721399730821,
+      "grad_norm": 1.0628033876419067,
+      "learning_rate": 1.13331584861597e-06,
+      "loss": 0.2182,
+      "step": 985
+    },
+    {
+      "epoch": 2.6621803499327052,
+      "grad_norm": 1.1813066005706787,
+      "learning_rate": 1.0500820561688374e-06,
+      "loss": 0.2619,
+      "step": 990
+    },
+    {
+      "epoch": 2.6756393001345895,
+      "grad_norm": 1.1125173568725586,
+      "learning_rate": 9.699115792260953e-07,
+      "loss": 0.2933,
+      "step": 995
+    },
+    {
+      "epoch": 2.6890982503364738,
+      "grad_norm": 1.0151184797286987,
+      "learning_rate": 8.928220227275086e-07,
+      "loss": 0.2342,
+      "step": 1000
+    },
+    {
+      "epoch": 2.702557200538358,
+      "grad_norm": 0.9034965634346008,
+      "learning_rate": 8.18830315061801e-07,
+      "loss": 0.2455,
+      "step": 1005
+    },
+    {
+      "epoch": 2.7160161507402423,
+      "grad_norm": 1.1290091276168823,
+      "learning_rate": 7.479527043492984e-07,
+      "loss": 0.237,
+      "step": 1010
+    },
+    {
+      "epoch": 2.7294751009421265,
+      "grad_norm": 1.0831630229949951,
+      "learning_rate": 6.802047548739409e-07,
+      "loss": 0.2242,
+      "step": 1015
+    },
+    {
+      "epoch": 2.742934051144011,
+      "grad_norm": 1.130257248878479,
+      "learning_rate": 6.156013436654617e-07,
+      "loss": 0.2319,
+      "step": 1020
+    },
+    {
+      "epoch": 2.756393001345895,
+      "grad_norm": 1.158980369567871,
+      "learning_rate": 5.541566572324786e-07,
+      "loss": 0.263,
+      "step": 1025
+    },
+    {
+      "epoch": 2.7698519515477793,
+      "grad_norm": 1.0408811569213867,
+      "learning_rate": 4.958841884472409e-07,
+      "loss": 0.2423,
+      "step": 1030
+    },
+    {
+      "epoch": 2.7833109017496636,
+      "grad_norm": 1.1301546096801758,
+      "learning_rate": 4.407967335826585e-07,
+      "loss": 0.2513,
+      "step": 1035
+    },
+    {
+      "epoch": 2.796769851951548,
+      "grad_norm": 1.0434672832489014,
+      "learning_rate": 3.889063895023287e-07,
+      "loss": 0.2385,
+      "step": 1040
+    },
+    {
+      "epoch": 2.810228802153432,
+      "grad_norm": 1.2883914709091187,
+      "learning_rate": 3.4022455100414184e-07,
+      "loss": 0.2629,
+      "step": 1045
+    },
+    {
+      "epoch": 2.8236877523553163,
+      "grad_norm": 1.1323546171188354,
+      "learning_rate": 2.947619083180525e-07,
+      "loss": 0.251,
+      "step": 1050
+    },
+    {
+      "epoch": 2.8371467025572006,
+      "grad_norm": 1.0120676755905151,
+      "learning_rate": 2.5252844475856906e-07,
+      "loss": 0.2367,
+      "step": 1055
+    },
+    {
+      "epoch": 2.850605652759085,
+      "grad_norm": 1.0989749431610107,
+      "learning_rate": 2.1353343453248408e-07,
+      "loss": 0.2462,
+      "step": 1060
+    },
+    {
+      "epoch": 2.864064602960969,
+      "grad_norm": 0.9685646295547485,
+      "learning_rate": 1.777854407023083e-07,
+      "loss": 0.2408,
+      "step": 1065
+    },
+    {
+      "epoch": 2.8775235531628534,
+      "grad_norm": 1.096001148223877,
+      "learning_rate": 1.4529231330588644e-07,
+      "loss": 0.2486,
+      "step": 1070
+    },
+    {
+      "epoch": 2.8909825033647376,
+      "grad_norm": 1.1372036933898926,
+      "learning_rate": 1.160611876325679e-07,
+      "loss": 0.2621,
+      "step": 1075
+    },
+    {
+      "epoch": 2.904441453566622,
+      "grad_norm": 1.0930324792861938,
+      "learning_rate": 9.009848265634669e-08,
+      "loss": 0.2618,
+      "step": 1080
+    },
+    {
+      "epoch": 2.917900403768506,
+      "grad_norm": 0.9669865369796753,
+      "learning_rate": 6.740989962628896e-08,
+      "loss": 0.227,
+      "step": 1085
+    },
+    {
+      "epoch": 2.9313593539703904,
+      "grad_norm": 1.0694055557250977,
+      "learning_rate": 4.800042081458456e-08,
+      "loss": 0.2678,
+      "step": 1090
+    },
+    {
+      "epoch": 2.9448183041722746,
+      "grad_norm": 1.0851000547409058,
+      "learning_rate": 3.187430842245709e-08,
+      "loss": 0.2118,
+      "step": 1095
+    },
+    {
+      "epoch": 2.958277254374159,
+      "grad_norm": 1.1466342210769653,
+      "learning_rate": 1.9035103644222675e-08,
+      "loss": 0.2271,
+      "step": 1100
+    },
+    {
+      "epoch": 2.971736204576043,
+      "grad_norm": 1.0995334386825562,
+      "learning_rate": 9.485625889660288e-09,
+      "loss": 0.2574,
+      "step": 1105
+    },
+    {
+      "epoch": 2.9851951547779274,
+      "grad_norm": 1.0613255500793457,
+      "learning_rate": 3.2279721648870162e-09,
+      "loss": 0.2179,
+      "step": 1110
+    },
+    {
+      "epoch": 2.9986541049798117,
+      "grad_norm": 1.0960551500320435,
+      "learning_rate": 2.6351661186974074e-10,
+      "loss": 0.2189,
+      "step": 1115
+    },
+    {
+      "epoch": 3.0,
+      "step": 1116,
+      "total_flos": 1.5875073592094884e+18,
+      "train_loss": 0.5184633527551928,
+      "train_runtime": 742.8841,
+      "train_samples_per_second": 47.963,
+      "train_steps_per_second": 1.502
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1116,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 2000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.5875073592094884e+18,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

49_128_e3_3e-5/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a96fa255e81544f1dc2ef41869395efbe9db155f4a3a73a7d1fe297e1f7a706a
+size 8145

49_128_e3_3e-5/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff