Upload folder using huggingface_hub

Browse files

Files changed (13) hide show

4_128_e3_3e-5/README.md +63 -0
4_128_e3_3e-5/adapter_config.json +39 -0
4_128_e3_3e-5/adapter_model.safetensors +3 -0
4_128_e3_3e-5/all_results.json +9 -0
4_128_e3_3e-5/config.json +32 -0
4_128_e3_3e-5/merges.txt +0 -0
4_128_e3_3e-5/special_tokens_map.json +45 -0
4_128_e3_3e-5/tokenizer.json +0 -0
4_128_e3_3e-5/tokenizer_config.json +188 -0
4_128_e3_3e-5/train_results.json +9 -0
4_128_e3_3e-5/trainer_state.json +1604 -0
4_128_e3_3e-5/training_args.bin +3 -0
4_128_e3_3e-5/vocab.json +0 -0

4_128_e3_3e-5/README.md ADDED Viewed

	@@ -0,0 +1,63 @@

+---
+library_name: peft
+license: apache-2.0
+base_model: ibm-granite/granite-3.3-8b-base
+tags:
+- alignment-handbook
+- generated_from_trainer
+datasets:
+- data/knowledge_lora_training_data_2000
+model-index:
+- name: 4_128_e3_3e-5
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# 4_128_e3_3e-5
+This model is a LoRA adapter (trained with PEFT; rank 128, alpha 256) for [ibm-granite/granite-3.3-8b-base](https://huggingface.co/ibm-granite/granite-3.3-8b-base), fine-tuned on the data/knowledge_lora_training_data_2000 dataset.
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 3e-05
+- train_batch_size: 2
+- eval_batch_size: 8
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 8
+- gradient_accumulation_steps: 2
+- total_train_batch_size: 32
+- total_eval_batch_size: 64
+- optimizer: adamw_torch with betas=(0.9, 0.95) and epsilon=1e-08 (no additional optimizer arguments)
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.05
+- num_epochs: 3.0
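+### Training results
+Training ran for 3 epochs (1,119 steps, about 781 seconds) and finished with an average training loss of 0.534. No evaluation metrics are reported for this run.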
+### Framework versions
+- PEFT 0.15.2
+- Transformers 4.52.4
+- Pytorch 2.7.0+cu126
+- Datasets 3.6.0
+- Tokenizers 0.21.2

4_128_e3_3e-5/adapter_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "ibm-granite/granite-3.3-8b-base",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "q_proj",
+    "v_proj",
+    "k_proj",
+    "o_proj",
+    "down_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}

4_128_e3_3e-5/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4db078d940fc3e313688dfd11fae025731ae19a2ef462d7ba1e35a87c5add446
+size 791751704

4_128_e3_3e-5/all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 1.647805420300927e+18,
+    "train_loss": 0.5340931572159877,
+    "train_runtime": 781.1418,
+    "train_samples": 11931,
+    "train_samples_per_second": 45.821,
+    "train_steps_per_second": 1.433
+}

4_128_e3_3e-5/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "architectures": [
+    "GraniteForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0,
+  "attention_multiplier": 0.0078125,
+  "bos_token_id": 0,
+  "embedding_multiplier": 12.0,
+  "eos_token_id": 0,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12800,
+  "logits_scaling": 16.0,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "granite",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "pad_token_id": 0,
+  "residual_multiplier": 0.22,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.4",
+  "use_cache": true,
+  "vocab_size": 49152
+}

4_128_e3_3e-5/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

4_128_e3_3e-5/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<fim_prefix>",
+    "<fim_middle>",
+    "<fim_suffix>",
+    "<fim_pad>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<empty_output>",
+    "<commit_before>",
+    "<commit_msg>",
+    "<commit_after>",
+    "<reponame>"
+  ],
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<reponame>",
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

4_128_e3_3e-5/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

4_128_e3_3e-5/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,188 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<fim_prefix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<fim_middle>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<fim_suffix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<fim_pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<filename>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<gh_stars>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<commit_before>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<commit_msg>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "17": {
+      "content": "<commit_after>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "18": {
+      "content": "<reponame>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<fim_prefix>",
+    "<fim_middle>",
+    "<fim_suffix>",
+    "<fim_pad>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<empty_output>",
+    "<commit_before>",
+    "<commit_msg>",
+    "<commit_after>",
+    "<reponame>"
+  ],
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "pad_token": "<reponame>",
+  "padding_side": "left",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>",
+  "vocab_size": 49152
+}

4_128_e3_3e-5/train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 1.647805420300927e+18,
+    "train_loss": 0.5340931572159877,
+    "train_runtime": 781.1418,
+    "train_samples": 11931,
+    "train_samples_per_second": 45.821,
+    "train_steps_per_second": 1.433
+}

4_128_e3_3e-5/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1604 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 1119,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.013404825737265416,
+      "grad_norm": 1.2472357749938965,
+      "learning_rate": 2.1428571428571427e-06,
+      "loss": 1.3007,
+      "step": 5
+    },
+    {
+      "epoch": 0.02680965147453083,
+      "grad_norm": 1.018729567527771,
+      "learning_rate": 4.821428571428572e-06,
+      "loss": 1.2872,
+      "step": 10
+    },
+    {
+      "epoch": 0.040214477211796246,
+      "grad_norm": 0.6834114193916321,
+      "learning_rate": 7.5e-06,
+      "loss": 1.2708,
+      "step": 15
+    },
+    {
+      "epoch": 0.05361930294906166,
+      "grad_norm": 0.924774706363678,
+      "learning_rate": 1.0178571428571429e-05,
+      "loss": 1.1525,
+      "step": 20
+    },
+    {
+      "epoch": 0.06702412868632708,
+      "grad_norm": 0.5801697373390198,
+      "learning_rate": 1.2857142857142857e-05,
+      "loss": 1.2542,
+      "step": 25
+    },
+    {
+      "epoch": 0.08042895442359249,
+      "grad_norm": 0.5304354429244995,
+      "learning_rate": 1.553571428571429e-05,
+      "loss": 1.1908,
+      "step": 30
+    },
+    {
+      "epoch": 0.0938337801608579,
+      "grad_norm": 0.48565009236335754,
+      "learning_rate": 1.8214285714285712e-05,
+      "loss": 1.1505,
+      "step": 35
+    },
+    {
+      "epoch": 0.10723860589812333,
+      "grad_norm": 0.46631884574890137,
+      "learning_rate": 2.089285714285714e-05,
+      "loss": 1.2448,
+      "step": 40
+    },
+    {
+      "epoch": 0.12064343163538874,
+      "grad_norm": 0.600545346736908,
+      "learning_rate": 2.357142857142857e-05,
+      "loss": 1.1581,
+      "step": 45
+    },
+    {
+      "epoch": 0.13404825737265416,
+      "grad_norm": 0.527691662311554,
+      "learning_rate": 2.625e-05,
+      "loss": 1.1731,
+      "step": 50
+    },
+    {
+      "epoch": 0.14745308310991956,
+      "grad_norm": 0.48526814579963684,
+      "learning_rate": 2.892857142857143e-05,
+      "loss": 1.1426,
+      "step": 55
+    },
+    {
+      "epoch": 0.16085790884718498,
+      "grad_norm": 0.410771906375885,
+      "learning_rate": 2.999941043167295e-05,
+      "loss": 1.1407,
+      "step": 60
+    },
+    {
+      "epoch": 0.1742627345844504,
+      "grad_norm": 0.5673904418945312,
+      "learning_rate": 2.999580768195271e-05,
+      "loss": 1.0899,
+      "step": 65
+    },
+    {
+      "epoch": 0.1876675603217158,
+      "grad_norm": 0.5336377024650574,
+      "learning_rate": 2.998893050620046e-05,
+      "loss": 1.0582,
+      "step": 70
+    },
+    {
+      "epoch": 0.20107238605898123,
+      "grad_norm": 0.53374844789505,
+      "learning_rate": 2.9978780406089445e-05,
+      "loss": 1.0804,
+      "step": 75
+    },
+    {
+      "epoch": 0.21447721179624665,
+      "grad_norm": 0.4427356421947479,
+      "learning_rate": 2.996535959795591e-05,
+      "loss": 1.0686,
+      "step": 80
+    },
+    {
+      "epoch": 0.22788203753351208,
+      "grad_norm": 0.48419925570487976,
+      "learning_rate": 2.994867101231513e-05,
+      "loss": 1.0867,
+      "step": 85
+    },
+    {
+      "epoch": 0.24128686327077747,
+      "grad_norm": 0.5140860676765442,
+      "learning_rate": 2.9928718293221532e-05,
+      "loss": 1.0337,
+      "step": 90
+    },
+    {
+      "epoch": 0.2546916890080429,
+      "grad_norm": 0.5096598863601685,
+      "learning_rate": 2.9905505797472965e-05,
+      "loss": 1.0434,
+      "step": 95
+    },
+    {
+      "epoch": 0.2680965147453083,
+      "grad_norm": 0.5465592741966248,
+      "learning_rate": 2.9879038593659403e-05,
+      "loss": 1.0407,
+      "step": 100
+    },
+    {
+      "epoch": 0.28150134048257375,
+      "grad_norm": 0.6191142797470093,
+      "learning_rate": 2.984932246105616e-05,
+      "loss": 1.0505,
+      "step": 105
+    },
+    {
+      "epoch": 0.2949061662198391,
+      "grad_norm": 0.5154615640640259,
+      "learning_rate": 2.981636388836196e-05,
+      "loss": 0.9597,
+      "step": 110
+    },
+    {
+      "epoch": 0.30831099195710454,
+      "grad_norm": 0.6329454183578491,
+      "learning_rate": 2.978017007228208e-05,
+      "loss": 0.9858,
+      "step": 115
+    },
+    {
+      "epoch": 0.32171581769436997,
+      "grad_norm": 0.575188934803009,
+      "learning_rate": 2.9740748915956932e-05,
+      "loss": 0.9933,
+      "step": 120
+    },
+    {
+      "epoch": 0.3351206434316354,
+      "grad_norm": 0.502245306968689,
+      "learning_rate": 2.9698109027236335e-05,
+      "loss": 0.9603,
+      "step": 125
+    },
+    {
+      "epoch": 0.3485254691689008,
+      "grad_norm": 0.5507754683494568,
+      "learning_rate": 2.965225971679996e-05,
+      "loss": 0.981,
+      "step": 130
+    },
+    {
+      "epoch": 0.36193029490616624,
+      "grad_norm": 0.7022887468338013,
+      "learning_rate": 2.9603210996124257e-05,
+      "loss": 0.9451,
+      "step": 135
+    },
+    {
+      "epoch": 0.3753351206434316,
+      "grad_norm": 0.611986517906189,
+      "learning_rate": 2.9550973575296424e-05,
+      "loss": 0.9538,
+      "step": 140
+    },
+    {
+      "epoch": 0.38873994638069703,
+      "grad_norm": 0.8291884064674377,
+      "learning_rate": 2.949555886067578e-05,
+      "loss": 0.995,
+      "step": 145
+    },
+    {
+      "epoch": 0.40214477211796246,
+      "grad_norm": 0.6599870324134827,
+      "learning_rate": 2.9436978952403103e-05,
+      "loss": 0.8675,
+      "step": 150
+    },
+    {
+      "epoch": 0.4155495978552279,
+      "grad_norm": 0.6758149266242981,
+      "learning_rate": 2.937524664175851e-05,
+      "loss": 0.9174,
+      "step": 155
+    },
+    {
+      "epoch": 0.4289544235924933,
+      "grad_norm": 0.6708205938339233,
+      "learning_rate": 2.931037540836839e-05,
+      "loss": 0.8959,
+      "step": 160
+    },
+    {
+      "epoch": 0.44235924932975873,
+      "grad_norm": 0.7568298578262329,
+      "learning_rate": 2.9242379417262052e-05,
+      "loss": 0.9291,
+      "step": 165
+    },
+    {
+      "epoch": 0.45576407506702415,
+      "grad_norm": 0.7548916339874268,
+      "learning_rate": 2.9171273515778703e-05,
+      "loss": 0.946,
+      "step": 170
+    },
+    {
+      "epoch": 0.4691689008042895,
+      "grad_norm": 0.737224280834198,
+      "learning_rate": 2.909707323032545e-05,
+      "loss": 0.8876,
+      "step": 175
+    },
+    {
+      "epoch": 0.48257372654155495,
+      "grad_norm": 0.7204071879386902,
+      "learning_rate": 2.901979476298699e-05,
+      "loss": 0.8444,
+      "step": 180
+    },
+    {
+      "epoch": 0.4959785522788204,
+      "grad_norm": 0.794966995716095,
+      "learning_rate": 2.893945498798781e-05,
+      "loss": 0.8443,
+      "step": 185
+    },
+    {
+      "epoch": 0.5093833780160858,
+      "grad_norm": 0.8569888472557068,
+      "learning_rate": 2.885607144800759e-05,
+      "loss": 0.8748,
+      "step": 190
+    },
+    {
+      "epoch": 0.5227882037533512,
+      "grad_norm": 0.708555281162262,
+      "learning_rate": 2.876966235035064e-05,
+      "loss": 0.8526,
+      "step": 195
+    },
+    {
+      "epoch": 0.5361930294906166,
+      "grad_norm": 0.7718046307563782,
+      "learning_rate": 2.8680246562970253e-05,
+      "loss": 0.8597,
+      "step": 200
+    },
+    {
+      "epoch": 0.5495978552278821,
+      "grad_norm": 0.8034407496452332,
+      "learning_rate": 2.8587843610348735e-05,
+      "loss": 0.8829,
+      "step": 205
+    },
+    {
+      "epoch": 0.5630026809651475,
+      "grad_norm": 0.8720042705535889,
+      "learning_rate": 2.8492473669234143e-05,
+      "loss": 0.8171,
+      "step": 210
+    },
+    {
+      "epoch": 0.5764075067024129,
+      "grad_norm": 0.851708173751831,
+      "learning_rate": 2.8394157564234564e-05,
+      "loss": 0.8083,
+      "step": 215
+    },
+    {
+      "epoch": 0.5898123324396782,
+      "grad_norm": 0.7046942710876465,
+      "learning_rate": 2.8292916763270918e-05,
+      "loss": 0.8058,
+      "step": 220
+    },
+    {
+      "epoch": 0.6032171581769437,
+      "grad_norm": 0.7370072603225708,
+      "learning_rate": 2.818877337288934e-05,
+      "loss": 0.7986,
+      "step": 225
+    },
+    {
+      "epoch": 0.6166219839142091,
+      "grad_norm": 0.8055124878883362,
+      "learning_rate": 2.808175013343404e-05,
+      "loss": 0.8417,
+      "step": 230
+    },
+    {
+      "epoch": 0.6300268096514745,
+      "grad_norm": 0.9540407061576843,
+      "learning_rate": 2.7971870414081845e-05,
+      "loss": 0.7665,
+      "step": 235
+    },
+    {
+      "epoch": 0.6434316353887399,
+      "grad_norm": 0.869381308555603,
+      "learning_rate": 2.785915820773937e-05,
+      "loss": 0.8064,
+      "step": 240
+    },
+    {
+      "epoch": 0.6568364611260054,
+      "grad_norm": 0.8873690366744995,
+      "learning_rate": 2.774363812580405e-05,
+      "loss": 0.7943,
+      "step": 245
+    },
+    {
+      "epoch": 0.6702412868632708,
+      "grad_norm": 0.8190541863441467,
+      "learning_rate": 2.7625335392790056e-05,
+      "loss": 0.7807,
+      "step": 250
+    },
+    {
+      "epoch": 0.6836461126005362,
+      "grad_norm": 0.7668455839157104,
+      "learning_rate": 2.75042758408204e-05,
+      "loss": 0.7612,
+      "step": 255
+    },
+    {
+      "epoch": 0.6970509383378016,
+      "grad_norm": 0.925933837890625,
+      "learning_rate": 2.7380485903986317e-05,
+      "loss": 0.8064,
+      "step": 260
+    },
+    {
+      "epoch": 0.710455764075067,
+      "grad_norm": 0.8035215139389038,
+      "learning_rate": 2.7253992612575225e-05,
+      "loss": 0.7501,
+      "step": 265
+    },
+    {
+      "epoch": 0.7238605898123325,
+      "grad_norm": 0.8842463493347168,
+      "learning_rate": 2.712482358716848e-05,
+      "loss": 0.7334,
+      "step": 270
+    },
+    {
+      "epoch": 0.7372654155495979,
+      "grad_norm": 1.0536301136016846,
+      "learning_rate": 2.6993007032610297e-05,
+      "loss": 0.7069,
+      "step": 275
+    },
+    {
+      "epoch": 0.7506702412868632,
+      "grad_norm": 0.8780192732810974,
+      "learning_rate": 2.6858571731848997e-05,
+      "loss": 0.727,
+      "step": 280
+    },
+    {
+      "epoch": 0.7640750670241286,
+      "grad_norm": 0.8872283101081848,
+      "learning_rate": 2.672154703965212e-05,
+      "loss": 0.733,
+      "step": 285
+    },
+    {
+      "epoch": 0.7774798927613941,
+      "grad_norm": 0.8710986375808716,
+      "learning_rate": 2.6581962876196593e-05,
+      "loss": 0.6887,
+      "step": 290
+    },
+    {
+      "epoch": 0.7908847184986595,
+      "grad_norm": 0.8486837148666382,
+      "learning_rate": 2.643984972053551e-05,
+      "loss": 0.7331,
+      "step": 295
+    },
+    {
+      "epoch": 0.8042895442359249,
+      "grad_norm": 1.0940479040145874,
+      "learning_rate": 2.6295238603942832e-05,
+      "loss": 0.6835,
+      "step": 300
+    },
+    {
+      "epoch": 0.8176943699731903,
+      "grad_norm": 0.8243003487586975,
+      "learning_rate": 2.6148161103137515e-05,
+      "loss": 0.7032,
+      "step": 305
+    },
+    {
+      "epoch": 0.8310991957104558,
+      "grad_norm": 0.9313507676124573,
+      "learning_rate": 2.599864933338854e-05,
+      "loss": 0.686,
+      "step": 310
+    },
+    {
+      "epoch": 0.8445040214477212,
+      "grad_norm": 0.8665029406547546,
+      "learning_rate": 2.5846735941502356e-05,
+      "loss": 0.6883,
+      "step": 315
+    },
+    {
+      "epoch": 0.8579088471849866,
+      "grad_norm": 1.0570703744888306,
+      "learning_rate": 2.5692454098694256e-05,
+      "loss": 0.69,
+      "step": 320
+    },
+    {
+      "epoch": 0.871313672922252,
+      "grad_norm": 1.0085052251815796,
+      "learning_rate": 2.553583749334522e-05,
+      "loss": 0.7056,
+      "step": 325
+    },
+    {
+      "epoch": 0.8847184986595175,
+      "grad_norm": 0.9562423825263977,
+      "learning_rate": 2.537692032364587e-05,
+      "loss": 0.678,
+      "step": 330
+    },
+    {
+      "epoch": 0.8981233243967829,
+      "grad_norm": 0.9648351669311523,
+      "learning_rate": 2.521573729012907e-05,
+      "loss": 0.6541,
+      "step": 335
+    },
+    {
+      "epoch": 0.9115281501340483,
+      "grad_norm": 1.0201152563095093,
+      "learning_rate": 2.5052323588092878e-05,
+      "loss": 0.6809,
+      "step": 340
+    },
+    {
+      "epoch": 0.9249329758713136,
+      "grad_norm": 0.9566739201545715,
+      "learning_rate": 2.4886714899915415e-05,
+      "loss": 0.6185,
+      "step": 345
+    },
+    {
+      "epoch": 0.938337801608579,
+      "grad_norm": 1.1028639078140259,
+      "learning_rate": 2.4718947387263403e-05,
+      "loss": 0.6555,
+      "step": 350
+    },
+    {
+      "epoch": 0.9517426273458445,
+      "grad_norm": 1.0449227094650269,
+      "learning_rate": 2.4549057683196054e-05,
+      "loss": 0.6629,
+      "step": 355
+    },
+    {
+      "epoch": 0.9651474530831099,
+      "grad_norm": 1.186073899269104,
+      "learning_rate": 2.4377082884166016e-05,
+      "loss": 0.6665,
+      "step": 360
+    },
+    {
+      "epoch": 0.9785522788203753,
+      "grad_norm": 0.9336662292480469,
+      "learning_rate": 2.4203060541919136e-05,
+      "loss": 0.5925,
+      "step": 365
+    },
+    {
+      "epoch": 0.9919571045576407,
+      "grad_norm": 0.9751836657524109,
+      "learning_rate": 2.4027028655294804e-05,
+      "loss": 0.5957,
+      "step": 370
+    },
+    {
+      "epoch": 1.0053619302949062,
+      "grad_norm": 1.0686187744140625,
+      "learning_rate": 2.384902566192867e-05,
+      "loss": 0.5863,
+      "step": 375
+    },
+    {
+      "epoch": 1.0187667560321716,
+      "grad_norm": 1.0750106573104858,
+      "learning_rate": 2.366909042985956e-05,
+      "loss": 0.5347,
+      "step": 380
+    },
+    {
+      "epoch": 1.032171581769437,
+      "grad_norm": 1.0693711042404175,
+      "learning_rate": 2.3487262249042412e-05,
+      "loss": 0.5365,
+      "step": 385
+    },
+    {
+      "epoch": 1.0455764075067024,
+      "grad_norm": 0.9094949960708618,
+      "learning_rate": 2.330358082276905e-05,
+      "loss": 0.5336,
+      "step": 390
+    },
+    {
+      "epoch": 1.0589812332439679,
+      "grad_norm": 1.031535267829895,
+      "learning_rate": 2.3118086258998748e-05,
+      "loss": 0.5705,
+      "step": 395
+    },
+    {
+      "epoch": 1.0723860589812333,
+      "grad_norm": 0.9758612513542175,
+      "learning_rate": 2.2930819061600413e-05,
+      "loss": 0.5211,
+      "step": 400
+    },
+    {
+      "epoch": 1.0857908847184987,
+      "grad_norm": 1.0757410526275635,
+      "learning_rate": 2.27418201215083e-05,
+      "loss": 0.5639,
+      "step": 405
+    },
+    {
+      "epoch": 1.0991957104557641,
+      "grad_norm": 1.033258080482483,
+      "learning_rate": 2.2551130707793243e-05,
+      "loss": 0.5209,
+      "step": 410
+    },
+    {
+      "epoch": 1.1126005361930296,
+      "grad_norm": 2.3148107528686523,
+      "learning_rate": 2.2358792458651304e-05,
+      "loss": 0.5049,
+      "step": 415
+    },
+    {
+      "epoch": 1.126005361930295,
+      "grad_norm": 1.3143815994262695,
+      "learning_rate": 2.2164847372311804e-05,
+      "loss": 0.5074,
+      "step": 420
+    },
+    {
+      "epoch": 1.1394101876675604,
+      "grad_norm": 1.0450321435928345,
+      "learning_rate": 2.1969337797866772e-05,
+      "loss": 0.5174,
+      "step": 425
+    },
+    {
+      "epoch": 1.1528150134048256,
+      "grad_norm": 1.199095606803894,
+      "learning_rate": 2.1772306426023737e-05,
+      "loss": 0.4997,
+      "step": 430
+    },
+    {
+      "epoch": 1.1662198391420913,
+      "grad_norm": 1.0282012224197388,
+      "learning_rate": 2.1573796279783987e-05,
+      "loss": 0.4671,
+      "step": 435
+    },
+    {
+      "epoch": 1.1796246648793565,
+      "grad_norm": 1.1715396642684937,
+      "learning_rate": 2.137385070504821e-05,
+      "loss": 0.5437,
+      "step": 440
+    },
+    {
+      "epoch": 1.193029490616622,
+      "grad_norm": 1.0932139158248901,
+      "learning_rate": 2.1172513361151643e-05,
+      "loss": 0.4993,
+      "step": 445
+    },
+    {
+      "epoch": 1.2064343163538873,
+      "grad_norm": 0.9794313311576843,
+      "learning_rate": 2.0969828211330824e-05,
+      "loss": 0.5142,
+      "step": 450
+    },
+    {
+      "epoch": 1.2198391420911527,
+      "grad_norm": 0.9845211505889893,
+      "learning_rate": 2.0765839513123923e-05,
+      "loss": 0.4902,
+      "step": 455
+    },
+    {
+      "epoch": 1.2332439678284182,
+      "grad_norm": 1.1693155765533447,
+      "learning_rate": 2.056059180870684e-05,
+      "loss": 0.4812,
+      "step": 460
+    },
+    {
+      "epoch": 1.2466487935656836,
+      "grad_norm": 1.0942554473876953,
+      "learning_rate": 2.0354129915167175e-05,
+      "loss": 0.5094,
+      "step": 465
+    },
+    {
+      "epoch": 1.260053619302949,
+      "grad_norm": 1.0578761100769043,
+      "learning_rate": 2.014649891471811e-05,
+      "loss": 0.4843,
+      "step": 470
+    },
+    {
+      "epoch": 1.2734584450402144,
+      "grad_norm": 1.0976276397705078,
+      "learning_rate": 1.9937744144854446e-05,
+      "loss": 0.4965,
+      "step": 475
+    },
+    {
+      "epoch": 1.2868632707774799,
+      "grad_norm": 1.304211139678955,
+      "learning_rate": 1.9727911188452903e-05,
+      "loss": 0.5413,
+      "step": 480
+    },
+    {
+      "epoch": 1.3002680965147453,
+      "grad_norm": 1.3818888664245605,
+      "learning_rate": 1.9517045863818792e-05,
+      "loss": 0.5436,
+      "step": 485
+    },
+    {
+      "epoch": 1.3136729222520107,
+      "grad_norm": 1.1613248586654663,
+      "learning_rate": 1.930519421468133e-05,
+      "loss": 0.4902,
+      "step": 490
+    },
+    {
+      "epoch": 1.3270777479892761,
+      "grad_norm": 1.0429706573486328,
+      "learning_rate": 1.9092402500139693e-05,
+      "loss": 0.5013,
+      "step": 495
+    },
+    {
+      "epoch": 1.3404825737265416,
+      "grad_norm": 1.0879743099212646,
+      "learning_rate": 1.8878717184562078e-05,
+      "loss": 0.4853,
+      "step": 500
+    },
+    {
+      "epoch": 1.353887399463807,
+      "grad_norm": 1.1582545042037964,
+      "learning_rate": 1.8664184927439907e-05,
+      "loss": 0.4574,
+      "step": 505
+    },
+    {
+      "epoch": 1.3672922252010724,
+      "grad_norm": 1.0746876001358032,
+      "learning_rate": 1.8448852573199465e-05,
+      "loss": 0.4626,
+      "step": 510
+    },
+    {
+      "epoch": 1.3806970509383378,
+      "grad_norm": 1.1986364126205444,
+      "learning_rate": 1.823276714097311e-05,
+      "loss": 0.4319,
+      "step": 515
+    },
+    {
+      "epoch": 1.3941018766756033,
+      "grad_norm": 1.1719796657562256,
+      "learning_rate": 1.8015975814332373e-05,
+      "loss": 0.4701,
+      "step": 520
+    },
+    {
+      "epoch": 1.4075067024128687,
+      "grad_norm": 1.3504611253738403,
+      "learning_rate": 1.779852593098513e-05,
+      "loss": 0.436,
+      "step": 525
+    },
+    {
+      "epoch": 1.420911528150134,
+      "grad_norm": 1.1986589431762695,
+      "learning_rate": 1.7580464972439137e-05,
+      "loss": 0.4929,
+      "step": 530
+    },
+    {
+      "epoch": 1.4343163538873995,
+      "grad_norm": 1.1194978952407837,
+      "learning_rate": 1.736184055363414e-05,
+      "loss": 0.4459,
+      "step": 535
+    },
+    {
+      "epoch": 1.447721179624665,
+      "grad_norm": 1.290337085723877,
+      "learning_rate": 1.7142700412544867e-05,
+      "loss": 0.4153,
+      "step": 540
+    },
+    {
+      "epoch": 1.4611260053619302,
+      "grad_norm": 0.9937898516654968,
+      "learning_rate": 1.692309239975715e-05,
+      "loss": 0.4374,
+      "step": 545
+    },
+    {
+      "epoch": 1.4745308310991958,
+      "grad_norm": 1.1232582330703735,
+      "learning_rate": 1.670306446801947e-05,
+      "loss": 0.4724,
+      "step": 550
+    },
+    {
+      "epoch": 1.487935656836461,
+      "grad_norm": 1.0209157466888428,
+      "learning_rate": 1.6482664661772153e-05,
+      "loss": 0.4321,
+      "step": 555
+    },
+    {
+      "epoch": 1.5013404825737267,
+      "grad_norm": 1.0886987447738647,
+      "learning_rate": 1.626194110665661e-05,
+      "loss": 0.4133,
+      "step": 560
+    },
+    {
+      "epoch": 1.5147453083109919,
+      "grad_norm": 1.0784316062927246,
+      "learning_rate": 1.6040941999006768e-05,
+      "loss": 0.4336,
+      "step": 565
+    },
+    {
+      "epoch": 1.5281501340482575,
+      "grad_norm": 1.1950879096984863,
+      "learning_rate": 1.581971559532515e-05,
+      "loss": 0.4528,
+      "step": 570
+    },
+    {
+      "epoch": 1.5415549597855227,
+      "grad_norm": 1.1852750778198242,
+      "learning_rate": 1.559831020174576e-05,
+      "loss": 0.4013,
+      "step": 575
+    },
+    {
+      "epoch": 1.5549597855227884,
+      "grad_norm": 1.2094613313674927,
+      "learning_rate": 1.5376774163486104e-05,
+      "loss": 0.4055,
+      "step": 580
+    },
+    {
+      "epoch": 1.5683646112600536,
+      "grad_norm": 1.06105375289917,
+      "learning_rate": 1.5155155854290777e-05,
+      "loss": 0.4311,
+      "step": 585
+    },
+    {
+      "epoch": 1.5817694369973192,
+      "grad_norm": 1.1074191331863403,
+      "learning_rate": 1.493350366586873e-05,
+      "loss": 0.449,
+      "step": 590
+    },
+    {
+      "epoch": 1.5951742627345844,
+      "grad_norm": 1.2042276859283447,
+      "learning_rate": 1.4711865997326648e-05,
+      "loss": 0.4303,
+      "step": 595
+    },
+    {
+      "epoch": 1.6085790884718498,
+      "grad_norm": 1.2283929586410522,
+      "learning_rate": 1.4490291244600712e-05,
+      "loss": 0.4116,
+      "step": 600
+    },
+    {
+      "epoch": 1.6219839142091153,
+      "grad_norm": 1.035912036895752,
+      "learning_rate": 1.4268827789889054e-05,
+      "loss": 0.4216,
+      "step": 605
+    },
+    {
+      "epoch": 1.6353887399463807,
+      "grad_norm": 1.076318383216858,
+      "learning_rate": 1.4047523991087194e-05,
+      "loss": 0.3904,
+      "step": 610
+    },
+    {
+      "epoch": 1.648793565683646,
+      "grad_norm": 1.3118699789047241,
+      "learning_rate": 1.3826428171228824e-05,
+      "loss": 0.3902,
+      "step": 615
+    },
+    {
+      "epoch": 1.6621983914209115,
+      "grad_norm": 1.1527796983718872,
+      "learning_rate": 1.3605588607934153e-05,
+      "loss": 0.3938,
+      "step": 620
+    },
+    {
+      "epoch": 1.675603217158177,
+      "grad_norm": 1.0946987867355347,
+      "learning_rate": 1.3385053522868229e-05,
+      "loss": 0.3789,
+      "step": 625
+    },
+    {
+      "epoch": 1.6890080428954424,
+      "grad_norm": 1.0634331703186035,
+      "learning_rate": 1.3164871071211399e-05,
+      "loss": 0.3894,
+      "step": 630
+    },
+    {
+      "epoch": 1.7024128686327078,
+      "grad_norm": 1.1408213376998901,
+      "learning_rate": 1.2945089331144364e-05,
+      "loss": 0.4033,
+      "step": 635
+    },
+    {
+      "epoch": 1.7158176943699732,
+      "grad_norm": 1.238582968711853,
+      "learning_rate": 1.2725756293350011e-05,
+      "loss": 0.4068,
+      "step": 640
+    },
+    {
+      "epoch": 1.7292225201072386,
+      "grad_norm": 1.1031956672668457,
+      "learning_rate": 1.2506919850534343e-05,
+      "loss": 0.3751,
+      "step": 645
+    },
+    {
+      "epoch": 1.742627345844504,
+      "grad_norm": 0.9783279299736023,
+      "learning_rate": 1.2288627786968826e-05,
+      "loss": 0.3667,
+      "step": 650
+    },
+    {
+      "epoch": 1.7560321715817695,
+      "grad_norm": 1.234369158744812,
+      "learning_rate": 1.2070927768056399e-05,
+      "loss": 0.3462,
+      "step": 655
+    },
+    {
+      "epoch": 1.7694369973190347,
+      "grad_norm": 1.101770043373108,
+      "learning_rate": 1.1853867329923436e-05,
+      "loss": 0.3976,
+      "step": 660
+    },
+    {
+      "epoch": 1.7828418230563003,
+      "grad_norm": 1.1363108158111572,
+      "learning_rate": 1.163749386903995e-05,
+      "loss": 0.3607,
+      "step": 665
+    },
+    {
+      "epoch": 1.7962466487935655,
+      "grad_norm": 1.2321057319641113,
+      "learning_rate": 1.1421854631870291e-05,
+      "loss": 0.384,
+      "step": 670
+    },
+    {
+      "epoch": 1.8096514745308312,
+      "grad_norm": 1.101366400718689,
+      "learning_rate": 1.1206996704556575e-05,
+      "loss": 0.4095,
+      "step": 675
+    },
+    {
+      "epoch": 1.8230563002680964,
+      "grad_norm": 1.4615360498428345,
+      "learning_rate": 1.0992967002637148e-05,
+      "loss": 0.3794,
+      "step": 680
+    },
+    {
+      "epoch": 1.836461126005362,
+      "grad_norm": 1.1510567665100098,
+      "learning_rate": 1.0779812260802303e-05,
+      "loss": 0.3664,
+      "step": 685
+    },
+    {
+      "epoch": 1.8498659517426272,
+      "grad_norm": 1.039561152458191,
+      "learning_rate": 1.056757902268945e-05,
+      "loss": 0.4016,
+      "step": 690
+    },
+    {
+      "epoch": 1.863270777479893,
+      "grad_norm": 1.1654466390609741,
+      "learning_rate": 1.035631363072005e-05,
+      "loss": 0.3947,
+      "step": 695
+    },
+    {
+      "epoch": 1.876675603217158,
+      "grad_norm": 1.1753458976745605,
+      "learning_rate": 1.014606221598046e-05,
+      "loss": 0.3328,
+      "step": 700
+    },
+    {
+      "epoch": 1.8900804289544237,
+      "grad_norm": 1.1596604585647583,
+      "learning_rate": 9.936870688148924e-06,
+      "loss": 0.386,
+      "step": 705
+    },
+    {
+      "epoch": 1.903485254691689,
+      "grad_norm": 1.0891685485839844,
+      "learning_rate": 9.728784725470913e-06,
+      "loss": 0.3767,
+      "step": 710
+    },
+    {
+      "epoch": 1.9168900804289544,
+      "grad_norm": 1.1352084875106812,
+      "learning_rate": 9.521849764785018e-06,
+      "loss": 0.3346,
+      "step": 715
+    },
+    {
+      "epoch": 1.9302949061662198,
+      "grad_norm": 1.1333717107772827,
+      "learning_rate": 9.31611099160152e-06,
+      "loss": 0.3706,
+      "step": 720
+    },
+    {
+      "epoch": 1.9436997319034852,
+      "grad_norm": 1.1660853624343872,
+      "learning_rate": 9.111613330235866e-06,
+      "loss": 0.3489,
+      "step": 725
+    },
+    {
+      "epoch": 1.9571045576407506,
+      "grad_norm": 1.1579868793487549,
+      "learning_rate": 8.90840143399917e-06,
+      "loss": 0.3801,
+      "step": 730
+    },
+    {
+      "epoch": 1.970509383378016,
+      "grad_norm": 1.192337155342102,
+      "learning_rate": 8.706519675447898e-06,
+      "loss": 0.3304,
+      "step": 735
+    },
+    {
+      "epoch": 1.9839142091152815,
+      "grad_norm": 1.2775137424468994,
+      "learning_rate": 8.506012136694832e-06,
+      "loss": 0.3452,
+      "step": 740
+    },
+    {
+      "epoch": 1.997319034852547,
+      "grad_norm": 1.1246819496154785,
+      "learning_rate": 8.306922599783491e-06,
+      "loss": 0.362,
+      "step": 745
+    },
+    {
+      "epoch": 2.0107238605898123,
+      "grad_norm": 0.9721271991729736,
+      "learning_rate": 8.109294537128057e-06,
+      "loss": 0.3169,
+      "step": 750
+    },
+    {
+      "epoch": 2.0241286863270775,
+      "grad_norm": 1.1820639371871948,
+      "learning_rate": 7.91317110202087e-06,
+      "loss": 0.2801,
+      "step": 755
+    },
+    {
+      "epoch": 2.037533512064343,
+      "grad_norm": 1.1740771532058716,
+      "learning_rate": 7.718595119209691e-06,
+      "loss": 0.3133,
+      "step": 760
+    },
+    {
+      "epoch": 2.0509383378016084,
+      "grad_norm": 1.3226786851882935,
+      "learning_rate": 7.525609075546649e-06,
+      "loss": 0.2753,
+      "step": 765
+    },
+    {
+      "epoch": 2.064343163538874,
+      "grad_norm": 1.2498587369918823,
+      "learning_rate": 7.334255110710933e-06,
+      "loss": 0.2996,
+      "step": 770
+    },
+    {
+      "epoch": 2.0777479892761392,
+      "grad_norm": 1.1227176189422607,
+      "learning_rate": 7.1445750080073964e-06,
+      "loss": 0.2727,
+      "step": 775
+    },
+    {
+      "epoch": 2.091152815013405,
+      "grad_norm": 1.1149563789367676,
+      "learning_rate": 6.956610185242891e-06,
+      "loss": 0.2945,
+      "step": 780
+    },
+    {
+      "epoch": 2.10455764075067,
+      "grad_norm": 1.2285109758377075,
+      "learning_rate": 6.770401685682417e-06,
+      "loss": 0.2762,
+      "step": 785
+    },
+    {
+      "epoch": 2.1179624664879357,
+      "grad_norm": 1.1361923217773438,
+      "learning_rate": 6.585990169087112e-06,
+      "loss": 0.2984,
+      "step": 790
+    },
+    {
+      "epoch": 2.131367292225201,
+      "grad_norm": 1.0839574337005615,
+      "learning_rate": 6.40341590283593e-06,
+      "loss": 0.2708,
+      "step": 795
+    },
+    {
+      "epoch": 2.1447721179624666,
+      "grad_norm": 1.0973879098892212,
+      "learning_rate": 6.222718753132994e-06,
+      "loss": 0.2721,
+      "step": 800
+    },
+    {
+      "epoch": 2.158176943699732,
+      "grad_norm": 1.155579686164856,
+      "learning_rate": 6.043938176302596e-06,
+      "loss": 0.2689,
+      "step": 805
+    },
+    {
+      "epoch": 2.1715817694369974,
+      "grad_norm": 1.289603590965271,
+      "learning_rate": 5.8671132101736625e-06,
+      "loss": 0.3008,
+      "step": 810
+    },
+    {
+      "epoch": 2.1849865951742626,
+      "grad_norm": 1.242112159729004,
+      "learning_rate": 5.692282465555585e-06,
+      "loss": 0.254,
+      "step": 815
+    },
+    {
+      "epoch": 2.1983914209115283,
+      "grad_norm": 1.2852767705917358,
+      "learning_rate": 5.519484117807341e-06,
+      "loss": 0.2664,
+      "step": 820
+    },
+    {
+      "epoch": 2.2117962466487935,
+      "grad_norm": 1.2217613458633423,
+      "learning_rate": 5.348755898501662e-06,
+      "loss": 0.288,
+      "step": 825
+    },
+    {
+      "epoch": 2.225201072386059,
+      "grad_norm": 1.1166845560073853,
+      "learning_rate": 5.180135087186101e-06,
+      "loss": 0.2741,
+      "step": 830
+    },
+    {
+      "epoch": 2.2386058981233243,
+      "grad_norm": 1.089866280555725,
+      "learning_rate": 5.013658503242845e-06,
+      "loss": 0.3089,
+      "step": 835
+    },
+    {
+      "epoch": 2.25201072386059,
+      "grad_norm": 1.0453401803970337,
+      "learning_rate": 4.849362497848947e-06,
+      "loss": 0.2575,
+      "step": 840
+    },
+    {
+      "epoch": 2.265415549597855,
+      "grad_norm": 1.0049515962600708,
+      "learning_rate": 4.687282946038842e-06,
+      "loss": 0.2697,
+      "step": 845
+    },
+    {
+      "epoch": 2.278820375335121,
+      "grad_norm": 1.1833009719848633,
+      "learning_rate": 4.527455238870821e-06,
+      "loss": 0.2894,
+      "step": 850
+    },
+    {
+      "epoch": 2.292225201072386,
+      "grad_norm": 1.3225071430206299,
+      "learning_rate": 4.3699142756991635e-06,
+      "loss": 0.2672,
+      "step": 855
+    },
+    {
+      "epoch": 2.3056300268096512,
+      "grad_norm": 1.1755006313323975,
+      "learning_rate": 4.2146944565536485e-06,
+      "loss": 0.2879,
+      "step": 860
+    },
+    {
+      "epoch": 2.319034852546917,
+      "grad_norm": 1.1457116603851318,
+      "learning_rate": 4.061829674628116e-06,
+      "loss": 0.2907,
+      "step": 865
+    },
+    {
+      "epoch": 2.3324396782841825,
+      "grad_norm": 1.122604250907898,
+      "learning_rate": 3.911353308879673e-06,
+      "loss": 0.2375,
+      "step": 870
+    },
+    {
+      "epoch": 2.3458445040214477,
+      "grad_norm": 1.0773766040802002,
+      "learning_rate": 3.763298216740176e-06,
+      "loss": 0.2546,
+      "step": 875
+    },
+    {
+      "epoch": 2.359249329758713,
+      "grad_norm": 1.158325433731079,
+      "learning_rate": 3.617696726941645e-06,
+      "loss": 0.2842,
+      "step": 880
+    },
+    {
+      "epoch": 2.3726541554959786,
+      "grad_norm": 1.0826832056045532,
+      "learning_rate": 3.474580632457067e-06,
+      "loss": 0.2757,
+      "step": 885
+    },
+    {
+      "epoch": 2.386058981233244,
+      "grad_norm": 1.1899993419647217,
+      "learning_rate": 3.333981183558196e-06,
+      "loss": 0.2736,
+      "step": 890
+    },
+    {
+      "epoch": 2.3994638069705094,
+      "grad_norm": 1.191117525100708,
+      "learning_rate": 3.1959290809918953e-06,
+      "loss": 0.2469,
+      "step": 895
+    },
+    {
+      "epoch": 2.4128686327077746,
+      "grad_norm": 1.150436282157898,
+      "learning_rate": 3.060454469276423e-06,
+      "loss": 0.2686,
+      "step": 900
+    },
+    {
+      "epoch": 2.4262734584450403,
+      "grad_norm": 1.097445011138916,
+      "learning_rate": 2.9275869301191855e-06,
+      "loss": 0.2735,
+      "step": 905
+    },
+    {
+      "epoch": 2.4396782841823055,
+      "grad_norm": 1.221527338027954,
+      "learning_rate": 2.7973554759574116e-06,
+      "loss": 0.2953,
+      "step": 910
+    },
+    {
+      "epoch": 2.453083109919571,
+      "grad_norm": 1.2584657669067383,
+      "learning_rate": 2.6697885436231023e-06,
+      "loss": 0.2498,
+      "step": 915
+    },
+    {
+      "epoch": 2.4664879356568363,
+      "grad_norm": 1.1825857162475586,
+      "learning_rate": 2.5449139881336587e-06,
+      "loss": 0.2816,
+      "step": 920
+    },
+    {
+      "epoch": 2.479892761394102,
+      "grad_norm": 1.2222062349319458,
+      "learning_rate": 2.422759076609597e-06,
+      "loss": 0.2868,
+      "step": 925
+    },
+    {
+      "epoch": 2.493297587131367,
+      "grad_norm": 1.2197879552841187,
+      "learning_rate": 2.303350482320592e-06,
+      "loss": 0.2765,
+      "step": 930
+    },
+    {
+      "epoch": 2.506702412868633,
+      "grad_norm": 1.0478525161743164,
+      "learning_rate": 2.1867142788611937e-06,
+      "loss": 0.2793,
+      "step": 935
+    },
+    {
+      "epoch": 2.520107238605898,
+      "grad_norm": 1.177180528640747,
+      "learning_rate": 2.0728759344575272e-06,
+      "loss": 0.265,
+      "step": 940
+    },
+    {
+      "epoch": 2.5335120643431637,
+      "grad_norm": 1.1697869300842285,
+      "learning_rate": 1.961860306406133e-06,
+      "loss": 0.2607,
+      "step": 945
+    },
+    {
+      "epoch": 2.546916890080429,
+      "grad_norm": 1.2164334058761597,
+      "learning_rate": 1.8536916356462158e-06,
+      "loss": 0.2565,
+      "step": 950
+    },
+    {
+      "epoch": 2.5603217158176945,
+      "grad_norm": 1.2828480005264282,
+      "learning_rate": 1.748393541466507e-06,
+      "loss": 0.2425,
+      "step": 955
+    },
+    {
+      "epoch": 2.5737265415549597,
+      "grad_norm": 1.054573893547058,
+      "learning_rate": 1.6459890163478391e-06,
+      "loss": 0.2443,
+      "step": 960
+    },
+    {
+      "epoch": 2.5871313672922254,
+      "grad_norm": 0.9550372362136841,
+      "learning_rate": 1.5465004209426053e-06,
+      "loss": 0.2704,
+      "step": 965
+    },
+    {
+      "epoch": 2.6005361930294906,
+      "grad_norm": 1.1408909559249878,
+      "learning_rate": 1.4499494791921563e-06,
+      "loss": 0.2583,
+      "step": 970
+    },
+    {
+      "epoch": 2.6139410187667558,
+      "grad_norm": 1.234317660331726,
+      "learning_rate": 1.3563572735832668e-06,
+      "loss": 0.2557,
+      "step": 975
+    },
+    {
+      "epoch": 2.6273458445040214,
+      "grad_norm": 1.1538945436477661,
+      "learning_rate": 1.2657442405446345e-06,
+      "loss": 0.2638,
+      "step": 980
+    },
+    {
+      "epoch": 2.640750670241287,
+      "grad_norm": 1.0675137042999268,
+      "learning_rate": 1.178130165984458e-06,
+      "loss": 0.2706,
+      "step": 985
+    },
+    {
+      "epoch": 2.6541554959785523,
+      "grad_norm": 1.1528904438018799,
+      "learning_rate": 1.093534180970074e-06,
+      "loss": 0.2813,
+      "step": 990
+    },
+    {
+      "epoch": 2.6675603217158175,
+      "grad_norm": 0.9447672963142395,
+      "learning_rate": 1.0119747575505695e-06,
+      "loss": 0.2516,
+      "step": 995
+    },
+    {
+      "epoch": 2.680965147453083,
+      "grad_norm": 1.2417553663253784,
+      "learning_rate": 9.334697047232849e-07,
+      "loss": 0.2865,
+      "step": 1000
+    },
+    {
+      "epoch": 2.6943699731903488,
+      "grad_norm": 1.1432898044586182,
+      "learning_rate": 8.580361645451257e-07,
+      "loss": 0.2698,
+      "step": 1005
+    },
+    {
+      "epoch": 2.707774798927614,
+      "grad_norm": 1.1654596328735352,
+      "learning_rate": 7.856906083894871e-07,
+      "loss": 0.2697,
+      "step": 1010
+    },
+    {
+      "epoch": 2.721179624664879,
+      "grad_norm": 1.143183946609497,
+      "learning_rate": 7.16448833349621e-07,
+      "loss": 0.2382,
+      "step": 1015
+    },
+    {
+      "epoch": 2.734584450402145,
+      "grad_norm": 1.1008965969085693,
+      "learning_rate": 6.503259587892535e-07,
+      "loss": 0.2502,
+      "step": 1020
+    },
+    {
+      "epoch": 2.7479892761394105,
+      "grad_norm": 1.171980857849121,
+      "learning_rate": 5.873364230411754e-07,
+      "loss": 0.2462,
+      "step": 1025
+    },
+    {
+      "epoch": 2.7613941018766757,
+      "grad_norm": 1.036729097366333,
+      "learning_rate": 5.274939802545415e-07,
+      "loss": 0.2596,
+      "step": 1030
+    },
+    {
+      "epoch": 2.774798927613941,
+      "grad_norm": 1.177030324935913,
+      "learning_rate": 4.7081169739157717e-07,
+      "loss": 0.2802,
+      "step": 1035
+    },
+    {
+      "epoch": 2.7882037533512065,
+      "grad_norm": 1.1302504539489746,
+      "learning_rate": 4.173019513743198e-07,
+      "loss": 0.2842,
+      "step": 1040
+    },
+    {
+      "epoch": 2.8016085790884717,
+      "grad_norm": 1.1969364881515503,
+      "learning_rate": 3.6697642638204354e-07,
+      "loss": 0.2594,
+      "step": 1045
+    },
+    {
+      "epoch": 2.8150134048257374,
+      "grad_norm": 1.1496009826660156,
+      "learning_rate": 3.198461112999468e-07,
+      "loss": 0.283,
+      "step": 1050
+    },
+    {
+      "epoch": 2.8284182305630026,
+      "grad_norm": 1.0756111145019531,
+      "learning_rate": 2.7592129731967176e-07,
+      "loss": 0.2784,
+      "step": 1055
+    },
+    {
+      "epoch": 2.841823056300268,
+      "grad_norm": 1.0963841676712036,
+      "learning_rate": 2.3521157569214024e-07,
+      "loss": 0.2681,
+      "step": 1060
+    },
+    {
+      "epoch": 2.8552278820375334,
+      "grad_norm": 1.1973907947540283,
+      "learning_rate": 1.9772583563326729e-07,
+      "loss": 0.2839,
+      "step": 1065
+    },
+    {
+      "epoch": 2.868632707774799,
+      "grad_norm": 1.1479398012161255,
+      "learning_rate": 1.6347226238293377e-07,
+      "loss": 0.2454,
+      "step": 1070
+    },
+    {
+      "epoch": 2.8820375335120643,
+      "grad_norm": 1.0420469045639038,
+      "learning_rate": 1.3245833541768947e-07,
+      "loss": 0.2409,
+      "step": 1075
+    },
+    {
+      "epoch": 2.89544235924933,
+      "grad_norm": 1.1466476917266846,
+      "learning_rate": 1.0469082681757114e-07,
+      "loss": 0.263,
+      "step": 1080
+    },
+    {
+      "epoch": 2.908847184986595,
+      "grad_norm": 1.198553442955017,
+      "learning_rate": 8.0175799787372e-08,
+      "loss": 0.2287,
+      "step": 1085
+    },
+    {
+      "epoch": 2.9222520107238603,
+      "grad_norm": 1.0365747213363647,
+      "learning_rate": 5.891860733270249e-08,
+      "loss": 0.2552,
+      "step": 1090
+    },
+    {
+      "epoch": 2.935656836461126,
+      "grad_norm": 1.1062986850738525,
+      "learning_rate": 4.092389109113526e-08,
+      "loss": 0.246,
+      "step": 1095
+    },
+    {
+      "epoch": 2.9490616621983916,
+      "grad_norm": 1.0808930397033691,
+      "learning_rate": 2.6195580318667533e-08,
+      "loss": 0.2531,
+      "step": 1100
+    },
+    {
+      "epoch": 2.962466487935657,
+      "grad_norm": 1.215558648109436,
+      "learning_rate": 1.4736891031752464e-08,
+      "loss": 0.254,
+      "step": 1105
+    },
+    {
+      "epoch": 2.975871313672922,
+      "grad_norm": 1.107862114906311,
+      "learning_rate": 6.550325305054172e-09,
+      "loss": 0.2557,
+      "step": 1110
+    },
+    {
+      "epoch": 2.9892761394101877,
+      "grad_norm": 1.1284103393554688,
+      "learning_rate": 1.6376707251097855e-09,
+      "loss": 0.247,
+      "step": 1115
+    },
+    {
+      "epoch": 3.0,
+      "step": 1119,
+      "total_flos": 1.647805420300927e+18,
+      "train_loss": 0.5340931572159877,
+      "train_runtime": 781.1418,
+      "train_samples_per_second": 45.821,
+      "train_steps_per_second": 1.433
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1119,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 2000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.647805420300927e+18,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

4_128_e3_3e-5/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eedd797c52a048cbb82c620bc0f76da294abbf5058459a6987bdc745abaf434d
+size 8145

4_128_e3_3e-5/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff