Upload folder using huggingface_hub

Browse files

Files changed (13) hide show

15_128_e3_3e-5/README.md +63 -0
15_128_e3_3e-5/adapter_config.json +39 -0
15_128_e3_3e-5/adapter_model.safetensors +3 -0
15_128_e3_3e-5/all_results.json +9 -0
15_128_e3_3e-5/config.json +32 -0
15_128_e3_3e-5/merges.txt +0 -0
15_128_e3_3e-5/special_tokens_map.json +45 -0
15_128_e3_3e-5/tokenizer.json +0 -0
15_128_e3_3e-5/tokenizer_config.json +188 -0
15_128_e3_3e-5/train_results.json +9 -0
15_128_e3_3e-5/trainer_state.json +1604 -0
15_128_e3_3e-5/training_args.bin +3 -0
15_128_e3_3e-5/vocab.json +0 -0

15_128_e3_3e-5/README.md ADDED Viewed

	@@ -0,0 +1,63 @@

+---
+library_name: peft
+license: apache-2.0
+base_model: ibm-granite/granite-3.3-8b-base
+tags:
+- alignment-handbook
+- generated_from_trainer
+datasets:
+- data/knowledge_lora_training_data_2000
+model-index:
+- name: 15_128_e3_3e-5
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# 15_128_e3_3e-5
+This model is a fine-tuned version of [ibm-granite/granite-3.3-8b-base](https://huggingface.co/ibm-granite/granite-3.3-8b-base) on the data/knowledge_lora_training_data_2000 dataset.
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 3e-05
+- train_batch_size: 2
+- eval_batch_size: 8
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 8
+- gradient_accumulation_steps: 2
+- total_train_batch_size: 32
+- total_eval_batch_size: 64
+- optimizer: adamw_torch with betas=(0.9, 0.95) and epsilon=1e-08; no additional optimizer arguments
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.05
+- num_epochs: 3.0
+### Training results
+### Framework versions
+- PEFT 0.15.2
+- Transformers 4.52.4
+- Pytorch 2.7.0+cu126
+- Datasets 3.6.0
+- Tokenizers 0.21.2

15_128_e3_3e-5/adapter_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "ibm-granite/granite-3.3-8b-base",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "up_proj",
+    "q_proj",
+    "v_proj",
+    "k_proj",
+    "gate_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}

15_128_e3_3e-5/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d2152da1b09ed2cdd41ea9adf07c6c32f1c1eb9c524e46d1d4a4297da706508d
+size 791751704

15_128_e3_3e-5/all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 1.489123253103362e+18,
+    "train_loss": 0.5669498731239871,
+    "train_runtime": 689.3067,
+    "train_samples": 11907,
+    "train_samples_per_second": 51.822,
+    "train_steps_per_second": 1.623
+}

15_128_e3_3e-5/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "architectures": [
+    "GraniteForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0,
+  "attention_multiplier": 0.0078125,
+  "bos_token_id": 0,
+  "embedding_multiplier": 12.0,
+  "eos_token_id": 0,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12800,
+  "logits_scaling": 16.0,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "granite",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "pad_token_id": 0,
+  "residual_multiplier": 0.22,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.4",
+  "use_cache": true,
+  "vocab_size": 49152
+}

15_128_e3_3e-5/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

15_128_e3_3e-5/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<fim_prefix>",
+    "<fim_middle>",
+    "<fim_suffix>",
+    "<fim_pad>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<empty_output>",
+    "<commit_before>",
+    "<commit_msg>",
+    "<commit_after>",
+    "<reponame>"
+  ],
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<reponame>",
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

15_128_e3_3e-5/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

15_128_e3_3e-5/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,188 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<fim_prefix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<fim_middle>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<fim_suffix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<fim_pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<filename>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<gh_stars>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<commit_before>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<commit_msg>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "17": {
+      "content": "<commit_after>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "18": {
+      "content": "<reponame>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<fim_prefix>",
+    "<fim_middle>",
+    "<fim_suffix>",
+    "<fim_pad>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<empty_output>",
+    "<commit_before>",
+    "<commit_msg>",
+    "<commit_after>",
+    "<reponame>"
+  ],
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "pad_token": "<reponame>",
+  "padding_side": "left",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>",
+  "vocab_size": 49152
+}

15_128_e3_3e-5/train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 1.489123253103362e+18,
+    "train_loss": 0.5669498731239871,
+    "train_runtime": 689.3067,
+    "train_samples": 11907,
+    "train_samples_per_second": 51.822,
+    "train_steps_per_second": 1.623
+}

15_128_e3_3e-5/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1604 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 1119,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.013422818791946308,
+      "grad_norm": 1.1787831783294678,
+      "learning_rate": 2.1428571428571427e-06,
+      "loss": 1.3227,
+      "step": 5
+    },
+    {
+      "epoch": 0.026845637583892617,
+      "grad_norm": 1.0532420873641968,
+      "learning_rate": 4.821428571428572e-06,
+      "loss": 1.3664,
+      "step": 10
+    },
+    {
+      "epoch": 0.040268456375838924,
+      "grad_norm": 0.644639790058136,
+      "learning_rate": 7.5e-06,
+      "loss": 1.3344,
+      "step": 15
+    },
+    {
+      "epoch": 0.053691275167785234,
+      "grad_norm": 0.5746031403541565,
+      "learning_rate": 1.0178571428571429e-05,
+      "loss": 1.2832,
+      "step": 20
+    },
+    {
+      "epoch": 0.06711409395973154,
+      "grad_norm": 0.6982018351554871,
+      "learning_rate": 1.2857142857142857e-05,
+      "loss": 1.3015,
+      "step": 25
+    },
+    {
+      "epoch": 0.08053691275167785,
+      "grad_norm": 0.6423090100288391,
+      "learning_rate": 1.553571428571429e-05,
+      "loss": 1.2091,
+      "step": 30
+    },
+    {
+      "epoch": 0.09395973154362416,
+      "grad_norm": 0.7088264226913452,
+      "learning_rate": 1.8214285714285712e-05,
+      "loss": 1.2208,
+      "step": 35
+    },
+    {
+      "epoch": 0.10738255033557047,
+      "grad_norm": 0.5552237629890442,
+      "learning_rate": 2.089285714285714e-05,
+      "loss": 1.2633,
+      "step": 40
+    },
+    {
+      "epoch": 0.12080536912751678,
+      "grad_norm": 0.5400457382202148,
+      "learning_rate": 2.357142857142857e-05,
+      "loss": 1.1537,
+      "step": 45
+    },
+    {
+      "epoch": 0.1342281879194631,
+      "grad_norm": 0.591413140296936,
+      "learning_rate": 2.625e-05,
+      "loss": 1.1879,
+      "step": 50
+    },
+    {
+      "epoch": 0.1476510067114094,
+      "grad_norm": 0.5079585909843445,
+      "learning_rate": 2.892857142857143e-05,
+      "loss": 1.2126,
+      "step": 55
+    },
+    {
+      "epoch": 0.1610738255033557,
+      "grad_norm": 0.5020539164543152,
+      "learning_rate": 2.999941043167295e-05,
+      "loss": 1.1875,
+      "step": 60
+    },
+    {
+      "epoch": 0.174496644295302,
+      "grad_norm": 0.5398063063621521,
+      "learning_rate": 2.999580768195271e-05,
+      "loss": 1.1763,
+      "step": 65
+    },
+    {
+      "epoch": 0.18791946308724833,
+      "grad_norm": 0.557966947555542,
+      "learning_rate": 2.998893050620046e-05,
+      "loss": 1.1391,
+      "step": 70
+    },
+    {
+      "epoch": 0.20134228187919462,
+      "grad_norm": 0.6859679818153381,
+      "learning_rate": 2.9978780406089445e-05,
+      "loss": 1.1046,
+      "step": 75
+    },
+    {
+      "epoch": 0.21476510067114093,
+      "grad_norm": 0.6387895345687866,
+      "learning_rate": 2.996535959795591e-05,
+      "loss": 1.1715,
+      "step": 80
+    },
+    {
+      "epoch": 0.22818791946308725,
+      "grad_norm": 0.5165074467658997,
+      "learning_rate": 2.994867101231513e-05,
+      "loss": 1.081,
+      "step": 85
+    },
+    {
+      "epoch": 0.24161073825503357,
+      "grad_norm": 0.5764747858047485,
+      "learning_rate": 2.9928718293221532e-05,
+      "loss": 1.129,
+      "step": 90
+    },
+    {
+      "epoch": 0.2550335570469799,
+      "grad_norm": 0.6636740565299988,
+      "learning_rate": 2.9905505797472965e-05,
+      "loss": 1.0798,
+      "step": 95
+    },
+    {
+      "epoch": 0.2684563758389262,
+      "grad_norm": 0.6049888134002686,
+      "learning_rate": 2.9879038593659403e-05,
+      "loss": 1.123,
+      "step": 100
+    },
+    {
+      "epoch": 0.28187919463087246,
+      "grad_norm": 0.5570958852767944,
+      "learning_rate": 2.984932246105616e-05,
+      "loss": 1.0794,
+      "step": 105
+    },
+    {
+      "epoch": 0.2953020134228188,
+      "grad_norm": 0.5862316489219666,
+      "learning_rate": 2.981636388836196e-05,
+      "loss": 1.0739,
+      "step": 110
+    },
+    {
+      "epoch": 0.3087248322147651,
+      "grad_norm": 0.5860150456428528,
+      "learning_rate": 2.978017007228208e-05,
+      "loss": 1.0318,
+      "step": 115
+    },
+    {
+      "epoch": 0.3221476510067114,
+      "grad_norm": 0.6639479398727417,
+      "learning_rate": 2.9740748915956932e-05,
+      "loss": 1.0425,
+      "step": 120
+    },
+    {
+      "epoch": 0.33557046979865773,
+      "grad_norm": 0.604107141494751,
+      "learning_rate": 2.9698109027236335e-05,
+      "loss": 1.0513,
+      "step": 125
+    },
+    {
+      "epoch": 0.348993288590604,
+      "grad_norm": 0.6186829805374146,
+      "learning_rate": 2.965225971679996e-05,
+      "loss": 1.0319,
+      "step": 130
+    },
+    {
+      "epoch": 0.3624161073825503,
+      "grad_norm": 0.6194995641708374,
+      "learning_rate": 2.9603210996124257e-05,
+      "loss": 1.025,
+      "step": 135
+    },
+    {
+      "epoch": 0.37583892617449666,
+      "grad_norm": 0.7160004377365112,
+      "learning_rate": 2.9550973575296424e-05,
+      "loss": 0.9989,
+      "step": 140
+    },
+    {
+      "epoch": 0.38926174496644295,
+      "grad_norm": 0.6104715466499329,
+      "learning_rate": 2.949555886067578e-05,
+      "loss": 0.9596,
+      "step": 145
+    },
+    {
+      "epoch": 0.40268456375838924,
+      "grad_norm": 0.7308571934700012,
+      "learning_rate": 2.9436978952403103e-05,
+      "loss": 0.9086,
+      "step": 150
+    },
+    {
+      "epoch": 0.4161073825503356,
+      "grad_norm": 0.7849708199501038,
+      "learning_rate": 2.937524664175851e-05,
+      "loss": 0.9829,
+      "step": 155
+    },
+    {
+      "epoch": 0.42953020134228187,
+      "grad_norm": 0.6270202994346619,
+      "learning_rate": 2.931037540836839e-05,
+      "loss": 1.0149,
+      "step": 160
+    },
+    {
+      "epoch": 0.4429530201342282,
+      "grad_norm": 0.755927562713623,
+      "learning_rate": 2.9242379417262052e-05,
+      "loss": 0.9525,
+      "step": 165
+    },
+    {
+      "epoch": 0.4563758389261745,
+      "grad_norm": 0.7812522649765015,
+      "learning_rate": 2.9171273515778703e-05,
+      "loss": 0.9489,
+      "step": 170
+    },
+    {
+      "epoch": 0.4697986577181208,
+      "grad_norm": 0.7418034076690674,
+      "learning_rate": 2.909707323032545e-05,
+      "loss": 0.9167,
+      "step": 175
+    },
+    {
+      "epoch": 0.48322147651006714,
+      "grad_norm": 0.6826059222221375,
+      "learning_rate": 2.901979476298699e-05,
+      "loss": 0.8951,
+      "step": 180
+    },
+    {
+      "epoch": 0.4966442953020134,
+      "grad_norm": 0.8364844918251038,
+      "learning_rate": 2.893945498798781e-05,
+      "loss": 0.9801,
+      "step": 185
+    },
+    {
+      "epoch": 0.5100671140939598,
+      "grad_norm": 0.8375508189201355,
+      "learning_rate": 2.885607144800759e-05,
+      "loss": 0.8842,
+      "step": 190
+    },
+    {
+      "epoch": 0.5234899328859061,
+      "grad_norm": 0.7986782789230347,
+      "learning_rate": 2.876966235035064e-05,
+      "loss": 0.9052,
+      "step": 195
+    },
+    {
+      "epoch": 0.5369127516778524,
+      "grad_norm": 0.7564641237258911,
+      "learning_rate": 2.8680246562970253e-05,
+      "loss": 0.878,
+      "step": 200
+    },
+    {
+      "epoch": 0.5503355704697986,
+      "grad_norm": 0.8237295746803284,
+      "learning_rate": 2.8587843610348735e-05,
+      "loss": 0.8907,
+      "step": 205
+    },
+    {
+      "epoch": 0.5637583892617449,
+      "grad_norm": 0.8681392073631287,
+      "learning_rate": 2.8492473669234143e-05,
+      "loss": 0.885,
+      "step": 210
+    },
+    {
+      "epoch": 0.5771812080536913,
+      "grad_norm": 0.899863600730896,
+      "learning_rate": 2.8394157564234564e-05,
+      "loss": 0.8917,
+      "step": 215
+    },
+    {
+      "epoch": 0.5906040268456376,
+      "grad_norm": 0.8868224024772644,
+      "learning_rate": 2.8292916763270918e-05,
+      "loss": 0.8699,
+      "step": 220
+    },
+    {
+      "epoch": 0.6040268456375839,
+      "grad_norm": 0.7697249054908752,
+      "learning_rate": 2.818877337288934e-05,
+      "loss": 0.8571,
+      "step": 225
+    },
+    {
+      "epoch": 0.6174496644295302,
+      "grad_norm": 0.7981431484222412,
+      "learning_rate": 2.808175013343404e-05,
+      "loss": 0.8457,
+      "step": 230
+    },
+    {
+      "epoch": 0.6308724832214765,
+      "grad_norm": 1.021409511566162,
+      "learning_rate": 2.7971870414081845e-05,
+      "loss": 0.8259,
+      "step": 235
+    },
+    {
+      "epoch": 0.6442953020134228,
+      "grad_norm": 0.8456146717071533,
+      "learning_rate": 2.785915820773937e-05,
+      "loss": 0.8069,
+      "step": 240
+    },
+    {
+      "epoch": 0.6577181208053692,
+      "grad_norm": 0.9129796028137207,
+      "learning_rate": 2.774363812580405e-05,
+      "loss": 0.8247,
+      "step": 245
+    },
+    {
+      "epoch": 0.6711409395973155,
+      "grad_norm": 0.9513639211654663,
+      "learning_rate": 2.7625335392790056e-05,
+      "loss": 0.7858,
+      "step": 250
+    },
+    {
+      "epoch": 0.6845637583892618,
+      "grad_norm": 1.0434490442276,
+      "learning_rate": 2.75042758408204e-05,
+      "loss": 0.8043,
+      "step": 255
+    },
+    {
+      "epoch": 0.697986577181208,
+      "grad_norm": 0.9335957765579224,
+      "learning_rate": 2.7380485903986317e-05,
+      "loss": 0.8076,
+      "step": 260
+    },
+    {
+      "epoch": 0.7114093959731543,
+      "grad_norm": 0.9164956212043762,
+      "learning_rate": 2.7253992612575225e-05,
+      "loss": 0.8095,
+      "step": 265
+    },
+    {
+      "epoch": 0.7248322147651006,
+      "grad_norm": 0.9571513533592224,
+      "learning_rate": 2.712482358716848e-05,
+      "loss": 0.7843,
+      "step": 270
+    },
+    {
+      "epoch": 0.738255033557047,
+      "grad_norm": 0.9782169461250305,
+      "learning_rate": 2.6993007032610297e-05,
+      "loss": 0.7802,
+      "step": 275
+    },
+    {
+      "epoch": 0.7516778523489933,
+      "grad_norm": 0.9108016490936279,
+      "learning_rate": 2.6858571731848997e-05,
+      "loss": 0.7601,
+      "step": 280
+    },
+    {
+      "epoch": 0.7651006711409396,
+      "grad_norm": 0.8660235404968262,
+      "learning_rate": 2.672154703965212e-05,
+      "loss": 0.784,
+      "step": 285
+    },
+    {
+      "epoch": 0.7785234899328859,
+      "grad_norm": 0.8920661211013794,
+      "learning_rate": 2.6581962876196593e-05,
+      "loss": 0.7859,
+      "step": 290
+    },
+    {
+      "epoch": 0.7919463087248322,
+      "grad_norm": 1.0612117052078247,
+      "learning_rate": 2.643984972053551e-05,
+      "loss": 0.7712,
+      "step": 295
+    },
+    {
+      "epoch": 0.8053691275167785,
+      "grad_norm": 0.9185782074928284,
+      "learning_rate": 2.6295238603942832e-05,
+      "loss": 0.7404,
+      "step": 300
+    },
+    {
+      "epoch": 0.8187919463087249,
+      "grad_norm": 0.8650130033493042,
+      "learning_rate": 2.6148161103137515e-05,
+      "loss": 0.7214,
+      "step": 305
+    },
+    {
+      "epoch": 0.8322147651006712,
+      "grad_norm": 0.9116557240486145,
+      "learning_rate": 2.599864933338854e-05,
+      "loss": 0.7893,
+      "step": 310
+    },
+    {
+      "epoch": 0.8456375838926175,
+      "grad_norm": 1.0048680305480957,
+      "learning_rate": 2.5846735941502356e-05,
+      "loss": 0.7267,
+      "step": 315
+    },
+    {
+      "epoch": 0.8590604026845637,
+      "grad_norm": 0.8960395455360413,
+      "learning_rate": 2.5692454098694256e-05,
+      "loss": 0.7249,
+      "step": 320
+    },
+    {
+      "epoch": 0.87248322147651,
+      "grad_norm": 0.9515731930732727,
+      "learning_rate": 2.553583749334522e-05,
+      "loss": 0.6721,
+      "step": 325
+    },
+    {
+      "epoch": 0.8859060402684564,
+      "grad_norm": 0.9288879036903381,
+      "learning_rate": 2.537692032364587e-05,
+      "loss": 0.6933,
+      "step": 330
+    },
+    {
+      "epoch": 0.8993288590604027,
+      "grad_norm": 1.1222081184387207,
+      "learning_rate": 2.521573729012907e-05,
+      "loss": 0.7104,
+      "step": 335
+    },
+    {
+      "epoch": 0.912751677852349,
+      "grad_norm": 1.0263097286224365,
+      "learning_rate": 2.5052323588092878e-05,
+      "loss": 0.7055,
+      "step": 340
+    },
+    {
+      "epoch": 0.9261744966442953,
+      "grad_norm": 0.9645348191261292,
+      "learning_rate": 2.4886714899915415e-05,
+      "loss": 0.7122,
+      "step": 345
+    },
+    {
+      "epoch": 0.9395973154362416,
+      "grad_norm": 1.0961672067642212,
+      "learning_rate": 2.4718947387263403e-05,
+      "loss": 0.6549,
+      "step": 350
+    },
+    {
+      "epoch": 0.9530201342281879,
+      "grad_norm": 1.0570881366729736,
+      "learning_rate": 2.4549057683196054e-05,
+      "loss": 0.6443,
+      "step": 355
+    },
+    {
+      "epoch": 0.9664429530201343,
+      "grad_norm": 0.9710385799407959,
+      "learning_rate": 2.4377082884166016e-05,
+      "loss": 0.7179,
+      "step": 360
+    },
+    {
+      "epoch": 0.9798657718120806,
+      "grad_norm": 1.0300064086914062,
+      "learning_rate": 2.4203060541919136e-05,
+      "loss": 0.6581,
+      "step": 365
+    },
+    {
+      "epoch": 0.9932885906040269,
+      "grad_norm": 0.9689120054244995,
+      "learning_rate": 2.4027028655294804e-05,
+      "loss": 0.6745,
+      "step": 370
+    },
+    {
+      "epoch": 1.0053691275167784,
+      "grad_norm": 1.0210607051849365,
+      "learning_rate": 2.384902566192867e-05,
+      "loss": 0.592,
+      "step": 375
+    },
+    {
+      "epoch": 1.018791946308725,
+      "grad_norm": 1.0200635194778442,
+      "learning_rate": 2.366909042985956e-05,
+      "loss": 0.5768,
+      "step": 380
+    },
+    {
+      "epoch": 1.0322147651006712,
+      "grad_norm": 0.8891041874885559,
+      "learning_rate": 2.3487262249042412e-05,
+      "loss": 0.6326,
+      "step": 385
+    },
+    {
+      "epoch": 1.0456375838926175,
+      "grad_norm": 1.1362190246582031,
+      "learning_rate": 2.330358082276905e-05,
+      "loss": 0.5426,
+      "step": 390
+    },
+    {
+      "epoch": 1.0590604026845638,
+      "grad_norm": 0.9437598586082458,
+      "learning_rate": 2.3118086258998748e-05,
+      "loss": 0.5861,
+      "step": 395
+    },
+    {
+      "epoch": 1.07248322147651,
+      "grad_norm": 1.1715295314788818,
+      "learning_rate": 2.2930819061600413e-05,
+      "loss": 0.5639,
+      "step": 400
+    },
+    {
+      "epoch": 1.0859060402684564,
+      "grad_norm": 1.1314239501953125,
+      "learning_rate": 2.27418201215083e-05,
+      "loss": 0.5614,
+      "step": 405
+    },
+    {
+      "epoch": 1.0993288590604027,
+      "grad_norm": 0.9698748588562012,
+      "learning_rate": 2.2551130707793243e-05,
+      "loss": 0.5665,
+      "step": 410
+    },
+    {
+      "epoch": 1.112751677852349,
+      "grad_norm": 1.0911478996276855,
+      "learning_rate": 2.2358792458651304e-05,
+      "loss": 0.5773,
+      "step": 415
+    },
+    {
+      "epoch": 1.1261744966442953,
+      "grad_norm": 0.9545378684997559,
+      "learning_rate": 2.2164847372311804e-05,
+      "loss": 0.5821,
+      "step": 420
+    },
+    {
+      "epoch": 1.1395973154362415,
+      "grad_norm": 1.0998480319976807,
+      "learning_rate": 2.1969337797866772e-05,
+      "loss": 0.5808,
+      "step": 425
+    },
+    {
+      "epoch": 1.1530201342281878,
+      "grad_norm": 1.2536629438400269,
+      "learning_rate": 2.1772306426023737e-05,
+      "loss": 0.5551,
+      "step": 430
+    },
+    {
+      "epoch": 1.1664429530201343,
+      "grad_norm": 1.112061858177185,
+      "learning_rate": 2.1573796279783987e-05,
+      "loss": 0.5249,
+      "step": 435
+    },
+    {
+      "epoch": 1.1798657718120806,
+      "grad_norm": 1.1083247661590576,
+      "learning_rate": 2.137385070504821e-05,
+      "loss": 0.5207,
+      "step": 440
+    },
+    {
+      "epoch": 1.193288590604027,
+      "grad_norm": 1.0115104913711548,
+      "learning_rate": 2.1172513361151643e-05,
+      "loss": 0.5315,
+      "step": 445
+    },
+    {
+      "epoch": 1.2067114093959732,
+      "grad_norm": 1.1656289100646973,
+      "learning_rate": 2.0969828211330824e-05,
+      "loss": 0.5002,
+      "step": 450
+    },
+    {
+      "epoch": 1.2201342281879195,
+      "grad_norm": 1.2733368873596191,
+      "learning_rate": 2.0765839513123923e-05,
+      "loss": 0.543,
+      "step": 455
+    },
+    {
+      "epoch": 1.2335570469798658,
+      "grad_norm": 1.5573337078094482,
+      "learning_rate": 2.056059180870684e-05,
+      "loss": 0.4892,
+      "step": 460
+    },
+    {
+      "epoch": 1.246979865771812,
+      "grad_norm": 1.126818299293518,
+      "learning_rate": 2.0354129915167175e-05,
+      "loss": 0.5449,
+      "step": 465
+    },
+    {
+      "epoch": 1.2604026845637584,
+      "grad_norm": 1.4031105041503906,
+      "learning_rate": 2.014649891471811e-05,
+      "loss": 0.5198,
+      "step": 470
+    },
+    {
+      "epoch": 1.2738255033557047,
+      "grad_norm": 1.1321215629577637,
+      "learning_rate": 1.9937744144854446e-05,
+      "loss": 0.5023,
+      "step": 475
+    },
+    {
+      "epoch": 1.287248322147651,
+      "grad_norm": 1.0238393545150757,
+      "learning_rate": 1.9727911188452903e-05,
+      "loss": 0.5113,
+      "step": 480
+    },
+    {
+      "epoch": 1.3006711409395972,
+      "grad_norm": 1.2181116342544556,
+      "learning_rate": 1.9517045863818792e-05,
+      "loss": 0.539,
+      "step": 485
+    },
+    {
+      "epoch": 1.3140939597315437,
+      "grad_norm": 0.9544431567192078,
+      "learning_rate": 1.930519421468133e-05,
+      "loss": 0.5149,
+      "step": 490
+    },
+    {
+      "epoch": 1.3275167785234898,
+      "grad_norm": 1.0378201007843018,
+      "learning_rate": 1.9092402500139693e-05,
+      "loss": 0.4909,
+      "step": 495
+    },
+    {
+      "epoch": 1.3409395973154363,
+      "grad_norm": 1.2054181098937988,
+      "learning_rate": 1.8878717184562078e-05,
+      "loss": 0.5234,
+      "step": 500
+    },
+    {
+      "epoch": 1.3543624161073826,
+      "grad_norm": 1.563539981842041,
+      "learning_rate": 1.8664184927439907e-05,
+      "loss": 0.4752,
+      "step": 505
+    },
+    {
+      "epoch": 1.367785234899329,
+      "grad_norm": 1.0908130407333374,
+      "learning_rate": 1.8448852573199465e-05,
+      "loss": 0.5165,
+      "step": 510
+    },
+    {
+      "epoch": 1.3812080536912752,
+      "grad_norm": 1.0390334129333496,
+      "learning_rate": 1.823276714097311e-05,
+      "loss": 0.4796,
+      "step": 515
+    },
+    {
+      "epoch": 1.3946308724832215,
+      "grad_norm": 1.2405122518539429,
+      "learning_rate": 1.8015975814332373e-05,
+      "loss": 0.4662,
+      "step": 520
+    },
+    {
+      "epoch": 1.4080536912751678,
+      "grad_norm": 1.111845850944519,
+      "learning_rate": 1.779852593098513e-05,
+      "loss": 0.4883,
+      "step": 525
+    },
+    {
+      "epoch": 1.421476510067114,
+      "grad_norm": 1.127173900604248,
+      "learning_rate": 1.7580464972439137e-05,
+      "loss": 0.4946,
+      "step": 530
+    },
+    {
+      "epoch": 1.4348993288590604,
+      "grad_norm": 1.0390585660934448,
+      "learning_rate": 1.736184055363414e-05,
+      "loss": 0.4914,
+      "step": 535
+    },
+    {
+      "epoch": 1.4483221476510066,
+      "grad_norm": 1.1070321798324585,
+      "learning_rate": 1.7142700412544867e-05,
+      "loss": 0.497,
+      "step": 540
+    },
+    {
+      "epoch": 1.4617449664429532,
+      "grad_norm": 1.1228235960006714,
+      "learning_rate": 1.692309239975715e-05,
+      "loss": 0.4852,
+      "step": 545
+    },
+    {
+      "epoch": 1.4751677852348992,
+      "grad_norm": 1.114735722541809,
+      "learning_rate": 1.670306446801947e-05,
+      "loss": 0.4654,
+      "step": 550
+    },
+    {
+      "epoch": 1.4885906040268457,
+      "grad_norm": 1.1709102392196655,
+      "learning_rate": 1.6482664661772153e-05,
+      "loss": 0.4917,
+      "step": 555
+    },
+    {
+      "epoch": 1.5020134228187918,
+      "grad_norm": 1.105033278465271,
+      "learning_rate": 1.626194110665661e-05,
+      "loss": 0.4737,
+      "step": 560
+    },
+    {
+      "epoch": 1.5154362416107383,
+      "grad_norm": 1.1771923303604126,
+      "learning_rate": 1.6040941999006768e-05,
+      "loss": 0.5242,
+      "step": 565
+    },
+    {
+      "epoch": 1.5288590604026846,
+      "grad_norm": 1.2981674671173096,
+      "learning_rate": 1.581971559532515e-05,
+      "loss": 0.4543,
+      "step": 570
+    },
+    {
+      "epoch": 1.542281879194631,
+      "grad_norm": 1.2492069005966187,
+      "learning_rate": 1.559831020174576e-05,
+      "loss": 0.4476,
+      "step": 575
+    },
+    {
+      "epoch": 1.5557046979865772,
+      "grad_norm": 1.0671732425689697,
+      "learning_rate": 1.5376774163486104e-05,
+      "loss": 0.4902,
+      "step": 580
+    },
+    {
+      "epoch": 1.5691275167785235,
+      "grad_norm": 1.2057262659072876,
+      "learning_rate": 1.5155155854290777e-05,
+      "loss": 0.4134,
+      "step": 585
+    },
+    {
+      "epoch": 1.5825503355704698,
+      "grad_norm": 1.3330824375152588,
+      "learning_rate": 1.493350366586873e-05,
+      "loss": 0.4832,
+      "step": 590
+    },
+    {
+      "epoch": 1.595973154362416,
+      "grad_norm": 1.0262600183486938,
+      "learning_rate": 1.4711865997326648e-05,
+      "loss": 0.4696,
+      "step": 595
+    },
+    {
+      "epoch": 1.6093959731543626,
+      "grad_norm": 1.2874300479888916,
+      "learning_rate": 1.4490291244600712e-05,
+      "loss": 0.428,
+      "step": 600
+    },
+    {
+      "epoch": 1.6228187919463086,
+      "grad_norm": 1.1492985486984253,
+      "learning_rate": 1.4268827789889054e-05,
+      "loss": 0.4634,
+      "step": 605
+    },
+    {
+      "epoch": 1.6362416107382551,
+      "grad_norm": 1.2107088565826416,
+      "learning_rate": 1.4047523991087194e-05,
+      "loss": 0.4117,
+      "step": 610
+    },
+    {
+      "epoch": 1.6496644295302012,
+      "grad_norm": 1.1944514513015747,
+      "learning_rate": 1.3826428171228824e-05,
+      "loss": 0.447,
+      "step": 615
+    },
+    {
+      "epoch": 1.6630872483221477,
+      "grad_norm": 1.212609887123108,
+      "learning_rate": 1.3605588607934153e-05,
+      "loss": 0.4262,
+      "step": 620
+    },
+    {
+      "epoch": 1.676510067114094,
+      "grad_norm": 1.2843233346939087,
+      "learning_rate": 1.3385053522868229e-05,
+      "loss": 0.4584,
+      "step": 625
+    },
+    {
+      "epoch": 1.6899328859060403,
+      "grad_norm": 1.163159728050232,
+      "learning_rate": 1.3164871071211399e-05,
+      "loss": 0.3994,
+      "step": 630
+    },
+    {
+      "epoch": 1.7033557046979866,
+      "grad_norm": 1.2616965770721436,
+      "learning_rate": 1.2945089331144364e-05,
+      "loss": 0.4307,
+      "step": 635
+    },
+    {
+      "epoch": 1.7167785234899329,
+      "grad_norm": 1.1640808582305908,
+      "learning_rate": 1.2725756293350011e-05,
+      "loss": 0.4522,
+      "step": 640
+    },
+    {
+      "epoch": 1.7302013422818792,
+      "grad_norm": 1.0755280256271362,
+      "learning_rate": 1.2506919850534343e-05,
+      "loss": 0.3801,
+      "step": 645
+    },
+    {
+      "epoch": 1.7436241610738255,
+      "grad_norm": 1.2205264568328857,
+      "learning_rate": 1.2288627786968826e-05,
+      "loss": 0.4387,
+      "step": 650
+    },
+    {
+      "epoch": 1.757046979865772,
+      "grad_norm": 1.1449589729309082,
+      "learning_rate": 1.2070927768056399e-05,
+      "loss": 0.4201,
+      "step": 655
+    },
+    {
+      "epoch": 1.770469798657718,
+      "grad_norm": 1.2469054460525513,
+      "learning_rate": 1.1853867329923436e-05,
+      "loss": 0.4124,
+      "step": 660
+    },
+    {
+      "epoch": 1.7838926174496645,
+      "grad_norm": 1.0459575653076172,
+      "learning_rate": 1.163749386903995e-05,
+      "loss": 0.4082,
+      "step": 665
+    },
+    {
+      "epoch": 1.7973154362416106,
+      "grad_norm": 1.1973217725753784,
+      "learning_rate": 1.1421854631870291e-05,
+      "loss": 0.3598,
+      "step": 670
+    },
+    {
+      "epoch": 1.8107382550335571,
+      "grad_norm": 1.285914659500122,
+      "learning_rate": 1.1206996704556575e-05,
+      "loss": 0.4466,
+      "step": 675
+    },
+    {
+      "epoch": 1.8241610738255034,
+      "grad_norm": 1.1342105865478516,
+      "learning_rate": 1.0992967002637148e-05,
+      "loss": 0.3573,
+      "step": 680
+    },
+    {
+      "epoch": 1.8375838926174497,
+      "grad_norm": 1.3216603994369507,
+      "learning_rate": 1.0779812260802303e-05,
+      "loss": 0.4127,
+      "step": 685
+    },
+    {
+      "epoch": 1.851006711409396,
+      "grad_norm": 1.1540459394454956,
+      "learning_rate": 1.056757902268945e-05,
+      "loss": 0.3684,
+      "step": 690
+    },
+    {
+      "epoch": 1.8644295302013423,
+      "grad_norm": 1.1831110715866089,
+      "learning_rate": 1.035631363072005e-05,
+      "loss": 0.4129,
+      "step": 695
+    },
+    {
+      "epoch": 1.8778523489932886,
+      "grad_norm": 1.1281007528305054,
+      "learning_rate": 1.014606221598046e-05,
+      "loss": 0.3791,
+      "step": 700
+    },
+    {
+      "epoch": 1.8912751677852349,
+      "grad_norm": 1.256843090057373,
+      "learning_rate": 9.936870688148924e-06,
+      "loss": 0.4196,
+      "step": 705
+    },
+    {
+      "epoch": 1.9046979865771814,
+      "grad_norm": 1.1659599542617798,
+      "learning_rate": 9.728784725470913e-06,
+      "loss": 0.4092,
+      "step": 710
+    },
+    {
+      "epoch": 1.9181208053691274,
+      "grad_norm": 1.2047243118286133,
+      "learning_rate": 9.521849764785018e-06,
+      "loss": 0.3963,
+      "step": 715
+    },
+    {
+      "epoch": 1.931543624161074,
+      "grad_norm": 1.3271708488464355,
+      "learning_rate": 9.31611099160152e-06,
+      "loss": 0.3999,
+      "step": 720
+    },
+    {
+      "epoch": 1.94496644295302,
+      "grad_norm": 1.1906070709228516,
+      "learning_rate": 9.111613330235866e-06,
+      "loss": 0.3809,
+      "step": 725
+    },
+    {
+      "epoch": 1.9583892617449665,
+      "grad_norm": 1.152764916419983,
+      "learning_rate": 8.90840143399917e-06,
+      "loss": 0.3904,
+      "step": 730
+    },
+    {
+      "epoch": 1.9718120805369126,
+      "grad_norm": 1.2785158157348633,
+      "learning_rate": 8.706519675447898e-06,
+      "loss": 0.3846,
+      "step": 735
+    },
+    {
+      "epoch": 1.985234899328859,
+      "grad_norm": 1.1316757202148438,
+      "learning_rate": 8.506012136694832e-06,
+      "loss": 0.3483,
+      "step": 740
+    },
+    {
+      "epoch": 1.9986577181208054,
+      "grad_norm": 1.2931638956069946,
+      "learning_rate": 8.306922599783491e-06,
+      "loss": 0.3662,
+      "step": 745
+    },
+    {
+      "epoch": 2.010738255033557,
+      "grad_norm": 1.2080596685409546,
+      "learning_rate": 8.109294537128057e-06,
+      "loss": 0.2796,
+      "step": 750
+    },
+    {
+      "epoch": 2.0241610738255034,
+      "grad_norm": 1.0464208126068115,
+      "learning_rate": 7.91317110202087e-06,
+      "loss": 0.2901,
+      "step": 755
+    },
+    {
+      "epoch": 2.03758389261745,
+      "grad_norm": 1.1539617776870728,
+      "learning_rate": 7.718595119209691e-06,
+      "loss": 0.3277,
+      "step": 760
+    },
+    {
+      "epoch": 2.051006711409396,
+      "grad_norm": 1.1852216720581055,
+      "learning_rate": 7.525609075546649e-06,
+      "loss": 0.3001,
+      "step": 765
+    },
+    {
+      "epoch": 2.0644295302013425,
+      "grad_norm": 1.010262370109558,
+      "learning_rate": 7.334255110710933e-06,
+      "loss": 0.3258,
+      "step": 770
+    },
+    {
+      "epoch": 2.0778523489932885,
+      "grad_norm": 1.1409698724746704,
+      "learning_rate": 7.1445750080073964e-06,
+      "loss": 0.3745,
+      "step": 775
+    },
+    {
+      "epoch": 2.091275167785235,
+      "grad_norm": 1.206615924835205,
+      "learning_rate": 6.956610185242891e-06,
+      "loss": 0.326,
+      "step": 780
+    },
+    {
+      "epoch": 2.104697986577181,
+      "grad_norm": 1.255170464515686,
+      "learning_rate": 6.770401685682417e-06,
+      "loss": 0.32,
+      "step": 785
+    },
+    {
+      "epoch": 2.1181208053691276,
+      "grad_norm": 1.0881547927856445,
+      "learning_rate": 6.585990169087112e-06,
+      "loss": 0.2953,
+      "step": 790
+    },
+    {
+      "epoch": 2.1315436241610737,
+      "grad_norm": 1.1568206548690796,
+      "learning_rate": 6.40341590283593e-06,
+      "loss": 0.3367,
+      "step": 795
+    },
+    {
+      "epoch": 2.14496644295302,
+      "grad_norm": 1.216955542564392,
+      "learning_rate": 6.222718753132994e-06,
+      "loss": 0.3378,
+      "step": 800
+    },
+    {
+      "epoch": 2.1583892617449663,
+      "grad_norm": 1.241304874420166,
+      "learning_rate": 6.043938176302596e-06,
+      "loss": 0.2985,
+      "step": 805
+    },
+    {
+      "epoch": 2.1718120805369128,
+      "grad_norm": 1.0041223764419556,
+      "learning_rate": 5.8671132101736625e-06,
+      "loss": 0.2638,
+      "step": 810
+    },
+    {
+      "epoch": 2.185234899328859,
+      "grad_norm": 1.333095669746399,
+      "learning_rate": 5.692282465555585e-06,
+      "loss": 0.3218,
+      "step": 815
+    },
+    {
+      "epoch": 2.1986577181208053,
+      "grad_norm": 1.3311021327972412,
+      "learning_rate": 5.519484117807341e-06,
+      "loss": 0.251,
+      "step": 820
+    },
+    {
+      "epoch": 2.212080536912752,
+      "grad_norm": 1.2956736087799072,
+      "learning_rate": 5.348755898501662e-06,
+      "loss": 0.3234,
+      "step": 825
+    },
+    {
+      "epoch": 2.225503355704698,
+      "grad_norm": 1.4365692138671875,
+      "learning_rate": 5.180135087186101e-06,
+      "loss": 0.2785,
+      "step": 830
+    },
+    {
+      "epoch": 2.2389261744966444,
+      "grad_norm": 1.163100242614746,
+      "learning_rate": 5.013658503242845e-06,
+      "loss": 0.2742,
+      "step": 835
+    },
+    {
+      "epoch": 2.2523489932885905,
+      "grad_norm": 1.3038033246994019,
+      "learning_rate": 4.849362497848947e-06,
+      "loss": 0.3043,
+      "step": 840
+    },
+    {
+      "epoch": 2.265771812080537,
+      "grad_norm": 1.2883868217468262,
+      "learning_rate": 4.687282946038842e-06,
+      "loss": 0.3169,
+      "step": 845
+    },
+    {
+      "epoch": 2.279194630872483,
+      "grad_norm": 1.0793969631195068,
+      "learning_rate": 4.527455238870821e-06,
+      "loss": 0.3002,
+      "step": 850
+    },
+    {
+      "epoch": 2.2926174496644296,
+      "grad_norm": 1.4007631540298462,
+      "learning_rate": 4.3699142756991635e-06,
+      "loss": 0.3237,
+      "step": 855
+    },
+    {
+      "epoch": 2.3060402684563757,
+      "grad_norm": 1.1891025304794312,
+      "learning_rate": 4.2146944565536485e-06,
+      "loss": 0.2968,
+      "step": 860
+    },
+    {
+      "epoch": 2.319463087248322,
+      "grad_norm": 1.0564630031585693,
+      "learning_rate": 4.061829674628116e-06,
+      "loss": 0.2943,
+      "step": 865
+    },
+    {
+      "epoch": 2.3328859060402687,
+      "grad_norm": 1.2466074228286743,
+      "learning_rate": 3.911353308879673e-06,
+      "loss": 0.3101,
+      "step": 870
+    },
+    {
+      "epoch": 2.3463087248322148,
+      "grad_norm": 1.2940239906311035,
+      "learning_rate": 3.763298216740176e-06,
+      "loss": 0.3008,
+      "step": 875
+    },
+    {
+      "epoch": 2.3597315436241613,
+      "grad_norm": 1.1645023822784424,
+      "learning_rate": 3.617696726941645e-06,
+      "loss": 0.2936,
+      "step": 880
+    },
+    {
+      "epoch": 2.3731543624161073,
+      "grad_norm": 1.130132794380188,
+      "learning_rate": 3.474580632457067e-06,
+      "loss": 0.3071,
+      "step": 885
+    },
+    {
+      "epoch": 2.386577181208054,
+      "grad_norm": 1.4512666463851929,
+      "learning_rate": 3.333981183558196e-06,
+      "loss": 0.286,
+      "step": 890
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 1.2466052770614624,
+      "learning_rate": 3.1959290809918953e-06,
+      "loss": 0.2805,
+      "step": 895
+    },
+    {
+      "epoch": 2.4134228187919464,
+      "grad_norm": 1.1981033086776733,
+      "learning_rate": 3.060454469276423e-06,
+      "loss": 0.3166,
+      "step": 900
+    },
+    {
+      "epoch": 2.4268456375838925,
+      "grad_norm": 1.4141004085540771,
+      "learning_rate": 2.9275869301191855e-06,
+      "loss": 0.2694,
+      "step": 905
+    },
+    {
+      "epoch": 2.440268456375839,
+      "grad_norm": 1.2529476881027222,
+      "learning_rate": 2.7973554759574116e-06,
+      "loss": 0.3289,
+      "step": 910
+    },
+    {
+      "epoch": 2.453691275167785,
+      "grad_norm": 1.2852303981781006,
+      "learning_rate": 2.6697885436231023e-06,
+      "loss": 0.2548,
+      "step": 915
+    },
+    {
+      "epoch": 2.4671140939597316,
+      "grad_norm": 1.1721196174621582,
+      "learning_rate": 2.5449139881336587e-06,
+      "loss": 0.2976,
+      "step": 920
+    },
+    {
+      "epoch": 2.4805369127516776,
+      "grad_norm": 1.2852176427841187,
+      "learning_rate": 2.422759076609597e-06,
+      "loss": 0.266,
+      "step": 925
+    },
+    {
+      "epoch": 2.493959731543624,
+      "grad_norm": 1.2244685888290405,
+      "learning_rate": 2.303350482320592e-06,
+      "loss": 0.2866,
+      "step": 930
+    },
+    {
+      "epoch": 2.5073825503355707,
+      "grad_norm": 1.1460144519805908,
+      "learning_rate": 2.1867142788611937e-06,
+      "loss": 0.2525,
+      "step": 935
+    },
+    {
+      "epoch": 2.5208053691275167,
+      "grad_norm": 1.1944162845611572,
+      "learning_rate": 2.0728759344575272e-06,
+      "loss": 0.2659,
+      "step": 940
+    },
+    {
+      "epoch": 2.5342281879194632,
+      "grad_norm": 1.2347413301467896,
+      "learning_rate": 1.961860306406133e-06,
+      "loss": 0.2876,
+      "step": 945
+    },
+    {
+      "epoch": 2.5476510067114093,
+      "grad_norm": 1.1762523651123047,
+      "learning_rate": 1.8536916356462158e-06,
+      "loss": 0.2565,
+      "step": 950
+    },
+    {
+      "epoch": 2.561073825503356,
+      "grad_norm": 1.3083927631378174,
+      "learning_rate": 1.748393541466507e-06,
+      "loss": 0.295,
+      "step": 955
+    },
+    {
+      "epoch": 2.574496644295302,
+      "grad_norm": 1.20221745967865,
+      "learning_rate": 1.6459890163478391e-06,
+      "loss": 0.2945,
+      "step": 960
+    },
+    {
+      "epoch": 2.5879194630872484,
+      "grad_norm": 1.1404248476028442,
+      "learning_rate": 1.5465004209426053e-06,
+      "loss": 0.2678,
+      "step": 965
+    },
+    {
+      "epoch": 2.6013422818791945,
+      "grad_norm": 1.1557812690734863,
+      "learning_rate": 1.4499494791921563e-06,
+      "loss": 0.246,
+      "step": 970
+    },
+    {
+      "epoch": 2.614765100671141,
+      "grad_norm": 1.4273406267166138,
+      "learning_rate": 1.3563572735832668e-06,
+      "loss": 0.2618,
+      "step": 975
+    },
+    {
+      "epoch": 2.6281879194630875,
+      "grad_norm": 1.2271504402160645,
+      "learning_rate": 1.2657442405446345e-06,
+      "loss": 0.2766,
+      "step": 980
+    },
+    {
+      "epoch": 2.6416107382550336,
+      "grad_norm": 1.3696398735046387,
+      "learning_rate": 1.178130165984458e-06,
+      "loss": 0.2756,
+      "step": 985
+    },
+    {
+      "epoch": 2.6550335570469796,
+      "grad_norm": 1.165326476097107,
+      "learning_rate": 1.093534180970074e-06,
+      "loss": 0.2577,
+      "step": 990
+    },
+    {
+      "epoch": 2.668456375838926,
+      "grad_norm": 1.1621224880218506,
+      "learning_rate": 1.0119747575505695e-06,
+      "loss": 0.2703,
+      "step": 995
+    },
+    {
+      "epoch": 2.6818791946308727,
+      "grad_norm": 1.2746843099594116,
+      "learning_rate": 9.334697047232849e-07,
+      "loss": 0.271,
+      "step": 1000
+    },
+    {
+      "epoch": 2.6953020134228187,
+      "grad_norm": 1.3766860961914062,
+      "learning_rate": 8.580361645451257e-07,
+      "loss": 0.2704,
+      "step": 1005
+    },
+    {
+      "epoch": 2.7087248322147652,
+      "grad_norm": 1.2429687976837158,
+      "learning_rate": 7.856906083894871e-07,
+      "loss": 0.278,
+      "step": 1010
+    },
+    {
+      "epoch": 2.7221476510067113,
+      "grad_norm": 1.3704997301101685,
+      "learning_rate": 7.16448833349621e-07,
+      "loss": 0.2922,
+      "step": 1015
+    },
+    {
+      "epoch": 2.735570469798658,
+      "grad_norm": 1.2053768634796143,
+      "learning_rate": 6.503259587892535e-07,
+      "loss": 0.3096,
+      "step": 1020
+    },
+    {
+      "epoch": 2.748993288590604,
+      "grad_norm": 1.2349939346313477,
+      "learning_rate": 5.873364230411754e-07,
+      "loss": 0.3126,
+      "step": 1025
+    },
+    {
+      "epoch": 2.7624161073825504,
+      "grad_norm": 1.1747756004333496,
+      "learning_rate": 5.274939802545415e-07,
+      "loss": 0.2868,
+      "step": 1030
+    },
+    {
+      "epoch": 2.7758389261744965,
+      "grad_norm": 1.2930761575698853,
+      "learning_rate": 4.7081169739157717e-07,
+      "loss": 0.2636,
+      "step": 1035
+    },
+    {
+      "epoch": 2.789261744966443,
+      "grad_norm": 1.2722703218460083,
+      "learning_rate": 4.173019513743198e-07,
+      "loss": 0.2855,
+      "step": 1040
+    },
+    {
+      "epoch": 2.8026845637583895,
+      "grad_norm": 1.2160191535949707,
+      "learning_rate": 3.6697642638204354e-07,
+      "loss": 0.3027,
+      "step": 1045
+    },
+    {
+      "epoch": 2.8161073825503355,
+      "grad_norm": 1.281112790107727,
+      "learning_rate": 3.198461112999468e-07,
+      "loss": 0.2749,
+      "step": 1050
+    },
+    {
+      "epoch": 2.8295302013422816,
+      "grad_norm": 1.319238305091858,
+      "learning_rate": 2.7592129731967176e-07,
+      "loss": 0.2721,
+      "step": 1055
+    },
+    {
+      "epoch": 2.842953020134228,
+      "grad_norm": 1.1398475170135498,
+      "learning_rate": 2.3521157569214024e-07,
+      "loss": 0.2741,
+      "step": 1060
+    },
+    {
+      "epoch": 2.8563758389261746,
+      "grad_norm": 1.2062604427337646,
+      "learning_rate": 1.9772583563326729e-07,
+      "loss": 0.2914,
+      "step": 1065
+    },
+    {
+      "epoch": 2.8697986577181207,
+      "grad_norm": 1.1941601037979126,
+      "learning_rate": 1.6347226238293377e-07,
+      "loss": 0.264,
+      "step": 1070
+    },
+    {
+      "epoch": 2.883221476510067,
+      "grad_norm": 1.2056347131729126,
+      "learning_rate": 1.3245833541768947e-07,
+      "loss": 0.2621,
+      "step": 1075
+    },
+    {
+      "epoch": 2.8966442953020133,
+      "grad_norm": 1.2341067790985107,
+      "learning_rate": 1.0469082681757114e-07,
+      "loss": 0.3256,
+      "step": 1080
+    },
+    {
+      "epoch": 2.91006711409396,
+      "grad_norm": 1.2186816930770874,
+      "learning_rate": 8.0175799787372e-08,
+      "loss": 0.2971,
+      "step": 1085
+    },
+    {
+      "epoch": 2.9234899328859063,
+      "grad_norm": 1.2171205282211304,
+      "learning_rate": 5.891860733270249e-08,
+      "loss": 0.2766,
+      "step": 1090
+    },
+    {
+      "epoch": 2.9369127516778524,
+      "grad_norm": 1.2805997133255005,
+      "learning_rate": 4.092389109113526e-08,
+      "loss": 0.2823,
+      "step": 1095
+    },
+    {
+      "epoch": 2.9503355704697984,
+      "grad_norm": 1.32804274559021,
+      "learning_rate": 2.6195580318667533e-08,
+      "loss": 0.3115,
+      "step": 1100
+    },
+    {
+      "epoch": 2.963758389261745,
+      "grad_norm": 1.1139308214187622,
+      "learning_rate": 1.4736891031752464e-08,
+      "loss": 0.3099,
+      "step": 1105
+    },
+    {
+      "epoch": 2.9771812080536915,
+      "grad_norm": 1.4985675811767578,
+      "learning_rate": 6.550325305054172e-09,
+      "loss": 0.2728,
+      "step": 1110
+    },
+    {
+      "epoch": 2.9906040268456375,
+      "grad_norm": 1.1384669542312622,
+      "learning_rate": 1.6376707251097855e-09,
+      "loss": 0.2683,
+      "step": 1115
+    },
+    {
+      "epoch": 3.0,
+      "step": 1119,
+      "total_flos": 1.489123253103362e+18,
+      "train_loss": 0.5669498731239871,
+      "train_runtime": 689.3067,
+      "train_samples_per_second": 51.822,
+      "train_steps_per_second": 1.623
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1119,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 2000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.489123253103362e+18,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

15_128_e3_3e-5/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4e8d42762062aea0f58961ec368193103fbe93728b30aa3e942f0cc8c00b2446
+size 8145

15_128_e3_3e-5/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff