Upload folder using huggingface_hub

Browse files

Files changed (13) hide show

29_128_e3_3e-5/README.md +63 -0
29_128_e3_3e-5/adapter_config.json +39 -0
29_128_e3_3e-5/adapter_model.safetensors +3 -0
29_128_e3_3e-5/all_results.json +9 -0
29_128_e3_3e-5/config.json +32 -0
29_128_e3_3e-5/merges.txt +0 -0
29_128_e3_3e-5/special_tokens_map.json +45 -0
29_128_e3_3e-5/tokenizer.json +0 -0
29_128_e3_3e-5/tokenizer_config.json +188 -0
29_128_e3_3e-5/train_results.json +9 -0
29_128_e3_3e-5/trainer_state.json +1723 -0
29_128_e3_3e-5/training_args.bin +3 -0
29_128_e3_3e-5/vocab.json +0 -0

29_128_e3_3e-5/README.md ADDED Viewed

	@@ -0,0 +1,63 @@

+---
+library_name: peft
+license: apache-2.0
+base_model: ibm-granite/granite-3.3-8b-base
+tags:
+- alignment-handbook
+- generated_from_trainer
+datasets:
+- data/knowledge_lora_training_data_2000
+model-index:
+- name: 29_128_e3_3e-5
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# 29_128_e3_3e-5
+This model is a fine-tuned version of [ibm-granite/granite-3.3-8b-base](https://huggingface.co/ibm-granite/granite-3.3-8b-base) on the data/knowledge_lora_training_data_2000 dataset.
+## Model description
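+A LoRA adapter (PEFT) for the causal language model [ibm-granite/granite-3.3-8b-base](https://huggingface.co/ibm-granite/granite-3.3-8b-base). The adapter uses rank 128, alpha 256, and dropout 0.05, and is applied to the attention projections (`q_proj`, `k_proj`, `v_proj`, `o_proj`) and the MLP projections (`gate_proj`, `up_proj`, `down_proj`).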
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 3e-05
+- train_batch_size: 2
+- eval_batch_size: 8
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 8
+- gradient_accumulation_steps: 2
+- total_train_batch_size: 32
+- total_eval_batch_size: 64
+- optimizer: adamw_torch with betas=(0.9, 0.95) and epsilon=1e-08 (no additional optimizer arguments)
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.05
+- num_epochs: 3.0
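+### Training results
+Training ran for 3.0 epochs (1,200 optimizer steps) and finished with an average training loss of approximately 0.5655. The accompanying `all_results.json` reports only training metrics; no evaluation metrics were recorded.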
+### Framework versions
+- PEFT 0.15.2
+- Transformers 4.52.4
+- Pytorch 2.7.0+cu126
+- Datasets 3.6.0
+- Tokenizers 0.21.2

29_128_e3_3e-5/adapter_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "ibm-granite/granite-3.3-8b-base",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "o_proj",
+    "v_proj",
+    "gate_proj",
+    "up_proj",
+    "down_proj",
+    "k_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}

29_128_e3_3e-5/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8da9d5bf0be63ed20777c8fa09b03bf962da22a3f40572b4b455765be70603ec
+size 791751704

29_128_e3_3e-5/all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 1.64227002403704e+18,
+    "train_loss": 0.565493812263012,
+    "train_runtime": 754.0979,
+    "train_samples": 12789,
+    "train_samples_per_second": 50.878,
+    "train_steps_per_second": 1.591
+}

29_128_e3_3e-5/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "architectures": [
+    "GraniteForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0,
+  "attention_multiplier": 0.0078125,
+  "bos_token_id": 0,
+  "embedding_multiplier": 12.0,
+  "eos_token_id": 0,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12800,
+  "logits_scaling": 16.0,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "granite",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "pad_token_id": 0,
+  "residual_multiplier": 0.22,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.4",
+  "use_cache": true,
+  "vocab_size": 49152
+}

29_128_e3_3e-5/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

29_128_e3_3e-5/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<fim_prefix>",
+    "<fim_middle>",
+    "<fim_suffix>",
+    "<fim_pad>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<empty_output>",
+    "<commit_before>",
+    "<commit_msg>",
+    "<commit_after>",
+    "<reponame>"
+  ],
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<reponame>",
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

29_128_e3_3e-5/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

29_128_e3_3e-5/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,188 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<fim_prefix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<fim_middle>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<fim_suffix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<fim_pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<filename>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<gh_stars>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<commit_before>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<commit_msg>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "17": {
+      "content": "<commit_after>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "18": {
+      "content": "<reponame>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<fim_prefix>",
+    "<fim_middle>",
+    "<fim_suffix>",
+    "<fim_pad>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<empty_output>",
+    "<commit_before>",
+    "<commit_msg>",
+    "<commit_after>",
+    "<reponame>"
+  ],
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "pad_token": "<reponame>",
+  "padding_side": "left",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>",
+  "vocab_size": 49152
+}

29_128_e3_3e-5/train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 1.64227002403704e+18,
+    "train_loss": 0.565493812263012,
+    "train_runtime": 754.0979,
+    "train_samples": 12789,
+    "train_samples_per_second": 50.878,
+    "train_steps_per_second": 1.591
+}

29_128_e3_3e-5/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1723 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 1200,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0125,
+      "grad_norm": 1.096501350402832,
+      "learning_rate": 2e-06,
+      "loss": 1.3059,
+      "step": 5
+    },
+    {
+      "epoch": 0.025,
+      "grad_norm": 0.8720690608024597,
+      "learning_rate": 4.5e-06,
+      "loss": 1.3236,
+      "step": 10
+    },
+    {
+      "epoch": 0.0375,
+      "grad_norm": 0.6325503587722778,
+      "learning_rate": 7e-06,
+      "loss": 1.3225,
+      "step": 15
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.6217526793479919,
+      "learning_rate": 9.5e-06,
+      "loss": 1.2885,
+      "step": 20
+    },
+    {
+      "epoch": 0.0625,
+      "grad_norm": 0.47595125436782837,
+      "learning_rate": 1.2e-05,
+      "loss": 1.316,
+      "step": 25
+    },
+    {
+      "epoch": 0.075,
+      "grad_norm": 0.5025820136070251,
+      "learning_rate": 1.45e-05,
+      "loss": 1.2431,
+      "step": 30
+    },
+    {
+      "epoch": 0.0875,
+      "grad_norm": 0.5048433542251587,
+      "learning_rate": 1.7e-05,
+      "loss": 1.2902,
+      "step": 35
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.5279613137245178,
+      "learning_rate": 1.95e-05,
+      "loss": 1.2261,
+      "step": 40
+    },
+    {
+      "epoch": 0.1125,
+      "grad_norm": 1.3911240100860596,
+      "learning_rate": 2.2e-05,
+      "loss": 1.2389,
+      "step": 45
+    },
+    {
+      "epoch": 0.125,
+      "grad_norm": 0.45133835077285767,
+      "learning_rate": 2.45e-05,
+      "loss": 1.1965,
+      "step": 50
+    },
+    {
+      "epoch": 0.1375,
+      "grad_norm": 0.4962674677371979,
+      "learning_rate": 2.7000000000000002e-05,
+      "loss": 1.2192,
+      "step": 55
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.48562711477279663,
+      "learning_rate": 2.95e-05,
+      "loss": 1.1988,
+      "step": 60
+    },
+    {
+      "epoch": 0.1625,
+      "grad_norm": 0.4591135084629059,
+      "learning_rate": 2.9999088688415145e-05,
+      "loss": 1.1593,
+      "step": 65
+    },
+    {
+      "epoch": 0.175,
+      "grad_norm": 0.4925994575023651,
+      "learning_rate": 2.9995386674880524e-05,
+      "loss": 1.1978,
+      "step": 70
+    },
+    {
+      "epoch": 0.1875,
+      "grad_norm": 0.5008227229118347,
+      "learning_rate": 2.9988837704729014e-05,
+      "loss": 1.1277,
+      "step": 75
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5128725171089172,
+      "learning_rate": 2.9979443021318607e-05,
+      "loss": 1.1022,
+      "step": 80
+    },
+    {
+      "epoch": 0.2125,
+      "grad_norm": 0.49049896001815796,
+      "learning_rate": 2.9967204408281618e-05,
+      "loss": 1.1422,
+      "step": 85
+    },
+    {
+      "epoch": 0.225,
+      "grad_norm": 0.647089958190918,
+      "learning_rate": 2.9952124189186028e-05,
+      "loss": 1.1323,
+      "step": 90
+    },
+    {
+      "epoch": 0.2375,
+      "grad_norm": 0.5251462459564209,
+      "learning_rate": 2.9934205227094347e-05,
+      "loss": 1.1207,
+      "step": 95
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.5728532075881958,
+      "learning_rate": 2.9913450924020067e-05,
+      "loss": 1.0544,
+      "step": 100
+    },
+    {
+      "epoch": 0.2625,
+      "grad_norm": 0.5652995109558105,
+      "learning_rate": 2.9889865220281747e-05,
+      "loss": 1.1441,
+      "step": 105
+    },
+    {
+      "epoch": 0.275,
+      "grad_norm": 0.6100318431854248,
+      "learning_rate": 2.9863452593754943e-05,
+      "loss": 1.0696,
+      "step": 110
+    },
+    {
+      "epoch": 0.2875,
+      "grad_norm": 0.5077236890792847,
+      "learning_rate": 2.9834218059022027e-05,
+      "loss": 1.074,
+      "step": 115
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.5974183678627014,
+      "learning_rate": 2.9802167166420182e-05,
+      "loss": 1.0579,
+      "step": 120
+    },
+    {
+      "epoch": 0.3125,
+      "grad_norm": 0.5726377367973328,
+      "learning_rate": 2.976730600098761e-05,
+      "loss": 1.0062,
+      "step": 125
+    },
+    {
+      "epoch": 0.325,
+      "grad_norm": 0.5457852482795715,
+      "learning_rate": 2.972964118130826e-05,
+      "loss": 1.0449,
+      "step": 130
+    },
+    {
+      "epoch": 0.3375,
+      "grad_norm": 0.633044958114624,
+      "learning_rate": 2.9689179858255252e-05,
+      "loss": 1.0299,
+      "step": 135
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.7141392827033997,
+      "learning_rate": 2.964592971363327e-05,
+      "loss": 1.02,
+      "step": 140
+    },
+    {
+      "epoch": 0.3625,
+      "grad_norm": 0.675027072429657,
+      "learning_rate": 2.9599898958720088e-05,
+      "loss": 1.0242,
+      "step": 145
+    },
+    {
+      "epoch": 0.375,
+      "grad_norm": 0.6282708644866943,
+      "learning_rate": 2.955109633270764e-05,
+      "loss": 0.9516,
+      "step": 150
+    },
+    {
+      "epoch": 0.3875,
+      "grad_norm": 0.784489631652832,
+      "learning_rate": 2.9499531101042834e-05,
+      "loss": 0.9881,
+      "step": 155
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.6914710998535156,
+      "learning_rate": 2.9445213053668444e-05,
+      "loss": 0.9733,
+      "step": 160
+    },
+    {
+      "epoch": 0.4125,
+      "grad_norm": 0.687872052192688,
+      "learning_rate": 2.938815250316445e-05,
+      "loss": 0.8953,
+      "step": 165
+    },
+    {
+      "epoch": 0.425,
+      "grad_norm": 0.83106929063797,
+      "learning_rate": 2.932836028279013e-05,
+      "loss": 0.9592,
+      "step": 170
+    },
+    {
+      "epoch": 0.4375,
+      "grad_norm": 0.6633298993110657,
+      "learning_rate": 2.9265847744427305e-05,
+      "loss": 0.9416,
+      "step": 175
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.7028163075447083,
+      "learning_rate": 2.9200626756425132e-05,
+      "loss": 0.9324,
+      "step": 180
+    },
+    {
+      "epoch": 0.4625,
+      "grad_norm": 0.7386612296104431,
+      "learning_rate": 2.9132709701346815e-05,
+      "loss": 0.9592,
+      "step": 185
+    },
+    {
+      "epoch": 0.475,
+      "grad_norm": 0.6585894823074341,
+      "learning_rate": 2.9062109473618732e-05,
+      "loss": 0.955,
+      "step": 190
+    },
+    {
+      "epoch": 0.4875,
+      "grad_norm": 0.7104294896125793,
+      "learning_rate": 2.898883947708233e-05,
+      "loss": 0.9462,
+      "step": 195
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.7890173196792603,
+      "learning_rate": 2.8912913622449335e-05,
+      "loss": 0.9005,
+      "step": 200
+    },
+    {
+      "epoch": 0.5125,
+      "grad_norm": 0.7631654143333435,
+      "learning_rate": 2.883434632466077e-05,
+      "loss": 0.9035,
+      "step": 205
+    },
+    {
+      "epoch": 0.525,
+      "grad_norm": 0.7017130851745605,
+      "learning_rate": 2.875315250015014e-05,
+      "loss": 0.8929,
+      "step": 210
+    },
+    {
+      "epoch": 0.5375,
+      "grad_norm": 0.8477493524551392,
+      "learning_rate": 2.8669347564011493e-05,
+      "loss": 0.8661,
+      "step": 215
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.8296931385993958,
+      "learning_rate": 2.8582947427072784e-05,
+      "loss": 0.859,
+      "step": 220
+    },
+    {
+      "epoch": 0.5625,
+      "grad_norm": 0.7606105804443359,
+      "learning_rate": 2.8493968492875104e-05,
+      "loss": 0.878,
+      "step": 225
+    },
+    {
+      "epoch": 0.575,
+      "grad_norm": 0.7497696280479431,
+      "learning_rate": 2.84024276545584e-05,
+      "loss": 0.9029,
+      "step": 230
+    },
+    {
+      "epoch": 0.5875,
+      "grad_norm": 0.8201003670692444,
+      "learning_rate": 2.830834229165418e-05,
+      "loss": 0.8247,
+      "step": 235
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.7867859601974487,
+      "learning_rate": 2.821173026678595e-05,
+      "loss": 0.8574,
+      "step": 240
+    },
+    {
+      "epoch": 0.6125,
+      "grad_norm": 0.8410252332687378,
+      "learning_rate": 2.8112609922277886e-05,
+      "loss": 0.85,
+      "step": 245
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.8968583345413208,
+      "learning_rate": 2.801100007667243e-05,
+      "loss": 0.8596,
+      "step": 250
+    },
+    {
+      "epoch": 0.6375,
+      "grad_norm": 0.8730499744415283,
+      "learning_rate": 2.7906920021157508e-05,
+      "loss": 0.8076,
+      "step": 255
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.942690908908844,
+      "learning_rate": 2.780038951590397e-05,
+      "loss": 0.8235,
+      "step": 260
+    },
+    {
+      "epoch": 0.6625,
+      "grad_norm": 0.9312007427215576,
+      "learning_rate": 2.769142878631403e-05,
+      "loss": 0.8294,
+      "step": 265
+    },
+    {
+      "epoch": 0.675,
+      "grad_norm": 0.9603184461593628,
+      "learning_rate": 2.7580058519181363e-05,
+      "loss": 0.7691,
+      "step": 270
+    },
+    {
+      "epoch": 0.6875,
+      "grad_norm": 0.9270436763763428,
+      "learning_rate": 2.7466299858763573e-05,
+      "loss": 0.7912,
+      "step": 275
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.9059388637542725,
+      "learning_rate": 2.7350174402767887e-05,
+      "loss": 0.8148,
+      "step": 280
+    },
+    {
+      "epoch": 0.7125,
+      "grad_norm": 0.8338863849639893,
+      "learning_rate": 2.723170419825067e-05,
+      "loss": 0.8079,
+      "step": 285
+    },
+    {
+      "epoch": 0.725,
+      "grad_norm": 0.9500786066055298,
+      "learning_rate": 2.7110911737431697e-05,
+      "loss": 0.7996,
+      "step": 290
+    },
+    {
+      "epoch": 0.7375,
+      "grad_norm": 0.951601505279541,
+      "learning_rate": 2.698781995342387e-05,
+      "loss": 0.8073,
+      "step": 295
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.9220702052116394,
+      "learning_rate": 2.686245221587924e-05,
+      "loss": 0.7983,
+      "step": 300
+    },
+    {
+      "epoch": 0.7625,
+      "grad_norm": 0.9541069865226746,
+      "learning_rate": 2.6734832326552148e-05,
+      "loss": 0.7416,
+      "step": 305
+    },
+    {
+      "epoch": 0.775,
+      "grad_norm": 0.9559035897254944,
+      "learning_rate": 2.6604984514780343e-05,
+      "loss": 0.8257,
+      "step": 310
+    },
+    {
+      "epoch": 0.7875,
+      "grad_norm": 1.0292361974716187,
+      "learning_rate": 2.6472933432884897e-05,
+      "loss": 0.7343,
+      "step": 315
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.9453174471855164,
+      "learning_rate": 2.6338704151489827e-05,
+      "loss": 0.7561,
+      "step": 320
+    },
+    {
+      "epoch": 0.8125,
+      "grad_norm": 0.9862915873527527,
+      "learning_rate": 2.620232215476231e-05,
+      "loss": 0.7356,
+      "step": 325
+    },
+    {
+      "epoch": 0.825,
+      "grad_norm": 0.9760687351226807,
+      "learning_rate": 2.6063813335574357e-05,
+      "loss": 0.7632,
+      "step": 330
+    },
+    {
+      "epoch": 0.8375,
+      "grad_norm": 0.9775648713111877,
+      "learning_rate": 2.5923203990586933e-05,
+      "loss": 0.73,
+      "step": 335
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.1117346286773682,
+      "learning_rate": 2.578052081525736e-05,
+      "loss": 0.7226,
+      "step": 340
+    },
+    {
+      "epoch": 0.8625,
+      "grad_norm": 0.9384631514549255,
+      "learning_rate": 2.563579089877106e-05,
+      "loss": 0.6991,
+      "step": 345
+    },
+    {
+      "epoch": 0.875,
+      "grad_norm": 1.0402817726135254,
+      "learning_rate": 2.548904171889852e-05,
+      "loss": 0.712,
+      "step": 350
+    },
+    {
+      "epoch": 0.8875,
+      "grad_norm": 0.9653849005699158,
+      "learning_rate": 2.534030113677849e-05,
+      "loss": 0.7059,
+      "step": 355
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.017627239227295,
+      "learning_rate": 2.518959739162837e-05,
+      "loss": 0.6552,
+      "step": 360
+    },
+    {
+      "epoch": 0.9125,
+      "grad_norm": 0.8845812082290649,
+      "learning_rate": 2.5036959095382875e-05,
+      "loss": 0.723,
+      "step": 365
+    },
+    {
+      "epoch": 0.925,
+      "grad_norm": 1.0637366771697998,
+      "learning_rate": 2.488241522726187e-05,
+      "loss": 0.6525,
+      "step": 370
+    },
+    {
+      "epoch": 0.9375,
+      "grad_norm": 0.9697546362876892,
+      "learning_rate": 2.4725995128268523e-05,
+      "loss": 0.6579,
+      "step": 375
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.962817907333374,
+      "learning_rate": 2.4567728495618763e-05,
+      "loss": 0.6756,
+      "step": 380
+    },
+    {
+      "epoch": 0.9625,
+      "grad_norm": 1.0425231456756592,
+      "learning_rate": 2.4407645377103056e-05,
+      "loss": 0.7168,
+      "step": 385
+    },
+    {
+      "epoch": 0.975,
+      "grad_norm": 0.9876998066902161,
+      "learning_rate": 2.424577616538173e-05,
+      "loss": 0.6452,
+      "step": 390
+    },
+    {
+      "epoch": 0.9875,
+      "grad_norm": 0.9951984286308289,
+      "learning_rate": 2.4082151592214717e-05,
+      "loss": 0.6772,
+      "step": 395
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.9820576906204224,
+      "learning_rate": 2.3916802722626972e-05,
+      "loss": 0.6617,
+      "step": 400
+    },
+    {
+      "epoch": 1.0125,
+      "grad_norm": 0.9402483105659485,
+      "learning_rate": 2.37497609490106e-05,
+      "loss": 0.5926,
+      "step": 405
+    },
+    {
+      "epoch": 1.025,
+      "grad_norm": 1.0088366270065308,
+      "learning_rate": 2.3581057985164857e-05,
+      "loss": 0.5814,
+      "step": 410
+    },
+    {
+      "epoch": 1.0375,
+      "grad_norm": 1.1835291385650635,
+      "learning_rate": 2.3410725860275092e-05,
+      "loss": 0.5888,
+      "step": 415
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 1.0397685766220093,
+      "learning_rate": 2.323879691283184e-05,
+      "loss": 0.6096,
+      "step": 420
+    },
+    {
+      "epoch": 1.0625,
+      "grad_norm": 1.2611539363861084,
+      "learning_rate": 2.3065303784491174e-05,
+      "loss": 0.5664,
+      "step": 425
+    },
+    {
+      "epoch": 1.075,
+      "grad_norm": 1.0762382745742798,
+      "learning_rate": 2.2890279413877512e-05,
+      "loss": 0.5758,
+      "step": 430
+    },
+    {
+      "epoch": 1.0875,
+      "grad_norm": 1.06105375289917,
+      "learning_rate": 2.2713757030330046e-05,
+      "loss": 0.554,
+      "step": 435
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 1.3117653131484985,
+      "learning_rate": 2.253577014759395e-05,
+      "loss": 0.5716,
+      "step": 440
+    },
+    {
+      "epoch": 1.1125,
+      "grad_norm": 1.1829787492752075,
+      "learning_rate": 2.2356352557457624e-05,
+      "loss": 0.5253,
+      "step": 445
+    },
+    {
+      "epoch": 1.125,
+      "grad_norm": 1.2821694612503052,
+      "learning_rate": 2.217553832333714e-05,
+      "loss": 0.5658,
+      "step": 450
+    },
+    {
+      "epoch": 1.1375,
+      "grad_norm": 1.1681571006774902,
+      "learning_rate": 2.1993361773809102e-05,
+      "loss": 0.5154,
+      "step": 455
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 1.218019723892212,
+      "learning_rate": 2.1809857496093203e-05,
+      "loss": 0.5681,
+      "step": 460
+    },
+    {
+      "epoch": 1.1625,
+      "grad_norm": 1.1740199327468872,
+      "learning_rate": 2.162506032948561e-05,
+      "loss": 0.4932,
+      "step": 465
+    },
+    {
+      "epoch": 1.175,
+      "grad_norm": 1.0119123458862305,
+      "learning_rate": 2.143900535874457e-05,
+      "loss": 0.5498,
+      "step": 470
+    },
+    {
+      "epoch": 1.1875,
+      "grad_norm": 1.17697012424469,
+      "learning_rate": 2.1251727907429357e-05,
+      "loss": 0.5778,
+      "step": 475
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.0151280164718628,
+      "learning_rate": 2.1063263531193905e-05,
+      "loss": 0.5157,
+      "step": 480
+    },
+    {
+      "epoch": 1.2125,
+      "grad_norm": 1.1586745977401733,
+      "learning_rate": 2.0873648011036373e-05,
+      "loss": 0.5108,
+      "step": 485
+    },
+    {
+      "epoch": 1.225,
+      "grad_norm": 1.2907254695892334,
+      "learning_rate": 2.0682917346505917e-05,
+      "loss": 0.5131,
+      "step": 490
+    },
+    {
+      "epoch": 1.2375,
+      "grad_norm": 1.1124744415283203,
+      "learning_rate": 2.0491107748867985e-05,
+      "loss": 0.5237,
+      "step": 495
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.2544411420822144,
+      "learning_rate": 2.0298255634229426e-05,
+      "loss": 0.5386,
+      "step": 500
+    },
+    {
+      "epoch": 1.2625,
+      "grad_norm": 1.0245000123977661,
+      "learning_rate": 2.0104397616624646e-05,
+      "loss": 0.5461,
+      "step": 505
+    },
+    {
+      "epoch": 1.275,
+      "grad_norm": 1.1500096321105957,
+      "learning_rate": 1.990957050106428e-05,
+      "loss": 0.5202,
+      "step": 510
+    },
+    {
+      "epoch": 1.2875,
+      "grad_norm": 0.9860815405845642,
+      "learning_rate": 1.9713811276547527e-05,
+      "loss": 0.505,
+      "step": 515
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 1.0547618865966797,
+      "learning_rate": 1.9517157109039614e-05,
+      "loss": 0.4947,
+      "step": 520
+    },
+    {
+      "epoch": 1.3125,
+      "grad_norm": 1.0779049396514893,
+      "learning_rate": 1.9319645334415594e-05,
+      "loss": 0.5056,
+      "step": 525
+    },
+    {
+      "epoch": 1.325,
+      "grad_norm": 1.2494122982025146,
+      "learning_rate": 1.9121313451371957e-05,
+      "loss": 0.5309,
+      "step": 530
+    },
+    {
+      "epoch": 1.3375,
+      "grad_norm": 1.2767302989959717,
+      "learning_rate": 1.8922199114307297e-05,
+      "loss": 0.5095,
+      "step": 535
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 1.0909416675567627,
+      "learning_rate": 1.872234012617339e-05,
+      "loss": 0.456,
+      "step": 540
+    },
+    {
+      "epoch": 1.3625,
+      "grad_norm": 1.008742332458496,
+      "learning_rate": 1.8521774431298116e-05,
+      "loss": 0.4669,
+      "step": 545
+    },
+    {
+      "epoch": 1.375,
+      "grad_norm": 1.1449681520462036,
+      "learning_rate": 1.8320540108181518e-05,
+      "loss": 0.5324,
+      "step": 550
+    },
+    {
+      "epoch": 1.3875,
+      "grad_norm": 1.0932765007019043,
+      "learning_rate": 1.8118675362266388e-05,
+      "loss": 0.4762,
+      "step": 555
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.0657200813293457,
+      "learning_rate": 1.791621851868476e-05,
+      "loss": 0.4846,
+      "step": 560
+    },
+    {
+      "epoch": 1.4125,
+      "grad_norm": 1.1053435802459717,
+      "learning_rate": 1.771320801498165e-05,
+      "loss": 0.5211,
+      "step": 565
+    },
+    {
+      "epoch": 1.425,
+      "grad_norm": 1.0770117044448853,
+      "learning_rate": 1.75096823938175e-05,
+      "loss": 0.5032,
+      "step": 570
+    },
+    {
+      "epoch": 1.4375,
+      "grad_norm": 1.2533434629440308,
+      "learning_rate": 1.7305680295650638e-05,
+      "loss": 0.4753,
+      "step": 575
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 1.1737210750579834,
+      "learning_rate": 1.710124045140117e-05,
+      "loss": 0.4317,
+      "step": 580
+    },
+    {
+      "epoch": 1.4625,
+      "grad_norm": 1.1556503772735596,
+      "learning_rate": 1.6896401675097696e-05,
+      "loss": 0.4803,
+      "step": 585
+    },
+    {
+      "epoch": 1.475,
+      "grad_norm": 1.074533224105835,
+      "learning_rate": 1.6691202856508267e-05,
+      "loss": 0.459,
+      "step": 590
+    },
+    {
+      "epoch": 1.4875,
+      "grad_norm": 1.2261135578155518,
+      "learning_rate": 1.6485682953756945e-05,
+      "loss": 0.4701,
+      "step": 595
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 1.1798652410507202,
+      "learning_rate": 1.627988098592737e-05,
+      "loss": 0.5234,
+      "step": 600
+    },
+    {
+      "epoch": 1.5125,
+      "grad_norm": 1.1400408744812012,
+      "learning_rate": 1.6073836025654805e-05,
+      "loss": 0.44,
+      "step": 605
+    },
+    {
+      "epoch": 1.525,
+      "grad_norm": 1.1641569137573242,
+      "learning_rate": 1.5867587191707962e-05,
+      "loss": 0.4535,
+      "step": 610
+    },
+    {
+      "epoch": 1.5375,
+      "grad_norm": 1.1707454919815063,
+      "learning_rate": 1.5661173641562085e-05,
+      "loss": 0.4404,
+      "step": 615
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 1.2437214851379395,
+      "learning_rate": 1.5454634563964686e-05,
+      "loss": 0.4599,
+      "step": 620
+    },
+    {
+      "epoch": 1.5625,
+      "grad_norm": 1.313765287399292,
+      "learning_rate": 1.524800917149538e-05,
+      "loss": 0.4303,
+      "step": 625
+    },
+    {
+      "epoch": 1.575,
+      "grad_norm": 1.2089906930923462,
+      "learning_rate": 1.5041336693121106e-05,
+      "loss": 0.4079,
+      "step": 630
+    },
+    {
+      "epoch": 1.5875,
+      "grad_norm": 1.2107923030853271,
+      "learning_rate": 1.4834656366748348e-05,
+      "loss": 0.4414,
+      "step": 635
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 1.0747815370559692,
+      "learning_rate": 1.462800743177355e-05,
+      "loss": 0.4232,
+      "step": 640
+    },
+    {
+      "epoch": 1.6125,
+      "grad_norm": 1.1553806066513062,
+      "learning_rate": 1.4421429121633342e-05,
+      "loss": 0.4199,
+      "step": 645
+    },
+    {
+      "epoch": 1.625,
+      "grad_norm": 1.0519508123397827,
+      "learning_rate": 1.4214960656355843e-05,
+      "loss": 0.4449,
+      "step": 650
+    },
+    {
+      "epoch": 1.6375,
+      "grad_norm": 1.206945776939392,
+      "learning_rate": 1.400864123511451e-05,
+      "loss": 0.4428,
+      "step": 655
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 1.1311391592025757,
+      "learning_rate": 1.3802510028785977e-05,
+      "loss": 0.4549,
+      "step": 660
+    },
+    {
+      "epoch": 1.6625,
+      "grad_norm": 1.1363489627838135,
+      "learning_rate": 1.3596606172513235e-05,
+      "loss": 0.4562,
+      "step": 665
+    },
+    {
+      "epoch": 1.675,
+      "grad_norm": 1.1651484966278076,
+      "learning_rate": 1.3390968758275628e-05,
+      "loss": 0.4394,
+      "step": 670
+    },
+    {
+      "epoch": 1.6875,
+      "grad_norm": 1.0412386655807495,
+      "learning_rate": 1.3185636827467006e-05,
+      "loss": 0.4657,
+      "step": 675
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 1.0228976011276245,
+      "learning_rate": 1.2980649363483511e-05,
+      "loss": 0.4393,
+      "step": 680
+    },
+    {
+      "epoch": 1.7125,
+      "grad_norm": 1.1123242378234863,
+      "learning_rate": 1.277604528432237e-05,
+      "loss": 0.4142,
+      "step": 685
+    },
+    {
+      "epoch": 1.725,
+      "grad_norm": 1.1547119617462158,
+      "learning_rate": 1.2571863435193088e-05,
+      "loss": 0.4662,
+      "step": 690
+    },
+    {
+      "epoch": 1.7375,
+      "grad_norm": 1.1411031484603882,
+      "learning_rate": 1.2368142581142479e-05,
+      "loss": 0.4317,
+      "step": 695
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 1.362444281578064,
+      "learning_rate": 1.2164921399694899e-05,
+      "loss": 0.4444,
+      "step": 700
+    },
+    {
+      "epoch": 1.7625,
+      "grad_norm": 1.3785865306854248,
+      "learning_rate": 1.1962238473509122e-05,
+      "loss": 0.4037,
+      "step": 705
+    },
+    {
+      "epoch": 1.775,
+      "grad_norm": 1.1370779275894165,
+      "learning_rate": 1.1760132283053199e-05,
+      "loss": 0.4188,
+      "step": 710
+    },
+    {
+      "epoch": 1.7875,
+      "grad_norm": 1.2741678953170776,
+      "learning_rate": 1.1558641199298728e-05,
+      "loss": 0.4074,
+      "step": 715
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 1.2674919366836548,
+      "learning_rate": 1.1357803476435919e-05,
+      "loss": 0.3899,
+      "step": 720
+    },
+    {
+      "epoch": 1.8125,
+      "grad_norm": 1.211328148841858,
+      "learning_rate": 1.1157657244610825e-05,
+      "loss": 0.3987,
+      "step": 725
+    },
+    {
+      "epoch": 1.825,
+      "grad_norm": 1.5080320835113525,
+      "learning_rate": 1.095824050268613e-05,
+      "loss": 0.4008,
+      "step": 730
+    },
+    {
+      "epoch": 1.8375,
+      "grad_norm": 1.1555554866790771,
+      "learning_rate": 1.0759591111026837e-05,
+      "loss": 0.4042,
+      "step": 735
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 1.4236955642700195,
+      "learning_rate": 1.0561746784312277e-05,
+      "loss": 0.4372,
+      "step": 740
+    },
+    {
+      "epoch": 1.8625,
+      "grad_norm": 1.066102147102356,
+      "learning_rate": 1.036474508437579e-05,
+      "loss": 0.3984,
+      "step": 745
+    },
+    {
+      "epoch": 1.875,
+      "grad_norm": 1.2895863056182861,
+      "learning_rate": 1.0168623413073387e-05,
+      "loss": 0.393,
+      "step": 750
+    },
+    {
+      "epoch": 1.8875,
+      "grad_norm": 1.2252490520477295,
+      "learning_rate": 9.973419005182792e-06,
+      "loss": 0.391,
+      "step": 755
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 1.3331528902053833,
+      "learning_rate": 9.779168921334232e-06,
+      "loss": 0.4127,
+      "step": 760
+    },
+    {
+      "epoch": 1.9125,
+      "grad_norm": 1.3168723583221436,
+      "learning_rate": 9.585910040974282e-06,
+      "loss": 0.3986,
+      "step": 765
+    },
+    {
+      "epoch": 1.925,
+      "grad_norm": 1.1572790145874023,
+      "learning_rate": 9.393679055364054e-06,
+      "loss": 0.3686,
+      "step": 770
+    },
+    {
+      "epoch": 1.9375,
+      "grad_norm": 1.1793782711029053,
+      "learning_rate": 9.20251246061322e-06,
+      "loss": 0.3819,
+      "step": 775
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 1.2285219430923462,
+      "learning_rate": 9.01244655075097e-06,
+      "loss": 0.4054,
+      "step": 780
+    },
+    {
+      "epoch": 1.9625,
+      "grad_norm": 1.2546799182891846,
+      "learning_rate": 8.823517410835427e-06,
+      "loss": 0.3962,
+      "step": 785
+    },
+    {
+      "epoch": 1.975,
+      "grad_norm": 1.2645176649093628,
+      "learning_rate": 8.635760910102697e-06,
+      "loss": 0.3819,
+      "step": 790
+    },
+    {
+      "epoch": 1.9875,
+      "grad_norm": 1.1737213134765625,
+      "learning_rate": 8.449212695156873e-06,
+      "loss": 0.3666,
+      "step": 795
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 1.5198034048080444,
+      "learning_rate": 8.263908183202348e-06,
+      "loss": 0.3473,
+      "step": 800
+    },
+    {
+      "epoch": 2.0125,
+      "grad_norm": 1.1322892904281616,
+      "learning_rate": 8.079882555319685e-06,
+      "loss": 0.3098,
+      "step": 805
+    },
+    {
+      "epoch": 2.025,
+      "grad_norm": 1.3172906637191772,
+      "learning_rate": 7.897170749786242e-06,
+      "loss": 0.3046,
+      "step": 810
+    },
+    {
+      "epoch": 2.0375,
+      "grad_norm": 1.1878982782363892,
+      "learning_rate": 7.715807455443015e-06,
+      "loss": 0.3438,
+      "step": 815
+    },
+    {
+      "epoch": 2.05,
+      "grad_norm": 1.3433789014816284,
+      "learning_rate": 7.5358271051086975e-06,
+      "loss": 0.2994,
+      "step": 820
+    },
+    {
+      "epoch": 2.0625,
+      "grad_norm": 1.3891838788986206,
+      "learning_rate": 7.357263869042479e-06,
+      "loss": 0.3286,
+      "step": 825
+    },
+    {
+      "epoch": 2.075,
+      "grad_norm": 1.303449034690857,
+      "learning_rate": 7.18015164845662e-06,
+      "loss": 0.3307,
+      "step": 830
+    },
+    {
+      "epoch": 2.0875,
+      "grad_norm": 1.2112329006195068,
+      "learning_rate": 7.0045240690800975e-06,
+      "loss": 0.3302,
+      "step": 835
+    },
+    {
+      "epoch": 2.1,
+      "grad_norm": 1.2413291931152344,
+      "learning_rate": 6.830414474774594e-06,
+      "loss": 0.3125,
+      "step": 840
+    },
+    {
+      "epoch": 2.1125,
+      "grad_norm": 1.4404971599578857,
+      "learning_rate": 6.657855921203991e-06,
+      "loss": 0.3063,
+      "step": 845
+    },
+    {
+      "epoch": 2.125,
+      "grad_norm": 1.0838373899459839,
+      "learning_rate": 6.4868811695585465e-06,
+      "loss": 0.327,
+      "step": 850
+    },
+    {
+      "epoch": 2.1375,
+      "grad_norm": 1.1617887020111084,
+      "learning_rate": 6.317522680335015e-06,
+      "loss": 0.2965,
+      "step": 855
+    },
+    {
+      "epoch": 2.15,
+      "grad_norm": 1.256178617477417,
+      "learning_rate": 6.149812607173844e-06,
+      "loss": 0.2983,
+      "step": 860
+    },
+    {
+      "epoch": 2.1625,
+      "grad_norm": 1.2694519758224487,
+      "learning_rate": 5.983782790754624e-06,
+      "loss": 0.2831,
+      "step": 865
+    },
+    {
+      "epoch": 2.175,
+      "grad_norm": 1.1719343662261963,
+      "learning_rate": 5.819464752750978e-06,
+      "loss": 0.2873,
+      "step": 870
+    },
+    {
+      "epoch": 2.1875,
+      "grad_norm": 1.22670578956604,
+      "learning_rate": 5.656889689845964e-06,
+      "loss": 0.2979,
+      "step": 875
+    },
+    {
+      "epoch": 2.2,
+      "grad_norm": 1.1985101699829102,
+      "learning_rate": 5.496088467809243e-06,
+      "loss": 0.3081,
+      "step": 880
+    },
+    {
+      "epoch": 2.2125,
+      "grad_norm": 1.339827060699463,
+      "learning_rate": 5.337091615637039e-06,
+      "loss": 0.2944,
+      "step": 885
+    },
+    {
+      "epoch": 2.225,
+      "grad_norm": 1.3325270414352417,
+      "learning_rate": 5.179929319756025e-06,
+      "loss": 0.2841,
+      "step": 890
+    },
+    {
+      "epoch": 2.2375,
+      "grad_norm": 1.170397162437439,
+      "learning_rate": 5.024631418292275e-06,
+      "loss": 0.3372,
+      "step": 895
+    },
+    {
+      "epoch": 2.25,
+      "grad_norm": 1.2334169149398804,
+      "learning_rate": 4.871227395406347e-06,
+      "loss": 0.2963,
+      "step": 900
+    },
+    {
+      "epoch": 2.2625,
+      "grad_norm": 1.1435058116912842,
+      "learning_rate": 4.719746375695505e-06,
+      "loss": 0.2856,
+      "step": 905
+    },
+    {
+      "epoch": 2.275,
+      "grad_norm": 1.2726664543151855,
+      "learning_rate": 4.570217118664321e-06,
+      "loss": 0.285,
+      "step": 910
+    },
+    {
+      "epoch": 2.2875,
+      "grad_norm": 1.1388657093048096,
+      "learning_rate": 4.422668013264467e-06,
+      "loss": 0.2911,
+      "step": 915
+    },
+    {
+      "epoch": 2.3,
+      "grad_norm": 1.2628424167633057,
+      "learning_rate": 4.277127072504945e-06,
+      "loss": 0.3115,
+      "step": 920
+    },
+    {
+      "epoch": 2.3125,
+      "grad_norm": 1.4541245698928833,
+      "learning_rate": 4.133621928133666e-06,
+      "loss": 0.2879,
+      "step": 925
+    },
+    {
+      "epoch": 2.325,
+      "grad_norm": 1.2590599060058594,
+      "learning_rate": 3.992179825391391e-06,
+      "loss": 0.2664,
+      "step": 930
+    },
+    {
+      "epoch": 2.3375,
+      "grad_norm": 1.1543575525283813,
+      "learning_rate": 3.852827617839085e-06,
+      "loss": 0.2709,
+      "step": 935
+    },
+    {
+      "epoch": 2.35,
+      "grad_norm": 1.2745298147201538,
+      "learning_rate": 3.715591762259633e-06,
+      "loss": 0.3077,
+      "step": 940
+    },
+    {
+      "epoch": 2.3625,
+      "grad_norm": 1.2039170265197754,
+      "learning_rate": 3.5804983136348313e-06,
+      "loss": 0.299,
+      "step": 945
+    },
+    {
+      "epoch": 2.375,
+      "grad_norm": 1.2109931707382202,
+      "learning_rate": 3.4475729201987693e-06,
+      "loss": 0.2897,
+      "step": 950
+    },
+    {
+      "epoch": 2.3875,
+      "grad_norm": 1.2224966287612915,
+      "learning_rate": 3.3168408185683153e-06,
+      "loss": 0.2615,
+      "step": 955
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 1.1242839097976685,
+      "learning_rate": 3.1883268289518308e-06,
+      "loss": 0.2877,
+      "step": 960
+    },
+    {
+      "epoch": 2.4125,
+      "grad_norm": 1.2263603210449219,
+      "learning_rate": 3.062055350436923e-06,
+      "loss": 0.2895,
+      "step": 965
+    },
+    {
+      "epoch": 2.425,
+      "grad_norm": 1.2495533227920532,
+      "learning_rate": 2.9380503563581295e-06,
+      "loss": 0.3306,
+      "step": 970
+    },
+    {
+      "epoch": 2.4375,
+      "grad_norm": 1.2463157176971436,
+      "learning_rate": 2.8163353897454474e-06,
+      "loss": 0.2893,
+      "step": 975
+    },
+    {
+      "epoch": 2.45,
+      "grad_norm": 1.2636357545852661,
+      "learning_rate": 2.696933558854589e-06,
+      "loss": 0.2875,
+      "step": 980
+    },
+    {
+      "epoch": 2.4625,
+      "grad_norm": 1.3296539783477783,
+      "learning_rate": 2.5798675327796994e-06,
+      "loss": 0.2906,
+      "step": 985
+    },
+    {
+      "epoch": 2.475,
+      "grad_norm": 1.4464213848114014,
+      "learning_rate": 2.465159537149563e-06,
+      "loss": 0.3093,
+      "step": 990
+    },
+    {
+      "epoch": 2.4875,
+      "grad_norm": 1.2355504035949707,
+      "learning_rate": 2.352831349907904e-06,
+      "loss": 0.2875,
+      "step": 995
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 1.3024587631225586,
+      "learning_rate": 2.2429042971787366e-06,
+      "loss": 0.2882,
+      "step": 1000
+    },
+    {
+      "epoch": 2.5125,
+      "grad_norm": 1.4043537378311157,
+      "learning_rate": 2.1353992492174944e-06,
+      "loss": 0.2705,
+      "step": 1005
+    },
+    {
+      "epoch": 2.525,
+      "grad_norm": 1.4674831628799438,
+      "learning_rate": 2.03033661644868e-06,
+      "loss": 0.2741,
+      "step": 1010
+    },
+    {
+      "epoch": 2.5375,
+      "grad_norm": 1.1488456726074219,
+      "learning_rate": 1.927736345590839e-06,
+      "loss": 0.3123,
+      "step": 1015
+    },
+    {
+      "epoch": 2.55,
+      "grad_norm": 1.2895066738128662,
+      "learning_rate": 1.8276179158695822e-06,
+      "loss": 0.2657,
+      "step": 1020
+    },
+    {
+      "epoch": 2.5625,
+      "grad_norm": 1.3622872829437256,
+      "learning_rate": 1.7300003353193117e-06,
+      "loss": 0.2768,
+      "step": 1025
+    },
+    {
+      "epoch": 2.575,
+      "grad_norm": 1.2856098413467407,
+      "learning_rate": 1.6349021371744833e-06,
+      "loss": 0.2654,
+      "step": 1030
+    },
+    {
+      "epoch": 2.5875,
+      "grad_norm": 1.2219773530960083,
+      "learning_rate": 1.5423413763509376e-06,
+      "loss": 0.2553,
+      "step": 1035
+    },
+    {
+      "epoch": 2.6,
+      "grad_norm": 1.32290780544281,
+      "learning_rate": 1.452335626018086e-06,
+      "loss": 0.2954,
+      "step": 1040
+    },
+    {
+      "epoch": 2.6125,
+      "grad_norm": 1.2187912464141846,
+      "learning_rate": 1.3649019742625624e-06,
+      "loss": 0.2776,
+      "step": 1045
+    },
+    {
+      "epoch": 2.625,
+      "grad_norm": 1.0754672288894653,
+      "learning_rate": 1.280057020843927e-06,
+      "loss": 0.3043,
+      "step": 1050
+    },
+    {
+      "epoch": 2.6375,
+      "grad_norm": 1.2335586547851562,
+      "learning_rate": 1.1978168740431311e-06,
+      "loss": 0.2971,
+      "step": 1055
+    },
+    {
+      "epoch": 2.65,
+      "grad_norm": 1.2342262268066406,
+      "learning_rate": 1.1181971476042736e-06,
+      "loss": 0.293,
+      "step": 1060
+    },
+    {
+      "epoch": 2.6625,
+      "grad_norm": 1.2400048971176147,
+      "learning_rate": 1.041212957770225e-06,
+      "loss": 0.2714,
+      "step": 1065
+    },
+    {
+      "epoch": 2.675,
+      "grad_norm": 1.2360520362854004,
+      "learning_rate": 9.668789204127454e-07,
+      "loss": 0.3051,
+      "step": 1070
+    },
+    {
+      "epoch": 2.6875,
+      "grad_norm": 1.135083556175232,
+      "learning_rate": 8.952091482575825e-07,
+      "loss": 0.2639,
+      "step": 1075
+    },
+    {
+      "epoch": 2.7,
+      "grad_norm": 1.126283884048462,
+      "learning_rate": 8.262172482050711e-07,
+      "loss": 0.2521,
+      "step": 1080
+    },
+    {
+      "epoch": 2.7125,
+      "grad_norm": 1.2299840450286865,
+      "learning_rate": 7.599163187468111e-07,
+      "loss": 0.2674,
+      "step": 1085
+    },
+    {
+      "epoch": 2.725,
+      "grad_norm": 1.176446795463562,
+      "learning_rate": 6.963189474788378e-07,
+      "loss": 0.2721,
+      "step": 1090
+    },
+    {
+      "epoch": 2.7375,
+      "grad_norm": 1.374687910079956,
+      "learning_rate": 6.354372087117927e-07,
+      "loss": 0.2971,
+      "step": 1095
+    },
+    {
+      "epoch": 2.75,
+      "grad_norm": 1.1717859506607056,
+      "learning_rate": 5.772826611785631e-07,
+      "loss": 0.3002,
+      "step": 1100
+    },
+    {
+      "epoch": 2.7625,
+      "grad_norm": 1.1102797985076904,
+      "learning_rate": 5.218663458397716e-07,
+      "loss": 0.2612,
+      "step": 1105
+    },
+    {
+      "epoch": 2.775,
+      "grad_norm": 1.257702350616455,
+      "learning_rate": 4.6919878378760885e-07,
+      "loss": 0.2504,
+      "step": 1110
+    },
+    {
+      "epoch": 2.7875,
+      "grad_norm": 1.372108817100525,
+      "learning_rate": 4.1928997424834525e-07,
+      "loss": 0.2392,
+      "step": 1115
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 1.1753861904144287,
+      "learning_rate": 3.721493926839042e-07,
+      "loss": 0.2725,
+      "step": 1120
+    },
+    {
+      "epoch": 2.8125,
+      "grad_norm": 1.2952775955200195,
+      "learning_rate": 3.277859889929147e-07,
+      "loss": 0.2951,
+      "step": 1125
+    },
+    {
+      "epoch": 2.825,
+      "grad_norm": 1.161136507987976,
+      "learning_rate": 2.862081858115151e-07,
+      "loss": 0.2452,
+      "step": 1130
+    },
+    {
+      "epoch": 2.8375,
+      "grad_norm": 1.0940366983413696,
+      "learning_rate": 2.474238769142645e-07,
+      "loss": 0.2863,
+      "step": 1135
+    },
+    {
+      "epoch": 2.85,
+      "grad_norm": 1.2279645204544067,
+      "learning_rate": 2.1144042571547618e-07,
+      "loss": 0.281,
+      "step": 1140
+    },
+    {
+      "epoch": 2.8625,
+      "grad_norm": 1.3091001510620117,
+      "learning_rate": 1.7826466387122597e-07,
+      "loss": 0.2737,
+      "step": 1145
+    },
+    {
+      "epoch": 2.875,
+      "grad_norm": 1.1495012044906616,
+      "learning_rate": 1.4790288998232316e-07,
+      "loss": 0.2485,
+      "step": 1150
+    },
+    {
+      "epoch": 2.8875,
+      "grad_norm": 1.218850016593933,
+      "learning_rate": 1.2036086839850025e-07,
+      "loss": 0.2892,
+      "step": 1155
+    },
+    {
+      "epoch": 2.9,
+      "grad_norm": 1.2607446908950806,
+      "learning_rate": 9.564382812399852e-08,
+      "loss": 0.2453,
+      "step": 1160
+    },
+    {
+      "epoch": 2.9125,
+      "grad_norm": 1.2456682920455933,
+      "learning_rate": 7.375646182482875e-08,
+      "loss": 0.2859,
+      "step": 1165
+    },
+    {
+      "epoch": 2.925,
+      "grad_norm": 1.2477439641952515,
+      "learning_rate": 5.470292493783891e-08,
+      "loss": 0.2686,
+      "step": 1170
+    },
+    {
+      "epoch": 2.9375,
+      "grad_norm": 1.1869429349899292,
+      "learning_rate": 3.848683488177851e-08,
+      "loss": 0.3132,
+      "step": 1175
+    },
+    {
+      "epoch": 2.95,
+      "grad_norm": 1.2138971090316772,
+      "learning_rate": 2.5111270370511417e-08,
+      "loss": 0.2843,
+      "step": 1180
+    },
+    {
+      "epoch": 2.9625,
+      "grad_norm": 1.2493516206741333,
+      "learning_rate": 1.4578770828511667e-08,
+      "loss": 0.3035,
+      "step": 1185
+    },
+    {
+      "epoch": 2.975,
+      "grad_norm": 1.255171298980713,
+      "learning_rate": 6.891335908732477e-09,
+      "loss": 0.2631,
+      "step": 1190
+    },
+    {
+      "epoch": 2.9875,
+      "grad_norm": 1.3329716920852661,
+      "learning_rate": 2.0504251129649378e-09,
+      "loss": 0.2917,
+      "step": 1195
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 1.5410183668136597,
+      "learning_rate": 5.695751474465993e-11,
+      "loss": 0.25,
+      "step": 1200
+    },
+    {
+      "epoch": 3.0,
+      "step": 1200,
+      "total_flos": 1.64227002403704e+18,
+      "train_loss": 0.565493812263012,
+      "train_runtime": 754.0979,
+      "train_samples_per_second": 50.878,
+      "train_steps_per_second": 1.591
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1200,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 2000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.64227002403704e+18,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

29_128_e3_3e-5/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b779de9e0c2c622ab93d15602e493780d3d693c9e13c98d311ea921dfcaee6db
+size 8145

29_128_e3_3e-5/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff