Upload folder using huggingface_hub
- 2_128_e3_3e-5/README.md +63 -0
- 2_128_e3_3e-5/adapter_config.json +39 -0
- 2_128_e3_3e-5/adapter_model.safetensors +3 -0
- 2_128_e3_3e-5/all_results.json +9 -0
- 2_128_e3_3e-5/config.json +32 -0
- 2_128_e3_3e-5/merges.txt +0 -0
- 2_128_e3_3e-5/special_tokens_map.json +45 -0
- 2_128_e3_3e-5/tokenizer.json +0 -0
- 2_128_e3_3e-5/tokenizer_config.json +188 -0
- 2_128_e3_3e-5/train_results.json +9 -0
- 2_128_e3_3e-5/trainer_state.json +1450 -0
- 2_128_e3_3e-5/training_args.bin +3 -0
- 2_128_e3_3e-5/vocab.json +0 -0
2_128_e3_3e-5/README.md
ADDED
@@ -0,0 +1,63 @@
---
library_name: peft
license: apache-2.0
base_model: ibm-granite/granite-3.3-8b-base
tags:
- alignment-handbook
- generated_from_trainer
datasets:
- data/knowledge_lora_training_data_2000
model-index:
- name: 2_128_e3_3e-5
  results: []
---

<!-- This model card has been generated automatically according to the information the Trainer had access to. You
should probably proofread and complete it, then remove this comment. -->

# 2_128_e3_3e-5

This model is a fine-tuned version of [ibm-granite/granite-3.3-8b-base](https://huggingface.co/ibm-granite/granite-3.3-8b-base) on the data/knowledge_lora_training_data_2000 dataset.

## Model description

More information needed

## Intended uses & limitations

More information needed

## Training and evaluation data

More information needed

## Training procedure

### Training hyperparameters

The following hyperparameters were used during training:
- learning_rate: 3e-05
- train_batch_size: 2
- eval_batch_size: 8
- seed: 42
- distributed_type: multi-GPU
- num_devices: 8
- gradient_accumulation_steps: 2
- total_train_batch_size: 32
- total_eval_batch_size: 64
- optimizer: adamw_torch with betas=(0.9, 0.95) and epsilon=1e-08; no additional optimizer arguments
- lr_scheduler_type: cosine
- lr_scheduler_warmup_ratio: 0.05
- num_epochs: 3.0

### Training results

### Framework versions

- PEFT 0.15.2
- Transformers 4.52.4
- Pytorch 2.7.0+cu126
- Datasets 3.6.0
- Tokenizers 0.21.2
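A minimal loading sketch using the framework versions listed above; the local folder name `2_128_e3_3e-5` is an assumption taken from the paths in this commit:

```python
# Sketch: attach the LoRA adapter from this commit to the Granite base model.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base = AutoModelForCausalLM.from_pretrained(
    "ibm-granite/granite-3.3-8b-base", torch_dtype=torch.bfloat16
)
model = PeftModel.from_pretrained(base, "2_128_e3_3e-5")    # reads adapter_config.json + adapter_model.safetensors
tokenizer = AutoTokenizer.from_pretrained("2_128_e3_3e-5")  # tokenizer files shipped in this folder
model = model.merge_and_unload()  # optional: fold the adapter into the base weights for inference
```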
2_128_e3_3e-5/adapter_config.json
ADDED
@@ -0,0 +1,39 @@
{
  "alpha_pattern": {},
  "auto_mapping": null,
  "base_model_name_or_path": "ibm-granite/granite-3.3-8b-base",
  "bias": "none",
  "corda_config": null,
  "eva_config": null,
  "exclude_modules": null,
  "fan_in_fan_out": false,
  "inference_mode": true,
  "init_lora_weights": true,
  "layer_replication": null,
  "layers_pattern": null,
  "layers_to_transform": null,
  "loftq_config": {},
  "lora_alpha": 256,
  "lora_bias": false,
  "lora_dropout": 0.05,
  "megatron_config": null,
  "megatron_core": "megatron.core",
  "modules_to_save": null,
  "peft_type": "LORA",
  "r": 128,
  "rank_pattern": {},
  "revision": null,
  "target_modules": [
    "down_proj",
    "q_proj",
    "k_proj",
    "up_proj",
    "o_proj",
    "gate_proj",
    "v_proj"
  ],
  "task_type": "CAUSAL_LM",
  "trainable_token_indices": null,
  "use_dora": false,
  "use_rslora": false
}
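Since `use_rslora` is false, PEFT applies the standard LoRA scaling of `lora_alpha / r` = 256 / 128 = 2 to each low-rank update. A sketch of the same configuration expressed through `peft.LoraConfig` (values copied from the JSON above):

```python
from peft import LoraConfig

cfg = LoraConfig(
    r=128,
    lora_alpha=256,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)
print(cfg.lora_alpha / cfg.r)  # 2.0 -- the scaling applied to each B @ A delta
```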
2_128_e3_3e-5/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:36a9be3fc167d2d75e862a646171219a7e3a3263aba99eb4510d8ff9b4079a4c
size 791751704
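The 791,751,704-byte LFS payload is roughly what rank-128 bf16 adapters on all seven projections of this 40-layer model should weigh. A back-of-the-envelope sketch, with dimensions taken from `config.json` below; the 1024 key/value width is an inference from 8 KV heads x 128 head_dim, not stated in the configs:

```python
# Rough size check for adapter_model.safetensors (2 bytes/param in bf16).
hidden, inter, layers, r = 4096, 12800, 40, 128
kv = 1024  # assumed: 8 key/value heads * 128 head_dim
shapes = [  # (in_features, out_features) per LoRA target module
    (hidden, hidden), (hidden, hidden),  # q_proj, o_proj
    (hidden, kv), (hidden, kv),          # k_proj, v_proj
    (hidden, inter), (hidden, inter),    # gate_proj, up_proj
    (inter, hidden),                     # down_proj
]
params = layers * sum(r * (i + o) for i, o in shapes)
print(params, params * 2)  # ~395.8M params -> ~791.7 MB; the small remainder is the safetensors header
```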
2_128_e3_3e-5/all_results.json
ADDED
@@ -0,0 +1,9 @@
{
  "epoch": 3.0,
  "total_flos": 1.482127184633004e+18,
  "train_loss": 0.5478373010360187,
  "train_runtime": 672.1067,
  "train_samples": 10710,
  "train_samples_per_second": 47.805,
  "train_steps_per_second": 1.495
}
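These numbers are internally consistent with the README hyperparameters: 10,710 samples at an effective batch of 32 give 335 optimizer steps per epoch, hence 1,005 over 3 epochs (the `global_step` in `trainer_state.json` below). A quick check:

```python
import math

samples, epochs = 10710, 3
eff_batch = 2 * 2 * 8  # per-device batch * grad accumulation * GPUs
steps = epochs * math.ceil(samples / eff_batch)
print(steps)                                  # 1005
print(round(steps / 672.1067, 3))             # 1.495 steps/s, as reported
print(round(samples * epochs / 672.1067, 3))  # 47.805 samples/s, as reported
```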
2_128_e3_3e-5/config.json
ADDED
@@ -0,0 +1,32 @@
{
  "architectures": [
    "GraniteForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0,
  "attention_multiplier": 0.0078125,
  "bos_token_id": 0,
  "embedding_multiplier": 12.0,
  "eos_token_id": 0,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 12800,
  "logits_scaling": 16.0,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "granite",
  "num_attention_heads": 32,
  "num_hidden_layers": 40,
  "num_key_value_heads": 8,
  "pad_token_id": 0,
  "residual_multiplier": 0.22,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.52.4",
  "use_cache": true,
  "vocab_size": 49152
}
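Granite configs carry explicit multipliers rather than the implicit transformer defaults; note that `attention_multiplier` here works out to 1/head_dim (4096 hidden / 32 heads = 128), not the usual 1/sqrt(head_dim). A one-line check:

```python
hidden_size, num_heads = 4096, 32
head_dim = hidden_size // num_heads  # 128
print(1 / head_dim)                  # 0.0078125, the attention_multiplier above
```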
2_128_e3_3e-5/merges.txt
ADDED
The diff for this file is too large to render.
2_128_e3_3e-5/special_tokens_map.json
ADDED
@@ -0,0 +1,45 @@
{
  "additional_special_tokens": [
    "<|endoftext|>",
    "<fim_prefix>",
    "<fim_middle>",
    "<fim_suffix>",
    "<fim_pad>",
    "<filename>",
    "<gh_stars>",
    "<issue_start>",
    "<issue_comment>",
    "<issue_closed>",
    "<jupyter_start>",
    "<jupyter_text>",
    "<jupyter_code>",
    "<jupyter_output>",
    "<empty_output>",
    "<commit_before>",
    "<commit_msg>",
    "<commit_after>",
    "<reponame>"
  ],
  "bos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "<reponame>",
  "unk_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
2_128_e3_3e-5/tokenizer.json
ADDED
The diff for this file is too large to render.
2_128_e3_3e-5/tokenizer_config.json
ADDED
@@ -0,0 +1,188 @@
{
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "0": {"content": "<|endoftext|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "1": {"content": "<fim_prefix>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "2": {"content": "<fim_middle>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "3": {"content": "<fim_suffix>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "4": {"content": "<fim_pad>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "5": {"content": "<filename>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "6": {"content": "<gh_stars>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "7": {"content": "<issue_start>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "8": {"content": "<issue_comment>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "9": {"content": "<issue_closed>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "10": {"content": "<jupyter_start>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "11": {"content": "<jupyter_text>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "12": {"content": "<jupyter_code>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "13": {"content": "<jupyter_output>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "14": {"content": "<empty_output>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "15": {"content": "<commit_before>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "16": {"content": "<commit_msg>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "17": {"content": "<commit_after>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "18": {"content": "<reponame>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}
  },
  "additional_special_tokens": [
    "<|endoftext|>", "<fim_prefix>", "<fim_middle>", "<fim_suffix>", "<fim_pad>",
    "<filename>", "<gh_stars>", "<issue_start>", "<issue_comment>", "<issue_closed>",
    "<jupyter_start>", "<jupyter_text>", "<jupyter_code>", "<jupyter_output>", "<empty_output>",
    "<commit_before>", "<commit_msg>", "<commit_after>", "<reponame>"
  ],
  "bos_token": "<|endoftext|>",
  "clean_up_tokenization_spaces": true,
  "eos_token": "<|endoftext|>",
  "extra_special_tokens": {},
  "model_max_length": 8192,
  "pad_token": "<reponame>",
  "padding_side": "left",
  "tokenizer_class": "GPT2Tokenizer",
  "unk_token": "<|endoftext|>",
  "vocab_size": 49152
}
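Two details worth noting: the pad token is repurposed from `<reponame>` (id 18) and `padding_side` is `left`, which is what batched generation with a causal LM generally wants. A quick sketch, again assuming the local `2_128_e3_3e-5` folder:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("2_128_e3_3e-5")
print(tok.pad_token, tok.padding_side)  # <reponame> left
batch = tok(["hi", "a longer prompt"], padding=True, return_tensors="pt")
print(batch.input_ids[0])               # pad ids (18) are prepended, not appended
```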
2_128_e3_3e-5/train_results.json
ADDED
@@ -0,0 +1,9 @@
{
  "epoch": 3.0,
  "total_flos": 1.482127184633004e+18,
  "train_loss": 0.5478373010360187,
  "train_runtime": 672.1067,
  "train_samples": 10710,
  "train_samples_per_second": 47.805,
  "train_steps_per_second": 1.495
}
2_128_e3_3e-5/trainer_state.json
ADDED
@@ -0,0 +1,1450 @@
{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 1005,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.014925373134328358, "grad_norm": 1.1936415433883667, "learning_rate": 2.3529411764705885e-06, "loss": 1.3178, "step": 5},
    {"epoch": 0.029850746268656716, "grad_norm": 0.8011311292648315, "learning_rate": 5.294117647058824e-06, "loss": 1.299, "step": 10},
    {"epoch": 0.04477611940298507, "grad_norm": 0.5648296475410461, "learning_rate": 8.23529411764706e-06, "loss": 1.2979, "step": 15},
    {"epoch": 0.05970149253731343, "grad_norm": 1.0879466533660889, "learning_rate": 1.1176470588235295e-05, "loss": 1.2498, "step": 20},
    {"epoch": 0.07462686567164178, "grad_norm": 0.5851609110832214, "learning_rate": 1.411764705882353e-05, "loss": 1.2138, "step": 25},
    {"epoch": 0.08955223880597014, "grad_norm": 0.5263943076133728, "learning_rate": 1.7058823529411763e-05, "loss": 1.1806, "step": 30},
    {"epoch": 0.1044776119402985, "grad_norm": 0.37341439723968506, "learning_rate": 1.9999999999999998e-05, "loss": 1.2425, "step": 35},
    {"epoch": 0.11940298507462686, "grad_norm": 0.4495214521884918, "learning_rate": 2.2941176470588233e-05, "loss": 1.2472, "step": 40},
    {"epoch": 0.13432835820895522, "grad_norm": 0.5549694299697876, "learning_rate": 2.5882352941176472e-05, "loss": 1.1519, "step": 45},
    {"epoch": 0.14925373134328357, "grad_norm": 0.4199811816215515, "learning_rate": 2.8823529411764707e-05, "loss": 1.185, "step": 50},
    {"epoch": 0.16417910447761194, "grad_norm": 0.42709535360336304, "learning_rate": 2.9999268013221688e-05, "loss": 1.1315, "step": 55},
    {"epoch": 0.1791044776119403, "grad_norm": 0.5405896306037903, "learning_rate": 2.9994795019399927e-05, "loss": 1.1239, "step": 60},
    {"epoch": 0.19402985074626866, "grad_norm": 0.43888697028160095, "learning_rate": 2.9986256902246587e-05, "loss": 1.0712, "step": 65},
    {"epoch": 0.208955223880597, "grad_norm": 0.48112964630126953, "learning_rate": 2.9973655976464824e-05, "loss": 1.0667, "step": 70},
    {"epoch": 0.22388059701492538, "grad_norm": 0.5392914414405823, "learning_rate": 2.995699565819452e-05, "loss": 1.1046, "step": 75},
    {"epoch": 0.23880597014925373, "grad_norm": 0.51181560754776, "learning_rate": 2.993628046408618e-05, "loss": 1.1095, "step": 80},
    {"epoch": 0.2537313432835821, "grad_norm": 0.448114275932312, "learning_rate": 2.991151601007646e-05, "loss": 1.1248, "step": 85},
    {"epoch": 0.26865671641791045, "grad_norm": 0.5458486676216125, "learning_rate": 2.9882709009865653e-05, "loss": 1.0544, "step": 90},
    {"epoch": 0.2835820895522388, "grad_norm": 0.5127103328704834, "learning_rate": 2.98498672730976e-05, "loss": 1.0849, "step": 95},
    {"epoch": 0.29850746268656714, "grad_norm": 0.5187199711799622, "learning_rate": 2.9812999703242502e-05, "loss": 1.0488, "step": 100},
    {"epoch": 0.31343283582089554, "grad_norm": 0.5600414276123047, "learning_rate": 2.977211629518312e-05, "loss": 1.0317, "step": 105},
    {"epoch": 0.3283582089552239, "grad_norm": 0.5698317885398865, "learning_rate": 2.9727228132505178e-05, "loss": 0.9852, "step": 110},
    {"epoch": 0.34328358208955223, "grad_norm": 0.5142781138420105, "learning_rate": 2.967834738449256e-05, "loss": 1.0272, "step": 115},
    {"epoch": 0.3582089552238806, "grad_norm": 0.5430530905723572, "learning_rate": 2.9625487302828198e-05, "loss": 0.994, "step": 120},
    {"epoch": 0.373134328358209, "grad_norm": 0.668404221534729, "learning_rate": 2.956866221800151e-05, "loss": 1.0267, "step": 125},
    {"epoch": 0.3880597014925373, "grad_norm": 0.6017296314239502, "learning_rate": 2.9507887535423357e-05, "loss": 0.9794, "step": 130},
    {"epoch": 0.40298507462686567, "grad_norm": 0.8586568832397461, "learning_rate": 2.944317973124962e-05, "loss": 0.9733, "step": 135},
    {"epoch": 0.417910447761194, "grad_norm": 0.6064942479133606, "learning_rate": 2.937455634791447e-05, "loss": 0.9092, "step": 140},
    {"epoch": 0.43283582089552236, "grad_norm": 0.6258195638656616, "learning_rate": 2.9302035989374565e-05, "loss": 0.9095, "step": 145},
    {"epoch": 0.44776119402985076, "grad_norm": 0.6791467070579529, "learning_rate": 2.9225638316065483e-05, "loss": 0.9329, "step": 150},
    {"epoch": 0.4626865671641791, "grad_norm": 0.7371143698692322, "learning_rate": 2.9145384039571743e-05, "loss": 0.9231, "step": 155},
    {"epoch": 0.47761194029850745, "grad_norm": 0.7199986577033997, "learning_rate": 2.9061294917011817e-05, "loss": 0.8616, "step": 160},
    {"epoch": 0.4925373134328358, "grad_norm": 0.7791478037834167, "learning_rate": 2.897339374513975e-05, "loss": 0.884, "step": 165},
    {"epoch": 0.5074626865671642, "grad_norm": 0.8169253468513489, "learning_rate": 2.888170435416491e-05, "loss": 0.9079, "step": 170},
    {"epoch": 0.5223880597014925, "grad_norm": 0.6654359698295593, "learning_rate": 2.878625160129155e-05, "loss": 0.8549, "step": 175},
    {"epoch": 0.5373134328358209, "grad_norm": 0.8726022839546204, "learning_rate": 2.8687061363979963e-05, "loss": 0.8601, "step": 180},
    {"epoch": 0.5522388059701493, "grad_norm": 0.77958744764328, "learning_rate": 2.858416053293105e-05, "loss": 0.8176, "step": 185},
    {"epoch": 0.5671641791044776, "grad_norm": 0.7674434781074524, "learning_rate": 2.84775770047962e-05, "loss": 0.8029, "step": 190},
    {"epoch": 0.582089552238806, "grad_norm": 0.7162519693374634, "learning_rate": 2.8367339674614402e-05, "loss": 0.8761, "step": 195},
    {"epoch": 0.5970149253731343, "grad_norm": 0.8640219569206238, "learning_rate": 2.825347842797879e-05, "loss": 0.8374, "step": 200},
    {"epoch": 0.6119402985074627, "grad_norm": 0.7389693260192871, "learning_rate": 2.8136024132934552e-05, "loss": 0.804, "step": 205},
    {"epoch": 0.6268656716417911, "grad_norm": 0.974527895450592, "learning_rate": 2.8015008631610545e-05, "loss": 0.8253, "step": 210},
    {"epoch": 0.6417910447761194, "grad_norm": 0.7666051983833313, "learning_rate": 2.789046473158682e-05, "loss": 0.8327, "step": 215},
    {"epoch": 0.6567164179104478, "grad_norm": 0.8711564540863037, "learning_rate": 2.7762426197000404e-05, "loss": 0.7511, "step": 220},
    {"epoch": 0.6716417910447762, "grad_norm": 0.8236544728279114, "learning_rate": 2.763092773939177e-05, "loss": 0.8551, "step": 225},
    {"epoch": 0.6865671641791045, "grad_norm": 0.8237460851669312, "learning_rate": 2.749600500829448e-05, "loss": 0.7853, "step": 230},
    {"epoch": 0.7014925373134329, "grad_norm": 0.9001902937889099, "learning_rate": 2.7357694581570475e-05, "loss": 0.7611, "step": 235},
    {"epoch": 0.7164179104477612, "grad_norm": 0.8697739839553833, "learning_rate": 2.7216033955493756e-05, "loss": 0.7656, "step": 240},
    {"epoch": 0.7313432835820896, "grad_norm": 0.7716524600982666, "learning_rate": 2.7071061534585064e-05, "loss": 0.7567, "step": 245},
    {"epoch": 0.746268656716418, "grad_norm": 0.8154096007347107, "learning_rate": 2.6922816621200302e-05, "loss": 0.7274, "step": 250},
    {"epoch": 0.7611940298507462, "grad_norm": 0.8069023489952087, "learning_rate": 2.6771339404875602e-05, "loss": 0.7115, "step": 255},
    {"epoch": 0.7761194029850746, "grad_norm": 1.0328160524368286, "learning_rate": 2.6616670951431842e-05, "loss": 0.7695, "step": 260},
    {"epoch": 0.7910447761194029, "grad_norm": 0.8895705938339233, "learning_rate": 2.645885319184159e-05, "loss": 0.717, "step": 265},
    {"epoch": 0.8059701492537313, "grad_norm": 0.7964750528335571, "learning_rate": 2.6297928910861546e-05, "loss": 0.7042, "step": 270},
    {"epoch": 0.8208955223880597, "grad_norm": 0.8405295014381409, "learning_rate": 2.6133941735433496e-05, "loss": 0.6939, "step": 275},
    {"epoch": 0.835820895522388, "grad_norm": 1.0508198738098145, "learning_rate": 2.596693612285691e-05, "loss": 0.7287, "step": 280},
    {"epoch": 0.8507462686567164, "grad_norm": 0.9012879133224487, "learning_rate": 2.5796957348736522e-05, "loss": 0.6646, "step": 285},
    {"epoch": 0.8656716417910447, "grad_norm": 0.990813672542572, "learning_rate": 2.5624051494707967e-05, "loss": 0.6691, "step": 290},
    {"epoch": 0.8805970149253731, "grad_norm": 0.8366561532020569, "learning_rate": 2.5448265435944957e-05, "loss": 0.6826, "step": 295},
    {"epoch": 0.8955223880597015, "grad_norm": 0.8978521823883057, "learning_rate": 2.5269646828451323e-05, "loss": 0.6714, "step": 300},
    {"epoch": 0.9104477611940298, "grad_norm": 0.9813509583473206, "learning_rate": 2.5088244096141355e-05, "loss": 0.6563, "step": 305},
    {"epoch": 0.9253731343283582, "grad_norm": 0.9724867939949036, "learning_rate": 2.490410641771196e-05, "loss": 0.7257, "step": 310},
    {"epoch": 0.9402985074626866, "grad_norm": 1.1510107517242432, "learning_rate": 2.4717283713310224e-05, "loss": 0.6253, "step": 315},
    {"epoch": 0.9552238805970149, "grad_norm": 1.104021668434143, "learning_rate": 2.4527826630999922e-05, "loss": 0.6789, "step": 320},
    {"epoch": 0.9701492537313433, "grad_norm": 1.060115098953247, "learning_rate": 2.4335786533030736e-05, "loss": 0.6366, "step": 325},
    {"epoch": 0.9850746268656716, "grad_norm": 1.024133563041687, "learning_rate": 2.414121548191381e-05, "loss": 0.6211, "step": 330},
    {"epoch": 1.0, "grad_norm": 0.941692054271698, "learning_rate": 2.39441662263075e-05, "loss": 0.6087, "step": 335},
    {"epoch": 1.0149253731343284, "grad_norm": 1.182281494140625, "learning_rate": 2.374469218671708e-05, "loss": 0.6062, "step": 340},
    {"epoch": 1.0298507462686568, "grad_norm": 1.2160669565200806, "learning_rate": 2.3542847441012325e-05, "loss": 0.5772, "step": 345},
    {"epoch": 1.044776119402985, "grad_norm": 1.051630973815918, "learning_rate": 2.333868670976688e-05, "loss": 0.558, "step": 350},
    {"epoch": 1.0597014925373134, "grad_norm": 1.1331043243408203, "learning_rate": 2.3132265341423382e-05, "loss": 0.5661, "step": 355},
    {"epoch": 1.0746268656716418, "grad_norm": 1.0946743488311768, "learning_rate": 2.292363929728836e-05, "loss": 0.546, "step": 360},
    {"epoch": 1.0895522388059702, "grad_norm": 1.2076295614242554, "learning_rate": 2.2712865136361037e-05, "loss": 0.5187, "step": 365},
    {"epoch": 1.1044776119402986, "grad_norm": 1.2114005088806152, "learning_rate": 2.25e-05, "loss": 0.5682, "step": 370},
    {"epoch": 1.1194029850746268, "grad_norm": 0.9777150750160217, "learning_rate": 2.2285101596432084e-05, "loss": 0.5737, "step": 375},
    {"epoch": 1.1343283582089552, "grad_norm": 1.0940109491348267, "learning_rate": 2.2068228185107524e-05, "loss": 0.5084, "step": 380},
    {"epoch": 1.1492537313432836, "grad_norm": 1.0106772184371948, "learning_rate": 2.1849438560905697e-05, "loss": 0.5529, "step": 385},
    {"epoch": 1.164179104477612, "grad_norm": 0.9576737880706787, "learning_rate": 2.162879203819568e-05, "loss": 0.4859, "step": 390},
    {"epoch": 1.1791044776119404, "grad_norm": 1.06365168094635, "learning_rate": 2.1406348434755994e-05, "loss": 0.5127, "step": 395},
    {"epoch": 1.1940298507462686, "grad_norm": 1.0055838823318481, "learning_rate": 2.118216805555788e-05, "loss": 0.4942, "step": 400},
    {"epoch": 1.208955223880597, "grad_norm": 1.1863685846328735, "learning_rate": 2.0956311676416464e-05, "loss": 0.5223, "step": 405},
    {"epoch": 1.2238805970149254, "grad_norm": 1.1252774000167847, "learning_rate": 2.0728840527514294e-05, "loss": 0.4785, "step": 410},
    {"epoch": 1.2388059701492538, "grad_norm": 1.134757399559021, "learning_rate": 2.0499816276801724e-05, "loss": 0.5052, "step": 415},
    {"epoch": 1.2537313432835822, "grad_norm": 1.0209614038467407, "learning_rate": 2.0269301013278555e-05, "loss": 0.5208, "step": 420},
    {"epoch": 1.2686567164179103, "grad_norm": 1.060960292816162, "learning_rate": 2.0037357230161587e-05, "loss": 0.4837, "step": 425},
    {"epoch": 1.2835820895522387, "grad_norm": 1.0340241193771362, "learning_rate": 1.9804047807942564e-05, "loss": 0.4973, "step": 430},
    {"epoch": 1.2985074626865671, "grad_norm": 1.2472810745239258, "learning_rate": 1.956943599734112e-05, "loss": 0.4962, "step": 435},
    {"epoch": 1.3134328358208955, "grad_norm": 1.099755883216858, "learning_rate": 1.9333585402157365e-05, "loss": 0.5169, "step": 440},
    {"epoch": 1.328358208955224, "grad_norm": 1.2505892515182495, "learning_rate": 1.9096559962028746e-05, "loss": 0.4608, "step": 445},
    {"epoch": 1.3432835820895521, "grad_norm": 1.0441431999206543, "learning_rate": 1.88584239350959e-05, "loss": 0.4526, "step": 450},
    {"epoch": 1.3582089552238805, "grad_norm": 1.1357834339141846, "learning_rate": 1.861924188058205e-05, "loss": 0.4598, "step": 455},
    {"epoch": 1.373134328358209, "grad_norm": 1.0953128337860107, "learning_rate": 1.8379078641290923e-05, "loss": 0.4761, "step": 460},
    {"epoch": 1.3880597014925373, "grad_norm": 1.1454157829284668, "learning_rate": 1.8137999326027696e-05, "loss": 0.4476, "step": 465},
    {"epoch": 1.4029850746268657, "grad_norm": 1.136718988418579, "learning_rate": 1.7896069291947827e-05, "loss": 0.4767, "step": 470},
    {"epoch": 1.417910447761194, "grad_norm": 1.2287747859954834, "learning_rate": 1.7653354126838593e-05, "loss": 0.4895, "step": 475},
    {"epoch": 1.4328358208955223, "grad_norm": 1.2421404123306274, "learning_rate": 1.7409919631338124e-05, "loss": 0.4497, "step": 480},
    {"epoch": 1.4477611940298507, "grad_norm": 1.2321255207061768, "learning_rate": 1.7165831801096635e-05, "loss": 0.4562, "step": 485},
    {"epoch": 1.462686567164179, "grad_norm": 1.145817518234253, "learning_rate": 1.6921156808884904e-05, "loss": 0.4579, "step": 490},
    {"epoch": 1.4776119402985075, "grad_norm": 1.1590453386306763, "learning_rate": 1.6675960986654675e-05, "loss": 0.4286, "step": 495},
    {"epoch": 1.4925373134328357, "grad_norm": 1.1244103908538818, "learning_rate": 1.6430310807555884e-05, "loss": 0.4615, "step": 500},
    {"epoch": 1.5074626865671643, "grad_norm": 1.3445566892623901, "learning_rate": 1.618427286791568e-05, "loss": 0.4497, "step": 505},
    {"epoch": 1.5223880597014925, "grad_norm": 1.043714165687561, "learning_rate": 1.593791386918396e-05, "loss": 0.4193, "step": 510},
    {"epoch": 1.537313432835821, "grad_norm": 1.227029800415039, "learning_rate": 1.5691300599850495e-05, "loss": 0.4536, "step": 515},
    {"epoch": 1.5522388059701493, "grad_norm": 1.1881043910980225, "learning_rate": 1.5444499917338398e-05, "loss": 0.4398, "step": 520},
    {"epoch": 1.5671641791044775, "grad_norm": 1.1590487957000732, "learning_rate": 1.5197578729878915e-05, "loss": 0.4193, "step": 525},
    {"epoch": 1.582089552238806, "grad_norm": 1.143646240234375, "learning_rate": 1.4950603978372467e-05, "loss": 0.4248, "step": 530},
    {"epoch": 1.5970149253731343, "grad_norm": 1.2340905666351318, "learning_rate": 1.4703642618240806e-05, "loss": 0.4214, "step": 535},
    {"epoch": 1.6119402985074627, "grad_norm": 1.1602320671081543, "learning_rate": 1.4456761601275254e-05, "loss": 0.4379, "step": 540},
    {"epoch": 1.626865671641791, "grad_norm": 1.1080914735794067, "learning_rate": 1.4210027857485932e-05, "loss": 0.411, "step": 545},
    {"epoch": 1.6417910447761193, "grad_norm": 1.2245306968688965, "learning_rate": 1.3963508276956832e-05, "loss": 0.3917, "step": 550},
    {"epoch": 1.6567164179104479, "grad_norm": 1.128736138343811, "learning_rate": 1.371726969171182e-05, "loss": 0.4208, "step": 555},
    {"epoch": 1.671641791044776, "grad_norm": 1.1787062883377075, "learning_rate": 1.34713788575963e-05, "loss": 0.4566, "step": 560},
    {"epoch": 1.6865671641791045, "grad_norm": 1.0624361038208008, "learning_rate": 1.3225902436179515e-05, "loss": 0.3836, "step": 565},
    {"epoch": 1.7014925373134329, "grad_norm": 1.165006399154663, "learning_rate": 1.2980906976682508e-05, "loss": 0.4087, "step": 570},
    {"epoch": 1.716417910447761, "grad_norm": 1.2284427881240845, "learning_rate": 1.2736458897936432e-05, "loss": 0.4, "step": 575},
    {"epoch": 1.7313432835820897, "grad_norm": 1.223211646080017, "learning_rate": 1.2492624470376253e-05, "loss": 0.4149, "step": 580},
    {"epoch": 1.7462686567164178, "grad_norm": 1.2130999565124512, "learning_rate": 1.22494697980747e-05, "loss": 0.3921, "step": 585},
    {"epoch": 1.7611940298507462, "grad_norm": 1.1766337156295776, "learning_rate": 1.20070608008213e-05, "loss": 0.4109, "step": 590},
    {"epoch": 1.7761194029850746, "grad_norm": 1.3741768598556519, "learning_rate": 1.1765463196251349e-05, "loss": 0.4134, "step": 595},
    {"epoch": 1.7910447761194028, "grad_norm": 1.1917859315872192, "learning_rate": 1.1524742482029728e-05, "loss": 0.4252, "step": 600},
    {"epoch": 1.8059701492537314, "grad_norm": 1.2797183990478516, "learning_rate": 1.1284963918094346e-05, "loss": 0.3733, "step": 605},
    {"epoch": 1.8208955223880596, "grad_norm": 1.1159847974777222, "learning_rate": 1.104619250896399e-05, "loss": 0.3997, "step": 610},
    {"epoch": 1.835820895522388, "grad_norm": 1.3664534091949463, "learning_rate": 1.0808492986115476e-05, "loss": 0.3821, "step": 615},
    {"epoch": 1.8507462686567164, "grad_norm": 1.210760235786438, "learning_rate": 1.0571929790434792e-05, "loss": 0.4107, "step": 620},
    {"epoch": 1.8656716417910446, "grad_norm": 1.217391848564148, "learning_rate": 1.0336567054747033e-05, "loss": 0.4017, "step": 625},
    {"epoch": 1.8805970149253732, "grad_norm": 1.246115803718567, "learning_rate": 1.0102468586429808e-05, "loss": 0.3738, "step": 630},
    {"epoch": 1.8955223880597014, "grad_norm": 1.0801810026168823, "learning_rate": 9.86969785011497e-06, "loss": 0.3889, "step": 635},
    {"epoch": 1.9104477611940298, "grad_norm": 1.1230485439300537, "learning_rate": 9.638317950483167e-06, "loss": 0.4431, "step": 640},
    {"epoch": 1.9253731343283582, "grad_norm": 1.2072858810424805, "learning_rate": 9.408391615156023e-06, "loss": 0.3637, "step": 645},
    {"epoch": 1.9402985074626866, "grad_norm": 1.1373963356018066, "learning_rate": 9.179981177690566e-06, "loss": 0.3738, "step": 650},
    {"epoch": 1.955223880597015, "grad_norm": 1.2958288192749023, "learning_rate": 8.953148560680419e-06, "loss": 0.3544, "step": 655},
    {"epoch": 1.9701492537313432, "grad_norm": 1.0563827753067017, "learning_rate": 8.727955258968462e-06, "loss": 0.3481, "step": 660},
    {"epoch": 1.9850746268656716, "grad_norm": 1.2998480796813965, "learning_rate": 8.504462322975442e-06, "loss": 0.3665, "step": 665},
    {"epoch": 2.0, "grad_norm": 1.3147248029708862, "learning_rate": 8.282730342149059e-06, "loss": 0.3383, "step": 670},
    {"epoch": 2.014925373134328, "grad_norm": 1.1079124212265015, "learning_rate": 8.062819428538009e-06, "loss": 0.2936, "step": 675},
    {"epoch": 2.029850746268657, "grad_norm": 1.253826379776001, "learning_rate": 7.844789200495517e-06, "loss": 0.297, "step": 680},
    {"epoch": 2.044776119402985, "grad_norm": 1.0365105867385864, "learning_rate": 7.628698766516625e-06, "loss": 0.3391, "step": 685},
    {"epoch": 2.0597014925373136, "grad_norm": 1.523829698562622, "learning_rate": 7.414606709213735e-06, "loss": 0.2841, "step": 690},
    {"epoch": 2.074626865671642, "grad_norm": 1.3061442375183105, "learning_rate": 7.202571069434772e-06, "loss": 0.2874, "step": 695},
    {"epoch": 2.08955223880597, "grad_norm": 1.1913034915924072, "learning_rate": 6.992649330528146e-06, "loss": 0.3118, "step": 700},
    {"epoch": 2.1044776119402986, "grad_norm": 1.2372983694076538, "learning_rate": 6.78489840275887e-06, "loss": 0.3048, "step": 705},
    {"epoch": 2.1194029850746268, "grad_norm": 1.207451581954956, "learning_rate": 6.579374607880116e-06, "loss": 0.3184, "step": 710},
    {"epoch": 2.1343283582089554, "grad_norm": 1.1948586702346802, "learning_rate": 6.376133663864196e-06, "loss": 0.2921, "step": 715},
    {"epoch": 2.1492537313432836, "grad_norm": 1.109611988067627, "learning_rate": 6.175230669797306e-06, "loss": 0.2919, "step": 720},
    {"epoch": 2.1641791044776117, "grad_norm": 1.3636291027069092, "learning_rate": 5.976720090942066e-06, "loss": 0.2716, "step": 725},
    {"epoch": 2.1791044776119404, "grad_norm": 1.219353199005127, "learning_rate": 5.780655743971844e-06, "loss": 0.3033, "step": 730},
    {"epoch": 2.1940298507462686, "grad_norm": 1.2699675559997559, "learning_rate": 5.587090782380912e-06, "loss": 0.3124, "step": 735},
    {"epoch": 2.208955223880597, "grad_norm": 1.3106281757354736, "learning_rate": 5.3960776820744415e-06, "loss": 0.254, "step": 740},
    {"epoch": 2.2238805970149254, "grad_norm": 1.1213213205337524, "learning_rate": 5.207668227142178e-06, "loss": 0.2919, "step": 745},
    {"epoch": 2.2388059701492535, "grad_norm": 1.3240259885787964, "learning_rate": 5.021913495819593e-06, "loss": 0.2833, "step": 750},
    {"epoch": 2.253731343283582, "grad_norm": 1.1573519706726074, "learning_rate": 4.838863846640524e-06, "loss": 0.3073, "step": 755},
    {"epoch": 2.2686567164179103, "grad_norm": 1.3405065536499023, "learning_rate": 4.6585689047848264e-06, "loss": 0.2861, "step": 760},
    {"epoch": 2.283582089552239, "grad_norm": 1.0261584520339966, "learning_rate": 4.481077548624871e-06, "loss": 0.2648, "step": 765},
    {"epoch": 2.298507462686567, "grad_norm": 1.32846999168396, "learning_rate": 4.306437896474523e-06, "loss": 0.2879, "step": 770},
    {"epoch": 2.3134328358208958, "grad_norm": 1.1488946676254272, "learning_rate": 4.134697293544158e-06, "loss": 0.2564, "step": 775},
    {"epoch": 2.328358208955224, "grad_norm": 1.0947158336639404, "learning_rate": 3.965902299105245e-06, "loss": 0.2802, "step": 780},
    {"epoch": 2.343283582089552, "grad_norm": 1.0991296768188477, "learning_rate": 3.8000986738680245e-06, "loss": 0.2903, "step": 785},
    {"epoch": 2.3582089552238807, "grad_norm": 1.1161385774612427, "learning_rate": 3.637331367575698e-06, "loss": 0.2712, "step": 790},
    {"epoch": 2.373134328358209, "grad_norm": 1.5450501441955566, "learning_rate": 3.4776445068184365e-06, "loss": 0.2758, "step": 795},
    {"epoch": 2.388059701492537, "grad_norm": 1.128070592880249, "learning_rate": 3.32108138307054e-06, "loss": 0.2778, "step": 800},
    {"epoch": 2.4029850746268657, "grad_norm": 1.107636570930481, "learning_rate": 3.1676844409540607e-06, "loss": 0.2878, "step": 805},
    {"epoch": 2.417910447761194, "grad_norm": 1.1108747720718384, "learning_rate": 3.017495266731942e-06, "loss": 0.2509, "step": 810},
    {"epoch": 2.4328358208955225, "grad_norm": 1.2078962326049805, "learning_rate": 2.8705545770338758e-06, "loss": 0.295, "step": 815},
    {"epoch": 2.4477611940298507, "grad_norm": 1.092396855354309, "learning_rate": 2.7269022078179638e-06, "loss": 0.2683, "step": 820},
    {"epoch": 2.4626865671641793,
|
| 1162 |
+
"grad_norm": 1.1813055276870728,
|
| 1163 |
+
"learning_rate": 2.5865771035710777e-06,
|
| 1164 |
+
"loss": 0.2995,
|
| 1165 |
+
"step": 825
|
| 1166 |
+
},
|
| 1167 |
+
{
|
| 1168 |
+
"epoch": 2.4776119402985075,
|
| 1169 |
+
"grad_norm": 1.1523305177688599,
|
| 1170 |
+
"learning_rate": 2.449617306750913e-06,
|
| 1171 |
+
"loss": 0.3208,
|
| 1172 |
+
"step": 830
|
| 1173 |
+
},
|
| 1174 |
+
{
|
| 1175 |
+
"epoch": 2.4925373134328357,
|
| 1176 |
+
"grad_norm": 1.1093213558197021,
|
| 1177 |
+
"learning_rate": 2.3160599474726073e-06,
|
| 1178 |
+
"loss": 0.2873,
|
| 1179 |
+
"step": 835
|
| 1180 |
+
},
|
| 1181 |
+
{
|
| 1182 |
+
"epoch": 2.5074626865671643,
|
| 1183 |
+
"grad_norm": 1.1179418563842773,
|
| 1184 |
+
"learning_rate": 2.1859412334426853e-06,
|
| 1185 |
+
"loss": 0.2613,
|
| 1186 |
+
"step": 840
|
| 1187 |
+
},
|
| 1188 |
+
{
|
| 1189 |
+
"epoch": 2.5223880597014925,
|
| 1190 |
+
"grad_norm": 1.1804648637771606,
|
| 1191 |
+
"learning_rate": 2.0592964401430377e-06,
|
| 1192 |
+
"loss": 0.2934,
|
| 1193 |
+
"step": 845
|
| 1194 |
+
},
|
| 1195 |
+
{
|
| 1196 |
+
"epoch": 2.5373134328358207,
|
| 1197 |
+
"grad_norm": 1.260128378868103,
|
| 1198 |
+
"learning_rate": 1.936159901267682e-06,
|
| 1199 |
+
"loss": 0.2912,
|
| 1200 |
+
"step": 850
|
| 1201 |
+
},
|
| 1202 |
+
{
|
| 1203 |
+
"epoch": 2.5522388059701493,
|
| 1204 |
+
"grad_norm": 1.1166480779647827,
|
| 1205 |
+
"learning_rate": 1.8165649994148203e-06,
|
| 1206 |
+
"loss": 0.2507,
|
| 1207 |
+
"step": 855
|
| 1208 |
+
},
|
| 1209 |
+
{
|
| 1210 |
+
"epoch": 2.5671641791044775,
|
| 1211 |
+
"grad_norm": 1.1106736660003662,
|
| 1212 |
+
"learning_rate": 1.7005441570367164e-06,
|
| 1213 |
+
"loss": 0.2609,
|
| 1214 |
+
"step": 860
|
| 1215 |
+
},
|
| 1216 |
+
{
|
| 1217 |
+
"epoch": 2.582089552238806,
|
| 1218 |
+
"grad_norm": 1.2218323945999146,
|
| 1219 |
+
"learning_rate": 1.5881288276499211e-06,
|
| 1220 |
+
"loss": 0.2887,
|
| 1221 |
+
"step": 865
|
| 1222 |
+
},
|
| 1223 |
+
{
|
| 1224 |
+
"epoch": 2.5970149253731343,
|
| 1225 |
+
"grad_norm": 1.1319966316223145,
|
| 1226 |
+
"learning_rate": 1.4793494873081504e-06,
|
| 1227 |
+
"loss": 0.2547,
|
| 1228 |
+
"step": 870
|
| 1229 |
+
},
|
| 1230 |
+
{
|
| 1231 |
+
"epoch": 2.611940298507463,
|
| 1232 |
+
"grad_norm": 1.1479063034057617,
|
| 1233 |
+
"learning_rate": 1.374235626340128e-06,
|
| 1234 |
+
"loss": 0.2594,
|
| 1235 |
+
"step": 875
|
| 1236 |
+
},
|
| 1237 |
+
{
|
| 1238 |
+
"epoch": 2.626865671641791,
|
| 1239 |
+
"grad_norm": 1.2150938510894775,
|
| 1240 |
+
"learning_rate": 1.2728157413547232e-06,
|
| 1241 |
+
"loss": 0.2899,
|
| 1242 |
+
"step": 880
|
| 1243 |
+
},
|
| 1244 |
+
{
|
| 1245 |
+
"epoch": 2.6417910447761193,
|
| 1246 |
+
"grad_norm": 1.1205209493637085,
|
| 1247 |
+
"learning_rate": 1.1751173275154403e-06,
|
| 1248 |
+
"loss": 0.2238,
|
| 1249 |
+
"step": 885
|
| 1250 |
+
},
|
| 1251 |
+
{
|
| 1252 |
+
"epoch": 2.656716417910448,
|
| 1253 |
+
"grad_norm": 1.1784629821777344,
|
| 1254 |
+
"learning_rate": 1.0811668710864098e-06,
|
| 1255 |
+
"loss": 0.2659,
|
| 1256 |
+
"step": 890
|
| 1257 |
+
},
|
| 1258 |
+
{
|
| 1259 |
+
"epoch": 2.671641791044776,
|
| 1260 |
+
"grad_norm": 1.2707682847976685,
|
| 1261 |
+
"learning_rate": 9.909898422519198e-07,
|
| 1262 |
+
"loss": 0.262,
|
| 1263 |
+
"step": 895
|
| 1264 |
+
},
|
| 1265 |
+
{
|
| 1266 |
+
"epoch": 2.6865671641791042,
|
| 1267 |
+
"grad_norm": 1.0494028329849243,
|
| 1268 |
+
"learning_rate": 9.046106882113753e-07,
|
| 1269 |
+
"loss": 0.2817,
|
| 1270 |
+
"step": 900
|
| 1271 |
+
},
|
| 1272 |
+
{
|
| 1273 |
+
"epoch": 2.701492537313433,
|
| 1274 |
+
"grad_norm": 1.4347975254058838,
|
| 1275 |
+
"learning_rate": 8.220528265516125e-07,
|
| 1276 |
+
"loss": 0.2685,
|
| 1277 |
+
"step": 905
|
| 1278 |
+
},
|
| 1279 |
+
{
|
| 1280 |
+
"epoch": 2.716417910447761,
|
| 1281 |
+
"grad_norm": 1.1479558944702148,
|
| 1282 |
+
"learning_rate": 7.433386388983343e-07,
|
| 1283 |
+
"loss": 0.2736,
|
| 1284 |
+
"step": 910
|
| 1285 |
+
},
|
| 1286 |
+
{
|
| 1287 |
+
"epoch": 2.7313432835820897,
|
| 1288 |
+
"grad_norm": 1.0413694381713867,
|
| 1289 |
+
"learning_rate": 6.684894648484069e-07,
|
| 1290 |
+
"loss": 0.2911,
|
| 1291 |
+
"step": 915
|
| 1292 |
+
},
|
| 1293 |
+
{
|
| 1294 |
+
"epoch": 2.746268656716418,
|
| 1295 |
+
"grad_norm": 1.1672275066375732,
|
| 1296 |
+
"learning_rate": 5.975255961846343e-07,
|
| 1297 |
+
"loss": 0.2262,
|
| 1298 |
+
"step": 920
|
| 1299 |
+
},
|
| 1300 |
+
{
|
| 1301 |
+
"epoch": 2.7611940298507465,
|
| 1302 |
+
"grad_norm": 1.1373767852783203,
|
| 1303 |
+
"learning_rate": 5.304662713746205e-07,
|
| 1304 |
+
"loss": 0.2714,
|
| 1305 |
+
"step": 925
|
| 1306 |
+
},
|
| 1307 |
+
{
|
| 1308 |
+
"epoch": 2.7761194029850746,
|
| 1309 |
+
"grad_norm": 1.1416226625442505,
|
| 1310 |
+
"learning_rate": 4.6732967035517326e-07,
|
| 1311 |
+
"loss": 0.2465,
|
| 1312 |
+
"step": 930
|
| 1313 |
+
},
|
| 1314 |
+
{
|
| 1315 |
+
"epoch": 2.791044776119403,
|
| 1316 |
+
"grad_norm": 1.2618836164474487,
|
| 1317 |
+
"learning_rate": 4.081329096036829e-07,
|
| 1318 |
+
"loss": 0.2971,
|
| 1319 |
+
"step": 935
|
| 1320 |
+
},
|
| 1321 |
+
{
|
| 1322 |
+
"epoch": 2.8059701492537314,
|
| 1323 |
+
"grad_norm": 1.1520038843154907,
|
| 1324 |
+
"learning_rate": 3.528920374977979e-07,
|
| 1325 |
+
"loss": 0.2506,
|
| 1326 |
+
"step": 940
|
| 1327 |
+
},
|
| 1328 |
+
{
|
| 1329 |
+
"epoch": 2.8208955223880596,
|
| 1330 |
+
"grad_norm": 1.231122612953186,
|
| 1331 |
+
"learning_rate": 3.0162202996468156e-07,
|
| 1332 |
+
"loss": 0.2525,
|
| 1333 |
+
"step": 945
|
| 1334 |
+
},
|
| 1335 |
+
{
|
| 1336 |
+
"epoch": 2.835820895522388,
|
| 1337 |
+
"grad_norm": 1.2169133424758911,
|
| 1338 |
+
"learning_rate": 2.5433678642100664e-07,
|
| 1339 |
+
"loss": 0.2834,
|
| 1340 |
+
"step": 950
|
| 1341 |
+
},
|
| 1342 |
+
{
|
| 1343 |
+
"epoch": 2.8507462686567164,
|
| 1344 |
+
"grad_norm": 1.2695796489715576,
|
| 1345 |
+
"learning_rate": 2.110491260047792e-07,
|
| 1346 |
+
"loss": 0.2772,
|
| 1347 |
+
"step": 955
|
| 1348 |
+
},
|
| 1349 |
+
{
|
| 1350 |
+
"epoch": 2.8656716417910446,
|
| 1351 |
+
"grad_norm": 1.1452456712722778,
|
| 1352 |
+
"learning_rate": 1.7177078410005041e-07,
|
| 1353 |
+
"loss": 0.2584,
|
| 1354 |
+
"step": 960
|
| 1355 |
+
},
|
| 1356 |
+
{
|
| 1357 |
+
"epoch": 2.8805970149253732,
|
| 1358 |
+
"grad_norm": 1.173682689666748,
|
| 1359 |
+
"learning_rate": 1.3651240915542652e-07,
|
| 1360 |
+
"loss": 0.2962,
|
| 1361 |
+
"step": 965
|
| 1362 |
+
},
|
| 1363 |
+
{
|
| 1364 |
+
"epoch": 2.8955223880597014,
|
| 1365 |
+
"grad_norm": 1.1223686933517456,
|
| 1366 |
+
"learning_rate": 1.0528355979724624e-07,
|
| 1367 |
+
"loss": 0.31,
|
| 1368 |
+
"step": 970
|
| 1369 |
+
},
|
| 1370 |
+
{
|
| 1371 |
+
"epoch": 2.91044776119403,
|
| 1372 |
+
"grad_norm": 1.1745933294296265,
|
| 1373 |
+
"learning_rate": 7.809270223821552e-08,
|
| 1374 |
+
"loss": 0.2515,
|
| 1375 |
+
"step": 975
|
| 1376 |
+
},
|
| 1377 |
+
{
|
| 1378 |
+
"epoch": 2.925373134328358,
|
| 1379 |
+
"grad_norm": 1.2465354204177856,
|
| 1380 |
+
"learning_rate": 5.4947207982204985e-08,
|
| 1381 |
+
"loss": 0.2521,
|
| 1382 |
+
"step": 980
|
| 1383 |
+
},
|
| 1384 |
+
{
|
| 1385 |
+
"epoch": 2.9402985074626864,
|
| 1386 |
+
"grad_norm": 1.225307822227478,
|
| 1387 |
+
"learning_rate": 3.585335182580529e-08,
|
| 1388 |
+
"loss": 0.2909,
|
| 1389 |
+
"step": 985
|
| 1390 |
+
},
|
| 1391 |
+
{
|
| 1392 |
+
"epoch": 2.955223880597015,
|
| 1393 |
+
"grad_norm": 1.2866557836532593,
|
| 1394 |
+
"learning_rate": 2.0816310157227846e-08,
|
| 1395 |
+
"loss": 0.2654,
|
| 1396 |
+
"step": 990
|
| 1397 |
+
},
|
| 1398 |
+
{
|
| 1399 |
+
"epoch": 2.970149253731343,
|
| 1400 |
+
"grad_norm": 1.1425098180770874,
|
| 1401 |
+
"learning_rate": 9.840159552969019e-09,
|
| 1402 |
+
"loss": 0.2893,
|
| 1403 |
+
"step": 995
|
| 1404 |
+
},
|
| 1405 |
+
{
|
| 1406 |
+
"epoch": 2.9850746268656714,
|
| 1407 |
+
"grad_norm": 1.2532261610031128,
|
| 1408 |
+
"learning_rate": 2.9278756726375257e-09,
|
| 1409 |
+
"loss": 0.2369,
|
| 1410 |
+
"step": 1000
|
| 1411 |
+
},
|
| 1412 |
+
{
|
| 1413 |
+
"epoch": 3.0,
|
| 1414 |
+
"grad_norm": 1.1680136919021606,
|
| 1415 |
+
"learning_rate": 8.133245225305785e-11,
|
| 1416 |
+
"loss": 0.2499,
|
| 1417 |
+
"step": 1005
|
| 1418 |
+
},
|
| 1419 |
+
{
|
| 1420 |
+
"epoch": 3.0,
|
| 1421 |
+
"step": 1005,
|
| 1422 |
+
"total_flos": 1.482127184633004e+18,
|
| 1423 |
+
"train_loss": 0.5478373010360187,
|
| 1424 |
+
"train_runtime": 672.1067,
|
| 1425 |
+
"train_samples_per_second": 47.805,
|
| 1426 |
+
"train_steps_per_second": 1.495
|
| 1427 |
+
}
|
| 1428 |
+
],
|
| 1429 |
+
"logging_steps": 5,
|
| 1430 |
+
"max_steps": 1005,
|
| 1431 |
+
"num_input_tokens_seen": 0,
|
| 1432 |
+
"num_train_epochs": 3,
|
| 1433 |
+
"save_steps": 2000,
|
| 1434 |
+
"stateful_callbacks": {
|
| 1435 |
+
"TrainerControl": {
|
| 1436 |
+
"args": {
|
| 1437 |
+
"should_epoch_stop": false,
|
| 1438 |
+
"should_evaluate": false,
|
| 1439 |
+
"should_log": false,
|
| 1440 |
+
"should_save": false,
|
| 1441 |
+
"should_training_stop": false
|
| 1442 |
+
},
|
| 1443 |
+
"attributes": {}
|
| 1444 |
+
}
|
| 1445 |
+
},
|
| 1446 |
+
"total_flos": 1.482127184633004e+18,
|
| 1447 |
+
"train_batch_size": 2,
|
| 1448 |
+
"trial_name": null,
|
| 1449 |
+
"trial_params": null
|
| 1450 |
+
}
|
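Since trainer_state.json is plain JSON, the logged loss curve and cosine learning-rate decay above can be inspected without loading the model. A minimal sketch, assuming the repo has been downloaded locally with this folder layout (the path is illustrative):

```python
import json

# Load the trainer state written by transformers' Trainer.
with open("2_128_e3_3e-5/trainer_state.json") as f:
    state = json.load(f)

# Entries logged every 5 steps carry loss/grad_norm/learning_rate/step;
# the final entry holds the run summary instead, so filter on "loss".
logs = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in logs]
losses = [e["loss"] for e in logs]

print(f"{len(logs)} log points, final loss {losses[-1]} at step {steps[-1]}")
```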
2_128_e3_3e-5/training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:029e470f5991d42ad81e00ee01edf16b3c021d8308a6a72505c8365c9183f4b9
+size 8145
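training_args.bin is stored as a Git LFS pointer because it is a pickled TrainingArguments object rather than text. If the exact run configuration is needed, it can be unpickled with torch; a sketch assuming transformers is installed (the class must be importable), and noting that recent PyTorch versions require weights_only=False for non-tensor pickles — only do this for files you trust:

```python
import torch

# training_args.bin is a pickled TrainingArguments dataclass, not a tensor,
# so weights_only=False is needed to unpickle it (unpickling runs code,
# so only load files from sources you trust).
args = torch.load("2_128_e3_3e-5/training_args.bin", weights_only=False)

print(args.learning_rate, args.num_train_epochs, args.lr_scheduler_type)
```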
2_128_e3_3e-5/vocab.json
ADDED
The diff for this file is too large to render. See raw diff.