Upload folder using huggingface_hub

Browse files

Files changed (13) hide show

10_128_e3_3e-5/README.md +63 -0
10_128_e3_3e-5/adapter_config.json +39 -0
10_128_e3_3e-5/adapter_model.safetensors +3 -0
10_128_e3_3e-5/all_results.json +9 -0
10_128_e3_3e-5/config.json +32 -0
10_128_e3_3e-5/merges.txt +0 -0
10_128_e3_3e-5/special_tokens_map.json +45 -0
10_128_e3_3e-5/tokenizer.json +0 -0
10_128_e3_3e-5/tokenizer_config.json +188 -0
10_128_e3_3e-5/train_results.json +9 -0
10_128_e3_3e-5/trainer_state.json +1471 -0
10_128_e3_3e-5/training_args.bin +3 -0
10_128_e3_3e-5/vocab.json +0 -0

10_128_e3_3e-5/README.md ADDED Viewed

	@@ -0,0 +1,63 @@

+---
+library_name: peft
+license: apache-2.0
+base_model: ibm-granite/granite-3.3-8b-base
+tags:
+- alignment-handbook
+- generated_from_trainer
+datasets:
+- data/knowledge_lora_training_data_2000
+model-index:
+- name: 10_128_e3_3e-5
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# 10_128_e3_3e-5
+This model is a LoRA adapter (trained with PEFT) for [ibm-granite/granite-3.3-8b-base](https://huggingface.co/ibm-granite/granite-3.3-8b-base), fine-tuned on the data/knowledge_lora_training_data_2000 dataset.
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 3e-05
+- train_batch_size: 2
+- eval_batch_size: 8
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 8
+- gradient_accumulation_steps: 2
+- total_train_batch_size: 32
+- total_eval_batch_size: 64
+- optimizer: adamw_torch with betas=(0.9, 0.95), epsilon=1e-08, and no additional optimizer arguments
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.05
+- num_epochs: 3.0
+### Training results
+### Framework versions
+- PEFT 0.15.2
+- Transformers 4.52.4
+- Pytorch 2.7.0+cu126
+- Datasets 3.6.0
+- Tokenizers 0.21.2

10_128_e3_3e-5/adapter_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "ibm-granite/granite-3.3-8b-base",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "v_proj",
+    "up_proj",
+    "q_proj",
+    "down_proj",
+    "gate_proj",
+    "k_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}

10_128_e3_3e-5/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:421d8a5caa59f4352cc822e9aa3b57d7f801baca10c61e362b48a5664010564d
+size 791751704

10_128_e3_3e-5/all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 1.4978820658094408e+18,
+    "train_loss": 0.567423046625721,
+    "train_runtime": 689.9192,
+    "train_samples": 10902,
+    "train_samples_per_second": 47.406,
+    "train_steps_per_second": 1.483
+}

10_128_e3_3e-5/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "architectures": [
+    "GraniteForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0,
+  "attention_multiplier": 0.0078125,
+  "bos_token_id": 0,
+  "embedding_multiplier": 12.0,
+  "eos_token_id": 0,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12800,
+  "logits_scaling": 16.0,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "granite",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "pad_token_id": 0,
+  "residual_multiplier": 0.22,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.4",
+  "use_cache": true,
+  "vocab_size": 49152
+}

10_128_e3_3e-5/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

10_128_e3_3e-5/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<fim_prefix>",
+    "<fim_middle>",
+    "<fim_suffix>",
+    "<fim_pad>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<empty_output>",
+    "<commit_before>",
+    "<commit_msg>",
+    "<commit_after>",
+    "<reponame>"
+  ],
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<reponame>",
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

10_128_e3_3e-5/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

10_128_e3_3e-5/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,188 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<fim_prefix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<fim_middle>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<fim_suffix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<fim_pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<filename>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<gh_stars>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<commit_before>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<commit_msg>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "17": {
+      "content": "<commit_after>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "18": {
+      "content": "<reponame>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<fim_prefix>",
+    "<fim_middle>",
+    "<fim_suffix>",
+    "<fim_pad>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<empty_output>",
+    "<commit_before>",
+    "<commit_msg>",
+    "<commit_after>",
+    "<reponame>"
+  ],
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "pad_token": "<reponame>",
+  "padding_side": "left",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>",
+  "vocab_size": 49152
+}

10_128_e3_3e-5/train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 1.4978820658094408e+18,
+    "train_loss": 0.567423046625721,
+    "train_runtime": 689.9192,
+    "train_samples": 10902,
+    "train_samples_per_second": 47.406,
+    "train_steps_per_second": 1.483
+}

10_128_e3_3e-5/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1471 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 1023,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01466275659824047,
+      "grad_norm": 1.404855489730835,
+      "learning_rate": 2.307692307692308e-06,
+      "loss": 1.2897,
+      "step": 5
+    },
+    {
+      "epoch": 0.02932551319648094,
+      "grad_norm": 0.9086072444915771,
+      "learning_rate": 5.192307692307692e-06,
+      "loss": 1.3027,
+      "step": 10
+    },
+    {
+      "epoch": 0.04398826979472141,
+      "grad_norm": 0.8060473203659058,
+      "learning_rate": 8.076923076923077e-06,
+      "loss": 1.3126,
+      "step": 15
+    },
+    {
+      "epoch": 0.05865102639296188,
+      "grad_norm": 0.5817658305168152,
+      "learning_rate": 1.0961538461538462e-05,
+      "loss": 1.3005,
+      "step": 20
+    },
+    {
+      "epoch": 0.07331378299120235,
+      "grad_norm": 0.519001305103302,
+      "learning_rate": 1.3846153846153847e-05,
+      "loss": 1.2342,
+      "step": 25
+    },
+    {
+      "epoch": 0.08797653958944282,
+      "grad_norm": 0.5387595295906067,
+      "learning_rate": 1.673076923076923e-05,
+      "loss": 1.2237,
+      "step": 30
+    },
+    {
+      "epoch": 0.10263929618768329,
+      "grad_norm": 0.5379096269607544,
+      "learning_rate": 1.9615384615384617e-05,
+      "loss": 1.2331,
+      "step": 35
+    },
+    {
+      "epoch": 0.11730205278592376,
+      "grad_norm": 0.49764829874038696,
+      "learning_rate": 2.25e-05,
+      "loss": 1.2284,
+      "step": 40
+    },
+    {
+      "epoch": 0.13196480938416422,
+      "grad_norm": 0.49528032541275024,
+      "learning_rate": 2.5384615384615386e-05,
+      "loss": 1.1852,
+      "step": 45
+    },
+    {
+      "epoch": 0.1466275659824047,
+      "grad_norm": 0.5460355877876282,
+      "learning_rate": 2.8269230769230768e-05,
+      "loss": 1.1357,
+      "step": 50
+    },
+    {
+      "epoch": 0.16129032258064516,
+      "grad_norm": 0.428093820810318,
+      "learning_rate": 2.9999685962851756e-05,
+      "loss": 1.1952,
+      "step": 55
+    },
+    {
+      "epoch": 0.17595307917888564,
+      "grad_norm": 0.4283364415168762,
+      "learning_rate": 2.9996153195943092e-05,
+      "loss": 1.1639,
+      "step": 60
+    },
+    {
+      "epoch": 0.1906158357771261,
+      "grad_norm": 0.4574543833732605,
+      "learning_rate": 2.9988696043272093e-05,
+      "loss": 1.1343,
+      "step": 65
+    },
+    {
+      "epoch": 0.20527859237536658,
+      "grad_norm": 0.4929468333721161,
+      "learning_rate": 2.9977316456322143e-05,
+      "loss": 1.1432,
+      "step": 70
+    },
+    {
+      "epoch": 0.21994134897360704,
+      "grad_norm": 0.6179081797599792,
+      "learning_rate": 2.996201741304954e-05,
+      "loss": 1.1054,
+      "step": 75
+    },
+    {
+      "epoch": 0.23460410557184752,
+      "grad_norm": 0.7540493607521057,
+      "learning_rate": 2.9942802917104218e-05,
+      "loss": 1.1155,
+      "step": 80
+    },
+    {
+      "epoch": 0.24926686217008798,
+      "grad_norm": 0.5757607221603394,
+      "learning_rate": 2.9919677996781987e-05,
+      "loss": 1.0311,
+      "step": 85
+    },
+    {
+      "epoch": 0.26392961876832843,
+      "grad_norm": 0.5738183259963989,
+      "learning_rate": 2.989264870370867e-05,
+      "loss": 1.0608,
+      "step": 90
+    },
+    {
+      "epoch": 0.2785923753665689,
+      "grad_norm": 0.5233305096626282,
+      "learning_rate": 2.9861722111256466e-05,
+      "loss": 1.046,
+      "step": 95
+    },
+    {
+      "epoch": 0.2932551319648094,
+      "grad_norm": 0.532455563545227,
+      "learning_rate": 2.9826906312692855e-05,
+      "loss": 0.9954,
+      "step": 100
+    },
+    {
+      "epoch": 0.30791788856304986,
+      "grad_norm": 0.5668688416481018,
+      "learning_rate": 2.9788210419062677e-05,
+      "loss": 1.0457,
+      "step": 105
+    },
+    {
+      "epoch": 0.3225806451612903,
+      "grad_norm": 0.5246180891990662,
+      "learning_rate": 2.974564455680383e-05,
+      "loss": 1.0178,
+      "step": 110
+    },
+    {
+      "epoch": 0.33724340175953077,
+      "grad_norm": 0.5526345372200012,
+      "learning_rate": 2.969921986509725e-05,
+      "loss": 0.9902,
+      "step": 115
+    },
+    {
+      "epoch": 0.3519061583577713,
+      "grad_norm": 0.554076075553894,
+      "learning_rate": 2.964894849295187e-05,
+      "loss": 0.9988,
+      "step": 120
+    },
+    {
+      "epoch": 0.36656891495601174,
+      "grad_norm": 0.5224078893661499,
+      "learning_rate": 2.9594843596025315e-05,
+      "loss": 1.0225,
+      "step": 125
+    },
+    {
+      "epoch": 0.3812316715542522,
+      "grad_norm": 0.6201514601707458,
+      "learning_rate": 2.953691933318115e-05,
+      "loss": 0.9719,
+      "step": 130
+    },
+    {
+      "epoch": 0.39589442815249265,
+      "grad_norm": 0.5892394781112671,
+      "learning_rate": 2.9475190862783628e-05,
+      "loss": 0.9845,
+      "step": 135
+    },
+    {
+      "epoch": 0.41055718475073316,
+      "grad_norm": 0.6271149516105652,
+      "learning_rate": 2.940967433873082e-05,
+      "loss": 0.925,
+      "step": 140
+    },
+    {
+      "epoch": 0.4252199413489736,
+      "grad_norm": 0.6505178213119507,
+      "learning_rate": 2.9340386906227295e-05,
+      "loss": 0.966,
+      "step": 145
+    },
+    {
+      "epoch": 0.4398826979472141,
+      "grad_norm": 0.6356052160263062,
+      "learning_rate": 2.9267346697297322e-05,
+      "loss": 0.9323,
+      "step": 150
+    },
+    {
+      "epoch": 0.45454545454545453,
+      "grad_norm": 0.6684208512306213,
+      "learning_rate": 2.919057282603984e-05,
+      "loss": 0.9426,
+      "step": 155
+    },
+    {
+      "epoch": 0.46920821114369504,
+      "grad_norm": 0.7759992480278015,
+      "learning_rate": 2.9110085383626453e-05,
+      "loss": 0.9315,
+      "step": 160
+    },
+    {
+      "epoch": 0.4838709677419355,
+      "grad_norm": 0.7040038108825684,
+      "learning_rate": 2.902590543304372e-05,
+      "loss": 0.9105,
+      "step": 165
+    },
+    {
+      "epoch": 0.49853372434017595,
+      "grad_norm": 0.7048050165176392,
+      "learning_rate": 2.893805500358109e-05,
+      "loss": 0.9013,
+      "step": 170
+    },
+    {
+      "epoch": 0.5131964809384164,
+      "grad_norm": 0.6984567642211914,
+      "learning_rate": 2.8846557085066033e-05,
+      "loss": 0.888,
+      "step": 175
+    },
+    {
+      "epoch": 0.5278592375366569,
+      "grad_norm": 0.726510226726532,
+      "learning_rate": 2.8751435621847747e-05,
+      "loss": 0.8879,
+      "step": 180
+    },
+    {
+      "epoch": 0.5425219941348973,
+      "grad_norm": 0.6819034814834595,
+      "learning_rate": 2.865271550653108e-05,
+      "loss": 0.8786,
+      "step": 185
+    },
+    {
+      "epoch": 0.5571847507331378,
+      "grad_norm": 0.8962215185165405,
+      "learning_rate": 2.8550422573462363e-05,
+      "loss": 0.8462,
+      "step": 190
+    },
+    {
+      "epoch": 0.5718475073313783,
+      "grad_norm": 0.6712878346443176,
+      "learning_rate": 2.8444583591968676e-05,
+      "loss": 0.8501,
+      "step": 195
+    },
+    {
+      "epoch": 0.5865102639296188,
+      "grad_norm": 0.8235214948654175,
+      "learning_rate": 2.8335226259352578e-05,
+      "loss": 0.8633,
+      "step": 200
+    },
+    {
+      "epoch": 0.6011730205278593,
+      "grad_norm": 0.7097874879837036,
+      "learning_rate": 2.8222379193643863e-05,
+      "loss": 0.8318,
+      "step": 205
+    },
+    {
+      "epoch": 0.6158357771260997,
+      "grad_norm": 0.7363569140434265,
+      "learning_rate": 2.8106071926110472e-05,
+      "loss": 0.8345,
+      "step": 210
+    },
+    {
+      "epoch": 0.6304985337243402,
+      "grad_norm": 0.9131688475608826,
+      "learning_rate": 2.7986334893530343e-05,
+      "loss": 0.8133,
+      "step": 215
+    },
+    {
+      "epoch": 0.6451612903225806,
+      "grad_norm": 0.8629419803619385,
+      "learning_rate": 2.7863199430226328e-05,
+      "loss": 0.8877,
+      "step": 220
+    },
+    {
+      "epoch": 0.6598240469208211,
+      "grad_norm": 0.77187579870224,
+      "learning_rate": 2.7736697759866244e-05,
+      "loss": 0.837,
+      "step": 225
+    },
+    {
+      "epoch": 0.6744868035190615,
+      "grad_norm": 1.0400891304016113,
+      "learning_rate": 2.760686298703015e-05,
+      "loss": 0.8241,
+      "step": 230
+    },
+    {
+      "epoch": 0.6891495601173021,
+      "grad_norm": 0.9449446201324463,
+      "learning_rate": 2.7473729088547127e-05,
+      "loss": 0.782,
+      "step": 235
+    },
+    {
+      "epoch": 0.7038123167155426,
+      "grad_norm": 1.0150853395462036,
+      "learning_rate": 2.7337330904603776e-05,
+      "loss": 0.8285,
+      "step": 240
+    },
+    {
+      "epoch": 0.718475073313783,
+      "grad_norm": 0.8893378376960754,
+      "learning_rate": 2.71977041296268e-05,
+      "loss": 0.8303,
+      "step": 245
+    },
+    {
+      "epoch": 0.7331378299120235,
+      "grad_norm": 0.9086685180664062,
+      "learning_rate": 2.7054885302942028e-05,
+      "loss": 0.7748,
+      "step": 250
+    },
+    {
+      "epoch": 0.7478005865102639,
+      "grad_norm": 0.8139998912811279,
+      "learning_rate": 2.6908911799212322e-05,
+      "loss": 0.8112,
+      "step": 255
+    },
+    {
+      "epoch": 0.7624633431085044,
+      "grad_norm": 0.7712191939353943,
+      "learning_rate": 2.6759821818656918e-05,
+      "loss": 0.7374,
+      "step": 260
+    },
+    {
+      "epoch": 0.7771260997067448,
+      "grad_norm": 0.948762059211731,
+      "learning_rate": 2.660765437705469e-05,
+      "loss": 0.7672,
+      "step": 265
+    },
+    {
+      "epoch": 0.7917888563049853,
+      "grad_norm": 0.8369901180267334,
+      "learning_rate": 2.6452449295533995e-05,
+      "loss": 0.7307,
+      "step": 270
+    },
+    {
+      "epoch": 0.8064516129032258,
+      "grad_norm": 0.8895859122276306,
+      "learning_rate": 2.6294247190151776e-05,
+      "loss": 0.734,
+      "step": 275
+    },
+    {
+      "epoch": 0.8211143695014663,
+      "grad_norm": 0.8661187291145325,
+      "learning_rate": 2.6133089461264638e-05,
+      "loss": 0.7754,
+      "step": 280
+    },
+    {
+      "epoch": 0.8357771260997068,
+      "grad_norm": 1.0180197954177856,
+      "learning_rate": 2.5969018282694648e-05,
+      "loss": 0.7032,
+      "step": 285
+    },
+    {
+      "epoch": 0.8504398826979472,
+      "grad_norm": 0.7993912100791931,
+      "learning_rate": 2.5802076590692784e-05,
+      "loss": 0.7384,
+      "step": 290
+    },
+    {
+      "epoch": 0.8651026392961877,
+      "grad_norm": 0.835310161113739,
+      "learning_rate": 2.5632308072702797e-05,
+      "loss": 0.6594,
+      "step": 295
+    },
+    {
+      "epoch": 0.8797653958944281,
+      "grad_norm": 0.8714587688446045,
+      "learning_rate": 2.5459757155928522e-05,
+      "loss": 0.7202,
+      "step": 300
+    },
+    {
+      "epoch": 0.8944281524926686,
+      "grad_norm": 0.9846564531326294,
+      "learning_rate": 2.5284468995707623e-05,
+      "loss": 0.6962,
+      "step": 305
+    },
+    {
+      "epoch": 0.9090909090909091,
+      "grad_norm": 0.99905925989151,
+      "learning_rate": 2.5106489463694727e-05,
+      "loss": 0.6634,
+      "step": 310
+    },
+    {
+      "epoch": 0.9237536656891495,
+      "grad_norm": 0.8811106085777283,
+      "learning_rate": 2.492586513585718e-05,
+      "loss": 0.7435,
+      "step": 315
+    },
+    {
+      "epoch": 0.9384164222873901,
+      "grad_norm": 0.9337815046310425,
+      "learning_rate": 2.474264328028641e-05,
+      "loss": 0.6988,
+      "step": 320
+    },
+    {
+      "epoch": 0.9530791788856305,
+      "grad_norm": 1.0272332429885864,
+      "learning_rate": 2.4556871844828245e-05,
+      "loss": 0.6982,
+      "step": 325
+    },
+    {
+      "epoch": 0.967741935483871,
+      "grad_norm": 1.043241262435913,
+      "learning_rate": 2.4368599444535284e-05,
+      "loss": 0.6366,
+      "step": 330
+    },
+    {
+      "epoch": 0.9824046920821115,
+      "grad_norm": 1.0359058380126953,
+      "learning_rate": 2.4177875348944678e-05,
+      "loss": 0.686,
+      "step": 335
+    },
+    {
+      "epoch": 0.9970674486803519,
+      "grad_norm": 1.171642541885376,
+      "learning_rate": 2.3984749469184677e-05,
+      "loss": 0.7073,
+      "step": 340
+    },
+    {
+      "epoch": 1.0117302052785924,
+      "grad_norm": 0.9656211733818054,
+      "learning_rate": 2.3789272344913208e-05,
+      "loss": 0.6161,
+      "step": 345
+    },
+    {
+      "epoch": 1.0263929618768328,
+      "grad_norm": 0.9734588265419006,
+      "learning_rate": 2.359149513109204e-05,
+      "loss": 0.5756,
+      "step": 350
+    },
+    {
+      "epoch": 1.0410557184750733,
+      "grad_norm": 1.0881298780441284,
+      "learning_rate": 2.3391469584599877e-05,
+      "loss": 0.5743,
+      "step": 355
+    },
+    {
+      "epoch": 1.0557184750733137,
+      "grad_norm": 1.1606433391571045,
+      "learning_rate": 2.318924805068797e-05,
+      "loss": 0.5845,
+      "step": 360
+    },
+    {
+      "epoch": 1.0703812316715542,
+      "grad_norm": 1.0384148359298706,
+      "learning_rate": 2.2984883449281716e-05,
+      "loss": 0.5896,
+      "step": 365
+    },
+    {
+      "epoch": 1.0850439882697946,
+      "grad_norm": 1.040235161781311,
+      "learning_rate": 2.277842926113193e-05,
+      "loss": 0.5823,
+      "step": 370
+    },
+    {
+      "epoch": 1.099706744868035,
+      "grad_norm": 1.1339130401611328,
+      "learning_rate": 2.256993951381928e-05,
+      "loss": 0.5433,
+      "step": 375
+    },
+    {
+      "epoch": 1.1143695014662756,
+      "grad_norm": 1.0906201601028442,
+      "learning_rate": 2.235946876761567e-05,
+      "loss": 0.5674,
+      "step": 380
+    },
+    {
+      "epoch": 1.129032258064516,
+      "grad_norm": 1.2759273052215576,
+      "learning_rate": 2.2147072101206217e-05,
+      "loss": 0.6016,
+      "step": 385
+    },
+    {
+      "epoch": 1.1436950146627567,
+      "grad_norm": 0.9344790577888489,
+      "learning_rate": 2.193280509727554e-05,
+      "loss": 0.5668,
+      "step": 390
+    },
+    {
+      "epoch": 1.1583577712609971,
+      "grad_norm": 1.1530736684799194,
+      "learning_rate": 2.171672382796218e-05,
+      "loss": 0.5293,
+      "step": 395
+    },
+    {
+      "epoch": 1.1730205278592376,
+      "grad_norm": 1.1002442836761475,
+      "learning_rate": 2.149888484018492e-05,
+      "loss": 0.5772,
+      "step": 400
+    },
+    {
+      "epoch": 1.187683284457478,
+      "grad_norm": 1.137981653213501,
+      "learning_rate": 2.127934514084486e-05,
+      "loss": 0.531,
+      "step": 405
+    },
+    {
+      "epoch": 1.2023460410557185,
+      "grad_norm": 1.17744779586792,
+      "learning_rate": 2.1058162181907112e-05,
+      "loss": 0.5558,
+      "step": 410
+    },
+    {
+      "epoch": 1.217008797653959,
+      "grad_norm": 1.029596209526062,
+      "learning_rate": 2.0835393845366062e-05,
+      "loss": 0.548,
+      "step": 415
+    },
+    {
+      "epoch": 1.2316715542521994,
+      "grad_norm": 1.1402246952056885,
+      "learning_rate": 2.061109842809803e-05,
+      "loss": 0.5304,
+      "step": 420
+    },
+    {
+      "epoch": 1.2463343108504399,
+      "grad_norm": 1.0744937658309937,
+      "learning_rate": 2.0385334626605412e-05,
+      "loss": 0.5108,
+      "step": 425
+    },
+    {
+      "epoch": 1.2609970674486803,
+      "grad_norm": 1.1842135190963745,
+      "learning_rate": 2.0158161521656245e-05,
+      "loss": 0.5034,
+      "step": 430
+    },
+    {
+      "epoch": 1.2756598240469208,
+      "grad_norm": 1.0653235912322998,
+      "learning_rate": 1.9929638562823155e-05,
+      "loss": 0.5075,
+      "step": 435
+    },
+    {
+      "epoch": 1.2903225806451613,
+      "grad_norm": 0.9864193201065063,
+      "learning_rate": 1.9699825552925858e-05,
+      "loss": 0.5499,
+      "step": 440
+    },
+    {
+      "epoch": 1.3049853372434017,
+      "grad_norm": 1.0366642475128174,
+      "learning_rate": 1.9468782632381188e-05,
+      "loss": 0.506,
+      "step": 445
+    },
+    {
+      "epoch": 1.3196480938416422,
+      "grad_norm": 1.1418167352676392,
+      "learning_rate": 1.9236570263464763e-05,
+      "loss": 0.5623,
+      "step": 450
+    },
+    {
+      "epoch": 1.3343108504398826,
+      "grad_norm": 1.1316769123077393,
+      "learning_rate": 1.9003249214488473e-05,
+      "loss": 0.5067,
+      "step": 455
+    },
+    {
+      "epoch": 1.3489736070381233,
+      "grad_norm": 1.1603069305419922,
+      "learning_rate": 1.8768880543897814e-05,
+      "loss": 0.5099,
+      "step": 460
+    },
+    {
+      "epoch": 1.3636363636363638,
+      "grad_norm": 1.1564180850982666,
+      "learning_rate": 1.853352558429336e-05,
+      "loss": 0.4988,
+      "step": 465
+    },
+    {
+      "epoch": 1.3782991202346042,
+      "grad_norm": 1.100997805595398,
+      "learning_rate": 1.8297245926380427e-05,
+      "loss": 0.5056,
+      "step": 470
+    },
+    {
+      "epoch": 1.3929618768328447,
+      "grad_norm": 1.0203278064727783,
+      "learning_rate": 1.8060103402851274e-05,
+      "loss": 0.4754,
+      "step": 475
+    },
+    {
+      "epoch": 1.4076246334310851,
+      "grad_norm": 1.0419028997421265,
+      "learning_rate": 1.7822160072203884e-05,
+      "loss": 0.5307,
+      "step": 480
+    },
+    {
+      "epoch": 1.4222873900293256,
+      "grad_norm": 1.1616261005401611,
+      "learning_rate": 1.7583478202501737e-05,
+      "loss": 0.468,
+      "step": 485
+    },
+    {
+      "epoch": 1.436950146627566,
+      "grad_norm": 1.0473759174346924,
+      "learning_rate": 1.734412025507867e-05,
+      "loss": 0.478,
+      "step": 490
+    },
+    {
+      "epoch": 1.4516129032258065,
+      "grad_norm": 1.030143141746521,
+      "learning_rate": 1.7104148868193232e-05,
+      "loss": 0.5341,
+      "step": 495
+    },
+    {
+      "epoch": 1.466275659824047,
+      "grad_norm": 1.3637970685958862,
+      "learning_rate": 1.686362684063666e-05,
+      "loss": 0.4753,
+      "step": 500
+    },
+    {
+      "epoch": 1.4809384164222874,
+      "grad_norm": 1.1191473007202148,
+      "learning_rate": 1.6622617115298923e-05,
+      "loss": 0.4577,
+      "step": 505
+    },
+    {
+      "epoch": 1.4956011730205279,
+      "grad_norm": 1.2150533199310303,
+      "learning_rate": 1.6381182762696993e-05,
+      "loss": 0.4491,
+      "step": 510
+    },
+    {
+      "epoch": 1.5102639296187683,
+      "grad_norm": 1.005942463874817,
+      "learning_rate": 1.6139386964469754e-05,
+      "loss": 0.4735,
+      "step": 515
+    },
+    {
+      "epoch": 1.5249266862170088,
+      "grad_norm": 1.232484221458435,
+      "learning_rate": 1.589729299684382e-05,
+      "loss": 0.4707,
+      "step": 520
+    },
+    {
+      "epoch": 1.5395894428152492,
+      "grad_norm": 1.3594202995300293,
+      "learning_rate": 1.5654964214074604e-05,
+      "loss": 0.4814,
+      "step": 525
+    },
+    {
+      "epoch": 1.5542521994134897,
+      "grad_norm": 1.0931212902069092,
+      "learning_rate": 1.541246403186694e-05,
+      "loss": 0.4489,
+      "step": 530
+    },
+    {
+      "epoch": 1.5689149560117301,
+      "grad_norm": 1.0040931701660156,
+      "learning_rate": 1.5169855910779694e-05,
+      "loss": 0.4586,
+      "step": 535
+    },
+    {
+      "epoch": 1.5835777126099706,
+      "grad_norm": 1.2385358810424805,
+      "learning_rate": 1.4927203339618536e-05,
+      "loss": 0.4519,
+      "step": 540
+    },
+    {
+      "epoch": 1.598240469208211,
+      "grad_norm": 1.2667471170425415,
+      "learning_rate": 1.4684569818821412e-05,
+      "loss": 0.442,
+      "step": 545
+    },
+    {
+      "epoch": 1.6129032258064515,
+      "grad_norm": 1.0587009191513062,
+      "learning_rate": 1.4442018843840932e-05,
+      "loss": 0.446,
+      "step": 550
+    },
+    {
+      "epoch": 1.627565982404692,
+      "grad_norm": 1.2656630277633667,
+      "learning_rate": 1.4199613888528044e-05,
+      "loss": 0.3993,
+      "step": 555
+    },
+    {
+      "epoch": 1.6422287390029324,
+      "grad_norm": 1.2869272232055664,
+      "learning_rate": 1.3957418388521413e-05,
+      "loss": 0.4311,
+      "step": 560
+    },
+    {
+      "epoch": 1.6568914956011729,
+      "grad_norm": 1.0959758758544922,
+      "learning_rate": 1.3715495724646731e-05,
+      "loss": 0.4242,
+      "step": 565
+    },
+    {
+      "epoch": 1.6715542521994133,
+      "grad_norm": 1.3144580125808716,
+      "learning_rate": 1.3473909206330444e-05,
+      "loss": 0.4349,
+      "step": 570
+    },
+    {
+      "epoch": 1.6862170087976538,
+      "grad_norm": 1.212710976600647,
+      "learning_rate": 1.323272205503212e-05,
+      "loss": 0.4418,
+      "step": 575
+    },
+    {
+      "epoch": 1.7008797653958945,
+      "grad_norm": 1.2405732870101929,
+      "learning_rate": 1.299199738769983e-05,
+      "loss": 0.4164,
+      "step": 580
+    },
+    {
+      "epoch": 1.715542521994135,
+      "grad_norm": 1.1547958850860596,
+      "learning_rate": 1.2751798200252912e-05,
+      "loss": 0.4439,
+      "step": 585
+    },
+    {
+      "epoch": 1.7302052785923754,
+      "grad_norm": 1.2431228160858154,
+      "learning_rate": 1.251218735109639e-05,
+      "loss": 0.456,
+      "step": 590
+    },
+    {
+      "epoch": 1.7448680351906158,
+      "grad_norm": 1.0995466709136963,
+      "learning_rate": 1.2273227544671367e-05,
+      "loss": 0.4248,
+      "step": 595
+    },
+    {
+      "epoch": 1.7595307917888563,
+      "grad_norm": 1.2348030805587769,
+      "learning_rate": 1.2034981315045745e-05,
+      "loss": 0.4296,
+      "step": 600
+    },
+    {
+      "epoch": 1.7741935483870968,
+      "grad_norm": 1.1425237655639648,
+      "learning_rate": 1.1797511009549478e-05,
+      "loss": 0.432,
+      "step": 605
+    },
+    {
+      "epoch": 1.7888563049853372,
+      "grad_norm": 1.1894876956939697,
+      "learning_rate": 1.1560878772458757e-05,
+      "loss": 0.3871,
+      "step": 610
+    },
+    {
+      "epoch": 1.8035190615835777,
+      "grad_norm": 1.0838085412979126,
+      "learning_rate": 1.1325146528733262e-05,
+      "loss": 0.3822,
+      "step": 615
+    },
+    {
+      "epoch": 1.8181818181818183,
+      "grad_norm": 1.1685465574264526,
+      "learning_rate": 1.1090375967810879e-05,
+      "loss": 0.3995,
+      "step": 620
+    },
+    {
+      "epoch": 1.8328445747800588,
+      "grad_norm": 1.0686448812484741,
+      "learning_rate": 1.0856628527463986e-05,
+      "loss": 0.4027,
+      "step": 625
+    },
+    {
+      "epoch": 1.8475073313782993,
+      "grad_norm": 1.138701319694519,
+      "learning_rate": 1.0623965377721652e-05,
+      "loss": 0.4182,
+      "step": 630
+    },
+    {
+      "epoch": 1.8621700879765397,
+      "grad_norm": 1.1836135387420654,
+      "learning_rate": 1.0392447404861866e-05,
+      "loss": 0.3667,
+      "step": 635
+    },
+    {
+      "epoch": 1.8768328445747802,
+      "grad_norm": 1.1814188957214355,
+      "learning_rate": 1.016213519547805e-05,
+      "loss": 0.4021,
+      "step": 640
+    },
+    {
+      "epoch": 1.8914956011730206,
+      "grad_norm": 1.048718810081482,
+      "learning_rate": 9.933089020623942e-06,
+      "loss": 0.3905,
+      "step": 645
+    },
+    {
+      "epoch": 1.906158357771261,
+      "grad_norm": 1.1059246063232422,
+      "learning_rate": 9.705368820041149e-06,
+      "loss": 0.4351,
+      "step": 650
+    },
+    {
+      "epoch": 1.9208211143695015,
+      "grad_norm": 1.4553226232528687,
+      "learning_rate": 9.479034186473307e-06,
+      "loss": 0.3786,
+      "step": 655
+    },
+    {
+      "epoch": 1.935483870967742,
+      "grad_norm": 1.2545796632766724,
+      "learning_rate": 9.25414435007111e-06,
+      "loss": 0.3828,
+      "step": 660
+    },
+    {
+      "epoch": 1.9501466275659824,
+      "grad_norm": 1.3140236139297485,
+      "learning_rate": 9.03075816289217e-06,
+      "loss": 0.4024,
+      "step": 665
+    },
+    {
+      "epoch": 1.964809384164223,
+      "grad_norm": 1.1320492029190063,
+      "learning_rate": 8.808934083499897e-06,
+      "loss": 0.3613,
+      "step": 670
+    },
+    {
+      "epoch": 1.9794721407624634,
+      "grad_norm": 1.1655583381652832,
+      "learning_rate": 8.588730161665303e-06,
+      "loss": 0.3785,
+      "step": 675
+    },
+    {
+      "epoch": 1.9941348973607038,
+      "grad_norm": 1.2500450611114502,
+      "learning_rate": 8.37020402317576e-06,
+      "loss": 0.3804,
+      "step": 680
+    },
+    {
+      "epoch": 2.0087976539589443,
+      "grad_norm": 1.1778208017349243,
+      "learning_rate": 8.153412854754791e-06,
+      "loss": 0.3377,
+      "step": 685
+    },
+    {
+      "epoch": 2.0234604105571847,
+      "grad_norm": 1.3056201934814453,
+      "learning_rate": 7.938413389096684e-06,
+      "loss": 0.2937,
+      "step": 690
+    },
+    {
+      "epoch": 2.038123167155425,
+      "grad_norm": 1.2293964624404907,
+      "learning_rate": 7.72526189001995e-06,
+      "loss": 0.3157,
+      "step": 695
+    },
+    {
+      "epoch": 2.0527859237536656,
+      "grad_norm": 1.303946614265442,
+      "learning_rate": 7.5140141377435114e-06,
+      "loss": 0.3191,
+      "step": 700
+    },
+    {
+      "epoch": 2.067448680351906,
+      "grad_norm": 1.1810277700424194,
+      "learning_rate": 7.304725414289409e-06,
+      "loss": 0.3124,
+      "step": 705
+    },
+    {
+      "epoch": 2.0821114369501466,
+      "grad_norm": 1.1029647588729858,
+      "learning_rate": 7.097450489015864e-06,
+      "loss": 0.3125,
+      "step": 710
+    },
+    {
+      "epoch": 2.096774193548387,
+      "grad_norm": 1.1499955654144287,
+      "learning_rate": 6.8922436042845735e-06,
+      "loss": 0.299,
+      "step": 715
+    },
+    {
+      "epoch": 2.1114369501466275,
+      "grad_norm": 1.2358781099319458,
+      "learning_rate": 6.689158461265855e-06,
+      "loss": 0.3361,
+      "step": 720
+    },
+    {
+      "epoch": 2.126099706744868,
+      "grad_norm": 1.3672081232070923,
+      "learning_rate": 6.488248205885413e-06,
+      "loss": 0.3377,
+      "step": 725
+    },
+    {
+      "epoch": 2.1407624633431084,
+      "grad_norm": 1.0571002960205078,
+      "learning_rate": 6.289565414916472e-06,
+      "loss": 0.3094,
+      "step": 730
+    },
+    {
+      "epoch": 2.155425219941349,
+      "grad_norm": 1.1626659631729126,
+      "learning_rate": 6.093162082220785e-06,
+      "loss": 0.3238,
+      "step": 735
+    },
+    {
+      "epoch": 2.1700879765395893,
+      "grad_norm": 1.3589155673980713,
+      "learning_rate": 5.899089605142225e-06,
+      "loss": 0.2983,
+      "step": 740
+    },
+    {
+      "epoch": 2.1847507331378297,
+      "grad_norm": 1.2361712455749512,
+      "learning_rate": 5.7073987710564485e-06,
+      "loss": 0.3079,
+      "step": 745
+    },
+    {
+      "epoch": 2.19941348973607,
+      "grad_norm": 1.13412606716156,
+      "learning_rate": 5.518139744080231e-06,
+      "loss": 0.2895,
+      "step": 750
+    },
+    {
+      "epoch": 2.2140762463343107,
+      "grad_norm": 1.224824070930481,
+      "learning_rate": 5.331362051943864e-06,
+      "loss": 0.3505,
+      "step": 755
+    },
+    {
+      "epoch": 2.228739002932551,
+      "grad_norm": 1.3557347059249878,
+      "learning_rate": 5.147114573030105e-06,
+      "loss": 0.3044,
+      "step": 760
+    },
+    {
+      "epoch": 2.2434017595307916,
+      "grad_norm": 1.1949517726898193,
+      "learning_rate": 4.965445523583039e-06,
+      "loss": 0.2715,
+      "step": 765
+    },
+    {
+      "epoch": 2.258064516129032,
+      "grad_norm": 1.329795002937317,
+      "learning_rate": 4.786402445090264e-06,
+      "loss": 0.2929,
+      "step": 770
+    },
+    {
+      "epoch": 2.2727272727272725,
+      "grad_norm": 1.1268538236618042,
+      "learning_rate": 4.610032191841606e-06,
+      "loss": 0.3035,
+      "step": 775
+    },
+    {
+      "epoch": 2.2873900293255134,
+      "grad_norm": 1.2942050695419312,
+      "learning_rate": 4.43638091866769e-06,
+      "loss": 0.2901,
+      "step": 780
+    },
+    {
+      "epoch": 2.302052785923754,
+      "grad_norm": 1.1928037405014038,
+      "learning_rate": 4.265494068861539e-06,
+      "loss": 0.2802,
+      "step": 785
+    },
+    {
+      "epoch": 2.3167155425219943,
+      "grad_norm": 1.1997088193893433,
+      "learning_rate": 4.097416362286422e-06,
+      "loss": 0.2879,
+      "step": 790
+    },
+    {
+      "epoch": 2.3313782991202348,
+      "grad_norm": 1.2495310306549072,
+      "learning_rate": 3.932191783672954e-06,
+      "loss": 0.3466,
+      "step": 795
+    },
+    {
+      "epoch": 2.346041055718475,
+      "grad_norm": 1.1649198532104492,
+      "learning_rate": 3.769863571108632e-06,
+      "loss": 0.3077,
+      "step": 800
+    },
+    {
+      "epoch": 2.3607038123167157,
+      "grad_norm": 1.3067137002944946,
+      "learning_rate": 3.610474204722708e-06,
+      "loss": 0.283,
+      "step": 805
+    },
+    {
+      "epoch": 2.375366568914956,
+      "grad_norm": 1.4458105564117432,
+      "learning_rate": 3.4540653955694806e-06,
+      "loss": 0.2987,
+      "step": 810
+    },
+    {
+      "epoch": 2.3900293255131966,
+      "grad_norm": 1.2861709594726562,
+      "learning_rate": 3.300678074712782e-06,
+      "loss": 0.2962,
+      "step": 815
+    },
+    {
+      "epoch": 2.404692082111437,
+      "grad_norm": 1.1888914108276367,
+      "learning_rate": 3.1503523825146308e-06,
+      "loss": 0.3048,
+      "step": 820
+    },
+    {
+      "epoch": 2.4193548387096775,
+      "grad_norm": 1.1350572109222412,
+      "learning_rate": 3.003127658130765e-06,
+      "loss": 0.3065,
+      "step": 825
+    },
+    {
+      "epoch": 2.434017595307918,
+      "grad_norm": 1.2664979696273804,
+      "learning_rate": 2.8590424292158957e-06,
+      "loss": 0.3048,
+      "step": 830
+    },
+    {
+      "epoch": 2.4486803519061584,
+      "grad_norm": 1.222652554512024,
+      "learning_rate": 2.7181344018412736e-06,
+      "loss": 0.3064,
+      "step": 835
+    },
+    {
+      "epoch": 2.463343108504399,
+      "grad_norm": 1.2085829973220825,
+      "learning_rate": 2.5804404506272926e-06,
+      "loss": 0.3077,
+      "step": 840
+    },
+    {
+      "epoch": 2.4780058651026393,
+      "grad_norm": 1.2079604864120483,
+      "learning_rate": 2.445996609093653e-06,
+      "loss": 0.2638,
+      "step": 845
+    },
+    {
+      "epoch": 2.4926686217008798,
+      "grad_norm": 1.3720932006835938,
+      "learning_rate": 2.3148380602296665e-06,
+      "loss": 0.2865,
+      "step": 850
+    },
+    {
+      "epoch": 2.5073313782991202,
+      "grad_norm": 1.2781530618667603,
+      "learning_rate": 2.1869991272871055e-06,
+      "loss": 0.2817,
+      "step": 855
+    },
+    {
+      "epoch": 2.5219941348973607,
+      "grad_norm": 1.1719965934753418,
+      "learning_rate": 2.062513264798061e-06,
+      "loss": 0.2737,
+      "step": 860
+    },
+    {
+      "epoch": 2.536656891495601,
+      "grad_norm": 1.117966651916504,
+      "learning_rate": 1.941413049820123e-06,
+      "loss": 0.289,
+      "step": 865
+    },
+    {
+      "epoch": 2.5513196480938416,
+      "grad_norm": 1.2426401376724243,
+      "learning_rate": 1.8237301734112132e-06,
+      "loss": 0.2837,
+      "step": 870
+    },
+    {
+      "epoch": 2.565982404692082,
+      "grad_norm": 1.2011477947235107,
+      "learning_rate": 1.7094954323362495e-06,
+      "loss": 0.3109,
+      "step": 875
+    },
+    {
+      "epoch": 2.5806451612903225,
+      "grad_norm": 1.3482277393341064,
+      "learning_rate": 1.5987387210078586e-06,
+      "loss": 0.2861,
+      "step": 880
+    },
+    {
+      "epoch": 2.595307917888563,
+      "grad_norm": 1.16232168674469,
+      "learning_rate": 1.4914890236632161e-06,
+      "loss": 0.2683,
+      "step": 885
+    },
+    {
+      "epoch": 2.6099706744868034,
+      "grad_norm": 1.0949877500534058,
+      "learning_rate": 1.3877744067790933e-06,
+      "loss": 0.2858,
+      "step": 890
+    },
+    {
+      "epoch": 2.624633431085044,
+      "grad_norm": 1.257165551185608,
+      "learning_rate": 1.2876220117270466e-06,
+      "loss": 0.2818,
+      "step": 895
+    },
+    {
+      "epoch": 2.6392961876832843,
+      "grad_norm": 1.2227627038955688,
+      "learning_rate": 1.1910580476707305e-06,
+      "loss": 0.2841,
+      "step": 900
+    },
+    {
+      "epoch": 2.653958944281525,
+      "grad_norm": 1.2360917329788208,
+      "learning_rate": 1.0981077847071236e-06,
+      "loss": 0.2796,
+      "step": 905
+    },
+    {
+      "epoch": 2.6686217008797652,
+      "grad_norm": 1.1866899728775024,
+      "learning_rate": 1.0087955472535526e-06,
+      "loss": 0.2982,
+      "step": 910
+    },
+    {
+      "epoch": 2.6832844574780057,
+      "grad_norm": 1.1590216159820557,
+      "learning_rate": 9.231447076821503e-07,
+      "loss": 0.2835,
+      "step": 915
+    },
+    {
+      "epoch": 2.6979472140762466,
+      "grad_norm": 1.215848445892334,
+      "learning_rate": 8.411776802034843e-07,
+      "loss": 0.317,
+      "step": 920
+    },
+    {
+      "epoch": 2.712609970674487,
+      "grad_norm": 1.1392394304275513,
+      "learning_rate": 7.629159150008958e-07,
+      "loss": 0.2859,
+      "step": 925
+    },
+    {
+      "epoch": 2.7272727272727275,
+      "grad_norm": 1.3150230646133423,
+      "learning_rate": 6.88379892617173e-07,
+      "loss": 0.2515,
+      "step": 930
+    },
+    {
+      "epoch": 2.741935483870968,
+      "grad_norm": 1.205656886100769,
+      "learning_rate": 6.175891185949189e-07,
+      "loss": 0.2767,
+      "step": 935
+    },
+    {
+      "epoch": 2.7565982404692084,
+      "grad_norm": 1.210679054260254,
+      "learning_rate": 5.505621183720904e-07,
+      "loss": 0.2882,
+      "step": 940
+    },
+    {
+      "epoch": 2.771260997067449,
+      "grad_norm": 1.0707758665084839,
+      "learning_rate": 4.873164324340318e-07,
+      "loss": 0.2885,
+      "step": 945
+    },
+    {
+      "epoch": 2.7859237536656893,
+      "grad_norm": 1.2976630926132202,
+      "learning_rate": 4.2786861172325774e-07,
+      "loss": 0.3224,
+      "step": 950
+    },
+    {
+      "epoch": 2.80058651026393,
+      "grad_norm": 1.1435562372207642,
+      "learning_rate": 3.722342133081785e-07,
+      "loss": 0.2875,
+      "step": 955
+    },
+    {
+      "epoch": 2.8152492668621703,
+      "grad_norm": 1.1515800952911377,
+      "learning_rate": 3.204277963119362e-07,
+      "loss": 0.2955,
+      "step": 960
+    },
+    {
+      "epoch": 2.8299120234604107,
+      "grad_norm": 1.1778301000595093,
+      "learning_rate": 2.724629181023841e-07,
+      "loss": 0.2753,
+      "step": 965
+    },
+    {
+      "epoch": 2.844574780058651,
+      "grad_norm": 1.2622073888778687,
+      "learning_rate": 2.283521307442199e-07,
+      "loss": 0.3011,
+      "step": 970
+    },
+    {
+      "epoch": 2.8592375366568916,
+      "grad_norm": 1.1627691984176636,
+      "learning_rate": 1.881069777142047e-07,
+      "loss": 0.3021,
+      "step": 975
+    },
+    {
+      "epoch": 2.873900293255132,
+      "grad_norm": 1.2690354585647583,
+      "learning_rate": 1.517379908803046e-07,
+      "loss": 0.2888,
+      "step": 980
+    },
+    {
+      "epoch": 2.8885630498533725,
+      "grad_norm": 1.1645830869674683,
+      "learning_rate": 1.1925468774559855e-07,
+      "loss": 0.2723,
+      "step": 985
+    },
+    {
+      "epoch": 2.903225806451613,
+      "grad_norm": 1.177254319190979,
+      "learning_rate": 9.066556895759249e-08,
+      "loss": 0.2836,
+      "step": 990
+    },
+    {
+      "epoch": 2.9178885630498534,
+      "grad_norm": 1.414916753768921,
+      "learning_rate": 6.597811608368031e-08,
+      "loss": 0.3021,
+      "step": 995
+    },
+    {
+      "epoch": 2.932551319648094,
+      "grad_norm": 1.2326536178588867,
+      "learning_rate": 4.519878965325852e-08,
+      "loss": 0.2895,
+      "step": 1000
+    },
+    {
+      "epoch": 2.9472140762463344,
+      "grad_norm": 1.2043007612228394,
+      "learning_rate": 2.8333027467053463e-08,
+      "loss": 0.2662,
+      "step": 1005
+    },
+    {
+      "epoch": 2.961876832844575,
+      "grad_norm": 1.2307794094085693,
+      "learning_rate": 1.5385243174099728e-08,
+      "loss": 0.2739,
+      "step": 1010
+    },
+    {
+      "epoch": 2.9765395894428153,
+      "grad_norm": 1.1917132139205933,
+      "learning_rate": 6.3588251167007176e-09,
+      "loss": 0.2843,
+      "step": 1015
+    },
+    {
+      "epoch": 2.9912023460410557,
+      "grad_norm": 1.2122886180877686,
+      "learning_rate": 1.2561354437412576e-09,
+      "loss": 0.2653,
+      "step": 1020
+    },
+    {
+      "epoch": 3.0,
+      "step": 1023,
+      "total_flos": 1.4978820658094408e+18,
+      "train_loss": 0.567423046625721,
+      "train_runtime": 689.9192,
+      "train_samples_per_second": 47.406,
+      "train_steps_per_second": 1.483
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1023,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 2000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.4978820658094408e+18,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

10_128_e3_3e-5/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3fe1c439d1beaccd67a5b4e3239a08eccc460ada83f1bd98996143ff1ff7e978
+size 8145

10_128_e3_3e-5/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff