Upload folder using huggingface_hub

Browse files

Files changed (13) hide show

19_128_e3_3e-5/README.md +63 -0
19_128_e3_3e-5/adapter_config.json +39 -0
19_128_e3_3e-5/adapter_model.safetensors +3 -0
19_128_e3_3e-5/all_results.json +9 -0
19_128_e3_3e-5/config.json +32 -0
19_128_e3_3e-5/merges.txt +0 -0
19_128_e3_3e-5/special_tokens_map.json +45 -0
19_128_e3_3e-5/tokenizer.json +0 -0
19_128_e3_3e-5/tokenizer_config.json +188 -0
19_128_e3_3e-5/train_results.json +9 -0
19_128_e3_3e-5/trainer_state.json +1394 -0
19_128_e3_3e-5/training_args.bin +3 -0
19_128_e3_3e-5/vocab.json +0 -0

19_128_e3_3e-5/README.md ADDED Viewed

	@@ -0,0 +1,63 @@

+---
+library_name: peft
+license: apache-2.0
+base_model: ibm-granite/granite-3.3-8b-base
+tags:
+- alignment-handbook
+- generated_from_trainer
+datasets:
+- data/knowledge_lora_training_data_2000
+model-index:
+- name: 19_128_e3_3e-5
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# 19_128_e3_3e-5
+This model is a fine-tuned version of [ibm-granite/granite-3.3-8b-base](https://huggingface.co/ibm-granite/granite-3.3-8b-base) on the data/knowledge_lora_training_data_2000 dataset.
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 3e-05
+- train_batch_size: 2
+- eval_batch_size: 8
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 8
+- gradient_accumulation_steps: 2
+- total_train_batch_size: 32
+- total_eval_batch_size: 64
+- optimizer: adamw_torch with betas=(0.9,0.95) and epsilon=1e-08 (no additional optimizer arguments)
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.05
+- num_epochs: 3.0
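+### Training results
+Training ran for 3.0 epochs (966 optimizer steps) with a reported train_loss of 0.5874; see all_results.json and trainer_state.json for the full metrics and per-step log.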
+### Framework versions
+- PEFT 0.15.2
+- Transformers 4.52.4
+- Pytorch 2.7.0+cu126
+- Datasets 3.6.0
+- Tokenizers 0.21.2

19_128_e3_3e-5/adapter_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "ibm-granite/granite-3.3-8b-base",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "up_proj",
+    "q_proj",
+    "v_proj",
+    "down_proj",
+    "o_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}

19_128_e3_3e-5/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:088d7e1a6c1e7d8ad3df8906213e359369a3d12550e46723aee6fcfca0909eb7
+size 791751704

19_128_e3_3e-5/all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 1.3759050762074194e+18,
+    "train_loss": 0.5874219706771784,
+    "train_runtime": 632.2067,
+    "train_samples": 10278,
+    "train_samples_per_second": 48.772,
+    "train_steps_per_second": 1.528
+}

19_128_e3_3e-5/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "architectures": [
+    "GraniteForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0,
+  "attention_multiplier": 0.0078125,
+  "bos_token_id": 0,
+  "embedding_multiplier": 12.0,
+  "eos_token_id": 0,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12800,
+  "logits_scaling": 16.0,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "granite",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "pad_token_id": 0,
+  "residual_multiplier": 0.22,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.4",
+  "use_cache": true,
+  "vocab_size": 49152
+}

19_128_e3_3e-5/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

19_128_e3_3e-5/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<fim_prefix>",
+    "<fim_middle>",
+    "<fim_suffix>",
+    "<fim_pad>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<empty_output>",
+    "<commit_before>",
+    "<commit_msg>",
+    "<commit_after>",
+    "<reponame>"
+  ],
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<reponame>",
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

19_128_e3_3e-5/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

19_128_e3_3e-5/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,188 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<fim_prefix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<fim_middle>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<fim_suffix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<fim_pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<filename>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<gh_stars>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<commit_before>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<commit_msg>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "17": {
+      "content": "<commit_after>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "18": {
+      "content": "<reponame>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<fim_prefix>",
+    "<fim_middle>",
+    "<fim_suffix>",
+    "<fim_pad>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<empty_output>",
+    "<commit_before>",
+    "<commit_msg>",
+    "<commit_after>",
+    "<reponame>"
+  ],
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "pad_token": "<reponame>",
+  "padding_side": "left",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>",
+  "vocab_size": 49152
+}

19_128_e3_3e-5/train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 1.3759050762074194e+18,
+    "train_loss": 0.5874219706771784,
+    "train_runtime": 632.2067,
+    "train_samples": 10278,
+    "train_samples_per_second": 48.772,
+    "train_steps_per_second": 1.528
+}

19_128_e3_3e-5/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1394 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 966,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.015552099533437015,
+      "grad_norm": 1.1665724515914917,
+      "learning_rate": 2.4489795918367347e-06,
+      "loss": 1.3511,
+      "step": 5
+    },
+    {
+      "epoch": 0.03110419906687403,
+      "grad_norm": 0.7245913743972778,
+      "learning_rate": 5.510204081632653e-06,
+      "loss": 1.4013,
+      "step": 10
+    },
+    {
+      "epoch": 0.04665629860031104,
+      "grad_norm": 0.5644237399101257,
+      "learning_rate": 8.571428571428571e-06,
+      "loss": 1.2894,
+      "step": 15
+    },
+    {
+      "epoch": 0.06220839813374806,
+      "grad_norm": 1.1369251012802124,
+      "learning_rate": 1.163265306122449e-05,
+      "loss": 1.3037,
+      "step": 20
+    },
+    {
+      "epoch": 0.07776049766718507,
+      "grad_norm": 0.5249414443969727,
+      "learning_rate": 1.4693877551020408e-05,
+      "loss": 1.2825,
+      "step": 25
+    },
+    {
+      "epoch": 0.09331259720062209,
+      "grad_norm": 0.5075188875198364,
+      "learning_rate": 1.7755102040816327e-05,
+      "loss": 1.2408,
+      "step": 30
+    },
+    {
+      "epoch": 0.1088646967340591,
+      "grad_norm": 0.5279954671859741,
+      "learning_rate": 2.0816326530612247e-05,
+      "loss": 1.2877,
+      "step": 35
+    },
+    {
+      "epoch": 0.12441679626749612,
+      "grad_norm": 0.48007676005363464,
+      "learning_rate": 2.3877551020408164e-05,
+      "loss": 1.2534,
+      "step": 40
+    },
+    {
+      "epoch": 0.13996889580093314,
+      "grad_norm": 0.52071613073349,
+      "learning_rate": 2.6938775510204084e-05,
+      "loss": 1.2314,
+      "step": 45
+    },
+    {
+      "epoch": 0.15552099533437014,
+      "grad_norm": 0.5492898225784302,
+      "learning_rate": 3e-05,
+      "loss": 1.2783,
+      "step": 50
+    },
+    {
+      "epoch": 0.17107309486780714,
+      "grad_norm": 0.5895590782165527,
+      "learning_rate": 2.999779934619741e-05,
+      "loss": 1.1216,
+      "step": 55
+    },
+    {
+      "epoch": 0.18662519440124417,
+      "grad_norm": 0.45102763175964355,
+      "learning_rate": 2.9991198030506576e-05,
+      "loss": 1.1467,
+      "step": 60
+    },
+    {
+      "epoch": 0.20217729393468117,
+      "grad_norm": 0.46132299304008484,
+      "learning_rate": 2.9980197989888906e-05,
+      "loss": 1.1478,
+      "step": 65
+    },
+    {
+      "epoch": 0.2177293934681182,
+      "grad_norm": 0.5213153958320618,
+      "learning_rate": 2.9964802451981894e-05,
+      "loss": 1.1288,
+      "step": 70
+    },
+    {
+      "epoch": 0.2332814930015552,
+      "grad_norm": 0.7137885093688965,
+      "learning_rate": 2.9945015934152078e-05,
+      "loss": 1.1262,
+      "step": 75
+    },
+    {
+      "epoch": 0.24883359253499224,
+      "grad_norm": 0.5476042032241821,
+      "learning_rate": 2.992084424216955e-05,
+      "loss": 1.1398,
+      "step": 80
+    },
+    {
+      "epoch": 0.2643856920684292,
+      "grad_norm": 0.5478896498680115,
+      "learning_rate": 2.9892294468504425e-05,
+      "loss": 1.1329,
+      "step": 85
+    },
+    {
+      "epoch": 0.27993779160186627,
+      "grad_norm": 0.5697451829910278,
+      "learning_rate": 2.9859374990245774e-05,
+      "loss": 1.0797,
+      "step": 90
+    },
+    {
+      "epoch": 0.2954898911353033,
+      "grad_norm": 0.4974559247493744,
+      "learning_rate": 2.982209546664359e-05,
+      "loss": 1.1171,
+      "step": 95
+    },
+    {
+      "epoch": 0.3110419906687403,
+      "grad_norm": 0.5971369743347168,
+      "learning_rate": 2.97804668362746e-05,
+      "loss": 1.1315,
+      "step": 100
+    },
+    {
+      "epoch": 0.3265940902021773,
+      "grad_norm": 0.5410080552101135,
+      "learning_rate": 2.9734501313832623e-05,
+      "loss": 1.1018,
+      "step": 105
+    },
+    {
+      "epoch": 0.3421461897356143,
+      "grad_norm": 0.5843712687492371,
+      "learning_rate": 2.9684212386544562e-05,
+      "loss": 1.0267,
+      "step": 110
+    },
+    {
+      "epoch": 0.35769828926905134,
+      "grad_norm": 0.6015697121620178,
+      "learning_rate": 2.9629614810212954e-05,
+      "loss": 1.0076,
+      "step": 115
+    },
+    {
+      "epoch": 0.37325038880248834,
+      "grad_norm": 0.602651834487915,
+      "learning_rate": 2.957072460488634e-05,
+      "loss": 1.0124,
+      "step": 120
+    },
+    {
+      "epoch": 0.38880248833592534,
+      "grad_norm": 0.6042071580886841,
+      "learning_rate": 2.9507559050158618e-05,
+      "loss": 1.0516,
+      "step": 125
+    },
+    {
+      "epoch": 0.40435458786936235,
+      "grad_norm": 0.6740609407424927,
+      "learning_rate": 2.9440136680098882e-05,
+      "loss": 0.999,
+      "step": 130
+    },
+    {
+      "epoch": 0.4199066874027994,
+      "grad_norm": 0.6953685283660889,
+      "learning_rate": 2.9368477277813134e-05,
+      "loss": 0.9843,
+      "step": 135
+    },
+    {
+      "epoch": 0.4354587869362364,
+      "grad_norm": 0.6302007436752319,
+      "learning_rate": 2.929260186963954e-05,
+      "loss": 1.0633,
+      "step": 140
+    },
+    {
+      "epoch": 0.4510108864696734,
+      "grad_norm": 0.5916042327880859,
+      "learning_rate": 2.9212532718978817e-05,
+      "loss": 0.9495,
+      "step": 145
+    },
+    {
+      "epoch": 0.4665629860031104,
+      "grad_norm": 0.7407164573669434,
+      "learning_rate": 2.9128293319761762e-05,
+      "loss": 0.9612,
+      "step": 150
+    },
+    {
+      "epoch": 0.4821150855365474,
+      "grad_norm": 0.7023082971572876,
+      "learning_rate": 2.90399083895556e-05,
+      "loss": 0.9939,
+      "step": 155
+    },
+    {
+      "epoch": 0.4976671850699845,
+      "grad_norm": 0.6539564728736877,
+      "learning_rate": 2.894740386231136e-05,
+      "loss": 0.9772,
+      "step": 160
+    },
+    {
+      "epoch": 0.5132192846034215,
+      "grad_norm": 0.7266659736633301,
+      "learning_rate": 2.8850806880754333e-05,
+      "loss": 0.9117,
+      "step": 165
+    },
+    {
+      "epoch": 0.5287713841368584,
+      "grad_norm": 0.6634218096733093,
+      "learning_rate": 2.8750145788419817e-05,
+      "loss": 0.9615,
+      "step": 170
+    },
+    {
+      "epoch": 0.5443234836702955,
+      "grad_norm": 0.8376555442810059,
+      "learning_rate": 2.8645450121336567e-05,
+      "loss": 0.9009,
+      "step": 175
+    },
+    {
+      "epoch": 0.5598755832037325,
+      "grad_norm": 0.8775768280029297,
+      "learning_rate": 2.85367505993603e-05,
+      "loss": 0.8598,
+      "step": 180
+    },
+    {
+      "epoch": 0.5754276827371695,
+      "grad_norm": 0.7909477949142456,
+      "learning_rate": 2.8424079117159866e-05,
+      "loss": 0.8665,
+      "step": 185
+    },
+    {
+      "epoch": 0.5909797822706065,
+      "grad_norm": 0.7720655202865601,
+      "learning_rate": 2.8307468734858692e-05,
+      "loss": 0.8442,
+      "step": 190
+    },
+    {
+      "epoch": 0.6065318818040435,
+      "grad_norm": 0.7633533477783203,
+      "learning_rate": 2.8186953668334282e-05,
+      "loss": 0.844,
+      "step": 195
+    },
+    {
+      "epoch": 0.6220839813374806,
+      "grad_norm": 0.8189910650253296,
+      "learning_rate": 2.806256927917856e-05,
+      "loss": 0.8068,
+      "step": 200
+    },
+    {
+      "epoch": 0.6376360808709176,
+      "grad_norm": 0.8279641270637512,
+      "learning_rate": 2.793435206432205e-05,
+      "loss": 0.8302,
+      "step": 205
+    },
+    {
+      "epoch": 0.6531881804043546,
+      "grad_norm": 0.8447226881980896,
+      "learning_rate": 2.780233964532495e-05,
+      "loss": 0.8415,
+      "step": 210
+    },
+    {
+      "epoch": 0.6687402799377916,
+      "grad_norm": 0.8167079091072083,
+      "learning_rate": 2.7666570757338167e-05,
+      "loss": 0.8347,
+      "step": 215
+    },
+    {
+      "epoch": 0.6842923794712286,
+      "grad_norm": 0.9327054023742676,
+      "learning_rate": 2.7527085237737654e-05,
+      "loss": 0.7996,
+      "step": 220
+    },
+    {
+      "epoch": 0.6998444790046656,
+      "grad_norm": 0.811721682548523,
+      "learning_rate": 2.7383924014435294e-05,
+      "loss": 0.8218,
+      "step": 225
+    },
+    {
+      "epoch": 0.7153965785381027,
+      "grad_norm": 0.8451733589172363,
+      "learning_rate": 2.7237129093869814e-05,
+      "loss": 0.8216,
+      "step": 230
+    },
+    {
+      "epoch": 0.7309486780715396,
+      "grad_norm": 1.0027233362197876,
+      "learning_rate": 2.7086743548681228e-05,
+      "loss": 0.7764,
+      "step": 235
+    },
+    {
+      "epoch": 0.7465007776049767,
+      "grad_norm": 1.0437304973602295,
+      "learning_rate": 2.693281150507246e-05,
+      "loss": 0.7948,
+      "step": 240
+    },
+    {
+      "epoch": 0.7620528771384136,
+      "grad_norm": 0.9429616332054138,
+      "learning_rate": 2.6775378129861787e-05,
+      "loss": 0.8142,
+      "step": 245
+    },
+    {
+      "epoch": 0.7776049766718507,
+      "grad_norm": 0.9813476800918579,
+      "learning_rate": 2.661448961722998e-05,
+      "loss": 0.8272,
+      "step": 250
+    },
+    {
+      "epoch": 0.7931570762052877,
+      "grad_norm": 0.9027245044708252,
+      "learning_rate": 2.6450193175165994e-05,
+      "loss": 0.8074,
+      "step": 255
+    },
+    {
+      "epoch": 0.8087091757387247,
+      "grad_norm": 0.9813098311424255,
+      "learning_rate": 2.628253701161516e-05,
+      "loss": 0.7499,
+      "step": 260
+    },
+    {
+      "epoch": 0.8242612752721618,
+      "grad_norm": 0.9413791298866272,
+      "learning_rate": 2.6111570320333987e-05,
+      "loss": 0.7948,
+      "step": 265
+    },
+    {
+      "epoch": 0.8398133748055988,
+      "grad_norm": 1.061509609222412,
+      "learning_rate": 2.5937343266455715e-05,
+      "loss": 0.7484,
+      "step": 270
+    },
+    {
+      "epoch": 0.8553654743390358,
+      "grad_norm": 1.0488642454147339,
+      "learning_rate": 2.5759906971770828e-05,
+      "loss": 0.7763,
+      "step": 275
+    },
+    {
+      "epoch": 0.8709175738724728,
+      "grad_norm": 0.9727094769477844,
+      "learning_rate": 2.5579313499726874e-05,
+      "loss": 0.7487,
+      "step": 280
+    },
+    {
+      "epoch": 0.8864696734059098,
+      "grad_norm": 1.0250043869018555,
+      "learning_rate": 2.5395615840151986e-05,
+      "loss": 0.7112,
+      "step": 285
+    },
+    {
+      "epoch": 0.9020217729393468,
+      "grad_norm": 1.0248920917510986,
+      "learning_rate": 2.5208867893706568e-05,
+      "loss": 0.7353,
+      "step": 290
+    },
+    {
+      "epoch": 0.9175738724727839,
+      "grad_norm": 0.9377548098564148,
+      "learning_rate": 2.5019124456067755e-05,
+      "loss": 0.6972,
+      "step": 295
+    },
+    {
+      "epoch": 0.9331259720062208,
+      "grad_norm": 1.1410298347473145,
+      "learning_rate": 2.4826441201851223e-05,
+      "loss": 0.6945,
+      "step": 300
+    },
+    {
+      "epoch": 0.9486780715396579,
+      "grad_norm": 1.036268711090088,
+      "learning_rate": 2.463087466827511e-05,
+      "loss": 0.7185,
+      "step": 305
+    },
+    {
+      "epoch": 0.9642301710730948,
+      "grad_norm": 0.9181062579154968,
+      "learning_rate": 2.4432482238570856e-05,
+      "loss": 0.7145,
+      "step": 310
+    },
+    {
+      "epoch": 0.9797822706065319,
+      "grad_norm": 1.0887435674667358,
+      "learning_rate": 2.4231322125145777e-05,
+      "loss": 0.6757,
+      "step": 315
+    },
+    {
+      "epoch": 0.995334370139969,
+      "grad_norm": 1.0177348852157593,
+      "learning_rate": 2.4027453352502337e-05,
+      "loss": 0.6528,
+      "step": 320
+    },
+    {
+      "epoch": 1.009331259720062,
+      "grad_norm": 0.9978148937225342,
+      "learning_rate": 2.3820935739919173e-05,
+      "loss": 0.6153,
+      "step": 325
+    },
+    {
+      "epoch": 1.0248833592534992,
+      "grad_norm": 1.1397278308868408,
+      "learning_rate": 2.361182988389888e-05,
+      "loss": 0.5659,
+      "step": 330
+    },
+    {
+      "epoch": 1.0404354587869362,
+      "grad_norm": 0.9721418619155884,
+      "learning_rate": 2.3400197140387744e-05,
+      "loss": 0.6249,
+      "step": 335
+    },
+    {
+      "epoch": 1.0559875583203733,
+      "grad_norm": 1.223249077796936,
+      "learning_rate": 2.3186099606772667e-05,
+      "loss": 0.5941,
+      "step": 340
+    },
+    {
+      "epoch": 1.0715396578538103,
+      "grad_norm": 1.2995885610580444,
+      "learning_rate": 2.2969600103660517e-05,
+      "loss": 0.5937,
+      "step": 345
+    },
+    {
+      "epoch": 1.0870917573872472,
+      "grad_norm": 1.1245447397232056,
+      "learning_rate": 2.275076215644526e-05,
+      "loss": 0.5536,
+      "step": 350
+    },
+    {
+      "epoch": 1.1026438569206842,
+      "grad_norm": 0.9878132939338684,
+      "learning_rate": 2.2529649976668324e-05,
+      "loss": 0.5846,
+      "step": 355
+    },
+    {
+      "epoch": 1.1181959564541213,
+      "grad_norm": 1.0392405986785889,
+      "learning_rate": 2.230632844317761e-05,
+      "loss": 0.5888,
+      "step": 360
+    },
+    {
+      "epoch": 1.1337480559875583,
+      "grad_norm": 1.0900201797485352,
+      "learning_rate": 2.2080863083090696e-05,
+      "loss": 0.5771,
+      "step": 365
+    },
+    {
+      "epoch": 1.1493001555209954,
+      "grad_norm": 1.2057057619094849,
+      "learning_rate": 2.185332005256785e-05,
+      "loss": 0.6013,
+      "step": 370
+    },
+    {
+      "epoch": 1.1648522550544325,
+      "grad_norm": 1.0806993246078491,
+      "learning_rate": 2.162376611740047e-05,
+      "loss": 0.5444,
+      "step": 375
+    },
+    {
+      "epoch": 1.1804043545878693,
+      "grad_norm": 1.0961475372314453,
+      "learning_rate": 2.1392268633420583e-05,
+      "loss": 0.5636,
+      "step": 380
+    },
+    {
+      "epoch": 1.1959564541213064,
+      "grad_norm": 1.1113520860671997,
+      "learning_rate": 2.1158895526737313e-05,
+      "loss": 0.5751,
+      "step": 385
+    },
+    {
+      "epoch": 1.2115085536547434,
+      "grad_norm": 1.0920902490615845,
+      "learning_rate": 2.0923715273805953e-05,
+      "loss": 0.5506,
+      "step": 390
+    },
+    {
+      "epoch": 1.2270606531881805,
+      "grad_norm": 1.1759194135665894,
+      "learning_rate": 2.0686796881335555e-05,
+      "loss": 0.5751,
+      "step": 395
+    },
+    {
+      "epoch": 1.2426127527216173,
+      "grad_norm": 1.2953706979751587,
+      "learning_rate": 2.0448209866040956e-05,
+      "loss": 0.5292,
+      "step": 400
+    },
+    {
+      "epoch": 1.2581648522550544,
+      "grad_norm": 1.2400802373886108,
+      "learning_rate": 2.0208024234245155e-05,
+      "loss": 0.5356,
+      "step": 405
+    },
+    {
+      "epoch": 1.2737169517884914,
+      "grad_norm": 1.1973105669021606,
+      "learning_rate": 1.9966310461338005e-05,
+      "loss": 0.5792,
+      "step": 410
+    },
+    {
+      "epoch": 1.2892690513219285,
+      "grad_norm": 1.2088028192520142,
+      "learning_rate": 1.9723139471097307e-05,
+      "loss": 0.5398,
+      "step": 415
+    },
+    {
+      "epoch": 1.3048211508553655,
+      "grad_norm": 1.1584738492965698,
+      "learning_rate": 1.9478582614878307e-05,
+      "loss": 0.5561,
+      "step": 420
+    },
+    {
+      "epoch": 1.3203732503888026,
+      "grad_norm": 1.0781487226486206,
+      "learning_rate": 1.9232711650677754e-05,
+      "loss": 0.5403,
+      "step": 425
+    },
+    {
+      "epoch": 1.3359253499222394,
+      "grad_norm": 1.3555227518081665,
+      "learning_rate": 1.8985598722078618e-05,
+      "loss": 0.5167,
+      "step": 430
+    },
+    {
+      "epoch": 1.3514774494556765,
+      "grad_norm": 1.133573293685913,
+      "learning_rate": 1.8737316337081706e-05,
+      "loss": 0.5315,
+      "step": 435
+    },
+    {
+      "epoch": 1.3670295489891136,
+      "grad_norm": 1.2002782821655273,
+      "learning_rate": 1.8487937346830292e-05,
+      "loss": 0.5322,
+      "step": 440
+    },
+    {
+      "epoch": 1.3825816485225506,
+      "grad_norm": 1.1956273317337036,
+      "learning_rate": 1.823753492423415e-05,
+      "loss": 0.5454,
+      "step": 445
+    },
+    {
+      "epoch": 1.3981337480559874,
+      "grad_norm": 1.0847748517990112,
+      "learning_rate": 1.7986182542499064e-05,
+      "loss": 0.5173,
+      "step": 450
+    },
+    {
+      "epoch": 1.4136858475894245,
+      "grad_norm": 1.2930800914764404,
+      "learning_rate": 1.773395395356833e-05,
+      "loss": 0.4822,
+      "step": 455
+    },
+    {
+      "epoch": 1.4292379471228616,
+      "grad_norm": 1.123884677886963,
+      "learning_rate": 1.7480923166482385e-05,
+      "loss": 0.436,
+      "step": 460
+    },
+    {
+      "epoch": 1.4447900466562986,
+      "grad_norm": 1.1324944496154785,
+      "learning_rate": 1.7227164425663077e-05,
+      "loss": 0.5257,
+      "step": 465
+    },
+    {
+      "epoch": 1.4603421461897357,
+      "grad_norm": 1.127148151397705,
+      "learning_rate": 1.6972752189128786e-05,
+      "loss": 0.4699,
+      "step": 470
+    },
+    {
+      "epoch": 1.4758942457231727,
+      "grad_norm": 1.206831455230713,
+      "learning_rate": 1.671776110664695e-05,
+      "loss": 0.5033,
+      "step": 475
+    },
+    {
+      "epoch": 1.4914463452566096,
+      "grad_norm": 1.3057889938354492,
+      "learning_rate": 1.6462265997830278e-05,
+      "loss": 0.5137,
+      "step": 480
+    },
+    {
+      "epoch": 1.5069984447900466,
+      "grad_norm": 1.1263861656188965,
+      "learning_rate": 1.6206341830183137e-05,
+      "loss": 0.5449,
+      "step": 485
+    },
+    {
+      "epoch": 1.5225505443234837,
+      "grad_norm": 1.5598522424697876,
+      "learning_rate": 1.595006369710455e-05,
+      "loss": 0.4338,
+      "step": 490
+    },
+    {
+      "epoch": 1.5381026438569205,
+      "grad_norm": 1.1175037622451782,
+      "learning_rate": 1.5693506795854262e-05,
+      "loss": 0.4636,
+      "step": 495
+    },
+    {
+      "epoch": 1.5536547433903576,
+      "grad_norm": 1.2600256204605103,
+      "learning_rate": 1.5436746405488323e-05,
+      "loss": 0.4665,
+      "step": 500
+    },
+    {
+      "epoch": 1.5692068429237946,
+      "grad_norm": 1.0980114936828613,
+      "learning_rate": 1.5179857864770648e-05,
+      "loss": 0.4819,
+      "step": 505
+    },
+    {
+      "epoch": 1.5847589424572317,
+      "grad_norm": 1.2625893354415894,
+      "learning_rate": 1.4922916550067099e-05,
+      "loss": 0.4601,
+      "step": 510
+    },
+    {
+      "epoch": 1.6003110419906688,
+      "grad_norm": 1.1526122093200684,
+      "learning_rate": 1.4665997853228512e-05,
+      "loss": 0.4538,
+      "step": 515
+    },
+    {
+      "epoch": 1.6158631415241058,
+      "grad_norm": 1.2933363914489746,
+      "learning_rate": 1.440917715946918e-05,
+      "loss": 0.4669,
+      "step": 520
+    },
+    {
+      "epoch": 1.6314152410575429,
+      "grad_norm": 1.2250269651412964,
+      "learning_rate": 1.4152529825247266e-05,
+      "loss": 0.4563,
+      "step": 525
+    },
+    {
+      "epoch": 1.64696734059098,
+      "grad_norm": 1.214328408241272,
+      "learning_rate": 1.38961311561537e-05,
+      "loss": 0.4829,
+      "step": 530
+    },
+    {
+      "epoch": 1.6625194401244168,
+      "grad_norm": 1.1899065971374512,
+      "learning_rate": 1.3640056384815984e-05,
+      "loss": 0.4576,
+      "step": 535
+    },
+    {
+      "epoch": 1.6780715396578538,
+      "grad_norm": 1.2638428211212158,
+      "learning_rate": 1.3384380648823335e-05,
+      "loss": 0.4742,
+      "step": 540
+    },
+    {
+      "epoch": 1.6936236391912907,
+      "grad_norm": 1.4615281820297241,
+      "learning_rate": 1.312917896867985e-05,
+      "loss": 0.4565,
+      "step": 545
+    },
+    {
+      "epoch": 1.7091757387247277,
+      "grad_norm": 1.158037781715393,
+      "learning_rate": 1.2874526225791905e-05,
+      "loss": 0.4339,
+      "step": 550
+    },
+    {
+      "epoch": 1.7247278382581648,
+      "grad_norm": 1.2889515161514282,
+      "learning_rate": 1.2620497140496429e-05,
+      "loss": 0.4611,
+      "step": 555
+    },
+    {
+      "epoch": 1.7402799377916018,
+      "grad_norm": 1.189125418663025,
+      "learning_rate": 1.2367166250136427e-05,
+      "loss": 0.4112,
+      "step": 560
+    },
+    {
+      "epoch": 1.755832037325039,
+      "grad_norm": 1.255334496498108,
+      "learning_rate": 1.2114607887190197e-05,
+      "loss": 0.4427,
+      "step": 565
+    },
+    {
+      "epoch": 1.771384136858476,
+      "grad_norm": 1.326514482498169,
+      "learning_rate": 1.1862896157460625e-05,
+      "loss": 0.448,
+      "step": 570
+    },
+    {
+      "epoch": 1.786936236391913,
+      "grad_norm": 1.050620675086975,
+      "learning_rate": 1.1612104918331095e-05,
+      "loss": 0.4134,
+      "step": 575
+    },
+    {
+      "epoch": 1.80248833592535,
+      "grad_norm": 1.3778687715530396,
+      "learning_rate": 1.1362307757094127e-05,
+      "loss": 0.4037,
+      "step": 580
+    },
+    {
+      "epoch": 1.818040435458787,
+      "grad_norm": 1.1140987873077393,
+      "learning_rate": 1.1113577969359428e-05,
+      "loss": 0.4211,
+      "step": 585
+    },
+    {
+      "epoch": 1.833592534992224,
+      "grad_norm": 1.2997198104858398,
+      "learning_rate": 1.0865988537547428e-05,
+      "loss": 0.4474,
+      "step": 590
+    },
+    {
+      "epoch": 1.8491446345256608,
+      "grad_norm": 1.2311125993728638,
+      "learning_rate": 1.0619612109474733e-05,
+      "loss": 0.4117,
+      "step": 595
+    },
+    {
+      "epoch": 1.8646967340590979,
+      "grad_norm": 1.2353358268737793,
+      "learning_rate": 1.037452097703779e-05,
+      "loss": 0.4345,
+      "step": 600
+    },
+    {
+      "epoch": 1.880248833592535,
+      "grad_norm": 1.2450509071350098,
+      "learning_rate": 1.0130787055000947e-05,
+      "loss": 0.4215,
+      "step": 605
+    },
+    {
+      "epoch": 1.895800933125972,
+      "grad_norm": 1.1635645627975464,
+      "learning_rate": 9.888481859895166e-06,
+      "loss": 0.3964,
+      "step": 610
+    },
+    {
+      "epoch": 1.911353032659409,
+      "grad_norm": 1.2068110704421997,
+      "learning_rate": 9.647676489033671e-06,
+      "loss": 0.428,
+      "step": 615
+    },
+    {
+      "epoch": 1.926905132192846,
+      "grad_norm": 1.2517467737197876,
+      "learning_rate": 9.408441599650453e-06,
+      "loss": 0.3943,
+      "step": 620
+    },
+    {
+      "epoch": 1.9424572317262832,
+      "grad_norm": 1.3384915590286255,
+      "learning_rate": 9.170847388168056e-06,
+      "loss": 0.3926,
+      "step": 625
+    },
+    {
+      "epoch": 1.9580093312597202,
+      "grad_norm": 1.3269731998443604,
+      "learning_rate": 8.93496356960048e-06,
+      "loss": 0.3945,
+      "step": 630
+    },
+    {
+      "epoch": 1.973561430793157,
+      "grad_norm": 1.2442697286605835,
+      "learning_rate": 8.700859357097362e-06,
+      "loss": 0.365,
+      "step": 635
+    },
+    {
+      "epoch": 1.989113530326594,
+      "grad_norm": 1.3288805484771729,
+      "learning_rate": 8.468603441635425e-06,
+      "loss": 0.3966,
+      "step": 640
+    },
+    {
+      "epoch": 2.0031104199066876,
+      "grad_norm": 1.1929374933242798,
+      "learning_rate": 8.238263971863145e-06,
+      "loss": 0.393,
+      "step": 645
+    },
+    {
+      "epoch": 2.018662519440124,
+      "grad_norm": 1.252542495727539,
+      "learning_rate": 8.009908534104526e-06,
+      "loss": 0.3125,
+      "step": 650
+    },
+    {
+      "epoch": 2.0342146189735613,
+      "grad_norm": 1.299525260925293,
+      "learning_rate": 7.783604132527897e-06,
+      "loss": 0.3482,
+      "step": 655
+    },
+    {
+      "epoch": 2.0497667185069983,
+      "grad_norm": 1.264736294746399,
+      "learning_rate": 7.55941716948551e-06,
+      "loss": 0.3169,
+      "step": 660
+    },
+    {
+      "epoch": 2.0653188180404354,
+      "grad_norm": 1.3163446187973022,
+      "learning_rate": 7.337413426029715e-06,
+      "loss": 0.301,
+      "step": 665
+    },
+    {
+      "epoch": 2.0808709175738724,
+      "grad_norm": 1.332731008529663,
+      "learning_rate": 7.11765804261149e-06,
+      "loss": 0.3178,
+      "step": 670
+    },
+    {
+      "epoch": 2.0964230171073095,
+      "grad_norm": 1.308467984199524,
+      "learning_rate": 6.900215499966844e-06,
+      "loss": 0.3322,
+      "step": 675
+    },
+    {
+      "epoch": 2.1119751166407466,
+      "grad_norm": 1.137407898902893,
+      "learning_rate": 6.68514960019689e-06,
+      "loss": 0.3171,
+      "step": 680
+    },
+    {
+      "epoch": 2.1275272161741836,
+      "grad_norm": 1.1944776773452759,
+      "learning_rate": 6.4725234480469805e-06,
+      "loss": 0.3226,
+      "step": 685
+    },
+    {
+      "epoch": 2.1430793157076207,
+      "grad_norm": 1.4939020872116089,
+      "learning_rate": 6.262399432390485e-06,
+      "loss": 0.3048,
+      "step": 690
+    },
+    {
+      "epoch": 2.1586314152410577,
+      "grad_norm": 1.156252145767212,
+      "learning_rate": 6.05483920792261e-06,
+      "loss": 0.3564,
+      "step": 695
+    },
+    {
+      "epoch": 2.1741835147744943,
+      "grad_norm": 1.2493164539337158,
+      "learning_rate": 5.849903677069662e-06,
+      "loss": 0.3617,
+      "step": 700
+    },
+    {
+      "epoch": 2.1897356143079314,
+      "grad_norm": 1.427577018737793,
+      "learning_rate": 5.647652972118998e-06,
+      "loss": 0.3282,
+      "step": 705
+    },
+    {
+      "epoch": 2.2052877138413685,
+      "grad_norm": 1.2930374145507812,
+      "learning_rate": 5.448146437575019e-06,
+      "loss": 0.3015,
+      "step": 710
+    },
+    {
+      "epoch": 2.2208398133748055,
+      "grad_norm": 1.4489259719848633,
+      "learning_rate": 5.25144261274623e-06,
+      "loss": 0.2967,
+      "step": 715
+    },
+    {
+      "epoch": 2.2363919129082426,
+      "grad_norm": 1.0761257410049438,
+      "learning_rate": 5.0575992145686534e-06,
+      "loss": 0.3146,
+      "step": 720
+    },
+    {
+      "epoch": 2.2519440124416796,
+      "grad_norm": 1.4483816623687744,
+      "learning_rate": 4.86667312067046e-06,
+      "loss": 0.3171,
+      "step": 725
+    },
+    {
+      "epoch": 2.2674961119751167,
+      "grad_norm": 1.2705985307693481,
+      "learning_rate": 4.6787203526829245e-06,
+      "loss": 0.3637,
+      "step": 730
+    },
+    {
+      "epoch": 2.2830482115085537,
+      "grad_norm": 1.365267038345337,
+      "learning_rate": 4.493796059802525e-06,
+      "loss": 0.3496,
+      "step": 735
+    },
+    {
+      "epoch": 2.298600311041991,
+      "grad_norm": 1.1215273141860962,
+      "learning_rate": 4.31195450260904e-06,
+      "loss": 0.2845,
+      "step": 740
+    },
+    {
+      "epoch": 2.314152410575428,
+      "grad_norm": 1.251165509223938,
+      "learning_rate": 4.133249037144371e-06,
+      "loss": 0.3173,
+      "step": 745
+    },
+    {
+      "epoch": 2.329704510108865,
+      "grad_norm": 1.2096902132034302,
+      "learning_rate": 3.957732099256801e-06,
+      "loss": 0.2962,
+      "step": 750
+    },
+    {
+      "epoch": 2.3452566096423015,
+      "grad_norm": 1.3482387065887451,
+      "learning_rate": 3.7854551892152433e-06,
+      "loss": 0.2963,
+      "step": 755
+    },
+    {
+      "epoch": 2.3608087091757386,
+      "grad_norm": 1.2713910341262817,
+      "learning_rate": 3.6164688565979774e-06,
+      "loss": 0.2826,
+      "step": 760
+    },
+    {
+      "epoch": 2.3763608087091757,
+      "grad_norm": 1.434398889541626,
+      "learning_rate": 3.450822685460402e-06,
+      "loss": 0.3071,
+      "step": 765
+    },
+    {
+      "epoch": 2.3919129082426127,
+      "grad_norm": 1.4093025922775269,
+      "learning_rate": 3.2885652797860395e-06,
+      "loss": 0.29,
+      "step": 770
+    },
+    {
+      "epoch": 2.4074650077760498,
+      "grad_norm": 1.1628525257110596,
+      "learning_rate": 3.1297442492251254e-06,
+      "loss": 0.2744,
+      "step": 775
+    },
+    {
+      "epoch": 2.423017107309487,
+      "grad_norm": 1.2740814685821533,
+      "learning_rate": 2.9744061951249797e-06,
+      "loss": 0.2876,
+      "step": 780
+    },
+    {
+      "epoch": 2.438569206842924,
+      "grad_norm": 1.0924385786056519,
+      "learning_rate": 2.8225966968561845e-06,
+      "loss": 0.3331,
+      "step": 785
+    },
+    {
+      "epoch": 2.454121306376361,
+      "grad_norm": 1.4028589725494385,
+      "learning_rate": 2.6743602984386944e-06,
+      "loss": 0.2898,
+      "step": 790
+    },
+    {
+      "epoch": 2.469673405909798,
+      "grad_norm": 1.3594895601272583,
+      "learning_rate": 2.5297404954716967e-06,
+      "loss": 0.2877,
+      "step": 795
+    },
+    {
+      "epoch": 2.4852255054432346,
+      "grad_norm": 1.1996512413024902,
+      "learning_rate": 2.3887797223710926e-06,
+      "loss": 0.3164,
+      "step": 800
+    },
+    {
+      "epoch": 2.5007776049766717,
+      "grad_norm": 1.3675403594970703,
+      "learning_rate": 2.2515193399183954e-06,
+      "loss": 0.3143,
+      "step": 805
+    },
+    {
+      "epoch": 2.5163297045101087,
+      "grad_norm": 1.2102525234222412,
+      "learning_rate": 2.117999623124629e-06,
+      "loss": 0.3296,
+      "step": 810
+    },
+    {
+      "epoch": 2.531881804043546,
+      "grad_norm": 1.39519464969635,
+      "learning_rate": 1.988259749412778e-06,
+      "loss": 0.3038,
+      "step": 815
+    },
+    {
+      "epoch": 2.547433903576983,
+      "grad_norm": 1.2118239402770996,
+      "learning_rate": 1.8623377871223746e-06,
+      "loss": 0.3048,
+      "step": 820
+    },
+    {
+      "epoch": 2.56298600311042,
+      "grad_norm": 1.0533260107040405,
+      "learning_rate": 1.7402706843394311e-06,
+      "loss": 0.2967,
+      "step": 825
+    },
+    {
+      "epoch": 2.578538102643857,
+      "grad_norm": 1.3837428092956543,
+      "learning_rate": 1.622094258055139e-06,
+      "loss": 0.2996,
+      "step": 830
+    },
+    {
+      "epoch": 2.594090202177294,
+      "grad_norm": 1.2500343322753906,
+      "learning_rate": 1.5078431836564199e-06,
+      "loss": 0.293,
+      "step": 835
+    },
+    {
+      "epoch": 2.609642301710731,
+      "grad_norm": 1.4100749492645264,
+      "learning_rate": 1.3975509847514434e-06,
+      "loss": 0.3553,
+      "step": 840
+    },
+    {
+      "epoch": 2.625194401244168,
+      "grad_norm": 1.1867926120758057,
+      "learning_rate": 1.291250023333133e-06,
+      "loss": 0.329,
+      "step": 845
+    },
+    {
+      "epoch": 2.640746500777605,
+      "grad_norm": 1.2801156044006348,
+      "learning_rate": 1.1889714902834897e-06,
+      "loss": 0.3137,
+      "step": 850
+    },
+    {
+      "epoch": 2.6562986003110423,
+      "grad_norm": 1.301249623298645,
+      "learning_rate": 1.090745396221533e-06,
+      "loss": 0.2994,
+      "step": 855
+    },
+    {
+      "epoch": 2.671850699844479,
+      "grad_norm": 1.3433258533477783,
+      "learning_rate": 9.96600562697586e-07,
+      "loss": 0.267,
+      "step": 860
+    },
+    {
+      "epoch": 2.687402799377916,
+      "grad_norm": 1.2630574703216553,
+      "learning_rate": 9.065646137364314e-07,
+      "loss": 0.3619,
+      "step": 865
+    },
+    {
+      "epoch": 2.702954898911353,
+      "grad_norm": 1.2644740343093872,
+      "learning_rate": 8.20663967731865e-07,
+      "loss": 0.297,
+      "step": 870
+    },
+    {
+      "epoch": 2.71850699844479,
+      "grad_norm": 1.3221532106399536,
+      "learning_rate": 7.389238296949902e-07,
+      "loss": 0.3194,
+      "step": 875
+    },
+    {
+      "epoch": 2.734059097978227,
+      "grad_norm": 1.3244149684906006,
+      "learning_rate": 6.613681838585517e-07,
+      "loss": 0.3098,
+      "step": 880
+    },
+    {
+      "epoch": 2.749611197511664,
+      "grad_norm": 1.1736044883728027,
+      "learning_rate": 5.880197866394738e-07,
+      "loss": 0.283,
+      "step": 885
+    },
+    {
+      "epoch": 2.765163297045101,
+      "grad_norm": 1.3244141340255737,
+      "learning_rate": 5.189001599616561e-07,
+      "loss": 0.2777,
+      "step": 890
+    },
+    {
+      "epoch": 2.780715396578538,
+      "grad_norm": 1.1822513341903687,
+      "learning_rate": 4.54029584941002e-07,
+      "loss": 0.3147,
+      "step": 895
+    },
+    {
+      "epoch": 2.796267496111975,
+      "grad_norm": 1.1570910215377808,
+      "learning_rate": 3.934270959345271e-07,
+      "loss": 0.2742,
+      "step": 900
+    },
+    {
+      "epoch": 2.811819595645412,
+      "grad_norm": 1.2180461883544922,
+      "learning_rate": 3.371104749552817e-07,
+      "loss": 0.2845,
+      "step": 905
+    },
+    {
+      "epoch": 2.827371695178849,
+      "grad_norm": 1.2462286949157715,
+      "learning_rate": 2.8509624645474464e-07,
+      "loss": 0.3078,
+      "step": 910
+    },
+    {
+      "epoch": 2.842923794712286,
+      "grad_norm": 1.2295945882797241,
+      "learning_rate": 2.3739967247421502e-07,
+      "loss": 0.2539,
+      "step": 915
+    },
+    {
+      "epoch": 2.858475894245723,
+      "grad_norm": 1.2694175243377686,
+      "learning_rate": 1.9403474816661272e-07,
+      "loss": 0.2787,
+      "step": 920
+    },
+    {
+      "epoch": 2.87402799377916,
+      "grad_norm": 1.1908364295959473,
+      "learning_rate": 1.5501419769001656e-07,
+      "loss": 0.2958,
+      "step": 925
+    },
+    {
+      "epoch": 2.8895800933125972,
+      "grad_norm": 1.3138959407806396,
+      "learning_rate": 1.203494704741276e-07,
+      "loss": 0.3035,
+      "step": 930
+    },
+    {
+      "epoch": 2.9051321928460343,
+      "grad_norm": 1.295821189880371,
+      "learning_rate": 9.005073786078621e-08,
+      "loss": 0.3641,
+      "step": 935
+    },
+    {
+      "epoch": 2.9206842923794714,
+      "grad_norm": 1.3944648504257202,
+      "learning_rate": 6.412689011947303e-08,
+      "loss": 0.2998,
+      "step": 940
+    },
+    {
+      "epoch": 2.9362363919129084,
+      "grad_norm": 1.3905586004257202,
+      "learning_rate": 4.2585533838738756e-08,
+      "loss": 0.3256,
+      "step": 945
+    },
+    {
+      "epoch": 2.9517884914463455,
+      "grad_norm": 1.382743239402771,
+      "learning_rate": 2.5432989694262886e-08,
+      "loss": 0.2788,
+      "step": 950
+    },
+    {
+      "epoch": 2.9673405909797825,
+      "grad_norm": 1.207732915878296,
+      "learning_rate": 1.267429059424563e-08,
+      "loss": 0.2751,
+      "step": 955
+    },
+    {
+      "epoch": 2.982892690513219,
+      "grad_norm": 1.2087626457214355,
+      "learning_rate": 4.313180202647482e-09,
+      "loss": 0.3045,
+      "step": 960
+    },
+    {
+      "epoch": 2.998444790046656,
+      "grad_norm": 1.259070873260498,
+      "learning_rate": 3.521118407162627e-10,
+      "loss": 0.3122,
+      "step": 965
+    },
+    {
+      "epoch": 3.0,
+      "step": 966,
+      "total_flos": 1.3759050762074194e+18,
+      "train_loss": 0.5874219706771784,
+      "train_runtime": 632.2067,
+      "train_samples_per_second": 48.772,
+      "train_steps_per_second": 1.528
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 966,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 2000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.3759050762074194e+18,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

19_128_e3_3e-5/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:933076bd603aa6d76a97b642ccb09f3049a06ee6df8d2d0308fb98e2f422d0aa
+size 8145

19_128_e3_3e-5/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff