Upload folder using huggingface_hub

Browse files

Files changed (13) hide show

48_128_e3_3e-5/README.md +63 -0
48_128_e3_3e-5/adapter_config.json +39 -0
48_128_e3_3e-5/adapter_model.safetensors +3 -0
48_128_e3_3e-5/all_results.json +9 -0
48_128_e3_3e-5/config.json +32 -0
48_128_e3_3e-5/merges.txt +0 -0
48_128_e3_3e-5/special_tokens_map.json +45 -0
48_128_e3_3e-5/tokenizer.json +0 -0
48_128_e3_3e-5/tokenizer_config.json +188 -0
48_128_e3_3e-5/train_results.json +9 -0
48_128_e3_3e-5/trainer_state.json +1310 -0
48_128_e3_3e-5/training_args.bin +3 -0
48_128_e3_3e-5/vocab.json +0 -0

48_128_e3_3e-5/README.md ADDED Viewed

	@@ -0,0 +1,63 @@

+---
+library_name: peft
+license: apache-2.0
+base_model: ibm-granite/granite-3.3-8b-base
+tags:
+- alignment-handbook
+- generated_from_trainer
+datasets:
+- data/knowledge_lora_training_data_2000
+model-index:
+- name: 48_128_e3_3e-5
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# 48_128_e3_3e-5
+This model is a fine-tuned version of [ibm-granite/granite-3.3-8b-base](https://huggingface.co/ibm-granite/granite-3.3-8b-base) on the data/knowledge_lora_training_data_2000 dataset.
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 3e-05
+- train_batch_size: 2
+- eval_batch_size: 8
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 8
+- gradient_accumulation_steps: 2
+- total_train_batch_size: 32
+- total_eval_batch_size: 64
+- optimizer: Use adamw_torch with betas=(0.9,0.95) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.05
+- num_epochs: 3.0
+### Training results
+### Framework versions
+- PEFT 0.15.2
+- Transformers 4.52.4
+- Pytorch 2.7.0+cu126
+- Datasets 3.6.0
+- Tokenizers 0.21.2

48_128_e3_3e-5/adapter_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "ibm-granite/granite-3.3-8b-base",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "up_proj",
+    "gate_proj",
+    "v_proj",
+    "q_proj",
+    "o_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}

48_128_e3_3e-5/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b956c0cb43ba9e01f0197219dd6b9439666222d8319d202fb60cbdab0c6dc478
+size 791751704

48_128_e3_3e-5/all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 1.273900111881044e+18,
+    "train_loss": 0.571772654588991,
+    "train_runtime": 581.8624,
+    "train_samples": 9666,
+    "train_samples_per_second": 49.837,
+    "train_steps_per_second": 1.562
+}

48_128_e3_3e-5/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "architectures": [
+    "GraniteForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0,
+  "attention_multiplier": 0.0078125,
+  "bos_token_id": 0,
+  "embedding_multiplier": 12.0,
+  "eos_token_id": 0,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12800,
+  "logits_scaling": 16.0,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "granite",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "pad_token_id": 0,
+  "residual_multiplier": 0.22,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.4",
+  "use_cache": true,
+  "vocab_size": 49152
+}

48_128_e3_3e-5/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

48_128_e3_3e-5/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<fim_prefix>",
+    "<fim_middle>",
+    "<fim_suffix>",
+    "<fim_pad>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<empty_output>",
+    "<commit_before>",
+    "<commit_msg>",
+    "<commit_after>",
+    "<reponame>"
+  ],
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<reponame>",
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

48_128_e3_3e-5/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

48_128_e3_3e-5/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,188 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<fim_prefix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<fim_middle>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<fim_suffix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<fim_pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<filename>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<gh_stars>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<commit_before>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<commit_msg>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "17": {
+      "content": "<commit_after>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "18": {
+      "content": "<reponame>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<fim_prefix>",
+    "<fim_middle>",
+    "<fim_suffix>",
+    "<fim_pad>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<empty_output>",
+    "<commit_before>",
+    "<commit_msg>",
+    "<commit_after>",
+    "<reponame>"
+  ],
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "pad_token": "<reponame>",
+  "padding_side": "left",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>",
+  "vocab_size": 49152
+}

48_128_e3_3e-5/train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 1.273900111881044e+18,
+    "train_loss": 0.571772654588991,
+    "train_runtime": 581.8624,
+    "train_samples": 9666,
+    "train_samples_per_second": 49.837,
+    "train_steps_per_second": 1.562
+}

48_128_e3_3e-5/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1310 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 909,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01652892561983471,
+      "grad_norm": 1.3831015825271606,
+      "learning_rate": 2.6086956521739132e-06,
+      "loss": 1.3336,
+      "step": 5
+    },
+    {
+      "epoch": 0.03305785123966942,
+      "grad_norm": 0.7098184823989868,
+      "learning_rate": 5.869565217391305e-06,
+      "loss": 1.3505,
+      "step": 10
+    },
+    {
+      "epoch": 0.049586776859504134,
+      "grad_norm": 0.6232060194015503,
+      "learning_rate": 9.130434782608697e-06,
+      "loss": 1.3319,
+      "step": 15
+    },
+    {
+      "epoch": 0.06611570247933884,
+      "grad_norm": 0.6442826390266418,
+      "learning_rate": 1.2391304347826088e-05,
+      "loss": 1.2733,
+      "step": 20
+    },
+    {
+      "epoch": 0.08264462809917356,
+      "grad_norm": 0.5439983010292053,
+      "learning_rate": 1.5652173913043477e-05,
+      "loss": 1.3119,
+      "step": 25
+    },
+    {
+      "epoch": 0.09917355371900827,
+      "grad_norm": 0.5362527370452881,
+      "learning_rate": 1.891304347826087e-05,
+      "loss": 1.2365,
+      "step": 30
+    },
+    {
+      "epoch": 0.11570247933884298,
+      "grad_norm": 0.5195441246032715,
+      "learning_rate": 2.217391304347826e-05,
+      "loss": 1.2603,
+      "step": 35
+    },
+    {
+      "epoch": 0.1322314049586777,
+      "grad_norm": 0.5031000375747681,
+      "learning_rate": 2.5434782608695653e-05,
+      "loss": 1.2659,
+      "step": 40
+    },
+    {
+      "epoch": 0.1487603305785124,
+      "grad_norm": 0.4853697419166565,
+      "learning_rate": 2.8695652173913044e-05,
+      "loss": 1.248,
+      "step": 45
+    },
+    {
+      "epoch": 0.1652892561983471,
+      "grad_norm": 0.4460432529449463,
+      "learning_rate": 2.999910550563221e-05,
+      "loss": 1.175,
+      "step": 50
+    },
+    {
+      "epoch": 0.18181818181818182,
+      "grad_norm": 0.488101601600647,
+      "learning_rate": 2.9993639537498208e-05,
+      "loss": 1.1665,
+      "step": 55
+    },
+    {
+      "epoch": 0.19834710743801653,
+      "grad_norm": 0.6104041934013367,
+      "learning_rate": 2.9983206351162094e-05,
+      "loss": 1.1775,
+      "step": 60
+    },
+    {
+      "epoch": 0.21487603305785125,
+      "grad_norm": 0.49601423740386963,
+      "learning_rate": 2.9967809403017022e-05,
+      "loss": 1.1435,
+      "step": 65
+    },
+    {
+      "epoch": 0.23140495867768596,
+      "grad_norm": 0.5215215086936951,
+      "learning_rate": 2.9947453793892644e-05,
+      "loss": 1.18,
+      "step": 70
+    },
+    {
+      "epoch": 0.24793388429752067,
+      "grad_norm": 0.6732470393180847,
+      "learning_rate": 2.992214626736525e-05,
+      "loss": 1.107,
+      "step": 75
+    },
+    {
+      "epoch": 0.2644628099173554,
+      "grad_norm": 0.4841562509536743,
+      "learning_rate": 2.9891895207523713e-05,
+      "loss": 1.0892,
+      "step": 80
+    },
+    {
+      "epoch": 0.2809917355371901,
+      "grad_norm": 0.5512502789497375,
+      "learning_rate": 2.985671063619194e-05,
+      "loss": 1.0827,
+      "step": 85
+    },
+    {
+      "epoch": 0.2975206611570248,
+      "grad_norm": 0.6520716547966003,
+      "learning_rate": 2.9816604209608744e-05,
+      "loss": 1.0575,
+      "step": 90
+    },
+    {
+      "epoch": 0.3140495867768595,
+      "grad_norm": 0.5574434995651245,
+      "learning_rate": 2.9771589214566274e-05,
+      "loss": 1.0648,
+      "step": 95
+    },
+    {
+      "epoch": 0.3305785123966942,
+      "grad_norm": 0.5233076810836792,
+      "learning_rate": 2.9721680564008264e-05,
+      "loss": 1.0955,
+      "step": 100
+    },
+    {
+      "epoch": 0.34710743801652894,
+      "grad_norm": 0.6986305117607117,
+      "learning_rate": 2.9666894792089538e-05,
+      "loss": 1.0185,
+      "step": 105
+    },
+    {
+      "epoch": 0.36363636363636365,
+      "grad_norm": 0.628684401512146,
+      "learning_rate": 2.9607250048698438e-05,
+      "loss": 1.0259,
+      "step": 110
+    },
+    {
+      "epoch": 0.38016528925619836,
+      "grad_norm": 0.5662065744400024,
+      "learning_rate": 2.954276609344399e-05,
+      "loss": 1.0174,
+      "step": 115
+    },
+    {
+      "epoch": 0.39669421487603307,
+      "grad_norm": 0.6545687317848206,
+      "learning_rate": 2.9473464289109746e-05,
+      "loss": 0.9881,
+      "step": 120
+    },
+    {
+      "epoch": 0.4132231404958678,
+      "grad_norm": 0.6962680220603943,
+      "learning_rate": 2.9399367594576588e-05,
+      "loss": 0.9667,
+      "step": 125
+    },
+    {
+      "epoch": 0.4297520661157025,
+      "grad_norm": 0.6917610168457031,
+      "learning_rate": 2.9320500557216672e-05,
+      "loss": 1.0016,
+      "step": 130
+    },
+    {
+      "epoch": 0.4462809917355372,
+      "grad_norm": 0.6968785524368286,
+      "learning_rate": 2.92368893047612e-05,
+      "loss": 1.0067,
+      "step": 135
+    },
+    {
+      "epoch": 0.4628099173553719,
+      "grad_norm": 0.690639078617096,
+      "learning_rate": 2.914856153664459e-05,
+      "loss": 0.9132,
+      "step": 140
+    },
+    {
+      "epoch": 0.4793388429752066,
+      "grad_norm": 0.734661877155304,
+      "learning_rate": 2.905554651482798e-05,
+      "loss": 0.9474,
+      "step": 145
+    },
+    {
+      "epoch": 0.49586776859504134,
+      "grad_norm": 0.7354484796524048,
+      "learning_rate": 2.8957875054105098e-05,
+      "loss": 0.9368,
+      "step": 150
+    },
+    {
+      "epoch": 0.512396694214876,
+      "grad_norm": 0.7630740404129028,
+      "learning_rate": 2.8855579511893653e-05,
+      "loss": 0.9324,
+      "step": 155
+    },
+    {
+      "epoch": 0.5289256198347108,
+      "grad_norm": 0.7361947894096375,
+      "learning_rate": 2.874869377751571e-05,
+      "loss": 0.905,
+      "step": 160
+    },
+    {
+      "epoch": 0.5454545454545454,
+      "grad_norm": 0.704692542552948,
+      "learning_rate": 2.863725326097056e-05,
+      "loss": 0.9494,
+      "step": 165
+    },
+    {
+      "epoch": 0.5619834710743802,
+      "grad_norm": 0.8412842154502869,
+      "learning_rate": 2.8521294881203792e-05,
+      "loss": 0.868,
+      "step": 170
+    },
+    {
+      "epoch": 0.5785123966942148,
+      "grad_norm": 0.863633930683136,
+      "learning_rate": 2.840085705387647e-05,
+      "loss": 0.8822,
+      "step": 175
+    },
+    {
+      "epoch": 0.5950413223140496,
+      "grad_norm": 0.8332576751708984,
+      "learning_rate": 2.8275979678638485e-05,
+      "loss": 0.8624,
+      "step": 180
+    },
+    {
+      "epoch": 0.6115702479338843,
+      "grad_norm": 0.8531330823898315,
+      "learning_rate": 2.814670412591026e-05,
+      "loss": 0.8288,
+      "step": 185
+    },
+    {
+      "epoch": 0.628099173553719,
+      "grad_norm": 0.9115040898323059,
+      "learning_rate": 2.8013073223177207e-05,
+      "loss": 0.8904,
+      "step": 190
+    },
+    {
+      "epoch": 0.6446280991735537,
+      "grad_norm": 0.8175347447395325,
+      "learning_rate": 2.7875131240801494e-05,
+      "loss": 0.8584,
+      "step": 195
+    },
+    {
+      "epoch": 0.6611570247933884,
+      "grad_norm": 0.8207992315292358,
+      "learning_rate": 2.7732923877355764e-05,
+      "loss": 0.8296,
+      "step": 200
+    },
+    {
+      "epoch": 0.6776859504132231,
+      "grad_norm": 0.9442360997200012,
+      "learning_rate": 2.7586498244483746e-05,
+      "loss": 0.8551,
+      "step": 205
+    },
+    {
+      "epoch": 0.6942148760330579,
+      "grad_norm": 0.8399072885513306,
+      "learning_rate": 2.7435902851292735e-05,
+      "loss": 0.8044,
+      "step": 210
+    },
+    {
+      "epoch": 0.7107438016528925,
+      "grad_norm": 0.8840962648391724,
+      "learning_rate": 2.7281187588283047e-05,
+      "loss": 0.8239,
+      "step": 215
+    },
+    {
+      "epoch": 0.7272727272727273,
+      "grad_norm": 0.9720616936683655,
+      "learning_rate": 2.712240371081991e-05,
+      "loss": 0.797,
+      "step": 220
+    },
+    {
+      "epoch": 0.743801652892562,
+      "grad_norm": 0.922126293182373,
+      "learning_rate": 2.6959603822153147e-05,
+      "loss": 0.763,
+      "step": 225
+    },
+    {
+      "epoch": 0.7603305785123967,
+      "grad_norm": 1.0185389518737793,
+      "learning_rate": 2.6792841855990325e-05,
+      "loss": 0.7521,
+      "step": 230
+    },
+    {
+      "epoch": 0.7768595041322314,
+      "grad_norm": 1.1431677341461182,
+      "learning_rate": 2.6622173058629178e-05,
+      "loss": 0.7847,
+      "step": 235
+    },
+    {
+      "epoch": 0.7933884297520661,
+      "grad_norm": 0.9715793132781982,
+      "learning_rate": 2.644765397065509e-05,
+      "loss": 0.7447,
+      "step": 240
+    },
+    {
+      "epoch": 0.8099173553719008,
+      "grad_norm": 1.0262930393218994,
+      "learning_rate": 2.626934240820993e-05,
+      "loss": 0.7593,
+      "step": 245
+    },
+    {
+      "epoch": 0.8264462809917356,
+      "grad_norm": 0.9361246824264526,
+      "learning_rate": 2.608729744383817e-05,
+      "loss": 0.7346,
+      "step": 250
+    },
+    {
+      "epoch": 0.8429752066115702,
+      "grad_norm": 1.1547300815582275,
+      "learning_rate": 2.590157938691691e-05,
+      "loss": 0.7273,
+      "step": 255
+    },
+    {
+      "epoch": 0.859504132231405,
+      "grad_norm": 1.0726925134658813,
+      "learning_rate": 2.5712249763676003e-05,
+      "loss": 0.7315,
+      "step": 260
+    },
+    {
+      "epoch": 0.8760330578512396,
+      "grad_norm": 1.0144833326339722,
+      "learning_rate": 2.5519371296815197e-05,
+      "loss": 0.6952,
+      "step": 265
+    },
+    {
+      "epoch": 0.8925619834710744,
+      "grad_norm": 1.1299502849578857,
+      "learning_rate": 2.532300788472481e-05,
+      "loss": 0.7149,
+      "step": 270
+    },
+    {
+      "epoch": 0.9090909090909091,
+      "grad_norm": 1.034497618675232,
+      "learning_rate": 2.512322458031693e-05,
+      "loss": 0.6787,
+      "step": 275
+    },
+    {
+      "epoch": 0.9256198347107438,
+      "grad_norm": 1.2203060388565063,
+      "learning_rate": 2.4920087569474145e-05,
+      "loss": 0.709,
+      "step": 280
+    },
+    {
+      "epoch": 0.9421487603305785,
+      "grad_norm": 0.9873307347297668,
+      "learning_rate": 2.4713664149122964e-05,
+      "loss": 0.7434,
+      "step": 285
+    },
+    {
+      "epoch": 0.9586776859504132,
+      "grad_norm": 1.0340741872787476,
+      "learning_rate": 2.4504022704939075e-05,
+      "loss": 0.662,
+      "step": 290
+    },
+    {
+      "epoch": 0.9752066115702479,
+      "grad_norm": 1.0096772909164429,
+      "learning_rate": 2.4291232688691996e-05,
+      "loss": 0.6921,
+      "step": 295
+    },
+    {
+      "epoch": 0.9917355371900827,
+      "grad_norm": 1.0512659549713135,
+      "learning_rate": 2.4075364595236515e-05,
+      "loss": 0.716,
+      "step": 300
+    },
+    {
+      "epoch": 1.006611570247934,
+      "grad_norm": 1.0512038469314575,
+      "learning_rate": 2.3856489939158566e-05,
+      "loss": 0.7193,
+      "step": 305
+    },
+    {
+      "epoch": 1.0231404958677686,
+      "grad_norm": 1.073168396949768,
+      "learning_rate": 2.363468123108326e-05,
+      "loss": 0.547,
+      "step": 310
+    },
+    {
+      "epoch": 1.0396694214876032,
+      "grad_norm": 1.0484970808029175,
+      "learning_rate": 2.341001195365298e-05,
+      "loss": 0.5766,
+      "step": 315
+    },
+    {
+      "epoch": 1.056198347107438,
+      "grad_norm": 1.3234968185424805,
+      "learning_rate": 2.3182556537183437e-05,
+      "loss": 0.5844,
+      "step": 320
+    },
+    {
+      "epoch": 1.0727272727272728,
+      "grad_norm": 1.0770665407180786,
+      "learning_rate": 2.2952390335005742e-05,
+      "loss": 0.5496,
+      "step": 325
+    },
+    {
+      "epoch": 1.0892561983471074,
+      "grad_norm": 1.1759930849075317,
+      "learning_rate": 2.2719589598502776e-05,
+      "loss": 0.5549,
+      "step": 330
+    },
+    {
+      "epoch": 1.105785123966942,
+      "grad_norm": 1.1473444700241089,
+      "learning_rate": 2.2484231451847952e-05,
+      "loss": 0.5656,
+      "step": 335
+    },
+    {
+      "epoch": 1.122314049586777,
+      "grad_norm": 1.1218301057815552,
+      "learning_rate": 2.2246393866454893e-05,
+      "loss": 0.5663,
+      "step": 340
+    },
+    {
+      "epoch": 1.1388429752066116,
+      "grad_norm": 1.324342966079712,
+      "learning_rate": 2.2006155635146398e-05,
+      "loss": 0.5479,
+      "step": 345
+    },
+    {
+      "epoch": 1.1553719008264463,
+      "grad_norm": 1.2391122579574585,
+      "learning_rate": 2.176359634605132e-05,
+      "loss": 0.5655,
+      "step": 350
+    },
+    {
+      "epoch": 1.171900826446281,
+      "grad_norm": 0.9759783148765564,
+      "learning_rate": 2.151879635623793e-05,
+      "loss": 0.5415,
+      "step": 355
+    },
+    {
+      "epoch": 1.1884297520661158,
+      "grad_norm": 1.1686252355575562,
+      "learning_rate": 2.1271836765092552e-05,
+      "loss": 0.5068,
+      "step": 360
+    },
+    {
+      "epoch": 1.2049586776859504,
+      "grad_norm": 1.2139272689819336,
+      "learning_rate": 2.1022799387452276e-05,
+      "loss": 0.5295,
+      "step": 365
+    },
+    {
+      "epoch": 1.221487603305785,
+      "grad_norm": 1.2539520263671875,
+      "learning_rate": 2.0771766726500698e-05,
+      "loss": 0.5389,
+      "step": 370
+    },
+    {
+      "epoch": 1.2380165289256198,
+      "grad_norm": 1.1020694971084595,
+      "learning_rate": 2.05188219464355e-05,
+      "loss": 0.5341,
+      "step": 375
+    },
+    {
+      "epoch": 1.2545454545454544,
+      "grad_norm": 1.029186487197876,
+      "learning_rate": 2.0264048844917204e-05,
+      "loss": 0.5168,
+      "step": 380
+    },
+    {
+      "epoch": 1.2710743801652893,
+      "grad_norm": 1.19265878200531,
+      "learning_rate": 2.000753182530791e-05,
+      "loss": 0.5223,
+      "step": 385
+    },
+    {
+      "epoch": 1.287603305785124,
+      "grad_norm": 1.1817671060562134,
+      "learning_rate": 1.9749355868709497e-05,
+      "loss": 0.512,
+      "step": 390
+    },
+    {
+      "epoch": 1.3041322314049586,
+      "grad_norm": 1.246814489364624,
+      "learning_rate": 1.948960650581034e-05,
+      "loss": 0.5491,
+      "step": 395
+    },
+    {
+      "epoch": 1.3206611570247935,
+      "grad_norm": 1.4044491052627563,
+      "learning_rate": 1.9228369788550004e-05,
+      "loss": 0.5321,
+      "step": 400
+    },
+    {
+      "epoch": 1.3371900826446281,
+      "grad_norm": 1.1388851404190063,
+      "learning_rate": 1.8965732261611246e-05,
+      "loss": 0.5175,
+      "step": 405
+    },
+    {
+      "epoch": 1.3537190082644628,
+      "grad_norm": 1.1884291172027588,
+      "learning_rate": 1.870178093374875e-05,
+      "loss": 0.5522,
+      "step": 410
+    },
+    {
+      "epoch": 1.3702479338842974,
+      "grad_norm": 1.1615290641784668,
+      "learning_rate": 1.843660324896415e-05,
+      "loss": 0.483,
+      "step": 415
+    },
+    {
+      "epoch": 1.386776859504132,
+      "grad_norm": 1.1628854274749756,
+      "learning_rate": 1.817028705753685e-05,
+      "loss": 0.4992,
+      "step": 420
+    },
+    {
+      "epoch": 1.403305785123967,
+      "grad_norm": 1.163860559463501,
+      "learning_rate": 1.7902920586920248e-05,
+      "loss": 0.4997,
+      "step": 425
+    },
+    {
+      "epoch": 1.4198347107438016,
+      "grad_norm": 1.249915361404419,
+      "learning_rate": 1.7634592412513008e-05,
+      "loss": 0.4936,
+      "step": 430
+    },
+    {
+      "epoch": 1.4363636363636363,
+      "grad_norm": 1.1150130033493042,
+      "learning_rate": 1.7365391428315068e-05,
+      "loss": 0.4403,
+      "step": 435
+    },
+    {
+      "epoch": 1.4528925619834712,
+      "grad_norm": 1.2004356384277344,
+      "learning_rate": 1.7095406817478083e-05,
+      "loss": 0.5046,
+      "step": 440
+    },
+    {
+      "epoch": 1.4694214876033058,
+      "grad_norm": 1.265571117401123,
+      "learning_rate": 1.6824728022760085e-05,
+      "loss": 0.4732,
+      "step": 445
+    },
+    {
+      "epoch": 1.4859504132231405,
+      "grad_norm": 1.3377974033355713,
+      "learning_rate": 1.6553444716894155e-05,
+      "loss": 0.4405,
+      "step": 450
+    },
+    {
+      "epoch": 1.5024793388429751,
+      "grad_norm": 1.2106013298034668,
+      "learning_rate": 1.628164677288086e-05,
+      "loss": 0.4906,
+      "step": 455
+    },
+    {
+      "epoch": 1.5190082644628098,
+      "grad_norm": 1.2407844066619873,
+      "learning_rate": 1.600942423421442e-05,
+      "loss": 0.4584,
+      "step": 460
+    },
+    {
+      "epoch": 1.5355371900826447,
+      "grad_norm": 1.1791411638259888,
+      "learning_rate": 1.573686728505229e-05,
+      "loss": 0.4661,
+      "step": 465
+    },
+    {
+      "epoch": 1.5520661157024793,
+      "grad_norm": 1.275108814239502,
+      "learning_rate": 1.5464066220338247e-05,
+      "loss": 0.4865,
+      "step": 470
+    },
+    {
+      "epoch": 1.5685950413223142,
+      "grad_norm": 1.1702497005462646,
+      "learning_rate": 1.5191111415888718e-05,
+      "loss": 0.4755,
+      "step": 475
+    },
+    {
+      "epoch": 1.5851239669421489,
+      "grad_norm": 1.5265661478042603,
+      "learning_rate": 1.4918093298452279e-05,
+      "loss": 0.4614,
+      "step": 480
+    },
+    {
+      "epoch": 1.6016528925619835,
+      "grad_norm": 1.2412056922912598,
+      "learning_rate": 1.464510231575239e-05,
+      "loss": 0.4653,
+      "step": 485
+    },
+    {
+      "epoch": 1.6181818181818182,
+      "grad_norm": 1.3541052341461182,
+      "learning_rate": 1.4372228906523086e-05,
+      "loss": 0.4012,
+      "step": 490
+    },
+    {
+      "epoch": 1.6347107438016528,
+      "grad_norm": 1.1902379989624023,
+      "learning_rate": 1.4099563470547673e-05,
+      "loss": 0.444,
+      "step": 495
+    },
+    {
+      "epoch": 1.6512396694214875,
+      "grad_norm": 1.2285325527191162,
+      "learning_rate": 1.382719633871034e-05,
+      "loss": 0.4585,
+      "step": 500
+    },
+    {
+      "epoch": 1.6677685950413224,
+      "grad_norm": 1.4228185415267944,
+      "learning_rate": 1.3555217743070593e-05,
+      "loss": 0.4272,
+      "step": 505
+    },
+    {
+      "epoch": 1.684297520661157,
+      "grad_norm": 1.268658995628357,
+      "learning_rate": 1.3283717786970421e-05,
+      "loss": 0.3885,
+      "step": 510
+    },
+    {
+      "epoch": 1.7008264462809919,
+      "grad_norm": 1.2212035655975342,
+      "learning_rate": 1.3012786415184087e-05,
+      "loss": 0.4023,
+      "step": 515
+    },
+    {
+      "epoch": 1.7173553719008265,
+      "grad_norm": 1.1776573657989502,
+      "learning_rate": 1.2742513384120528e-05,
+      "loss": 0.4295,
+      "step": 520
+    },
+    {
+      "epoch": 1.7338842975206612,
+      "grad_norm": 1.295913815498352,
+      "learning_rate": 1.2472988232088027e-05,
+      "loss": 0.4626,
+      "step": 525
+    },
+    {
+      "epoch": 1.7504132231404959,
+      "grad_norm": 1.1968791484832764,
+      "learning_rate": 1.2204300249631276e-05,
+      "loss": 0.4138,
+      "step": 530
+    },
+    {
+      "epoch": 1.7669421487603305,
+      "grad_norm": 1.395003318786621,
+      "learning_rate": 1.193653844995042e-05,
+      "loss": 0.4174,
+      "step": 535
+    },
+    {
+      "epoch": 1.7834710743801652,
+      "grad_norm": 1.2571361064910889,
+      "learning_rate": 1.166979153941205e-05,
+      "loss": 0.4136,
+      "step": 540
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 1.2398642301559448,
+      "learning_rate": 1.1404147888161787e-05,
+      "loss": 0.3862,
+      "step": 545
+    },
+    {
+      "epoch": 1.8165289256198347,
+      "grad_norm": 1.38470458984375,
+      "learning_rate": 1.1139695500848289e-05,
+      "loss": 0.4012,
+      "step": 550
+    },
+    {
+      "epoch": 1.8330578512396696,
+      "grad_norm": 1.3475830554962158,
+      "learning_rate": 1.0876521987468375e-05,
+      "loss": 0.4093,
+      "step": 555
+    },
+    {
+      "epoch": 1.8495867768595042,
+      "grad_norm": 1.2676841020584106,
+      "learning_rate": 1.061471453434278e-05,
+      "loss": 0.4011,
+      "step": 560
+    },
+    {
+      "epoch": 1.866115702479339,
+      "grad_norm": 1.324524164199829,
+      "learning_rate": 1.0354359875232404e-05,
+      "loss": 0.4045,
+      "step": 565
+    },
+    {
+      "epoch": 1.8826446280991735,
+      "grad_norm": 1.2812235355377197,
+      "learning_rate": 1.0095544262604409e-05,
+      "loss": 0.3747,
+      "step": 570
+    },
+    {
+      "epoch": 1.8991735537190082,
+      "grad_norm": 1.3111671209335327,
+      "learning_rate": 9.838353439057763e-06,
+      "loss": 0.3689,
+      "step": 575
+    },
+    {
+      "epoch": 1.9157024793388429,
+      "grad_norm": 1.297474980354309,
+      "learning_rate": 9.582872608917738e-06,
+      "loss": 0.4218,
+      "step": 580
+    },
+    {
+      "epoch": 1.9322314049586777,
+      "grad_norm": 1.283281683921814,
+      "learning_rate": 9.3291864100087e-06,
+      "loss": 0.3647,
+      "step": 585
+    },
+    {
+      "epoch": 1.9487603305785124,
+      "grad_norm": 1.2572720050811768,
+      "learning_rate": 9.077378885614589e-06,
+      "loss": 0.3772,
+      "step": 590
+    },
+    {
+      "epoch": 1.9652892561983473,
+      "grad_norm": 1.2468287944793701,
+      "learning_rate": 8.827533456636326e-06,
+      "loss": 0.3794,
+      "step": 595
+    },
+    {
+      "epoch": 1.981818181818182,
+      "grad_norm": 1.250914216041565,
+      "learning_rate": 8.579732893955506e-06,
+      "loss": 0.3579,
+      "step": 600
+    },
+    {
+      "epoch": 1.9983471074380166,
+      "grad_norm": 1.2372207641601562,
+      "learning_rate": 8.334059291013297e-06,
+      "loss": 0.3954,
+      "step": 605
+    },
+    {
+      "epoch": 2.013223140495868,
+      "grad_norm": 1.3838112354278564,
+      "learning_rate": 8.09059403661389e-06,
+      "loss": 0.3406,
+      "step": 610
+    },
+    {
+      "epoch": 2.0297520661157025,
+      "grad_norm": 1.3364596366882324,
+      "learning_rate": 7.849417787961301e-06,
+      "loss": 0.328,
+      "step": 615
+    },
+    {
+      "epoch": 2.046280991735537,
+      "grad_norm": 1.2164885997772217,
+      "learning_rate": 7.610610443938613e-06,
+      "loss": 0.3243,
+      "step": 620
+    },
+    {
+      "epoch": 2.062809917355372,
+      "grad_norm": 1.371264934539795,
+      "learning_rate": 7.374251118638359e-06,
+      "loss": 0.2965,
+      "step": 625
+    },
+    {
+      "epoch": 2.0793388429752064,
+      "grad_norm": 1.362658143043518,
+      "learning_rate": 7.140418115153002e-06,
+      "loss": 0.2769,
+      "step": 630
+    },
+    {
+      "epoch": 2.0958677685950415,
+      "grad_norm": 1.2558574676513672,
+      "learning_rate": 6.90918889963399e-06,
+      "loss": 0.3218,
+      "step": 635
+    },
+    {
+      "epoch": 2.112396694214876,
+      "grad_norm": 1.3430614471435547,
+      "learning_rate": 6.680640075628127e-06,
+      "loss": 0.2896,
+      "step": 640
+    },
+    {
+      "epoch": 2.128925619834711,
+      "grad_norm": 1.3200640678405762,
+      "learning_rate": 6.454847358699728e-06,
+      "loss": 0.303,
+      "step": 645
+    },
+    {
+      "epoch": 2.1454545454545455,
+      "grad_norm": 1.313966155052185,
+      "learning_rate": 6.231885551346929e-06,
+      "loss": 0.3082,
+      "step": 650
+    },
+    {
+      "epoch": 2.16198347107438,
+      "grad_norm": 1.317543864250183,
+      "learning_rate": 6.011828518220467e-06,
+      "loss": 0.2737,
+      "step": 655
+    },
+    {
+      "epoch": 2.178512396694215,
+      "grad_norm": 1.511826992034912,
+      "learning_rate": 5.794749161653201e-06,
+      "loss": 0.316,
+      "step": 660
+    },
+    {
+      "epoch": 2.1950413223140495,
+      "grad_norm": 1.2921308279037476,
+      "learning_rate": 5.580719397508446e-06,
+      "loss": 0.3149,
+      "step": 665
+    },
+    {
+      "epoch": 2.211570247933884,
+      "grad_norm": 1.3625119924545288,
+      "learning_rate": 5.369810131355022e-06,
+      "loss": 0.2986,
+      "step": 670
+    },
+    {
+      "epoch": 2.2280991735537192,
+      "grad_norm": 1.486613154411316,
+      "learning_rate": 5.162091234977103e-06,
+      "loss": 0.3031,
+      "step": 675
+    },
+    {
+      "epoch": 2.244628099173554,
+      "grad_norm": 1.3193362951278687,
+      "learning_rate": 4.957631523226512e-06,
+      "loss": 0.2925,
+      "step": 680
+    },
+    {
+      "epoch": 2.2611570247933885,
+      "grad_norm": 1.3618298768997192,
+      "learning_rate": 4.756498731225157e-06,
+      "loss": 0.284,
+      "step": 685
+    },
+    {
+      "epoch": 2.277685950413223,
+      "grad_norm": 1.2601497173309326,
+      "learning_rate": 4.5587594919251565e-06,
+      "loss": 0.3003,
+      "step": 690
+    },
+    {
+      "epoch": 2.294214876033058,
+      "grad_norm": 1.2634460926055908,
+      "learning_rate": 4.364479314034129e-06,
+      "loss": 0.3091,
+      "step": 695
+    },
+    {
+      "epoch": 2.3107438016528925,
+      "grad_norm": 1.3098965883255005,
+      "learning_rate": 4.173722560312947e-06,
+      "loss": 0.281,
+      "step": 700
+    },
+    {
+      "epoch": 2.327272727272727,
+      "grad_norm": 1.2892374992370605,
+      "learning_rate": 3.98655242625306e-06,
+      "loss": 0.2854,
+      "step": 705
+    },
+    {
+      "epoch": 2.343801652892562,
+      "grad_norm": 1.3915247917175293,
+      "learning_rate": 3.8030309191405802e-06,
+      "loss": 0.2745,
+      "step": 710
+    },
+    {
+      "epoch": 2.3603305785123965,
+      "grad_norm": 1.3079394102096558,
+      "learning_rate": 3.623218837514011e-06,
+      "loss": 0.2861,
+      "step": 715
+    },
+    {
+      "epoch": 2.3768595041322316,
+      "grad_norm": 1.3242440223693848,
+      "learning_rate": 3.4471757510223774e-06,
+      "loss": 0.2991,
+      "step": 720
+    },
+    {
+      "epoch": 2.3933884297520662,
+      "grad_norm": 1.0953946113586426,
+      "learning_rate": 3.2749599806904878e-06,
+      "loss": 0.2871,
+      "step": 725
+    },
+    {
+      "epoch": 2.409917355371901,
+      "grad_norm": 1.4852635860443115,
+      "learning_rate": 3.1066285795978833e-06,
+      "loss": 0.2992,
+      "step": 730
+    },
+    {
+      "epoch": 2.4264462809917355,
+      "grad_norm": 1.294978141784668,
+      "learning_rate": 2.9422373139778065e-06,
+      "loss": 0.2686,
+      "step": 735
+    },
+    {
+      "epoch": 2.44297520661157,
+      "grad_norm": 1.4800472259521484,
+      "learning_rate": 2.7818406447424805e-06,
+      "loss": 0.3113,
+      "step": 740
+    },
+    {
+      "epoch": 2.459504132231405,
+      "grad_norm": 1.24534010887146,
+      "learning_rate": 2.6254917094408565e-06,
+      "loss": 0.3078,
+      "step": 745
+    },
+    {
+      "epoch": 2.4760330578512395,
+      "grad_norm": 1.306331753730774,
+      "learning_rate": 2.473242304654783e-06,
+      "loss": 0.3373,
+      "step": 750
+    },
+    {
+      "epoch": 2.4925619834710746,
+      "grad_norm": 1.260591983795166,
+      "learning_rate": 2.3251428688393627e-06,
+      "loss": 0.2945,
+      "step": 755
+    },
+    {
+      "epoch": 2.509090909090909,
+      "grad_norm": 1.201499342918396,
+      "learning_rate": 2.1812424656133003e-06,
+      "loss": 0.2686,
+      "step": 760
+    },
+    {
+      "epoch": 2.525619834710744,
+      "grad_norm": 1.202189326286316,
+      "learning_rate": 2.0415887675046966e-06,
+      "loss": 0.2604,
+      "step": 765
+    },
+    {
+      "epoch": 2.5421487603305786,
+      "grad_norm": 1.4685600996017456,
+      "learning_rate": 1.906228040157702e-06,
+      "loss": 0.2844,
+      "step": 770
+    },
+    {
+      "epoch": 2.5586776859504132,
+      "grad_norm": 1.2637609243392944,
+      "learning_rate": 1.7752051270052194e-06,
+      "loss": 0.2851,
+      "step": 775
+    },
+    {
+      "epoch": 2.575206611570248,
+      "grad_norm": 1.3588604927062988,
+      "learning_rate": 1.6485634344128325e-06,
+      "loss": 0.2872,
+      "step": 780
+    },
+    {
+      "epoch": 2.5917355371900825,
+      "grad_norm": 1.1751164197921753,
+      "learning_rate": 1.526344917298761e-06,
+      "loss": 0.2578,
+      "step": 785
+    },
+    {
+      "epoch": 2.608264462809917,
+      "grad_norm": 1.256611704826355,
+      "learning_rate": 1.4085900652346834e-06,
+      "loss": 0.2816,
+      "step": 790
+    },
+    {
+      "epoch": 2.624793388429752,
+      "grad_norm": 1.2114397287368774,
+      "learning_rate": 1.2953378890320278e-06,
+      "loss": 0.3006,
+      "step": 795
+    },
+    {
+      "epoch": 2.641322314049587,
+      "grad_norm": 1.2731175422668457,
+      "learning_rate": 1.1866259078181274e-06,
+      "loss": 0.2668,
+      "step": 800
+    },
+    {
+      "epoch": 2.6578512396694216,
+      "grad_norm": 1.3774311542510986,
+      "learning_rate": 1.082490136606587e-06,
+      "loss": 0.2715,
+      "step": 805
+    },
+    {
+      "epoch": 2.6743801652892563,
+      "grad_norm": 1.2104158401489258,
+      "learning_rate": 9.829650743659185e-07,
+      "loss": 0.294,
+      "step": 810
+    },
+    {
+      "epoch": 2.690909090909091,
+      "grad_norm": 1.1546332836151123,
+      "learning_rate": 8.880836925904284e-07,
+      "loss": 0.2714,
+      "step": 815
+    },
+    {
+      "epoch": 2.7074380165289256,
+      "grad_norm": 1.2266736030578613,
+      "learning_rate": 7.978774243771636e-07,
+      "loss": 0.2867,
+      "step": 820
+    },
+    {
+      "epoch": 2.7239669421487602,
+      "grad_norm": 1.2701377868652344,
+      "learning_rate": 7.123761540124729e-07,
+      "loss": 0.3008,
+      "step": 825
+    },
+    {
+      "epoch": 2.740495867768595,
+      "grad_norm": 1.2228130102157593,
+      "learning_rate": 6.316082070717083e-07,
+      "loss": 0.272,
+      "step": 830
+    },
+    {
+      "epoch": 2.75702479338843,
+      "grad_norm": 1.3307437896728516,
+      "learning_rate": 5.556003410352762e-07,
+      "loss": 0.287,
+      "step": 835
+    },
+    {
+      "epoch": 2.773553719008264,
+      "grad_norm": 1.4137649536132812,
+      "learning_rate": 4.843777364242064e-07,
+      "loss": 0.279,
+      "step": 840
+    },
+    {
+      "epoch": 2.7900826446280993,
+      "grad_norm": 1.3000072240829468,
+      "learning_rate": 4.179639884581371e-07,
+      "loss": 0.2768,
+      "step": 845
+    },
+    {
+      "epoch": 2.806611570247934,
+      "grad_norm": 1.2189329862594604,
+      "learning_rate": 3.563810992384858e-07,
+      "loss": 0.2695,
+      "step": 850
+    },
+    {
+      "epoch": 2.8231404958677686,
+      "grad_norm": 1.1713279485702515,
+      "learning_rate": 2.996494704594183e-07,
+      "loss": 0.3059,
+      "step": 855
+    },
+    {
+      "epoch": 2.8396694214876033,
+      "grad_norm": 4.241643905639648,
+      "learning_rate": 2.477878966490049e-07,
+      "loss": 0.2362,
+      "step": 860
+    },
+    {
+      "epoch": 2.856198347107438,
+      "grad_norm": 1.2542489767074585,
+      "learning_rate": 2.0081355894280008e-07,
+      "loss": 0.3209,
+      "step": 865
+    },
+    {
+      "epoch": 2.8727272727272726,
+      "grad_norm": 1.1765050888061523,
+      "learning_rate": 1.587420193919481e-07,
+      "loss": 0.2759,
+      "step": 870
+    },
+    {
+      "epoch": 2.8892561983471072,
+      "grad_norm": 1.19680655002594,
+      "learning_rate": 1.2158721580764753e-07,
+      "loss": 0.2926,
+      "step": 875
+    },
+    {
+      "epoch": 2.9057851239669423,
+      "grad_norm": 1.3726508617401123,
+      "learning_rate": 8.936145714371624e-08,
+      "loss": 0.2599,
+      "step": 880
+    },
+    {
+      "epoch": 2.922314049586777,
+      "grad_norm": 1.3616031408309937,
+      "learning_rate": 6.207541941878503e-08,
+      "loss": 0.2823,
+      "step": 885
+    },
+    {
+      "epoch": 2.9388429752066116,
+      "grad_norm": 1.4766091108322144,
+      "learning_rate": 3.973814217945016e-08,
+      "loss": 0.2905,
+      "step": 890
+    },
+    {
+      "epoch": 2.9553719008264463,
+      "grad_norm": 1.2850749492645264,
+      "learning_rate": 2.2357025505577165e-08,
+      "loss": 0.2481,
+      "step": 895
+    },
+    {
+      "epoch": 2.971900826446281,
+      "grad_norm": 1.1268725395202637,
+      "learning_rate": 9.937827558748613e-09,
+      "loss": 0.284,
+      "step": 900
+    },
+    {
+      "epoch": 2.9884297520661156,
+      "grad_norm": 1.3954100608825684,
+      "learning_rate": 2.4846626746399812e-09,
+      "loss": 0.2483,
+      "step": 905
+    },
+    {
+      "epoch": 3.0,
+      "step": 909,
+      "total_flos": 1.273900111881044e+18,
+      "train_loss": 0.571772654588991,
+      "train_runtime": 581.8624,
+      "train_samples_per_second": 49.837,
+      "train_steps_per_second": 1.562
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 909,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 2000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.273900111881044e+18,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

48_128_e3_3e-5/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2f163d3723476b34aaac4d3acd2824eefa5e4a07ceb42e44c2bf1267e196fbf8
+size 8145

48_128_e3_3e-5/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff