Upload folder using huggingface_hub

Browse files

Files changed (14) hide show

all_results.json +9 -0
config.json +37 -0
configuration_backpack_gpt2.py +42 -0
generation_config.json +8 -0
merges.txt +0 -0
modeling_backpack_gpt2.py +269 -0
pytorch_model.bin +3 -0
special_tokens_map.json +23 -0
tokenizer.json +0 -0
tokenizer_config.json +20 -0
train_results.json +9 -0
trainer_state.json +2227 -0
training_args.bin +3 -0
vocab.json +0 -0

all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 1.222658667991288,
+    "total_flos": 8.246852548747592e+18,
+    "train_loss": 1.5593041332244872,
+    "train_runtime": 85792.9956,
+    "train_samples": 16749432,
+    "train_samples_per_second": 238.714,
+    "train_steps_per_second": 0.117
+}

config.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "BackpackGPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "auto_map": {
+    "AutoConfig": "configuration_backpack_gpt2.BackpackGPT2Config",
+    "AutoModelForCausalLM": "modeling_backpack_gpt2.BackpackGPT2LMHeadModel"
+  },
+  "bos_token_id": 50256,
+  "dtype": "float32",
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 12,
+  "n_positions": 512,
+  "num_senses": 16,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": true,
+  "scale_attn_weights": true,
+  "sense_intermediate_scale": 4,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "transformers_version": "4.57.0",
+  "use_cache": true,
+  "vocab_size": 50264
+}

configuration_backpack_gpt2.py ADDED Viewed

	@@ -0,0 +1,42 @@

+from transformers.models.gpt2.configuration_gpt2 import GPT2Config
+class BackpackGPT2Config(GPT2Config):
+  """
+    This is the configuration class to store the configuration of a Backpack GPT-2 model. It is used to
+    instantiate a Backpack GPT-2 model according to the specified arguments, defining the model architecture.
+    Configuration objects inherit from [`GPT2Config`] and can be used to control the model outputs. Read the
+    documentation from [`GPT2Config`] for more information.
+    Compared to [`GPT2Config`], this class adds the two Backpack-specific arguments below and changes the
+    defaults of `vocab_size` (50264), `n_positions` (512) and `scale_attn_by_inverse_layer_idx` (True).
+    Args:
+        num_senses (`int`, *optional*, defaults to 16):
+            The number of sense vectors to define for each word.
+        sense_intermediate_scale (`int`, *optional*, defaults to 4):
+            Multiplier applied to `n_embd` to obtain the hidden dimensionality of the final MLP of the
+            sense vector network.
+    Example:
+    ```python
+    >>> # BackpackGPT2Config and BackpackGPT2Model are defined in this repository
+    >>> # (configuration_backpack_gpt2.py / modeling_backpack_gpt2.py), not in the transformers package.
+    >>> # Initializing a Backpack GPT-2 configuration with the default arguments
+    >>> configuration = BackpackGPT2Config()
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = BackpackGPT2Model(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+  """
+  def __init__(self,
+               vocab_size=50264,
+               num_senses=16,
+               sense_intermediate_scale=4,
+               n_positions=512,
+               scale_attn_by_inverse_layer_idx=True,
+               **kwargs,
+  ):
+    # Backpack-specific fields are stored on the instance before delegating to GPT2Config.
+    self.num_senses = num_senses
+    self.sense_intermediate_scale = sense_intermediate_scale
+    # All remaining arguments (including any extra **kwargs) are handled by the parent class.
+    super().__init__(vocab_size=vocab_size, n_positions=n_positions, scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, **kwargs)

generation_config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": [
+    50256
+  ],
+  "transformers_version": "4.57.0"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

modeling_backpack_gpt2.py ADDED Viewed

	@@ -0,0 +1,269 @@

+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers.activations import ACT2FN
+from transformers.pytorch_utils import Conv1D
+from transformers.utils import (
+    ModelOutput,
+    logging,
+)
+from transformers.models.gpt2.modeling_gpt2 import GPT2Model, GPT2PreTrainedModel, GenerationMixin
+from transformers.cache_utils import Cache
+from .configuration_backpack_gpt2 import BackpackGPT2Config
+logger = logging.get_logger(__name__)
+### Backpack-Specific
+class BackpackGPT2PreTrainedModel(GPT2PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+    _keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias"]
+    config_class = BackpackGPT2Config
+    base_model_prefix = "backpack"
+    is_parallelizable = True
+    supports_gradient_checkpointing = False
+    _no_split_modules = ["GPT2Block", "BackpackNoMixBlock"]
+    def __init__(self, *inputs, **kwargs):
+        super().__init__(*inputs, **kwargs)
+class BackpackMLP(nn.Module):
+  def __init__(self, embed_dim, intermediate_dim, out_dim, config):
+        super().__init__()
+        self.c_fc = Conv1D(intermediate_dim, embed_dim)
+        self.c_proj = Conv1D(out_dim, intermediate_dim)
+        self.act = ACT2FN[config.activation_function]
+        self.dropout = nn.Dropout(config.resid_pdrop)
+  def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
+      hidden_states = self.c_fc(hidden_states)
+      hidden_states = self.act(hidden_states)
+      hidden_states = self.c_proj(hidden_states)
+      hidden_states = self.dropout(hidden_states)
+      return hidden_states
+class BackpackNoMixBlock(nn.Module):
+  def __init__(self, config):
+    super().__init__()
+    self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
+    self.ln_2 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
+    self.mlp = BackpackMLP(config.n_embd, config.n_embd*4, config.n_embd, config)
+    self.resid_dropout1 = nn.Dropout(config.resid_pdrop)
+    self.resid_dropout2 = nn.Dropout(config.resid_pdrop)
+  def forward(self, hidden_states, residual):
+    residual = self.resid_dropout1(hidden_states) + residual
+    hidden_states = self.ln_1(residual)
+    mlp_out = self.mlp(hidden_states)
+    residual = self.resid_dropout2(mlp_out) + residual
+    hidden_states = self.ln_2(residual)
+    return hidden_states
+class BackpackSenseNetwork(nn.Module):
+    def __init__(self, config, num_senses, device=None, dtype=None):
+        super().__init__()
+        self.num_senses = num_senses
+        #self.embeddings = embeddings
+        self.n_embd = config.n_embd
+        self.dropout = nn.Dropout(config.embd_pdrop)
+        self.block = BackpackNoMixBlock(config)
+        self.ln = nn.LayerNorm(self.n_embd, eps=config.layer_norm_epsilon)
+        self.final_mlp = BackpackMLP(
+            embed_dim=config.n_embd,
+            intermediate_dim=config.sense_intermediate_scale*config.n_embd,
+            out_dim=config.n_embd*config.num_senses,
+            config=config,
+            )
+    def forward(self, input_embeds):
+      residual = self.dropout(input_embeds)
+      hidden_states = self.ln(residual)
+      hidden_states = self.block(hidden_states, residual)
+      senses = self.final_mlp(hidden_states)
+      bs, s, nvd = senses.shape
+      return senses.reshape(bs, s, self.num_senses, self.n_embd).transpose(1,2) # (bs, nv, s, d)
+class BackpackWeightNetwork(nn.Module):
+  def __init__(self, num_senses, embed_dim):
+    super().__init__()
+    self.n_embd = embed_dim
+    self.num_senses = num_senses
+    self.embed_per_sense = embed_dim // num_senses
+    self.c_attn = nn.Linear(embed_dim, 2 * num_senses * self.embed_per_sense)
+    self.softmax_scale = None
+  def forward(self, encoded):
+    b, s, d = encoded.shape
+    encoded = self.c_attn(encoded) # (b, s, 2*d)
+    encoded = encoded.reshape(b, s, 2, self.num_senses, self.embed_per_sense) #(b, s, 2, nv, d//nv)
+    batch_size, seqlen = encoded.shape[0], encoded.shape[1]
+    # compute scores & mask
+    q, k = encoded.unbind(dim=2)
+    softmax_scale = self.softmax_scale or 1.0 / math.sqrt(q.shape[-1])
+    scores = torch.einsum('bthd,bshd->bhts', q, k * softmax_scale)
+    causal_mask = torch.triu(torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1)
+    scores = scores + causal_mask.to(dtype=scores.dtype)
+    return torch.softmax(scores, dim=-1, dtype=q.dtype)
+@dataclass
+class BackpackGPT2BaseModelOutput(ModelOutput):
+    """Output container returned by `BackpackGPT2Model`."""
+    # Sum over senses of the contextualization-weighted sense vectors;
+    # (bs, s, d) according to the shape comments in BackpackGPT2Model.forward.
+    hidden_states: torch.FloatTensor = None
+    # Contextualization weights used for that sum;
+    # (bs, nv, s, s) according to the shape comments in BackpackGPT2Model.forward.
+    contextualization: torch.FloatTensor = None
+class BackpackGPT2Model(BackpackGPT2PreTrainedModel):
+    _keys_to_ignore_on_load_missing = [r".*attn.masked_bias", r".*attn.bias"]
+    def __init__(self, config):
+        super().__init__(config)
+        self.embed_dim = config.n_embd
+        self.num_senses = config.num_senses
+        self.gpt2_model = GPT2Model(config)
+        self.sense_network = BackpackSenseNetwork(config, self.num_senses, self.gpt2_model.wte)
+        self.word_embeddings = self.gpt2_model.wte
+        self.position_embeddings = self.gpt2_model.wpe
+        self.sense_weight_net = BackpackWeightNetwork(self.num_senses, self.embed_dim)
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+        self.gradient_checkpointing = False
+    def get_num_senses(self):
+        return self.num_senses
+    def get_word_embeddings(self):
+        return self.word_embeddings
+    def get_sense_network(self):
+        return self.sense_network
+    def get_input_embeddings(self):
+        return self.word_embeddings
+    def forward(
+        self,
+        input_ids,
+        position_ids,
+        cache_position: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs):
+        # Compute senses
+        sense_input_embeds = self.word_embeddings(input_ids)
+        senses = self.sense_network(sense_input_embeds) # (bs, nv, s, d)
+        # Compute contextualization weights
+        #contextl_hidden_states = self.gpt2_model(input_ids, position_ids=position_ids).last_hidden_state # (bs, s, d)
+        contextl_hidden_states = self.gpt2_model(input_ids=input_ids, position_ids=position_ids, **kwargs).last_hidden_state
+        contextualization = self.sense_weight_net(contextl_hidden_states) # (bs, nv, s, s)
+        # Compute resulting outputs
+        hidden_states = torch.sum(contextualization @ senses, dim=1) # (bs, nv, s, d) -> (bs, s, d)
+        return BackpackGPT2BaseModelOutput(
+            hidden_states=hidden_states,
+            contextualization=contextualization,
+        )
+    def run_with_custom_contextualization(self, input_ids, contextualization):
+        # Compute senses
+        sense_input_embeds = self.word_embeddings(input_ids)
+        senses = self.sense_network(sense_input_embeds) # (bs, nv, s, d)
+        # Compute resulting outputs
+        hidden_states = torch.sum(contextualization @ senses, dim=1) # (bs, nv, s, d) -> (bs, s, d)
+        return BackpackGPT2BaseModelOutput(
+            hidden_states=hidden_states,
+            contextualization=contextualization,
+        )
+@dataclass
+class BackpackGPT2LMHeadModelOutput(ModelOutput):
+    """Output container returned by `BackpackGPT2LMHeadModel`."""
+    # Language-modeling logits; (bs, s, V) according to the shape comment in
+    # BackpackGPT2LMHeadModel.forward.
+    logits: torch.FloatTensor = None
+    # Contextualization weights passed through from the Backpack body.
+    contextualization: torch.FloatTensor = None
+    # Cross-entropy loss; stays None when no labels are given.
+    loss: Optional[torch.FloatTensor] = None
+class BackpackGPT2LMHeadModel(BackpackGPT2PreTrainedModel, GenerationMixin):
+  _keys_to_ignore_on_load_missing = [r".*attn.masked_bias", r".*attn.bias"]
+  accepts_loss_kwargs = False
+  def __init__(self, config):
+    super().__init__(config)
+    self.backpack = BackpackGPT2Model(config)
+    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+    # Model parallel
+    self.model_parallel = False
+    self.device_map = None
+    self.tie_weights()
+  def tie_weights(self):
+      self.lm_head.weight = self.backpack.word_embeddings.weight # also tied with the underlying underlying transf
+  def get_lm_head(self):
+      return self.lm_head
+  def get_input_embeddings(self):
+    return self.backpack.word_embeddings
+  def forward(
+    self,
+    input_ids,
+    position_ids=None,
+    labels: Optional[torch.LongTensor] = None,
+    cache_position: Optional[torch.LongTensor] = None,
+    past_key_values: Optional[Cache] = None,
+    inputs_embeds: Optional[torch.FloatTensor] = None,
+    attention_mask: Optional[torch.FloatTensor] = None,
+    use_cache: Optional[bool] = None,
+    return_dict: Optional[bool] = None,
+    **kwargs):
+      outputs = self.backpack(input_ids, position_ids=position_ids, **kwargs)
+      hidden_states, contextualization = outputs.hidden_states, outputs.contextualization
+      lm_logits = self.lm_head(hidden_states) # (bs, s, V)
+      loss = None
+      if labels is not None:
+        labels = labels.to(lm_logits.device)
+        shift_logits = lm_logits[..., :-1, :].contiguous()
+        shift_labels = labels[..., 1:].contiguous()
+        loss_fct = CrossEntropyLoss(ignore_index=-100, reduction='mean')
+        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+        #print(f"[DEBUG] loss.item(): {loss.item()}, loss.shape: {loss.shape}")
+      return BackpackGPT2LMHeadModelOutput(
+            logits=lm_logits,
+            contextualization=contextualization,
+            loss=loss
+        )
+  def run_with_custom_contextualization(self, input_ids, contextualization):
+      outputs = self.backpack.run_with_custom_contextualization(input_ids, contextualization)
+      hidden_states, contextualization = outputs.hidden_states, outputs.contextualization
+      lm_logits = self.lm_head(hidden_states)
+      return BackpackGPT2LMHeadModelOutput(
+        logits=lm_logits,
+        contextualization=contextualization,
+    )

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e843dcebc05852269a0c08e3b26bc17f2684d89ef9b13bb3e4869e45e1d03d1
+size 680390003

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "50256": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 1024,
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 1.222658667991288,
+    "total_flos": 8.246852548747592e+18,
+    "train_loss": 1.5593041332244872,
+    "train_runtime": 85792.9956,
+    "train_samples": 16749432,
+    "train_samples_per_second": 238.714,
+    "train_steps_per_second": 0.117
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,2227 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.222658667991288,
+  "eval_steps": 500,
+  "global_step": 10000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.003912727828512476,
+      "grad_norm": 8.57561206817627,
+      "learning_rate": 1.55e-06,
+      "loss": 5.0338,
+      "step": 32
+    },
+    {
+      "epoch": 0.007825455657024952,
+      "grad_norm": 5.36021089553833,
+      "learning_rate": 3.1500000000000003e-06,
+      "loss": 4.456,
+      "step": 64
+    },
+    {
+      "epoch": 0.011738183485537427,
+      "grad_norm": 3.3196067810058594,
+      "learning_rate": 4.75e-06,
+      "loss": 3.9216,
+      "step": 96
+    },
+    {
+      "epoch": 0.015650911314049904,
+      "grad_norm": 2.2839956283569336,
+      "learning_rate": 6.35e-06,
+      "loss": 3.6983,
+      "step": 128
+    },
+    {
+      "epoch": 0.01956363914256238,
+      "grad_norm": 1.4226499795913696,
+      "learning_rate": 7.95e-06,
+      "loss": 3.5863,
+      "step": 160
+    },
+    {
+      "epoch": 0.023476366971074854,
+      "grad_norm": 0.9770936369895935,
+      "learning_rate": 9.55e-06,
+      "loss": 3.5076,
+      "step": 192
+    },
+    {
+      "epoch": 0.02738909479958733,
+      "grad_norm": 0.6855128407478333,
+      "learning_rate": 1.115e-05,
+      "loss": 3.4515,
+      "step": 224
+    },
+    {
+      "epoch": 0.03130182262809981,
+      "grad_norm": 0.5743525624275208,
+      "learning_rate": 1.2750000000000002e-05,
+      "loss": 3.4145,
+      "step": 256
+    },
+    {
+      "epoch": 0.03521455045661228,
+      "grad_norm": 0.4765739440917969,
+      "learning_rate": 1.435e-05,
+      "loss": 3.3892,
+      "step": 288
+    },
+    {
+      "epoch": 0.03912727828512476,
+      "grad_norm": 0.40247443318367004,
+      "learning_rate": 1.595e-05,
+      "loss": 3.3664,
+      "step": 320
+    },
+    {
+      "epoch": 0.04304000611363723,
+      "grad_norm": 0.3582874834537506,
+      "learning_rate": 1.755e-05,
+      "loss": 3.3487,
+      "step": 352
+    },
+    {
+      "epoch": 0.04695273394214971,
+      "grad_norm": 0.31657862663269043,
+      "learning_rate": 1.915e-05,
+      "loss": 3.3349,
+      "step": 384
+    },
+    {
+      "epoch": 0.05086546177066218,
+      "grad_norm": 0.28206518292427063,
+      "learning_rate": 2.075e-05,
+      "loss": 3.3197,
+      "step": 416
+    },
+    {
+      "epoch": 0.05477818959917466,
+      "grad_norm": 0.2577824890613556,
+      "learning_rate": 2.235e-05,
+      "loss": 3.3058,
+      "step": 448
+    },
+    {
+      "epoch": 0.05869091742768714,
+      "grad_norm": 0.23786848783493042,
+      "learning_rate": 2.395e-05,
+      "loss": 3.2955,
+      "step": 480
+    },
+    {
+      "epoch": 0.06260364525619962,
+      "grad_norm": 0.2239329218864441,
+      "learning_rate": 2.555e-05,
+      "loss": 3.2846,
+      "step": 512
+    },
+    {
+      "epoch": 0.06651637308471209,
+      "grad_norm": 0.22519271075725555,
+      "learning_rate": 2.7150000000000003e-05,
+      "loss": 3.2731,
+      "step": 544
+    },
+    {
+      "epoch": 0.07042910091322456,
+      "grad_norm": 0.2189016044139862,
+      "learning_rate": 2.8749999999999997e-05,
+      "loss": 3.2663,
+      "step": 576
+    },
+    {
+      "epoch": 0.07434182874173703,
+      "grad_norm": 0.20760661363601685,
+      "learning_rate": 3.035e-05,
+      "loss": 3.2581,
+      "step": 608
+    },
+    {
+      "epoch": 0.07825455657024952,
+      "grad_norm": 0.205606147646904,
+      "learning_rate": 3.1950000000000004e-05,
+      "loss": 3.2451,
+      "step": 640
+    },
+    {
+      "epoch": 0.08216728439876199,
+      "grad_norm": 0.22558899223804474,
+      "learning_rate": 3.355e-05,
+      "loss": 3.2412,
+      "step": 672
+    },
+    {
+      "epoch": 0.08608001222727446,
+      "grad_norm": 0.22584667801856995,
+      "learning_rate": 3.515e-05,
+      "loss": 3.2358,
+      "step": 704
+    },
+    {
+      "epoch": 0.08999274005578695,
+      "grad_norm": 0.22091105580329895,
+      "learning_rate": 3.675e-05,
+      "loss": 3.2302,
+      "step": 736
+    },
+    {
+      "epoch": 0.09390546788429942,
+      "grad_norm": 0.22428959608078003,
+      "learning_rate": 3.8350000000000004e-05,
+      "loss": 3.2228,
+      "step": 768
+    },
+    {
+      "epoch": 0.09781819571281189,
+      "grad_norm": 0.22730223834514618,
+      "learning_rate": 3.995e-05,
+      "loss": 3.2207,
+      "step": 800
+    },
+    {
+      "epoch": 0.10173092354132436,
+      "grad_norm": 0.28039082884788513,
+      "learning_rate": 4.155e-05,
+      "loss": 3.2171,
+      "step": 832
+    },
+    {
+      "epoch": 0.10564365136983685,
+      "grad_norm": 0.32776346802711487,
+      "learning_rate": 4.315e-05,
+      "loss": 3.2104,
+      "step": 864
+    },
+    {
+      "epoch": 0.10955637919834932,
+      "grad_norm": 0.2800813615322113,
+      "learning_rate": 4.4750000000000004e-05,
+      "loss": 3.2053,
+      "step": 896
+    },
+    {
+      "epoch": 0.11346910702686179,
+      "grad_norm": 0.24571874737739563,
+      "learning_rate": 4.635e-05,
+      "loss": 3.2046,
+      "step": 928
+    },
+    {
+      "epoch": 0.11738183485537428,
+      "grad_norm": 0.5581298470497131,
+      "learning_rate": 4.795e-05,
+      "loss": 3.2004,
+      "step": 960
+    },
+    {
+      "epoch": 0.12129456268388675,
+      "grad_norm": 0.47118815779685974,
+      "learning_rate": 4.9550000000000005e-05,
+      "loss": 3.1967,
+      "step": 992
+    },
+    {
+      "epoch": 0.12520729051239923,
+      "grad_norm": 0.23707512021064758,
+      "learning_rate": 4.9872222222222225e-05,
+      "loss": 3.1945,
+      "step": 1024
+    },
+    {
+      "epoch": 0.1291200183409117,
+      "grad_norm": 0.41069141030311584,
+      "learning_rate": 4.969444444444445e-05,
+      "loss": 3.1928,
+      "step": 1056
+    },
+    {
+      "epoch": 0.13303274616942418,
+      "grad_norm": 0.376223623752594,
+      "learning_rate": 4.9516666666666666e-05,
+      "loss": 3.1871,
+      "step": 1088
+    },
+    {
+      "epoch": 0.13694547399793663,
+      "grad_norm": 0.22380244731903076,
+      "learning_rate": 4.933888888888889e-05,
+      "loss": 3.1862,
+      "step": 1120
+    },
+    {
+      "epoch": 0.14085820182644912,
+      "grad_norm": 0.2950900197029114,
+      "learning_rate": 4.9161111111111115e-05,
+      "loss": 3.1828,
+      "step": 1152
+    },
+    {
+      "epoch": 0.1447709296549616,
+      "grad_norm": 0.25872257351875305,
+      "learning_rate": 4.8983333333333336e-05,
+      "loss": 3.1828,
+      "step": 1184
+    },
+    {
+      "epoch": 0.14868365748347406,
+      "grad_norm": 0.3597142994403839,
+      "learning_rate": 4.880555555555556e-05,
+      "loss": 3.1845,
+      "step": 1216
+    },
+    {
+      "epoch": 0.15259638531198655,
+      "grad_norm": 0.30377593636512756,
+      "learning_rate": 4.862777777777778e-05,
+      "loss": 3.1806,
+      "step": 1248
+    },
+    {
+      "epoch": 0.15650911314049903,
+      "grad_norm": 0.3617115318775177,
+      "learning_rate": 4.845e-05,
+      "loss": 3.178,
+      "step": 1280
+    },
+    {
+      "epoch": 0.1604218409690115,
+      "grad_norm": 0.31589606404304504,
+      "learning_rate": 4.8272222222222226e-05,
+      "loss": 3.1787,
+      "step": 1312
+    },
+    {
+      "epoch": 0.16433456879752398,
+      "grad_norm": 0.30715763568878174,
+      "learning_rate": 4.809444444444445e-05,
+      "loss": 3.1754,
+      "step": 1344
+    },
+    {
+      "epoch": 0.16824729662603646,
+      "grad_norm": 0.2574257254600525,
+      "learning_rate": 4.791666666666667e-05,
+      "loss": 3.1732,
+      "step": 1376
+    },
+    {
+      "epoch": 0.17216002445454892,
+      "grad_norm": 0.3290633261203766,
+      "learning_rate": 4.773888888888889e-05,
+      "loss": 3.1723,
+      "step": 1408
+    },
+    {
+      "epoch": 0.1760727522830614,
+      "grad_norm": 0.24164608120918274,
+      "learning_rate": 4.756111111111111e-05,
+      "loss": 3.1693,
+      "step": 1440
+    },
+    {
+      "epoch": 0.1799854801115739,
+      "grad_norm": 0.30125918984413147,
+      "learning_rate": 4.738333333333334e-05,
+      "loss": 3.1685,
+      "step": 1472
+    },
+    {
+      "epoch": 0.18389820794008635,
+      "grad_norm": 0.3488104045391083,
+      "learning_rate": 4.720555555555556e-05,
+      "loss": 3.1678,
+      "step": 1504
+    },
+    {
+      "epoch": 0.18781093576859884,
+      "grad_norm": 0.2793637812137604,
+      "learning_rate": 4.702777777777778e-05,
+      "loss": 3.1668,
+      "step": 1536
+    },
+    {
+      "epoch": 0.1917236635971113,
+      "grad_norm": 0.2682870030403137,
+      "learning_rate": 4.685000000000001e-05,
+      "loss": 3.1642,
+      "step": 1568
+    },
+    {
+      "epoch": 0.19563639142562378,
+      "grad_norm": 0.36307454109191895,
+      "learning_rate": 4.667222222222222e-05,
+      "loss": 3.1654,
+      "step": 1600
+    },
+    {
+      "epoch": 0.19954911925413626,
+      "grad_norm": 0.23930683732032776,
+      "learning_rate": 4.649444444444445e-05,
+      "loss": 3.1641,
+      "step": 1632
+    },
+    {
+      "epoch": 0.20346184708264872,
+      "grad_norm": 0.3049800992012024,
+      "learning_rate": 4.631666666666667e-05,
+      "loss": 3.1654,
+      "step": 1664
+    },
+    {
+      "epoch": 0.2073745749111612,
+      "grad_norm": 0.27725374698638916,
+      "learning_rate": 4.613888888888889e-05,
+      "loss": 3.1642,
+      "step": 1696
+    },
+    {
+      "epoch": 0.2112873027396737,
+      "grad_norm": 0.2733665108680725,
+      "learning_rate": 4.596111111111112e-05,
+      "loss": 3.1584,
+      "step": 1728
+    },
+    {
+      "epoch": 0.21520003056818615,
+      "grad_norm": 0.34570956230163574,
+      "learning_rate": 4.578333333333333e-05,
+      "loss": 3.162,
+      "step": 1760
+    },
+    {
+      "epoch": 0.21911275839669864,
+      "grad_norm": 0.2521582543849945,
+      "learning_rate": 4.560555555555556e-05,
+      "loss": 3.1603,
+      "step": 1792
+    },
+    {
+      "epoch": 0.22302548622521112,
+      "grad_norm": 0.29344356060028076,
+      "learning_rate": 4.542777777777778e-05,
+      "loss": 3.1587,
+      "step": 1824
+    },
+    {
+      "epoch": 0.22693821405372358,
+      "grad_norm": 0.426881343126297,
+      "learning_rate": 4.525e-05,
+      "loss": 3.1561,
+      "step": 1856
+    },
+    {
+      "epoch": 0.23085094188223607,
+      "grad_norm": 0.27699196338653564,
+      "learning_rate": 4.507222222222223e-05,
+      "loss": 3.1581,
+      "step": 1888
+    },
+    {
+      "epoch": 0.23476366971074855,
+      "grad_norm": 0.32313504815101624,
+      "learning_rate": 4.4894444444444444e-05,
+      "loss": 3.1578,
+      "step": 1920
+    },
+    {
+      "epoch": 0.238676397539261,
+      "grad_norm": 0.26697778701782227,
+      "learning_rate": 4.4716666666666665e-05,
+      "loss": 3.157,
+      "step": 1952
+    },
+    {
+      "epoch": 0.2425891253677735,
+      "grad_norm": 0.2206508368253708,
+      "learning_rate": 4.453888888888889e-05,
+      "loss": 3.1551,
+      "step": 1984
+    },
+    {
+      "epoch": 0.24650185319628595,
+      "grad_norm": 0.252888947725296,
+      "learning_rate": 4.4361111111111113e-05,
+      "loss": 3.1563,
+      "step": 2016
+    },
+    {
+      "epoch": 0.25041458102479847,
+      "grad_norm": 0.28254494071006775,
+      "learning_rate": 4.4183333333333334e-05,
+      "loss": 3.156,
+      "step": 2048
+    },
+    {
+      "epoch": 0.2543273088533109,
+      "grad_norm": 0.28460440039634705,
+      "learning_rate": 4.4005555555555555e-05,
+      "loss": 3.156,
+      "step": 2080
+    },
+    {
+      "epoch": 0.2582400366818234,
+      "grad_norm": 0.290326863527298,
+      "learning_rate": 4.3827777777777776e-05,
+      "loss": 3.1518,
+      "step": 2112
+    },
+    {
+      "epoch": 0.26215276451033587,
+      "grad_norm": 0.2769670784473419,
+      "learning_rate": 4.3650000000000004e-05,
+      "loss": 3.1515,
+      "step": 2144
+    },
+    {
+      "epoch": 0.26606549233884835,
+      "grad_norm": 0.21678052842617035,
+      "learning_rate": 4.3472222222222225e-05,
+      "loss": 3.1518,
+      "step": 2176
+    },
+    {
+      "epoch": 0.26997822016736084,
+      "grad_norm": 0.3134085536003113,
+      "learning_rate": 4.3294444444444446e-05,
+      "loss": 3.1501,
+      "step": 2208
+    },
+    {
+      "epoch": 0.27389094799587327,
+      "grad_norm": 0.35099807381629944,
+      "learning_rate": 4.311666666666667e-05,
+      "loss": 3.1523,
+      "step": 2240
+    },
+    {
+      "epoch": 0.27780367582438575,
+      "grad_norm": 0.27320197224617004,
+      "learning_rate": 4.293888888888889e-05,
+      "loss": 3.1507,
+      "step": 2272
+    },
+    {
+      "epoch": 0.28171640365289824,
+      "grad_norm": 0.28096139430999756,
+      "learning_rate": 4.2761111111111115e-05,
+      "loss": 3.1474,
+      "step": 2304
+    },
+    {
+      "epoch": 0.2856291314814107,
+      "grad_norm": 0.30300965905189514,
+      "learning_rate": 4.2583333333333336e-05,
+      "loss": 3.15,
+      "step": 2336
+    },
+    {
+      "epoch": 0.2895418593099232,
+      "grad_norm": 0.2996535003185272,
+      "learning_rate": 4.240555555555556e-05,
+      "loss": 3.1528,
+      "step": 2368
+    },
+    {
+      "epoch": 0.2934545871384357,
+      "grad_norm": 0.2503749132156372,
+      "learning_rate": 4.222777777777778e-05,
+      "loss": 3.1522,
+      "step": 2400
+    },
+    {
+      "epoch": 0.2973673149669481,
+      "grad_norm": 0.2272900640964508,
+      "learning_rate": 4.205e-05,
+      "loss": 3.1472,
+      "step": 2432
+    },
+    {
+      "epoch": 0.3012800427954606,
+      "grad_norm": 0.2367839366197586,
+      "learning_rate": 4.1872222222222227e-05,
+      "loss": 3.1479,
+      "step": 2464
+    },
+    {
+      "epoch": 0.3051927706239731,
+      "grad_norm": 0.3656509220600128,
+      "learning_rate": 4.169444444444445e-05,
+      "loss": 3.1506,
+      "step": 2496
+    },
+    {
+      "epoch": 0.3091054984524856,
+      "grad_norm": 0.25474536418914795,
+      "learning_rate": 4.151666666666667e-05,
+      "loss": 3.1506,
+      "step": 2528
+    },
+    {
+      "epoch": 0.31301822628099807,
+      "grad_norm": 0.21729741990566254,
+      "learning_rate": 4.133888888888889e-05,
+      "loss": 3.1466,
+      "step": 2560
+    },
+    {
+      "epoch": 0.31693095410951055,
+      "grad_norm": 0.26999133825302124,
+      "learning_rate": 4.116111111111111e-05,
+      "loss": 3.1468,
+      "step": 2592
+    },
+    {
+      "epoch": 0.320843681938023,
+      "grad_norm": 0.2668827176094055,
+      "learning_rate": 4.098333333333334e-05,
+      "loss": 3.144,
+      "step": 2624
+    },
+    {
+      "epoch": 0.32475640976653547,
+      "grad_norm": 0.24051733314990997,
+      "learning_rate": 4.080555555555556e-05,
+      "loss": 3.1465,
+      "step": 2656
+    },
+    {
+      "epoch": 0.32866913759504796,
+      "grad_norm": 0.24717700481414795,
+      "learning_rate": 4.062777777777778e-05,
+      "loss": 3.1465,
+      "step": 2688
+    },
+    {
+      "epoch": 0.33258186542356044,
+      "grad_norm": 0.23907746374607086,
+      "learning_rate": 4.045000000000001e-05,
+      "loss": 3.1453,
+      "step": 2720
+    },
+    {
+      "epoch": 0.3364945932520729,
+      "grad_norm": 0.24447326362133026,
+      "learning_rate": 4.027222222222222e-05,
+      "loss": 3.1406,
+      "step": 2752
+    },
+    {
+      "epoch": 0.34040732108058536,
+      "grad_norm": 0.25871723890304565,
+      "learning_rate": 4.009444444444444e-05,
+      "loss": 3.1435,
+      "step": 2784
+    },
+    {
+      "epoch": 0.34432004890909784,
+      "grad_norm": 0.3173305094242096,
+      "learning_rate": 3.991666666666667e-05,
+      "loss": 3.1439,
+      "step": 2816
+    },
+    {
+      "epoch": 0.34823277673761033,
+      "grad_norm": 0.2715188264846802,
+      "learning_rate": 3.973888888888889e-05,
+      "loss": 3.1433,
+      "step": 2848
+    },
+    {
+      "epoch": 0.3521455045661228,
+      "grad_norm": 0.2764374315738678,
+      "learning_rate": 3.956111111111112e-05,
+      "loss": 3.1455,
+      "step": 2880
+    },
+    {
+      "epoch": 0.3560582323946353,
+      "grad_norm": 0.3014623522758484,
+      "learning_rate": 3.938333333333333e-05,
+      "loss": 3.1399,
+      "step": 2912
+    },
+    {
+      "epoch": 0.3599709602231478,
+      "grad_norm": 0.22385312616825104,
+      "learning_rate": 3.9205555555555554e-05,
+      "loss": 3.1426,
+      "step": 2944
+    },
+    {
+      "epoch": 0.3638836880516602,
+      "grad_norm": 0.22400549054145813,
+      "learning_rate": 3.902777777777778e-05,
+      "loss": 3.1393,
+      "step": 2976
+    },
+    {
+      "epoch": 0.3677964158801727,
+      "grad_norm": 0.266812801361084,
+      "learning_rate": 3.885e-05,
+      "loss": 3.1426,
+      "step": 3008
+    },
+    {
+      "epoch": 0.3717091437086852,
+      "grad_norm": 0.2830856442451477,
+      "learning_rate": 3.867222222222222e-05,
+      "loss": 3.14,
+      "step": 3040
+    },
+    {
+      "epoch": 0.37562187153719767,
+      "grad_norm": 0.2724515199661255,
+      "learning_rate": 3.8494444444444444e-05,
+      "loss": 3.1419,
+      "step": 3072
+    },
+    {
+      "epoch": 0.37953459936571016,
+      "grad_norm": 0.22998973727226257,
+      "learning_rate": 3.8316666666666665e-05,
+      "loss": 3.139,
+      "step": 3104
+    },
+    {
+      "epoch": 0.3834473271942226,
+      "grad_norm": 0.23931734263896942,
+      "learning_rate": 3.813888888888889e-05,
+      "loss": 3.1408,
+      "step": 3136
+    },
+    {
+      "epoch": 0.3873600550227351,
+      "grad_norm": 0.26907482743263245,
+      "learning_rate": 3.7961111111111114e-05,
+      "loss": 3.1374,
+      "step": 3168
+    },
+    {
+      "epoch": 0.39127278285124756,
+      "grad_norm": 0.24700401723384857,
+      "learning_rate": 3.7783333333333335e-05,
+      "loss": 3.137,
+      "step": 3200
+    },
+    {
+      "epoch": 0.39518551067976004,
+      "grad_norm": 0.2963546812534332,
+      "learning_rate": 3.7605555555555556e-05,
+      "loss": 3.1401,
+      "step": 3232
+    },
+    {
+      "epoch": 0.39909823850827253,
+      "grad_norm": 0.2659439444541931,
+      "learning_rate": 3.7427777777777777e-05,
+      "loss": 3.1387,
+      "step": 3264
+    },
+    {
+      "epoch": 0.403010966336785,
+      "grad_norm": 0.26796412467956543,
+      "learning_rate": 3.7250000000000004e-05,
+      "loss": 3.1403,
+      "step": 3296
+    },
+    {
+      "epoch": 0.40692369416529744,
+      "grad_norm": 0.29361388087272644,
+      "learning_rate": 3.7072222222222225e-05,
+      "loss": 3.1389,
+      "step": 3328
+    },
+    {
+      "epoch": 0.41083642199380993,
+      "grad_norm": 0.24953944981098175,
+      "learning_rate": 3.6894444444444446e-05,
+      "loss": 3.1402,
+      "step": 3360
+    },
+    {
+      "epoch": 0.4147491498223224,
+      "grad_norm": 0.23955155909061432,
+      "learning_rate": 3.671666666666667e-05,
+      "loss": 3.1377,
+      "step": 3392
+    },
+    {
+      "epoch": 0.4186618776508349,
+      "grad_norm": 0.22984126210212708,
+      "learning_rate": 3.653888888888889e-05,
+      "loss": 3.1375,
+      "step": 3424
+    },
+    {
+      "epoch": 0.4225746054793474,
+      "grad_norm": 0.2523467540740967,
+      "learning_rate": 3.6361111111111116e-05,
+      "loss": 3.1364,
+      "step": 3456
+    },
+    {
+      "epoch": 0.4264873333078598,
+      "grad_norm": 0.23315957188606262,
+      "learning_rate": 3.6183333333333336e-05,
+      "loss": 3.1389,
+      "step": 3488
+    },
+    {
+      "epoch": 0.4304000611363723,
+      "grad_norm": 0.22483432292938232,
+      "learning_rate": 3.600555555555556e-05,
+      "loss": 3.1357,
+      "step": 3520
+    },
+    {
+      "epoch": 0.4343127889648848,
+      "grad_norm": 0.23685774207115173,
+      "learning_rate": 3.582777777777778e-05,
+      "loss": 3.136,
+      "step": 3552
+    },
+    {
+      "epoch": 0.4382255167933973,
+      "grad_norm": 0.24475786089897156,
+      "learning_rate": 3.565e-05,
+      "loss": 3.1364,
+      "step": 3584
+    },
+    {
+      "epoch": 0.44213824462190976,
+      "grad_norm": 0.21655669808387756,
+      "learning_rate": 3.547222222222222e-05,
+      "loss": 3.1363,
+      "step": 3616
+    },
+    {
+      "epoch": 0.44605097245042225,
+      "grad_norm": 0.24810287356376648,
+      "learning_rate": 3.529444444444445e-05,
+      "loss": 3.1364,
+      "step": 3648
+    },
+    {
+      "epoch": 0.4499637002789347,
+      "grad_norm": 0.23016402125358582,
+      "learning_rate": 3.511666666666667e-05,
+      "loss": 3.1345,
+      "step": 3680
+    },
+    {
+      "epoch": 0.45387642810744716,
+      "grad_norm": 0.24041368067264557,
+      "learning_rate": 3.4938888888888896e-05,
+      "loss": 3.1389,
+      "step": 3712
+    },
+    {
+      "epoch": 0.45778915593595965,
+      "grad_norm": 0.237365260720253,
+      "learning_rate": 3.476111111111111e-05,
+      "loss": 3.1335,
+      "step": 3744
+    },
+    {
+      "epoch": 0.46170188376447213,
+      "grad_norm": 0.21840572357177734,
+      "learning_rate": 3.458333333333333e-05,
+      "loss": 3.1365,
+      "step": 3776
+    },
+    {
+      "epoch": 0.4656146115929846,
+      "grad_norm": 0.22491848468780518,
+      "learning_rate": 3.440555555555556e-05,
+      "loss": 3.1365,
+      "step": 3808
+    },
+    {
+      "epoch": 0.4695273394214971,
+      "grad_norm": 0.2349662482738495,
+      "learning_rate": 3.422777777777778e-05,
+      "loss": 3.1364,
+      "step": 3840
+    },
+    {
+      "epoch": 0.47344006725000953,
+      "grad_norm": 0.3244574964046478,
+      "learning_rate": 3.405e-05,
+      "loss": 3.1333,
+      "step": 3872
+    },
+    {
+      "epoch": 0.477352795078522,
+      "grad_norm": 0.20271480083465576,
+      "learning_rate": 3.387222222222222e-05,
+      "loss": 3.1337,
+      "step": 3904
+    },
+    {
+      "epoch": 0.4812655229070345,
+      "grad_norm": 0.22787164151668549,
+      "learning_rate": 3.369444444444444e-05,
+      "loss": 3.1359,
+      "step": 3936
+    },
+    {
+      "epoch": 0.485178250735547,
+      "grad_norm": 0.2814686894416809,
+      "learning_rate": 3.351666666666667e-05,
+      "loss": 3.1344,
+      "step": 3968
+    },
+    {
+      "epoch": 0.4890909785640595,
+      "grad_norm": 0.20366469025611877,
+      "learning_rate": 3.333888888888889e-05,
+      "loss": 3.1342,
+      "step": 4000
+    },
+    {
+      "epoch": 0.4930037063925719,
+      "grad_norm": 0.2670027017593384,
+      "learning_rate": 3.316111111111111e-05,
+      "loss": 3.1319,
+      "step": 4032
+    },
+    {
+      "epoch": 0.4969164342210844,
+      "grad_norm": 0.2204466164112091,
+      "learning_rate": 3.298333333333333e-05,
+      "loss": 3.1328,
+      "step": 4064
+    },
+    {
+      "epoch": 0.5008291620495969,
+      "grad_norm": 0.2765197157859802,
+      "learning_rate": 3.2805555555555554e-05,
+      "loss": 3.132,
+      "step": 4096
+    },
+    {
+      "epoch": 0.5047418898781093,
+      "grad_norm": 0.2624960243701935,
+      "learning_rate": 3.262777777777778e-05,
+      "loss": 3.1348,
+      "step": 4128
+    },
+    {
+      "epoch": 0.5086546177066218,
+      "grad_norm": 0.2254333347082138,
+      "learning_rate": 3.245e-05,
+      "loss": 3.1327,
+      "step": 4160
+    },
+    {
+      "epoch": 0.5125673455351343,
+      "grad_norm": 0.25047773122787476,
+      "learning_rate": 3.2272222222222224e-05,
+      "loss": 3.1318,
+      "step": 4192
+    },
+    {
+      "epoch": 0.5164800733636468,
+      "grad_norm": 0.23816271126270294,
+      "learning_rate": 3.2094444444444445e-05,
+      "loss": 3.1331,
+      "step": 4224
+    },
+    {
+      "epoch": 0.5203928011921592,
+      "grad_norm": 0.22233732044696808,
+      "learning_rate": 3.1916666666666665e-05,
+      "loss": 3.1315,
+      "step": 4256
+    },
+    {
+      "epoch": 0.5243055290206717,
+      "grad_norm": 0.25133851170539856,
+      "learning_rate": 3.173888888888889e-05,
+      "loss": 3.1333,
+      "step": 4288
+    },
+    {
+      "epoch": 0.5282182568491842,
+      "grad_norm": 0.21504898369312286,
+      "learning_rate": 3.1561111111111114e-05,
+      "loss": 3.1332,
+      "step": 4320
+    },
+    {
+      "epoch": 0.5321309846776967,
+      "grad_norm": 0.2872157394886017,
+      "learning_rate": 3.1383333333333335e-05,
+      "loss": 3.1303,
+      "step": 4352
+    },
+    {
+      "epoch": 0.5360437125062092,
+      "grad_norm": 0.244154691696167,
+      "learning_rate": 3.1205555555555556e-05,
+      "loss": 3.1323,
+      "step": 4384
+    },
+    {
+      "epoch": 0.5399564403347217,
+      "grad_norm": 0.24791453778743744,
+      "learning_rate": 3.102777777777778e-05,
+      "loss": 3.1312,
+      "step": 4416
+    },
+    {
+      "epoch": 0.5438691681632342,
+      "grad_norm": 0.2378605306148529,
+      "learning_rate": 3.0850000000000004e-05,
+      "loss": 3.1309,
+      "step": 4448
+    },
+    {
+      "epoch": 0.5477818959917465,
+      "grad_norm": 0.21514585614204407,
+      "learning_rate": 3.0672222222222225e-05,
+      "loss": 3.1244,
+      "step": 4480
+    },
+    {
+      "epoch": 0.551694623820259,
+      "grad_norm": 0.22684329748153687,
+      "learning_rate": 3.0494444444444446e-05,
+      "loss": 3.1297,
+      "step": 4512
+    },
+    {
+      "epoch": 0.5556073516487715,
+      "grad_norm": 0.21271203458309174,
+      "learning_rate": 3.0316666666666664e-05,
+      "loss": 3.1286,
+      "step": 4544
+    },
+    {
+      "epoch": 0.559520079477284,
+      "grad_norm": 0.22873900830745697,
+      "learning_rate": 3.0138888888888888e-05,
+      "loss": 3.1262,
+      "step": 4576
+    },
+    {
+      "epoch": 0.5634328073057965,
+      "grad_norm": 0.24229228496551514,
+      "learning_rate": 2.9961111111111112e-05,
+      "loss": 3.1312,
+      "step": 4608
+    },
+    {
+      "epoch": 0.567345535134309,
+      "grad_norm": 0.2754037380218506,
+      "learning_rate": 2.9783333333333337e-05,
+      "loss": 3.1296,
+      "step": 4640
+    },
+    {
+      "epoch": 0.5712582629628215,
+      "grad_norm": 0.20053815841674805,
+      "learning_rate": 2.9605555555555558e-05,
+      "loss": 3.128,
+      "step": 4672
+    },
+    {
+      "epoch": 0.5751709907913339,
+      "grad_norm": 0.24577876925468445,
+      "learning_rate": 2.9427777777777782e-05,
+      "loss": 3.1302,
+      "step": 4704
+    },
+    {
+      "epoch": 0.5790837186198464,
+      "grad_norm": 0.2547786235809326,
+      "learning_rate": 2.925e-05,
+      "loss": 3.1263,
+      "step": 4736
+    },
+    {
+      "epoch": 0.5829964464483589,
+      "grad_norm": 0.18451441824436188,
+      "learning_rate": 2.9072222222222224e-05,
+      "loss": 3.1282,
+      "step": 4768
+    },
+    {
+      "epoch": 0.5869091742768714,
+      "grad_norm": 0.21002881228923798,
+      "learning_rate": 2.8894444444444445e-05,
+      "loss": 3.1271,
+      "step": 4800
+    },
+    {
+      "epoch": 0.5908219021053838,
+      "grad_norm": 0.21180187165737152,
+      "learning_rate": 2.871666666666667e-05,
+      "loss": 3.1272,
+      "step": 4832
+    },
+    {
+      "epoch": 0.5947346299338963,
+      "grad_norm": 0.2123003453016281,
+      "learning_rate": 2.8538888888888893e-05,
+      "loss": 3.1285,
+      "step": 4864
+    },
+    {
+      "epoch": 0.5986473577624087,
+      "grad_norm": 0.20064932107925415,
+      "learning_rate": 2.836111111111111e-05,
+      "loss": 3.1289,
+      "step": 4896
+    },
+    {
+      "epoch": 0.6025600855909212,
+      "grad_norm": 0.19583889842033386,
+      "learning_rate": 2.8183333333333335e-05,
+      "loss": 3.128,
+      "step": 4928
+    },
+    {
+      "epoch": 0.6064728134194337,
+      "grad_norm": 0.1817025989294052,
+      "learning_rate": 2.8005555555555556e-05,
+      "loss": 3.1263,
+      "step": 4960
+    },
+    {
+      "epoch": 0.6103855412479462,
+      "grad_norm": 0.18323124945163727,
+      "learning_rate": 2.782777777777778e-05,
+      "loss": 3.1276,
+      "step": 4992
+    },
+    {
+      "epoch": 0.6142982690764587,
+      "grad_norm": 0.21348968148231506,
+      "learning_rate": 2.7650000000000005e-05,
+      "loss": 3.1262,
+      "step": 5024
+    },
+    {
+      "epoch": 0.6182109969049712,
+      "grad_norm": 0.24803143739700317,
+      "learning_rate": 2.7472222222222222e-05,
+      "loss": 3.1278,
+      "step": 5056
+    },
+    {
+      "epoch": 0.6221237247334837,
+      "grad_norm": 0.27887552976608276,
+      "learning_rate": 2.7294444444444443e-05,
+      "loss": 3.1261,
+      "step": 5088
+    },
+    {
+      "epoch": 0.6260364525619961,
+      "grad_norm": 0.20992670953273773,
+      "learning_rate": 2.7116666666666667e-05,
+      "loss": 3.1248,
+      "step": 5120
+    },
+    {
+      "epoch": 0.6299491803905086,
+      "grad_norm": 0.20632390677928925,
+      "learning_rate": 2.693888888888889e-05,
+      "loss": 3.1295,
+      "step": 5152
+    },
+    {
+      "epoch": 0.6338619082190211,
+      "grad_norm": 0.22720162570476532,
+      "learning_rate": 2.6761111111111116e-05,
+      "loss": 3.124,
+      "step": 5184
+    },
+    {
+      "epoch": 0.6377746360475335,
+      "grad_norm": 0.20604351162910461,
+      "learning_rate": 2.6583333333333333e-05,
+      "loss": 3.1246,
+      "step": 5216
+    },
+    {
+      "epoch": 0.641687363876046,
+      "grad_norm": 0.21567173302173615,
+      "learning_rate": 2.6405555555555554e-05,
+      "loss": 3.1266,
+      "step": 5248
+    },
+    {
+      "epoch": 0.6456000917045585,
+      "grad_norm": 0.22443106770515442,
+      "learning_rate": 2.622777777777778e-05,
+      "loss": 3.1265,
+      "step": 5280
+    },
+    {
+      "epoch": 0.6495128195330709,
+      "grad_norm": 0.2323237955570221,
+      "learning_rate": 2.6050000000000003e-05,
+      "loss": 3.1214,
+      "step": 5312
+    },
+    {
+      "epoch": 0.6534255473615834,
+      "grad_norm": 0.21166770160198212,
+      "learning_rate": 2.5872222222222224e-05,
+      "loss": 3.125,
+      "step": 5344
+    },
+    {
+      "epoch": 0.6573382751900959,
+      "grad_norm": 0.21922937035560608,
+      "learning_rate": 2.5694444444444445e-05,
+      "loss": 3.1236,
+      "step": 5376
+    },
+    {
+      "epoch": 0.6612510030186084,
+      "grad_norm": 0.19853883981704712,
+      "learning_rate": 2.5516666666666666e-05,
+      "loss": 3.1256,
+      "step": 5408
+    },
+    {
+      "epoch": 0.6651637308471209,
+      "grad_norm": 0.22357633709907532,
+      "learning_rate": 2.533888888888889e-05,
+      "loss": 3.1257,
+      "step": 5440
+    },
+    {
+      "epoch": 0.6690764586756334,
+      "grad_norm": 0.22123898565769196,
+      "learning_rate": 2.5161111111111114e-05,
+      "loss": 3.1265,
+      "step": 5472
+    },
+    {
+      "epoch": 0.6729891865041459,
+      "grad_norm": 0.20758691430091858,
+      "learning_rate": 2.4983333333333335e-05,
+      "loss": 3.1244,
+      "step": 5504
+    },
+    {
+      "epoch": 0.6769019143326583,
+      "grad_norm": 0.19084863364696503,
+      "learning_rate": 2.4805555555555556e-05,
+      "loss": 3.124,
+      "step": 5536
+    },
+    {
+      "epoch": 0.6808146421611707,
+      "grad_norm": 0.21082304418087006,
+      "learning_rate": 2.462777777777778e-05,
+      "loss": 3.1247,
+      "step": 5568
+    },
+    {
+      "epoch": 0.6847273699896832,
+      "grad_norm": 0.19547946751117706,
+      "learning_rate": 2.445e-05,
+      "loss": 3.1254,
+      "step": 5600
+    },
+    {
+      "epoch": 0.6886400978181957,
+      "grad_norm": 0.20289190113544464,
+      "learning_rate": 2.4272222222222222e-05,
+      "loss": 3.1274,
+      "step": 5632
+    },
+    {
+      "epoch": 0.6925528256467082,
+      "grad_norm": 0.21069744229316711,
+      "learning_rate": 2.4094444444444443e-05,
+      "loss": 3.1235,
+      "step": 5664
+    },
+    {
+      "epoch": 0.6964655534752207,
+      "grad_norm": 0.20337700843811035,
+      "learning_rate": 2.3916666666666668e-05,
+      "loss": 3.1253,
+      "step": 5696
+    },
+    {
+      "epoch": 0.7003782813037331,
+      "grad_norm": 0.2150067836046219,
+      "learning_rate": 2.3738888888888892e-05,
+      "loss": 3.1255,
+      "step": 5728
+    },
+    {
+      "epoch": 0.7042910091322456,
+      "grad_norm": 0.1990475058555603,
+      "learning_rate": 2.3561111111111113e-05,
+      "loss": 3.1247,
+      "step": 5760
+    },
+    {
+      "epoch": 0.7082037369607581,
+      "grad_norm": 0.20272456109523773,
+      "learning_rate": 2.3383333333333334e-05,
+      "loss": 3.1235,
+      "step": 5792
+    },
+    {
+      "epoch": 0.7121164647892706,
+      "grad_norm": 0.21050025522708893,
+      "learning_rate": 2.3205555555555555e-05,
+      "loss": 3.1226,
+      "step": 5824
+    },
+    {
+      "epoch": 0.7160291926177831,
+      "grad_norm": 0.2530113160610199,
+      "learning_rate": 2.302777777777778e-05,
+      "loss": 3.1242,
+      "step": 5856
+    },
+    {
+      "epoch": 0.7199419204462956,
+      "grad_norm": 0.2530890703201294,
+      "learning_rate": 2.2850000000000003e-05,
+      "loss": 3.1215,
+      "step": 5888
+    },
+    {
+      "epoch": 0.7238546482748079,
+      "grad_norm": 0.19028717279434204,
+      "learning_rate": 2.2672222222222224e-05,
+      "loss": 3.1236,
+      "step": 5920
+    },
+    {
+      "epoch": 0.7277673761033204,
+      "grad_norm": 0.20547839999198914,
+      "learning_rate": 2.2494444444444445e-05,
+      "loss": 3.1225,
+      "step": 5952
+    },
+    {
+      "epoch": 0.7316801039318329,
+      "grad_norm": 0.19479484856128693,
+      "learning_rate": 2.231666666666667e-05,
+      "loss": 3.1248,
+      "step": 5984
+    },
+    {
+      "epoch": 0.7355928317603454,
+      "grad_norm": 0.2140408456325531,
+      "learning_rate": 2.213888888888889e-05,
+      "loss": 3.1237,
+      "step": 6016
+    },
+    {
+      "epoch": 0.7395055595888579,
+      "grad_norm": 0.17809583246707916,
+      "learning_rate": 2.1961111111111114e-05,
+      "loss": 3.1243,
+      "step": 6048
+    },
+    {
+      "epoch": 0.7434182874173704,
+      "grad_norm": 0.19468888640403748,
+      "learning_rate": 2.1783333333333332e-05,
+      "loss": 3.1246,
+      "step": 6080
+    },
+    {
+      "epoch": 0.7473310152458829,
+      "grad_norm": 0.2106105089187622,
+      "learning_rate": 2.1605555555555556e-05,
+      "loss": 3.1224,
+      "step": 6112
+    },
+    {
+      "epoch": 0.7512437430743953,
+      "grad_norm": 0.20489418506622314,
+      "learning_rate": 2.142777777777778e-05,
+      "loss": 3.1237,
+      "step": 6144
+    },
+    {
+      "epoch": 0.7551564709029078,
+      "grad_norm": 0.2453160136938095,
+      "learning_rate": 2.125e-05,
+      "loss": 3.1212,
+      "step": 6176
+    },
+    {
+      "epoch": 0.7590691987314203,
+      "grad_norm": 0.2121828943490982,
+      "learning_rate": 2.1072222222222222e-05,
+      "loss": 3.1192,
+      "step": 6208
+    },
+    {
+      "epoch": 0.7629819265599328,
+      "grad_norm": 0.18198275566101074,
+      "learning_rate": 2.0894444444444443e-05,
+      "loss": 3.1213,
+      "step": 6240
+    },
+    {
+      "epoch": 0.7668946543884452,
+      "grad_norm": 0.1795693039894104,
+      "learning_rate": 2.0716666666666668e-05,
+      "loss": 3.1201,
+      "step": 6272
+    },
+    {
+      "epoch": 0.7708073822169577,
+      "grad_norm": 0.24014544486999512,
+      "learning_rate": 2.0538888888888892e-05,
+      "loss": 3.122,
+      "step": 6304
+    },
+    {
+      "epoch": 0.7747201100454701,
+      "grad_norm": 0.20040743052959442,
+      "learning_rate": 2.0361111111111113e-05,
+      "loss": 3.1207,
+      "step": 6336
+    },
+    {
+      "epoch": 0.7786328378739826,
+      "grad_norm": 0.2076857089996338,
+      "learning_rate": 2.0183333333333334e-05,
+      "loss": 3.1245,
+      "step": 6368
+    },
+    {
+      "epoch": 0.7825455657024951,
+      "grad_norm": 0.19411978125572205,
+      "learning_rate": 2.0005555555555555e-05,
+      "loss": 3.1216,
+      "step": 6400
+    },
+    {
+      "epoch": 0.7864582935310076,
+      "grad_norm": 0.17701873183250427,
+      "learning_rate": 1.982777777777778e-05,
+      "loss": 3.1228,
+      "step": 6432
+    },
+    {
+      "epoch": 0.7903710213595201,
+      "grad_norm": 0.19787663221359253,
+      "learning_rate": 1.9650000000000003e-05,
+      "loss": 3.122,
+      "step": 6464
+    },
+    {
+      "epoch": 0.7942837491880326,
+      "grad_norm": 0.18991973996162415,
+      "learning_rate": 1.947222222222222e-05,
+      "loss": 3.1211,
+      "step": 6496
+    },
+    {
+      "epoch": 0.7981964770165451,
+      "grad_norm": 0.18508349359035492,
+      "learning_rate": 1.9294444444444445e-05,
+      "loss": 3.1211,
+      "step": 6528
+    },
+    {
+      "epoch": 0.8021092048450575,
+      "grad_norm": 0.17648939788341522,
+      "learning_rate": 1.911666666666667e-05,
+      "loss": 3.1237,
+      "step": 6560
+    },
+    {
+      "epoch": 0.80602193267357,
+      "grad_norm": 0.20672652125358582,
+      "learning_rate": 1.893888888888889e-05,
+      "loss": 3.1213,
+      "step": 6592
+    },
+    {
+      "epoch": 0.8099346605020824,
+      "grad_norm": 0.21490968763828278,
+      "learning_rate": 1.876111111111111e-05,
+      "loss": 3.1201,
+      "step": 6624
+    },
+    {
+      "epoch": 0.8138473883305949,
+      "grad_norm": 0.20175087451934814,
+      "learning_rate": 1.8583333333333332e-05,
+      "loss": 3.1184,
+      "step": 6656
+    },
+    {
+      "epoch": 0.8177601161591074,
+      "grad_norm": 0.17700786888599396,
+      "learning_rate": 1.8405555555555556e-05,
+      "loss": 3.1194,
+      "step": 6688
+    },
+    {
+      "epoch": 0.8216728439876199,
+      "grad_norm": 0.19697381556034088,
+      "learning_rate": 1.822777777777778e-05,
+      "loss": 3.1208,
+      "step": 6720
+    },
+    {
+      "epoch": 0.8255855718161323,
+      "grad_norm": 0.19516746699810028,
+      "learning_rate": 1.805e-05,
+      "loss": 3.122,
+      "step": 6752
+    },
+    {
+      "epoch": 0.8294982996446448,
+      "grad_norm": 0.19233250617980957,
+      "learning_rate": 1.7872222222222223e-05,
+      "loss": 3.1237,
+      "step": 6784
+    },
+    {
+      "epoch": 0.8334110274731573,
+      "grad_norm": 0.20740792155265808,
+      "learning_rate": 1.7694444444444443e-05,
+      "loss": 3.1227,
+      "step": 6816
+    },
+    {
+      "epoch": 0.8373237553016698,
+      "grad_norm": 0.18789739906787872,
+      "learning_rate": 1.7516666666666668e-05,
+      "loss": 3.1198,
+      "step": 6848
+    },
+    {
+      "epoch": 0.8412364831301823,
+      "grad_norm": 0.17981740832328796,
+      "learning_rate": 1.7338888888888892e-05,
+      "loss": 3.121,
+      "step": 6880
+    },
+    {
+      "epoch": 0.8451492109586948,
+      "grad_norm": 0.2110264003276825,
+      "learning_rate": 1.716111111111111e-05,
+      "loss": 3.1186,
+      "step": 6912
+    },
+    {
+      "epoch": 0.8490619387872073,
+      "grad_norm": 0.19858282804489136,
+      "learning_rate": 1.6983333333333334e-05,
+      "loss": 3.1236,
+      "step": 6944
+    },
+    {
+      "epoch": 0.8529746666157196,
+      "grad_norm": 0.17566311359405518,
+      "learning_rate": 1.6805555555555558e-05,
+      "loss": 3.1225,
+      "step": 6976
+    },
+    {
+      "epoch": 0.8568873944442321,
+      "grad_norm": 0.19274671375751495,
+      "learning_rate": 1.662777777777778e-05,
+      "loss": 3.1197,
+      "step": 7008
+    },
+    {
+      "epoch": 0.8608001222727446,
+      "grad_norm": 0.20043255388736725,
+      "learning_rate": 1.645e-05,
+      "loss": 3.1221,
+      "step": 7040
+    },
+    {
+      "epoch": 0.8647128501012571,
+      "grad_norm": 0.17369119822978973,
+      "learning_rate": 1.627222222222222e-05,
+      "loss": 3.119,
+      "step": 7072
+    },
+    {
+      "epoch": 0.8686255779297696,
+      "grad_norm": 0.18795572221279144,
+      "learning_rate": 1.6094444444444445e-05,
+      "loss": 3.116,
+      "step": 7104
+    },
+    {
+      "epoch": 0.8725383057582821,
+      "grad_norm": 0.20084317028522491,
+      "learning_rate": 1.591666666666667e-05,
+      "loss": 3.1164,
+      "step": 7136
+    },
+    {
+      "epoch": 0.8764510335867945,
+      "grad_norm": 0.1732749342918396,
+      "learning_rate": 1.573888888888889e-05,
+      "loss": 3.1184,
+      "step": 7168
+    },
+    {
+      "epoch": 0.880363761415307,
+      "grad_norm": 0.18775592744350433,
+      "learning_rate": 1.556111111111111e-05,
+      "loss": 3.1186,
+      "step": 7200
+    },
+    {
+      "epoch": 0.8842764892438195,
+      "grad_norm": 0.1810338944196701,
+      "learning_rate": 1.5383333333333332e-05,
+      "loss": 3.1211,
+      "step": 7232
+    },
+    {
+      "epoch": 0.888189217072332,
+      "grad_norm": 0.17264607548713684,
+      "learning_rate": 1.5205555555555557e-05,
+      "loss": 3.115,
+      "step": 7264
+    },
+    {
+      "epoch": 0.8921019449008445,
+      "grad_norm": 0.18331947922706604,
+      "learning_rate": 1.502777777777778e-05,
+      "loss": 3.1176,
+      "step": 7296
+    },
+    {
+      "epoch": 0.896014672729357,
+      "grad_norm": 0.1883401870727539,
+      "learning_rate": 1.485e-05,
+      "loss": 3.1194,
+      "step": 7328
+    },
+    {
+      "epoch": 0.8999274005578694,
+      "grad_norm": 0.17407892644405365,
+      "learning_rate": 1.4672222222222223e-05,
+      "loss": 3.1188,
+      "step": 7360
+    },
+    {
+      "epoch": 0.9038401283863818,
+      "grad_norm": 0.1941099464893341,
+      "learning_rate": 1.4494444444444444e-05,
+      "loss": 3.1211,
+      "step": 7392
+    },
+    {
+      "epoch": 0.9077528562148943,
+      "grad_norm": 0.17381389439105988,
+      "learning_rate": 1.4316666666666668e-05,
+      "loss": 3.1194,
+      "step": 7424
+    },
+    {
+      "epoch": 0.9116655840434068,
+      "grad_norm": 0.18369047343730927,
+      "learning_rate": 1.413888888888889e-05,
+      "loss": 3.1165,
+      "step": 7456
+    },
+    {
+      "epoch": 0.9155783118719193,
+      "grad_norm": 0.17392371594905853,
+      "learning_rate": 1.3961111111111111e-05,
+      "loss": 3.1165,
+      "step": 7488
+    },
+    {
+      "epoch": 0.9194910397004318,
+      "grad_norm": 0.17337463796138763,
+      "learning_rate": 1.3783333333333334e-05,
+      "loss": 3.1192,
+      "step": 7520
+    },
+    {
+      "epoch": 0.9234037675289443,
+      "grad_norm": 0.1813974380493164,
+      "learning_rate": 1.3605555555555557e-05,
+      "loss": 3.1158,
+      "step": 7552
+    },
+    {
+      "epoch": 0.9273164953574567,
+      "grad_norm": 0.1770683377981186,
+      "learning_rate": 1.3427777777777778e-05,
+      "loss": 3.1173,
+      "step": 7584
+    },
+    {
+      "epoch": 0.9312292231859692,
+      "grad_norm": 0.18390090763568878,
+      "learning_rate": 1.3250000000000002e-05,
+      "loss": 3.1211,
+      "step": 7616
+    },
+    {
+      "epoch": 0.9351419510144817,
+      "grad_norm": 0.17356765270233154,
+      "learning_rate": 1.3072222222222221e-05,
+      "loss": 3.1187,
+      "step": 7648
+    },
+    {
+      "epoch": 0.9390546788429942,
+      "grad_norm": 0.173334538936615,
+      "learning_rate": 1.2894444444444445e-05,
+      "loss": 3.1191,
+      "step": 7680
+    },
+    {
+      "epoch": 0.9429674066715066,
+      "grad_norm": 0.18598856031894684,
+      "learning_rate": 1.2716666666666668e-05,
+      "loss": 3.1192,
+      "step": 7712
+    },
+    {
+      "epoch": 0.9468801345000191,
+      "grad_norm": 0.1667858213186264,
+      "learning_rate": 1.2538888888888889e-05,
+      "loss": 3.1173,
+      "step": 7744
+    },
+    {
+      "epoch": 0.9507928623285316,
+      "grad_norm": 0.17433424293994904,
+      "learning_rate": 1.2361111111111112e-05,
+      "loss": 3.1184,
+      "step": 7776
+    },
+    {
+      "epoch": 0.954705590157044,
+      "grad_norm": 0.1921132653951645,
+      "learning_rate": 1.2183333333333334e-05,
+      "loss": 3.119,
+      "step": 7808
+    },
+    {
+      "epoch": 0.9586183179855565,
+      "grad_norm": 0.16437648236751556,
+      "learning_rate": 1.2005555555555557e-05,
+      "loss": 3.1179,
+      "step": 7840
+    },
+    {
+      "epoch": 0.962531045814069,
+      "grad_norm": 0.17323090136051178,
+      "learning_rate": 1.1827777777777778e-05,
+      "loss": 3.1192,
+      "step": 7872
+    },
+    {
+      "epoch": 0.9664437736425815,
+      "grad_norm": 0.16646146774291992,
+      "learning_rate": 1.1650000000000002e-05,
+      "loss": 3.1176,
+      "step": 7904
+    },
+    {
+      "epoch": 0.970356501471094,
+      "grad_norm": 0.18198241293430328,
+      "learning_rate": 1.1472222222222223e-05,
+      "loss": 3.1178,
+      "step": 7936
+    },
+    {
+      "epoch": 0.9742692292996065,
+      "grad_norm": 0.17490531504154205,
+      "learning_rate": 1.1294444444444445e-05,
+      "loss": 3.1161,
+      "step": 7968
+    },
+    {
+      "epoch": 0.978181957128119,
+      "grad_norm": 0.17505322396755219,
+      "learning_rate": 1.1116666666666666e-05,
+      "loss": 3.1213,
+      "step": 8000
+    },
+    {
+      "epoch": 0.9820946849566314,
+      "grad_norm": 0.17005711793899536,
+      "learning_rate": 1.0938888888888889e-05,
+      "loss": 3.1187,
+      "step": 8032
+    },
+    {
+      "epoch": 0.9860074127851438,
+      "grad_norm": 0.18125712871551514,
+      "learning_rate": 1.0761111111111112e-05,
+      "loss": 3.12,
+      "step": 8064
+    },
+    {
+      "epoch": 0.9899201406136563,
+      "grad_norm": 0.17013822495937347,
+      "learning_rate": 1.0583333333333334e-05,
+      "loss": 3.1157,
+      "step": 8096
+    },
+    {
+      "epoch": 0.9938328684421688,
+      "grad_norm": 0.1698048710823059,
+      "learning_rate": 1.0405555555555555e-05,
+      "loss": 3.1172,
+      "step": 8128
+    },
+    {
+      "epoch": 0.9977455962706813,
+      "grad_norm": 0.17143802344799042,
+      "learning_rate": 1.0227777777777778e-05,
+      "loss": 3.1153,
+      "step": 8160
+    },
+    {
+      "epoch": 1.0015895456803332,
+      "grad_norm": 0.1739780455827713,
+      "learning_rate": 1.005e-05,
+      "loss": 3.1163,
+      "step": 8192
+    },
+    {
+      "epoch": 1.0055022735088457,
+      "grad_norm": 0.17907440662384033,
+      "learning_rate": 9.872222222222223e-06,
+      "loss": 3.1143,
+      "step": 8224
+    },
+    {
+      "epoch": 1.0094150013373582,
+      "grad_norm": 0.17365169525146484,
+      "learning_rate": 9.694444444444446e-06,
+      "loss": 3.1157,
+      "step": 8256
+    },
+    {
+      "epoch": 1.0133277291658707,
+      "grad_norm": 0.1645737588405609,
+      "learning_rate": 9.516666666666666e-06,
+      "loss": 3.1134,
+      "step": 8288
+    },
+    {
+      "epoch": 1.0172404569943831,
+      "grad_norm": 0.15174245834350586,
+      "learning_rate": 9.338888888888889e-06,
+      "loss": 3.1142,
+      "step": 8320
+    },
+    {
+      "epoch": 1.0211531848228956,
+      "grad_norm": 0.16984011232852936,
+      "learning_rate": 9.161111111111112e-06,
+      "loss": 3.1142,
+      "step": 8352
+    },
+    {
+      "epoch": 1.0250659126514081,
+      "grad_norm": 0.1772463321685791,
+      "learning_rate": 8.983333333333334e-06,
+      "loss": 3.1178,
+      "step": 8384
+    },
+    {
+      "epoch": 1.0289786404799206,
+      "grad_norm": 0.16304141283035278,
+      "learning_rate": 8.805555555555555e-06,
+      "loss": 3.113,
+      "step": 8416
+    },
+    {
+      "epoch": 1.032891368308433,
+      "grad_norm": 0.15513816475868225,
+      "learning_rate": 8.627777777777778e-06,
+      "loss": 3.1145,
+      "step": 8448
+    },
+    {
+      "epoch": 1.0368040961369456,
+      "grad_norm": 0.1862088292837143,
+      "learning_rate": 8.45e-06,
+      "loss": 3.1109,
+      "step": 8480
+    },
+    {
+      "epoch": 1.0407168239654578,
+      "grad_norm": 0.17995817959308624,
+      "learning_rate": 8.272222222222223e-06,
+      "loss": 3.1128,
+      "step": 8512
+    },
+    {
+      "epoch": 1.0446295517939703,
+      "grad_norm": 0.1758676916360855,
+      "learning_rate": 8.094444444444444e-06,
+      "loss": 3.1128,
+      "step": 8544
+    },
+    {
+      "epoch": 1.0485422796224828,
+      "grad_norm": 0.16609688103199005,
+      "learning_rate": 7.916666666666667e-06,
+      "loss": 3.114,
+      "step": 8576
+    },
+    {
+      "epoch": 1.0524550074509953,
+      "grad_norm": 0.15258896350860596,
+      "learning_rate": 7.738888888888889e-06,
+      "loss": 3.1171,
+      "step": 8608
+    },
+    {
+      "epoch": 1.0563677352795078,
+      "grad_norm": 0.16240954399108887,
+      "learning_rate": 7.561111111111112e-06,
+      "loss": 3.113,
+      "step": 8640
+    },
+    {
+      "epoch": 1.0602804631080203,
+      "grad_norm": 0.16423362493515015,
+      "learning_rate": 7.3833333333333335e-06,
+      "loss": 3.1154,
+      "step": 8672
+    },
+    {
+      "epoch": 1.0641931909365328,
+      "grad_norm": 0.17032068967819214,
+      "learning_rate": 7.205555555555555e-06,
+      "loss": 3.1146,
+      "step": 8704
+    },
+    {
+      "epoch": 1.0681059187650452,
+      "grad_norm": 0.1564359813928604,
+      "learning_rate": 7.027777777777778e-06,
+      "loss": 3.1162,
+      "step": 8736
+    },
+    {
+      "epoch": 1.0720186465935577,
+      "grad_norm": 0.15838623046875,
+      "learning_rate": 6.8500000000000005e-06,
+      "loss": 3.113,
+      "step": 8768
+    },
+    {
+      "epoch": 1.0759313744220702,
+      "grad_norm": 0.17325465381145477,
+      "learning_rate": 6.672222222222223e-06,
+      "loss": 3.1153,
+      "step": 8800
+    },
+    {
+      "epoch": 1.0798441022505827,
+      "grad_norm": 0.16170760989189148,
+      "learning_rate": 6.494444444444445e-06,
+      "loss": 3.115,
+      "step": 8832
+    },
+    {
+      "epoch": 1.0837568300790952,
+      "grad_norm": 0.15591956675052643,
+      "learning_rate": 6.316666666666667e-06,
+      "loss": 3.1088,
+      "step": 8864
+    },
+    {
+      "epoch": 1.0876695579076077,
+      "grad_norm": 0.15115121006965637,
+      "learning_rate": 6.138888888888889e-06,
+      "loss": 3.1103,
+      "step": 8896
+    },
+    {
+      "epoch": 1.0915822857361202,
+      "grad_norm": 0.1577509045600891,
+      "learning_rate": 5.961111111111111e-06,
+      "loss": 3.112,
+      "step": 8928
+    },
+    {
+      "epoch": 1.0954950135646326,
+      "grad_norm": 0.1545899361371994,
+      "learning_rate": 5.783333333333334e-06,
+      "loss": 3.1108,
+      "step": 8960
+    },
+    {
+      "epoch": 1.0994077413931451,
+      "grad_norm": 0.1597297489643097,
+      "learning_rate": 5.605555555555555e-06,
+      "loss": 3.1172,
+      "step": 8992
+    },
+    {
+      "epoch": 1.1033204692216576,
+      "grad_norm": 0.16016387939453125,
+      "learning_rate": 5.427777777777778e-06,
+      "loss": 3.1156,
+      "step": 9024
+    },
+    {
+      "epoch": 1.10723319705017,
+      "grad_norm": 0.15304987132549286,
+      "learning_rate": 5.25e-06,
+      "loss": 3.1126,
+      "step": 9056
+    },
+    {
+      "epoch": 1.1111459248786826,
+      "grad_norm": 0.1560225784778595,
+      "learning_rate": 5.072222222222222e-06,
+      "loss": 3.1152,
+      "step": 9088
+    },
+    {
+      "epoch": 1.115058652707195,
+      "grad_norm": 0.16613492369651794,
+      "learning_rate": 4.894444444444445e-06,
+      "loss": 3.1147,
+      "step": 9120
+    },
+    {
+      "epoch": 1.1189713805357075,
+      "grad_norm": 0.15055406093597412,
+      "learning_rate": 4.7166666666666675e-06,
+      "loss": 3.1116,
+      "step": 9152
+    },
+    {
+      "epoch": 1.12288410836422,
+      "grad_norm": 0.16280752420425415,
+      "learning_rate": 4.538888888888889e-06,
+      "loss": 3.1148,
+      "step": 9184
+    },
+    {
+      "epoch": 1.1267968361927325,
+      "grad_norm": 0.1523207277059555,
+      "learning_rate": 4.361111111111112e-06,
+      "loss": 3.1133,
+      "step": 9216
+    },
+    {
+      "epoch": 1.1307095640212448,
+      "grad_norm": 0.1500737965106964,
+      "learning_rate": 4.183333333333334e-06,
+      "loss": 3.1177,
+      "step": 9248
+    },
+    {
+      "epoch": 1.1346222918497573,
+      "grad_norm": 0.16134943068027496,
+      "learning_rate": 4.005555555555555e-06,
+      "loss": 3.1143,
+      "step": 9280
+    },
+    {
+      "epoch": 1.1385350196782698,
+      "grad_norm": 0.1499546766281128,
+      "learning_rate": 3.827777777777778e-06,
+      "loss": 3.1133,
+      "step": 9312
+    },
+    {
+      "epoch": 1.1424477475067822,
+      "grad_norm": 0.15620845556259155,
+      "learning_rate": 3.6499999999999998e-06,
+      "loss": 3.1122,
+      "step": 9344
+    },
+    {
+      "epoch": 1.1463604753352947,
+      "grad_norm": 0.15544985234737396,
+      "learning_rate": 3.4722222222222224e-06,
+      "loss": 3.1146,
+      "step": 9376
+    },
+    {
+      "epoch": 1.1502732031638072,
+      "grad_norm": 0.15928788483142853,
+      "learning_rate": 3.2944444444444446e-06,
+      "loss": 3.1123,
+      "step": 9408
+    },
+    {
+      "epoch": 1.1541859309923197,
+      "grad_norm": 0.14999979734420776,
+      "learning_rate": 3.1166666666666668e-06,
+      "loss": 3.1149,
+      "step": 9440
+    },
+    {
+      "epoch": 1.1580986588208322,
+      "grad_norm": 0.15014442801475525,
+      "learning_rate": 2.938888888888889e-06,
+      "loss": 3.1113,
+      "step": 9472
+    },
+    {
+      "epoch": 1.1620113866493447,
+      "grad_norm": 0.14749625325202942,
+      "learning_rate": 2.761111111111111e-06,
+      "loss": 3.113,
+      "step": 9504
+    },
+    {
+      "epoch": 1.1659241144778572,
+      "grad_norm": 0.14931970834732056,
+      "learning_rate": 2.5833333333333333e-06,
+      "loss": 3.1144,
+      "step": 9536
+    },
+    {
+      "epoch": 1.1698368423063696,
+      "grad_norm": 0.14572674036026,
+      "learning_rate": 2.4055555555555555e-06,
+      "loss": 3.1093,
+      "step": 9568
+    },
+    {
+      "epoch": 1.1737495701348821,
+      "grad_norm": 0.15361888706684113,
+      "learning_rate": 2.227777777777778e-06,
+      "loss": 3.1138,
+      "step": 9600
+    },
+    {
+      "epoch": 1.1776622979633946,
+      "grad_norm": 0.1433536857366562,
+      "learning_rate": 2.0500000000000003e-06,
+      "loss": 3.1123,
+      "step": 9632
+    },
+    {
+      "epoch": 1.181575025791907,
+      "grad_norm": 0.14533208310604095,
+      "learning_rate": 1.8722222222222225e-06,
+      "loss": 3.1116,
+      "step": 9664
+    },
+    {
+      "epoch": 1.1854877536204196,
+      "grad_norm": 0.14816279709339142,
+      "learning_rate": 1.6944444444444446e-06,
+      "loss": 3.1128,
+      "step": 9696
+    },
+    {
+      "epoch": 1.189400481448932,
+      "grad_norm": 0.14798638224601746,
+      "learning_rate": 1.5166666666666668e-06,
+      "loss": 3.116,
+      "step": 9728
+    },
+    {
+      "epoch": 1.1933132092774446,
+      "grad_norm": 0.1386597454547882,
+      "learning_rate": 1.338888888888889e-06,
+      "loss": 3.1145,
+      "step": 9760
+    },
+    {
+      "epoch": 1.197225937105957,
+      "grad_norm": 0.14148685336112976,
+      "learning_rate": 1.161111111111111e-06,
+      "loss": 3.1115,
+      "step": 9792
+    },
+    {
+      "epoch": 1.2011386649344695,
+      "grad_norm": 0.14324016869068146,
+      "learning_rate": 9.833333333333334e-07,
+      "loss": 3.1117,
+      "step": 9824
+    },
+    {
+      "epoch": 1.205051392762982,
+      "grad_norm": 0.14499281346797943,
+      "learning_rate": 8.055555555555556e-07,
+      "loss": 3.1129,
+      "step": 9856
+    },
+    {
+      "epoch": 1.2089641205914945,
+      "grad_norm": 0.1464635133743286,
+      "learning_rate": 6.277777777777778e-07,
+      "loss": 3.1169,
+      "step": 9888
+    },
+    {
+      "epoch": 1.2128768484200068,
+      "grad_norm": 0.14767299592494965,
+      "learning_rate": 4.5e-07,
+      "loss": 3.1131,
+      "step": 9920
+    },
+    {
+      "epoch": 1.2167895762485195,
+      "grad_norm": 0.14456725120544434,
+      "learning_rate": 2.722222222222222e-07,
+      "loss": 3.116,
+      "step": 9952
+    },
+    {
+      "epoch": 1.2207023040770317,
+      "grad_norm": 0.1386868953704834,
+      "learning_rate": 9.444444444444445e-08,
+      "loss": 3.1151,
+      "step": 9984
+    },
+    {
+      "epoch": 1.222658667991288,
+      "step": 10000,
+      "total_flos": 8.246852548747592e+18,
+      "train_loss": 1.5593041332244872,
+      "train_runtime": 85792.9956,
+      "train_samples_per_second": 238.714,
+      "train_steps_per_second": 0.117
+    }
+  ],
+  "logging_steps": 32,
+  "max_steps": 10000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8.246852548747592e+18,
+  "train_batch_size": 64,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b37590802fddc45a46f216e638339c053760e6371d80ec1e79399b9524c81317
+size 6033

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff