summerstars committed on Apr 27

Commit

00a9e74

verified ·

1 Parent(s): 71d2512

Upload continued pretraining checkpoint EN-summer-cpt

Browse files

Files changed (32) hide show

checkpoint-2000/config.json +33 -0
checkpoint-2000/generation_config.json +10 -0
checkpoint-2000/model.safetensors +3 -0
checkpoint-2000/modeling_minimythos_hybrid.py +337 -0
checkpoint-2000/optimizer.pt +3 -0
checkpoint-2000/rng_state.pth +3 -0
checkpoint-2000/scheduler.pt +3 -0
checkpoint-2000/trainer_state.json +1434 -0
checkpoint-2000/training_args.bin +3 -0
checkpoint-2500/config.json +33 -0
checkpoint-2500/generation_config.json +10 -0
checkpoint-2500/model.safetensors +3 -0
checkpoint-2500/modeling_minimythos_hybrid.py +337 -0
checkpoint-2500/optimizer.pt +3 -0
checkpoint-2500/rng_state.pth +3 -0
checkpoint-2500/scheduler.pt +3 -0
checkpoint-2500/trainer_state.json +1784 -0
checkpoint-2500/training_args.bin +3 -0
checkpoint-3000/config.json +33 -0
checkpoint-3000/generation_config.json +10 -0
checkpoint-3000/model.safetensors +3 -0
checkpoint-3000/modeling_minimythos_hybrid.py +337 -0
checkpoint-3000/optimizer.pt +3 -0
checkpoint-3000/rng_state.pth +3 -0
checkpoint-3000/scheduler.pt +3 -0
checkpoint-3000/trainer_state.json +2134 -0
checkpoint-3000/training_args.bin +3 -0
config.json +25 -18
generation_config.json +4 -3
model.safetensors +2 -2
tokenizer_config.json +1 -0
training_args.bin +3 -0

checkpoint-2000/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "architectures": [
+    "MiniMythosHybridForCausalLM"
+  ],
+  "auto_map": {
+    "AutoConfig": "modeling_minimythos_hybrid.MiniMythosHybridConfig",
+    "AutoModelForCausalLM": "modeling_minimythos_hybrid.MiniMythosHybridForCausalLM"
+  },
+  "block_size": 1024,
+  "bos_token_id": 2,
+  "dropout": 0.0,
+  "dtype": "bfloat16",
+  "eos_token_id": 3,
+  "hidden_size": 1024,
+  "input_scale": 0.2,
+  "is_decoder": true,
+  "leak_rate": 0.25,
+  "max_position_embeddings": 1024,
+  "mlp_mult": 4,
+  "model_type": "minimythos_hybrid",
+  "n_attn_layers": 4,
+  "n_embd": 1024,
+  "n_head": 16,
+  "n_reservoir_layers": 4,
+  "num_attention_heads": 16,
+  "num_hidden_layers": 8,
+  "pad_token_id": 0,
+  "reservoir_scale": 0.9,
+  "tie_word_embeddings": false,
+  "transformers_version": "5.0.0",
+  "use_cache": false,
+  "vocab_size": 8192
+}

checkpoint-2000/generation_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "bos_token_id": 2,
+  "do_sample": true,
+  "eos_token_id": 3,
+  "max_new_tokens": 256,
+  "pad_token_id": 0,
+  "temperature": 0.7,
+  "top_k": 50,
+  "transformers_version": "5.0.0"
+}

checkpoint-2000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0e224f900873a30aa8292e11c3889254111899fc3e79344a290486c7b6bf0b80
+size 184580440

checkpoint-2000/modeling_minimythos_hybrid.py ADDED Viewed

	@@ -0,0 +1,337 @@

+# coding=utf-8
+"""
+MiniMythos Hybrid Reservoir LM
+Hugging Face remote-code compatible modeling file.
+Load with:
+    from transformers import AutoTokenizer, AutoModelForCausalLM
+    model_id = "summerstars/EN-summer"
+    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
+"""
+import math
+from typing import Optional, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PretrainedConfig, PreTrainedModel
+from transformers.modeling_outputs import CausalLMOutputWithPast
+class MiniMythosHybridConfig(PretrainedConfig):
+    model_type = "minimythos_hybrid"
+    def __init__(
+        self,
+        vocab_size: int = 8192,
+        block_size: int = 512,
+        n_embd: int = 1024,
+        n_reservoir_layers: int = 6,
+        n_attn_layers: int = 4,
+        n_head: int = 16,
+        mlp_mult: int = 4,
+        dropout: float = 0.0,
+        leak_rate: float = 0.25,
+        reservoir_scale: float = 0.90,
+        input_scale: float = 0.20,
+        pad_token_id: int = 0,
+        bos_token_id: int = 2,
+        eos_token_id: int = 3,
+        tie_word_embeddings: bool = False,
+        **kwargs,
+    ):
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.block_size = block_size
+        self.n_embd = n_embd
+        self.n_reservoir_layers = n_reservoir_layers
+        self.n_attn_layers = n_attn_layers
+        self.n_head = n_head
+        self.mlp_mult = mlp_mult
+        self.dropout = dropout
+        self.leak_rate = leak_rate
+        self.reservoir_scale = reservoir_scale
+        self.input_scale = input_scale
+        self.hidden_size = n_embd
+        self.num_hidden_layers = n_reservoir_layers + n_attn_layers
+        self.num_attention_heads = n_head
+        self.max_position_embeddings = block_size
+        self.is_decoder = True
+        self.is_encoder_decoder = False
+class RMSNorm(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(dim))
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # Compute RMSNorm in fp32 for numerical stability, then cast back.
+        orig_dtype = x.dtype
+        x_float = x.float()
+        var = x_float.pow(2).mean(-1, keepdim=True)
+        x_norm = x_float * torch.rsqrt(var + self.eps)
+        return (self.weight.float() * x_norm).to(orig_dtype)
+class ReservoirBlock(nn.Module):
+    def __init__(self, config: MiniMythosHybridConfig):
+        super().__init__()
+        self.leak_rate = config.leak_rate
+        self.drop = nn.Dropout(config.dropout)
+        self.in_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
+        self.reservoir = nn.Linear(config.n_embd, config.n_embd, bias=False)
+        self.norm = RMSNorm(config.n_embd)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        z = torch.tanh(self.in_proj(x) + self.reservoir(x))
+        x = self.leak_rate * x + (1.0 - self.leak_rate) * z
+        x = self.norm(x)
+        return self.drop(x)
+class CausalSelfAttention(nn.Module):
+    def __init__(self, config: MiniMythosHybridConfig):
+        super().__init__()
+        if config.n_embd % config.n_head != 0:
+            raise ValueError("n_embd must be divisible by n_head")
+        self.n_head = config.n_head
+        self.head_dim = config.n_embd // config.n_head
+        if self.head_dim % 2 != 0:
+            raise ValueError("head_dim must be even for RoPE")
+        self.qkv = nn.Linear(config.n_embd, 3 * config.n_embd, bias=False)
+        self.proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
+        self.dropout = config.dropout
+        self.resid_drop = nn.Dropout(config.dropout)
+        inv_freq = 1.0 / (
+            10000 ** (torch.arange(0, self.head_dim, 2, dtype=torch.float32) / self.head_dim)
+        )
+        t = torch.arange(config.block_size, dtype=torch.float32)
+        freqs = torch.outer(t, inv_freq)
+        self.register_buffer("rope_cos", freqs.cos()[None, :, None, :], persistent=False)
+        self.register_buffer("rope_sin", freqs.sin()[None, :, None, :], persistent=False)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        batch_size, seq_len, channels = x.shape
+        q, k, v = self.qkv(x).reshape(
+            batch_size, seq_len, 3, self.n_head, self.head_dim
+        ).unbind(dim=2)
+        cos = self.rope_cos[:, :seq_len].to(dtype=q.dtype, device=q.device)
+        sin = self.rope_sin[:, :seq_len].to(dtype=q.dtype, device=q.device)
+        def apply_rope(u: torch.Tensor) -> torch.Tensor:
+            u_e = u[..., 0::2]
+            u_o = u[..., 1::2]
+            return torch.stack(
+                (u_e * cos - u_o * sin, u_e * sin + u_o * cos), dim=-1
+            ).flatten(-2)
+        q = apply_rope(q).transpose(1, 2)
+        k = apply_rope(k).transpose(1, 2)
+        v = v.transpose(1, 2)
+        y = F.scaled_dot_product_attention(
+            q,
+            k,
+            v,
+            dropout_p=self.dropout if self.training else 0.0,
+            is_causal=True,
+        )
+        y = y.transpose(1, 2).contiguous().reshape(batch_size, seq_len, channels)
+        return self.resid_drop(self.proj(y))
+class SwiGLU(nn.Module):
+    def __init__(self, config: MiniMythosHybridConfig):
+        super().__init__()
+        hidden = config.mlp_mult * config.n_embd
+        self.w1 = nn.Linear(config.n_embd, hidden, bias=False)
+        self.w2 = nn.Linear(config.n_embd, hidden, bias=False)
+        self.w3 = nn.Linear(hidden, config.n_embd, bias=False)
+        self.drop = nn.Dropout(config.dropout)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.drop(self.w3(F.silu(self.w1(x)) * self.w2(x)))
+class TransformerBlock(nn.Module):
+    def __init__(self, config: MiniMythosHybridConfig):
+        super().__init__()
+        self.norm1 = RMSNorm(config.n_embd)
+        self.attn = CausalSelfAttention(config)
+        self.norm2 = RMSNorm(config.n_embd)
+        self.mlp = SwiGLU(config)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x + self.attn(self.norm1(x))
+        x = x + self.mlp(self.norm2(x))
+        return x
+class MiniMythosHybridForCausalLM(PreTrainedModel):
+    config_class = MiniMythosHybridConfig
+    base_model_prefix = "model"
+    main_input_name = "input_ids"
+    supports_gradient_checkpointing = False
+    _tied_weights_keys = []
+    all_tied_weights_keys = {}
+    def __init__(self, config: MiniMythosHybridConfig):
+        super().__init__(config)
+        self.config = config
+        self.token_emb = nn.Embedding(config.vocab_size, config.n_embd)
+        self.drop = nn.Dropout(config.dropout)
+        self.reservoir_layers = nn.ModuleList(
+            [ReservoirBlock(config) for _ in range(config.n_reservoir_layers)]
+        )
+        self.attn_layers = nn.ModuleList(
+            [TransformerBlock(config) for _ in range(config.n_attn_layers)]
+        )
+        self.norm = RMSNorm(config.n_embd)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+    def get_input_embeddings(self):
+        return self.token_emb
+    def set_input_embeddings(self, value):
+        self.token_emb = value
+    def get_output_embeddings(self):
+        return self.lm_head
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+    def tie_weights(self, *args, **kwargs):
+        if getattr(self.config, "tie_word_embeddings", False):
+            self._tie_or_clone_weights(self.lm_head, self.token_emb)
+        return None
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("Specify either input_ids or inputs_embeds, not both.")
+        if input_ids is None and inputs_embeds is None:
+            raise ValueError("You must specify input_ids or inputs_embeds.")
+        if inputs_embeds is None:
+            if input_ids.shape[1] > self.config.block_size:
+                input_ids = input_ids[:, -self.config.block_size:]
+                if labels is not None:
+                    labels = labels[:, -self.config.block_size:]
+            x = self.token_emb(input_ids)
+        else:
+            if inputs_embeds.shape[1] > self.config.block_size:
+                inputs_embeds = inputs_embeds[:, -self.config.block_size:, :]
+                if labels is not None:
+                    labels = labels[:, -self.config.block_size:]
+            x = inputs_embeds
+        hidden_states = [] if output_hidden_states else None
+        x = self.drop(x)
+        for layer in self.reservoir_layers:
+            if output_hidden_states:
+                hidden_states.append(x)
+            x = layer(x)
+        for layer in self.attn_layers:
+            if output_hidden_states:
+                hidden_states.append(x)
+            x = layer(x)
+        x = self.norm(x)
+        if output_hidden_states:
+            hidden_states.append(x)
+        logits = self.lm_head(x)
+        # Prevent generation from crashing if a checkpoint contains unstable values.
+        # This should not hide training issues, but it makes inference robust.
+        logits = torch.nan_to_num(logits, nan=0.0, posinf=1e4, neginf=-1e4)
+        loss = None
+        if labels is not None:
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            loss = F.cross_entropy(
+                shift_logits.view(-1, shift_logits.size(-1)),
+                shift_labels.view(-1),
+                ignore_index=-100,
+            )
+        if not return_dict:
+            output = (logits,)
+            if use_cache:
+                output = output + (None,)
+            if output_hidden_states:
+                output = output + (tuple(hidden_states),)
+            return ((loss,) + output) if loss is not None else output
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=None,
+            hidden_states=tuple(hidden_states) if output_hidden_states else None,
+            attentions=None,
+        )
+    def prepare_inputs_for_generation(
+        self,
+        input_ids: torch.LongTensor,
+        past_key_values=None,
+        attention_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs,
+    ):
+        if input_ids is not None and input_ids.shape[1] > self.config.block_size:
+            input_ids = input_ids[:, -self.config.block_size:]
+            if attention_mask is not None:
+                attention_mask = attention_mask[:, -self.config.block_size:]
+        if inputs_embeds is not None and input_ids is not None and input_ids.shape[1] == 1:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+        model_inputs.update(
+            {
+                "past_key_values": None,
+                "use_cache": False,
+                "attention_mask": attention_mask,
+            }
+        )
+        return model_inputs
+    @staticmethod
+    def _reorder_cache(past_key_values, beam_idx):
+        return past_key_values

checkpoint-2000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1984d07e3c2e824e935bc81b009009f0a02b31c073d8cbd2b2f67f66ba5c35ce
+size 369189643

checkpoint-2000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:61c19bab1174704a4a4441475683bf1270277af15d2e2c95e964789128e482c4
+size 14645

checkpoint-2000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:35c3225739899cb4fd6f5d032cd74a7cb9e2e1de82a40710b71861118677b6eb
+size 1465

checkpoint-2000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1434 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.6666666666666666,
+  "eval_steps": 500,
+  "global_step": 2000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0033333333333333335,
+      "grad_norm": 85.0,
+      "learning_rate": 7.2e-08,
+      "loss": 98.92244262695313,
+      "step": 10
+    },
+    {
+      "epoch": 0.006666666666666667,
+      "grad_norm": 93.0,
+      "learning_rate": 1.5199999999999998e-07,
+      "loss": 98.39124145507813,
+      "step": 20
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 90.0,
+      "learning_rate": 2.3199999999999999e-07,
+      "loss": 98.9702392578125,
+      "step": 30
+    },
+    {
+      "epoch": 0.013333333333333334,
+      "grad_norm": 90.0,
+      "learning_rate": 3.12e-07,
+      "loss": 99.00128173828125,
+      "step": 40
+    },
+    {
+      "epoch": 0.016666666666666666,
+      "grad_norm": 86.0,
+      "learning_rate": 3.9199999999999996e-07,
+      "loss": 98.24757690429688,
+      "step": 50
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 86.0,
+      "learning_rate": 4.7199999999999994e-07,
+      "loss": 100.627099609375,
+      "step": 60
+    },
+    {
+      "epoch": 0.023333333333333334,
+      "grad_norm": 88.5,
+      "learning_rate": 5.52e-07,
+      "loss": 99.09172973632812,
+      "step": 70
+    },
+    {
+      "epoch": 0.02666666666666667,
+      "grad_norm": 84.5,
+      "learning_rate": 6.32e-07,
+      "loss": 99.2964111328125,
+      "step": 80
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 82.5,
+      "learning_rate": 7.12e-07,
+      "loss": 98.79883422851563,
+      "step": 90
+    },
+    {
+      "epoch": 0.03333333333333333,
+      "grad_norm": 83.5,
+      "learning_rate": 7.92e-07,
+      "loss": 97.59891967773437,
+      "step": 100
+    },
+    {
+      "epoch": 0.03666666666666667,
+      "grad_norm": 84.5,
+      "learning_rate": 7.999809885464028e-07,
+      "loss": 98.88760375976562,
+      "step": 110
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 82.0,
+      "learning_rate": 7.999152722615145e-07,
+      "loss": 100.03501586914062,
+      "step": 120
+    },
+    {
+      "epoch": 0.043333333333333335,
+      "grad_norm": 81.5,
+      "learning_rate": 7.998026241462926e-07,
+      "loss": 99.05869140625,
+      "step": 130
+    },
+    {
+      "epoch": 0.04666666666666667,
+      "grad_norm": 84.5,
+      "learning_rate": 7.996430574204927e-07,
+      "loss": 98.06262817382813,
+      "step": 140
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 91.0,
+      "learning_rate": 7.994365908099776e-07,
+      "loss": 99.91973876953125,
+      "step": 150
+    },
+    {
+      "epoch": 0.05333333333333334,
+      "grad_norm": 88.5,
+      "learning_rate": 7.991832485445195e-07,
+      "loss": 99.073046875,
+      "step": 160
+    },
+    {
+      "epoch": 0.056666666666666664,
+      "grad_norm": 84.5,
+      "learning_rate": 7.988830603549564e-07,
+      "loss": 98.0226318359375,
+      "step": 170
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 90.5,
+      "learning_rate": 7.985360614697036e-07,
+      "loss": 99.20685424804688,
+      "step": 180
+    },
+    {
+      "epoch": 0.06333333333333334,
+      "grad_norm": 85.5,
+      "learning_rate": 7.981422926106186e-07,
+      "loss": 98.71968383789063,
+      "step": 190
+    },
+    {
+      "epoch": 0.06666666666666667,
+      "grad_norm": 82.5,
+      "learning_rate": 7.977017999882226e-07,
+      "loss": 99.76533203125,
+      "step": 200
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 81.0,
+      "learning_rate": 7.972146352962785e-07,
+      "loss": 98.85039672851562,
+      "step": 210
+    },
+    {
+      "epoch": 0.07333333333333333,
+      "grad_norm": 79.5,
+      "learning_rate": 7.966808557057225e-07,
+      "loss": 96.81197509765624,
+      "step": 220
+    },
+    {
+      "epoch": 0.07666666666666666,
+      "grad_norm": 94.5,
+      "learning_rate": 7.961005238579563e-07,
+      "loss": 98.03938598632813,
+      "step": 230
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 74.5,
+      "learning_rate": 7.954737078574952e-07,
+      "loss": 97.90469970703126,
+      "step": 240
+    },
+    {
+      "epoch": 0.08333333333333333,
+      "grad_norm": 87.0,
+      "learning_rate": 7.948004812639763e-07,
+      "loss": 99.83749389648438,
+      "step": 250
+    },
+    {
+      "epoch": 0.08666666666666667,
+      "grad_norm": 87.5,
+      "learning_rate": 7.940809230835248e-07,
+      "loss": 99.23988647460938,
+      "step": 260
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 86.5,
+      "learning_rate": 7.933151177594838e-07,
+      "loss": 99.12615966796875,
+      "step": 270
+    },
+    {
+      "epoch": 0.09333333333333334,
+      "grad_norm": 86.0,
+      "learning_rate": 7.925031551625037e-07,
+      "loss": 99.5418212890625,
+      "step": 280
+    },
+    {
+      "epoch": 0.09666666666666666,
+      "grad_norm": 80.5,
+      "learning_rate": 7.916451305799951e-07,
+      "loss": 98.88993530273437,
+      "step": 290
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 81.5,
+      "learning_rate": 7.907411447049468e-07,
+      "loss": 98.3482177734375,
+      "step": 300
+    },
+    {
+      "epoch": 0.10333333333333333,
+      "grad_norm": 82.5,
+      "learning_rate": 7.897913036241098e-07,
+      "loss": 98.17821655273437,
+      "step": 310
+    },
+    {
+      "epoch": 0.10666666666666667,
+      "grad_norm": 78.5,
+      "learning_rate": 7.88795718805546e-07,
+      "loss": 98.89118041992188,
+      "step": 320
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 83.0,
+      "learning_rate": 7.87754507085548e-07,
+      "loss": 98.15654907226562,
+      "step": 330
+    },
+    {
+      "epoch": 0.11333333333333333,
+      "grad_norm": 84.5,
+      "learning_rate": 7.86667790654928e-07,
+      "loss": 97.81229858398437,
+      "step": 340
+    },
+    {
+      "epoch": 0.11666666666666667,
+      "grad_norm": 84.0,
+      "learning_rate": 7.85535697044677e-07,
+      "loss": 99.2128662109375,
+      "step": 350
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 83.5,
+      "learning_rate": 7.843583591109998e-07,
+      "loss": 98.5178955078125,
+      "step": 360
+    },
+    {
+      "epoch": 0.12333333333333334,
+      "grad_norm": 86.5,
+      "learning_rate": 7.83135915019723e-07,
+      "loss": 98.7326416015625,
+      "step": 370
+    },
+    {
+      "epoch": 0.12666666666666668,
+      "grad_norm": 83.0,
+      "learning_rate": 7.818685082300806e-07,
+      "loss": 98.09605712890625,
+      "step": 380
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 82.0,
+      "learning_rate": 7.805562874778789e-07,
+      "loss": 99.17857666015625,
+      "step": 390
+    },
+    {
+      "epoch": 0.13333333333333333,
+      "grad_norm": 78.5,
+      "learning_rate": 7.791994067580411e-07,
+      "loss": 97.64859619140626,
+      "step": 400
+    },
+    {
+      "epoch": 0.13666666666666666,
+      "grad_norm": 84.5,
+      "learning_rate": 7.77798025306536e-07,
+      "loss": 96.37807006835938,
+      "step": 410
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 83.5,
+      "learning_rate": 7.763523075816902e-07,
+      "loss": 99.2608642578125,
+      "step": 420
+    },
+    {
+      "epoch": 0.14333333333333334,
+      "grad_norm": 77.0,
+      "learning_rate": 7.748624232448886e-07,
+      "loss": 99.56741333007812,
+      "step": 430
+    },
+    {
+      "epoch": 0.14666666666666667,
+      "grad_norm": 78.0,
+      "learning_rate": 7.733285471406642e-07,
+      "loss": 99.81188354492187,
+      "step": 440
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 82.5,
+      "learning_rate": 7.717508592761785e-07,
+      "loss": 98.52041015625,
+      "step": 450
+    },
+    {
+      "epoch": 0.15333333333333332,
+      "grad_norm": 82.0,
+      "learning_rate": 7.701295448000974e-07,
+      "loss": 99.01697387695313,
+      "step": 460
+    },
+    {
+      "epoch": 0.15666666666666668,
+      "grad_norm": 79.5,
+      "learning_rate": 7.684647939808636e-07,
+      "loss": 98.38013916015625,
+      "step": 470
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 78.5,
+      "learning_rate": 7.667568021843666e-07,
+      "loss": 97.33247680664063,
+      "step": 480
+    },
+    {
+      "epoch": 0.16333333333333333,
+      "grad_norm": 82.5,
+      "learning_rate": 7.650057698510164e-07,
+      "loss": 97.90450439453124,
+      "step": 490
+    },
+    {
+      "epoch": 0.16666666666666666,
+      "grad_norm": 84.5,
+      "learning_rate": 7.632119024722212e-07,
+      "loss": 98.53453369140625,
+      "step": 500
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 91.0,
+      "learning_rate": 7.613754105662717e-07,
+      "loss": 98.44060668945312,
+      "step": 510
+    },
+    {
+      "epoch": 0.17333333333333334,
+      "grad_norm": 83.5,
+      "learning_rate": 7.594965096536353e-07,
+      "loss": 98.7102294921875,
+      "step": 520
+    },
+    {
+      "epoch": 0.17666666666666667,
+      "grad_norm": 78.5,
+      "learning_rate": 7.575754202316649e-07,
+      "loss": 97.73778076171875,
+      "step": 530
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 83.5,
+      "learning_rate": 7.556123677487218e-07,
+      "loss": 99.01414184570312,
+      "step": 540
+    },
+    {
+      "epoch": 0.18333333333333332,
+      "grad_norm": 80.0,
+      "learning_rate": 7.536075825777187e-07,
+      "loss": 100.30809936523437,
+      "step": 550
+    },
+    {
+      "epoch": 0.18666666666666668,
+      "grad_norm": 86.5,
+      "learning_rate": 7.515612999890841e-07,
+      "loss": 99.7580322265625,
+      "step": 560
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 84.5,
+      "learning_rate": 7.494737601231523e-07,
+      "loss": 98.83981323242188,
+      "step": 570
+    },
+    {
+      "epoch": 0.19333333333333333,
+      "grad_norm": 81.5,
+      "learning_rate": 7.473452079619826e-07,
+      "loss": 98.02926635742188,
+      "step": 580
+    },
+    {
+      "epoch": 0.19666666666666666,
+      "grad_norm": 83.5,
+      "learning_rate": 7.451758933006086e-07,
+      "loss": 98.2523681640625,
+      "step": 590
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 75.5,
+      "learning_rate": 7.429660707177239e-07,
+      "loss": 97.91044311523437,
+      "step": 600
+    },
+    {
+      "epoch": 0.20333333333333334,
+      "grad_norm": 80.0,
+      "learning_rate": 7.407159995458066e-07,
+      "loss": 98.97639770507813,
+      "step": 610
+    },
+    {
+      "epoch": 0.20666666666666667,
+      "grad_norm": 80.0,
+      "learning_rate": 7.384259438406848e-07,
+      "loss": 96.91513671875,
+      "step": 620
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 75.0,
+      "learning_rate": 7.360961723505495e-07,
+      "loss": 98.74181518554687,
+      "step": 630
+    },
+    {
+      "epoch": 0.21333333333333335,
+      "grad_norm": 82.0,
+      "learning_rate": 7.337269584844142e-07,
+      "loss": 98.77709350585937,
+      "step": 640
+    },
+    {
+      "epoch": 0.21666666666666667,
+      "grad_norm": 82.0,
+      "learning_rate": 7.313185802800312e-07,
+      "loss": 98.27448120117188,
+      "step": 650
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 85.0,
+      "learning_rate": 7.288713203712605e-07,
+      "loss": 98.03839111328125,
+      "step": 660
+    },
+    {
+      "epoch": 0.22333333333333333,
+      "grad_norm": 91.5,
+      "learning_rate": 7.263854659549032e-07,
+      "loss": 97.03794555664062,
+      "step": 670
+    },
+    {
+      "epoch": 0.22666666666666666,
+      "grad_norm": 78.0,
+      "learning_rate": 7.23861308756997e-07,
+      "loss": 98.09403686523437,
+      "step": 680
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 91.0,
+      "learning_rate": 7.212991449985802e-07,
+      "loss": 98.54012451171874,
+      "step": 690
+    },
+    {
+      "epoch": 0.23333333333333334,
+      "grad_norm": 85.0,
+      "learning_rate": 7.186992753609302e-07,
+      "loss": 97.112890625,
+      "step": 700
+    },
+    {
+      "epoch": 0.23666666666666666,
+      "grad_norm": 83.0,
+      "learning_rate": 7.160620049502761e-07,
+      "loss": 97.43547973632812,
+      "step": 710
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 81.0,
+      "learning_rate": 7.133876432619936e-07,
+      "loss": 97.59598388671876,
+      "step": 720
+    },
+    {
+      "epoch": 0.24333333333333335,
+      "grad_norm": 82.0,
+      "learning_rate": 7.106765041442847e-07,
+      "loss": 98.765087890625,
+      "step": 730
+    },
+    {
+      "epoch": 0.24666666666666667,
+      "grad_norm": 82.5,
+      "learning_rate": 7.079289057613449e-07,
+      "loss": 98.65599365234375,
+      "step": 740
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 85.5,
+      "learning_rate": 7.051451705560269e-07,
+      "loss": 100.546728515625,
+      "step": 750
+    },
+    {
+      "epoch": 0.25333333333333335,
+      "grad_norm": 81.5,
+      "learning_rate": 7.023256252119996e-07,
+      "loss": 99.18995361328125,
+      "step": 760
+    },
+    {
+      "epoch": 0.25666666666666665,
+      "grad_norm": 83.5,
+      "learning_rate": 6.994706006154102e-07,
+      "loss": 98.26092529296875,
+      "step": 770
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 77.5,
+      "learning_rate": 6.965804318160538e-07,
+      "loss": 100.81718139648437,
+      "step": 780
+    },
+    {
+      "epoch": 0.2633333333333333,
+      "grad_norm": 106.0,
+      "learning_rate": 6.936554579880531e-07,
+      "loss": 99.88375244140624,
+      "step": 790
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 81.0,
+      "learning_rate": 6.906960223900558e-07,
+      "loss": 98.47666015625,
+      "step": 800
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 79.0,
+      "learning_rate": 6.877024723249506e-07,
+      "loss": 98.1271240234375,
+      "step": 810
+    },
+    {
+      "epoch": 0.2733333333333333,
+      "grad_norm": 86.0,
+      "learning_rate": 6.846751590991103e-07,
+      "loss": 97.95358276367188,
+      "step": 820
+    },
+    {
+      "epoch": 0.27666666666666667,
+      "grad_norm": 85.5,
+      "learning_rate": 6.816144379811647e-07,
+      "loss": 97.06906127929688,
+      "step": 830
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 80.0,
+      "learning_rate": 6.785206681603071e-07,
+      "loss": 97.89542236328126,
+      "step": 840
+    },
+    {
+      "epoch": 0.2833333333333333,
+      "grad_norm": 76.5,
+      "learning_rate": 6.753942127041434e-07,
+      "loss": 98.07276611328125,
+      "step": 850
+    },
+    {
+      "epoch": 0.2866666666666667,
+      "grad_norm": 82.0,
+      "learning_rate": 6.722354385160832e-07,
+      "loss": 98.10612182617187,
+      "step": 860
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 85.0,
+      "learning_rate": 6.690447162922828e-07,
+      "loss": 97.93245239257813,
+      "step": 870
+    },
+    {
+      "epoch": 0.29333333333333333,
+      "grad_norm": 79.0,
+      "learning_rate": 6.65822420478142e-07,
+      "loss": 97.72557373046875,
+      "step": 880
+    },
+    {
+      "epoch": 0.2966666666666667,
+      "grad_norm": 80.0,
+      "learning_rate": 6.625689292243618e-07,
+      "loss": 97.9116455078125,
+      "step": 890
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 83.5,
+      "learning_rate": 6.59284624342566e-07,
+      "loss": 97.18661499023438,
+      "step": 900
+    },
+    {
+      "epoch": 0.30333333333333334,
+      "grad_norm": 82.5,
+      "learning_rate": 6.55969891260494e-07,
+      "loss": 97.79299926757812,
+      "step": 910
+    },
+    {
+      "epoch": 0.30666666666666664,
+      "grad_norm": 79.5,
+      "learning_rate": 6.526251189767701e-07,
+      "loss": 97.34827270507813,
+      "step": 920
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 80.5,
+      "learning_rate": 6.492507000152516e-07,
+      "loss": 98.7041748046875,
+      "step": 930
+    },
+    {
+      "epoch": 0.31333333333333335,
+      "grad_norm": 82.0,
+      "learning_rate": 6.458470303789652e-07,
+      "loss": 98.721240234375,
+      "step": 940
+    },
+    {
+      "epoch": 0.31666666666666665,
+      "grad_norm": 85.0,
+      "learning_rate": 6.424145095036337e-07,
+      "loss": 99.30765991210937,
+      "step": 950
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 77.5,
+      "learning_rate": 6.389535402108008e-07,
+      "loss": 98.2544921875,
+      "step": 960
+    },
+    {
+      "epoch": 0.3233333333333333,
+      "grad_norm": 85.0,
+      "learning_rate": 6.354645286605583e-07,
+      "loss": 97.76597290039062,
+      "step": 970
+    },
+    {
+      "epoch": 0.32666666666666666,
+      "grad_norm": 88.5,
+      "learning_rate": 6.31947884303881e-07,
+      "loss": 98.00664672851562,
+      "step": 980
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 80.0,
+      "learning_rate": 6.284040198345763e-07,
+      "loss": 98.64342651367187,
+      "step": 990
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 92.0,
+      "learning_rate": 6.248333511408522e-07,
+      "loss": 97.74580688476563,
+      "step": 1000
+    },
+    {
+      "epoch": 0.33666666666666667,
+      "grad_norm": 91.0,
+      "learning_rate": 6.212362972565115e-07,
+      "loss": 98.11343994140626,
+      "step": 1010
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 82.0,
+      "learning_rate": 6.176132803117761e-07,
+      "loss": 98.30454711914062,
+      "step": 1020
+    },
+    {
+      "epoch": 0.3433333333333333,
+      "grad_norm": 81.5,
+      "learning_rate": 6.13964725483748e-07,
+      "loss": 97.90970458984376,
+      "step": 1030
+    },
+    {
+      "epoch": 0.3466666666666667,
+      "grad_norm": 85.5,
+      "learning_rate": 6.102910609465133e-07,
+      "loss": 96.98532104492188,
+      "step": 1040
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 81.0,
+      "learning_rate": 6.065927178208936e-07,
+      "loss": 107.39459228515625,
+      "step": 1050
+    },
+    {
+      "epoch": 0.35333333333333333,
+      "grad_norm": 82.5,
+      "learning_rate": 6.028701301238521e-07,
+      "loss": 98.15614624023438,
+      "step": 1060
+    },
+    {
+      "epoch": 0.3566666666666667,
+      "grad_norm": 81.5,
+      "learning_rate": 5.991237347175605e-07,
+      "loss": 98.47268676757812,
+      "step": 1070
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 75.0,
+      "learning_rate": 5.953539712581301e-07,
+      "loss": 97.77119750976563,
+      "step": 1080
+    },
+    {
+      "epoch": 0.36333333333333334,
+      "grad_norm": 78.0,
+      "learning_rate": 5.915612821440172e-07,
+      "loss": 98.135302734375,
+      "step": 1090
+    },
+    {
+      "epoch": 0.36666666666666664,
+      "grad_norm": 76.5,
+      "learning_rate": 5.877461124641053e-07,
+      "loss": 97.557373046875,
+      "step": 1100
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 81.5,
+      "learning_rate": 5.839089099454721e-07,
+      "loss": 98.51465454101563,
+      "step": 1110
+    },
+    {
+      "epoch": 0.37333333333333335,
+      "grad_norm": 81.5,
+      "learning_rate": 5.800501249008462e-07,
+      "loss": 99.03858032226563,
+      "step": 1120
+    },
+    {
+      "epoch": 0.37666666666666665,
+      "grad_norm": 75.5,
+      "learning_rate": 5.761702101757618e-07,
+      "loss": 99.02989501953125,
+      "step": 1130
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 83.5,
+      "learning_rate": 5.722696210954143e-07,
+      "loss": 100.75755004882812,
+      "step": 1140
+    },
+    {
+      "epoch": 0.38333333333333336,
+      "grad_norm": 77.0,
+      "learning_rate": 5.683488154112268e-07,
+      "loss": 98.41821899414063,
+      "step": 1150
+    },
+    {
+      "epoch": 0.38666666666666666,
+      "grad_norm": 77.0,
+      "learning_rate": 5.644082532471301e-07,
+      "loss": 98.3514892578125,
+      "step": 1160
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 87.0,
+      "learning_rate": 5.60448397045566e-07,
+      "loss": 97.98330078125,
+      "step": 1170
+    },
+    {
+      "epoch": 0.3933333333333333,
+      "grad_norm": 78.0,
+      "learning_rate": 5.564697115132166e-07,
+      "loss": 99.36333618164062,
+      "step": 1180
+    },
+    {
+      "epoch": 0.39666666666666667,
+      "grad_norm": 80.0,
+      "learning_rate": 5.524726635664701e-07,
+      "loss": 98.58826293945313,
+      "step": 1190
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 82.0,
+      "learning_rate": 5.484577222766244e-07,
+      "loss": 99.26712646484376,
+      "step": 1200
+    },
+    {
+      "epoch": 0.4033333333333333,
+      "grad_norm": 78.5,
+      "learning_rate": 5.444253588148419e-07,
+      "loss": 99.81590576171875,
+      "step": 1210
+    },
+    {
+      "epoch": 0.4066666666666667,
+      "grad_norm": 89.0,
+      "learning_rate": 5.40376046396853e-07,
+      "loss": 97.74580078125,
+      "step": 1220
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 80.5,
+      "learning_rate": 5.363102602274239e-07,
+      "loss": 98.692529296875,
+      "step": 1230
+    },
+    {
+      "epoch": 0.41333333333333333,
+      "grad_norm": 80.0,
+      "learning_rate": 5.32228477444588e-07,
+      "loss": 99.05579833984375,
+      "step": 1240
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 77.5,
+      "learning_rate": 5.281311770636531e-07,
+      "loss": 98.2328369140625,
+      "step": 1250
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 79.5,
+      "learning_rate": 5.24018839920985e-07,
+      "loss": 98.72367553710937,
+      "step": 1260
+    },
+    {
+      "epoch": 0.42333333333333334,
+      "grad_norm": 80.0,
+      "learning_rate": 5.198919486175807e-07,
+      "loss": 98.29996948242187,
+      "step": 1270
+    },
+    {
+      "epoch": 0.4266666666666667,
+      "grad_norm": 108.0,
+      "learning_rate": 5.157509874624324e-07,
+      "loss": 97.95867919921875,
+      "step": 1280
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 83.0,
+      "learning_rate": 5.115964424156917e-07,
+      "loss": 97.64778442382813,
+      "step": 1290
+    },
+    {
+      "epoch": 0.43333333333333335,
+      "grad_norm": 103.5,
+      "learning_rate": 5.0742880103164e-07,
+      "loss": 97.5724365234375,
+      "step": 1300
+    },
+    {
+      "epoch": 0.43666666666666665,
+      "grad_norm": 85.0,
+      "learning_rate": 5.032485524014726e-07,
+      "loss": 97.94261474609375,
+      "step": 1310
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 81.5,
+      "learning_rate": 4.990561870958998e-07,
+      "loss": 98.76618041992188,
+      "step": 1320
+    },
+    {
+      "epoch": 0.44333333333333336,
+      "grad_norm": 78.5,
+      "learning_rate": 4.948521971075788e-07,
+      "loss": 100.03206176757813,
+      "step": 1330
+    },
+    {
+      "epoch": 0.44666666666666666,
+      "grad_norm": 76.5,
+      "learning_rate": 4.906370757933739e-07,
+      "loss": 97.71541748046874,
+      "step": 1340
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 78.0,
+      "learning_rate": 4.864113178164604e-07,
+      "loss": 98.21603393554688,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4533333333333333,
+      "grad_norm": 80.0,
+      "learning_rate": 4.821754190882729e-07,
+      "loss": 98.40943603515625,
+      "step": 1360
+    },
+    {
+      "epoch": 0.45666666666666667,
+      "grad_norm": 79.0,
+      "learning_rate": 4.779298767103083e-07,
+      "loss": 98.84269409179687,
+      "step": 1370
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 85.5,
+      "learning_rate": 4.736751889157882e-07,
+      "loss": 97.90013427734375,
+      "step": 1380
+    },
+    {
+      "epoch": 0.4633333333333333,
+      "grad_norm": 76.5,
+      "learning_rate": 4.6941185501118975e-07,
+      "loss": 96.82548828125,
+      "step": 1390
+    },
+    {
+      "epoch": 0.4666666666666667,
+      "grad_norm": 78.5,
+      "learning_rate": 4.6514037531764925e-07,
+      "loss": 98.97203979492187,
+      "step": 1400
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 90.5,
+      "learning_rate": 4.608612511122476e-07,
+      "loss": 98.75665893554688,
+      "step": 1410
+    },
+    {
+      "epoch": 0.47333333333333333,
+      "grad_norm": 76.0,
+      "learning_rate": 4.565749845691828e-07,
+      "loss": 97.86590576171875,
+      "step": 1420
+    },
+    {
+      "epoch": 0.4766666666666667,
+      "grad_norm": 78.5,
+      "learning_rate": 4.5228207870083823e-07,
+      "loss": 98.30226440429688,
+      "step": 1430
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 76.5,
+      "learning_rate": 4.479830372987511e-07,
+      "loss": 97.27890625,
+      "step": 1440
+    },
+    {
+      "epoch": 0.48333333333333334,
+      "grad_norm": 82.5,
+      "learning_rate": 4.436783648744911e-07,
+      "loss": 96.8334716796875,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4866666666666667,
+      "grad_norm": 94.5,
+      "learning_rate": 4.3936856660045317e-07,
+      "loss": 100.1866943359375,
+      "step": 1460
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 76.0,
+      "learning_rate": 4.350541482505733e-07,
+      "loss": 97.47962036132813,
+      "step": 1470
+    },
+    {
+      "epoch": 0.49333333333333335,
+      "grad_norm": 80.5,
+      "learning_rate": 4.30735616140974e-07,
+      "loss": 98.73521118164062,
+      "step": 1480
+    },
+    {
+      "epoch": 0.49666666666666665,
+      "grad_norm": 80.0,
+      "learning_rate": 4.2641347707054586e-07,
+      "loss": 97.36817016601563,
+      "step": 1490
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 81.0,
+      "learning_rate": 4.220882382614721e-07,
+      "loss": 98.93515625,
+      "step": 1500
+    },
+    {
+      "epoch": 0.5033333333333333,
+      "grad_norm": 84.5,
+      "learning_rate": 4.177604072997041e-07,
+      "loss": 99.65122680664062,
+      "step": 1510
+    },
+    {
+      "epoch": 0.5066666666666667,
+      "grad_norm": 81.5,
+      "learning_rate": 4.134304920753937e-07,
+      "loss": 99.22744140625,
+      "step": 1520
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 81.0,
+      "learning_rate": 4.090990007232907e-07,
+      "loss": 98.11915893554688,
+      "step": 1530
+    },
+    {
+      "epoch": 0.5133333333333333,
+      "grad_norm": 91.5,
+      "learning_rate": 4.0476644156310994e-07,
+      "loss": 99.190869140625,
+      "step": 1540
+    },
+    {
+      "epoch": 0.5166666666666667,
+      "grad_norm": 76.0,
+      "learning_rate": 4.0043332303987834e-07,
+      "loss": 97.55637817382812,
+      "step": 1550
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 82.5,
+      "learning_rate": 3.961001536642667e-07,
+      "loss": 98.12214965820313,
+      "step": 1560
+    },
+    {
+      "epoch": 0.5233333333333333,
+      "grad_norm": 80.5,
+      "learning_rate": 3.9176744195291366e-07,
+      "loss": 100.78977661132812,
+      "step": 1570
+    },
+    {
+      "epoch": 0.5266666666666666,
+      "grad_norm": 78.0,
+      "learning_rate": 3.874356963687487e-07,
+      "loss": 98.84946899414062,
+      "step": 1580
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 81.5,
+      "learning_rate": 3.831054252613222e-07,
+      "loss": 99.25551147460938,
+      "step": 1590
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 78.0,
+      "learning_rate": 3.787771368071479e-07,
+      "loss": 98.48034057617187,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5366666666666666,
+      "grad_norm": 78.0,
+      "learning_rate": 3.7445133895006673e-07,
+      "loss": 100.24962768554687,
+      "step": 1610
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 77.5,
+      "learning_rate": 3.7012853934163675e-07,
+      "loss": 98.97702026367188,
+      "step": 1620
+    },
+    {
+      "epoch": 0.5433333333333333,
+      "grad_norm": 80.5,
+      "learning_rate": 3.6580924528155834e-07,
+      "loss": 97.33264770507813,
+      "step": 1630
+    },
+    {
+      "epoch": 0.5466666666666666,
+      "grad_norm": 81.5,
+      "learning_rate": 3.6149396365814017e-07,
+      "loss": 98.55026245117188,
+      "step": 1640
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 94.5,
+      "learning_rate": 3.571832008888139e-07,
+      "loss": 97.8692138671875,
+      "step": 1650
+    },
+    {
+      "epoch": 0.5533333333333333,
+      "grad_norm": 79.0,
+      "learning_rate": 3.528774628607033e-07,
+      "loss": 99.7307861328125,
+      "step": 1660
+    },
+    {
+      "epoch": 0.5566666666666666,
+      "grad_norm": 76.0,
+      "learning_rate": 3.485772548712565e-07,
+      "loss": 98.52730712890624,
+      "step": 1670
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 76.5,
+      "learning_rate": 3.442830815689475e-07,
+      "loss": 98.9763427734375,
+      "step": 1680
+    },
+    {
+      "epoch": 0.5633333333333334,
+      "grad_norm": 73.0,
+      "learning_rate": 3.399954468940525e-07,
+      "loss": 97.82731323242187,
+      "step": 1690
+    },
+    {
+      "epoch": 0.5666666666666667,
+      "grad_norm": 76.5,
+      "learning_rate": 3.357148540195112e-07,
+      "loss": 98.65582885742188,
+      "step": 1700
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 87.5,
+      "learning_rate": 3.314418052918764e-07,
+      "loss": 97.91898193359376,
+      "step": 1710
+    },
+    {
+      "epoch": 0.5733333333333334,
+      "grad_norm": 76.5,
+      "learning_rate": 3.2717680217236214e-07,
+      "loss": 98.35755615234375,
+      "step": 1720
+    },
+    {
+      "epoch": 0.5766666666666667,
+      "grad_norm": 95.0,
+      "learning_rate": 3.2292034517799457e-07,
+      "loss": 98.27772827148438,
+      "step": 1730
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 79.0,
+      "learning_rate": 3.1867293382287417e-07,
+      "loss": 97.74335327148438,
+      "step": 1740
+    },
+    {
+      "epoch": 0.5833333333333334,
+      "grad_norm": 87.5,
+      "learning_rate": 3.1443506655955536e-07,
+      "loss": 97.20853881835937,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5866666666666667,
+      "grad_norm": 84.5,
+      "learning_rate": 3.1020724072055136e-07,
+      "loss": 98.31585083007812,
+      "step": 1760
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 84.0,
+      "learning_rate": 3.0598995245996964e-07,
+      "loss": 99.7463623046875,
+      "step": 1770
+    },
+    {
+      "epoch": 0.5933333333333334,
+      "grad_norm": 83.5,
+      "learning_rate": 3.017836966952859e-07,
+      "loss": 98.76014404296875,
+      "step": 1780
+    },
+    {
+      "epoch": 0.5966666666666667,
+      "grad_norm": 138.0,
+      "learning_rate": 2.9758896704926393e-07,
+      "loss": 99.64288330078125,
+      "step": 1790
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 80.5,
+      "learning_rate": 2.93406255792026e-07,
+      "loss": 98.490380859375,
+      "step": 1800
+    },
+    {
+      "epoch": 0.6033333333333334,
+      "grad_norm": 79.0,
+      "learning_rate": 2.8923605378328365e-07,
+      "loss": 97.536083984375,
+      "step": 1810
+    },
+    {
+      "epoch": 0.6066666666666667,
+      "grad_norm": 76.5,
+      "learning_rate": 2.8507885041473197e-07,
+      "loss": 98.35460815429687,
+      "step": 1820
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 80.0,
+      "learning_rate": 2.809351335526184e-07,
+      "loss": 98.06194458007812,
+      "step": 1830
+    },
+    {
+      "epoch": 0.6133333333333333,
+      "grad_norm": 79.0,
+      "learning_rate": 2.7680538948048913e-07,
+      "loss": 97.76737670898437,
+      "step": 1840
+    },
+    {
+      "epoch": 0.6166666666666667,
+      "grad_norm": 82.0,
+      "learning_rate": 2.7269010284212136e-07,
+      "loss": 99.20833740234374,
+      "step": 1850
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 82.5,
+      "learning_rate": 2.685897565846484e-07,
+      "loss": 98.02843627929687,
+      "step": 1860
+    },
+    {
+      "epoch": 0.6233333333333333,
+      "grad_norm": 84.0,
+      "learning_rate": 2.6450483190188343e-07,
+      "loss": 97.90446166992187,
+      "step": 1870
+    },
+    {
+      "epoch": 0.6266666666666667,
+      "grad_norm": 78.0,
+      "learning_rate": 2.604358081778498e-07,
+      "loss": 97.0069091796875,
+      "step": 1880
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 86.0,
+      "learning_rate": 2.5638316293052245e-07,
+      "loss": 98.136376953125,
+      "step": 1890
+    },
+    {
+      "epoch": 0.6333333333333333,
+      "grad_norm": 76.0,
+      "learning_rate": 2.523473717557898e-07,
+      "loss": 98.2837158203125,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6366666666666667,
+      "grad_norm": 83.5,
+      "learning_rate": 2.4832890827163993e-07,
+      "loss": 99.101025390625,
+      "step": 1910
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 85.0,
+      "learning_rate": 2.443282440625797e-07,
+      "loss": 98.89285278320312,
+      "step": 1920
+    },
+    {
+      "epoch": 0.6433333333333333,
+      "grad_norm": 75.5,
+      "learning_rate": 2.403458486242921e-07,
+      "loss": 97.999560546875,
+      "step": 1930
+    },
+    {
+      "epoch": 0.6466666666666666,
+      "grad_norm": 91.5,
+      "learning_rate": 2.3638218930853874e-07,
+      "loss": 96.57308959960938,
+      "step": 1940
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 88.5,
+      "learning_rate": 2.3243773126831448e-07,
+      "loss": 98.37883911132812,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6533333333333333,
+      "grad_norm": 87.0,
+      "learning_rate": 2.2851293740325895e-07,
+      "loss": 97.91729125976562,
+      "step": 1960
+    },
+    {
+      "epoch": 0.6566666666666666,
+      "grad_norm": 80.0,
+      "learning_rate": 2.2460826830533416e-07,
+      "loss": 99.19988403320312,
+      "step": 1970
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 79.0,
+      "learning_rate": 2.2072418220477083e-07,
+      "loss": 97.540087890625,
+      "step": 1980
+    },
+    {
+      "epoch": 0.6633333333333333,
+      "grad_norm": 77.5,
+      "learning_rate": 2.168611349162943e-07,
+      "loss": 98.80101318359375,
+      "step": 1990
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 83.0,
+      "learning_rate": 2.1301957978563152e-07,
+      "loss": 98.22483520507812,
+      "step": 2000
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 3000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 9223372036854775807,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8247645831168000.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-2000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:50806f9bca90fe0b02ef16ab9659f09b9a467f1eb053112098cedbe0578b9473
+size 5137

checkpoint-2500/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "architectures": [
+    "MiniMythosHybridForCausalLM"
+  ],
+  "auto_map": {
+    "AutoConfig": "modeling_minimythos_hybrid.MiniMythosHybridConfig",
+    "AutoModelForCausalLM": "modeling_minimythos_hybrid.MiniMythosHybridForCausalLM"
+  },
+  "block_size": 1024,
+  "bos_token_id": 2,
+  "dropout": 0.0,
+  "dtype": "bfloat16",
+  "eos_token_id": 3,
+  "hidden_size": 1024,
+  "input_scale": 0.2,
+  "is_decoder": true,
+  "leak_rate": 0.25,
+  "max_position_embeddings": 1024,
+  "mlp_mult": 4,
+  "model_type": "minimythos_hybrid",
+  "n_attn_layers": 4,
+  "n_embd": 1024,
+  "n_head": 16,
+  "n_reservoir_layers": 4,
+  "num_attention_heads": 16,
+  "num_hidden_layers": 8,
+  "pad_token_id": 0,
+  "reservoir_scale": 0.9,
+  "tie_word_embeddings": false,
+  "transformers_version": "5.0.0",
+  "use_cache": false,
+  "vocab_size": 8192
+}

checkpoint-2500/generation_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "bos_token_id": 2,
+  "do_sample": true,
+  "eos_token_id": 3,
+  "max_new_tokens": 256,
+  "pad_token_id": 0,
+  "temperature": 0.7,
+  "top_k": 50,
+  "transformers_version": "5.0.0"
+}

checkpoint-2500/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f29b6a09f8850a530c721b3cce54fa8068455c2eedc3b8150f87b88340102303
+size 184580440

checkpoint-2500/modeling_minimythos_hybrid.py ADDED Viewed

	@@ -0,0 +1,337 @@

+# coding=utf-8
+"""
+MiniMythos Hybrid Reservoir LM
+Hugging Face remote-code compatible modeling file.
+Load with:
+    from transformers import AutoTokenizer, AutoModelForCausalLM
+    model_id = "summerstars/EN-summer"
+    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
+"""
+import math
+from typing import Optional, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PretrainedConfig, PreTrainedModel
+from transformers.modeling_outputs import CausalLMOutputWithPast
+class MiniMythosHybridConfig(PretrainedConfig):
+    """Hyper-parameter container for the MiniMythos hybrid reservoir LM.
+
+    Args:
+        vocab_size: Number of rows in the token embedding and LM head.
+        block_size: Maximum sequence length; sizes the RoPE tables and is the
+            truncation window applied by the model.
+        n_embd: Model width.
+        n_reservoir_layers: Number of ``ReservoirBlock`` layers.
+        n_attn_layers: Number of ``TransformerBlock`` layers.
+        n_head: Attention heads per transformer block.
+        mlp_mult: SwiGLU hidden width as a multiple of ``n_embd``.
+        dropout: Dropout probability shared by every dropout site.
+        leak_rate: Weight given to the block input in the reservoir leaky mix.
+        reservoir_scale: Stored on the config; not read by any module in this file.
+        input_scale: Stored on the config; not read by any module in this file.
+        pad_token_id: Forwarded to ``PretrainedConfig``.
+        bos_token_id: Forwarded to ``PretrainedConfig``.
+        eos_token_id: Forwarded to ``PretrainedConfig``.
+        tie_word_embeddings: Forwarded to ``PretrainedConfig``.
+        **kwargs: Any further options, forwarded to ``PretrainedConfig``.
+    """
+    model_type = "minimythos_hybrid"
+    def __init__(
+        self,
+        vocab_size: int = 8192,
+        block_size: int = 512,
+        n_embd: int = 1024,
+        n_reservoir_layers: int = 6,
+        n_attn_layers: int = 4,
+        n_head: int = 16,
+        mlp_mult: int = 4,
+        dropout: float = 0.0,
+        leak_rate: float = 0.25,
+        reservoir_scale: float = 0.90,
+        input_scale: float = 0.20,
+        pad_token_id: int = 0,
+        bos_token_id: int = 2,
+        eos_token_id: int = 3,
+        tie_word_embeddings: bool = False,
+        **kwargs,
+    ):
+        # Special-token ids and the tying flag are owned by the base class.
+        base_options = dict(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+        )
+        super().__init__(**base_options, **kwargs)
+        total_layers = n_reservoir_layers + n_attn_layers
+        # Native hyper-parameters, stored under the names the modules read.
+        self.vocab_size = vocab_size
+        self.block_size = block_size
+        self.n_embd = n_embd
+        self.n_reservoir_layers = n_reservoir_layers
+        self.n_attn_layers = n_attn_layers
+        self.n_head = n_head
+        self.mlp_mult = mlp_mult
+        self.dropout = dropout
+        self.leak_rate = leak_rate
+        self.reservoir_scale = reservoir_scale
+        self.input_scale = input_scale
+        # Aliases under the conventional Hugging Face attribute names.
+        self.hidden_size = n_embd
+        self.num_hidden_layers = total_layers
+        self.num_attention_heads = n_head
+        self.max_position_embeddings = block_size
+        # Decoder-only model.
+        self.is_decoder = True
+        self.is_encoder_decoder = False
+class RMSNorm(nn.Module):
+    """Root-mean-square normalisation over the last dimension with a learned gain.
+
+    Args:
+        dim: Size of the last dimension; also the length of the gain vector.
+        eps: Constant added to the mean square before the reciprocal square root.
+    """
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(dim))
+        self.eps = eps
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Normalise ``x`` in float32 and return the result in ``x``'s dtype."""
+        # Upcast first so the reduction is numerically stable for half-precision inputs.
+        upcast = x.to(torch.float32)
+        mean_square = upcast.pow(2).mean(dim=-1, keepdim=True)
+        normalised = upcast * torch.rsqrt(mean_square + self.eps)
+        gained = self.weight.to(torch.float32) * normalised
+        return gained.to(x.dtype)
+class ReservoirBlock(nn.Module):
+    """Leaky tanh mixing layer followed by RMSNorm and dropout.
+
+    The candidate state is ``tanh(in_proj(x) + reservoir(x))``; the output is
+    ``leak_rate * x + (1 - leak_rate) * candidate``, normalised and then
+    passed through dropout. Both projections are applied to the same input;
+    there is no recurrence over the sequence dimension.
+    """
+    def __init__(self, config: MiniMythosHybridConfig):
+        super().__init__()
+        width = config.n_embd
+        self.leak_rate = config.leak_rate
+        self.drop = nn.Dropout(config.dropout)
+        self.in_proj = nn.Linear(width, width, bias=False)
+        self.reservoir = nn.Linear(width, width, bias=False)
+        self.norm = RMSNorm(width)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Return the normalised leaky mix of ``x`` and its tanh candidate."""
+        pre_activation = self.in_proj(x) + self.reservoir(x)
+        candidate = torch.tanh(pre_activation)
+        keep = self.leak_rate
+        mixed = keep * x + (1.0 - keep) * candidate
+        return self.drop(self.norm(mixed))
+class CausalSelfAttention(nn.Module):
+    """Multi-head causal self-attention with rotary position embeddings (RoPE).
+
+    The cosine/sine tables are precomputed for ``config.block_size`` positions
+    and registered as non-persistent buffers, so they are not part of the
+    saved state dict.
+
+    Raises:
+        ValueError: At construction if ``n_embd`` is not divisible by
+            ``n_head`` or the resulting head size is odd; in ``forward`` if the
+            input is longer than the precomputed RoPE tables.
+    """
+    def __init__(self, config: MiniMythosHybridConfig):
+        super().__init__()
+        if config.n_embd % config.n_head != 0:
+            raise ValueError("n_embd must be divisible by n_head")
+        self.n_head = config.n_head
+        self.head_dim = config.n_embd // config.n_head
+        if self.head_dim % 2 != 0:
+            raise ValueError("head_dim must be even for RoPE")
+        # Fused projection producing queries, keys and values in one matmul.
+        self.qkv = nn.Linear(config.n_embd, 3 * config.n_embd, bias=False)
+        self.proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
+        self.dropout = config.dropout
+        self.resid_drop = nn.Dropout(config.dropout)
+        # One rotation frequency per (even, odd) channel pair.
+        inv_freq = 1.0 / (
+            10000 ** (torch.arange(0, self.head_dim, 2, dtype=torch.float32) / self.head_dim)
+        )
+        t = torch.arange(config.block_size, dtype=torch.float32)
+        freqs = torch.outer(t, inv_freq)
+        # Shaped (1, block_size, 1, head_dim // 2) to broadcast over batch and heads.
+        self.register_buffer("rope_cos", freqs.cos()[None, :, None, :], persistent=False)
+        self.register_buffer("rope_sin", freqs.sin()[None, :, None, :], persistent=False)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply causal self-attention to ``x`` of shape (batch, seq_len, n_embd).
+
+        Returns a tensor of the same shape as ``x``.
+        """
+        batch_size, seq_len, channels = x.shape
+        max_len = self.rope_cos.shape[1]
+        if seq_len > max_len:
+            # Without this guard the sliced table is shorter than the input:
+            # broadcasting then fails with an opaque shape error, or (for a
+            # one-row table) silently applies position 0 to every token.
+            raise ValueError(
+                f"sequence length {seq_len} exceeds the RoPE table length {max_len}"
+            )
+        q, k, v = self.qkv(x).reshape(
+            batch_size, seq_len, 3, self.n_head, self.head_dim
+        ).unbind(dim=2)
+        cos = self.rope_cos[:, :seq_len].to(dtype=q.dtype, device=q.device)
+        sin = self.rope_sin[:, :seq_len].to(dtype=q.dtype, device=q.device)
+        def apply_rope(u: torch.Tensor) -> torch.Tensor:
+            # Rotate each interleaved (even, odd) channel pair by its position angle.
+            u_e = u[..., 0::2]
+            u_o = u[..., 1::2]
+            return torch.stack(
+                (u_e * cos - u_o * sin, u_e * sin + u_o * cos), dim=-1
+            ).flatten(-2)
+        # Move heads ahead of the sequence axis: (batch, n_head, seq_len, head_dim).
+        q = apply_rope(q).transpose(1, 2)
+        k = apply_rope(k).transpose(1, 2)
+        v = v.transpose(1, 2)
+        y = F.scaled_dot_product_attention(
+            q,
+            k,
+            v,
+            dropout_p=self.dropout if self.training else 0.0,
+            is_causal=True,
+        )
+        y = y.transpose(1, 2).contiguous().reshape(batch_size, seq_len, channels)
+        return self.resid_drop(self.proj(y))
+class SwiGLU(nn.Module):
+    """Gated feed-forward block: ``w3(silu(w1(x)) * w2(x))`` followed by dropout.
+
+    The hidden width is ``config.mlp_mult * config.n_embd``; no layer has a bias.
+    """
+    def __init__(self, config: MiniMythosHybridConfig):
+        super().__init__()
+        width = config.n_embd
+        hidden = config.mlp_mult * width
+        self.w1 = nn.Linear(width, hidden, bias=False)
+        self.w2 = nn.Linear(width, hidden, bias=False)
+        self.w3 = nn.Linear(hidden, width, bias=False)
+        self.drop = nn.Dropout(config.dropout)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Return the gated projection of ``x`` with the same trailing width."""
+        gate = F.silu(self.w1(x))
+        value = self.w2(x)
+        projected = self.w3(gate * value)
+        return self.drop(projected)
+class TransformerBlock(nn.Module):
+    """Pre-norm residual block: causal self-attention, then a SwiGLU MLP."""
+    def __init__(self, config: MiniMythosHybridConfig):
+        super().__init__()
+        width = config.n_embd
+        self.norm1 = RMSNorm(width)
+        self.attn = CausalSelfAttention(config)
+        self.norm2 = RMSNorm(width)
+        self.mlp = SwiGLU(config)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Add the attention and MLP updates to ``x`` as two residual steps."""
+        attn_update = self.attn(self.norm1(x))
+        after_attn = x + attn_update
+        mlp_update = self.mlp(self.norm2(after_attn))
+        return after_attn + mlp_update
+class MiniMythosHybridForCausalLM(PreTrainedModel):
+    config_class = MiniMythosHybridConfig
+    base_model_prefix = "model"
+    main_input_name = "input_ids"
+    supports_gradient_checkpointing = False
+    _tied_weights_keys = []
+    all_tied_weights_keys = {}
+    def __init__(self, config: MiniMythosHybridConfig):
+        super().__init__(config)
+        self.config = config
+        self.token_emb = nn.Embedding(config.vocab_size, config.n_embd)
+        self.drop = nn.Dropout(config.dropout)
+        self.reservoir_layers = nn.ModuleList(
+            [ReservoirBlock(config) for _ in range(config.n_reservoir_layers)]
+        )
+        self.attn_layers = nn.ModuleList(
+            [TransformerBlock(config) for _ in range(config.n_attn_layers)]
+        )
+        self.norm = RMSNorm(config.n_embd)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+    def get_input_embeddings(self):
+        return self.token_emb
+    def set_input_embeddings(self, value):
+        self.token_emb = value
+    def get_output_embeddings(self):
+        return self.lm_head
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+    def tie_weights(self, *args, **kwargs):
+        if getattr(self.config, "tie_word_embeddings", False):
+            self._tie_or_clone_weights(self.lm_head, self.token_emb)
+        return None
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("Specify either input_ids or inputs_embeds, not both.")
+        if input_ids is None and inputs_embeds is None:
+            raise ValueError("You must specify input_ids or inputs_embeds.")
+        if inputs_embeds is None:
+            if input_ids.shape[1] > self.config.block_size:
+                input_ids = input_ids[:, -self.config.block_size:]
+                if labels is not None:
+                    labels = labels[:, -self.config.block_size:]
+            x = self.token_emb(input_ids)
+        else:
+            if inputs_embeds.shape[1] > self.config.block_size:
+                inputs_embeds = inputs_embeds[:, -self.config.block_size:, :]
+                if labels is not None:
+                    labels = labels[:, -self.config.block_size:]
+            x = inputs_embeds
+        hidden_states = [] if output_hidden_states else None
+        x = self.drop(x)
+        for layer in self.reservoir_layers:
+            if output_hidden_states:
+                hidden_states.append(x)
+            x = layer(x)
+        for layer in self.attn_layers:
+            if output_hidden_states:
+                hidden_states.append(x)
+            x = layer(x)
+        x = self.norm(x)
+        if output_hidden_states:
+            hidden_states.append(x)
+        logits = self.lm_head(x)
+        # Prevent generation from crashing if a checkpoint contains unstable values.
+        # This should not hide training issues, but it makes inference robust.
+        logits = torch.nan_to_num(logits, nan=0.0, posinf=1e4, neginf=-1e4)
+        loss = None
+        if labels is not None:
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            loss = F.cross_entropy(
+                shift_logits.view(-1, shift_logits.size(-1)),
+                shift_labels.view(-1),
+                ignore_index=-100,
+            )
+        if not return_dict:
+            output = (logits,)
+            if use_cache:
+                output = output + (None,)
+            if output_hidden_states:
+                output = output + (tuple(hidden_states),)
+            return ((loss,) + output) if loss is not None else output
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=None,
+            hidden_states=tuple(hidden_states) if output_hidden_states else None,
+            attentions=None,
+        )
+    def prepare_inputs_for_generation(
+        self,
+        input_ids: torch.LongTensor,
+        past_key_values=None,
+        attention_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs,
+    ):
+        if input_ids is not None and input_ids.shape[1] > self.config.block_size:
+            input_ids = input_ids[:, -self.config.block_size:]
+            if attention_mask is not None:
+                attention_mask = attention_mask[:, -self.config.block_size:]
+        if inputs_embeds is not None and input_ids is not None and input_ids.shape[1] == 1:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+        model_inputs.update(
+            {
+                "past_key_values": None,
+                "use_cache": False,
+                "attention_mask": attention_mask,
+            }
+        )
+        return model_inputs
+    @staticmethod
+    def _reorder_cache(past_key_values, beam_idx):
+        """Return ``past_key_values`` unchanged; ``beam_idx`` is not used.
+
+        NOTE(review): presumably this is the cache-reordering hook called
+        during beam search, left as a pass-through because this model builds
+        its generation inputs with ``past_key_values=None`` -- confirm
+        against the generation utilities in use.
+        """
+        return past_key_values

checkpoint-2500/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:182ea63eb20e29957c484c8508836a741fd324df542e2106c8332a938240312f
+size 369189643

checkpoint-2500/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:61c19bab1174704a4a4441475683bf1270277af15d2e2c95e964789128e482c4
+size 14645

checkpoint-2500/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8331319bcb250d64e1c0a021c768102914db2d392635711285a1d9fe0b4ad672
+size 1465

checkpoint-2500/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1784 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.8333333333333334,
+  "eval_steps": 500,
+  "global_step": 2500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0033333333333333335,
+      "grad_norm": 85.0,
+      "learning_rate": 7.2e-08,
+      "loss": 98.92244262695313,
+      "step": 10
+    },
+    {
+      "epoch": 0.006666666666666667,
+      "grad_norm": 93.0,
+      "learning_rate": 1.5199999999999998e-07,
+      "loss": 98.39124145507813,
+      "step": 20
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 90.0,
+      "learning_rate": 2.3199999999999999e-07,
+      "loss": 98.9702392578125,
+      "step": 30
+    },
+    {
+      "epoch": 0.013333333333333334,
+      "grad_norm": 90.0,
+      "learning_rate": 3.12e-07,
+      "loss": 99.00128173828125,
+      "step": 40
+    },
+    {
+      "epoch": 0.016666666666666666,
+      "grad_norm": 86.0,
+      "learning_rate": 3.9199999999999996e-07,
+      "loss": 98.24757690429688,
+      "step": 50
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 86.0,
+      "learning_rate": 4.7199999999999994e-07,
+      "loss": 100.627099609375,
+      "step": 60
+    },
+    {
+      "epoch": 0.023333333333333334,
+      "grad_norm": 88.5,
+      "learning_rate": 5.52e-07,
+      "loss": 99.09172973632812,
+      "step": 70
+    },
+    {
+      "epoch": 0.02666666666666667,
+      "grad_norm": 84.5,
+      "learning_rate": 6.32e-07,
+      "loss": 99.2964111328125,
+      "step": 80
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 82.5,
+      "learning_rate": 7.12e-07,
+      "loss": 98.79883422851563,
+      "step": 90
+    },
+    {
+      "epoch": 0.03333333333333333,
+      "grad_norm": 83.5,
+      "learning_rate": 7.92e-07,
+      "loss": 97.59891967773437,
+      "step": 100
+    },
+    {
+      "epoch": 0.03666666666666667,
+      "grad_norm": 84.5,
+      "learning_rate": 7.999809885464028e-07,
+      "loss": 98.88760375976562,
+      "step": 110
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 82.0,
+      "learning_rate": 7.999152722615145e-07,
+      "loss": 100.03501586914062,
+      "step": 120
+    },
+    {
+      "epoch": 0.043333333333333335,
+      "grad_norm": 81.5,
+      "learning_rate": 7.998026241462926e-07,
+      "loss": 99.05869140625,
+      "step": 130
+    },
+    {
+      "epoch": 0.04666666666666667,
+      "grad_norm": 84.5,
+      "learning_rate": 7.996430574204927e-07,
+      "loss": 98.06262817382813,
+      "step": 140
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 91.0,
+      "learning_rate": 7.994365908099776e-07,
+      "loss": 99.91973876953125,
+      "step": 150
+    },
+    {
+      "epoch": 0.05333333333333334,
+      "grad_norm": 88.5,
+      "learning_rate": 7.991832485445195e-07,
+      "loss": 99.073046875,
+      "step": 160
+    },
+    {
+      "epoch": 0.056666666666666664,
+      "grad_norm": 84.5,
+      "learning_rate": 7.988830603549564e-07,
+      "loss": 98.0226318359375,
+      "step": 170
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 90.5,
+      "learning_rate": 7.985360614697036e-07,
+      "loss": 99.20685424804688,
+      "step": 180
+    },
+    {
+      "epoch": 0.06333333333333334,
+      "grad_norm": 85.5,
+      "learning_rate": 7.981422926106186e-07,
+      "loss": 98.71968383789063,
+      "step": 190
+    },
+    {
+      "epoch": 0.06666666666666667,
+      "grad_norm": 82.5,
+      "learning_rate": 7.977017999882226e-07,
+      "loss": 99.76533203125,
+      "step": 200
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 81.0,
+      "learning_rate": 7.972146352962785e-07,
+      "loss": 98.85039672851562,
+      "step": 210
+    },
+    {
+      "epoch": 0.07333333333333333,
+      "grad_norm": 79.5,
+      "learning_rate": 7.966808557057225e-07,
+      "loss": 96.81197509765624,
+      "step": 220
+    },
+    {
+      "epoch": 0.07666666666666666,
+      "grad_norm": 94.5,
+      "learning_rate": 7.961005238579563e-07,
+      "loss": 98.03938598632813,
+      "step": 230
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 74.5,
+      "learning_rate": 7.954737078574952e-07,
+      "loss": 97.90469970703126,
+      "step": 240
+    },
+    {
+      "epoch": 0.08333333333333333,
+      "grad_norm": 87.0,
+      "learning_rate": 7.948004812639763e-07,
+      "loss": 99.83749389648438,
+      "step": 250
+    },
+    {
+      "epoch": 0.08666666666666667,
+      "grad_norm": 87.5,
+      "learning_rate": 7.940809230835248e-07,
+      "loss": 99.23988647460938,
+      "step": 260
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 86.5,
+      "learning_rate": 7.933151177594838e-07,
+      "loss": 99.12615966796875,
+      "step": 270
+    },
+    {
+      "epoch": 0.09333333333333334,
+      "grad_norm": 86.0,
+      "learning_rate": 7.925031551625037e-07,
+      "loss": 99.5418212890625,
+      "step": 280
+    },
+    {
+      "epoch": 0.09666666666666666,
+      "grad_norm": 80.5,
+      "learning_rate": 7.916451305799951e-07,
+      "loss": 98.88993530273437,
+      "step": 290
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 81.5,
+      "learning_rate": 7.907411447049468e-07,
+      "loss": 98.3482177734375,
+      "step": 300
+    },
+    {
+      "epoch": 0.10333333333333333,
+      "grad_norm": 82.5,
+      "learning_rate": 7.897913036241098e-07,
+      "loss": 98.17821655273437,
+      "step": 310
+    },
+    {
+      "epoch": 0.10666666666666667,
+      "grad_norm": 78.5,
+      "learning_rate": 7.88795718805546e-07,
+      "loss": 98.89118041992188,
+      "step": 320
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 83.0,
+      "learning_rate": 7.87754507085548e-07,
+      "loss": 98.15654907226562,
+      "step": 330
+    },
+    {
+      "epoch": 0.11333333333333333,
+      "grad_norm": 84.5,
+      "learning_rate": 7.86667790654928e-07,
+      "loss": 97.81229858398437,
+      "step": 340
+    },
+    {
+      "epoch": 0.11666666666666667,
+      "grad_norm": 84.0,
+      "learning_rate": 7.85535697044677e-07,
+      "loss": 99.2128662109375,
+      "step": 350
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 83.5,
+      "learning_rate": 7.843583591109998e-07,
+      "loss": 98.5178955078125,
+      "step": 360
+    },
+    {
+      "epoch": 0.12333333333333334,
+      "grad_norm": 86.5,
+      "learning_rate": 7.83135915019723e-07,
+      "loss": 98.7326416015625,
+      "step": 370
+    },
+    {
+      "epoch": 0.12666666666666668,
+      "grad_norm": 83.0,
+      "learning_rate": 7.818685082300806e-07,
+      "loss": 98.09605712890625,
+      "step": 380
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 82.0,
+      "learning_rate": 7.805562874778789e-07,
+      "loss": 99.17857666015625,
+      "step": 390
+    },
+    {
+      "epoch": 0.13333333333333333,
+      "grad_norm": 78.5,
+      "learning_rate": 7.791994067580411e-07,
+      "loss": 97.64859619140626,
+      "step": 400
+    },
+    {
+      "epoch": 0.13666666666666666,
+      "grad_norm": 84.5,
+      "learning_rate": 7.77798025306536e-07,
+      "loss": 96.37807006835938,
+      "step": 410
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 83.5,
+      "learning_rate": 7.763523075816902e-07,
+      "loss": 99.2608642578125,
+      "step": 420
+    },
+    {
+      "epoch": 0.14333333333333334,
+      "grad_norm": 77.0,
+      "learning_rate": 7.748624232448886e-07,
+      "loss": 99.56741333007812,
+      "step": 430
+    },
+    {
+      "epoch": 0.14666666666666667,
+      "grad_norm": 78.0,
+      "learning_rate": 7.733285471406642e-07,
+      "loss": 99.81188354492187,
+      "step": 440
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 82.5,
+      "learning_rate": 7.717508592761785e-07,
+      "loss": 98.52041015625,
+      "step": 450
+    },
+    {
+      "epoch": 0.15333333333333332,
+      "grad_norm": 82.0,
+      "learning_rate": 7.701295448000974e-07,
+      "loss": 99.01697387695313,
+      "step": 460
+    },
+    {
+      "epoch": 0.15666666666666668,
+      "grad_norm": 79.5,
+      "learning_rate": 7.684647939808636e-07,
+      "loss": 98.38013916015625,
+      "step": 470
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 78.5,
+      "learning_rate": 7.667568021843666e-07,
+      "loss": 97.33247680664063,
+      "step": 480
+    },
+    {
+      "epoch": 0.16333333333333333,
+      "grad_norm": 82.5,
+      "learning_rate": 7.650057698510164e-07,
+      "loss": 97.90450439453124,
+      "step": 490
+    },
+    {
+      "epoch": 0.16666666666666666,
+      "grad_norm": 84.5,
+      "learning_rate": 7.632119024722212e-07,
+      "loss": 98.53453369140625,
+      "step": 500
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 91.0,
+      "learning_rate": 7.613754105662717e-07,
+      "loss": 98.44060668945312,
+      "step": 510
+    },
+    {
+      "epoch": 0.17333333333333334,
+      "grad_norm": 83.5,
+      "learning_rate": 7.594965096536353e-07,
+      "loss": 98.7102294921875,
+      "step": 520
+    },
+    {
+      "epoch": 0.17666666666666667,
+      "grad_norm": 78.5,
+      "learning_rate": 7.575754202316649e-07,
+      "loss": 97.73778076171875,
+      "step": 530
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 83.5,
+      "learning_rate": 7.556123677487218e-07,
+      "loss": 99.01414184570312,
+      "step": 540
+    },
+    {
+      "epoch": 0.18333333333333332,
+      "grad_norm": 80.0,
+      "learning_rate": 7.536075825777187e-07,
+      "loss": 100.30809936523437,
+      "step": 550
+    },
+    {
+      "epoch": 0.18666666666666668,
+      "grad_norm": 86.5,
+      "learning_rate": 7.515612999890841e-07,
+      "loss": 99.7580322265625,
+      "step": 560
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 84.5,
+      "learning_rate": 7.494737601231523e-07,
+      "loss": 98.83981323242188,
+      "step": 570
+    },
+    {
+      "epoch": 0.19333333333333333,
+      "grad_norm": 81.5,
+      "learning_rate": 7.473452079619826e-07,
+      "loss": 98.02926635742188,
+      "step": 580
+    },
+    {
+      "epoch": 0.19666666666666666,
+      "grad_norm": 83.5,
+      "learning_rate": 7.451758933006086e-07,
+      "loss": 98.2523681640625,
+      "step": 590
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 75.5,
+      "learning_rate": 7.429660707177239e-07,
+      "loss": 97.91044311523437,
+      "step": 600
+    },
+    {
+      "epoch": 0.20333333333333334,
+      "grad_norm": 80.0,
+      "learning_rate": 7.407159995458066e-07,
+      "loss": 98.97639770507813,
+      "step": 610
+    },
+    {
+      "epoch": 0.20666666666666667,
+      "grad_norm": 80.0,
+      "learning_rate": 7.384259438406848e-07,
+      "loss": 96.91513671875,
+      "step": 620
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 75.0,
+      "learning_rate": 7.360961723505495e-07,
+      "loss": 98.74181518554687,
+      "step": 630
+    },
+    {
+      "epoch": 0.21333333333333335,
+      "grad_norm": 82.0,
+      "learning_rate": 7.337269584844142e-07,
+      "loss": 98.77709350585937,
+      "step": 640
+    },
+    {
+      "epoch": 0.21666666666666667,
+      "grad_norm": 82.0,
+      "learning_rate": 7.313185802800312e-07,
+      "loss": 98.27448120117188,
+      "step": 650
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 85.0,
+      "learning_rate": 7.288713203712605e-07,
+      "loss": 98.03839111328125,
+      "step": 660
+    },
+    {
+      "epoch": 0.22333333333333333,
+      "grad_norm": 91.5,
+      "learning_rate": 7.263854659549032e-07,
+      "loss": 97.03794555664062,
+      "step": 670
+    },
+    {
+      "epoch": 0.22666666666666666,
+      "grad_norm": 78.0,
+      "learning_rate": 7.23861308756997e-07,
+      "loss": 98.09403686523437,
+      "step": 680
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 91.0,
+      "learning_rate": 7.212991449985802e-07,
+      "loss": 98.54012451171874,
+      "step": 690
+    },
+    {
+      "epoch": 0.23333333333333334,
+      "grad_norm": 85.0,
+      "learning_rate": 7.186992753609302e-07,
+      "loss": 97.112890625,
+      "step": 700
+    },
+    {
+      "epoch": 0.23666666666666666,
+      "grad_norm": 83.0,
+      "learning_rate": 7.160620049502761e-07,
+      "loss": 97.43547973632812,
+      "step": 710
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 81.0,
+      "learning_rate": 7.133876432619936e-07,
+      "loss": 97.59598388671876,
+      "step": 720
+    },
+    {
+      "epoch": 0.24333333333333335,
+      "grad_norm": 82.0,
+      "learning_rate": 7.106765041442847e-07,
+      "loss": 98.765087890625,
+      "step": 730
+    },
+    {
+      "epoch": 0.24666666666666667,
+      "grad_norm": 82.5,
+      "learning_rate": 7.079289057613449e-07,
+      "loss": 98.65599365234375,
+      "step": 740
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 85.5,
+      "learning_rate": 7.051451705560269e-07,
+      "loss": 100.546728515625,
+      "step": 750
+    },
+    {
+      "epoch": 0.25333333333333335,
+      "grad_norm": 81.5,
+      "learning_rate": 7.023256252119996e-07,
+      "loss": 99.18995361328125,
+      "step": 760
+    },
+    {
+      "epoch": 0.25666666666666665,
+      "grad_norm": 83.5,
+      "learning_rate": 6.994706006154102e-07,
+      "loss": 98.26092529296875,
+      "step": 770
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 77.5,
+      "learning_rate": 6.965804318160538e-07,
+      "loss": 100.81718139648437,
+      "step": 780
+    },
+    {
+      "epoch": 0.2633333333333333,
+      "grad_norm": 106.0,
+      "learning_rate": 6.936554579880531e-07,
+      "loss": 99.88375244140624,
+      "step": 790
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 81.0,
+      "learning_rate": 6.906960223900558e-07,
+      "loss": 98.47666015625,
+      "step": 800
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 79.0,
+      "learning_rate": 6.877024723249506e-07,
+      "loss": 98.1271240234375,
+      "step": 810
+    },
+    {
+      "epoch": 0.2733333333333333,
+      "grad_norm": 86.0,
+      "learning_rate": 6.846751590991103e-07,
+      "loss": 97.95358276367188,
+      "step": 820
+    },
+    {
+      "epoch": 0.27666666666666667,
+      "grad_norm": 85.5,
+      "learning_rate": 6.816144379811647e-07,
+      "loss": 97.06906127929688,
+      "step": 830
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 80.0,
+      "learning_rate": 6.785206681603071e-07,
+      "loss": 97.89542236328126,
+      "step": 840
+    },
+    {
+      "epoch": 0.2833333333333333,
+      "grad_norm": 76.5,
+      "learning_rate": 6.753942127041434e-07,
+      "loss": 98.07276611328125,
+      "step": 850
+    },
+    {
+      "epoch": 0.2866666666666667,
+      "grad_norm": 82.0,
+      "learning_rate": 6.722354385160832e-07,
+      "loss": 98.10612182617187,
+      "step": 860
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 85.0,
+      "learning_rate": 6.690447162922828e-07,
+      "loss": 97.93245239257813,
+      "step": 870
+    },
+    {
+      "epoch": 0.29333333333333333,
+      "grad_norm": 79.0,
+      "learning_rate": 6.65822420478142e-07,
+      "loss": 97.72557373046875,
+      "step": 880
+    },
+    {
+      "epoch": 0.2966666666666667,
+      "grad_norm": 80.0,
+      "learning_rate": 6.625689292243618e-07,
+      "loss": 97.9116455078125,
+      "step": 890
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 83.5,
+      "learning_rate": 6.59284624342566e-07,
+      "loss": 97.18661499023438,
+      "step": 900
+    },
+    {
+      "epoch": 0.30333333333333334,
+      "grad_norm": 82.5,
+      "learning_rate": 6.55969891260494e-07,
+      "loss": 97.79299926757812,
+      "step": 910
+    },
+    {
+      "epoch": 0.30666666666666664,
+      "grad_norm": 79.5,
+      "learning_rate": 6.526251189767701e-07,
+      "loss": 97.34827270507813,
+      "step": 920
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 80.5,
+      "learning_rate": 6.492507000152516e-07,
+      "loss": 98.7041748046875,
+      "step": 930
+    },
+    {
+      "epoch": 0.31333333333333335,
+      "grad_norm": 82.0,
+      "learning_rate": 6.458470303789652e-07,
+      "loss": 98.721240234375,
+      "step": 940
+    },
+    {
+      "epoch": 0.31666666666666665,
+      "grad_norm": 85.0,
+      "learning_rate": 6.424145095036337e-07,
+      "loss": 99.30765991210937,
+      "step": 950
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 77.5,
+      "learning_rate": 6.389535402108008e-07,
+      "loss": 98.2544921875,
+      "step": 960
+    },
+    {
+      "epoch": 0.3233333333333333,
+      "grad_norm": 85.0,
+      "learning_rate": 6.354645286605583e-07,
+      "loss": 97.76597290039062,
+      "step": 970
+    },
+    {
+      "epoch": 0.32666666666666666,
+      "grad_norm": 88.5,
+      "learning_rate": 6.31947884303881e-07,
+      "loss": 98.00664672851562,
+      "step": 980
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 80.0,
+      "learning_rate": 6.284040198345763e-07,
+      "loss": 98.64342651367187,
+      "step": 990
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 92.0,
+      "learning_rate": 6.248333511408522e-07,
+      "loss": 97.74580688476563,
+      "step": 1000
+    },
+    {
+      "epoch": 0.33666666666666667,
+      "grad_norm": 91.0,
+      "learning_rate": 6.212362972565115e-07,
+      "loss": 98.11343994140626,
+      "step": 1010
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 82.0,
+      "learning_rate": 6.176132803117761e-07,
+      "loss": 98.30454711914062,
+      "step": 1020
+    },
+    {
+      "epoch": 0.3433333333333333,
+      "grad_norm": 81.5,
+      "learning_rate": 6.13964725483748e-07,
+      "loss": 97.90970458984376,
+      "step": 1030
+    },
+    {
+      "epoch": 0.3466666666666667,
+      "grad_norm": 85.5,
+      "learning_rate": 6.102910609465133e-07,
+      "loss": 96.98532104492188,
+      "step": 1040
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 81.0,
+      "learning_rate": 6.065927178208936e-07,
+      "loss": 107.39459228515625,
+      "step": 1050
+    },
+    {
+      "epoch": 0.35333333333333333,
+      "grad_norm": 82.5,
+      "learning_rate": 6.028701301238521e-07,
+      "loss": 98.15614624023438,
+      "step": 1060
+    },
+    {
+      "epoch": 0.3566666666666667,
+      "grad_norm": 81.5,
+      "learning_rate": 5.991237347175605e-07,
+      "loss": 98.47268676757812,
+      "step": 1070
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 75.0,
+      "learning_rate": 5.953539712581301e-07,
+      "loss": 97.77119750976563,
+      "step": 1080
+    },
+    {
+      "epoch": 0.36333333333333334,
+      "grad_norm": 78.0,
+      "learning_rate": 5.915612821440172e-07,
+      "loss": 98.135302734375,
+      "step": 1090
+    },
+    {
+      "epoch": 0.36666666666666664,
+      "grad_norm": 76.5,
+      "learning_rate": 5.877461124641053e-07,
+      "loss": 97.557373046875,
+      "step": 1100
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 81.5,
+      "learning_rate": 5.839089099454721e-07,
+      "loss": 98.51465454101563,
+      "step": 1110
+    },
+    {
+      "epoch": 0.37333333333333335,
+      "grad_norm": 81.5,
+      "learning_rate": 5.800501249008462e-07,
+      "loss": 99.03858032226563,
+      "step": 1120
+    },
+    {
+      "epoch": 0.37666666666666665,
+      "grad_norm": 75.5,
+      "learning_rate": 5.761702101757618e-07,
+      "loss": 99.02989501953125,
+      "step": 1130
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 83.5,
+      "learning_rate": 5.722696210954143e-07,
+      "loss": 100.75755004882812,
+      "step": 1140
+    },
+    {
+      "epoch": 0.38333333333333336,
+      "grad_norm": 77.0,
+      "learning_rate": 5.683488154112268e-07,
+      "loss": 98.41821899414063,
+      "step": 1150
+    },
+    {
+      "epoch": 0.38666666666666666,
+      "grad_norm": 77.0,
+      "learning_rate": 5.644082532471301e-07,
+      "loss": 98.3514892578125,
+      "step": 1160
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 87.0,
+      "learning_rate": 5.60448397045566e-07,
+      "loss": 97.98330078125,
+      "step": 1170
+    },
+    {
+      "epoch": 0.3933333333333333,
+      "grad_norm": 78.0,
+      "learning_rate": 5.564697115132166e-07,
+      "loss": 99.36333618164062,
+      "step": 1180
+    },
+    {
+      "epoch": 0.39666666666666667,
+      "grad_norm": 80.0,
+      "learning_rate": 5.524726635664701e-07,
+      "loss": 98.58826293945313,
+      "step": 1190
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 82.0,
+      "learning_rate": 5.484577222766244e-07,
+      "loss": 99.26712646484376,
+      "step": 1200
+    },
+    {
+      "epoch": 0.4033333333333333,
+      "grad_norm": 78.5,
+      "learning_rate": 5.444253588148419e-07,
+      "loss": 99.81590576171875,
+      "step": 1210
+    },
+    {
+      "epoch": 0.4066666666666667,
+      "grad_norm": 89.0,
+      "learning_rate": 5.40376046396853e-07,
+      "loss": 97.74580078125,
+      "step": 1220
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 80.5,
+      "learning_rate": 5.363102602274239e-07,
+      "loss": 98.692529296875,
+      "step": 1230
+    },
+    {
+      "epoch": 0.41333333333333333,
+      "grad_norm": 80.0,
+      "learning_rate": 5.32228477444588e-07,
+      "loss": 99.05579833984375,
+      "step": 1240
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 77.5,
+      "learning_rate": 5.281311770636531e-07,
+      "loss": 98.2328369140625,
+      "step": 1250
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 79.5,
+      "learning_rate": 5.24018839920985e-07,
+      "loss": 98.72367553710937,
+      "step": 1260
+    },
+    {
+      "epoch": 0.42333333333333334,
+      "grad_norm": 80.0,
+      "learning_rate": 5.198919486175807e-07,
+      "loss": 98.29996948242187,
+      "step": 1270
+    },
+    {
+      "epoch": 0.4266666666666667,
+      "grad_norm": 108.0,
+      "learning_rate": 5.157509874624324e-07,
+      "loss": 97.95867919921875,
+      "step": 1280
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 83.0,
+      "learning_rate": 5.115964424156917e-07,
+      "loss": 97.64778442382813,
+      "step": 1290
+    },
+    {
+      "epoch": 0.43333333333333335,
+      "grad_norm": 103.5,
+      "learning_rate": 5.0742880103164e-07,
+      "loss": 97.5724365234375,
+      "step": 1300
+    },
+    {
+      "epoch": 0.43666666666666665,
+      "grad_norm": 85.0,
+      "learning_rate": 5.032485524014726e-07,
+      "loss": 97.94261474609375,
+      "step": 1310
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 81.5,
+      "learning_rate": 4.990561870958998e-07,
+      "loss": 98.76618041992188,
+      "step": 1320
+    },
+    {
+      "epoch": 0.44333333333333336,
+      "grad_norm": 78.5,
+      "learning_rate": 4.948521971075788e-07,
+      "loss": 100.03206176757813,
+      "step": 1330
+    },
+    {
+      "epoch": 0.44666666666666666,
+      "grad_norm": 76.5,
+      "learning_rate": 4.906370757933739e-07,
+      "loss": 97.71541748046874,
+      "step": 1340
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 78.0,
+      "learning_rate": 4.864113178164604e-07,
+      "loss": 98.21603393554688,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4533333333333333,
+      "grad_norm": 80.0,
+      "learning_rate": 4.821754190882729e-07,
+      "loss": 98.40943603515625,
+      "step": 1360
+    },
+    {
+      "epoch": 0.45666666666666667,
+      "grad_norm": 79.0,
+      "learning_rate": 4.779298767103083e-07,
+      "loss": 98.84269409179687,
+      "step": 1370
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 85.5,
+      "learning_rate": 4.736751889157882e-07,
+      "loss": 97.90013427734375,
+      "step": 1380
+    },
+    {
+      "epoch": 0.4633333333333333,
+      "grad_norm": 76.5,
+      "learning_rate": 4.6941185501118975e-07,
+      "loss": 96.82548828125,
+      "step": 1390
+    },
+    {
+      "epoch": 0.4666666666666667,
+      "grad_norm": 78.5,
+      "learning_rate": 4.6514037531764925e-07,
+      "loss": 98.97203979492187,
+      "step": 1400
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 90.5,
+      "learning_rate": 4.608612511122476e-07,
+      "loss": 98.75665893554688,
+      "step": 1410
+    },
+    {
+      "epoch": 0.47333333333333333,
+      "grad_norm": 76.0,
+      "learning_rate": 4.565749845691828e-07,
+      "loss": 97.86590576171875,
+      "step": 1420
+    },
+    {
+      "epoch": 0.4766666666666667,
+      "grad_norm": 78.5,
+      "learning_rate": 4.5228207870083823e-07,
+      "loss": 98.30226440429688,
+      "step": 1430
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 76.5,
+      "learning_rate": 4.479830372987511e-07,
+      "loss": 97.27890625,
+      "step": 1440
+    },
+    {
+      "epoch": 0.48333333333333334,
+      "grad_norm": 82.5,
+      "learning_rate": 4.436783648744911e-07,
+      "loss": 96.8334716796875,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4866666666666667,
+      "grad_norm": 94.5,
+      "learning_rate": 4.3936856660045317e-07,
+      "loss": 100.1866943359375,
+      "step": 1460
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 76.0,
+      "learning_rate": 4.350541482505733e-07,
+      "loss": 97.47962036132813,
+      "step": 1470
+    },
+    {
+      "epoch": 0.49333333333333335,
+      "grad_norm": 80.5,
+      "learning_rate": 4.30735616140974e-07,
+      "loss": 98.73521118164062,
+      "step": 1480
+    },
+    {
+      "epoch": 0.49666666666666665,
+      "grad_norm": 80.0,
+      "learning_rate": 4.2641347707054586e-07,
+      "loss": 97.36817016601563,
+      "step": 1490
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 81.0,
+      "learning_rate": 4.220882382614721e-07,
+      "loss": 98.93515625,
+      "step": 1500
+    },
+    {
+      "epoch": 0.5033333333333333,
+      "grad_norm": 84.5,
+      "learning_rate": 4.177604072997041e-07,
+      "loss": 99.65122680664062,
+      "step": 1510
+    },
+    {
+      "epoch": 0.5066666666666667,
+      "grad_norm": 81.5,
+      "learning_rate": 4.134304920753937e-07,
+      "loss": 99.22744140625,
+      "step": 1520
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 81.0,
+      "learning_rate": 4.090990007232907e-07,
+      "loss": 98.11915893554688,
+      "step": 1530
+    },
+    {
+      "epoch": 0.5133333333333333,
+      "grad_norm": 91.5,
+      "learning_rate": 4.0476644156310994e-07,
+      "loss": 99.190869140625,
+      "step": 1540
+    },
+    {
+      "epoch": 0.5166666666666667,
+      "grad_norm": 76.0,
+      "learning_rate": 4.0043332303987834e-07,
+      "loss": 97.55637817382812,
+      "step": 1550
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 82.5,
+      "learning_rate": 3.961001536642667e-07,
+      "loss": 98.12214965820313,
+      "step": 1560
+    },
+    {
+      "epoch": 0.5233333333333333,
+      "grad_norm": 80.5,
+      "learning_rate": 3.9176744195291366e-07,
+      "loss": 100.78977661132812,
+      "step": 1570
+    },
+    {
+      "epoch": 0.5266666666666666,
+      "grad_norm": 78.0,
+      "learning_rate": 3.874356963687487e-07,
+      "loss": 98.84946899414062,
+      "step": 1580
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 81.5,
+      "learning_rate": 3.831054252613222e-07,
+      "loss": 99.25551147460938,
+      "step": 1590
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 78.0,
+      "learning_rate": 3.787771368071479e-07,
+      "loss": 98.48034057617187,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5366666666666666,
+      "grad_norm": 78.0,
+      "learning_rate": 3.7445133895006673e-07,
+      "loss": 100.24962768554687,
+      "step": 1610
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 77.5,
+      "learning_rate": 3.7012853934163675e-07,
+      "loss": 98.97702026367188,
+      "step": 1620
+    },
+    {
+      "epoch": 0.5433333333333333,
+      "grad_norm": 80.5,
+      "learning_rate": 3.6580924528155834e-07,
+      "loss": 97.33264770507813,
+      "step": 1630
+    },
+    {
+      "epoch": 0.5466666666666666,
+      "grad_norm": 81.5,
+      "learning_rate": 3.6149396365814017e-07,
+      "loss": 98.55026245117188,
+      "step": 1640
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 94.5,
+      "learning_rate": 3.571832008888139e-07,
+      "loss": 97.8692138671875,
+      "step": 1650
+    },
+    {
+      "epoch": 0.5533333333333333,
+      "grad_norm": 79.0,
+      "learning_rate": 3.528774628607033e-07,
+      "loss": 99.7307861328125,
+      "step": 1660
+    },
+    {
+      "epoch": 0.5566666666666666,
+      "grad_norm": 76.0,
+      "learning_rate": 3.485772548712565e-07,
+      "loss": 98.52730712890624,
+      "step": 1670
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 76.5,
+      "learning_rate": 3.442830815689475e-07,
+      "loss": 98.9763427734375,
+      "step": 1680
+    },
+    {
+      "epoch": 0.5633333333333334,
+      "grad_norm": 73.0,
+      "learning_rate": 3.399954468940525e-07,
+      "loss": 97.82731323242187,
+      "step": 1690
+    },
+    {
+      "epoch": 0.5666666666666667,
+      "grad_norm": 76.5,
+      "learning_rate": 3.357148540195112e-07,
+      "loss": 98.65582885742188,
+      "step": 1700
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 87.5,
+      "learning_rate": 3.314418052918764e-07,
+      "loss": 97.91898193359376,
+      "step": 1710
+    },
+    {
+      "epoch": 0.5733333333333334,
+      "grad_norm": 76.5,
+      "learning_rate": 3.2717680217236214e-07,
+      "loss": 98.35755615234375,
+      "step": 1720
+    },
+    {
+      "epoch": 0.5766666666666667,
+      "grad_norm": 95.0,
+      "learning_rate": 3.2292034517799457e-07,
+      "loss": 98.27772827148438,
+      "step": 1730
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 79.0,
+      "learning_rate": 3.1867293382287417e-07,
+      "loss": 97.74335327148438,
+      "step": 1740
+    },
+    {
+      "epoch": 0.5833333333333334,
+      "grad_norm": 87.5,
+      "learning_rate": 3.1443506655955536e-07,
+      "loss": 97.20853881835937,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5866666666666667,
+      "grad_norm": 84.5,
+      "learning_rate": 3.1020724072055136e-07,
+      "loss": 98.31585083007812,
+      "step": 1760
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 84.0,
+      "learning_rate": 3.0598995245996964e-07,
+      "loss": 99.7463623046875,
+      "step": 1770
+    },
+    {
+      "epoch": 0.5933333333333334,
+      "grad_norm": 83.5,
+      "learning_rate": 3.017836966952859e-07,
+      "loss": 98.76014404296875,
+      "step": 1780
+    },
+    {
+      "epoch": 0.5966666666666667,
+      "grad_norm": 138.0,
+      "learning_rate": 2.9758896704926393e-07,
+      "loss": 99.64288330078125,
+      "step": 1790
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 80.5,
+      "learning_rate": 2.93406255792026e-07,
+      "loss": 98.490380859375,
+      "step": 1800
+    },
+    {
+      "epoch": 0.6033333333333334,
+      "grad_norm": 79.0,
+      "learning_rate": 2.8923605378328365e-07,
+      "loss": 97.536083984375,
+      "step": 1810
+    },
+    {
+      "epoch": 0.6066666666666667,
+      "grad_norm": 76.5,
+      "learning_rate": 2.8507885041473197e-07,
+      "loss": 98.35460815429687,
+      "step": 1820
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 80.0,
+      "learning_rate": 2.809351335526184e-07,
+      "loss": 98.06194458007812,
+      "step": 1830
+    },
+    {
+      "epoch": 0.6133333333333333,
+      "grad_norm": 79.0,
+      "learning_rate": 2.7680538948048913e-07,
+      "loss": 97.76737670898437,
+      "step": 1840
+    },
+    {
+      "epoch": 0.6166666666666667,
+      "grad_norm": 82.0,
+      "learning_rate": 2.7269010284212136e-07,
+      "loss": 99.20833740234374,
+      "step": 1850
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 82.5,
+      "learning_rate": 2.685897565846484e-07,
+      "loss": 98.02843627929687,
+      "step": 1860
+    },
+    {
+      "epoch": 0.6233333333333333,
+      "grad_norm": 84.0,
+      "learning_rate": 2.6450483190188343e-07,
+      "loss": 97.90446166992187,
+      "step": 1870
+    },
+    {
+      "epoch": 0.6266666666666667,
+      "grad_norm": 78.0,
+      "learning_rate": 2.604358081778498e-07,
+      "loss": 97.0069091796875,
+      "step": 1880
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 86.0,
+      "learning_rate": 2.5638316293052245e-07,
+      "loss": 98.136376953125,
+      "step": 1890
+    },
+    {
+      "epoch": 0.6333333333333333,
+      "grad_norm": 76.0,
+      "learning_rate": 2.523473717557898e-07,
+      "loss": 98.2837158203125,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6366666666666667,
+      "grad_norm": 83.5,
+      "learning_rate": 2.4832890827163993e-07,
+      "loss": 99.101025390625,
+      "step": 1910
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 85.0,
+      "learning_rate": 2.443282440625797e-07,
+      "loss": 98.89285278320312,
+      "step": 1920
+    },
+    {
+      "epoch": 0.6433333333333333,
+      "grad_norm": 75.5,
+      "learning_rate": 2.403458486242921e-07,
+      "loss": 97.999560546875,
+      "step": 1930
+    },
+    {
+      "epoch": 0.6466666666666666,
+      "grad_norm": 91.5,
+      "learning_rate": 2.3638218930853874e-07,
+      "loss": 96.57308959960938,
+      "step": 1940
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 88.5,
+      "learning_rate": 2.3243773126831448e-07,
+      "loss": 98.37883911132812,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6533333333333333,
+      "grad_norm": 87.0,
+      "learning_rate": 2.2851293740325895e-07,
+      "loss": 97.91729125976562,
+      "step": 1960
+    },
+    {
+      "epoch": 0.6566666666666666,
+      "grad_norm": 80.0,
+      "learning_rate": 2.2460826830533416e-07,
+      "loss": 99.19988403320312,
+      "step": 1970
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 79.0,
+      "learning_rate": 2.2072418220477083e-07,
+      "loss": 97.540087890625,
+      "step": 1980
+    },
+    {
+      "epoch": 0.6633333333333333,
+      "grad_norm": 77.5,
+      "learning_rate": 2.168611349162943e-07,
+      "loss": 98.80101318359375,
+      "step": 1990
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 83.0,
+      "learning_rate": 2.1301957978563152e-07,
+      "loss": 98.22483520507812,
+      "step": 2000
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 81.0,
+      "learning_rate": 2.0919996763630974e-07,
+      "loss": 98.53840942382813,
+      "step": 2010
+    },
+    {
+      "epoch": 0.6733333333333333,
+      "grad_norm": 83.5,
+      "learning_rate": 2.0540274671675008e-07,
+      "loss": 97.65671997070312,
+      "step": 2020
+    },
+    {
+      "epoch": 0.6766666666666666,
+      "grad_norm": 79.0,
+      "learning_rate": 2.0162836264766344e-07,
+      "loss": 97.53788452148437,
+      "step": 2030
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 77.5,
+      "learning_rate": 1.9787725836975495e-07,
+      "loss": 98.66753540039062,
+      "step": 2040
+    },
+    {
+      "epoch": 0.6833333333333333,
+      "grad_norm": 78.5,
+      "learning_rate": 1.9414987409174327e-07,
+      "loss": 98.99920043945312,
+      "step": 2050
+    },
+    {
+      "epoch": 0.6866666666666666,
+      "grad_norm": 76.5,
+      "learning_rate": 1.9044664723869982e-07,
+      "loss": 98.08507080078125,
+      "step": 2060
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 83.5,
+      "learning_rate": 1.867680124007152e-07,
+      "loss": 98.79881591796875,
+      "step": 2070
+    },
+    {
+      "epoch": 0.6933333333333334,
+      "grad_norm": 77.0,
+      "learning_rate": 1.8311440128189757e-07,
+      "loss": 98.93706665039062,
+      "step": 2080
+    },
+    {
+      "epoch": 0.6966666666666667,
+      "grad_norm": 82.0,
+      "learning_rate": 1.794862426497112e-07,
+      "loss": 98.5080810546875,
+      "step": 2090
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 82.0,
+      "learning_rate": 1.7588396228465795e-07,
+      "loss": 98.43944091796875,
+      "step": 2100
+    },
+    {
+      "epoch": 0.7033333333333334,
+      "grad_norm": 91.5,
+      "learning_rate": 1.723079829303106e-07,
+      "loss": 98.53809204101563,
+      "step": 2110
+    },
+    {
+      "epoch": 0.7066666666666667,
+      "grad_norm": 86.5,
+      "learning_rate": 1.6875872424370105e-07,
+      "loss": 97.679150390625,
+      "step": 2120
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 81.0,
+      "learning_rate": 1.6523660274607302e-07,
+      "loss": 98.63049926757813,
+      "step": 2130
+    },
+    {
+      "epoch": 0.7133333333333334,
+      "grad_norm": 80.5,
+      "learning_rate": 1.6174203177400068e-07,
+      "loss": 98.349755859375,
+      "step": 2140
+    },
+    {
+      "epoch": 0.7166666666666667,
+      "grad_norm": 77.5,
+      "learning_rate": 1.5827542143088156e-07,
+      "loss": 97.28834228515625,
+      "step": 2150
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 82.0,
+      "learning_rate": 1.548371785388095e-07,
+      "loss": 98.768017578125,
+      "step": 2160
+    },
+    {
+      "epoch": 0.7233333333333334,
+      "grad_norm": 79.0,
+      "learning_rate": 1.5142770659083234e-07,
+      "loss": 100.50790405273438,
+      "step": 2170
+    },
+    {
+      "epoch": 0.7266666666666667,
+      "grad_norm": 90.0,
+      "learning_rate": 1.4804740570360008e-07,
+      "loss": 96.94910278320313,
+      "step": 2180
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 81.0,
+      "learning_rate": 1.4469667257040942e-07,
+      "loss": 98.48642578125,
+      "step": 2190
+    },
+    {
+      "epoch": 0.7333333333333333,
+      "grad_norm": 77.0,
+      "learning_rate": 1.4137590041464967e-07,
+      "loss": 97.86531982421874,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7366666666666667,
+      "grad_norm": 82.5,
+      "learning_rate": 1.38085478943657e-07,
+      "loss": 98.24257202148438,
+      "step": 2210
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 84.5,
+      "learning_rate": 1.3482579430298002e-07,
+      "loss": 97.57532958984375,
+      "step": 2220
+    },
+    {
+      "epoch": 0.7433333333333333,
+      "grad_norm": 81.5,
+      "learning_rate": 1.315972290310641e-07,
+      "loss": 97.32590942382812,
+      "step": 2230
+    },
+    {
+      "epoch": 0.7466666666666667,
+      "grad_norm": 86.0,
+      "learning_rate": 1.2840016201435847e-07,
+      "loss": 98.21510620117188,
+      "step": 2240
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 74.0,
+      "learning_rate": 1.2523496844285263e-07,
+      "loss": 99.79307250976562,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7533333333333333,
+      "grad_norm": 83.0,
+      "learning_rate": 1.2210201976604607e-07,
+      "loss": 98.51522216796874,
+      "step": 2260
+    },
+    {
+      "epoch": 0.7566666666666667,
+      "grad_norm": 79.0,
+      "learning_rate": 1.1900168364935676e-07,
+      "loss": 99.13048095703125,
+      "step": 2270
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 86.5,
+      "learning_rate": 1.1593432393097406e-07,
+      "loss": 98.39559936523438,
+      "step": 2280
+    },
+    {
+      "epoch": 0.7633333333333333,
+      "grad_norm": 78.0,
+      "learning_rate": 1.1290030057916103e-07,
+      "loss": 96.49028930664062,
+      "step": 2290
+    },
+    {
+      "epoch": 0.7666666666666667,
+      "grad_norm": 86.5,
+      "learning_rate": 1.098999696500102e-07,
+      "loss": 97.28034057617188,
+      "step": 2300
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 78.5,
+      "learning_rate": 1.0693368324565888e-07,
+      "loss": 98.45610961914062,
+      "step": 2310
+    },
+    {
+      "epoch": 0.7733333333333333,
+      "grad_norm": 88.5,
+      "learning_rate": 1.0400178947296825e-07,
+      "loss": 99.782568359375,
+      "step": 2320
+    },
+    {
+      "epoch": 0.7766666666666666,
+      "grad_norm": 80.0,
+      "learning_rate": 1.0110463240267208e-07,
+      "loss": 97.72544555664062,
+      "step": 2330
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 79.0,
+      "learning_rate": 9.824255202899791e-08,
+      "loss": 98.3702880859375,
+      "step": 2340
+    },
+    {
+      "epoch": 0.7833333333333333,
+      "grad_norm": 83.0,
+      "learning_rate": 9.541588422976747e-08,
+      "loss": 100.18757934570313,
+      "step": 2350
+    },
+    {
+      "epoch": 0.7866666666666666,
+      "grad_norm": 84.5,
+      "learning_rate": 9.26249607269796e-08,
+      "loss": 101.06594848632812,
+      "step": 2360
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 72.0,
+      "learning_rate": 8.987010904788177e-08,
+      "loss": 98.80386962890626,
+      "step": 2370
+    },
+    {
+      "epoch": 0.7933333333333333,
+      "grad_norm": 89.0,
+      "learning_rate": 8.715165248653295e-08,
+      "loss": 98.323046875,
+      "step": 2380
+    },
+    {
+      "epoch": 0.7966666666666666,
+      "grad_norm": 83.5,
+      "learning_rate": 8.446991006586373e-08,
+      "loss": 97.82243041992187,
+      "step": 2390
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 77.0,
+      "learning_rate": 8.182519650023719e-08,
+      "loss": 98.9300048828125,
+      "step": 2400
+    },
+    {
+      "epoch": 0.8033333333333333,
+      "grad_norm": 85.5,
+      "learning_rate": 7.921782215851642e-08,
+      "loss": 97.94855346679688,
+      "step": 2410
+    },
+    {
+      "epoch": 0.8066666666666666,
+      "grad_norm": 82.0,
+      "learning_rate": 7.664809302764097e-08,
+      "loss": 98.41607055664062,
+      "step": 2420
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 81.5,
+      "learning_rate": 7.411631067671802e-08,
+      "loss": 97.95900268554688,
+      "step": 2430
+    },
+    {
+      "epoch": 0.8133333333333334,
+      "grad_norm": 83.5,
+      "learning_rate": 7.162277222163156e-08,
+      "loss": 99.18399047851562,
+      "step": 2440
+    },
+    {
+      "epoch": 0.8166666666666667,
+      "grad_norm": 98.0,
+      "learning_rate": 6.916777029017522e-08,
+      "loss": 98.34249267578124,
+      "step": 2450
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 81.0,
+      "learning_rate": 6.675159298771067e-08,
+      "loss": 99.41766967773438,
+      "step": 2460
+    },
+    {
+      "epoch": 0.8233333333333334,
+      "grad_norm": 87.5,
+      "learning_rate": 6.437452386335707e-08,
+      "loss": 98.87661743164062,
+      "step": 2470
+    },
+    {
+      "epoch": 0.8266666666666667,
+      "grad_norm": 79.5,
+      "learning_rate": 6.203684187671542e-08,
+      "loss": 99.74188232421875,
+      "step": 2480
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 85.0,
+      "learning_rate": 5.973882136513166e-08,
+      "loss": 98.54530029296875,
+      "step": 2490
+    },
+    {
+      "epoch": 0.8333333333333334,
+      "grad_norm": 82.5,
+      "learning_rate": 5.748073201150183e-08,
+      "loss": 99.4644287109375,
+      "step": 2500
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 3000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 9223372036854775807,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.030955728896e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-2500/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:50806f9bca90fe0b02ef16ab9659f09b9a467f1eb053112098cedbe0578b9473
+size 5137

checkpoint-3000/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "architectures": [
+    "MiniMythosHybridForCausalLM"
+  ],
+  "auto_map": {
+    "AutoConfig": "modeling_minimythos_hybrid.MiniMythosHybridConfig",
+    "AutoModelForCausalLM": "modeling_minimythos_hybrid.MiniMythosHybridForCausalLM"
+  },
+  "block_size": 1024,
+  "bos_token_id": 2,
+  "dropout": 0.0,
+  "dtype": "bfloat16",
+  "eos_token_id": 3,
+  "hidden_size": 1024,
+  "input_scale": 0.2,
+  "is_decoder": true,
+  "leak_rate": 0.25,
+  "max_position_embeddings": 1024,
+  "mlp_mult": 4,
+  "model_type": "minimythos_hybrid",
+  "n_attn_layers": 4,
+  "n_embd": 1024,
+  "n_head": 16,
+  "n_reservoir_layers": 4,
+  "num_attention_heads": 16,
+  "num_hidden_layers": 8,
+  "pad_token_id": 0,
+  "reservoir_scale": 0.9,
+  "tie_word_embeddings": false,
+  "transformers_version": "5.0.0",
+  "use_cache": false,
+  "vocab_size": 8192
+}

checkpoint-3000/generation_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "bos_token_id": 2,
+  "do_sample": true,
+  "eos_token_id": 3,
+  "max_new_tokens": 256,
+  "pad_token_id": 0,
+  "temperature": 0.7,
+  "top_k": 50,
+  "transformers_version": "5.0.0"
+}

checkpoint-3000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2431c0e0e022b86e142a82f790295cb3ae6411acdb908a6b402a20e25ed1f09f
+size 184580440

checkpoint-3000/modeling_minimythos_hybrid.py ADDED Viewed

	@@ -0,0 +1,337 @@

+# coding=utf-8
+"""
+MiniMythos Hybrid Reservoir LM
+Hugging Face remote-code compatible modeling file.
+Load with:
+    from transformers import AutoTokenizer, AutoModelForCausalLM
+    model_id = "summerstars/EN-summer"
+    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
+"""
+import math
+from typing import Optional, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PretrainedConfig, PreTrainedModel
+from transformers.modeling_outputs import CausalLMOutputWithPast
+class MiniMythosHybridConfig(PretrainedConfig):
+    """Configuration for the MiniMythos hybrid reservoir/attention language model.
+
+    Stores the hyperparameters read by the modeling classes in this module and
+    mirrors several of them under the generic names ``hidden_size``,
+    ``num_hidden_layers``, ``num_attention_heads`` and
+    ``max_position_embeddings``.
+
+    Args:
+        vocab_size: Number of rows of the token embedding and output head.
+        block_size: Maximum sequence length. The model truncates longer inputs
+            to their last ``block_size`` positions and the RoPE tables are
+            precomputed for this many positions.
+        n_embd: Width of the hidden representation.
+        n_reservoir_layers: Number of ``ReservoirBlock`` layers.
+        n_attn_layers: Number of ``TransformerBlock`` layers.
+        n_head: Number of attention heads; ``n_embd`` must be divisible by it.
+        mlp_mult: Multiplier giving the SwiGLU hidden width from ``n_embd``.
+        dropout: Dropout probability used by every dropout layer in the model.
+        leak_rate: Weight of the unchanged input in each reservoir block update.
+        reservoir_scale: Stored on the config; not read by any code visible in
+            this module.
+        input_scale: Stored on the config; not read by any code visible in
+            this module.
+        pad_token_id: Forwarded to ``PretrainedConfig``.
+        bos_token_id: Forwarded to ``PretrainedConfig``.
+        eos_token_id: Forwarded to ``PretrainedConfig``.
+        tie_word_embeddings: Forwarded to ``PretrainedConfig``.
+        **kwargs: Any further options, forwarded to ``PretrainedConfig``.
+    """
+    model_type = "minimythos_hybrid"
+    def __init__(
+        self,
+        vocab_size: int = 8192,
+        block_size: int = 512,
+        n_embd: int = 1024,
+        n_reservoir_layers: int = 6,
+        n_attn_layers: int = 4,
+        n_head: int = 16,
+        mlp_mult: int = 4,
+        dropout: float = 0.0,
+        leak_rate: float = 0.25,
+        reservoir_scale: float = 0.90,
+        input_scale: float = 0.20,
+        pad_token_id: int = 0,
+        bos_token_id: int = 2,
+        eos_token_id: int = 3,
+        tie_word_embeddings: bool = False,
+        **kwargs,
+    ):
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        # Model-specific hyperparameters.
+        self.vocab_size = vocab_size
+        self.block_size = block_size
+        self.n_embd = n_embd
+        self.n_reservoir_layers = n_reservoir_layers
+        self.n_attn_layers = n_attn_layers
+        self.n_head = n_head
+        self.mlp_mult = mlp_mult
+        self.dropout = dropout
+        self.leak_rate = leak_rate
+        self.reservoir_scale = reservoir_scale
+        self.input_scale = input_scale
+        # Generic aliases derived from the values above. They are assigned after
+        # super().__init__, so same-named entries arriving through **kwargs are
+        # overwritten here.
+        self.hidden_size = n_embd
+        self.num_hidden_layers = n_reservoir_layers + n_attn_layers
+        self.num_attention_heads = n_head
+        self.max_position_embeddings = block_size
+        # Always a decoder-only model, regardless of what was passed in.
+        self.is_decoder = True
+        self.is_encoder_decoder = False
+class RMSNorm(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(dim))
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # Compute RMSNorm in fp32 for numerical stability, then cast back.
+        orig_dtype = x.dtype
+        x_float = x.float()
+        var = x_float.pow(2).mean(-1, keepdim=True)
+        x_norm = x_float * torch.rsqrt(var + self.eps)
+        return (self.weight.float() * x_norm).to(orig_dtype)
+class ReservoirBlock(nn.Module):
+    def __init__(self, config: MiniMythosHybridConfig):
+        super().__init__()
+        self.leak_rate = config.leak_rate
+        self.drop = nn.Dropout(config.dropout)
+        self.in_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
+        self.reservoir = nn.Linear(config.n_embd, config.n_embd, bias=False)
+        self.norm = RMSNorm(config.n_embd)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        z = torch.tanh(self.in_proj(x) + self.reservoir(x))
+        x = self.leak_rate * x + (1.0 - self.leak_rate) * z
+        x = self.norm(x)
+        return self.drop(x)
+class CausalSelfAttention(nn.Module):
+    def __init__(self, config: MiniMythosHybridConfig):
+        super().__init__()
+        if config.n_embd % config.n_head != 0:
+            raise ValueError("n_embd must be divisible by n_head")
+        self.n_head = config.n_head
+        self.head_dim = config.n_embd // config.n_head
+        if self.head_dim % 2 != 0:
+            raise ValueError("head_dim must be even for RoPE")
+        self.qkv = nn.Linear(config.n_embd, 3 * config.n_embd, bias=False)
+        self.proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
+        self.dropout = config.dropout
+        self.resid_drop = nn.Dropout(config.dropout)
+        inv_freq = 1.0 / (
+            10000 ** (torch.arange(0, self.head_dim, 2, dtype=torch.float32) / self.head_dim)
+        )
+        t = torch.arange(config.block_size, dtype=torch.float32)
+        freqs = torch.outer(t, inv_freq)
+        self.register_buffer("rope_cos", freqs.cos()[None, :, None, :], persistent=False)
+        self.register_buffer("rope_sin", freqs.sin()[None, :, None, :], persistent=False)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        batch_size, seq_len, channels = x.shape
+        q, k, v = self.qkv(x).reshape(
+            batch_size, seq_len, 3, self.n_head, self.head_dim
+        ).unbind(dim=2)
+        cos = self.rope_cos[:, :seq_len].to(dtype=q.dtype, device=q.device)
+        sin = self.rope_sin[:, :seq_len].to(dtype=q.dtype, device=q.device)
+        def apply_rope(u: torch.Tensor) -> torch.Tensor:
+            u_e = u[..., 0::2]
+            u_o = u[..., 1::2]
+            return torch.stack(
+                (u_e * cos - u_o * sin, u_e * sin + u_o * cos), dim=-1
+            ).flatten(-2)
+        q = apply_rope(q).transpose(1, 2)
+        k = apply_rope(k).transpose(1, 2)
+        v = v.transpose(1, 2)
+        y = F.scaled_dot_product_attention(
+            q,
+            k,
+            v,
+            dropout_p=self.dropout if self.training else 0.0,
+            is_causal=True,
+        )
+        y = y.transpose(1, 2).contiguous().reshape(batch_size, seq_len, channels)
+        return self.resid_drop(self.proj(y))
+class SwiGLU(nn.Module):
+    def __init__(self, config: MiniMythosHybridConfig):
+        super().__init__()
+        hidden = config.mlp_mult * config.n_embd
+        self.w1 = nn.Linear(config.n_embd, hidden, bias=False)
+        self.w2 = nn.Linear(config.n_embd, hidden, bias=False)
+        self.w3 = nn.Linear(hidden, config.n_embd, bias=False)
+        self.drop = nn.Dropout(config.dropout)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.drop(self.w3(F.silu(self.w1(x)) * self.w2(x)))
+class TransformerBlock(nn.Module):
+    def __init__(self, config: MiniMythosHybridConfig):
+        super().__init__()
+        self.norm1 = RMSNorm(config.n_embd)
+        self.attn = CausalSelfAttention(config)
+        self.norm2 = RMSNorm(config.n_embd)
+        self.mlp = SwiGLU(config)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x + self.attn(self.norm1(x))
+        x = x + self.mlp(self.norm2(x))
+        return x
+class MiniMythosHybridForCausalLM(PreTrainedModel):
+    """Causal language model built from reservoir blocks followed by attention blocks.
+
+    The forward pass embeds the tokens, runs every ``ReservoirBlock``, then
+    every ``TransformerBlock``, applies a final ``RMSNorm`` and projects to
+    vocabulary logits with ``lm_head``. No key/value cache is implemented:
+    ``past_key_values`` is always returned as ``None``.
+    """
+    config_class = MiniMythosHybridConfig
+    base_model_prefix = "model"
+    main_input_name = "input_ids"
+    supports_gradient_checkpointing = False
+    # NOTE(review): the two attributes below are presumably read by the
+    # transformers base class for weight tying; their exact semantics are not
+    # visible in this file. Both are mutable objects shared at class level.
+    _tied_weights_keys = []
+    all_tied_weights_keys = {}
+    def __init__(self, config: MiniMythosHybridConfig):
+        super().__init__(config)
+        self.config = config
+        self.token_emb = nn.Embedding(config.vocab_size, config.n_embd)
+        self.drop = nn.Dropout(config.dropout)
+        self.reservoir_layers = nn.ModuleList(
+            [ReservoirBlock(config) for _ in range(config.n_reservoir_layers)]
+        )
+        self.attn_layers = nn.ModuleList(
+            [TransformerBlock(config) for _ in range(config.n_attn_layers)]
+        )
+        self.norm = RMSNorm(config.n_embd)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+    def get_input_embeddings(self):
+        """Return the token embedding module."""
+        return self.token_emb
+    def set_input_embeddings(self, value):
+        """Replace the token embedding module with ``value``."""
+        self.token_emb = value
+    def get_output_embeddings(self):
+        """Return the output projection (``lm_head``)."""
+        return self.lm_head
+    def set_output_embeddings(self, new_embeddings):
+        """Replace the output projection with ``new_embeddings``."""
+        self.lm_head = new_embeddings
+    def tie_weights(self, *args, **kwargs):
+        """Tie ``lm_head`` to ``token_emb`` only when the config requests it.
+
+        Extra positional and keyword arguments are accepted and ignored.
+        """
+        if getattr(self.config, "tie_word_embeddings", False):
+            self._tie_or_clone_weights(self.lm_head, self.token_emb)
+        return None
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        """Compute next-token logits and, when ``labels`` is given, the LM loss.
+
+        Exactly one of ``input_ids`` and ``inputs_embeds`` must be provided.
+        Inputs longer than ``config.block_size`` are cut to their last
+        ``block_size`` positions, and ``labels`` is cut the same way.
+
+        ``attention_mask``, ``position_ids``, ``past_key_values``,
+        ``output_attentions`` and ``**kwargs`` are accepted but never read by
+        this method. ``use_cache`` only adds a ``None`` placeholder to the
+        tuple output.
+
+        Returns:
+            A ``CausalLMOutputWithPast`` (or a plain tuple when ``return_dict``
+            resolves to false) holding the optional loss, the logits and, if
+            ``output_hidden_states`` is set, the input of every layer plus the
+            final normalized hidden state.
+
+        Raises:
+            ValueError: If both or neither of ``input_ids`` and
+                ``inputs_embeds`` are supplied.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("Specify either input_ids or inputs_embeds, not both.")
+        if input_ids is None and inputs_embeds is None:
+            raise ValueError("You must specify input_ids or inputs_embeds.")
+        if inputs_embeds is None:
+            # Keep only the most recent block_size tokens (and matching labels).
+            if input_ids.shape[1] > self.config.block_size:
+                input_ids = input_ids[:, -self.config.block_size:]
+                if labels is not None:
+                    labels = labels[:, -self.config.block_size:]
+            x = self.token_emb(input_ids)
+        else:
+            # Same truncation for caller-supplied embeddings.
+            if inputs_embeds.shape[1] > self.config.block_size:
+                inputs_embeds = inputs_embeds[:, -self.config.block_size:, :]
+                if labels is not None:
+                    labels = labels[:, -self.config.block_size:]
+            x = inputs_embeds
+        hidden_states = [] if output_hidden_states else None
+        x = self.drop(x)
+        # Each recorded hidden state is the INPUT of the layer that follows it.
+        for layer in self.reservoir_layers:
+            if output_hidden_states:
+                hidden_states.append(x)
+            x = layer(x)
+        for layer in self.attn_layers:
+            if output_hidden_states:
+                hidden_states.append(x)
+            x = layer(x)
+        x = self.norm(x)
+        if output_hidden_states:
+            hidden_states.append(x)
+        logits = self.lm_head(x)
+        # Prevent generation from crashing if a checkpoint contains unstable values.
+        # This should not hide training issues, but it makes inference robust.
+        # NOTE(review): this replacement runs unconditionally, so the loss below
+        # is also computed from the sanitized logits; NaN/inf logits therefore
+        # will not surface as a NaN loss during training. Confirm that is intended.
+        logits = torch.nan_to_num(logits, nan=0.0, posinf=1e4, neginf=-1e4)
+        loss = None
+        if labels is not None:
+            # Position i predicts token i + 1: drop the last logit and the first label.
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            loss = F.cross_entropy(
+                shift_logits.view(-1, shift_logits.size(-1)),
+                shift_labels.view(-1),
+                ignore_index=-100,
+            )
+        if not return_dict:
+            # Tuple layout: (loss?, logits, None if use_cache, hidden_states?).
+            output = (logits,)
+            if use_cache:
+                output = output + (None,)
+            if output_hidden_states:
+                output = output + (tuple(hidden_states),)
+            return ((loss,) + output) if loss is not None else output
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=None,
+            hidden_states=tuple(hidden_states) if output_hidden_states else None,
+            attentions=None,
+        )
+    def prepare_inputs_for_generation(
+        self,
+        input_ids: torch.LongTensor,
+        past_key_values=None,
+        attention_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs,
+    ):
+        """Build the keyword arguments passed to ``forward`` for one generation step.
+
+        The returned dict always sets ``past_key_values`` to ``None`` and
+        ``use_cache`` to ``False``. ``input_ids`` (and ``attention_mask``) are
+        cut to the last ``config.block_size`` positions when longer.
+        """
+        if input_ids is not None and input_ids.shape[1] > self.config.block_size:
+            input_ids = input_ids[:, -self.config.block_size:]
+            if attention_mask is not None:
+                attention_mask = attention_mask[:, -self.config.block_size:]
+        # NOTE(review): inputs_embeds is forwarded only when input_ids has exactly
+        # one column, and in that case input_ids itself is not forwarded; in every
+        # other case inputs_embeds is dropped. Confirm this matches how the
+        # generation loop supplies prompt embeddings and generated ids.
+        if inputs_embeds is not None and input_ids is not None and input_ids.shape[1] == 1:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+        model_inputs.update(
+            {
+                "past_key_values": None,
+                "use_cache": False,
+                "attention_mask": attention_mask,
+            }
+        )
+        return model_inputs
+    @staticmethod
+    def _reorder_cache(past_key_values, beam_idx):
+        """Return ``past_key_values`` unchanged; ``beam_idx`` is not used."""
+        return past_key_values

checkpoint-3000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b9174dc75986413df139d995f7c1b756d10533962c285f88cab13a6972bb6812
+size 369189643

checkpoint-3000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:61c19bab1174704a4a4441475683bf1270277af15d2e2c95e964789128e482c4
+size 14645

checkpoint-3000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:98d5502195c30fe6686ed1cc42c2a4d34fff72792b752a8e6c5d88f3090a3f2a
+size 1465

checkpoint-3000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2134 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 3000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0033333333333333335,
+      "grad_norm": 85.0,
+      "learning_rate": 7.2e-08,
+      "loss": 98.92244262695313,
+      "step": 10
+    },
+    {
+      "epoch": 0.006666666666666667,
+      "grad_norm": 93.0,
+      "learning_rate": 1.5199999999999998e-07,
+      "loss": 98.39124145507813,
+      "step": 20
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 90.0,
+      "learning_rate": 2.3199999999999999e-07,
+      "loss": 98.9702392578125,
+      "step": 30
+    },
+    {
+      "epoch": 0.013333333333333334,
+      "grad_norm": 90.0,
+      "learning_rate": 3.12e-07,
+      "loss": 99.00128173828125,
+      "step": 40
+    },
+    {
+      "epoch": 0.016666666666666666,
+      "grad_norm": 86.0,
+      "learning_rate": 3.9199999999999996e-07,
+      "loss": 98.24757690429688,
+      "step": 50
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 86.0,
+      "learning_rate": 4.7199999999999994e-07,
+      "loss": 100.627099609375,
+      "step": 60
+    },
+    {
+      "epoch": 0.023333333333333334,
+      "grad_norm": 88.5,
+      "learning_rate": 5.52e-07,
+      "loss": 99.09172973632812,
+      "step": 70
+    },
+    {
+      "epoch": 0.02666666666666667,
+      "grad_norm": 84.5,
+      "learning_rate": 6.32e-07,
+      "loss": 99.2964111328125,
+      "step": 80
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 82.5,
+      "learning_rate": 7.12e-07,
+      "loss": 98.79883422851563,
+      "step": 90
+    },
+    {
+      "epoch": 0.03333333333333333,
+      "grad_norm": 83.5,
+      "learning_rate": 7.92e-07,
+      "loss": 97.59891967773437,
+      "step": 100
+    },
+    {
+      "epoch": 0.03666666666666667,
+      "grad_norm": 84.5,
+      "learning_rate": 7.999809885464028e-07,
+      "loss": 98.88760375976562,
+      "step": 110
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 82.0,
+      "learning_rate": 7.999152722615145e-07,
+      "loss": 100.03501586914062,
+      "step": 120
+    },
+    {
+      "epoch": 0.043333333333333335,
+      "grad_norm": 81.5,
+      "learning_rate": 7.998026241462926e-07,
+      "loss": 99.05869140625,
+      "step": 130
+    },
+    {
+      "epoch": 0.04666666666666667,
+      "grad_norm": 84.5,
+      "learning_rate": 7.996430574204927e-07,
+      "loss": 98.06262817382813,
+      "step": 140
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 91.0,
+      "learning_rate": 7.994365908099776e-07,
+      "loss": 99.91973876953125,
+      "step": 150
+    },
+    {
+      "epoch": 0.05333333333333334,
+      "grad_norm": 88.5,
+      "learning_rate": 7.991832485445195e-07,
+      "loss": 99.073046875,
+      "step": 160
+    },
+    {
+      "epoch": 0.056666666666666664,
+      "grad_norm": 84.5,
+      "learning_rate": 7.988830603549564e-07,
+      "loss": 98.0226318359375,
+      "step": 170
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 90.5,
+      "learning_rate": 7.985360614697036e-07,
+      "loss": 99.20685424804688,
+      "step": 180
+    },
+    {
+      "epoch": 0.06333333333333334,
+      "grad_norm": 85.5,
+      "learning_rate": 7.981422926106186e-07,
+      "loss": 98.71968383789063,
+      "step": 190
+    },
+    {
+      "epoch": 0.06666666666666667,
+      "grad_norm": 82.5,
+      "learning_rate": 7.977017999882226e-07,
+      "loss": 99.76533203125,
+      "step": 200
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 81.0,
+      "learning_rate": 7.972146352962785e-07,
+      "loss": 98.85039672851562,
+      "step": 210
+    },
+    {
+      "epoch": 0.07333333333333333,
+      "grad_norm": 79.5,
+      "learning_rate": 7.966808557057225e-07,
+      "loss": 96.81197509765624,
+      "step": 220
+    },
+    {
+      "epoch": 0.07666666666666666,
+      "grad_norm": 94.5,
+      "learning_rate": 7.961005238579563e-07,
+      "loss": 98.03938598632813,
+      "step": 230
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 74.5,
+      "learning_rate": 7.954737078574952e-07,
+      "loss": 97.90469970703126,
+      "step": 240
+    },
+    {
+      "epoch": 0.08333333333333333,
+      "grad_norm": 87.0,
+      "learning_rate": 7.948004812639763e-07,
+      "loss": 99.83749389648438,
+      "step": 250
+    },
+    {
+      "epoch": 0.08666666666666667,
+      "grad_norm": 87.5,
+      "learning_rate": 7.940809230835248e-07,
+      "loss": 99.23988647460938,
+      "step": 260
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 86.5,
+      "learning_rate": 7.933151177594838e-07,
+      "loss": 99.12615966796875,
+      "step": 270
+    },
+    {
+      "epoch": 0.09333333333333334,
+      "grad_norm": 86.0,
+      "learning_rate": 7.925031551625037e-07,
+      "loss": 99.5418212890625,
+      "step": 280
+    },
+    {
+      "epoch": 0.09666666666666666,
+      "grad_norm": 80.5,
+      "learning_rate": 7.916451305799951e-07,
+      "loss": 98.88993530273437,
+      "step": 290
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 81.5,
+      "learning_rate": 7.907411447049468e-07,
+      "loss": 98.3482177734375,
+      "step": 300
+    },
+    {
+      "epoch": 0.10333333333333333,
+      "grad_norm": 82.5,
+      "learning_rate": 7.897913036241098e-07,
+      "loss": 98.17821655273437,
+      "step": 310
+    },
+    {
+      "epoch": 0.10666666666666667,
+      "grad_norm": 78.5,
+      "learning_rate": 7.88795718805546e-07,
+      "loss": 98.89118041992188,
+      "step": 320
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 83.0,
+      "learning_rate": 7.87754507085548e-07,
+      "loss": 98.15654907226562,
+      "step": 330
+    },
+    {
+      "epoch": 0.11333333333333333,
+      "grad_norm": 84.5,
+      "learning_rate": 7.86667790654928e-07,
+      "loss": 97.81229858398437,
+      "step": 340
+    },
+    {
+      "epoch": 0.11666666666666667,
+      "grad_norm": 84.0,
+      "learning_rate": 7.85535697044677e-07,
+      "loss": 99.2128662109375,
+      "step": 350
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 83.5,
+      "learning_rate": 7.843583591109998e-07,
+      "loss": 98.5178955078125,
+      "step": 360
+    },
+    {
+      "epoch": 0.12333333333333334,
+      "grad_norm": 86.5,
+      "learning_rate": 7.83135915019723e-07,
+      "loss": 98.7326416015625,
+      "step": 370
+    },
+    {
+      "epoch": 0.12666666666666668,
+      "grad_norm": 83.0,
+      "learning_rate": 7.818685082300806e-07,
+      "loss": 98.09605712890625,
+      "step": 380
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 82.0,
+      "learning_rate": 7.805562874778789e-07,
+      "loss": 99.17857666015625,
+      "step": 390
+    },
+    {
+      "epoch": 0.13333333333333333,
+      "grad_norm": 78.5,
+      "learning_rate": 7.791994067580411e-07,
+      "loss": 97.64859619140626,
+      "step": 400
+    },
+    {
+      "epoch": 0.13666666666666666,
+      "grad_norm": 84.5,
+      "learning_rate": 7.77798025306536e-07,
+      "loss": 96.37807006835938,
+      "step": 410
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 83.5,
+      "learning_rate": 7.763523075816902e-07,
+      "loss": 99.2608642578125,
+      "step": 420
+    },
+    {
+      "epoch": 0.14333333333333334,
+      "grad_norm": 77.0,
+      "learning_rate": 7.748624232448886e-07,
+      "loss": 99.56741333007812,
+      "step": 430
+    },
+    {
+      "epoch": 0.14666666666666667,
+      "grad_norm": 78.0,
+      "learning_rate": 7.733285471406642e-07,
+      "loss": 99.81188354492187,
+      "step": 440
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 82.5,
+      "learning_rate": 7.717508592761785e-07,
+      "loss": 98.52041015625,
+      "step": 450
+    },
+    {
+      "epoch": 0.15333333333333332,
+      "grad_norm": 82.0,
+      "learning_rate": 7.701295448000974e-07,
+      "loss": 99.01697387695313,
+      "step": 460
+    },
+    {
+      "epoch": 0.15666666666666668,
+      "grad_norm": 79.5,
+      "learning_rate": 7.684647939808636e-07,
+      "loss": 98.38013916015625,
+      "step": 470
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 78.5,
+      "learning_rate": 7.667568021843666e-07,
+      "loss": 97.33247680664063,
+      "step": 480
+    },
+    {
+      "epoch": 0.16333333333333333,
+      "grad_norm": 82.5,
+      "learning_rate": 7.650057698510164e-07,
+      "loss": 97.90450439453124,
+      "step": 490
+    },
+    {
+      "epoch": 0.16666666666666666,
+      "grad_norm": 84.5,
+      "learning_rate": 7.632119024722212e-07,
+      "loss": 98.53453369140625,
+      "step": 500
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 91.0,
+      "learning_rate": 7.613754105662717e-07,
+      "loss": 98.44060668945312,
+      "step": 510
+    },
+    {
+      "epoch": 0.17333333333333334,
+      "grad_norm": 83.5,
+      "learning_rate": 7.594965096536353e-07,
+      "loss": 98.7102294921875,
+      "step": 520
+    },
+    {
+      "epoch": 0.17666666666666667,
+      "grad_norm": 78.5,
+      "learning_rate": 7.575754202316649e-07,
+      "loss": 97.73778076171875,
+      "step": 530
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 83.5,
+      "learning_rate": 7.556123677487218e-07,
+      "loss": 99.01414184570312,
+      "step": 540
+    },
+    {
+      "epoch": 0.18333333333333332,
+      "grad_norm": 80.0,
+      "learning_rate": 7.536075825777187e-07,
+      "loss": 100.30809936523437,
+      "step": 550
+    },
+    {
+      "epoch": 0.18666666666666668,
+      "grad_norm": 86.5,
+      "learning_rate": 7.515612999890841e-07,
+      "loss": 99.7580322265625,
+      "step": 560
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 84.5,
+      "learning_rate": 7.494737601231523e-07,
+      "loss": 98.83981323242188,
+      "step": 570
+    },
+    {
+      "epoch": 0.19333333333333333,
+      "grad_norm": 81.5,
+      "learning_rate": 7.473452079619826e-07,
+      "loss": 98.02926635742188,
+      "step": 580
+    },
+    {
+      "epoch": 0.19666666666666666,
+      "grad_norm": 83.5,
+      "learning_rate": 7.451758933006086e-07,
+      "loss": 98.2523681640625,
+      "step": 590
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 75.5,
+      "learning_rate": 7.429660707177239e-07,
+      "loss": 97.91044311523437,
+      "step": 600
+    },
+    {
+      "epoch": 0.20333333333333334,
+      "grad_norm": 80.0,
+      "learning_rate": 7.407159995458066e-07,
+      "loss": 98.97639770507813,
+      "step": 610
+    },
+    {
+      "epoch": 0.20666666666666667,
+      "grad_norm": 80.0,
+      "learning_rate": 7.384259438406848e-07,
+      "loss": 96.91513671875,
+      "step": 620
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 75.0,
+      "learning_rate": 7.360961723505495e-07,
+      "loss": 98.74181518554687,
+      "step": 630
+    },
+    {
+      "epoch": 0.21333333333333335,
+      "grad_norm": 82.0,
+      "learning_rate": 7.337269584844142e-07,
+      "loss": 98.77709350585937,
+      "step": 640
+    },
+    {
+      "epoch": 0.21666666666666667,
+      "grad_norm": 82.0,
+      "learning_rate": 7.313185802800312e-07,
+      "loss": 98.27448120117188,
+      "step": 650
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 85.0,
+      "learning_rate": 7.288713203712605e-07,
+      "loss": 98.03839111328125,
+      "step": 660
+    },
+    {
+      "epoch": 0.22333333333333333,
+      "grad_norm": 91.5,
+      "learning_rate": 7.263854659549032e-07,
+      "loss": 97.03794555664062,
+      "step": 670
+    },
+    {
+      "epoch": 0.22666666666666666,
+      "grad_norm": 78.0,
+      "learning_rate": 7.23861308756997e-07,
+      "loss": 98.09403686523437,
+      "step": 680
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 91.0,
+      "learning_rate": 7.212991449985802e-07,
+      "loss": 98.54012451171874,
+      "step": 690
+    },
+    {
+      "epoch": 0.23333333333333334,
+      "grad_norm": 85.0,
+      "learning_rate": 7.186992753609302e-07,
+      "loss": 97.112890625,
+      "step": 700
+    },
+    {
+      "epoch": 0.23666666666666666,
+      "grad_norm": 83.0,
+      "learning_rate": 7.160620049502761e-07,
+      "loss": 97.43547973632812,
+      "step": 710
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 81.0,
+      "learning_rate": 7.133876432619936e-07,
+      "loss": 97.59598388671876,
+      "step": 720
+    },
+    {
+      "epoch": 0.24333333333333335,
+      "grad_norm": 82.0,
+      "learning_rate": 7.106765041442847e-07,
+      "loss": 98.765087890625,
+      "step": 730
+    },
+    {
+      "epoch": 0.24666666666666667,
+      "grad_norm": 82.5,
+      "learning_rate": 7.079289057613449e-07,
+      "loss": 98.65599365234375,
+      "step": 740
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 85.5,
+      "learning_rate": 7.051451705560269e-07,
+      "loss": 100.546728515625,
+      "step": 750
+    },
+    {
+      "epoch": 0.25333333333333335,
+      "grad_norm": 81.5,
+      "learning_rate": 7.023256252119996e-07,
+      "loss": 99.18995361328125,
+      "step": 760
+    },
+    {
+      "epoch": 0.25666666666666665,
+      "grad_norm": 83.5,
+      "learning_rate": 6.994706006154102e-07,
+      "loss": 98.26092529296875,
+      "step": 770
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 77.5,
+      "learning_rate": 6.965804318160538e-07,
+      "loss": 100.81718139648437,
+      "step": 780
+    },
+    {
+      "epoch": 0.2633333333333333,
+      "grad_norm": 106.0,
+      "learning_rate": 6.936554579880531e-07,
+      "loss": 99.88375244140624,
+      "step": 790
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 81.0,
+      "learning_rate": 6.906960223900558e-07,
+      "loss": 98.47666015625,
+      "step": 800
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 79.0,
+      "learning_rate": 6.877024723249506e-07,
+      "loss": 98.1271240234375,
+      "step": 810
+    },
+    {
+      "epoch": 0.2733333333333333,
+      "grad_norm": 86.0,
+      "learning_rate": 6.846751590991103e-07,
+      "loss": 97.95358276367188,
+      "step": 820
+    },
+    {
+      "epoch": 0.27666666666666667,
+      "grad_norm": 85.5,
+      "learning_rate": 6.816144379811647e-07,
+      "loss": 97.06906127929688,
+      "step": 830
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 80.0,
+      "learning_rate": 6.785206681603071e-07,
+      "loss": 97.89542236328126,
+      "step": 840
+    },
+    {
+      "epoch": 0.2833333333333333,
+      "grad_norm": 76.5,
+      "learning_rate": 6.753942127041434e-07,
+      "loss": 98.07276611328125,
+      "step": 850
+    },
+    {
+      "epoch": 0.2866666666666667,
+      "grad_norm": 82.0,
+      "learning_rate": 6.722354385160832e-07,
+      "loss": 98.10612182617187,
+      "step": 860
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 85.0,
+      "learning_rate": 6.690447162922828e-07,
+      "loss": 97.93245239257813,
+      "step": 870
+    },
+    {
+      "epoch": 0.29333333333333333,
+      "grad_norm": 79.0,
+      "learning_rate": 6.65822420478142e-07,
+      "loss": 97.72557373046875,
+      "step": 880
+    },
+    {
+      "epoch": 0.2966666666666667,
+      "grad_norm": 80.0,
+      "learning_rate": 6.625689292243618e-07,
+      "loss": 97.9116455078125,
+      "step": 890
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 83.5,
+      "learning_rate": 6.59284624342566e-07,
+      "loss": 97.18661499023438,
+      "step": 900
+    },
+    {
+      "epoch": 0.30333333333333334,
+      "grad_norm": 82.5,
+      "learning_rate": 6.55969891260494e-07,
+      "loss": 97.79299926757812,
+      "step": 910
+    },
+    {
+      "epoch": 0.30666666666666664,
+      "grad_norm": 79.5,
+      "learning_rate": 6.526251189767701e-07,
+      "loss": 97.34827270507813,
+      "step": 920
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 80.5,
+      "learning_rate": 6.492507000152516e-07,
+      "loss": 98.7041748046875,
+      "step": 930
+    },
+    {
+      "epoch": 0.31333333333333335,
+      "grad_norm": 82.0,
+      "learning_rate": 6.458470303789652e-07,
+      "loss": 98.721240234375,
+      "step": 940
+    },
+    {
+      "epoch": 0.31666666666666665,
+      "grad_norm": 85.0,
+      "learning_rate": 6.424145095036337e-07,
+      "loss": 99.30765991210937,
+      "step": 950
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 77.5,
+      "learning_rate": 6.389535402108008e-07,
+      "loss": 98.2544921875,
+      "step": 960
+    },
+    {
+      "epoch": 0.3233333333333333,
+      "grad_norm": 85.0,
+      "learning_rate": 6.354645286605583e-07,
+      "loss": 97.76597290039062,
+      "step": 970
+    },
+    {
+      "epoch": 0.32666666666666666,
+      "grad_norm": 88.5,
+      "learning_rate": 6.31947884303881e-07,
+      "loss": 98.00664672851562,
+      "step": 980
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 80.0,
+      "learning_rate": 6.284040198345763e-07,
+      "loss": 98.64342651367187,
+      "step": 990
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 92.0,
+      "learning_rate": 6.248333511408522e-07,
+      "loss": 97.74580688476563,
+      "step": 1000
+    },
+    {
+      "epoch": 0.33666666666666667,
+      "grad_norm": 91.0,
+      "learning_rate": 6.212362972565115e-07,
+      "loss": 98.11343994140626,
+      "step": 1010
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 82.0,
+      "learning_rate": 6.176132803117761e-07,
+      "loss": 98.30454711914062,
+      "step": 1020
+    },
+    {
+      "epoch": 0.3433333333333333,
+      "grad_norm": 81.5,
+      "learning_rate": 6.13964725483748e-07,
+      "loss": 97.90970458984376,
+      "step": 1030
+    },
+    {
+      "epoch": 0.3466666666666667,
+      "grad_norm": 85.5,
+      "learning_rate": 6.102910609465133e-07,
+      "loss": 96.98532104492188,
+      "step": 1040
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 81.0,
+      "learning_rate": 6.065927178208936e-07,
+      "loss": 107.39459228515625,
+      "step": 1050
+    },
+    {
+      "epoch": 0.35333333333333333,
+      "grad_norm": 82.5,
+      "learning_rate": 6.028701301238521e-07,
+      "loss": 98.15614624023438,
+      "step": 1060
+    },
+    {
+      "epoch": 0.3566666666666667,
+      "grad_norm": 81.5,
+      "learning_rate": 5.991237347175605e-07,
+      "loss": 98.47268676757812,
+      "step": 1070
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 75.0,
+      "learning_rate": 5.953539712581301e-07,
+      "loss": 97.77119750976563,
+      "step": 1080
+    },
+    {
+      "epoch": 0.36333333333333334,
+      "grad_norm": 78.0,
+      "learning_rate": 5.915612821440172e-07,
+      "loss": 98.135302734375,
+      "step": 1090
+    },
+    {
+      "epoch": 0.36666666666666664,
+      "grad_norm": 76.5,
+      "learning_rate": 5.877461124641053e-07,
+      "loss": 97.557373046875,
+      "step": 1100
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 81.5,
+      "learning_rate": 5.839089099454721e-07,
+      "loss": 98.51465454101563,
+      "step": 1110
+    },
+    {
+      "epoch": 0.37333333333333335,
+      "grad_norm": 81.5,
+      "learning_rate": 5.800501249008462e-07,
+      "loss": 99.03858032226563,
+      "step": 1120
+    },
+    {
+      "epoch": 0.37666666666666665,
+      "grad_norm": 75.5,
+      "learning_rate": 5.761702101757618e-07,
+      "loss": 99.02989501953125,
+      "step": 1130
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 83.5,
+      "learning_rate": 5.722696210954143e-07,
+      "loss": 100.75755004882812,
+      "step": 1140
+    },
+    {
+      "epoch": 0.38333333333333336,
+      "grad_norm": 77.0,
+      "learning_rate": 5.683488154112268e-07,
+      "loss": 98.41821899414063,
+      "step": 1150
+    },
+    {
+      "epoch": 0.38666666666666666,
+      "grad_norm": 77.0,
+      "learning_rate": 5.644082532471301e-07,
+      "loss": 98.3514892578125,
+      "step": 1160
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 87.0,
+      "learning_rate": 5.60448397045566e-07,
+      "loss": 97.98330078125,
+      "step": 1170
+    },
+    {
+      "epoch": 0.3933333333333333,
+      "grad_norm": 78.0,
+      "learning_rate": 5.564697115132166e-07,
+      "loss": 99.36333618164062,
+      "step": 1180
+    },
+    {
+      "epoch": 0.39666666666666667,
+      "grad_norm": 80.0,
+      "learning_rate": 5.524726635664701e-07,
+      "loss": 98.58826293945313,
+      "step": 1190
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 82.0,
+      "learning_rate": 5.484577222766244e-07,
+      "loss": 99.26712646484376,
+      "step": 1200
+    },
+    {
+      "epoch": 0.4033333333333333,
+      "grad_norm": 78.5,
+      "learning_rate": 5.444253588148419e-07,
+      "loss": 99.81590576171875,
+      "step": 1210
+    },
+    {
+      "epoch": 0.4066666666666667,
+      "grad_norm": 89.0,
+      "learning_rate": 5.40376046396853e-07,
+      "loss": 97.74580078125,
+      "step": 1220
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 80.5,
+      "learning_rate": 5.363102602274239e-07,
+      "loss": 98.692529296875,
+      "step": 1230
+    },
+    {
+      "epoch": 0.41333333333333333,
+      "grad_norm": 80.0,
+      "learning_rate": 5.32228477444588e-07,
+      "loss": 99.05579833984375,
+      "step": 1240
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 77.5,
+      "learning_rate": 5.281311770636531e-07,
+      "loss": 98.2328369140625,
+      "step": 1250
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 79.5,
+      "learning_rate": 5.24018839920985e-07,
+      "loss": 98.72367553710937,
+      "step": 1260
+    },
+    {
+      "epoch": 0.42333333333333334,
+      "grad_norm": 80.0,
+      "learning_rate": 5.198919486175807e-07,
+      "loss": 98.29996948242187,
+      "step": 1270
+    },
+    {
+      "epoch": 0.4266666666666667,
+      "grad_norm": 108.0,
+      "learning_rate": 5.157509874624324e-07,
+      "loss": 97.95867919921875,
+      "step": 1280
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 83.0,
+      "learning_rate": 5.115964424156917e-07,
+      "loss": 97.64778442382813,
+      "step": 1290
+    },
+    {
+      "epoch": 0.43333333333333335,
+      "grad_norm": 103.5,
+      "learning_rate": 5.0742880103164e-07,
+      "loss": 97.5724365234375,
+      "step": 1300
+    },
+    {
+      "epoch": 0.43666666666666665,
+      "grad_norm": 85.0,
+      "learning_rate": 5.032485524014726e-07,
+      "loss": 97.94261474609375,
+      "step": 1310
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 81.5,
+      "learning_rate": 4.990561870958998e-07,
+      "loss": 98.76618041992188,
+      "step": 1320
+    },
+    {
+      "epoch": 0.44333333333333336,
+      "grad_norm": 78.5,
+      "learning_rate": 4.948521971075788e-07,
+      "loss": 100.03206176757813,
+      "step": 1330
+    },
+    {
+      "epoch": 0.44666666666666666,
+      "grad_norm": 76.5,
+      "learning_rate": 4.906370757933739e-07,
+      "loss": 97.71541748046874,
+      "step": 1340
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 78.0,
+      "learning_rate": 4.864113178164604e-07,
+      "loss": 98.21603393554688,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4533333333333333,
+      "grad_norm": 80.0,
+      "learning_rate": 4.821754190882729e-07,
+      "loss": 98.40943603515625,
+      "step": 1360
+    },
+    {
+      "epoch": 0.45666666666666667,
+      "grad_norm": 79.0,
+      "learning_rate": 4.779298767103083e-07,
+      "loss": 98.84269409179687,
+      "step": 1370
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 85.5,
+      "learning_rate": 4.736751889157882e-07,
+      "loss": 97.90013427734375,
+      "step": 1380
+    },
+    {
+      "epoch": 0.4633333333333333,
+      "grad_norm": 76.5,
+      "learning_rate": 4.6941185501118975e-07,
+      "loss": 96.82548828125,
+      "step": 1390
+    },
+    {
+      "epoch": 0.4666666666666667,
+      "grad_norm": 78.5,
+      "learning_rate": 4.6514037531764925e-07,
+      "loss": 98.97203979492187,
+      "step": 1400
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 90.5,
+      "learning_rate": 4.608612511122476e-07,
+      "loss": 98.75665893554688,
+      "step": 1410
+    },
+    {
+      "epoch": 0.47333333333333333,
+      "grad_norm": 76.0,
+      "learning_rate": 4.565749845691828e-07,
+      "loss": 97.86590576171875,
+      "step": 1420
+    },
+    {
+      "epoch": 0.4766666666666667,
+      "grad_norm": 78.5,
+      "learning_rate": 4.5228207870083823e-07,
+      "loss": 98.30226440429688,
+      "step": 1430
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 76.5,
+      "learning_rate": 4.479830372987511e-07,
+      "loss": 97.27890625,
+      "step": 1440
+    },
+    {
+      "epoch": 0.48333333333333334,
+      "grad_norm": 82.5,
+      "learning_rate": 4.436783648744911e-07,
+      "loss": 96.8334716796875,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4866666666666667,
+      "grad_norm": 94.5,
+      "learning_rate": 4.3936856660045317e-07,
+      "loss": 100.1866943359375,
+      "step": 1460
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 76.0,
+      "learning_rate": 4.350541482505733e-07,
+      "loss": 97.47962036132813,
+      "step": 1470
+    },
+    {
+      "epoch": 0.49333333333333335,
+      "grad_norm": 80.5,
+      "learning_rate": 4.30735616140974e-07,
+      "loss": 98.73521118164062,
+      "step": 1480
+    },
+    {
+      "epoch": 0.49666666666666665,
+      "grad_norm": 80.0,
+      "learning_rate": 4.2641347707054586e-07,
+      "loss": 97.36817016601563,
+      "step": 1490
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 81.0,
+      "learning_rate": 4.220882382614721e-07,
+      "loss": 98.93515625,
+      "step": 1500
+    },
+    {
+      "epoch": 0.5033333333333333,
+      "grad_norm": 84.5,
+      "learning_rate": 4.177604072997041e-07,
+      "loss": 99.65122680664062,
+      "step": 1510
+    },
+    {
+      "epoch": 0.5066666666666667,
+      "grad_norm": 81.5,
+      "learning_rate": 4.134304920753937e-07,
+      "loss": 99.22744140625,
+      "step": 1520
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 81.0,
+      "learning_rate": 4.090990007232907e-07,
+      "loss": 98.11915893554688,
+      "step": 1530
+    },
+    {
+      "epoch": 0.5133333333333333,
+      "grad_norm": 91.5,
+      "learning_rate": 4.0476644156310994e-07,
+      "loss": 99.190869140625,
+      "step": 1540
+    },
+    {
+      "epoch": 0.5166666666666667,
+      "grad_norm": 76.0,
+      "learning_rate": 4.0043332303987834e-07,
+      "loss": 97.55637817382812,
+      "step": 1550
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 82.5,
+      "learning_rate": 3.961001536642667e-07,
+      "loss": 98.12214965820313,
+      "step": 1560
+    },
+    {
+      "epoch": 0.5233333333333333,
+      "grad_norm": 80.5,
+      "learning_rate": 3.9176744195291366e-07,
+      "loss": 100.78977661132812,
+      "step": 1570
+    },
+    {
+      "epoch": 0.5266666666666666,
+      "grad_norm": 78.0,
+      "learning_rate": 3.874356963687487e-07,
+      "loss": 98.84946899414062,
+      "step": 1580
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 81.5,
+      "learning_rate": 3.831054252613222e-07,
+      "loss": 99.25551147460938,
+      "step": 1590
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 78.0,
+      "learning_rate": 3.787771368071479e-07,
+      "loss": 98.48034057617187,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5366666666666666,
+      "grad_norm": 78.0,
+      "learning_rate": 3.7445133895006673e-07,
+      "loss": 100.24962768554687,
+      "step": 1610
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 77.5,
+      "learning_rate": 3.7012853934163675e-07,
+      "loss": 98.97702026367188,
+      "step": 1620
+    },
+    {
+      "epoch": 0.5433333333333333,
+      "grad_norm": 80.5,
+      "learning_rate": 3.6580924528155834e-07,
+      "loss": 97.33264770507813,
+      "step": 1630
+    },
+    {
+      "epoch": 0.5466666666666666,
+      "grad_norm": 81.5,
+      "learning_rate": 3.6149396365814017e-07,
+      "loss": 98.55026245117188,
+      "step": 1640
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 94.5,
+      "learning_rate": 3.571832008888139e-07,
+      "loss": 97.8692138671875,
+      "step": 1650
+    },
+    {
+      "epoch": 0.5533333333333333,
+      "grad_norm": 79.0,
+      "learning_rate": 3.528774628607033e-07,
+      "loss": 99.7307861328125,
+      "step": 1660
+    },
+    {
+      "epoch": 0.5566666666666666,
+      "grad_norm": 76.0,
+      "learning_rate": 3.485772548712565e-07,
+      "loss": 98.52730712890624,
+      "step": 1670
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 76.5,
+      "learning_rate": 3.442830815689475e-07,
+      "loss": 98.9763427734375,
+      "step": 1680
+    },
+    {
+      "epoch": 0.5633333333333334,
+      "grad_norm": 73.0,
+      "learning_rate": 3.399954468940525e-07,
+      "loss": 97.82731323242187,
+      "step": 1690
+    },
+    {
+      "epoch": 0.5666666666666667,
+      "grad_norm": 76.5,
+      "learning_rate": 3.357148540195112e-07,
+      "loss": 98.65582885742188,
+      "step": 1700
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 87.5,
+      "learning_rate": 3.314418052918764e-07,
+      "loss": 97.91898193359376,
+      "step": 1710
+    },
+    {
+      "epoch": 0.5733333333333334,
+      "grad_norm": 76.5,
+      "learning_rate": 3.2717680217236214e-07,
+      "loss": 98.35755615234375,
+      "step": 1720
+    },
+    {
+      "epoch": 0.5766666666666667,
+      "grad_norm": 95.0,
+      "learning_rate": 3.2292034517799457e-07,
+      "loss": 98.27772827148438,
+      "step": 1730
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 79.0,
+      "learning_rate": 3.1867293382287417e-07,
+      "loss": 97.74335327148438,
+      "step": 1740
+    },
+    {
+      "epoch": 0.5833333333333334,
+      "grad_norm": 87.5,
+      "learning_rate": 3.1443506655955536e-07,
+      "loss": 97.20853881835937,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5866666666666667,
+      "grad_norm": 84.5,
+      "learning_rate": 3.1020724072055136e-07,
+      "loss": 98.31585083007812,
+      "step": 1760
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 84.0,
+      "learning_rate": 3.0598995245996964e-07,
+      "loss": 99.7463623046875,
+      "step": 1770
+    },
+    {
+      "epoch": 0.5933333333333334,
+      "grad_norm": 83.5,
+      "learning_rate": 3.017836966952859e-07,
+      "loss": 98.76014404296875,
+      "step": 1780
+    },
+    {
+      "epoch": 0.5966666666666667,
+      "grad_norm": 138.0,
+      "learning_rate": 2.9758896704926393e-07,
+      "loss": 99.64288330078125,
+      "step": 1790
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 80.5,
+      "learning_rate": 2.93406255792026e-07,
+      "loss": 98.490380859375,
+      "step": 1800
+    },
+    {
+      "epoch": 0.6033333333333334,
+      "grad_norm": 79.0,
+      "learning_rate": 2.8923605378328365e-07,
+      "loss": 97.536083984375,
+      "step": 1810
+    },
+    {
+      "epoch": 0.6066666666666667,
+      "grad_norm": 76.5,
+      "learning_rate": 2.8507885041473197e-07,
+      "loss": 98.35460815429687,
+      "step": 1820
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 80.0,
+      "learning_rate": 2.809351335526184e-07,
+      "loss": 98.06194458007812,
+      "step": 1830
+    },
+    {
+      "epoch": 0.6133333333333333,
+      "grad_norm": 79.0,
+      "learning_rate": 2.7680538948048913e-07,
+      "loss": 97.76737670898437,
+      "step": 1840
+    },
+    {
+      "epoch": 0.6166666666666667,
+      "grad_norm": 82.0,
+      "learning_rate": 2.7269010284212136e-07,
+      "loss": 99.20833740234374,
+      "step": 1850
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 82.5,
+      "learning_rate": 2.685897565846484e-07,
+      "loss": 98.02843627929687,
+      "step": 1860
+    },
+    {
+      "epoch": 0.6233333333333333,
+      "grad_norm": 84.0,
+      "learning_rate": 2.6450483190188343e-07,
+      "loss": 97.90446166992187,
+      "step": 1870
+    },
+    {
+      "epoch": 0.6266666666666667,
+      "grad_norm": 78.0,
+      "learning_rate": 2.604358081778498e-07,
+      "loss": 97.0069091796875,
+      "step": 1880
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 86.0,
+      "learning_rate": 2.5638316293052245e-07,
+      "loss": 98.136376953125,
+      "step": 1890
+    },
+    {
+      "epoch": 0.6333333333333333,
+      "grad_norm": 76.0,
+      "learning_rate": 2.523473717557898e-07,
+      "loss": 98.2837158203125,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6366666666666667,
+      "grad_norm": 83.5,
+      "learning_rate": 2.4832890827163993e-07,
+      "loss": 99.101025390625,
+      "step": 1910
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 85.0,
+      "learning_rate": 2.443282440625797e-07,
+      "loss": 98.89285278320312,
+      "step": 1920
+    },
+    {
+      "epoch": 0.6433333333333333,
+      "grad_norm": 75.5,
+      "learning_rate": 2.403458486242921e-07,
+      "loss": 97.999560546875,
+      "step": 1930
+    },
+    {
+      "epoch": 0.6466666666666666,
+      "grad_norm": 91.5,
+      "learning_rate": 2.3638218930853874e-07,
+      "loss": 96.57308959960938,
+      "step": 1940
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 88.5,
+      "learning_rate": 2.3243773126831448e-07,
+      "loss": 98.37883911132812,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6533333333333333,
+      "grad_norm": 87.0,
+      "learning_rate": 2.2851293740325895e-07,
+      "loss": 97.91729125976562,
+      "step": 1960
+    },
+    {
+      "epoch": 0.6566666666666666,
+      "grad_norm": 80.0,
+      "learning_rate": 2.2460826830533416e-07,
+      "loss": 99.19988403320312,
+      "step": 1970
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 79.0,
+      "learning_rate": 2.2072418220477083e-07,
+      "loss": 97.540087890625,
+      "step": 1980
+    },
+    {
+      "epoch": 0.6633333333333333,
+      "grad_norm": 77.5,
+      "learning_rate": 2.168611349162943e-07,
+      "loss": 98.80101318359375,
+      "step": 1990
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 83.0,
+      "learning_rate": 2.1301957978563152e-07,
+      "loss": 98.22483520507812,
+      "step": 2000
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 81.0,
+      "learning_rate": 2.0919996763630974e-07,
+      "loss": 98.53840942382813,
+      "step": 2010
+    },
+    {
+      "epoch": 0.6733333333333333,
+      "grad_norm": 83.5,
+      "learning_rate": 2.0540274671675008e-07,
+      "loss": 97.65671997070312,
+      "step": 2020
+    },
+    {
+      "epoch": 0.6766666666666666,
+      "grad_norm": 79.0,
+      "learning_rate": 2.0162836264766344e-07,
+      "loss": 97.53788452148437,
+      "step": 2030
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 77.5,
+      "learning_rate": 1.9787725836975495e-07,
+      "loss": 98.66753540039062,
+      "step": 2040
+    },
+    {
+      "epoch": 0.6833333333333333,
+      "grad_norm": 78.5,
+      "learning_rate": 1.9414987409174327e-07,
+      "loss": 98.99920043945312,
+      "step": 2050
+    },
+    {
+      "epoch": 0.6866666666666666,
+      "grad_norm": 76.5,
+      "learning_rate": 1.9044664723869982e-07,
+      "loss": 98.08507080078125,
+      "step": 2060
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 83.5,
+      "learning_rate": 1.867680124007152e-07,
+      "loss": 98.79881591796875,
+      "step": 2070
+    },
+    {
+      "epoch": 0.6933333333333334,
+      "grad_norm": 77.0,
+      "learning_rate": 1.8311440128189757e-07,
+      "loss": 98.93706665039062,
+      "step": 2080
+    },
+    {
+      "epoch": 0.6966666666666667,
+      "grad_norm": 82.0,
+      "learning_rate": 1.794862426497112e-07,
+      "loss": 98.5080810546875,
+      "step": 2090
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 82.0,
+      "learning_rate": 1.7588396228465795e-07,
+      "loss": 98.43944091796875,
+      "step": 2100
+    },
+    {
+      "epoch": 0.7033333333333334,
+      "grad_norm": 91.5,
+      "learning_rate": 1.723079829303106e-07,
+      "loss": 98.53809204101563,
+      "step": 2110
+    },
+    {
+      "epoch": 0.7066666666666667,
+      "grad_norm": 86.5,
+      "learning_rate": 1.6875872424370105e-07,
+      "loss": 97.679150390625,
+      "step": 2120
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 81.0,
+      "learning_rate": 1.6523660274607302e-07,
+      "loss": 98.63049926757813,
+      "step": 2130
+    },
+    {
+      "epoch": 0.7133333333333334,
+      "grad_norm": 80.5,
+      "learning_rate": 1.6174203177400068e-07,
+      "loss": 98.349755859375,
+      "step": 2140
+    },
+    {
+      "epoch": 0.7166666666666667,
+      "grad_norm": 77.5,
+      "learning_rate": 1.5827542143088156e-07,
+      "loss": 97.28834228515625,
+      "step": 2150
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 82.0,
+      "learning_rate": 1.548371785388095e-07,
+      "loss": 98.768017578125,
+      "step": 2160
+    },
+    {
+      "epoch": 0.7233333333333334,
+      "grad_norm": 79.0,
+      "learning_rate": 1.5142770659083234e-07,
+      "loss": 100.50790405273438,
+      "step": 2170
+    },
+    {
+      "epoch": 0.7266666666666667,
+      "grad_norm": 90.0,
+      "learning_rate": 1.4804740570360008e-07,
+      "loss": 96.94910278320313,
+      "step": 2180
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 81.0,
+      "learning_rate": 1.4469667257040942e-07,
+      "loss": 98.48642578125,
+      "step": 2190
+    },
+    {
+      "epoch": 0.7333333333333333,
+      "grad_norm": 77.0,
+      "learning_rate": 1.4137590041464967e-07,
+      "loss": 97.86531982421874,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7366666666666667,
+      "grad_norm": 82.5,
+      "learning_rate": 1.38085478943657e-07,
+      "loss": 98.24257202148438,
+      "step": 2210
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 84.5,
+      "learning_rate": 1.3482579430298002e-07,
+      "loss": 97.57532958984375,
+      "step": 2220
+    },
+    {
+      "epoch": 0.7433333333333333,
+      "grad_norm": 81.5,
+      "learning_rate": 1.315972290310641e-07,
+      "loss": 97.32590942382812,
+      "step": 2230
+    },
+    {
+      "epoch": 0.7466666666666667,
+      "grad_norm": 86.0,
+      "learning_rate": 1.2840016201435847e-07,
+      "loss": 98.21510620117188,
+      "step": 2240
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 74.0,
+      "learning_rate": 1.2523496844285263e-07,
+      "loss": 99.79307250976562,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7533333333333333,
+      "grad_norm": 83.0,
+      "learning_rate": 1.2210201976604607e-07,
+      "loss": 98.51522216796874,
+      "step": 2260
+    },
+    {
+      "epoch": 0.7566666666666667,
+      "grad_norm": 79.0,
+      "learning_rate": 1.1900168364935676e-07,
+      "loss": 99.13048095703125,
+      "step": 2270
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 86.5,
+      "learning_rate": 1.1593432393097406e-07,
+      "loss": 98.39559936523438,
+      "step": 2280
+    },
+    {
+      "epoch": 0.7633333333333333,
+      "grad_norm": 78.0,
+      "learning_rate": 1.1290030057916103e-07,
+      "loss": 96.49028930664062,
+      "step": 2290
+    },
+    {
+      "epoch": 0.7666666666666667,
+      "grad_norm": 86.5,
+      "learning_rate": 1.098999696500102e-07,
+      "loss": 97.28034057617188,
+      "step": 2300
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 78.5,
+      "learning_rate": 1.0693368324565888e-07,
+      "loss": 98.45610961914062,
+      "step": 2310
+    },
+    {
+      "epoch": 0.7733333333333333,
+      "grad_norm": 88.5,
+      "learning_rate": 1.0400178947296825e-07,
+      "loss": 99.782568359375,
+      "step": 2320
+    },
+    {
+      "epoch": 0.7766666666666666,
+      "grad_norm": 80.0,
+      "learning_rate": 1.0110463240267208e-07,
+      "loss": 97.72544555664062,
+      "step": 2330
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 79.0,
+      "learning_rate": 9.824255202899791e-08,
+      "loss": 98.3702880859375,
+      "step": 2340
+    },
+    {
+      "epoch": 0.7833333333333333,
+      "grad_norm": 83.0,
+      "learning_rate": 9.541588422976747e-08,
+      "loss": 100.18757934570313,
+      "step": 2350
+    },
+    {
+      "epoch": 0.7866666666666666,
+      "grad_norm": 84.5,
+      "learning_rate": 9.26249607269796e-08,
+      "loss": 101.06594848632812,
+      "step": 2360
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 72.0,
+      "learning_rate": 8.987010904788177e-08,
+      "loss": 98.80386962890626,
+      "step": 2370
+    },
+    {
+      "epoch": 0.7933333333333333,
+      "grad_norm": 89.0,
+      "learning_rate": 8.715165248653295e-08,
+      "loss": 98.323046875,
+      "step": 2380
+    },
+    {
+      "epoch": 0.7966666666666666,
+      "grad_norm": 83.5,
+      "learning_rate": 8.446991006586373e-08,
+      "loss": 97.82243041992187,
+      "step": 2390
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 77.0,
+      "learning_rate": 8.182519650023719e-08,
+      "loss": 98.9300048828125,
+      "step": 2400
+    },
+    {
+      "epoch": 0.8033333333333333,
+      "grad_norm": 85.5,
+      "learning_rate": 7.921782215851642e-08,
+      "loss": 97.94855346679688,
+      "step": 2410
+    },
+    {
+      "epoch": 0.8066666666666666,
+      "grad_norm": 82.0,
+      "learning_rate": 7.664809302764097e-08,
+      "loss": 98.41607055664062,
+      "step": 2420
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 81.5,
+      "learning_rate": 7.411631067671802e-08,
+      "loss": 97.95900268554688,
+      "step": 2430
+    },
+    {
+      "epoch": 0.8133333333333334,
+      "grad_norm": 83.5,
+      "learning_rate": 7.162277222163156e-08,
+      "loss": 99.18399047851562,
+      "step": 2440
+    },
+    {
+      "epoch": 0.8166666666666667,
+      "grad_norm": 98.0,
+      "learning_rate": 6.916777029017522e-08,
+      "loss": 98.34249267578124,
+      "step": 2450
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 81.0,
+      "learning_rate": 6.675159298771067e-08,
+      "loss": 99.41766967773438,
+      "step": 2460
+    },
+    {
+      "epoch": 0.8233333333333334,
+      "grad_norm": 87.5,
+      "learning_rate": 6.437452386335707e-08,
+      "loss": 98.87661743164062,
+      "step": 2470
+    },
+    {
+      "epoch": 0.8266666666666667,
+      "grad_norm": 79.5,
+      "learning_rate": 6.203684187671542e-08,
+      "loss": 99.74188232421875,
+      "step": 2480
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 85.0,
+      "learning_rate": 5.973882136513166e-08,
+      "loss": 98.54530029296875,
+      "step": 2490
+    },
+    {
+      "epoch": 0.8333333333333334,
+      "grad_norm": 82.5,
+      "learning_rate": 5.748073201150183e-08,
+      "loss": 99.4644287109375,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8366666666666667,
+      "grad_norm": 82.0,
+      "learning_rate": 5.5262838812623456e-08,
+      "loss": 98.99419555664062,
+      "step": 2510
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 83.5,
+      "learning_rate": 5.3085402048096904e-08,
+      "loss": 99.17667846679687,
+      "step": 2520
+    },
+    {
+      "epoch": 0.8433333333333334,
+      "grad_norm": 77.0,
+      "learning_rate": 5.0948677249780826e-08,
+      "loss": 97.988623046875,
+      "step": 2530
+    },
+    {
+      "epoch": 0.8466666666666667,
+      "grad_norm": 79.5,
+      "learning_rate": 4.8852915171804053e-08,
+      "loss": 98.18543090820313,
+      "step": 2540
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 80.0,
+      "learning_rate": 4.679836176113867e-08,
+      "loss": 98.3475830078125,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8533333333333334,
+      "grad_norm": 76.5,
+      "learning_rate": 4.478525812873668e-08,
+      "loss": 102.21969604492188,
+      "step": 2560
+    },
+    {
+      "epoch": 0.8566666666666667,
+      "grad_norm": 77.5,
+      "learning_rate": 4.281384052123509e-08,
+      "loss": 98.18867797851563,
+      "step": 2570
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 79.5,
+      "learning_rate": 4.0884340293230935e-08,
+      "loss": 98.02352294921874,
+      "step": 2580
+    },
+    {
+      "epoch": 0.8633333333333333,
+      "grad_norm": 80.5,
+      "learning_rate": 3.899698388013104e-08,
+      "loss": 98.23216552734375,
+      "step": 2590
+    },
+    {
+      "epoch": 0.8666666666666667,
+      "grad_norm": 85.5,
+      "learning_rate": 3.715199277157839e-08,
+      "loss": 98.76737060546876,
+      "step": 2600
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 76.0,
+      "learning_rate": 3.534958348545998e-08,
+      "loss": 96.958544921875,
+      "step": 2610
+    },
+    {
+      "epoch": 0.8733333333333333,
+      "grad_norm": 90.0,
+      "learning_rate": 3.358996754249701e-08,
+      "loss": 98.20695190429687,
+      "step": 2620
+    },
+    {
+      "epoch": 0.8766666666666667,
+      "grad_norm": 115.5,
+      "learning_rate": 3.1873351441422134e-08,
+      "loss": 96.37230834960937,
+      "step": 2630
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 77.0,
+      "learning_rate": 3.01999366347458e-08,
+      "loss": 98.812548828125,
+      "step": 2640
+    },
+    {
+      "epoch": 0.8833333333333333,
+      "grad_norm": 79.0,
+      "learning_rate": 2.8569919505115182e-08,
+      "loss": 97.3222900390625,
+      "step": 2650
+    },
+    {
+      "epoch": 0.8866666666666667,
+      "grad_norm": 82.5,
+      "learning_rate": 2.6983491342267563e-08,
+      "loss": 99.4928466796875,
+      "step": 2660
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 78.0,
+      "learning_rate": 2.544083832058179e-08,
+      "loss": 97.92261962890625,
+      "step": 2670
+    },
+    {
+      "epoch": 0.8933333333333333,
+      "grad_norm": 91.0,
+      "learning_rate": 2.3942141477229614e-08,
+      "loss": 98.01392211914063,
+      "step": 2680
+    },
+    {
+      "epoch": 0.8966666666666666,
+      "grad_norm": 81.5,
+      "learning_rate": 2.2487576690930532e-08,
+      "loss": 98.38418579101562,
+      "step": 2690
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 88.0,
+      "learning_rate": 2.107731466131142e-08,
+      "loss": 97.49611206054688,
+      "step": 2700
+    },
+    {
+      "epoch": 0.9033333333333333,
+      "grad_norm": 81.0,
+      "learning_rate": 1.9711520888874334e-08,
+      "loss": 97.51200561523437,
+      "step": 2710
+    },
+    {
+      "epoch": 0.9066666666666666,
+      "grad_norm": 76.5,
+      "learning_rate": 1.839035565557392e-08,
+      "loss": 98.69437255859376,
+      "step": 2720
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 76.5,
+      "learning_rate": 1.7113974006008136e-08,
+      "loss": 97.694970703125,
+      "step": 2730
+    },
+    {
+      "epoch": 0.9133333333333333,
+      "grad_norm": 78.5,
+      "learning_rate": 1.5882525729222772e-08,
+      "loss": 97.87529907226562,
+      "step": 2740
+    },
+    {
+      "epoch": 0.9166666666666666,
+      "grad_norm": 94.0,
+      "learning_rate": 1.4696155341133066e-08,
+      "loss": 98.2024169921875,
+      "step": 2750
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 80.0,
+      "learning_rate": 1.3555002067564103e-08,
+      "loss": 99.25071411132812,
+      "step": 2760
+    },
+    {
+      "epoch": 0.9233333333333333,
+      "grad_norm": 67.0,
+      "learning_rate": 1.2459199827912171e-08,
+      "loss": 97.6507568359375,
+      "step": 2770
+    },
+    {
+      "epoch": 0.9266666666666666,
+      "grad_norm": 84.0,
+      "learning_rate": 1.1408877219428736e-08,
+      "loss": 98.1393798828125,
+      "step": 2780
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 79.5,
+      "learning_rate": 1.040415750212862e-08,
+      "loss": 98.26876220703124,
+      "step": 2790
+    },
+    {
+      "epoch": 0.9333333333333333,
+      "grad_norm": 80.5,
+      "learning_rate": 9.445158584325509e-09,
+      "loss": 97.49425048828125,
+      "step": 2800
+    },
+    {
+      "epoch": 0.9366666666666666,
+      "grad_norm": 77.0,
+      "learning_rate": 8.531993008794281e-09,
+      "loss": 97.94264526367188,
+      "step": 2810
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 95.0,
+      "learning_rate": 7.664767939564009e-09,
+      "loss": 99.15473022460938,
+      "step": 2820
+    },
+    {
+      "epoch": 0.9433333333333334,
+      "grad_norm": 82.5,
+      "learning_rate": 6.843585149341757e-09,
+      "loss": 98.96663818359374,
+      "step": 2830
+    },
+    {
+      "epoch": 0.9466666666666667,
+      "grad_norm": 86.5,
+      "learning_rate": 6.068541007568706e-09,
+      "loss": 99.20695190429687,
+      "step": 2840
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 81.0,
+      "learning_rate": 5.339726469111427e-09,
+      "loss": 97.62832641601562,
+      "step": 2850
+    },
+    {
+      "epoch": 0.9533333333333334,
+      "grad_norm": 83.5,
+      "learning_rate": 4.6572270635873105e-09,
+      "loss": 99.68925170898437,
+      "step": 2860
+    },
+    {
+      "epoch": 0.9566666666666667,
+      "grad_norm": 90.0,
+      "learning_rate": 4.021122885327744e-09,
+      "loss": 97.71112670898438,
+      "step": 2870
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 77.5,
+      "learning_rate": 3.4314885839782594e-09,
+      "loss": 97.45633544921876,
+      "step": 2880
+    },
+    {
+      "epoch": 0.9633333333333334,
+      "grad_norm": 77.5,
+      "learning_rate": 2.8883933557385163e-09,
+      "loss": 98.58109130859376,
+      "step": 2890
+    },
+    {
+      "epoch": 0.9666666666666667,
+      "grad_norm": 80.0,
+      "learning_rate": 2.3919009352414645e-09,
+      "loss": 97.45646362304687,
+      "step": 2900
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 77.0,
+      "learning_rate": 1.942069588074036e-09,
+      "loss": 98.3052734375,
+      "step": 2910
+    },
+    {
+      "epoch": 0.9733333333333334,
+      "grad_norm": 98.5,
+      "learning_rate": 1.5389521039392394e-09,
+      "loss": 99.1884765625,
+      "step": 2920
+    },
+    {
+      "epoch": 0.9766666666666667,
+      "grad_norm": 86.0,
+      "learning_rate": 1.1825957904611605e-09,
+      "loss": 99.708837890625,
+      "step": 2930
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 81.5,
+      "learning_rate": 8.730424676331782e-10,
+      "loss": 98.9242431640625,
+      "step": 2940
+    },
+    {
+      "epoch": 0.9833333333333333,
+      "grad_norm": 82.0,
+      "learning_rate": 6.103284629102479e-10,
+      "loss": 98.93572998046875,
+      "step": 2950
+    },
+    {
+      "epoch": 0.9866666666666667,
+      "grad_norm": 78.0,
+      "learning_rate": 3.9448460694564425e-10,
+      "loss": 98.48370971679688,
+      "step": 2960
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 80.5,
+      "learning_rate": 2.255362299727892e-10,
+      "loss": 98.5514892578125,
+      "step": 2970
+    },
+    {
+      "epoch": 0.9933333333333333,
+      "grad_norm": 82.0,
+      "learning_rate": 1.0350315883291827e-10,
+      "loss": 98.85977172851562,
+      "step": 2980
+    },
+    {
+      "epoch": 0.9966666666666667,
+      "grad_norm": 87.0,
+      "learning_rate": 2.839971464791979e-11,
+      "loss": 97.87850952148438,
+      "step": 2990
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 81.0,
+      "learning_rate": 2.3471113999029566e-13,
+      "loss": 97.35574340820312,
+      "step": 3000
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 3000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 9223372036854775807,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.2371468746752e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-3000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:50806f9bca90fe0b02ef16ab9659f09b9a467f1eb053112098cedbe0578b9473
+size 5137

config.json CHANGED Viewed

@@ -1,19 +1,4 @@
 {
-  "vocab_size": 8192,
-  "block_size": 1024,
-  "n_embd": 1024,
-  "n_reservoir_layers": 4,
-  "n_attn_layers": 4,
-  "n_head": 16,
-  "mlp_mult": 4,
-  "dropout": 0.0,
-  "leak_rate": 0.25,
-  "reservoir_scale": 0.9,
-  "input_scale": 0.2,
-  "pad_token_id": 0,
-  "bos_token_id": 2,
-  "eos_token_id": 3,
-  "model_type": "minimythos_hybrid",
   "architectures": [
     "MiniMythosHybridForCausalLM"
   ],
@@ -21,6 +6,28 @@
     "AutoConfig": "modeling_minimythos_hybrid.MiniMythosHybridConfig",
     "AutoModelForCausalLM": "modeling_minimythos_hybrid.MiniMythosHybridForCausalLM"
   },
-  "torch_dtype": "bfloat16",
-  "transformers_version": "custom"
-}

 {
   "architectures": [
     "MiniMythosHybridForCausalLM"
   ],
     "AutoConfig": "modeling_minimythos_hybrid.MiniMythosHybridConfig",
     "AutoModelForCausalLM": "modeling_minimythos_hybrid.MiniMythosHybridForCausalLM"
   },
+  "block_size": 1024,
+  "bos_token_id": 2,
+  "dropout": 0.0,
+  "dtype": "bfloat16",
+  "eos_token_id": 3,
+  "hidden_size": 1024,
+  "input_scale": 0.2,
+  "is_decoder": true,
+  "leak_rate": 0.25,
+  "max_position_embeddings": 1024,
+  "mlp_mult": 4,
+  "model_type": "minimythos_hybrid",
+  "n_attn_layers": 4,
+  "n_embd": 1024,
+  "n_head": 16,
+  "n_reservoir_layers": 4,
+  "num_attention_heads": 16,
+  "num_hidden_layers": 8,
+  "pad_token_id": 0,
+  "reservoir_scale": 0.9,
+  "tie_word_embeddings": false,
+  "transformers_version": "5.0.0",
+  "use_cache": false,
+  "vocab_size": 8192
+}

generation_config.json CHANGED Viewed

@@ -1,9 +1,10 @@
 {
   "bos_token_id": 2,
   "eos_token_id": 3,
-  "pad_token_id": 0,
   "max_new_tokens": 256,
   "temperature": 0.7,
   "top_k": 50,
-  "do_sample": true
-}

 {
   "bos_token_id": 2,
+  "do_sample": true,
   "eos_token_id": 3,
   "max_new_tokens": 256,
+  "pad_token_id": 0,
   "temperature": 0.7,
   "top_k": 50,
+  "transformers_version": "5.0.0"
+}

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:124a7177a82bcfebb3bf1b75444818d200aafa952f44c16fcd4ac2191d7367b7
-size 369156392

 version https://git-lfs.github.com/spec/v1
+oid sha256:2431c0e0e022b86e142a82f790295cb3ae6411acdb908a6b402a20e25ed1f09f
+size 184580440

tokenizer_config.json CHANGED Viewed

@@ -2,6 +2,7 @@
   "backend": "tokenizers",
   "bos_token": "<bos>",
   "eos_token": "<eos>",
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "<pad>",
   "tokenizer_class": "TokenizersBackend",

   "backend": "tokenizers",
   "bos_token": "<bos>",
   "eos_token": "<eos>",
+  "is_local": false,
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "<pad>",
   "tokenizer_class": "TokenizersBackend",

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:50806f9bca90fe0b02ef16ab9659f09b9a467f1eb053112098cedbe0578b9473
+size 5137