ToastyPigeon committed on Aug 21, 2025

Commit

f690e14

verified ·

1 Parent(s): b3d501e

Upload folder using huggingface_hub

Browse files

Files changed (27) hide show

.gitattributes +2 -0
README.md +304 -0
added_tokens.json +3 -0
chat_template.jinja +50 -0
checkpoint-296/added_tokens.json +3 -0
checkpoint-296/chat_template.jinja +50 -0
checkpoint-296/config.json +57 -0
checkpoint-296/generation_config.json +14 -0
checkpoint-296/model.safetensors +3 -0
checkpoint-296/optimizer.pt +3 -0
checkpoint-296/rng_state_0.pth +3 -0
checkpoint-296/rng_state_1.pth +3 -0
checkpoint-296/scheduler.pt +3 -0
checkpoint-296/special_tokens_map.json +33 -0
checkpoint-296/tokenizer.json +3 -0
checkpoint-296/tokenizer.model +3 -0
checkpoint-296/tokenizer_config.json +0 -0
checkpoint-296/trainer_state.json +2266 -0
checkpoint-296/training_args.bin +3 -0
config.json +57 -0
generation_config.json +14 -0
model.safetensors +3 -0
special_tokens_map.json +33 -0
tokenizer.json +3 -0
tokenizer.model +3 -0
tokenizer_config.json +0 -0
training_args.bin +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoint-296/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,304 @@

+---
+library_name: transformers
+license: gemma
+base_model: unsloth/gemma-3-270m-it
+tags:
+- axolotl
+- generated_from_trainer
+datasets:
+- allura-org/EU01-S2
+- allenai/tulu-3-sft-personas-instruction-following
+- ToastyPigeon/mixed-medical-reasoning-formatted
+- ToastyPigeon/steve-and-marvin
+- ToastyPigeon/kimi-stories-instruct
+- ToastyPigeon/new-story-dataset
+- allura-org/fujin-instruct-v2
+- ToastyPigeon/gutenberg-sft
+- ToastyPigeon/SpringDragon
+- ToastyPigeon/some-erotica
+model-index:
+- name: micro-glitter
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
+<details><summary>See axolotl config</summary>
+axolotl version: `0.11.0.dev0`
+```yaml
+# === Model Configuration ===
+base_model: unsloth/gemma-3-270m-it
+load_in_8bit: false
+load_in_4bit: false
+# === HF Configuration ===
+hub_model_id: allura-forge/micro-glitter
+hub_strategy: "checkpoint"
+output_dir: /workspace/aibox-standalone-pool/axolotl/lilglitter-ckpts
+# === Training Setup ===
+num_epochs: 2
+micro_batch_size: 4
+gradient_accumulation_steps: 8
+sequence_len: 8192
+sample_packing: true
+pad_to_sequence_len: true
+#max_steps: 10
+# === Evaluation ===
+val_set_size: 0.05
+evals_per_epoch: 10
+#eval_steps: 20
+#max_steps: 60
+#eval_table_size:
+eval_max_new_tokens: 128
+eval_sample_packing: true
+#eval_strategy: "no"
+# === LoRA Configuration ===
+#adapter: qlora
+#lora_model_dir:
+#lora_r: 128
+#lora_alpha: 16
+#lora_dropout: 0.25
+#lora_target_linear: true
+#lora_target_modules:
+#  - embed_tokens
+#  - lm_head
+lora_fan_in_fan_out:
+lora_target_modules:
+#peft_use_rslora: true
+lora_modules_to_save:
+#  - embed_tokens
+#  - lm_head
+#fix_untrained_tokens: true
+#lora_mlp_kernel: true
+#lora_qkv_kernel: true
+#lora_o_kernel: true
+# === Hyperparameter Configuration ===
+#optimizer: apollo_adamw_layerwise
+warmup_steps: 0
+optimizer: adamw_torch_fused
+#optimizer: paged_adamw_8bit
+#optim_args:
+#  enable_stochastic_rounding: true
+#  enable_cautious: true
+#  enable_8bit: true
+# Apollo-mini configuration:
+#optim_args: "proj=random,rank=128,scale=128.0,scale_type=tensor,update_proj_gap=100"
+# Regular Apollo configuration:
+# optim_args:
+#optim_target_modules: all_linear
+learning_rate: 1e-5
+lr_scheduler: cosine
+#cosine_min_lr_ratio: 0.2
+#lr_scheduler: cosine_with_min_lr
+#lr_scheduler_kwargs:
+#  cosine_min_lr: 1e-6
+weight_decay: 0.01
+max_grad_norm: 2.0
+#warmup_steps: 0
+#warmup_ratio: 0.025
+# === Data Configuration ===
+#
+#chat_template: jinja
+#chat_template_jinja: "{% for message in messages %}{% if not loop.first %}{{' \n\n' }}{% endif %}{% if message['role'] == 'system' %}{{ '### System:\n' + message['content'].strip() }}{% elif message['role'] == 'user' %}{{ '### Instruction:\n' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ '### Response:\n' + message['content'].strip() + eos_token }}{% endif %}{% endfor %}"
+#chat_template_jinja: "{%- set default_system_message = \"You are Mistral Small 3, a Large Language Model (LLM) created by Mistral AI, a French startup headquartered in Paris. You obediently fulfill the user's requests.\" %}\n\n{{- bos_token }}\n\n{%- if messages[0]['role'] == 'system' %}\n    {%- if messages[0]['content'] is string %}\n        {%- set system_message = messages[0]['content'] %}\n    {%- else %}\n        {%- set system_message = messages[0]['content'][0]['text'] %}\n    {%- endif %}\n    {%- set loop_messages = messages[1:] %}\n{%- else %}\n    {%- set system_message = default_system_message %}\n    {%- set loop_messages = messages %}\n{%- endif %}\n{{- '[SYSTEM_PROMPT]' + system_message + '[/SYSTEM_PROMPT]' }}\n\n{%- for message in loop_messages %}\n    {%- if message['role'] == 'user' %}\n        {%- if message['content'] is string %}\n            {{- '[INST]' + message['content'] + '[/INST]' }}\n        {%- else %}\n            {{- '[INST]' }}\n            {%- for bl (line truncated to 1000 characters)
+#chat_template: chatml
+#special_tokens:
+#  eos_token: "<|im_end|>"
+#  eos_token: "</s>"
+#tokenizer_use_mistral_common: true
+shuffle_merged_datasets: true
+datasets:
+  - path: allura-org/EU01-S2
+    type: chat_template
+    field_messages: conversations
+    message_property_mappings:
+      role: from
+      content: value
+  - path: allenai/tulu-3-sft-personas-instruction-following
+    type: chat_template
+    split: train[:10%]
+  - path: ToastyPigeon/mixed-medical-reasoning-formatted
+    type: chat_template
+    data_files: mixed-medical-thinking.json
+    split: train[:10%]
+  - path: ToastyPigeon/steve-and-marvin
+    type: completion
+    data_files: marvin.json
+  - path: ToastyPigeon/kimi-stories-instruct
+    type: chat_template
+  - path: ToastyPigeon/new-story-dataset
+ #   type: customcompletion-regex
+    type: completion
+    data_files: new-story-dataset-v2.json
+  - path: allura-org/fujin-instruct-v2
+#    type: customchatml-regex
+    type: chat_template
+    field_messages: conversations
+    message_property_mappings:
+      role: from
+      content: value
+#  - path: ToastyPigeon/some-rp-extended
+ #   type: customchatml-regex
+#    type: chat_template
+#    field_messages: conversations
+#    message_property_mappings:
+#      role: from
+#      content: value
+#    roles_to_train: ["user","assistant"]
+  - path: ToastyPigeon/gutenberg-sft
+#    type: customchatml-regex
+    type: chat_template
+    field_messages: conversations
+    message_property_mappings:
+      role: from
+      content: value
+  - path: ToastyPigeon/SpringDragon
+#    type: customcompletion-regex
+    type: completion
+    split: train
+  - path: ToastyPigeon/some-erotica
+#    type: customcompletion-regex
+    type: completion
+    split: train[:10%]
+dataset_prepared_path: last_run_prepared
+# === Plugins ===
+plugins:
+  - axolotl.integrations.liger.LigerPlugin
+  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+# === Hardware Optimization ===
+#gradient_checkpointing: offload
+#gradient_checkpointing_kwargs:
+#  use_reentrant: false
+liger_rope: true
+liger_rms_norm: true
+liger_layer_norm: true
+liger_glu_activation: true
+#liger_fused_linear_cross_entropy: true
+cut_cross_entropy: true
+#deepspeed: /workspace/axolotl/deepspeed_configs/zero3_bf16.json
+# === FSDP Config ===
+#fsdp:
+#  - full_shard
+#  - auto_wrap
+#fsdp_config:
+#  fsdp_limit_all_gathers: true
+#  fsdp_sync_module_states: true
+#  fsdp_offload_params: true
+#  fsdp_activation_checkpointing: true
+#  fsdp_use_orig_params: false
+#  fsdp_cpu_ram_efficient_loading: true
+#  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+#  fsdp_transformer_layer_cls_to_wrap: MistralDecoderLayer
+#  fsdp_state_dict_type: FULL_STATE_DICT
+#  fsdp_sharding_strategy: FULL_SHARD
+#  fsdp_version: 2
+# === Wandb Tracking ===
+wandb_project: TinyGemma
+# wandb_entity: [WANDB_ENTITY]
+# wandb_name: [WANDB_RUN_NAME]
+# === Checkpointing ===
+#save_steps: 10
+saves_per_epoch: 10
+save_total_limit: 1
+# === Advanced Settings ===
+bf16: auto
+flash_attention: true
+train_on_inputs: false
+group_by_length: false
+save_safetensors: true
+logging_steps: 1
+gc_steps: 10
+seed: 69
+```
+</details><br>
+# micro-glitter
+This model is a fine-tuned version of [unsloth/gemma-3-270m-it](https://huggingface.co/unsloth/gemma-3-270m-it) on the following datasets: allura-org/EU01-S2, allenai/tulu-3-sft-personas-instruction-following, ToastyPigeon/mixed-medical-reasoning-formatted, ToastyPigeon/steve-and-marvin, ToastyPigeon/kimi-stories-instruct, ToastyPigeon/new-story-dataset, allura-org/fujin-instruct-v2, ToastyPigeon/gutenberg-sft, ToastyPigeon/SpringDragon, and ToastyPigeon/some-erotica.
+It achieves the following results on the evaluation set:
+- Loss: 3.7387
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 1e-05
+- train_batch_size: 4
+- eval_batch_size: 4
+- seed: 69
+- distributed_type: multi-GPU
+- num_devices: 2
+- gradient_accumulation_steps: 8
+- total_train_batch_size: 64
+- total_eval_batch_size: 8
+- optimizer: OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.999) and epsilon=1e-08 (no additional optimizer arguments)
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_steps: 8
+- training_steps: 296
+### Training results
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| No log        | 0      | 0    | 3.8582          |
+| 3.4802        | 0.1008 | 15   | 3.5118          |
+| 3.4608        | 0.2017 | 30   | 3.4890          |
+| 3.5272        | 0.3025 | 45   | 3.5189          |
+| 3.559         | 0.4034 | 60   | 3.5753          |
+| 3.5817        | 0.5042 | 75   | 3.6121          |
+| 3.6349        | 0.6050 | 90   | 3.6471          |
+| 3.68          | 0.7059 | 105  | 3.6721          |
+| 3.6597        | 0.8067 | 120  | 3.6970          |
+| 3.6462        | 0.9076 | 135  | 3.7068          |
+| 3.7009        | 1.0067 | 150  | 3.7213          |
+| 3.6717        | 1.1076 | 165  | 3.7313          |
+| 3.7631        | 1.2084 | 180  | 3.7338          |
+| 3.7535        | 1.3092 | 195  | 3.7346          |
+| 3.668         | 1.4101 | 210  | 3.7375          |
+| 3.679         | 1.5109 | 225  | 3.7383          |
+| 3.6539        | 1.6118 | 240  | 3.7386          |
+| 3.6547        | 1.7126 | 255  | 3.7386          |
+| 3.7533        | 1.8134 | 270  | 3.7400          |
+| 3.6983        | 1.9143 | 285  | 3.7387          |
+### Framework versions
+- Transformers 4.52.4
+- Pytorch 2.7.0+cu126
+- Datasets 3.6.0
+- Tokenizers 0.21.1

added_tokens.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "<image_soft_token>": 262144
+}

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,50 @@

+{# Unsloth Chat template fixes #}
+{{ bos_token }}
+{%- if messages[0]['role'] == 'system' -%}
+    {%- if messages[0]['content'] is string -%}
+        {%- set first_user_prefix = messages[0]['content'] + '
+' -%}
+    {%- else -%}
+        {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
+' -%}
+    {%- endif -%}
+    {%- set loop_messages = messages[1:] -%}
+{%- else -%}
+    {%- set first_user_prefix = "" -%}
+    {%- set loop_messages = messages -%}
+{%- endif -%}
+{%- for message in loop_messages -%}
+    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
+        {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
+    {%- endif -%}
+    {%- if (message['role'] == 'assistant') -%}
+        {%- set role = "model" -%}
+    {%- else -%}
+        {%- set role = message['role'] -%}
+    {%- endif -%}
+    {{ '<start_of_turn>' + role + '
+' + (first_user_prefix if loop.first else "") }}
+    {%- if message['content'] is string -%}
+        {{ message['content'] | trim }}
+    {%- elif message['content'] is iterable -%}
+        {%- for item in message['content'] -%}
+            {%- if item['type'] == 'image' -%}
+                {{ '<start_of_image>' }}
+            {%- elif item['type'] == 'text' -%}
+                {{ item['text'] | trim }}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- elif message['content'] is defined -%}
+        {{ raise_exception("Invalid content type") }}
+    {%- endif -%}
+    {{ '<end_of_turn>
+' }}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+    {{'<start_of_turn>model
+'}}
+{%- endif -%}
+{# Copyright 2025-present Unsloth. Apache 2.0 License. #}

checkpoint-296/added_tokens.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "<image_soft_token>": 262144
+}

checkpoint-296/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,50 @@

+{# Unsloth Chat template fixes #}
+{{ bos_token }}
+{%- if messages[0]['role'] == 'system' -%}
+    {%- if messages[0]['content'] is string -%}
+        {%- set first_user_prefix = messages[0]['content'] + '
+' -%}
+    {%- else -%}
+        {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
+' -%}
+    {%- endif -%}
+    {%- set loop_messages = messages[1:] -%}
+{%- else -%}
+    {%- set first_user_prefix = "" -%}
+    {%- set loop_messages = messages -%}
+{%- endif -%}
+{%- for message in loop_messages -%}
+    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
+        {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
+    {%- endif -%}
+    {%- if (message['role'] == 'assistant') -%}
+        {%- set role = "model" -%}
+    {%- else -%}
+        {%- set role = message['role'] -%}
+    {%- endif -%}
+    {{ '<start_of_turn>' + role + '
+' + (first_user_prefix if loop.first else "") }}
+    {%- if message['content'] is string -%}
+        {{ message['content'] | trim }}
+    {%- elif message['content'] is iterable -%}
+        {%- for item in message['content'] -%}
+            {%- if item['type'] == 'image' -%}
+                {{ '<start_of_image>' }}
+            {%- elif item['type'] == 'text' -%}
+                {{ item['text'] | trim }}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- elif message['content'] is defined -%}
+        {{ raise_exception("Invalid content type") }}
+    {%- endif -%}
+    {{ '<end_of_turn>
+' }}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+    {{'<start_of_turn>model
+'}}
+{%- endif -%}
+{# Copyright 2025-present Unsloth. Apache 2.0 License. #}

checkpoint-296/config.json ADDED Viewed

	@@ -0,0 +1,57 @@

+{
+  "_sliding_window_pattern": 6,
+  "architectures": [
+    "Gemma3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "attn_logit_softcapping": null,
+  "bos_token_id": 2,
+  "cache_implementation": "hybrid",
+  "eos_token_id": 106,
+  "final_logit_softcapping": null,
+  "head_dim": 256,
+  "hidden_activation": "gelu_pytorch_tanh",
+  "hidden_size": 640,
+  "initializer_range": 0.02,
+  "intermediate_size": 2048,
+  "layer_types": [
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "model_type": "gemma3_text",
+  "num_attention_heads": 4,
+  "num_hidden_layers": 18,
+  "num_key_value_heads": 1,
+  "pad_token_id": 0,
+  "query_pre_attn_scalar": 256,
+  "rms_norm_eps": 1e-06,
+  "rope_local_base_freq": 10000.0,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": 512,
+  "sliding_window_pattern": 6,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.4",
+  "unsloth_fixed": true,
+  "use_bidirectional_attention": false,
+  "use_cache": false,
+  "vocab_size": 262145
+}

checkpoint-296/generation_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "bos_token_id": 2,
+  "cache_implementation": "hybrid",
+  "do_sample": true,
+  "eos_token_id": [
+    1,
+    106
+  ],
+  "max_length": 32768,
+  "pad_token_id": 0,
+  "top_k": 64,
+  "top_p": 0.95,
+  "transformers_version": "4.52.4"
+}

checkpoint-296/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:df5b41c0ad86b13aa2e31015b2fa37db700a177ccac2c390d5dcda7424957730
+size 536224336

checkpoint-296/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ebf4004db03c78b58b0b334aca03fdfd686ab7f51db4b34c1b657b78232f8e77
+size 1072597003

checkpoint-296/rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c164bf4d748b9b76346b397d4015086abb96fbaeccd6d42b2a9500c89e203c8a
+size 14917

checkpoint-296/rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1f682fc8318583d0f9c08b0ee675a29136825aaf92ff2affc73ad3431c6ba2d9
+size 14917

checkpoint-296/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eec9abeec835bcf7b7c24805795928bcbb86301ddca5cb247cf8428cab058bd2
+size 1465

checkpoint-296/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "boi_token": "<start_of_image>",
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eoi_token": "<end_of_image>",
+  "eos_token": {
+    "content": "<end_of_turn>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "image_token": "<image_soft_token>",
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-296/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
+size 33384568

checkpoint-296/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
+size 4689074

checkpoint-296/tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-296/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2266 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.988235294117647,
+  "eval_steps": 15,
+  "global_step": 296,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0,
+      "eval_loss": 3.8581998348236084,
+      "eval_runtime": 30.2305,
+      "eval_samples_per_second": 41.647,
+      "eval_steps_per_second": 5.227,
+      "step": 0
+    },
+    {
+      "epoch": 0.0067226890756302525,
+      "grad_norm": 49.25,
+      "learning_rate": 0.0,
+      "loss": 3.8278,
+      "step": 1
+    },
+    {
+      "epoch": 0.013445378151260505,
+      "grad_norm": 48.0,
+      "learning_rate": 1.25e-06,
+      "loss": 3.877,
+      "step": 2
+    },
+    {
+      "epoch": 0.020168067226890758,
+      "grad_norm": 49.5,
+      "learning_rate": 2.5e-06,
+      "loss": 3.8607,
+      "step": 3
+    },
+    {
+      "epoch": 0.02689075630252101,
+      "grad_norm": 52.0,
+      "learning_rate": 3.7500000000000005e-06,
+      "loss": 3.7408,
+      "step": 4
+    },
+    {
+      "epoch": 0.03361344537815126,
+      "grad_norm": 43.25,
+      "learning_rate": 5e-06,
+      "loss": 3.7054,
+      "step": 5
+    },
+    {
+      "epoch": 0.040336134453781515,
+      "grad_norm": 44.75,
+      "learning_rate": 6.25e-06,
+      "loss": 3.7652,
+      "step": 6
+    },
+    {
+      "epoch": 0.047058823529411764,
+      "grad_norm": 36.75,
+      "learning_rate": 7.500000000000001e-06,
+      "loss": 3.7236,
+      "step": 7
+    },
+    {
+      "epoch": 0.05378151260504202,
+      "grad_norm": 35.75,
+      "learning_rate": 8.750000000000001e-06,
+      "loss": 3.7419,
+      "step": 8
+    },
+    {
+      "epoch": 0.06050420168067227,
+      "grad_norm": 21.875,
+      "learning_rate": 1e-05,
+      "loss": 3.6896,
+      "step": 9
+    },
+    {
+      "epoch": 0.06722689075630252,
+      "grad_norm": 17.5,
+      "learning_rate": 9.99970252500075e-06,
+      "loss": 3.5791,
+      "step": 10
+    },
+    {
+      "epoch": 0.07394957983193277,
+      "grad_norm": 15.375,
+      "learning_rate": 9.998810135399545e-06,
+      "loss": 3.5491,
+      "step": 11
+    },
+    {
+      "epoch": 0.08067226890756303,
+      "grad_norm": 12.4375,
+      "learning_rate": 9.997322937381829e-06,
+      "loss": 3.5476,
+      "step": 12
+    },
+    {
+      "epoch": 0.08739495798319327,
+      "grad_norm": 11.8125,
+      "learning_rate": 9.99524110790929e-06,
+      "loss": 3.5258,
+      "step": 13
+    },
+    {
+      "epoch": 0.09411764705882353,
+      "grad_norm": 10.25,
+      "learning_rate": 9.992564894698816e-06,
+      "loss": 3.5072,
+      "step": 14
+    },
+    {
+      "epoch": 0.10084033613445378,
+      "grad_norm": 9.375,
+      "learning_rate": 9.989294616193018e-06,
+      "loss": 3.4802,
+      "step": 15
+    },
+    {
+      "epoch": 0.10084033613445378,
+      "eval_loss": 3.5118489265441895,
+      "eval_runtime": 30.2065,
+      "eval_samples_per_second": 41.68,
+      "eval_steps_per_second": 5.231,
+      "step": 15
+    },
+    {
+      "epoch": 0.10756302521008404,
+      "grad_norm": 7.8125,
+      "learning_rate": 9.985430661522333e-06,
+      "loss": 3.4675,
+      "step": 16
+    },
+    {
+      "epoch": 0.11428571428571428,
+      "grad_norm": 7.5,
+      "learning_rate": 9.980973490458728e-06,
+      "loss": 3.4914,
+      "step": 17
+    },
+    {
+      "epoch": 0.12100840336134454,
+      "grad_norm": 7.65625,
+      "learning_rate": 9.975923633360985e-06,
+      "loss": 3.5515,
+      "step": 18
+    },
+    {
+      "epoch": 0.12773109243697478,
+      "grad_norm": 7.25,
+      "learning_rate": 9.970281691111598e-06,
+      "loss": 3.4299,
+      "step": 19
+    },
+    {
+      "epoch": 0.13445378151260504,
+      "grad_norm": 7.5625,
+      "learning_rate": 9.964048335045276e-06,
+      "loss": 3.4942,
+      "step": 20
+    },
+    {
+      "epoch": 0.1411764705882353,
+      "grad_norm": 6.5625,
+      "learning_rate": 9.957224306869053e-06,
+      "loss": 3.4355,
+      "step": 21
+    },
+    {
+      "epoch": 0.14789915966386555,
+      "grad_norm": 6.53125,
+      "learning_rate": 9.94981041857404e-06,
+      "loss": 3.4162,
+      "step": 22
+    },
+    {
+      "epoch": 0.1546218487394958,
+      "grad_norm": 6.25,
+      "learning_rate": 9.941807552338805e-06,
+      "loss": 3.4885,
+      "step": 23
+    },
+    {
+      "epoch": 0.16134453781512606,
+      "grad_norm": 6.03125,
+      "learning_rate": 9.933216660424396e-06,
+      "loss": 3.5423,
+      "step": 24
+    },
+    {
+      "epoch": 0.16806722689075632,
+      "grad_norm": 5.84375,
+      "learning_rate": 9.924038765061042e-06,
+      "loss": 3.4046,
+      "step": 25
+    },
+    {
+      "epoch": 0.17478991596638654,
+      "grad_norm": 6.1875,
+      "learning_rate": 9.914274958326507e-06,
+      "loss": 3.3982,
+      "step": 26
+    },
+    {
+      "epoch": 0.1815126050420168,
+      "grad_norm": 5.6875,
+      "learning_rate": 9.903926402016153e-06,
+      "loss": 3.4689,
+      "step": 27
+    },
+    {
+      "epoch": 0.18823529411764706,
+      "grad_norm": 7.03125,
+      "learning_rate": 9.892994327504693e-06,
+      "loss": 3.5937,
+      "step": 28
+    },
+    {
+      "epoch": 0.1949579831932773,
+      "grad_norm": 6.15625,
+      "learning_rate": 9.881480035599667e-06,
+      "loss": 3.4518,
+      "step": 29
+    },
+    {
+      "epoch": 0.20168067226890757,
+      "grad_norm": 5.9375,
+      "learning_rate": 9.869384896386669e-06,
+      "loss": 3.4608,
+      "step": 30
+    },
+    {
+      "epoch": 0.20168067226890757,
+      "eval_loss": 3.489025354385376,
+      "eval_runtime": 29.7531,
+      "eval_samples_per_second": 42.315,
+      "eval_steps_per_second": 5.31,
+      "step": 30
+    },
+    {
+      "epoch": 0.20840336134453782,
+      "grad_norm": 5.40625,
+      "learning_rate": 9.856710349066307e-06,
+      "loss": 3.5205,
+      "step": 31
+    },
+    {
+      "epoch": 0.21512605042016808,
+      "grad_norm": 5.0,
+      "learning_rate": 9.843457901782967e-06,
+      "loss": 3.4322,
+      "step": 32
+    },
+    {
+      "epoch": 0.2218487394957983,
+      "grad_norm": 5.875,
+      "learning_rate": 9.829629131445342e-06,
+      "loss": 3.4981,
+      "step": 33
+    },
+    {
+      "epoch": 0.22857142857142856,
+      "grad_norm": 5.875,
+      "learning_rate": 9.815225683538814e-06,
+      "loss": 3.3736,
+      "step": 34
+    },
+    {
+      "epoch": 0.23529411764705882,
+      "grad_norm": 5.1875,
+      "learning_rate": 9.800249271929645e-06,
+      "loss": 3.4398,
+      "step": 35
+    },
+    {
+      "epoch": 0.24201680672268908,
+      "grad_norm": 4.90625,
+      "learning_rate": 9.784701678661045e-06,
+      "loss": 3.4934,
+      "step": 36
+    },
+    {
+      "epoch": 0.24873949579831933,
+      "grad_norm": 4.84375,
+      "learning_rate": 9.768584753741134e-06,
+      "loss": 3.4506,
+      "step": 37
+    },
+    {
+      "epoch": 0.25546218487394956,
+      "grad_norm": 4.5,
+      "learning_rate": 9.751900414922807e-06,
+      "loss": 3.4764,
+      "step": 38
+    },
+    {
+      "epoch": 0.26218487394957984,
+      "grad_norm": 4.8125,
+      "learning_rate": 9.73465064747553e-06,
+      "loss": 3.547,
+      "step": 39
+    },
+    {
+      "epoch": 0.2689075630252101,
+      "grad_norm": 5.3125,
+      "learning_rate": 9.716837503949128e-06,
+      "loss": 3.4394,
+      "step": 40
+    },
+    {
+      "epoch": 0.27563025210084036,
+      "grad_norm": 4.6875,
+      "learning_rate": 9.698463103929542e-06,
+      "loss": 3.4722,
+      "step": 41
+    },
+    {
+      "epoch": 0.2823529411764706,
+      "grad_norm": 4.75,
+      "learning_rate": 9.67952963378663e-06,
+      "loss": 3.4639,
+      "step": 42
+    },
+    {
+      "epoch": 0.28907563025210087,
+      "grad_norm": 5.0,
+      "learning_rate": 9.660039346413994e-06,
+      "loss": 3.4936,
+      "step": 43
+    },
+    {
+      "epoch": 0.2957983193277311,
+      "grad_norm": 4.40625,
+      "learning_rate": 9.639994560960923e-06,
+      "loss": 3.5191,
+      "step": 44
+    },
+    {
+      "epoch": 0.3025210084033613,
+      "grad_norm": 4.0625,
+      "learning_rate": 9.619397662556434e-06,
+      "loss": 3.5272,
+      "step": 45
+    },
+    {
+      "epoch": 0.3025210084033613,
+      "eval_loss": 3.5188682079315186,
+      "eval_runtime": 30.2246,
+      "eval_samples_per_second": 41.655,
+      "eval_steps_per_second": 5.228,
+      "step": 45
+    },
+    {
+      "epoch": 0.3092436974789916,
+      "grad_norm": 4.59375,
+      "learning_rate": 9.598251102025463e-06,
+      "loss": 3.5391,
+      "step": 46
+    },
+    {
+      "epoch": 0.31596638655462184,
+      "grad_norm": 4.53125,
+      "learning_rate": 9.576557395597237e-06,
+      "loss": 3.4851,
+      "step": 47
+    },
+    {
+      "epoch": 0.3226890756302521,
+      "grad_norm": 5.125,
+      "learning_rate": 9.55431912460588e-06,
+      "loss": 3.5334,
+      "step": 48
+    },
+    {
+      "epoch": 0.32941176470588235,
+      "grad_norm": 4.625,
+      "learning_rate": 9.531538935183252e-06,
+      "loss": 3.4687,
+      "step": 49
+    },
+    {
+      "epoch": 0.33613445378151263,
+      "grad_norm": 5.53125,
+      "learning_rate": 9.50821953794408e-06,
+      "loss": 3.539,
+      "step": 50
+    },
+    {
+      "epoch": 0.34285714285714286,
+      "grad_norm": 4.75,
+      "learning_rate": 9.484363707663443e-06,
+      "loss": 3.5205,
+      "step": 51
+    },
+    {
+      "epoch": 0.3495798319327731,
+      "grad_norm": 4.84375,
+      "learning_rate": 9.459974282946572e-06,
+      "loss": 3.5856,
+      "step": 52
+    },
+    {
+      "epoch": 0.3563025210084034,
+      "grad_norm": 4.78125,
+      "learning_rate": 9.43505416589111e-06,
+      "loss": 3.5938,
+      "step": 53
+    },
+    {
+      "epoch": 0.3630252100840336,
+      "grad_norm": 5.0625,
+      "learning_rate": 9.409606321741776e-06,
+      "loss": 3.5446,
+      "step": 54
+    },
+    {
+      "epoch": 0.3697478991596639,
+      "grad_norm": 4.625,
+      "learning_rate": 9.38363377853754e-06,
+      "loss": 3.5746,
+      "step": 55
+    },
+    {
+      "epoch": 0.3764705882352941,
+      "grad_norm": 4.875,
+      "learning_rate": 9.357139626751308e-06,
+      "loss": 3.5536,
+      "step": 56
+    },
+    {
+      "epoch": 0.3831932773109244,
+      "grad_norm": 4.53125,
+      "learning_rate": 9.330127018922195e-06,
+      "loss": 3.4815,
+      "step": 57
+    },
+    {
+      "epoch": 0.3899159663865546,
+      "grad_norm": 4.5,
+      "learning_rate": 9.302599169280395e-06,
+      "loss": 3.5294,
+      "step": 58
+    },
+    {
+      "epoch": 0.39663865546218485,
+      "grad_norm": 3.96875,
+      "learning_rate": 9.274559353364734e-06,
+      "loss": 3.476,
+      "step": 59
+    },
+    {
+      "epoch": 0.40336134453781514,
+      "grad_norm": 5.09375,
+      "learning_rate": 9.246010907632894e-06,
+      "loss": 3.559,
+      "step": 60
+    },
+    {
+      "epoch": 0.40336134453781514,
+      "eval_loss": 3.575310230255127,
+      "eval_runtime": 29.7614,
+      "eval_samples_per_second": 42.303,
+      "eval_steps_per_second": 5.309,
+      "step": 60
+    },
+    {
+      "epoch": 0.41008403361344536,
+      "grad_norm": 4.4375,
+      "learning_rate": 9.21695722906443e-06,
+      "loss": 3.5118,
+      "step": 61
+    },
+    {
+      "epoch": 0.41680672268907565,
+      "grad_norm": 5.0625,
+      "learning_rate": 9.18740177475654e-06,
+      "loss": 3.5701,
+      "step": 62
+    },
+    {
+      "epoch": 0.4235294117647059,
+      "grad_norm": 4.5,
+      "learning_rate": 9.157348061512728e-06,
+      "loss": 3.5679,
+      "step": 63
+    },
+    {
+      "epoch": 0.43025210084033616,
+      "grad_norm": 4.0625,
+      "learning_rate": 9.126799665424319e-06,
+      "loss": 3.5001,
+      "step": 64
+    },
+    {
+      "epoch": 0.4369747899159664,
+      "grad_norm": 4.28125,
+      "learning_rate": 9.09576022144496e-06,
+      "loss": 3.559,
+      "step": 65
+    },
+    {
+      "epoch": 0.4436974789915966,
+      "grad_norm": 4.75,
+      "learning_rate": 9.064233422958078e-06,
+      "loss": 3.4816,
+      "step": 66
+    },
+    {
+      "epoch": 0.4504201680672269,
+      "grad_norm": 3.953125,
+      "learning_rate": 9.032223021337415e-06,
+      "loss": 3.6286,
+      "step": 67
+    },
+    {
+      "epoch": 0.45714285714285713,
+      "grad_norm": 4.5625,
+      "learning_rate": 8.999732825500649e-06,
+      "loss": 3.5596,
+      "step": 68
+    },
+    {
+      "epoch": 0.4638655462184874,
+      "grad_norm": 4.46875,
+      "learning_rate": 8.966766701456177e-06,
+      "loss": 3.5409,
+      "step": 69
+    },
+    {
+      "epoch": 0.47058823529411764,
+      "grad_norm": 6.9375,
+      "learning_rate": 8.933328571843086e-06,
+      "loss": 3.5449,
+      "step": 70
+    },
+    {
+      "epoch": 0.4773109243697479,
+      "grad_norm": 5.8125,
+      "learning_rate": 8.899422415464409e-06,
+      "loss": 3.6107,
+      "step": 71
+    },
+    {
+      "epoch": 0.48403361344537815,
+      "grad_norm": 5.21875,
+      "learning_rate": 8.865052266813686e-06,
+      "loss": 3.6243,
+      "step": 72
+    },
+    {
+      "epoch": 0.4907563025210084,
+      "grad_norm": 4.59375,
+      "learning_rate": 8.83022221559489e-06,
+      "loss": 3.6119,
+      "step": 73
+    },
+    {
+      "epoch": 0.49747899159663866,
+      "grad_norm": 5.0,
+      "learning_rate": 8.79493640623581e-06,
+      "loss": 3.563,
+      "step": 74
+    },
+    {
+      "epoch": 0.5042016806722689,
+      "grad_norm": 4.875,
+      "learning_rate": 8.759199037394888e-06,
+      "loss": 3.5817,
+      "step": 75
+    },
+    {
+      "epoch": 0.5042016806722689,
+      "eval_loss": 3.612149238586426,
+      "eval_runtime": 30.2292,
+      "eval_samples_per_second": 41.648,
+      "eval_steps_per_second": 5.227,
+      "step": 75
+    },
+    {
+      "epoch": 0.5109243697478991,
+      "grad_norm": 4.1875,
+      "learning_rate": 8.723014361461633e-06,
+      "loss": 3.5643,
+      "step": 76
+    },
+    {
+      "epoch": 0.5176470588235295,
+      "grad_norm": 4.40625,
+      "learning_rate": 8.68638668405062e-06,
+      "loss": 3.5424,
+      "step": 77
+    },
+    {
+      "epoch": 0.5243697478991597,
+      "grad_norm": 4.875,
+      "learning_rate": 8.649320363489178e-06,
+      "loss": 3.5679,
+      "step": 78
+    },
+    {
+      "epoch": 0.5310924369747899,
+      "grad_norm": 5.4375,
+      "learning_rate": 8.611819810298778e-06,
+      "loss": 3.5269,
+      "step": 79
+    },
+    {
+      "epoch": 0.5378151260504201,
+      "grad_norm": 4.46875,
+      "learning_rate": 8.573889486670233e-06,
+      "loss": 3.5913,
+      "step": 80
+    },
+    {
+      "epoch": 0.5445378151260504,
+      "grad_norm": 4.21875,
+      "learning_rate": 8.535533905932739e-06,
+      "loss": 3.7066,
+      "step": 81
+    },
+    {
+      "epoch": 0.5512605042016807,
+      "grad_norm": 4.40625,
+      "learning_rate": 8.496757632016836e-06,
+      "loss": 3.6143,
+      "step": 82
+    },
+    {
+      "epoch": 0.5579831932773109,
+      "grad_norm": 4.5,
+      "learning_rate": 8.457565278911349e-06,
+      "loss": 3.6007,
+      "step": 83
+    },
+    {
+      "epoch": 0.5647058823529412,
+      "grad_norm": 5.5,
+      "learning_rate": 8.417961510114357e-06,
+      "loss": 3.5805,
+      "step": 84
+    },
+    {
+      "epoch": 0.5714285714285714,
+      "grad_norm": 4.15625,
+      "learning_rate": 8.377951038078303e-06,
+      "loss": 3.5255,
+      "step": 85
+    },
+    {
+      "epoch": 0.5781512605042017,
+      "grad_norm": 4.21875,
+      "learning_rate": 8.337538623649237e-06,
+      "loss": 3.6272,
+      "step": 86
+    },
+    {
+      "epoch": 0.584873949579832,
+      "grad_norm": 4.40625,
+      "learning_rate": 8.296729075500345e-06,
+      "loss": 3.4642,
+      "step": 87
+    },
+    {
+      "epoch": 0.5915966386554622,
+      "grad_norm": 4.40625,
+      "learning_rate": 8.255527249559747e-06,
+      "loss": 3.6105,
+      "step": 88
+    },
+    {
+      "epoch": 0.5983193277310924,
+      "grad_norm": 5.34375,
+      "learning_rate": 8.213938048432697e-06,
+      "loss": 3.7054,
+      "step": 89
+    },
+    {
+      "epoch": 0.6050420168067226,
+      "grad_norm": 4.03125,
+      "learning_rate": 8.171966420818227e-06,
+      "loss": 3.6349,
+      "step": 90
+    },
+    {
+      "epoch": 0.6050420168067226,
+      "eval_loss": 3.647097110748291,
+      "eval_runtime": 29.7281,
+      "eval_samples_per_second": 42.351,
+      "eval_steps_per_second": 5.315,
+      "step": 90
+    },
+    {
+      "epoch": 0.611764705882353,
+      "grad_norm": 4.46875,
+      "learning_rate": 8.129617360920297e-06,
+      "loss": 3.5585,
+      "step": 91
+    },
+    {
+      "epoch": 0.6184873949579832,
+      "grad_norm": 4.09375,
+      "learning_rate": 8.086895907853526e-06,
+      "loss": 3.6065,
+      "step": 92
+    },
+    {
+      "epoch": 0.6252100840336134,
+      "grad_norm": 4.0625,
+      "learning_rate": 8.043807145043604e-06,
+      "loss": 3.5808,
+      "step": 93
+    },
+    {
+      "epoch": 0.6319327731092437,
+      "grad_norm": 4.25,
+      "learning_rate": 8.000356199622406e-06,
+      "loss": 3.6742,
+      "step": 94
+    },
+    {
+      "epoch": 0.6386554621848739,
+      "grad_norm": 3.75,
+      "learning_rate": 7.956548241817914e-06,
+      "loss": 3.609,
+      "step": 95
+    },
+    {
+      "epoch": 0.6453781512605042,
+      "grad_norm": 4.34375,
+      "learning_rate": 7.912388484339012e-06,
+      "loss": 3.5559,
+      "step": 96
+    },
+    {
+      "epoch": 0.6521008403361345,
+      "grad_norm": 4.59375,
+      "learning_rate": 7.86788218175523e-06,
+      "loss": 3.6504,
+      "step": 97
+    },
+    {
+      "epoch": 0.6588235294117647,
+      "grad_norm": 4.59375,
+      "learning_rate": 7.823034629871503e-06,
+      "loss": 3.5724,
+      "step": 98
+    },
+    {
+      "epoch": 0.6655462184873949,
+      "grad_norm": 5.25,
+      "learning_rate": 7.777851165098012e-06,
+      "loss": 3.6483,
+      "step": 99
+    },
+    {
+      "epoch": 0.6722689075630253,
+      "grad_norm": 5.40625,
+      "learning_rate": 7.732337163815218e-06,
+      "loss": 3.5782,
+      "step": 100
+    },
+    {
+      "epoch": 0.6789915966386555,
+      "grad_norm": 4.15625,
+      "learning_rate": 7.686498041734121e-06,
+      "loss": 3.5653,
+      "step": 101
+    },
+    {
+      "epoch": 0.6857142857142857,
+      "grad_norm": 4.125,
+      "learning_rate": 7.64033925325184e-06,
+      "loss": 3.6252,
+      "step": 102
+    },
+    {
+      "epoch": 0.692436974789916,
+      "grad_norm": 4.5625,
+      "learning_rate": 7.593866290802608e-06,
+      "loss": 3.7141,
+      "step": 103
+    },
+    {
+      "epoch": 0.6991596638655462,
+      "grad_norm": 4.3125,
+      "learning_rate": 7.54708468420421e-06,
+      "loss": 3.6884,
+      "step": 104
+    },
+    {
+      "epoch": 0.7058823529411765,
+      "grad_norm": 4.21875,
+      "learning_rate": 7.500000000000001e-06,
+      "loss": 3.68,
+      "step": 105
+    },
+    {
+      "epoch": 0.7058823529411765,
+      "eval_loss": 3.672091484069824,
+      "eval_runtime": 30.2768,
+      "eval_samples_per_second": 41.583,
+      "eval_steps_per_second": 5.219,
+      "step": 105
+    },
+    {
+      "epoch": 0.7126050420168067,
+      "grad_norm": 4.34375,
+      "learning_rate": 7.4526178407965396e-06,
+      "loss": 3.5934,
+      "step": 106
+    },
+    {
+      "epoch": 0.719327731092437,
+      "grad_norm": 3.765625,
+      "learning_rate": 7.404943844596939e-06,
+      "loss": 3.5845,
+      "step": 107
+    },
+    {
+      "epoch": 0.7260504201680672,
+      "grad_norm": 3.953125,
+      "learning_rate": 7.3569836841299905e-06,
+      "loss": 3.6421,
+      "step": 108
+    },
+    {
+      "epoch": 0.7327731092436974,
+      "grad_norm": 4.53125,
+      "learning_rate": 7.308743066175172e-06,
+      "loss": 3.6617,
+      "step": 109
+    },
+    {
+      "epoch": 0.7394957983193278,
+      "grad_norm": 4.71875,
+      "learning_rate": 7.2602277308836e-06,
+      "loss": 3.6388,
+      "step": 110
+    },
+    {
+      "epoch": 0.746218487394958,
+      "grad_norm": 4.40625,
+      "learning_rate": 7.211443451095007e-06,
+      "loss": 3.6798,
+      "step": 111
+    },
+    {
+      "epoch": 0.7529411764705882,
+      "grad_norm": 4.6875,
+      "learning_rate": 7.162396031650831e-06,
+      "loss": 3.8081,
+      "step": 112
+    },
+    {
+      "epoch": 0.7596638655462185,
+      "grad_norm": 4.59375,
+      "learning_rate": 7.113091308703498e-06,
+      "loss": 3.762,
+      "step": 113
+    },
+    {
+      "epoch": 0.7663865546218488,
+      "grad_norm": 4.25,
+      "learning_rate": 7.063535149021974e-06,
+      "loss": 3.5991,
+      "step": 114
+    },
+    {
+      "epoch": 0.773109243697479,
+      "grad_norm": 4.09375,
+      "learning_rate": 7.0137334492936875e-06,
+      "loss": 3.6272,
+      "step": 115
+    },
+    {
+      "epoch": 0.7798319327731092,
+      "grad_norm": 5.1875,
+      "learning_rate": 6.963692135422872e-06,
+      "loss": 3.7034,
+      "step": 116
+    },
+    {
+      "epoch": 0.7865546218487395,
+      "grad_norm": 4.6875,
+      "learning_rate": 6.913417161825449e-06,
+      "loss": 3.6734,
+      "step": 117
+    },
+    {
+      "epoch": 0.7932773109243697,
+      "grad_norm": 4.5625,
+      "learning_rate": 6.862914510720515e-06,
+      "loss": 3.6013,
+      "step": 118
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 4.4375,
+      "learning_rate": 6.812190191418508e-06,
+      "loss": 3.81,
+      "step": 119
+    },
+    {
+      "epoch": 0.8067226890756303,
+      "grad_norm": 4.15625,
+      "learning_rate": 6.7612502396061685e-06,
+      "loss": 3.6597,
+      "step": 120
+    },
+    {
+      "epoch": 0.8067226890756303,
+      "eval_loss": 3.696960926055908,
+      "eval_runtime": 29.7615,
+      "eval_samples_per_second": 42.303,
+      "eval_steps_per_second": 5.309,
+      "step": 120
+    },
+    {
+      "epoch": 0.8134453781512605,
+      "grad_norm": 4.125,
+      "learning_rate": 6.710100716628345e-06,
+      "loss": 3.7104,
+      "step": 121
+    },
+    {
+      "epoch": 0.8201680672268907,
+      "grad_norm": 4.5,
+      "learning_rate": 6.6587477087667615e-06,
+      "loss": 3.639,
+      "step": 122
+    },
+    {
+      "epoch": 0.826890756302521,
+      "grad_norm": 4.09375,
+      "learning_rate": 6.607197326515808e-06,
+      "loss": 3.6311,
+      "step": 123
+    },
+    {
+      "epoch": 0.8336134453781513,
+      "grad_norm": 4.5,
+      "learning_rate": 6.555455703855454e-06,
+      "loss": 3.7333,
+      "step": 124
+    },
+    {
+      "epoch": 0.8403361344537815,
+      "grad_norm": 4.375,
+      "learning_rate": 6.503528997521365e-06,
+      "loss": 3.7003,
+      "step": 125
+    },
+    {
+      "epoch": 0.8470588235294118,
+      "grad_norm": 4.375,
+      "learning_rate": 6.451423386272312e-06,
+      "loss": 3.6759,
+      "step": 126
+    },
+    {
+      "epoch": 0.853781512605042,
+      "grad_norm": 4.59375,
+      "learning_rate": 6.399145070154962e-06,
+      "loss": 3.6546,
+      "step": 127
+    },
+    {
+      "epoch": 0.8605042016806723,
+      "grad_norm": 3.984375,
+      "learning_rate": 6.346700269766132e-06,
+      "loss": 3.7089,
+      "step": 128
+    },
+    {
+      "epoch": 0.8672268907563025,
+      "grad_norm": 4.03125,
+      "learning_rate": 6.294095225512604e-06,
+      "loss": 3.5802,
+      "step": 129
+    },
+    {
+      "epoch": 0.8739495798319328,
+      "grad_norm": 4.0,
+      "learning_rate": 6.241336196868582e-06,
+      "loss": 3.7225,
+      "step": 130
+    },
+    {
+      "epoch": 0.880672268907563,
+      "grad_norm": 4.25,
+      "learning_rate": 6.188429461630866e-06,
+      "loss": 3.7397,
+      "step": 131
+    },
+    {
+      "epoch": 0.8873949579831932,
+      "grad_norm": 4.0625,
+      "learning_rate": 6.135381315171867e-06,
+      "loss": 3.6903,
+      "step": 132
+    },
+    {
+      "epoch": 0.8941176470588236,
+      "grad_norm": 4.28125,
+      "learning_rate": 6.0821980696905145e-06,
+      "loss": 3.6114,
+      "step": 133
+    },
+    {
+      "epoch": 0.9008403361344538,
+      "grad_norm": 4.03125,
+      "learning_rate": 6.028886053461175e-06,
+      "loss": 3.7576,
+      "step": 134
+    },
+    {
+      "epoch": 0.907563025210084,
+      "grad_norm": 3.890625,
+      "learning_rate": 5.975451610080643e-06,
+      "loss": 3.6462,
+      "step": 135
+    },
+    {
+      "epoch": 0.907563025210084,
+      "eval_loss": 3.706806182861328,
+      "eval_runtime": 30.2476,
+      "eval_samples_per_second": 41.623,
+      "eval_steps_per_second": 5.224,
+      "step": 135
+    },
+    {
+      "epoch": 0.9142857142857143,
+      "grad_norm": 5.0,
+      "learning_rate": 5.921901097713317e-06,
+      "loss": 3.6685,
+      "step": 136
+    },
+    {
+      "epoch": 0.9210084033613445,
+      "grad_norm": 4.9375,
+      "learning_rate": 5.8682408883346535e-06,
+      "loss": 3.6868,
+      "step": 137
+    },
+    {
+      "epoch": 0.9277310924369748,
+      "grad_norm": 4.15625,
+      "learning_rate": 5.814477366972945e-06,
+      "loss": 3.5962,
+      "step": 138
+    },
+    {
+      "epoch": 0.934453781512605,
+      "grad_norm": 4.0625,
+      "learning_rate": 5.760616930949584e-06,
+      "loss": 3.6538,
+      "step": 139
+    },
+    {
+      "epoch": 0.9411764705882353,
+      "grad_norm": 4.25,
+      "learning_rate": 5.7066659891178385e-06,
+      "loss": 3.7465,
+      "step": 140
+    },
+    {
+      "epoch": 0.9478991596638655,
+      "grad_norm": 4.0625,
+      "learning_rate": 5.65263096110026e-06,
+      "loss": 3.6044,
+      "step": 141
+    },
+    {
+      "epoch": 0.9546218487394958,
+      "grad_norm": 4.40625,
+      "learning_rate": 5.598518276524813e-06,
+      "loss": 3.6922,
+      "step": 142
+    },
+    {
+      "epoch": 0.9613445378151261,
+      "grad_norm": 4.6875,
+      "learning_rate": 5.544334374259823e-06,
+      "loss": 3.6808,
+      "step": 143
+    },
+    {
+      "epoch": 0.9680672268907563,
+      "grad_norm": 4.5,
+      "learning_rate": 5.490085701647805e-06,
+      "loss": 3.6849,
+      "step": 144
+    },
+    {
+      "epoch": 0.9747899159663865,
+      "grad_norm": 7.125,
+      "learning_rate": 5.435778713738292e-06,
+      "loss": 3.7327,
+      "step": 145
+    },
+    {
+      "epoch": 0.9815126050420168,
+      "grad_norm": 4.40625,
+      "learning_rate": 5.381419872519763e-06,
+      "loss": 3.7792,
+      "step": 146
+    },
+    {
+      "epoch": 0.9882352941176471,
+      "grad_norm": 4.59375,
+      "learning_rate": 5.327015646150716e-06,
+      "loss": 3.8095,
+      "step": 147
+    },
+    {
+      "epoch": 0.9949579831932773,
+      "grad_norm": 4.09375,
+      "learning_rate": 5.272572508190033e-06,
+      "loss": 3.5693,
+      "step": 148
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 4.84375,
+      "learning_rate": 5.218096936826681e-06,
+      "loss": 3.7536,
+      "step": 149
+    },
+    {
+      "epoch": 1.0067226890756302,
+      "grad_norm": 4.21875,
+      "learning_rate": 5.1635954141088815e-06,
+      "loss": 3.7009,
+      "step": 150
+    },
+    {
+      "epoch": 1.0067226890756302,
+      "eval_loss": 3.721317768096924,
+      "eval_runtime": 29.8173,
+      "eval_samples_per_second": 42.224,
+      "eval_steps_per_second": 5.299,
+      "step": 150
+    },
+    {
+      "epoch": 1.0134453781512605,
+      "grad_norm": 4.15625,
+      "learning_rate": 5.109074425172806e-06,
+      "loss": 3.7465,
+      "step": 151
+    },
+    {
+      "epoch": 1.0201680672268907,
+      "grad_norm": 3.890625,
+      "learning_rate": 5.054540457470912e-06,
+      "loss": 3.71,
+      "step": 152
+    },
+    {
+      "epoch": 1.026890756302521,
+      "grad_norm": 4.625,
+      "learning_rate": 5e-06,
+      "loss": 3.5906,
+      "step": 153
+    },
+    {
+      "epoch": 1.0336134453781514,
+      "grad_norm": 3.859375,
+      "learning_rate": 4.945459542529089e-06,
+      "loss": 3.6227,
+      "step": 154
+    },
+    {
+      "epoch": 1.0403361344537816,
+      "grad_norm": 4.71875,
+      "learning_rate": 4.890925574827195e-06,
+      "loss": 3.6398,
+      "step": 155
+    },
+    {
+      "epoch": 1.0470588235294118,
+      "grad_norm": 4.71875,
+      "learning_rate": 4.83640458589112e-06,
+      "loss": 3.6522,
+      "step": 156
+    },
+    {
+      "epoch": 1.053781512605042,
+      "grad_norm": 4.9375,
+      "learning_rate": 4.781903063173321e-06,
+      "loss": 3.7183,
+      "step": 157
+    },
+    {
+      "epoch": 1.0605042016806723,
+      "grad_norm": 4.125,
+      "learning_rate": 4.727427491809968e-06,
+      "loss": 3.765,
+      "step": 158
+    },
+    {
+      "epoch": 1.0672268907563025,
+      "grad_norm": 4.65625,
+      "learning_rate": 4.672984353849285e-06,
+      "loss": 3.6848,
+      "step": 159
+    },
+    {
+      "epoch": 1.0739495798319327,
+      "grad_norm": 5.28125,
+      "learning_rate": 4.618580127480239e-06,
+      "loss": 3.7065,
+      "step": 160
+    },
+    {
+      "epoch": 1.080672268907563,
+      "grad_norm": 4.46875,
+      "learning_rate": 4.564221286261709e-06,
+      "loss": 3.7159,
+      "step": 161
+    },
+    {
+      "epoch": 1.0873949579831932,
+      "grad_norm": 4.75,
+      "learning_rate": 4.509914298352197e-06,
+      "loss": 3.7166,
+      "step": 162
+    },
+    {
+      "epoch": 1.0941176470588236,
+      "grad_norm": 4.40625,
+      "learning_rate": 4.4556656257401786e-06,
+      "loss": 3.7179,
+      "step": 163
+    },
+    {
+      "epoch": 1.1008403361344539,
+      "grad_norm": 4.75,
+      "learning_rate": 4.401481723475189e-06,
+      "loss": 3.6981,
+      "step": 164
+    },
+    {
+      "epoch": 1.107563025210084,
+      "grad_norm": 4.3125,
+      "learning_rate": 4.347369038899744e-06,
+      "loss": 3.6717,
+      "step": 165
+    },
+    {
+      "epoch": 1.107563025210084,
+      "eval_loss": 3.731348752975464,
+      "eval_runtime": 30.2759,
+      "eval_samples_per_second": 41.584,
+      "eval_steps_per_second": 5.219,
+      "step": 165
+    },
+    {
+      "epoch": 1.1142857142857143,
+      "grad_norm": 4.15625,
+      "learning_rate": 4.293334010882164e-06,
+      "loss": 3.7169,
+      "step": 166
+    },
+    {
+      "epoch": 1.1210084033613446,
+      "grad_norm": 4.5625,
+      "learning_rate": 4.239383069050417e-06,
+      "loss": 3.7929,
+      "step": 167
+    },
+    {
+      "epoch": 1.1277310924369748,
+      "grad_norm": 4.25,
+      "learning_rate": 4.185522633027057e-06,
+      "loss": 3.663,
+      "step": 168
+    },
+    {
+      "epoch": 1.134453781512605,
+      "grad_norm": 4.71875,
+      "learning_rate": 4.131759111665349e-06,
+      "loss": 3.7563,
+      "step": 169
+    },
+    {
+      "epoch": 1.1411764705882352,
+      "grad_norm": 4.0,
+      "learning_rate": 4.078098902286684e-06,
+      "loss": 3.6651,
+      "step": 170
+    },
+    {
+      "epoch": 1.1478991596638655,
+      "grad_norm": 4.15625,
+      "learning_rate": 4.02454838991936e-06,
+      "loss": 3.6607,
+      "step": 171
+    },
+    {
+      "epoch": 1.1546218487394957,
+      "grad_norm": 4.4375,
+      "learning_rate": 3.971113946538826e-06,
+      "loss": 3.7405,
+      "step": 172
+    },
+    {
+      "epoch": 1.1613445378151261,
+      "grad_norm": 4.46875,
+      "learning_rate": 3.917801930309486e-06,
+      "loss": 3.7962,
+      "step": 173
+    },
+    {
+      "epoch": 1.1680672268907564,
+      "grad_norm": 4.34375,
+      "learning_rate": 3.864618684828135e-06,
+      "loss": 3.645,
+      "step": 174
+    },
+    {
+      "epoch": 1.1747899159663866,
+      "grad_norm": 4.15625,
+      "learning_rate": 3.8115705383691354e-06,
+      "loss": 3.6461,
+      "step": 175
+    },
+    {
+      "epoch": 1.1815126050420168,
+      "grad_norm": 4.03125,
+      "learning_rate": 3.7586638031314182e-06,
+      "loss": 3.71,
+      "step": 176
+    },
+    {
+      "epoch": 1.188235294117647,
+      "grad_norm": 8.6875,
+      "learning_rate": 3.705904774487396e-06,
+      "loss": 3.8202,
+      "step": 177
+    },
+    {
+      "epoch": 1.1949579831932773,
+      "grad_norm": 4.09375,
+      "learning_rate": 3.6532997302338704e-06,
+      "loss": 3.7077,
+      "step": 178
+    },
+    {
+      "epoch": 1.2016806722689075,
+      "grad_norm": 4.78125,
+      "learning_rate": 3.6008549298450403e-06,
+      "loss": 3.7005,
+      "step": 179
+    },
+    {
+      "epoch": 1.2084033613445377,
+      "grad_norm": 4.09375,
+      "learning_rate": 3.5485766137276894e-06,
+      "loss": 3.7631,
+      "step": 180
+    },
+    {
+      "epoch": 1.2084033613445377,
+      "eval_loss": 3.7338194847106934,
+      "eval_runtime": 29.8219,
+      "eval_samples_per_second": 42.217,
+      "eval_steps_per_second": 5.298,
+      "step": 180
+    },
+    {
+      "epoch": 1.2151260504201682,
+      "grad_norm": 4.03125,
+      "learning_rate": 3.4964710024786354e-06,
+      "loss": 3.6634,
+      "step": 181
+    },
+    {
+      "epoch": 1.2218487394957984,
+      "grad_norm": 4.5625,
+      "learning_rate": 3.444544296144546e-06,
+      "loss": 3.747,
+      "step": 182
+    },
+    {
+      "epoch": 1.2285714285714286,
+      "grad_norm": 4.9375,
+      "learning_rate": 3.3928026734841935e-06,
+      "loss": 3.6196,
+      "step": 183
+    },
+    {
+      "epoch": 1.2352941176470589,
+      "grad_norm": 3.90625,
+      "learning_rate": 3.341252291233241e-06,
+      "loss": 3.6693,
+      "step": 184
+    },
+    {
+      "epoch": 1.242016806722689,
+      "grad_norm": 4.21875,
+      "learning_rate": 3.289899283371657e-06,
+      "loss": 3.7271,
+      "step": 185
+    },
+    {
+      "epoch": 1.2487394957983193,
+      "grad_norm": 4.21875,
+      "learning_rate": 3.2387497603938327e-06,
+      "loss": 3.678,
+      "step": 186
+    },
+    {
+      "epoch": 1.2554621848739496,
+      "grad_norm": 4.0625,
+      "learning_rate": 3.1878098085814926e-06,
+      "loss": 3.702,
+      "step": 187
+    },
+    {
+      "epoch": 1.2621848739495798,
+      "grad_norm": 4.09375,
+      "learning_rate": 3.1370854892794855e-06,
+      "loss": 3.7787,
+      "step": 188
+    },
+    {
+      "epoch": 1.26890756302521,
+      "grad_norm": 4.4375,
+      "learning_rate": 3.0865828381745515e-06,
+      "loss": 3.6845,
+      "step": 189
+    },
+    {
+      "epoch": 1.2756302521008402,
+      "grad_norm": 4.125,
+      "learning_rate": 3.0363078645771303e-06,
+      "loss": 3.6905,
+      "step": 190
+    },
+    {
+      "epoch": 1.2823529411764705,
+      "grad_norm": 4.28125,
+      "learning_rate": 2.986266550706315e-06,
+      "loss": 3.6823,
+      "step": 191
+    },
+    {
+      "epoch": 1.289075630252101,
+      "grad_norm": 4.6875,
+      "learning_rate": 2.936464850978027e-06,
+      "loss": 3.7313,
+      "step": 192
+    },
+    {
+      "epoch": 1.2957983193277312,
+      "grad_norm": 4.09375,
+      "learning_rate": 2.886908691296504e-06,
+      "loss": 3.7439,
+      "step": 193
+    },
+    {
+      "epoch": 1.3025210084033614,
+      "grad_norm": 3.828125,
+      "learning_rate": 2.8376039683491683e-06,
+      "loss": 3.7323,
+      "step": 194
+    },
+    {
+      "epoch": 1.3092436974789916,
+      "grad_norm": 4.0625,
+      "learning_rate": 2.7885565489049948e-06,
+      "loss": 3.7535,
+      "step": 195
+    },
+    {
+      "epoch": 1.3092436974789916,
+      "eval_loss": 3.734619379043579,
+      "eval_runtime": 30.2166,
+      "eval_samples_per_second": 41.666,
+      "eval_steps_per_second": 5.229,
+      "step": 195
+    },
+    {
+      "epoch": 1.3159663865546218,
+      "grad_norm": 4.125,
+      "learning_rate": 2.739772269116402e-06,
+      "loss": 3.6891,
+      "step": 196
+    },
+    {
+      "epoch": 1.322689075630252,
+      "grad_norm": 4.34375,
+      "learning_rate": 2.6912569338248317e-06,
+      "loss": 3.7449,
+      "step": 197
+    },
+    {
+      "epoch": 1.3294117647058823,
+      "grad_norm": 3.96875,
+      "learning_rate": 2.6430163158700116e-06,
+      "loss": 3.6608,
+      "step": 198
+    },
+    {
+      "epoch": 1.3361344537815127,
+      "grad_norm": 4.75,
+      "learning_rate": 2.595056155403063e-06,
+      "loss": 3.7449,
+      "step": 199
+    },
+    {
+      "epoch": 1.342857142857143,
+      "grad_norm": 4.0,
+      "learning_rate": 2.5473821592034604e-06,
+      "loss": 3.7139,
+      "step": 200
+    },
+    {
+      "epoch": 1.3495798319327732,
+      "grad_norm": 4.59375,
+      "learning_rate": 2.5000000000000015e-06,
+      "loss": 3.7823,
+      "step": 201
+    },
+    {
+      "epoch": 1.3563025210084034,
+      "grad_norm": 4.21875,
+      "learning_rate": 2.4529153157957913e-06,
+      "loss": 3.7754,
+      "step": 202
+    },
+    {
+      "epoch": 1.3630252100840337,
+      "grad_norm": 4.25,
+      "learning_rate": 2.406133709197392e-06,
+      "loss": 3.7373,
+      "step": 203
+    },
+    {
+      "epoch": 1.3697478991596639,
+      "grad_norm": 4.125,
+      "learning_rate": 2.3596607467481602e-06,
+      "loss": 3.7617,
+      "step": 204
+    },
+    {
+      "epoch": 1.3764705882352941,
+      "grad_norm": 4.15625,
+      "learning_rate": 2.3135019582658803e-06,
+      "loss": 3.7332,
+      "step": 205
+    },
+    {
+      "epoch": 1.3831932773109243,
+      "grad_norm": 4.15625,
+      "learning_rate": 2.2676628361847834e-06,
+      "loss": 3.639,
+      "step": 206
+    },
+    {
+      "epoch": 1.3899159663865546,
+      "grad_norm": 4.25,
+      "learning_rate": 2.2221488349019903e-06,
+      "loss": 3.6918,
+      "step": 207
+    },
+    {
+      "epoch": 1.3966386554621848,
+      "grad_norm": 4.0625,
+      "learning_rate": 2.1769653701284983e-06,
+      "loss": 3.622,
+      "step": 208
+    },
+    {
+      "epoch": 1.403361344537815,
+      "grad_norm": 4.53125,
+      "learning_rate": 2.132117818244771e-06,
+      "loss": 3.7286,
+      "step": 209
+    },
+    {
+      "epoch": 1.4100840336134453,
+      "grad_norm": 4.0625,
+      "learning_rate": 2.08761151566099e-06,
+      "loss": 3.668,
+      "step": 210
+    },
+    {
+      "epoch": 1.4100840336134453,
+      "eval_loss": 3.7375030517578125,
+      "eval_runtime": 29.7795,
+      "eval_samples_per_second": 42.277,
+      "eval_steps_per_second": 5.306,
+      "step": 210
+    },
+    {
+      "epoch": 1.4168067226890757,
+      "grad_norm": 4.21875,
+      "learning_rate": 2.0434517581820893e-06,
+      "loss": 3.7376,
+      "step": 211
+    },
+    {
+      "epoch": 1.423529411764706,
+      "grad_norm": 3.984375,
+      "learning_rate": 1.999643800377596e-06,
+      "loss": 3.7108,
+      "step": 212
+    },
+    {
+      "epoch": 1.4302521008403362,
+      "grad_norm": 3.734375,
+      "learning_rate": 1.956192854956397e-06,
+      "loss": 3.6391,
+      "step": 213
+    },
+    {
+      "epoch": 1.4369747899159664,
+      "grad_norm": 4.0,
+      "learning_rate": 1.913104092146476e-06,
+      "loss": 3.6956,
+      "step": 214
+    },
+    {
+      "epoch": 1.4436974789915966,
+      "grad_norm": 4.375,
+      "learning_rate": 1.8703826390797047e-06,
+      "loss": 3.6241,
+      "step": 215
+    },
+    {
+      "epoch": 1.4504201680672268,
+      "grad_norm": 4.125,
+      "learning_rate": 1.8280335791817733e-06,
+      "loss": 3.7801,
+      "step": 216
+    },
+    {
+      "epoch": 1.457142857142857,
+      "grad_norm": 4.0,
+      "learning_rate": 1.7860619515673034e-06,
+      "loss": 3.6977,
+      "step": 217
+    },
+    {
+      "epoch": 1.4638655462184875,
+      "grad_norm": 4.71875,
+      "learning_rate": 1.7444727504402554e-06,
+      "loss": 3.6897,
+      "step": 218
+    },
+    {
+      "epoch": 1.4705882352941178,
+      "grad_norm": 4.65625,
+      "learning_rate": 1.7032709244996559e-06,
+      "loss": 3.6878,
+      "step": 219
+    },
+    {
+      "epoch": 1.477310924369748,
+      "grad_norm": 5.21875,
+      "learning_rate": 1.662461376350764e-06,
+      "loss": 3.7517,
+      "step": 220
+    },
+    {
+      "epoch": 1.4840336134453782,
+      "grad_norm": 4.53125,
+      "learning_rate": 1.6220489619216988e-06,
+      "loss": 3.7621,
+      "step": 221
+    },
+    {
+      "epoch": 1.4907563025210084,
+      "grad_norm": 3.984375,
+      "learning_rate": 1.5820384898856433e-06,
+      "loss": 3.7284,
+      "step": 222
+    },
+    {
+      "epoch": 1.4974789915966387,
+      "grad_norm": 4.3125,
+      "learning_rate": 1.5424347210886538e-06,
+      "loss": 3.6888,
+      "step": 223
+    },
+    {
+      "epoch": 1.504201680672269,
+      "grad_norm": 4.0625,
+      "learning_rate": 1.5032423679831642e-06,
+      "loss": 3.705,
+      "step": 224
+    },
+    {
+      "epoch": 1.5109243697478991,
+      "grad_norm": 3.765625,
+      "learning_rate": 1.4644660940672628e-06,
+      "loss": 3.679,
+      "step": 225
+    },
+    {
+      "epoch": 1.5109243697478991,
+      "eval_loss": 3.7383294105529785,
+      "eval_runtime": 30.258,
+      "eval_samples_per_second": 41.609,
+      "eval_steps_per_second": 5.222,
+      "step": 225
+    },
+    {
+      "epoch": 1.5176470588235293,
+      "grad_norm": 4.0625,
+      "learning_rate": 1.4261105133297693e-06,
+      "loss": 3.6644,
+      "step": 226
+    },
+    {
+      "epoch": 1.5243697478991596,
+      "grad_norm": 4.28125,
+      "learning_rate": 1.3881801897012225e-06,
+      "loss": 3.6869,
+      "step": 227
+    },
+    {
+      "epoch": 1.5310924369747898,
+      "grad_norm": 4.4375,
+      "learning_rate": 1.3506796365108232e-06,
+      "loss": 3.6292,
+      "step": 228
+    },
+    {
+      "epoch": 1.53781512605042,
+      "grad_norm": 4.09375,
+      "learning_rate": 1.3136133159493803e-06,
+      "loss": 3.6962,
+      "step": 229
+    },
+    {
+      "epoch": 1.5445378151260503,
+      "grad_norm": 4.125,
+      "learning_rate": 1.2769856385383689e-06,
+      "loss": 3.8197,
+      "step": 230
+    },
+    {
+      "epoch": 1.5512605042016807,
+      "grad_norm": 4.21875,
+      "learning_rate": 1.2408009626051137e-06,
+      "loss": 3.7204,
+      "step": 231
+    },
+    {
+      "epoch": 1.557983193277311,
+      "grad_norm": 4.28125,
+      "learning_rate": 1.2050635937641909e-06,
+      "loss": 3.7022,
+      "step": 232
+    },
+    {
+      "epoch": 1.5647058823529412,
+      "grad_norm": 5.125,
+      "learning_rate": 1.1697777844051105e-06,
+      "loss": 3.6865,
+      "step": 233
+    },
+    {
+      "epoch": 1.5714285714285714,
+      "grad_norm": 4.0625,
+      "learning_rate": 1.134947733186315e-06,
+      "loss": 3.6203,
+      "step": 234
+    },
+    {
+      "epoch": 1.5781512605042018,
+      "grad_norm": 4.34375,
+      "learning_rate": 1.100577584535592e-06,
+      "loss": 3.7241,
+      "step": 235
+    },
+    {
+      "epoch": 1.584873949579832,
+      "grad_norm": 3.84375,
+      "learning_rate": 1.0666714281569152e-06,
+      "loss": 3.5546,
+      "step": 236
+    },
+    {
+      "epoch": 1.5915966386554623,
+      "grad_norm": 4.03125,
+      "learning_rate": 1.0332332985438248e-06,
+      "loss": 3.7072,
+      "step": 237
+    },
+    {
+      "epoch": 1.5983193277310925,
+      "grad_norm": 4.96875,
+      "learning_rate": 1.0002671744993519e-06,
+      "loss": 3.8113,
+      "step": 238
+    },
+    {
+      "epoch": 1.6050420168067228,
+      "grad_norm": 3.953125,
+      "learning_rate": 9.677769786625869e-07,
+      "loss": 3.7273,
+      "step": 239
+    },
+    {
+      "epoch": 1.611764705882353,
+      "grad_norm": 4.3125,
+      "learning_rate": 9.357665770419244e-07,
+      "loss": 3.6539,
+      "step": 240
+    },
+    {
+      "epoch": 1.611764705882353,
+      "eval_loss": 3.7385716438293457,
+      "eval_runtime": 29.7733,
+      "eval_samples_per_second": 42.286,
+      "eval_steps_per_second": 5.307,
+      "step": 240
+    },
+    {
+      "epoch": 1.6184873949579832,
+      "grad_norm": 4.0,
+      "learning_rate": 9.042397785550405e-07,
+      "loss": 3.6896,
+      "step": 241
+    },
+    {
+      "epoch": 1.6252100840336134,
+      "grad_norm": 4.0,
+      "learning_rate": 8.732003345756812e-07,
+      "loss": 3.6619,
+      "step": 242
+    },
+    {
+      "epoch": 1.6319327731092437,
+      "grad_norm": 4.09375,
+      "learning_rate": 8.426519384872733e-07,
+      "loss": 3.7638,
+      "step": 243
+    },
+    {
+      "epoch": 1.638655462184874,
+      "grad_norm": 3.703125,
+      "learning_rate": 8.125982252434611e-07,
+      "loss": 3.685,
+      "step": 244
+    },
+    {
+      "epoch": 1.6453781512605041,
+      "grad_norm": 4.21875,
+      "learning_rate": 7.830427709355726e-07,
+      "loss": 3.6325,
+      "step": 245
+    },
+    {
+      "epoch": 1.6521008403361344,
+      "grad_norm": 4.28125,
+      "learning_rate": 7.539890923671061e-07,
+      "loss": 3.7295,
+      "step": 246
+    },
+    {
+      "epoch": 1.6588235294117646,
+      "grad_norm": 4.09375,
+      "learning_rate": 7.254406466352682e-07,
+      "loss": 3.6432,
+      "step": 247
+    },
+    {
+      "epoch": 1.6655462184873948,
+      "grad_norm": 5.09375,
+      "learning_rate": 6.974008307196057e-07,
+      "loss": 3.729,
+      "step": 248
+    },
+    {
+      "epoch": 1.6722689075630253,
+      "grad_norm": 4.09375,
+      "learning_rate": 6.698729810778065e-07,
+      "loss": 3.6502,
+      "step": 249
+    },
+    {
+      "epoch": 1.6789915966386555,
+      "grad_norm": 3.96875,
+      "learning_rate": 6.428603732486938e-07,
+      "loss": 3.6288,
+      "step": 250
+    },
+    {
+      "epoch": 1.6857142857142857,
+      "grad_norm": 3.84375,
+      "learning_rate": 6.163662214624616e-07,
+      "loss": 3.6903,
+      "step": 251
+    },
+    {
+      "epoch": 1.692436974789916,
+      "grad_norm": 4.34375,
+      "learning_rate": 5.903936782582253e-07,
+      "loss": 3.7859,
+      "step": 252
+    },
+    {
+      "epoch": 1.6991596638655462,
+      "grad_norm": 4.125,
+      "learning_rate": 5.649458341088915e-07,
+      "loss": 3.7541,
+      "step": 253
+    },
+    {
+      "epoch": 1.7058823529411766,
+      "grad_norm": 4.0625,
+      "learning_rate": 5.400257170534296e-07,
+      "loss": 3.7466,
+      "step": 254
+    },
+    {
+      "epoch": 1.7126050420168069,
+      "grad_norm": 4.125,
+      "learning_rate": 5.156362923365587e-07,
+      "loss": 3.6547,
+      "step": 255
+    },
+    {
+      "epoch": 1.7126050420168069,
+      "eval_loss": 3.738647222518921,
+      "eval_runtime": 30.241,
+      "eval_samples_per_second": 41.632,
+      "eval_steps_per_second": 5.225,
+      "step": 255
+    },
+    {
+      "epoch": 1.719327731092437,
+      "grad_norm": 3.78125,
+      "learning_rate": 4.917804620559202e-07,
+      "loss": 3.6395,
+      "step": 256
+    },
+    {
+      "epoch": 1.7260504201680673,
+      "grad_norm": 4.25,
+      "learning_rate": 4.6846106481675035e-07,
+      "loss": 3.7057,
+      "step": 257
+    },
+    {
+      "epoch": 1.7327731092436975,
+      "grad_norm": 4.125,
+      "learning_rate": 4.456808753941205e-07,
+      "loss": 3.7292,
+      "step": 258
+    },
+    {
+      "epoch": 1.7394957983193278,
+      "grad_norm": 4.3125,
+      "learning_rate": 4.2344260440276455e-07,
+      "loss": 3.7007,
+      "step": 259
+    },
+    {
+      "epoch": 1.746218487394958,
+      "grad_norm": 4.09375,
+      "learning_rate": 4.0174889797453875e-07,
+      "loss": 3.744,
+      "step": 260
+    },
+    {
+      "epoch": 1.7529411764705882,
+      "grad_norm": 4.3125,
+      "learning_rate": 3.8060233744356634e-07,
+      "loss": 3.8662,
+      "step": 261
+    },
+    {
+      "epoch": 1.7596638655462185,
+      "grad_norm": 4.3125,
+      "learning_rate": 3.600054390390778e-07,
+      "loss": 3.8242,
+      "step": 262
+    },
+    {
+      "epoch": 1.7663865546218487,
+      "grad_norm": 3.921875,
+      "learning_rate": 3.399606535860078e-07,
+      "loss": 3.6502,
+      "step": 263
+    },
+    {
+      "epoch": 1.773109243697479,
+      "grad_norm": 3.9375,
+      "learning_rate": 3.204703662133724e-07,
+      "loss": 3.6803,
+      "step": 264
+    },
+    {
+      "epoch": 1.7798319327731091,
+      "grad_norm": 4.90625,
+      "learning_rate": 3.015368960704584e-07,
+      "loss": 3.7613,
+      "step": 265
+    },
+    {
+      "epoch": 1.7865546218487394,
+      "grad_norm": 4.3125,
+      "learning_rate": 2.8316249605087386e-07,
+      "loss": 3.7316,
+      "step": 266
+    },
+    {
+      "epoch": 1.7932773109243696,
+      "grad_norm": 4.125,
+      "learning_rate": 2.653493525244721e-07,
+      "loss": 3.6491,
+      "step": 267
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 4.15625,
+      "learning_rate": 2.4809958507719444e-07,
+      "loss": 3.8626,
+      "step": 268
+    },
+    {
+      "epoch": 1.8067226890756303,
+      "grad_norm": 3.9375,
+      "learning_rate": 2.314152462588659e-07,
+      "loss": 3.7007,
+      "step": 269
+    },
+    {
+      "epoch": 1.8134453781512605,
+      "grad_norm": 3.921875,
+      "learning_rate": 2.152983213389559e-07,
+      "loss": 3.7533,
+      "step": 270
+    },
+    {
+      "epoch": 1.8134453781512605,
+      "eval_loss": 3.7400190830230713,
+      "eval_runtime": 29.7815,
+      "eval_samples_per_second": 42.275,
+      "eval_steps_per_second": 5.305,
+      "step": 270
+    },
+    {
+      "epoch": 1.8201680672268907,
+      "grad_norm": 4.15625,
+      "learning_rate": 1.99750728070357e-07,
+      "loss": 3.6811,
+      "step": 271
+    },
+    {
+      "epoch": 1.826890756302521,
+      "grad_norm": 3.921875,
+      "learning_rate": 1.8477431646118648e-07,
+      "loss": 3.6697,
+      "step": 272
+    },
+    {
+      "epoch": 1.8336134453781514,
+      "grad_norm": 4.375,
+      "learning_rate": 1.7037086855465902e-07,
+      "loss": 3.7755,
+      "step": 273
+    },
+    {
+      "epoch": 1.8403361344537816,
+      "grad_norm": 4.34375,
+      "learning_rate": 1.5654209821703458e-07,
+      "loss": 3.7415,
+      "step": 274
+    },
+    {
+      "epoch": 1.8470588235294119,
+      "grad_norm": 4.03125,
+      "learning_rate": 1.4328965093369284e-07,
+      "loss": 3.7171,
+      "step": 275
+    },
+    {
+      "epoch": 1.853781512605042,
+      "grad_norm": 4.28125,
+      "learning_rate": 1.3061510361333186e-07,
+      "loss": 3.692,
+      "step": 276
+    },
+    {
+      "epoch": 1.8605042016806723,
+      "grad_norm": 3.9375,
+      "learning_rate": 1.185199644003332e-07,
+      "loss": 3.7456,
+      "step": 277
+    },
+    {
+      "epoch": 1.8672268907563025,
+      "grad_norm": 3.875,
+      "learning_rate": 1.0700567249530835e-07,
+      "loss": 3.6095,
+      "step": 278
+    },
+    {
+      "epoch": 1.8739495798319328,
+      "grad_norm": 3.90625,
+      "learning_rate": 9.607359798384785e-08,
+      "loss": 3.759,
+      "step": 279
+    },
+    {
+      "epoch": 1.880672268907563,
+      "grad_norm": 3.984375,
+      "learning_rate": 8.572504167349449e-08,
+      "loss": 3.776,
+      "step": 280
+    },
+    {
+      "epoch": 1.8873949579831932,
+      "grad_norm": 4.0625,
+      "learning_rate": 7.59612349389599e-08,
+      "loss": 3.7251,
+      "step": 281
+    },
+    {
+      "epoch": 1.8941176470588235,
+      "grad_norm": 4.0625,
+      "learning_rate": 6.678333957560513e-08,
+      "loss": 3.6457,
+      "step": 282
+    },
+    {
+      "epoch": 1.9008403361344537,
+      "grad_norm": 3.984375,
+      "learning_rate": 5.8192447661196694e-08,
+      "loss": 3.7916,
+      "step": 283
+    },
+    {
+      "epoch": 1.907563025210084,
+      "grad_norm": 3.84375,
+      "learning_rate": 5.0189581425960644e-08,
+      "loss": 3.6759,
+      "step": 284
+    },
+    {
+      "epoch": 1.9142857142857141,
+      "grad_norm": 4.5625,
+      "learning_rate": 4.2775693130948094e-08,
+      "loss": 3.6983,
+      "step": 285
+    },
+    {
+      "epoch": 1.9142857142857141,
+      "eval_loss": 3.7386996746063232,
+      "eval_runtime": 30.2618,
+      "eval_samples_per_second": 41.604,
+      "eval_steps_per_second": 5.221,
+      "step": 285
+    },
+    {
+      "epoch": 1.9210084033613444,
+      "grad_norm": 4.59375,
+      "learning_rate": 3.59516649547248e-08,
+      "loss": 3.7151,
+      "step": 286
+    },
+    {
+      "epoch": 1.9277310924369748,
+      "grad_norm": 4.0,
+      "learning_rate": 2.971830888840177e-08,
+      "loss": 3.6223,
+      "step": 287
+    },
+    {
+      "epoch": 1.934453781512605,
+      "grad_norm": 4.03125,
+      "learning_rate": 2.4076366639015914e-08,
+      "loss": 3.6781,
+      "step": 288
+    },
+    {
+      "epoch": 1.9411764705882353,
+      "grad_norm": 4.21875,
+      "learning_rate": 1.9026509541272276e-08,
+      "loss": 3.7715,
+      "step": 289
+    },
+    {
+      "epoch": 1.9478991596638655,
+      "grad_norm": 4.0,
+      "learning_rate": 1.4569338477666838e-08,
+      "loss": 3.6257,
+      "step": 290
+    },
+    {
+      "epoch": 1.954621848739496,
+      "grad_norm": 4.125,
+      "learning_rate": 1.0705383806982606e-08,
+      "loss": 3.7141,
+      "step": 291
+    },
+    {
+      "epoch": 1.9613445378151262,
+      "grad_norm": 4.34375,
+      "learning_rate": 7.43510530118452e-09,
+      "loss": 3.6997,
+      "step": 292
+    },
+    {
+      "epoch": 1.9680672268907564,
+      "grad_norm": 4.21875,
+      "learning_rate": 4.758892090711009e-09,
+      "loss": 3.7004,
+      "step": 293
+    },
+    {
+      "epoch": 1.9747899159663866,
+      "grad_norm": 6.25,
+      "learning_rate": 2.6770626181715776e-09,
+      "loss": 3.7454,
+      "step": 294
+    },
+    {
+      "epoch": 1.9815126050420169,
+      "grad_norm": 4.5625,
+      "learning_rate": 1.189864600454338e-09,
+      "loss": 3.7919,
+      "step": 295
+    },
+    {
+      "epoch": 1.988235294117647,
+      "grad_norm": 4.1875,
+      "learning_rate": 2.974749992512571e-10,
+      "loss": 3.8223,
+      "step": 296
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 296,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 15,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 9.333328305140531e+16,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-296/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1ab91e38a5656a33a455ef4f7acd62dbd99d6a91529c45ba81572db9a3246c08
+size 6993

config.json ADDED Viewed

	@@ -0,0 +1,57 @@

+{
+  "_sliding_window_pattern": 6,
+  "architectures": [
+    "Gemma3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "attn_logit_softcapping": null,
+  "bos_token_id": 2,
+  "cache_implementation": "hybrid",
+  "eos_token_id": 106,
+  "final_logit_softcapping": null,
+  "head_dim": 256,
+  "hidden_activation": "gelu_pytorch_tanh",
+  "hidden_size": 640,
+  "initializer_range": 0.02,
+  "intermediate_size": 2048,
+  "layer_types": [
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "model_type": "gemma3_text",
+  "num_attention_heads": 4,
+  "num_hidden_layers": 18,
+  "num_key_value_heads": 1,
+  "pad_token_id": 0,
+  "query_pre_attn_scalar": 256,
+  "rms_norm_eps": 1e-06,
+  "rope_local_base_freq": 10000.0,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": 512,
+  "sliding_window_pattern": 6,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.4",
+  "unsloth_fixed": true,
+  "use_bidirectional_attention": false,
+  "use_cache": false,
+  "vocab_size": 262145
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "bos_token_id": 2,
+  "cache_implementation": "hybrid",
+  "do_sample": true,
+  "eos_token_id": [
+    1,
+    106
+  ],
+  "max_length": 32768,
+  "pad_token_id": 0,
+  "top_k": 64,
+  "top_p": 0.95,
+  "transformers_version": "4.52.4"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:df5b41c0ad86b13aa2e31015b2fa37db700a177ccac2c390d5dcda7424957730
+size 536224336

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "boi_token": "<start_of_image>",
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eoi_token": "<end_of_image>",
+  "eos_token": {
+    "content": "<end_of_turn>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "image_token": "<image_soft_token>",
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
+size 33384568

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
+size 4689074

tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1ab91e38a5656a33a455ef4f7acd62dbd99d6a91529c45ba81572db9a3246c08
+size 6993