[2026-06-13 16:40:45,849] [DEBUG] [axolotl.utils.config.resolve_dtype:74] [PID:590] bf16 support detected, enabling for this configuration.
[2026-06-13 16:40:46,071] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:590] baseline 0.000GB ()
[2026-06-13 16:40:46,071] [INFO] [axolotl.cli.config.load_cfg:333] [PID:590] config:
{
  "activation_offloading": true,
  "adapter": "qlora",
  "attn_implementation": "flash_attention_2",
  "attn_needs_dtype_cast": true,
  "attn_supports_packing": true,
  "attn_uses_flash_lib": true,
  "axolotl_config_path": "./configs/jacob/jacob-micro-light.yaml",
  "base_model": "Qwen/Qwen3.5-4B",
  "base_model_config": "Qwen/Qwen3.5-4B",
  "batch_size": 8,
  "bf16": true,
  "capabilities": {
    "bf16": true,
    "compute_capability": "sm_89",
    "fp8": false,
    "n_gpu": 1,
    "n_node": 1,
    "tf32": true
  },
  "chat_template": "qwen3_5",
  "context_parallel_size": 1,
  "cut_cross_entropy": true,
  "dataloader_num_workers": 1,
  "dataloader_pin_memory": true,
  "dataloader_prefetch_factor": 256,
  "dataset_num_proc": 16,
  "datasets": [
    {
      "chat_template": "tokenizer_default",
      "field_messages": "messages",
      "field_tools": "tools",
      "message_property_mappings": {
        "content": "content",
        "role": "role"
      },
      "path": "jacob-ml/Jacob-2-SSFT-filtered",
      "split": "train",
      "trust_remote_code": false,
      "type": "chat_template"
    }
  ],
  "ddp": false,
  "device": "cuda:0",
  "dion_rank_fraction": 1.0,
  "dion_rank_multiple_of": 1,
  "eaft_alpha": 1.0,
  "eaft_k": 20,
  "env_capabilities": {
    "torch_version": "2.9.1"
  },
  "eval_batch_size": 1,
  "eval_causal_lm_metrics": [
    "sacrebleu",
    "comet",
    "ter",
    "chrf"
  ],
  "eval_max_new_tokens": 128,
  "eval_table_size": 0,
  "experimental_skip_move_to_device": true,
  "fp16": false,
  "freeze_mm_modules": true,
  "generate_samples": false,
  "generation_do_sample": true,
  "generation_max_new_tokens": 50,
  "generation_prompt_ratio": 0.5,
  "generation_temperature": 0.7,
  "gradient_accumulation_steps": 8,
  "gradient_checkpointing": true,
  "gradient_checkpointing_kwargs": {
    "use_reentrant": true
  },
  "hub_model_id": "jacob-ml/Jacob-2-4B",
  "include_tkps": true,
  "is_multimodal": true,
  "layer_offloading": true,
  "learning_rate": 0.0002,
  "liger_fused_linear_cross_entropy": true,
  "liger_glu_activation": true,
  "liger_layer_norm": true,
  "liger_rms_norm": true,
  "liger_rope": true,
  "lisa_layers_attribute": "model.layers",
  "load_best_model_at_end": false,
  "load_in_4bit": true,
  "load_in_8bit": false,
  "local_rank": 0,
  "logging_steps": 1,
  "lora_alpha": 16,
  "lora_dropout": 0.0,
  "lora_embedding_kernel": true,
  "lora_mlp_kernel": true,
  "lora_o_kernel": true,
  "lora_qkv_kernel": true,
  "lora_r": 8,
  "lora_target_modules": "model\\.language_model\\.layers\\.[\\d]+\\.(_checkpoint_wrapped_module\\.)?(mlp|self_attn)\\.(up|down|gate|q|k|v|o)_proj",
  "loraplus_lr_embedding": 1e-06,
  "lr_scheduler": "cosine",
  "mean_resizing_embeddings": false,
  "merge_method": "memory_efficient",
  "micro_batch_size": 1,
  "model_config_type": "qwen3_5",
  "model_config_type_text": "qwen3_5_text",
  "num_epochs": 1.0,
  "num_generation_samples": 3,
  "optimizer": "adamw_torch_8bit",
  "otel_metrics_host": "localhost",
  "otel_metrics_port": 8000,
  "output_dir": "./outputs/Jacob-2-4B",
  "pad_to_sequence_len": false,
  "plugins": [
    "axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin",
    "axolotl.integrations.liger.LigerPlugin"
  ],
  "pretrain_multipack_attn": true,
  "processor_config": "Qwen/Qwen3.5-4B",
  "profiler_steps_start": 0,
  "qlora_sharded_model_loading": false,
  "quantize_moe_experts": false,
  "ray_num_workers": 1,
  "relora_prune_method": "magnitude",
  "remove_unused_columns": true,
  "resources_per_worker": {
    "GPU": 1
  },
  "sample_packing": false,
  "sample_packing_bin_size": 200,
  "sample_packing_group_size": 100000,
  "save_only_model": false,
  "save_safetensors": true,
  "sequence_len": 4096,
  "shuffle_before_merging_datasets": false,
  "shuffle_merged_datasets": true,
  "skip_prepare_dataset": false,
  "streaming_multipack_buffer_size": 10000,
  "strict": false,
  "tensor_parallel_size": 1,
  "tf32": false,
  "tiled_mlp_use_original_mlp": true,
  "tokenizer_config": "Qwen/Qwen3.5-4B",
  "tokenizer_save_jinja_files": true,
  "torch_dtype": "torch.bfloat16",
  "train_on_inputs": false,
  "trl": {
    "async_prefetch": false,
    "log_completions": false,
    "mask_truncated_completions": false,
    "ref_model_mixup_alpha": 0.9,
    "ref_model_sync_steps": 64,
    "replay_buffer_size": 0,
    "replay_recompute_logps": true,
    "reroll_max_groups": 1,
    "reroll_start_fraction": 1.0,
    "reward_num_workers": 1,
    "scale_rewards": true,
    "skip_zero_advantage_batches": true,
    "sync_ref_model": false,
    "use_data_producer": false,
    "use_vllm": false,
    "vllm_lora_sync": false,
    "vllm_server_host": "0.0.0.0",
    "vllm_server_port": 8000
  },
  "use_otel_metrics": false,
  "use_ray": false,
  "val_set_size": 0.0,
  "vllm": {
    "device": "auto",
    "dtype": "auto",
    "gpu_memory_utilization": 0.9,
    "host": "0.0.0.0",
    "port": 8000
  },
  "warmup_ratio": 0.1,
  "weight_decay": 0.0,
  "world_size": 1
}
[2026-06-13 16:40:47,917] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:311] [PID:590] EOS: 248046 / <|im_end|>
[2026-06-13 16:40:47,917] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:312] [PID:590] BOS: None / None
[2026-06-13 16:40:47,917] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:313] [PID:590] PAD: 248044 / <|endoftext|>
[2026-06-13 16:40:47,917] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:314] [PID:590] UNK: None / None
[2026-06-13 16:40:47,917] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:482] [PID:590] Unable to find prepared dataset in last_run_prepared/df1a8767edaa54a3b07747a0b1ec1c3a
[2026-06-13 16:40:47,918] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:590] Loading raw datasets...
[2026-06-13 16:40:47,918] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:590] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
Downloading (incomplete total...): 0.00B [00:00, ?B/s]
Fetching 0 files: 0it [00:00, ?it/s][AFetching 0 files: 0it [00:00, ?it/s]
Download complete: : 0.00B [00:00, ?B/s]              Download complete: : 0.00B [00:00, ?B/s]
[2026-06-13 16:40:49,854] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:590] Loading dataset: jacob-ml/Jacob-2-SSFT-filtered with base_type: chat_template and prompt_style: None
[2026-06-13 16:40:49,856] [INFO] [axolotl.prompt_strategies.chat_template.__call__:1191] [PID:590] Using chat template:
---
{%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0].role == 'system' %}
        {%- if messages[0].content is string %}
            {{- messages[0].content + '\n\n' }}
        {%- else %}
            {%- for part in messages[0].content %}
                {%- if part is mapping %}
                    {%- set system_text = part.get('text') or part.get('content') or part.get('value') %}
                    {%- if system_text %}{{- system_text }}{%- endif %}
                {%- elif part is string %}
                    {{- part }}
                {%- endif %}
            {%- endfor %}
            {{- '\n\n' }}
        {%- endif %}
    {%- endif %}
    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
    {%- for tool in tools %}
        {{- "\n" }}
        {{- tool | tojson }}
    {%- endfor %}
    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
    {%- if messages[0].role == 'system' %}
        {%- if messages[0].content is string %}
            {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
        {%- else %}
            {{- '<|im_start|>system\n' }}
            {%- for part in messages[0].content %}
                {%- if part is mapping %}
                    {%- set system_text = part.get('text') or part.get('content') or part.get('value') %}
                    {%- if system_text %}{{- system_text }}{%- endif %}
                {%- elif part is string %}
                    {{- part }}
                {%- endif %}
            {%- endfor %}
            {{- '<|im_end|>\n' }}
        {%- endif %}
    {%- endif %}
{%- endif %}
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
{#- Determine the real last index: use provided value or default to messages length - 1 #}
{%- if real_last_index is defined and real_last_index is not none %}
    {%- set ns.real_last_index = real_last_index %}
{%- else %}
    {%- set ns.real_last_index = messages|length - 1 %}
{%- endif %}
{%- for message in messages[::-1] %}
    {%- set index = (messages|length - 1) - loop.index0 %}
    {%- if message['content'] is string %}
        {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
            {%- set ns.multi_step_tool = false %}
            {%- set ns.last_query_index = index %}
        {%- endif %}
    {%- else %}
        {%- if ns.multi_step_tool and message.role == "user" %}
            {%- set ns.multi_step_tool = false %}
            {%- set ns.last_query_index = index %}
        {%- endif %}
    {%- endif %}
{%- endfor %}
{%- for message in messages %}
    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
        {{- '<|im_start|>' + message.role + '\n' }}
        {%- if message['content'] is string %}
            {{- message.content }}
        {%- else %}
            {%- for content in message['content'] %}
                {%- if content['type'] == 'image' or 'image' in content or 'image_url' in content %}
                    {{- '<|vision_start|><|image_pad|><|vision_end|>' }}
                {%- elif content['type'] == 'video' or 'video' in content %}
                    {{- '<|vision_start|><|video_pad|><|vision_end|>' }}
                {%- elif 'text' in content %}
                    {{- content['text'] }}
                {%- endif %}
            {%- endfor %}
        {%- endif %}
        {{- '<|im_end|>\n' }}
    {%- elif message.role == "assistant" %}
        {%- if message['content'] is string %}
            {%- set content = message.content %}
        {%- else %}
            {%- set content = '' %}
            {%- for item in message['content'] %}
                {%- if 'text' in item %}
                    {%- set content = content + item['text'] %}
                {%- endif %}
            {%- endfor %}
        {%- endif %}
        {%- set reasoning_content = '' %}
        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
            {%- set reasoning_content = message.reasoning_content %}
        {%- else %}
            {%- if '</think>' in content %}
                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
            {%- endif %}
        {%- endif %}
        {%- if loop.index0 > ns.last_query_index %}
            {%- if loop.index0 == ns.real_last_index or (loop.index0 != ns.real_last_index and reasoning_content) %}
                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
            {%- else %}
                {{- '<|im_start|>' + message.role + '\n' + content }}
            {%- endif %}
        {%- else %}
            {{- '<|im_start|>' + message.role + '\n' + content }}
        {%- endif %}
        {%- if message.tool_calls %}
            {%- for tool_call in message.tool_calls %}
                {%- if (loop.first and content) or (not loop.first) %}
                    {{- '\n' }}
                {%- endif %}
                {%- if tool_call.function %}
                    {%- set tool_call = tool_call.function %}
                {%- endif %}
                {{- '<tool_call>\n{"name": "' }}
                {{- tool_call.name }}
                {{- '", "arguments": ' }}
                {%- if tool_call.arguments is string %}
                    {{- tool_call.arguments }}
                {%- else %}
                    {{- tool_call.arguments | tojson }}
                {%- endif %}
                {{- '}\n</tool_call>' }}
            {%- endfor %}
        {%- endif %}
        {{- '<|im_end|>\n' }}
    {%- elif message.role == "tool" %}
        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
            {{- '<|im_start|>user' }}
        {%- endif %}
        {{- '\n<tool_response>\n' }}
        {{- message.content }}
        {{- '\n</tool_response>' }}
        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
            {{- '<|im_end|>\n' }}
        {%- endif %}
    {%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
    {{- '<|im_start|>assistant\n' }}
    {%- if enable_thinking is defined and enable_thinking is false %}
        {{- '<think>\n\n</think>\n\n' }}
    {%- else %}
        {{- '<think>\n\n' }}
    {%- endif %}
{%- endif %}

---
Tokenizing Prompts (num_proc=16):   0%|                                                                                                                                                                                       | 0/4209 [00:00<?, ? examples/s]Tokenizing Prompts (num_proc=16):   6%|██████████▊                                                                                                                                                                  | 263/4209 [00:06<01:32, 42.69 examples/s]Tokenizing Prompts (num_proc=16):  13%|█████████████████████▋                                                                                                                                                       | 527/4209 [00:06<00:40, 90.30 examples/s]Tokenizing Prompts (num_proc=16):  19%|████████████████████████████████▎                                                                                                                                           | 790/4209 [00:06<00:21, 160.42 examples/s]Tokenizing Prompts (num_proc=16):  25%|██████████████████████████████████████████▊                                                                                                                                | 1053/4209 [00:07<00:12, 243.33 examples/s]Tokenizing Prompts (num_proc=16):  31%|█████████████████████████████████████████████████████▍                                                                                                                     | 1316/4209 [00:07<00:08, 330.70 examples/s]Tokenizing Prompts (num_proc=16):  38%|████████████████████████████████████████████████████████████████▏                                                                                                          | 1579/4209 [00:08<00:07, 359.56 examples/s]Tokenizing Prompts (num_proc=16):  44%|██████████████████████████████████████████████████████████████████████████▊                                                                                                | 1842/4209 [00:08<00:04, 495.89 examples/s]Tokenizing Prompts (num_proc=16):  56%|████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                          | 2368/4209 [00:08<00:02, 811.60 examples/s]Tokenizing Prompts (num_proc=16):  75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                          | 3157/4209 [00:08<00:00, 1420.68 examples/s]Tokenizing Prompts (num_proc=16):  88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                     | 3683/4209 [00:09<00:00, 1151.98 examples/s]Tokenizing Prompts (num_proc=16):  94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎          | 3946/4209 [00:09<00:00, 952.26 examples/s]Tokenizing Prompts (num_proc=16): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4209/4209 [00:09<00:00, 976.58 examples/s]Tokenizing Prompts (num_proc=16): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4209/4209 [00:10<00:00, 418.97 examples/s]
[2026-06-13 16:41:00,066] [INFO] [axolotl.utils.data.utils._log_dataset_stats:212] [PID:590] min_input_len: 450
[2026-06-13 16:41:00,066] [INFO] [axolotl.utils.data.utils._log_dataset_stats:213] [PID:590] max_input_len: 23626
Dropping Invalid Sequences (<None or >4096) (num_proc=16):   0%|                                                                                                                                                              | 0/4209 [00:00<?, ? examples/s]Dropping Invalid Sequences (<None or >4096) (num_proc=16):   6%|█████████▏                                                                                                                                         | 263/4209 [00:00<00:06, 626.42 examples/s]Dropping Invalid Sequences (<None or >4096) (num_proc=16): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4209/4209 [00:00<00:00, 7549.19 examples/s]
[2026-06-13 16:41:00,642] [INFO] [axolotl.utils.data.utils._drop_outside_range:306] [PID:590] Dropped 175 sequences outside valid range ([None, 4096])
Saving the dataset (0/15 shards):   0%|                                                                                                                                                                                       | 0/4034 [00:00<?, ? examples/s]Saving the dataset (0/15 shards):   7%|███████████▌                                                                                                                                                                 | 269/4034 [00:07<01:39, 37.81 examples/s]Saving the dataset (1/15 shards):   7%|███████████▌                                                                                                                                                                 | 269/4034 [00:07<01:39, 37.81 examples/s]Saving the dataset (2/15 shards):  13%|███████████████████████                                                                                                                                                      | 538/4034 [00:07<01:32, 37.81 examples/s]Saving the dataset (3/15 shards):  20%|██████████████████████████████████▌                                                                                                                                          | 807/4034 [00:07<01:25, 37.81 examples/s]Saving the dataset (4/15 shards):  27%|█████████████████████████████████████████████▉                                                                                                                              | 1076/4034 [00:07<01:18, 37.81 examples/s]Saving the dataset (5/15 shards):  33%|█████████████████████████████████████████████████████████▎                                                                                                                  | 1345/4034 [00:07<01:11, 37.81 examples/s]Saving the dataset (6/15 shards):  47%|████████████████████████████████████████████████████████████████████████████████▎                                                                                           | 1883/4034 [00:07<00:56, 37.81 examples/s]Saving the dataset (7/15 shards):  47%|████████████████████████████████████████████████████████████████████████████████▎                                                                                           | 1883/4034 [00:07<00:56, 37.81 examples/s]Saving the dataset (8/15 shards):  53%|███████████████████████████████████████████████████████████████████████████████████████████▊                                                                                | 2152/4034 [00:07<00:49, 37.81 examples/s]Saving the dataset (9/15 shards):  60%|███████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                    | 2421/4034 [00:07<00:42, 37.81 examples/s]Saving the dataset (10/15 shards):  67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                         | 2689/4034 [00:07<00:35, 37.81 examples/s]Saving the dataset (10/15 shards):  73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                             | 2958/4034 [00:07<00:01, 554.99 examples/s]Saving the dataset (11/15 shards):  73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                             | 2958/4034 [00:07<00:01, 554.99 examples/s]Saving the dataset (12/15 shards):  87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                      | 3496/4034 [00:07<00:00, 554.99 examples/s]Saving the dataset (13/15 shards):  87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                      | 3496/4034 [00:07<00:00, 554.99 examples/s]Saving the dataset (14/15 shards):  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋           | 3765/4034 [00:07<00:00, 554.99 examples/s]Saving the dataset (15/15 shards): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4034/4034 [00:07<00:00, 554.99 examples/s]Saving the dataset (15/15 shards): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4034/4034 [00:08<00:00, 493.96 examples/s]
[2026-06-13 16:41:09,028] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:420] [PID:590] total_num_tokens: 5_816_278
[2026-06-13 16:41:09,069] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:438] [PID:590] `total_supervised_tokens: 3_801_290`
[2026-06-13 16:41:09,069] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:521] [PID:590] total_num_steps: 505
[2026-06-13 16:41:09,069] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:590] Maximum number of steps set at 505
[2026-06-13 16:41:09,109] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:590] loading tokenizer... Qwen/Qwen3.5-4B
[2026-06-13 16:41:10,732] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:311] [PID:590] EOS: 248046 / <|im_end|>
[2026-06-13 16:41:10,732] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:312] [PID:590] BOS: None / None
[2026-06-13 16:41:10,732] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:313] [PID:590] PAD: 248044 / <|endoftext|>
[2026-06-13 16:41:10,733] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:314] [PID:590] UNK: None / None
[2026-06-13 16:41:14,967] [DEBUG] [axolotl.train.setup_model_and_tokenizer:81] [PID:590] Loading model
[2026-06-13 16:41:15,111] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:75] [PID:590] Patched OptimState8bit for torch.compile compatibility
[2026-06-13 16:41:15,111] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:122] [PID:590] Patched OptimState4bit for torch.compile compatibility
[2026-06-13 16:41:15,111] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:154] [PID:590] Patched OptimStateFp8 for torch.compile compatibility
[2026-06-13 16:41:15,114] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:94] [PID:590] Patched Trainer.evaluation_loop with nanmean loss calculation
[2026-06-13 16:41:15,115] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:148] [PID:590] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2026-06-13 16:41:15,363] [INFO] [axolotl.monkeypatch.models.qwen3_5.modeling.patch_qwen3_5_vlm_flash_attention:289] [PID:590] Applied Qwen3.5 VLM flash-attention patch (3-D MRoPE position_ids)
[2026-06-13 16:41:15,518] [INFO] [axolotl.monkeypatch.lora_kernels.patch_self_attn_lora:364] [PID:590] Patched attention class with LoRA optims: Qwen3_5Attention
[2026-06-13 16:41:15,532] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:590] Applying Cut Cross Entropy to model type: qwen3_5
Loading weights:   0%|                                                                                                                                                                                                                | 0/723 [00:00<?, ?it/s]Loading weights:   1%|█▋                                                                                                                                                                                                      | 6/723 [00:00<00:14, 49.51it/s]Loading weights:   9%|██████████████████                                                                                                                                                                                    | 66/723 [00:00<00:01, 341.84it/s]Loading weights:  18%|███████████████████████████████████▉                                                                                                                                                                 | 132/723 [00:00<00:01, 471.54it/s]Loading weights:  27%|█████████████████████████████████████████████████████▍                                                                                                                                               | 196/723 [00:00<00:00, 532.26it/s]Loading weights:  35%|████████████████████████████████████████████████████████████████████▍                                                                                                                                | 251/723 [00:00<00:00, 537.40it/s]Loading weights:  42%|███████████████████████████████████████████████████████████████████████████████████▍                                                                                                                 | 306/723 [00:00<00:00, 529.41it/s]Loading weights:  50%|██████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                   | 360/723 [00:00<00:00, 481.93it/s]Loading weights:  59%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                 | 424/723 [00:00<00:00, 524.30it/s]Loading weights:  88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                       | 636/723 [00:00<00:00, 985.87it/s]Loading weights: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 723/723 [00:01<00:00, 707.42it/s]
[2026-06-13 16:41:17,441] [INFO] [axolotl.loaders.model._prepare_model_for_quantization:900] [PID:590] converting PEFT model w/ prepare_model_for_kbit_training
[2026-06-13 16:41:17,449] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:356] [PID:590] Converting modules to torch.bfloat16
[2026-06-13 16:41:17,452] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:590] Memory usage after model load 5.454GB (+5.454GB allocated, +5.488GB reserved)
trainable params: 10,616,832 || all params: 4,549,882,368 || trainable%: 0.2333
[2026-06-13 16:41:17,559] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:590] after adapters 3.124GB (+3.124GB allocated, +5.527GB reserved)
[2026-06-13 16:41:18,170] [INFO] [axolotl.utils.freeze.freeze_mm_modules:49] [PID:590] freeze_mm_modules: froze 0 vision/audio parameters
[2026-06-13 16:41:18,920] [INFO] [axolotl.core.trainers.mixins.layer_offloading.__init__:291] [PID:590] Layer parameter offloading enabled
[2026-06-13 16:41:18,920] [WARNING] [axolotl.core.trainers.mixins.layer_offloading.__init__:73] [PID:590] LayerOffloadManager: no decoder layers found, offloading disabled
[2026-06-13 16:41:18,920] [INFO] [axolotl.train.save_initial_configs:450] [PID:590] Pre-saving adapter config to ./outputs/Jacob-2-4B...
[2026-06-13 16:41:18,920] [INFO] [axolotl.train.save_initial_configs:454] [PID:590] Pre-saving tokenizer to ./outputs/Jacob-2-4B...
[2026-06-13 16:41:19,036] [INFO] [axolotl.train.save_initial_configs:459] [PID:590] Pre-saving model config to ./outputs/Jacob-2-4B...
[2026-06-13 16:41:19,047] [INFO] [axolotl.train.save_initial_configs:463] [PID:590] Pre-saving processor to ./outputs/Jacob-2-4B...
[2026-06-13 16:41:19,161] [INFO] [axolotl.train.execute_training:226] [PID:590] Starting trainer...
  0%|                                                                                                                                                                                                                                 | 0/505 [00:00<?, ?it/s][2026-06-13 16:41:19,966] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:41:28,290] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:41:28,951] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:41:29,030] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:41:37,896] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:41:46,838] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:41:55,333] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:42:03,363] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:42:11,629] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:42:21,252] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:42:29,312] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

  0%|▍                                                                                                                                                                                                                     | 1/505 [01:12<10:10:16, 72.65s/it]                                                                                                                                                                                                                                                              {'loss': '1.082', 'grad_norm': '0.6225', 'learning_rate': '0', 'ppl': '2.95', 'memory/max_active (GiB)': '3.9', 'memory/max_allocated (GiB)': '3.9', 'memory/device_reserved (GiB)': '5.53', 'tokens/train_per_sec_per_gpu': '13.94', 'tokens/total': 13016, 'tokens/trainable': 7278, 'epoch': '0.001983'}
  0%|▍                                                                                                                                                                                                                     | 1/505 [01:12<10:10:16, 72.65s/it][2026-06-13 16:42:39,138] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:42:47,299] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:42:54,835] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:43:03,017] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:43:10,602] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:43:18,727] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:43:29,005] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:43:37,266] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:43:45,017] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:43:53,893] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:44:01,773] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:44:10,188] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

  0%|▊                                                                                                                                                                                                                     | 2/505 [02:51<12:18:32, 88.10s/it]                                                                                                                                                                                                                                                              {'loss': '1.157', 'grad_norm': '0.6519', 'learning_rate': '4e-06', 'ppl': '3.179', 'memory/max_active (GiB)': '4', 'memory/max_allocated (GiB)': '4', 'memory/device_reserved (GiB)': '4.19', 'tokens/train_per_sec_per_gpu': '8.766', 'tokens/total': 24812, 'tokens/trainable': 13570, 'epoch': '0.003966'}
  0%|▊                                                                                                                                                                                                                     | 2/505 [02:51<12:18:32, 88.10s/it][2026-06-13 16:44:11,329] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:44:25,415] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:44:33,617] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:44:41,798] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:44:49,925] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

  1%|█▎                                                                                                                                                                                                                     | 3/505 [03:32<9:15:23, 66.38s/it]                                                                                                                                                                                                                                                              {'loss': '1.235', 'grad_norm': '0.6577', 'learning_rate': '8e-06', 'ppl': '3.437', 'memory/max_active (GiB)': '4.27', 'memory/max_allocated (GiB)': '4.27', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '26.44', 'tokens/total': 39682, 'tokens/trainable': 22538, 'epoch': '0.005949'}
  1%|█▎                                                                                                                                                                                                                     | 3/505 [03:32<9:15:23, 66.38s/it][2026-06-13 16:45:01,864] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:45:10,015] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:45:18,176] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:45:26,458] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

  1%|█▋                                                                                                                                                                                                                     | 4/505 [04:08<7:36:23, 54.66s/it]                                                                                                                                                                                                                                                              {'loss': '1.236', 'grad_norm': '0.7874', 'learning_rate': '1.2e-05', 'ppl': '3.442', 'memory/max_active (GiB)': '3.77', 'memory/max_allocated (GiB)': '3.77', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '5.288', 'tokens/total': 49103, 'tokens/trainable': 27264, 'epoch': '0.007933'}
  1%|█▋                                                                                                                                                                                                                     | 4/505 [04:08<7:36:23, 54.66s/it][2026-06-13 16:45:28,548] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:45:38,854] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:45:47,157] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:45:48,646] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

  1%|██▏                                                                                                                                                                                                                    | 5/505 [04:33<6:04:08, 43.70s/it]                                                                                                                                                                                                                                                              {'loss': '1.074', 'grad_norm': '0.7727', 'learning_rate': '1.6e-05', 'ppl': '2.926', 'memory/max_active (GiB)': '4.03', 'memory/max_allocated (GiB)': '4.03', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '31.78', 'tokens/total': 62059, 'tokens/trainable': 34875, 'epoch': '0.009916'}
  1%|██▏                                                                                                                                                                                                                    | 5/505 [04:33<6:04:08, 43.70s/it][2026-06-13 16:46:02,923] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:46:11,144] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

  1%|██▌                                                                                                                                                                                                                    | 6/505 [04:54<5:00:18, 36.11s/it]                                                                                                                                                                                                                                                              {'loss': '1.168', 'grad_norm': '0.7778', 'learning_rate': '2e-05', 'ppl': '3.217', 'memory/max_active (GiB)': '3.71', 'memory/max_allocated (GiB)': '3.71', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '21.42', 'tokens/total': 71178, 'tokens/trainable': 38970, 'epoch': '0.0119'}
  1%|██▌                                                                                                                                                                                                                    | 6/505 [04:54<5:00:18, 36.11s/it][2026-06-13 16:46:21,518] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:46:29,935] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:46:31,855] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:46:41,848] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:46:49,969] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

  1%|██▉                                                                                                                                                                                                                    | 7/505 [05:30<5:00:48, 36.24s/it]                                                                                                                                                                                                                                                              {'loss': '1.217', 'grad_norm': '0.9742', 'learning_rate': '2.4e-05', 'ppl': '3.375', 'memory/max_active (GiB)': '3.78', 'memory/max_allocated (GiB)': '3.78', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '3.834', 'tokens/total': 80286, 'tokens/trainable': 43570, 'epoch': '0.01388'}
  1%|██▉                                                                                                                                                                                                                    | 7/505 [05:30<5:00:48, 36.24s/it][2026-06-13 16:46:57,471] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:47:06,042] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:47:18,132] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:47:26,651] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:47:34,676] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:47:35,911] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:47:43,978] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:47:51,470] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:47:59,611] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

  2%|███▍                                                                                                                                                                                                                   | 8/505 [06:41<6:30:34, 47.15s/it]                                                                                                                                                                                                                                                              {'loss': '1.132', 'grad_norm': '1.035', 'learning_rate': '2.8e-05', 'ppl': '3.102', 'memory/max_active (GiB)': '3.95', 'memory/max_allocated (GiB)': '3.95', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '10.17', 'tokens/total': 92306, 'tokens/trainable': 50468, 'epoch': '0.01587'}
  2%|███▍                                                                                                                                                                                                                   | 8/505 [06:41<6:30:34, 47.15s/it][2026-06-13 16:48:04,141] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

  2%|███▊                                                                                                                                                                                                                   | 9/505 [06:49<4:48:11, 34.86s/it]                                                                                                                                                                                                                                                              {'loss': '1.166', 'grad_norm': '0.7779', 'learning_rate': '3.2e-05', 'ppl': '3.209', 'memory/max_active (GiB)': '3.78', 'memory/max_allocated (GiB)': '3.78', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '30.88', 'tokens/total': 103613, 'tokens/trainable': 56045, 'epoch': '0.01785'}
  2%|███▊                                                                                                                                                                                                                   | 9/505 [06:49<4:48:11, 34.86s/it][2026-06-13 16:48:13,215] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

  2%|████▏                                                                                                                                                                                                                 | 10/505 [06:56<3:38:21, 26.47s/it]                                                                                                                                                                                                                                                              {'loss': '1.167', 'grad_norm': '0.8122', 'learning_rate': '3.6e-05', 'ppl': '3.213', 'memory/max_active (GiB)': '3.67', 'memory/max_allocated (GiB)': '3.67', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '31.03', 'tokens/total': 114558, 'tokens/trainable': 61757, 'epoch': '0.01983'}
  2%|████▏                                                                                                                                                                                                                 | 10/505 [06:56<3:38:21, 26.47s/it][2026-06-13 16:48:25,265] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

  2%|████▋                                                                                                                                                                                                                 | 11/505 [07:06<2:56:16, 21.41s/it]                                                                                                                                                                                                                                                              {'loss': '1.016', 'grad_norm': '0.6042', 'learning_rate': '4e-05', 'ppl': '2.763', 'memory/max_active (GiB)': '3.91', 'memory/max_allocated (GiB)': '3.91', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '133.2', 'tokens/total': 128598, 'tokens/trainable': 70281, 'epoch': '0.02181'}
  2%|████▋                                                                                                                                                                                                                 | 11/505 [07:06<2:56:16, 21.41s/it]  2%|█████                                                                                                                                                                                                                 | 12/505 [07:14<2:21:17, 17.19s/it]                                                                                                                                                                                                                                                              {'loss': '1.023', 'grad_norm': '0.6655', 'learning_rate': '4.4e-05', 'ppl': '2.781', 'memory/max_active (GiB)': '3.86', 'memory/max_allocated (GiB)': '3.86', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '39.18', 'tokens/total': 139709, 'tokens/trainable': 76331, 'epoch': '0.0238'}
  2%|█████                                                                                                                                                                                                                 | 12/505 [07:14<2:21:17, 17.19s/it]  3%|█████▌                                                                                                                                                                                                                | 13/505 [07:21<1:56:23, 14.19s/it]                                                                                                                                                                                                                                                              {'loss': '0.9874', 'grad_norm': '0.6714', 'learning_rate': '4.8e-05', 'ppl': '2.684', 'memory/max_active (GiB)': '3.9', 'memory/max_allocated (GiB)': '3.9', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '245.9', 'tokens/total': 150309, 'tokens/trainable': 81529, 'epoch': '0.02578'}
  3%|█████▌                                                                                                                                                                                                                | 13/505 [07:21<1:56:23, 14.19s/it]  3%|█████▉                                                                                                                                                                                                                | 14/505 [07:30<1:43:29, 12.65s/it]                                                                                                                                                                                                                                                              {'loss': '1.007', 'grad_norm': '0.6898', 'learning_rate': '5.2e-05', 'ppl': '2.738', 'memory/max_active (GiB)': '4.2', 'memory/max_allocated (GiB)': '4.2', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '60.07', 'tokens/total': 163416, 'tokens/trainable': 88657, 'epoch': '0.02776'}
  3%|█████▉                                                                                                                                                                                                                | 14/505 [07:30<1:43:29, 12.65s/it][2026-06-13 16:48:50,566] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

  3%|██████▎                                                                                                                                                                                                               | 15/505 [07:38<1:30:01, 11.02s/it]                                                                                                                                                                                                                                                              {'loss': '1.159', 'grad_norm': '0.5429', 'learning_rate': '5.6e-05', 'ppl': '3.186', 'memory/max_active (GiB)': '3.86', 'memory/max_allocated (GiB)': '3.86', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '122.6', 'tokens/total': 173816, 'tokens/trainable': 94374, 'epoch': '0.02975'}
  3%|██████▎                                                                                                                                                                                                               | 15/505 [07:38<1:30:01, 11.02s/it][2026-06-13 16:49:00,171] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:49:03,962] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

  3%|██████▊                                                                                                                                                                                                               | 16/505 [07:44<1:19:36,  9.77s/it]                                                                                                                                                                                                                                                              {'loss': '1.024', 'grad_norm': '0.6696', 'learning_rate': '6e-05', 'ppl': '2.783', 'memory/max_active (GiB)': '3.59', 'memory/max_allocated (GiB)': '3.59', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '50.65', 'tokens/total': 182301, 'tokens/trainable': 98010, 'epoch': '0.03173'}
  3%|██████▊                                                                                                                                                                                                               | 16/505 [07:44<1:19:36,  9.77s/it][2026-06-13 16:49:06,221] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:49:07,739] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

  3%|███████▏                                                                                                                                                                                                              | 17/505 [07:54<1:19:41,  9.80s/it]                                                                                                                                                                                                                                                              {'loss': '0.9534', 'grad_norm': '0.4745', 'learning_rate': '6.4e-05', 'ppl': '2.595', 'memory/max_active (GiB)': '4.27', 'memory/max_allocated (GiB)': '4.27', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '33.85', 'tokens/total': 196255, 'tokens/trainable': 105956, 'epoch': '0.03371'}
  3%|███████▏                                                                                                                                                                                                              | 17/505 [07:54<1:19:41,  9.80s/it]  4%|███████▋                                                                                                                                                                                                              | 18/505 [08:03<1:17:52,  9.59s/it]                                                                                                                                                                                                                                                              {'loss': '0.9406', 'grad_norm': '0.5054', 'learning_rate': '6.8e-05', 'ppl': '2.562', 'memory/max_active (GiB)': '4.03', 'memory/max_allocated (GiB)': '4.03', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '96.42', 'tokens/total': 209181, 'tokens/trainable': 113432, 'epoch': '0.0357'}
  4%|███████▋                                                                                                                                                                                                              | 18/505 [08:03<1:17:52,  9.59s/it][2026-06-13 16:49:25,527] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

  4%|████████                                                                                                                                                                                                              | 19/505 [08:14<1:19:59,  9.88s/it]                                                                                                                                                                                                                                                              {'loss': '0.8627', 'grad_norm': '0.4321', 'learning_rate': '7.2e-05', 'ppl': '2.37', 'memory/max_active (GiB)': '3.91', 'memory/max_allocated (GiB)': '3.91', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '46.82', 'tokens/total': 224175, 'tokens/trainable': 121950, 'epoch': '0.03768'}
  4%|████████                                                                                                                                                                                                              | 19/505 [08:14<1:19:59,  9.88s/it][2026-06-13 16:49:38,766] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

  4%|████████▍                                                                                                                                                                                                             | 20/505 [08:21<1:12:03,  8.91s/it]                                                                                                                                                                                                                                                              {'loss': '0.9855', 'grad_norm': '0.9144', 'learning_rate': '7.6e-05', 'ppl': '2.679', 'memory/max_active (GiB)': '3.84', 'memory/max_allocated (GiB)': '3.84', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '80.51', 'tokens/total': 233523, 'tokens/trainable': 125941, 'epoch': '0.03966'}
  4%|████████▍                                                                                                                                                                                                             | 20/505 [08:21<1:12:03,  8.91s/it][2026-06-13 16:49:42,173] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

  4%|████████▉                                                                                                                                                                                                             | 21/505 [08:29<1:09:40,  8.64s/it]                                                                                                                                                                                                                                                              {'loss': '0.9643', 'grad_norm': '0.5501', 'learning_rate': '8e-05', 'ppl': '2.623', 'memory/max_active (GiB)': '4.2', 'memory/max_allocated (GiB)': '4.2', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '293.5', 'tokens/total': 244927, 'tokens/trainable': 131686, 'epoch': '0.04165'}
  4%|████████▉                                                                                                                                                                                                             | 21/505 [08:29<1:09:40,  8.64s/it]  4%|█████████▎                                                                                                                                                                                                            | 22/505 [08:38<1:12:01,  8.95s/it]                                                                                                                                                                                                                                                              {'loss': '0.8578', 'grad_norm': '0.5002', 'learning_rate': '8.4e-05', 'ppl': '2.358', 'memory/max_active (GiB)': '3.92', 'memory/max_allocated (GiB)': '3.92', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '121.3', 'tokens/total': 258830, 'tokens/trainable': 139931, 'epoch': '0.04363'}
  4%|█████████▎                                                                                                                                                                                                            | 22/505 [08:38<1:12:01,  8.95s/it][2026-06-13 16:49:58,682] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

  5%|█████████▋                                                                                                                                                                                                            | 23/505 [08:47<1:11:31,  8.90s/it]                                                                                                                                                                                                                                                              {'loss': '0.921', 'grad_norm': '0.523', 'learning_rate': '8.8e-05', 'ppl': '2.512', 'memory/max_active (GiB)': '3.87', 'memory/max_allocated (GiB)': '3.87', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '73.74', 'tokens/total': 271497, 'tokens/trainable': 147268, 'epoch': '0.04561'}
  5%|█████████▋                                                                                                                                                                                                            | 23/505 [08:47<1:11:31,  8.90s/it]  5%|██████████▏                                                                                                                                                                                                           | 24/505 [08:53<1:03:30,  7.92s/it]                                                                                                                                                                                                                                                              {'loss': '0.9368', 'grad_norm': '0.6294', 'learning_rate': '9.2e-05', 'ppl': '2.552', 'memory/max_active (GiB)': '3.53', 'memory/max_allocated (GiB)': '3.53', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '99.16', 'tokens/total': 279730, 'tokens/trainable': 150707, 'epoch': '0.0476'}
  5%|██████████▏                                                                                                                                                                                                           | 24/505 [08:53<1:03:30,  7.92s/it]  5%|██████████▌                                                                                                                                                                                                           | 25/505 [09:01<1:04:36,  8.08s/it]                                                                                                                                                                                                                                                              {'loss': '0.9916', 'grad_norm': '0.6435', 'learning_rate': '9.6e-05', 'ppl': '2.695', 'memory/max_active (GiB)': '3.91', 'memory/max_allocated (GiB)': '3.91', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '37.68', 'tokens/total': 291992, 'tokens/trainable': 157606, 'epoch': '0.04958'}
  5%|██████████▌                                                                                                                                                                                                           | 25/505 [09:01<1:04:36,  8.08s/it]  5%|███████████                                                                                                                                                                                                           | 26/505 [09:10<1:05:16,  8.18s/it]                                                                                                                                                                                                                                                              {'loss': '0.8316', 'grad_norm': '0.6991', 'learning_rate': '0.0001', 'ppl': '2.297', 'memory/max_active (GiB)': '4.08', 'memory/max_allocated (GiB)': '4.08', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '26.05', 'tokens/total': 304164, 'tokens/trainable': 164041, 'epoch': '0.05156'}
  5%|███████████                                                                                                                                                                                                           | 26/505 [09:10<1:05:16,  8.18s/it][2026-06-13 16:50:33,928] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

  5%|███████████▍                                                                                                                                                                                                          | 27/505 [09:17<1:04:03,  8.04s/it]                                                                                                                                                                                                                                                              {'loss': '0.8365', 'grad_norm': '0.615', 'learning_rate': '0.000104', 'ppl': '2.308', 'memory/max_active (GiB)': '3.75', 'memory/max_allocated (GiB)': '3.75', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '75.13', 'tokens/total': 315109, 'tokens/trainable': 169819, 'epoch': '0.05354'}
  5%|███████████▍                                                                                                                                                                                                          | 27/505 [09:17<1:04:03,  8.04s/it]  6%|███████████▉                                                                                                                                                                                                            | 28/505 [09:24<59:34,  7.49s/it]                                                                                                                                                                                                                                                              {'loss': '0.9359', 'grad_norm': '0.7526', 'learning_rate': '0.000108', 'ppl': '2.549', 'memory/max_active (GiB)': '3.67', 'memory/max_allocated (GiB)': '3.67', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '39.22', 'tokens/total': 324032, 'tokens/trainable': 173860, 'epoch': '0.05553'}
  6%|███████████▉                                                                                                                                                                                                            | 28/505 [09:24<59:34,  7.49s/it]  6%|████████████▍                                                                                                                                                                                                           | 29/505 [09:31<58:26,  7.37s/it]                                                                                                                                                                                                                                                              {'loss': '0.8777', 'grad_norm': '0.6869', 'learning_rate': '0.000112', 'ppl': '2.405', 'memory/max_active (GiB)': '3.81', 'memory/max_allocated (GiB)': '3.81', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '230.6', 'tokens/total': 334098, 'tokens/trainable': 179003, 'epoch': '0.05751'}
  6%|████████████▍                                                                                                                                                                                                           | 29/505 [09:31<58:26,  7.37s/it]  6%|████████████▊                                                                                                                                                                                                           | 30/505 [09:38<59:29,  7.51s/it]                                                                                                                                                                                                                                                              {'loss': '0.9539', 'grad_norm': '0.5351', 'learning_rate': '0.000116', 'ppl': '2.596', 'memory/max_active (GiB)': '3.77', 'memory/max_allocated (GiB)': '3.77', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '79.19', 'tokens/total': 345604, 'tokens/trainable': 185422, 'epoch': '0.05949'}
  6%|████████████▊                                                                                                                                                                                                           | 30/505 [09:38<59:29,  7.51s/it][2026-06-13 16:51:00,736] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

  6%|█████████████▎                                                                                                                                                                                                          | 31/505 [09:46<58:51,  7.45s/it]                                                                                                                                                                                                                                                              {'loss': '0.9037', 'grad_norm': '0.6453', 'learning_rate': '0.00012', 'ppl': '2.469', 'memory/max_active (GiB)': '3.87', 'memory/max_allocated (GiB)': '3.87', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '49.73', 'tokens/total': 356104, 'tokens/trainable': 190821, 'epoch': '0.06148'}
  6%|█████████████▎                                                                                                                                                                                                          | 31/505 [09:46<58:51,  7.45s/it][2026-06-13 16:51:12,187] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

  6%|█████████████▋                                                                                                                                                                                                          | 32/505 [09:53<58:00,  7.36s/it]                                                                                                                                                                                                                                                              {'loss': '0.9302', 'grad_norm': '0.6308', 'learning_rate': '0.000124', 'ppl': '2.535', 'memory/max_active (GiB)': '3.9', 'memory/max_allocated (GiB)': '3.9', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '102.3', 'tokens/total': 366470, 'tokens/trainable': 196211, 'epoch': '0.06346'}
  6%|█████████████▋                                                                                                                                                                                                          | 32/505 [09:53<58:00,  7.36s/it][2026-06-13 16:51:20,459] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

  7%|█████████████▉                                                                                                                                                                                                        | 33/505 [10:03<1:03:33,  8.08s/it]                                                                                                                                                                                                                                                              {'loss': '0.771', 'grad_norm': '0.5821', 'learning_rate': '0.000128', 'ppl': '2.162', 'memory/max_active (GiB)': '3.97', 'memory/max_allocated (GiB)': '3.97', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '109.3', 'tokens/total': 380383, 'tokens/trainable': 203425, 'epoch': '0.06544'}
  7%|█████████████▉                                                                                                                                                                                                        | 33/505 [10:03<1:03:33,  8.08s/it]  7%|██████████████▍                                                                                                                                                                                                       | 34/505 [10:10<1:01:12,  7.80s/it]                                                                                                                                                                                                                                                              {'loss': '0.8892', 'grad_norm': '0.8138', 'learning_rate': '0.000132', 'ppl': '2.433', 'memory/max_active (GiB)': '3.97', 'memory/max_allocated (GiB)': '3.97', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '23.11', 'tokens/total': 390682, 'tokens/trainable': 208430, 'epoch': '0.06743'}
  7%|██████████████▍                                                                                                                                                                                                       | 34/505 [10:10<1:01:12,  7.80s/it]  7%|██████████████▊                                                                                                                                                                                                       | 35/505 [10:20<1:06:15,  8.46s/it]                                                                                                                                                                                                                                                              {'loss': '0.7963', 'grad_norm': '0.6062', 'learning_rate': '0.000136', 'ppl': '2.217', 'memory/max_active (GiB)': '4.04', 'memory/max_allocated (GiB)': '4.04', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '37.72', 'tokens/total': 405065, 'tokens/trainable': 217023, 'epoch': '0.06941'}
  7%|██████████████▊                                                                                                                                                                                                       | 35/505 [10:20<1:06:15,  8.46s/it]  7%|███████████████▎                                                                                                                                                                                                      | 36/505 [10:28<1:04:22,  8.23s/it]                                                                                                                                                                                                                                                              {'loss': '0.8051', 'grad_norm': '0.6768', 'learning_rate': '0.00014', 'ppl': '2.237', 'memory/max_active (GiB)': '3.74', 'memory/max_allocated (GiB)': '3.74', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '171.7', 'tokens/total': 416224, 'tokens/trainable': 222834, 'epoch': '0.07139'}
  7%|███████████████▎                                                                                                                                                                                                      | 36/505 [10:28<1:04:22,  8.23s/it]  7%|███████████████▋                                                                                                                                                                                                      | 37/505 [10:36<1:05:13,  8.36s/it]                                                                                                                                                                                                                                                              {'loss': '0.8231', 'grad_norm': '0.5466', 'learning_rate': '0.000144', 'ppl': '2.278', 'memory/max_active (GiB)': '4.03', 'memory/max_allocated (GiB)': '4.03', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '277.4', 'tokens/total': 428823, 'tokens/trainable': 229964, 'epoch': '0.07338'}
  7%|███████████████▋                                                                                                                                                                                                      | 37/505 [10:36<1:05:13,  8.36s/it]  8%|████████████████                                                                                                                                                                                                      | 38/505 [10:45<1:05:48,  8.45s/it]                                                                                                                                                                                                                                                              {'loss': '0.9663', 'grad_norm': '0.6203', 'learning_rate': '0.000148', 'ppl': '2.628', 'memory/max_active (GiB)': '3.8', 'memory/max_allocated (GiB)': '3.8', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '79.93', 'tokens/total': 441254, 'tokens/trainable': 236937, 'epoch': '0.07536'}
  8%|████████████████                                                                                                                                                                                                      | 38/505 [10:45<1:05:48,  8.45s/it]  8%|████████████████▌                                                                                                                                                                                                     | 39/505 [10:54<1:07:50,  8.74s/it]                                                                                                                                                                                                                                                              {'loss': '0.8831', 'grad_norm': '0.6198', 'learning_rate': '0.000152', 'ppl': '2.418', 'memory/max_active (GiB)': '4.1', 'memory/max_allocated (GiB)': '4.1', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '34.07', 'tokens/total': 454818, 'tokens/trainable': 244704, 'epoch': '0.07734'}
  8%|████████████████▌                                                                                                                                                                                                     | 39/505 [10:54<1:07:50,  8.74s/it][2026-06-13 16:52:17,797] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

  8%|████████████████▉                                                                                                                                                                                                     | 40/505 [11:04<1:10:35,  9.11s/it]                                                                                                                                                                                                                                                              {'loss': '0.8995', 'grad_norm': '0.671', 'learning_rate': '0.000156', 'ppl': '2.458', 'memory/max_active (GiB)': '4.27', 'memory/max_allocated (GiB)': '4.27', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '26.87', 'tokens/total': 469112, 'tokens/trainable': 253370, 'epoch': '0.07933'}
  8%|████████████████▉                                                                                                                                                                                                     | 40/505 [11:04<1:10:35,  9.11s/it]  8%|█████████████████▎                                                                                                                                                                                                    | 41/505 [11:13<1:10:40,  9.14s/it]                                                                                                                                                                                                                                                              {'loss': '0.9886', 'grad_norm': '0.7411', 'learning_rate': '0.00016', 'ppl': '2.687', 'memory/max_active (GiB)': '3.99', 'memory/max_allocated (GiB)': '3.99', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '53.2', 'tokens/total': 482616, 'tokens/trainable': 260358, 'epoch': '0.08131'}
  8%|█████████████████▎                                                                                                                                                                                                    | 41/505 [11:13<1:10:40,  9.14s/it]  8%|█████████████████▊                                                                                                                                                                                                    | 42/505 [11:24<1:12:53,  9.45s/it]                                                                                                                                                                                                                                                              {'loss': '0.8109', 'grad_norm': '0.4812', 'learning_rate': '0.000164', 'ppl': '2.25', 'memory/max_active (GiB)': '3.78', 'memory/max_allocated (GiB)': '3.78', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '108.1', 'tokens/total': 497175, 'tokens/trainable': 269627, 'epoch': '0.08329'}
  8%|█████████████████▊                                                                                                                                                                                                    | 42/505 [11:24<1:12:53,  9.45s/it]  9%|██████████████████▏                                                                                                                                                                                                   | 43/505 [11:30<1:05:18,  8.48s/it]                                                                                                                                                                                                                                                              {'loss': '0.7738', 'grad_norm': '0.6842', 'learning_rate': '0.000168', 'ppl': '2.168', 'memory/max_active (GiB)': '3.63', 'memory/max_allocated (GiB)': '3.63', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '119.7', 'tokens/total': 506202, 'tokens/trainable': 273421, 'epoch': '0.08528'}
  9%|██████████████████▏                                                                                                                                                                                                   | 43/505 [11:30<1:05:18,  8.48s/it]  9%|██████████████████▋                                                                                                                                                                                                   | 44/505 [11:38<1:04:47,  8.43s/it]                                                                                                                                                                                                                                                              {'loss': '0.9146', 'grad_norm': '0.5589', 'learning_rate': '0.000172', 'ppl': '2.496', 'memory/max_active (GiB)': '3.86', 'memory/max_allocated (GiB)': '3.86', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '111.3', 'tokens/total': 518204, 'tokens/trainable': 279983, 'epoch': '0.08726'}
  9%|██████████████████▋                                                                                                                                                                                                   | 44/505 [11:38<1:04:47,  8.43s/it][2026-06-13 16:52:59,812] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

  9%|███████████████████                                                                                                                                                                                                   | 45/505 [11:45<1:00:39,  7.91s/it]                                                                                                                                                                                                                                                              {'loss': '0.9848', 'grad_norm': '0.9063', 'learning_rate': '0.000176', 'ppl': '2.677', 'memory/max_active (GiB)': '3.79', 'memory/max_allocated (GiB)': '3.79', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '68.56', 'tokens/total': 527578, 'tokens/trainable': 284249, 'epoch': '0.08924'}
  9%|███████████████████                                                                                                                                                                                                   | 45/505 [11:45<1:00:39,  7.91s/it]  9%|███████████████████▋                                                                                                                                                                                                    | 46/505 [11:52<59:21,  7.76s/it]                                                                                                                                                                                                                                                              {'loss': '0.7161', 'grad_norm': '0.6454', 'learning_rate': '0.00018', 'ppl': '2.046', 'memory/max_active (GiB)': '3.94', 'memory/max_allocated (GiB)': '3.94', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '225', 'tokens/total': 538363, 'tokens/trainable': 289558, 'epoch': '0.09122'}
  9%|███████████████████▋                                                                                                                                                                                                    | 46/505 [11:52<59:21,  7.76s/it]  9%|███████████████████▉                                                                                                                                                                                                  | 47/505 [12:02<1:02:46,  8.22s/it]                                                                                                                                                                                                                                                              {'loss': '0.8329', 'grad_norm': '0.5508', 'learning_rate': '0.000184', 'ppl': '2.3', 'memory/max_active (GiB)': '3.78', 'memory/max_allocated (GiB)': '3.78', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '79.82', 'tokens/total': 551285, 'tokens/trainable': 297089, 'epoch': '0.09321'}
  9%|███████████████████▉                                                                                                                                                                                                  | 47/505 [12:02<1:02:46,  8.22s/it] 10%|████████████████████▎                                                                                                                                                                                                 | 48/505 [12:09<1:01:02,  8.01s/it]                                                                                                                                                                                                                                                              {'loss': '0.8548', 'grad_norm': '0.5497', 'learning_rate': '0.000188', 'ppl': '2.351', 'memory/max_active (GiB)': '3.7', 'memory/max_allocated (GiB)': '3.7', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '132', 'tokens/total': 562120, 'tokens/trainable': 303063, 'epoch': '0.09519'}
 10%|████████████████████▎                                                                                                                                                                                                 | 48/505 [12:09<1:01:02,  8.01s/it] 10%|████████████████████▊                                                                                                                                                                                                 | 49/505 [12:18<1:02:31,  8.23s/it]                                                                                                                                                                                                                                                              {'loss': '0.9206', 'grad_norm': '0.6618', 'learning_rate': '0.000192', 'ppl': '2.511', 'memory/max_active (GiB)': '3.97', 'memory/max_allocated (GiB)': '3.97', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '86.1', 'tokens/total': 574447, 'tokens/trainable': 309475, 'epoch': '0.09717'}
 10%|████████████████████▊                                                                                                                                                                                                 | 49/505 [12:18<1:02:31,  8.23s/it] 10%|█████████████████████▏                                                                                                                                                                                                | 50/505 [12:26<1:01:31,  8.11s/it]                                                                                                                                                                                                                                                              {'loss': '0.8821', 'grad_norm': '0.6108', 'learning_rate': '0.000196', 'ppl': '2.416', 'memory/max_active (GiB)': '3.73', 'memory/max_allocated (GiB)': '3.73', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '63.98', 'tokens/total': 585647, 'tokens/trainable': 315293, 'epoch': '0.09916'}
 10%|█████████████████████▏                                                                                                                                                                                                | 50/505 [12:26<1:01:31,  8.11s/it] 10%|█████████████████████▊                                                                                                                                                                                                  | 51/505 [12:33<59:16,  7.83s/it]                                                                                                                                                                                                                                                              {'loss': '0.8134', 'grad_norm': '0.8396', 'learning_rate': '0.0002', 'ppl': '2.256', 'memory/max_active (GiB)': '4.19', 'memory/max_allocated (GiB)': '4.19', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '49.29', 'tokens/total': 596125, 'tokens/trainable': 320715, 'epoch': '0.1011'}
 10%|█████████████████████▊                                                                                                                                                                                                  | 51/505 [12:33<59:16,  7.83s/it] 10%|██████████████████████▏                                                                                                                                                                                                 | 52/505 [12:39<55:08,  7.30s/it]                                                                                                                                                                                                                                                              {'loss': '0.9546', 'grad_norm': '0.8993', 'learning_rate': '0.0002', 'ppl': '2.598', 'memory/max_active (GiB)': '3.64', 'memory/max_allocated (GiB)': '3.64', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '44.69', 'tokens/total': 604850, 'tokens/trainable': 324441, 'epoch': '0.1031'}
 10%|██████████████████████▏                                                                                                                                                                                                 | 52/505 [12:39<55:08,  7.30s/it] 10%|██████████████████████▋                                                                                                                                                                                                 | 53/505 [12:47<57:45,  7.67s/it]                                                                                                                                                                                                                                                              {'loss': '0.8625', 'grad_norm': '0.5124', 'learning_rate': '0.0002', 'ppl': '2.369', 'memory/max_active (GiB)': '3.73', 'memory/max_allocated (GiB)': '3.73', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '159.7', 'tokens/total': 617163, 'tokens/trainable': 331478, 'epoch': '0.1051'}
 10%|██████████████████████▋                                                                                                                                                                                                 | 53/505 [12:47<57:45,  7.67s/it] 11%|██████████████████████▉                                                                                                                                                                                               | 54/505 [12:59<1:06:07,  8.80s/it]                                                                                                                                                                                                                                                              {'loss': '0.8292', 'grad_norm': '0.5921', 'learning_rate': '0.0002', 'ppl': '2.291', 'memory/max_active (GiB)': '4.19', 'memory/max_allocated (GiB)': '4.19', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '161.7', 'tokens/total': 633758, 'tokens/trainable': 341952, 'epoch': '0.1071'}
 11%|██████████████████████▉                                                                                                                                                                                               | 54/505 [12:59<1:06:07,  8.80s/it] 11%|███████████████████████▎                                                                                                                                                                                              | 55/505 [13:09<1:08:48,  9.18s/it]                                                                                                                                                                                                                                                              {'loss': '0.9922', 'grad_norm': '0.7415', 'learning_rate': '0.0002', 'ppl': '2.697', 'memory/max_active (GiB)': '4.02', 'memory/max_allocated (GiB)': '4.02', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '18.09', 'tokens/total': 648302, 'tokens/trainable': 350504, 'epoch': '0.1091'}
 11%|███████████████████████▎                                                                                                                                                                                              | 55/505 [13:09<1:08:48,  9.18s/it] 11%|███████████████████████▋                                                                                                                                                                                              | 56/505 [13:16<1:03:36,  8.50s/it]                                                                                                                                                                                                                                                              {'loss': '0.9554', 'grad_norm': '0.7799', 'learning_rate': '0.0001999', 'ppl': '2.6', 'memory/max_active (GiB)': '3.97', 'memory/max_allocated (GiB)': '3.97', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '299.6', 'tokens/total': 658404, 'tokens/trainable': 355817, 'epoch': '0.1111'}
 11%|███████████████████████▋                                                                                                                                                                                              | 56/505 [13:16<1:03:36,  8.50s/it] 11%|████████████████████████▏                                                                                                                                                                                             | 57/505 [13:24<1:01:45,  8.27s/it]                                                                                                                                                                                                                                                              {'loss': '0.8526', 'grad_norm': '0.5605', 'learning_rate': '0.0001999', 'ppl': '2.346', 'memory/max_active (GiB)': '3.7', 'memory/max_allocated (GiB)': '3.7', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '116', 'tokens/total': 669772, 'tokens/trainable': 362158, 'epoch': '0.113'}
 11%|████████████████████████▏                                                                                                                                                                                             | 57/505 [13:24<1:01:45,  8.27s/it] 11%|████████████████████████▌                                                                                                                                                                                             | 58/505 [13:33<1:04:06,  8.61s/it]                                                                                                                                                                                                                                                              {'loss': '0.8577', 'grad_norm': '0.6188', 'learning_rate': '0.0001999', 'ppl': '2.358', 'memory/max_active (GiB)': '4.01', 'memory/max_allocated (GiB)': '4.01', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '56.67', 'tokens/total': 683300, 'tokens/trainable': 370105, 'epoch': '0.115'}
 11%|████████████████████████▌                                                                                                                                                                                             | 58/505 [13:33<1:04:06,  8.61s/it] 12%|█████████████████████████▏                                                                                                                                                                                              | 59/505 [13:39<59:10,  7.96s/it]                                                                                                                                                                                                                                                              {'loss': '0.7574', 'grad_norm': '0.6694', 'learning_rate': '0.0001998', 'ppl': '2.133', 'memory/max_active (GiB)': '3.64', 'memory/max_allocated (GiB)': '3.64', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '159.1', 'tokens/total': 692743, 'tokens/trainable': 374390, 'epoch': '0.117'}
 12%|█████████████████████████▏                                                                                                                                                                                              | 59/505 [13:39<59:10,  7.96s/it] 12%|█████████████████████████▋                                                                                                                                                                                              | 60/505 [13:46<55:47,  7.52s/it]                                                                                                                                                                                                                                                              {'loss': '0.8239', 'grad_norm': '0.7939', 'learning_rate': '0.0001998', 'ppl': '2.279', 'memory/max_active (GiB)': '3.79', 'memory/max_allocated (GiB)': '3.79', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '28.32', 'tokens/total': 702106, 'tokens/trainable': 378979, 'epoch': '0.119'}
 12%|█████████████████████████▋                                                                                                                                                                                              | 60/505 [13:46<55:47,  7.52s/it] 12%|██████████████████████████                                                                                                                                                                                              | 61/505 [13:55<58:47,  7.95s/it]                                                                                                                                                                                                                                                              {'loss': '0.7095', 'grad_norm': '0.6583', 'learning_rate': '0.0001998', 'ppl': '2.033', 'memory/max_active (GiB)': '4.09', 'memory/max_allocated (GiB)': '4.09', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '51.51', 'tokens/total': 715114, 'tokens/trainable': 386283, 'epoch': '0.121'}
 12%|██████████████████████████                                                                                                                                                                                              | 61/505 [13:55<58:47,  7.95s/it] 12%|██████████████████████████▌                                                                                                                                                                                             | 62/505 [14:02<57:51,  7.84s/it]                                                                                                                                                                                                                                                              {'loss': '0.8434', 'grad_norm': '0.6326', 'learning_rate': '0.0001997', 'ppl': '2.324', 'memory/max_active (GiB)': '3.83', 'memory/max_allocated (GiB)': '3.83', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '75.48', 'tokens/total': 726053, 'tokens/trainable': 392203, 'epoch': '0.123'}
 12%|██████████████████████████▌                                                                                                                                                                                             | 62/505 [14:02<57:51,  7.84s/it] 12%|██████████████████████████▉                                                                                                                                                                                             | 63/505 [14:10<56:21,  7.65s/it]                                                                                                                                                                                                                                                              {'loss': '0.8395', 'grad_norm': '0.6776', 'learning_rate': '0.0001997', 'ppl': '2.315', 'memory/max_active (GiB)': '3.71', 'memory/max_allocated (GiB)': '3.71', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '40.6', 'tokens/total': 736624, 'tokens/trainable': 397300, 'epoch': '0.1249'}
 12%|██████████████████████████▉                                                                                                                                                                                             | 63/505 [14:10<56:21,  7.65s/it] 13%|███████████████████████████▎                                                                                                                                                                                            | 64/505 [14:17<56:40,  7.71s/it]                                                                                                                                                                                                                                                              {'loss': '0.7866', 'grad_norm': '0.7866', 'learning_rate': '0.0001996', 'ppl': '2.196', 'memory/max_active (GiB)': '3.96', 'memory/max_allocated (GiB)': '3.96', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '27.65', 'tokens/total': 747858, 'tokens/trainable': 403243, 'epoch': '0.1269'}
 13%|███████████████████████████▎                                                                                                                                                                                            | 64/505 [14:17<56:40,  7.71s/it] 13%|███████████████████████████▌                                                                                                                                                                                          | 65/505 [14:27<1:00:44,  8.28s/it]                                                                                                                                                                                                                                                              {'loss': '0.7865', 'grad_norm': '0.567', 'learning_rate': '0.0001995', 'ppl': '2.196', 'memory/max_active (GiB)': '4.03', 'memory/max_allocated (GiB)': '4.03', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '48.96', 'tokens/total': 761792, 'tokens/trainable': 411196, 'epoch': '0.1289'}
 13%|███████████████████████████▌                                                                                                                                                                                          | 65/505 [14:27<1:00:44,  8.28s/it] 13%|████████████████████████████▏                                                                                                                                                                                           | 66/505 [14:33<54:34,  7.46s/it]                                                                                                                                                                                                                                                              {'loss': '0.8198', 'grad_norm': '0.8665', 'learning_rate': '0.0001995', 'ppl': '2.27', 'memory/max_active (GiB)': '3.73', 'memory/max_allocated (GiB)': '3.73', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '221', 'tokens/total': 769655, 'tokens/trainable': 414487, 'epoch': '0.1309'}
 13%|████████████████████████████▏                                                                                                                                                                                           | 66/505 [14:33<54:34,  7.46s/it] 13%|████████████████████████████▋                                                                                                                                                                                           | 67/505 [14:41<55:45,  7.64s/it]                                                                                                                                                                                                                                                              {'loss': '0.7974', 'grad_norm': '0.6195', 'learning_rate': '0.0001994', 'ppl': '2.22', 'memory/max_active (GiB)': '4.07', 'memory/max_allocated (GiB)': '4.07', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '69.4', 'tokens/total': 781377, 'tokens/trainable': 421036, 'epoch': '0.1329'}
 13%|████████████████████████████▋                                                                                                                                                                                           | 67/505 [14:41<55:45,  7.64s/it] 13%|████████████████████████████▊                                                                                                                                                                                         | 68/505 [14:51<1:01:47,  8.48s/it]                                                                                                                                                                                                                                                              {'loss': '0.8424', 'grad_norm': '0.6392', 'learning_rate': '0.0001993', 'ppl': '2.322', 'memory/max_active (GiB)': '4.19', 'memory/max_allocated (GiB)': '4.19', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '33.86', 'tokens/total': 796541, 'tokens/trainable': 429552, 'epoch': '0.1349'}
 13%|████████████████████████████▊                                                                                                                                                                                         | 68/505 [14:51<1:01:47,  8.48s/it] 14%|█████████████████████████████▌                                                                                                                                                                                          | 69/505 [14:57<56:13,  7.74s/it]                                                                                                                                                                                                                                                              {'loss': '0.8034', 'grad_norm': '0.7682', 'learning_rate': '0.0001992', 'ppl': '2.233', 'memory/max_active (GiB)': '3.63', 'memory/max_allocated (GiB)': '3.63', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '22.34', 'tokens/total': 805201, 'tokens/trainable': 433912, 'epoch': '0.1368'}
 14%|█████████████████████████████▌                                                                                                                                                                                          | 69/505 [14:57<56:13,  7.74s/it] 14%|█████████████████████████████▉                                                                                                                                                                                          | 70/505 [15:03<51:22,  7.09s/it]                                                                                                                                                                                                                                                              {'loss': '0.8', 'grad_norm': '1.054', 'learning_rate': '0.0001991', 'ppl': '2.226', 'memory/max_active (GiB)': '3.63', 'memory/max_allocated (GiB)': '3.63', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '84.26', 'tokens/total': 813213, 'tokens/trainable': 437169, 'epoch': '0.1388'}
 14%|█████████████████████████████▉                                                                                                                                                                                          | 70/505 [15:03<51:22,  7.09s/it][2026-06-13 16:56:22,904] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

 14%|██████████████████████████████▎                                                                                                                                                                                         | 71/505 [15:10<50:40,  7.01s/it]                                                                                                                                                                                                                                                              {'loss': '0.7397', 'grad_norm': '0.7529', 'learning_rate': '0.000199', 'ppl': '2.095', 'memory/max_active (GiB)': '3.73', 'memory/max_allocated (GiB)': '3.73', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '22.89', 'tokens/total': 822774, 'tokens/trainable': 441754, 'epoch': '0.1408'}
 14%|██████████████████████████████▎                                                                                                                                                                                         | 71/505 [15:10<50:40,  7.01s/it] 14%|██████████████████████████████▊                                                                                                                                                                                         | 72/505 [15:19<55:07,  7.64s/it]                                                                                                                                                                                                                                                              {'loss': '0.881', 'grad_norm': '0.7069', 'learning_rate': '0.000199', 'ppl': '2.413', 'memory/max_active (GiB)': '4.01', 'memory/max_allocated (GiB)': '4.01', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '13.82', 'tokens/total': 836049, 'tokens/trainable': 448472, 'epoch': '0.1428'}
 14%|██████████████████████████████▊                                                                                                                                                                                         | 72/505 [15:19<55:07,  7.64s/it] 14%|███████████████████████████████▏                                                                                                                                                                                        | 73/505 [15:26<54:13,  7.53s/it]                                                                                                                                                                                                                                                              {'loss': '0.7591', 'grad_norm': '0.8976', 'learning_rate': '0.0001988', 'ppl': '2.136', 'memory/max_active (GiB)': '3.77', 'memory/max_allocated (GiB)': '3.77', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '21.86', 'tokens/total': 846523, 'tokens/trainable': 453389, 'epoch': '0.1448'}
 14%|███████████████████████████████▏                                                                                                                                                                                        | 73/505 [15:26<54:13,  7.53s/it] 15%|███████████████████████████████▋                                                                                                                                                                                        | 74/505 [15:34<55:25,  7.72s/it]                                                                                                                                                                                                                                                              {'loss': '0.8168', 'grad_norm': '0.7192', 'learning_rate': '0.0001987', 'ppl': '2.263', 'memory/max_active (GiB)': '4.05', 'memory/max_allocated (GiB)': '4.05', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '37.93', 'tokens/total': 858465, 'tokens/trainable': 459689, 'epoch': '0.1468'}
 15%|███████████████████████████████▋                                                                                                                                                                                        | 74/505 [15:34<55:25,  7.72s/it] 15%|████████████████████████████████                                                                                                                                                                                        | 75/505 [15:43<58:34,  8.17s/it]                                                                                                                                                                                                                                                              {'loss': '0.8076', 'grad_norm': '0.6757', 'learning_rate': '0.0001986', 'ppl': '2.243', 'memory/max_active (GiB)': '3.85', 'memory/max_allocated (GiB)': '3.85', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '196.3', 'tokens/total': 871779, 'tokens/trainable': 467654, 'epoch': '0.1487'}
 15%|████████████████████████████████                                                                                                                                                                                        | 75/505 [15:43<58:34,  8.17s/it] 15%|████████████████████████████████▌                                                                                                                                                                                       | 76/505 [15:51<57:30,  8.04s/it]                                                                                                                                                                                                                                                              {'loss': '0.9019', 'grad_norm': '0.6738', 'learning_rate': '0.0001985', 'ppl': '2.464', 'memory/max_active (GiB)': '3.85', 'memory/max_allocated (GiB)': '3.85', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '52.84', 'tokens/total': 883133, 'tokens/trainable': 473731, 'epoch': '0.1507'}
 15%|████████████████████████████████▌                                                                                                                                                                                       | 76/505 [15:51<57:30,  8.04s/it] 15%|████████████████████████████████▉                                                                                                                                                                                       | 77/505 [16:00<59:00,  8.27s/it]                                                                                                                                                                                                                                                              {'loss': '0.8513', 'grad_norm': '0.5526', 'learning_rate': '0.0001984', 'ppl': '2.343', 'memory/max_active (GiB)': '3.84', 'memory/max_allocated (GiB)': '3.84', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '66.1', 'tokens/total': 896042, 'tokens/trainable': 480972, 'epoch': '0.1527'}
 15%|████████████████████████████████▉                                                                                                                                                                                       | 77/505 [16:00<59:00,  8.27s/it][2026-06-13 16:57:23,087] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

 15%|█████████████████████████████████▎                                                                                                                                                                                      | 78/505 [16:07<55:54,  7.86s/it]                                                                                                                                                                                                                                                              {'loss': '0.9067', 'grad_norm': '0.762', 'learning_rate': '0.0001983', 'ppl': '2.476', 'memory/max_active (GiB)': '3.62', 'memory/max_allocated (GiB)': '3.62', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '83.21', 'tokens/total': 905874, 'tokens/trainable': 485599, 'epoch': '0.1547'}
 15%|█████████████████████████████████▎                                                                                                                                                                                      | 78/505 [16:07<55:54,  7.86s/it][2026-06-13 16:57:33,873] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

[2026-06-13 16:57:42,123] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

 16%|█████████████████████████████████▍                                                                                                                                                                                    | 79/505 [16:29<1:26:16, 12.15s/it]                                                                                                                                                                                                                                                              {'loss': '0.9388', 'grad_norm': '0.9771', 'learning_rate': '0.0001981', 'ppl': '2.557', 'memory/max_active (GiB)': '3.84', 'memory/max_allocated (GiB)': '3.84', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '12.95', 'tokens/total': 915771, 'tokens/trainable': 490337, 'epoch': '0.1567'}
 16%|█████████████████████████████████▍                                                                                                                                                                                    | 79/505 [16:29<1:26:16, 12.15s/it] 16%|█████████████████████████████████▉                                                                                                                                                                                    | 80/505 [16:37<1:17:50, 10.99s/it]                                                                                                                                                                                                                                                              {'loss': '0.7833', 'grad_norm': '0.5612', 'learning_rate': '0.000198', 'ppl': '2.189', 'memory/max_active (GiB)': '3.79', 'memory/max_allocated (GiB)': '3.79', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '48.94', 'tokens/total': 927768, 'tokens/trainable': 496895, 'epoch': '0.1587'}
 16%|█████████████████████████████████▉                                                                                                                                                                                    | 80/505 [16:37<1:17:50, 10.99s/it] 16%|██████████████████████████████████▎                                                                                                                                                                                   | 81/505 [16:46<1:12:25, 10.25s/it]                                                                                                                                                                                                                                                              {'loss': '0.7007', 'grad_norm': '0.644', 'learning_rate': '0.0001979', 'ppl': '2.015', 'memory/max_active (GiB)': '3.99', 'memory/max_allocated (GiB)': '3.99', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '58.44', 'tokens/total': 940341, 'tokens/trainable': 503219, 'epoch': '0.1606'}
 16%|██████████████████████████████████▎                                                                                                                                                                                   | 81/505 [16:46<1:12:25, 10.25s/it] 16%|██████████████████████████████████▋                                                                                                                                                                                   | 82/505 [16:54<1:07:12,  9.53s/it]                                                                                                                                                                                                                                                              {'loss': '0.796', 'grad_norm': '0.6602', 'learning_rate': '0.0001977', 'ppl': '2.217', 'memory/max_active (GiB)': '3.95', 'memory/max_allocated (GiB)': '3.95', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '112.2', 'tokens/total': 951748, 'tokens/trainable': 509353, 'epoch': '0.1626'}
 16%|██████████████████████████████████▋                                                                                                                                                                                   | 82/505 [16:54<1:07:12,  9.53s/it] 16%|███████████████████████████████████▏                                                                                                                                                                                  | 83/505 [17:02<1:04:02,  9.11s/it]                                                                                                                                                                                                                                                              {'loss': '0.7936', 'grad_norm': '0.7493', 'learning_rate': '0.0001976', 'ppl': '2.211', 'memory/max_active (GiB)': '4.19', 'memory/max_allocated (GiB)': '4.19', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '113.2', 'tokens/total': 963586, 'tokens/trainable': 515594, 'epoch': '0.1646'}
 16%|███████████████████████████████████▏                                                                                                                                                                                  | 83/505 [17:02<1:04:02,  9.11s/it] 17%|███████████████████████████████████▌                                                                                                                                                                                  | 84/505 [17:11<1:04:32,  9.20s/it]                                                                                                                                                                                                                                                              {'loss': '0.7418', 'grad_norm': '0.7591', 'learning_rate': '0.0001974', 'ppl': '2.1', 'memory/max_active (GiB)': '4.27', 'memory/max_allocated (GiB)': '4.27', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '137.5', 'tokens/total': 977091, 'tokens/trainable': 523367, 'epoch': '0.1666'}
 17%|███████████████████████████████████▌                                                                                                                                                                                  | 84/505 [17:11<1:04:32,  9.20s/it][2026-06-13 16:58:37,174] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

 17%|████████████████████████████████████                                                                                                                                                                                  | 85/505 [17:21<1:06:15,  9.47s/it]                                                                                                                                                                                                                                                              {'loss': '0.8368', 'grad_norm': '0.5105', 'learning_rate': '0.0001973', 'ppl': '2.309', 'memory/max_active (GiB)': '3.89', 'memory/max_allocated (GiB)': '3.89', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '84.12', 'tokens/total': 991600, 'tokens/trainable': 531799, 'epoch': '0.1686'}
 17%|████████████████████████████████████                                                                                                                                                                                  | 85/505 [17:21<1:06:15,  9.47s/it] 17%|████████████████████████████████████▍                                                                                                                                                                                 | 86/505 [17:29<1:02:47,  8.99s/it]                                                                                                                                                                                                                                                              {'loss': '0.7751', 'grad_norm': '0.5887', 'learning_rate': '0.0001971', 'ppl': '2.171', 'memory/max_active (GiB)': '3.78', 'memory/max_allocated (GiB)': '3.78', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '72.55', 'tokens/total': 1003124, 'tokens/trainable': 537820, 'epoch': '0.1706'}
 17%|████████████████████████████████████▍                                                                                                                                                                                 | 86/505 [17:29<1:02:47,  8.99s/it] 17%|█████████████████████████████████████▏                                                                                                                                                                                  | 87/505 [17:36<58:23,  8.38s/it]                                                                                                                                                                                                                                                              {'loss': '0.7528', 'grad_norm': '0.8691', 'learning_rate': '0.0001969', 'ppl': '2.123', 'memory/max_active (GiB)': '3.75', 'memory/max_allocated (GiB)': '3.75', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '155.3', 'tokens/total': 1013244, 'tokens/trainable': 541948, 'epoch': '0.1725'}
 17%|█████████████████████████████████████▏                                                                                                                                                                                  | 87/505 [17:36<58:23,  8.38s/it] 17%|█████████████████████████████████████▋                                                                                                                                                                                  | 88/505 [17:44<57:28,  8.27s/it]                                                                                                                                                                                                                                                              {'loss': '0.7982', 'grad_norm': '0.6088', 'learning_rate': '0.0001968', 'ppl': '2.222', 'memory/max_active (GiB)': '3.71', 'memory/max_allocated (GiB)': '3.71', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '77.8', 'tokens/total': 1024716, 'tokens/trainable': 548240, 'epoch': '0.1745'}
 17%|█████████████████████████████████████▋                                                                                                                                                                                  | 88/505 [17:44<57:28,  8.27s/it] 18%|██████████████████████████████████████                                                                                                                                                                                  | 89/505 [17:51<55:40,  8.03s/it]                                                                                                                                                                                                                                                              {'loss': '0.8361', 'grad_norm': '0.6708', 'learning_rate': '0.0001966', 'ppl': '2.307', 'memory/max_active (GiB)': '3.63', 'memory/max_allocated (GiB)': '3.63', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '126', 'tokens/total': 1035682, 'tokens/trainable': 554011, 'epoch': '0.1765'}
 18%|██████████████████████████████████████                                                                                                                                                                                  | 89/505 [17:51<55:40,  8.03s/it] 18%|██████████████████████████████████████▍                                                                                                                                                                                 | 90/505 [17:58<53:21,  7.71s/it]                                                                                                                                                                                                                                                              {'loss': '0.9592', 'grad_norm': '0.6334', 'learning_rate': '0.0001964', 'ppl': '2.61', 'memory/max_active (GiB)': '3.68', 'memory/max_allocated (GiB)': '3.68', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '96.75', 'tokens/total': 1045879, 'tokens/trainable': 559264, 'epoch': '0.1785'}
 18%|██████████████████████████████████████▍                                                                                                                                                                                 | 90/505 [17:58<53:21,  7.71s/it] 18%|██████████████████████████████████████▉                                                                                                                                                                                 | 91/505 [18:07<55:35,  8.06s/it]                                                                                                                                                                                                                                                              {'loss': '0.9114', 'grad_norm': '0.6626', 'learning_rate': '0.0001962', 'ppl': '2.488', 'memory/max_active (GiB)': '4.11', 'memory/max_allocated (GiB)': '4.11', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '56.69', 'tokens/total': 1058682, 'tokens/trainable': 565769, 'epoch': '0.1805'}
 18%|██████████████████████████████████████▉                                                                                                                                                                                 | 91/505 [18:07<55:35,  8.06s/it] 18%|███████████████████████████████████████▎                                                                                                                                                                                | 92/505 [18:15<53:40,  7.80s/it]                                                                                                                                                                                                                                                              {'loss': '0.819', 'grad_norm': '0.6851', 'learning_rate': '0.000196', 'ppl': '2.268', 'memory/max_active (GiB)': '3.77', 'memory/max_allocated (GiB)': '3.77', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '65.04', 'tokens/total': 1069305, 'tokens/trainable': 570893, 'epoch': '0.1824'}
 18%|███████████████████████████████████████▎                                                                                                                                                                                | 92/505 [18:15<53:40,  7.80s/it] 18%|███████████████████████████████████████▊                                                                                                                                                                                | 93/505 [18:22<53:50,  7.84s/it]                                                                                                                                                                                                                                                              {'loss': '0.8623', 'grad_norm': '0.6432', 'learning_rate': '0.0001958', 'ppl': '2.369', 'memory/max_active (GiB)': '4.09', 'memory/max_allocated (GiB)': '4.09', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '81.8', 'tokens/total': 1080868, 'tokens/trainable': 577085, 'epoch': '0.1844'}
 18%|███████████████████████████████████████▊                                                                                                                                                                                | 93/505 [18:22<53:50,  7.84s/it][2026-06-13 16:59:42,691] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

 19%|████████████████████████████████████████▏                                                                                                                                                                               | 94/505 [18:29<50:01,  7.30s/it]                                                                                                                                                                                                                                                              {'loss': '0.6968', 'grad_norm': '0.7381', 'learning_rate': '0.0001956', 'ppl': '2.007', 'memory/max_active (GiB)': '3.64', 'memory/max_allocated (GiB)': '3.64', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '42.5', 'tokens/total': 1089210, 'tokens/trainable': 581012, 'epoch': '0.1864'}
 19%|████████████████████████████████████████▏                                                                                                                                                                               | 94/505 [18:29<50:01,  7.30s/it] 19%|████████████████████████████████████████▋                                                                                                                                                                               | 95/505 [18:38<53:33,  7.84s/it]                                                                                                                                                                                                                                                              {'loss': '0.7805', 'grad_norm': '0.5442', 'learning_rate': '0.0001954', 'ppl': '2.183', 'memory/max_active (GiB)': '3.97', 'memory/max_allocated (GiB)': '3.97', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '98.93', 'tokens/total': 1102396, 'tokens/trainable': 588582, 'epoch': '0.1884'}
 19%|████████████████████████████████████████▋                                                                                                                                                                               | 95/505 [18:38<53:33,  7.84s/it] 19%|█████████████████████████████████████████                                                                                                                                                                               | 96/505 [18:44<50:20,  7.38s/it]                                                                                                                                                                                                                                                              {'loss': '0.7794', 'grad_norm': '0.6875', 'learning_rate': '0.0001952', 'ppl': '2.18', 'memory/max_active (GiB)': '3.58', 'memory/max_allocated (GiB)': '3.58', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '106.9', 'tokens/total': 1111521, 'tokens/trainable': 593104, 'epoch': '0.1904'}
 19%|█████████████████████████████████████████                                                                                                                                                                               | 96/505 [18:44<50:20,  7.38s/it] 19%|█████████████████████████████████████████▍                                                                                                                                                                              | 97/505 [18:54<54:53,  8.07s/it]                                                                                                                                                                                                                                                              {'loss': '0.8282', 'grad_norm': '0.5292', 'learning_rate': '0.000195', 'ppl': '2.289', 'memory/max_active (GiB)': '3.9', 'memory/max_allocated (GiB)': '3.9', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '118.9', 'tokens/total': 1125582, 'tokens/trainable': 601146, 'epoch': '0.1924'}
 19%|█████████████████████████████████████████▍                                                                                                                                                                              | 97/505 [18:54<54:53,  8.07s/it] 19%|█████████████████████████████████████████▉                                                                                                                                                                              | 98/505 [19:00<51:39,  7.62s/it]                                                                                                                                                                                                                                                              {'loss': '0.7601', 'grad_norm': '0.6183', 'learning_rate': '0.0001948', 'ppl': '2.138', 'memory/max_active (GiB)': '3.62', 'memory/max_allocated (GiB)': '3.62', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '106.3', 'tokens/total': 1135109, 'tokens/trainable': 605791, 'epoch': '0.1943'}
 19%|█████████████████████████████████████████▉                                                                                                                                                                              | 98/505 [19:00<51:39,  7.62s/it] 20%|██████████████████████████████████████████▎                                                                                                                                                                             | 99/505 [19:09<54:01,  7.98s/it]                                                                                                                                                                                                                                                              {'loss': '0.7878', 'grad_norm': '0.5387', 'learning_rate': '0.0001946', 'ppl': '2.199', 'memory/max_active (GiB)': '3.99', 'memory/max_allocated (GiB)': '3.99', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '46.24', 'tokens/total': 1147968, 'tokens/trainable': 613540, 'epoch': '0.1963'}
 20%|██████████████████████████████████████████▎                                                                                                                                                                             | 99/505 [19:09<54:01,  7.98s/it] 20%|██████████████████████████████████████████▌                                                                                                                                                                            | 100/505 [19:16<50:58,  7.55s/it]                                                                                                                                                                                                                                                              {'loss': '0.7871', 'grad_norm': '0.6693', 'learning_rate': '0.0001943', 'ppl': '2.197', 'memory/max_active (GiB)': '3.7', 'memory/max_allocated (GiB)': '3.7', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '64.03', 'tokens/total': 1157462, 'tokens/trainable': 618134, 'epoch': '0.1983'}
 20%|██████████████████████████████████████████▌                                                                                                                                                                            | 100/505 [19:16<50:58,  7.55s/it] 20%|███████████████████████████████████████████                                                                                                                                                                            | 101/505 [19:24<52:14,  7.76s/it]                                                                                                                                                                                                                                                              {'loss': '0.7776', 'grad_norm': '0.6572', 'learning_rate': '0.0001941', 'ppl': '2.176', 'memory/max_active (GiB)': '4.01', 'memory/max_allocated (GiB)': '4.01', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '116.3', 'tokens/total': 1169429, 'tokens/trainable': 624185, 'epoch': '0.2003'}
 20%|███████████████████████████████████████████                                                                                                                                                                            | 101/505 [19:24<52:14,  7.76s/it] 20%|███████████████████████████████████████████▍                                                                                                                                                                           | 102/505 [19:31<50:31,  7.52s/it]                                                                                                                                                                                                                                                              {'loss': '0.7607', 'grad_norm': '0.7296', 'learning_rate': '0.0001939', 'ppl': '2.14', 'memory/max_active (GiB)': '3.74', 'memory/max_allocated (GiB)': '3.74', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '28.68', 'tokens/total': 1179398, 'tokens/trainable': 628970, 'epoch': '0.2023'}
 20%|███████████████████████████████████████████▍                                                                                                                                                                           | 102/505 [19:31<50:31,  7.52s/it] 20%|███████████████████████████████████████████▊                                                                                                                                                                           | 103/505 [19:38<50:46,  7.58s/it]                                                                                                                                                                                                                                                              {'loss': '0.8054', 'grad_norm': '0.6234', 'learning_rate': '0.0001936', 'ppl': '2.238', 'memory/max_active (GiB)': '3.94', 'memory/max_allocated (GiB)': '3.94', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '51.77', 'tokens/total': 1190598, 'tokens/trainable': 634832, 'epoch': '0.2043'}
 20%|███████████████████████████████████████████▊                                                                                                                                                                           | 103/505 [19:38<50:46,  7.58s/it] 21%|████████████████████████████████████████████▎                                                                                                                                                                          | 104/505 [19:45<47:55,  7.17s/it]                                                                                                                                                                                                                                                              {'loss': '0.8825', 'grad_norm': '0.9507', 'learning_rate': '0.0001934', 'ppl': '2.417', 'memory/max_active (GiB)': '3.63', 'memory/max_allocated (GiB)': '3.63', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '92.36', 'tokens/total': 1199110, 'tokens/trainable': 638692, 'epoch': '0.2062'}
 21%|████████████████████████████████████████████▎                                                                                                                                                                          | 104/505 [19:45<47:55,  7.17s/it] 21%|████████████████████████████████████████████▋                                                                                                                                                                          | 105/505 [19:53<49:47,  7.47s/it]                                                                                                                                                                                                                                                              {'loss': '0.8319', 'grad_norm': '0.6216', 'learning_rate': '0.0001931', 'ppl': '2.298', 'memory/max_active (GiB)': '3.86', 'memory/max_allocated (GiB)': '3.86', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '116.1', 'tokens/total': 1211063, 'tokens/trainable': 645290, 'epoch': '0.2082'}
 21%|████████████████████████████████████████████▋                                                                                                                                                                          | 105/505 [19:53<49:47,  7.47s/it] 21%|█████████████████████████████████████████████▏                                                                                                                                                                         | 106/505 [20:00<48:13,  7.25s/it]                                                                                                                                                                                                                                                              {'loss': '0.8184', 'grad_norm': '0.6408', 'learning_rate': '0.0001929', 'ppl': '2.267', 'memory/max_active (GiB)': '3.62', 'memory/max_allocated (GiB)': '3.62', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '116', 'tokens/total': 1220939, 'tokens/trainable': 650189, 'epoch': '0.2102'}
 21%|█████████████████████████████████████████████▏                                                                                                                                                                         | 106/505 [20:00<48:13,  7.25s/it] 21%|█████████████████████████████████████████████▌                                                                                                                                                                         | 107/505 [20:09<53:00,  7.99s/it]                                                                                                                                                                                                                                                              {'loss': '0.7429', 'grad_norm': '0.5209', 'learning_rate': '0.0001926', 'ppl': '2.102', 'memory/max_active (GiB)': '4', 'memory/max_allocated (GiB)': '4', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '33.77', 'tokens/total': 1234897, 'tokens/trainable': 658553, 'epoch': '0.2122'}
 21%|█████████████████████████████████████████████▌                                                                                                                                                                         | 107/505 [20:09<53:00,  7.99s/it] 21%|█████████████████████████████████████████████▉                                                                                                                                                                         | 108/505 [20:17<52:08,  7.88s/it]                                                                                                                                                                                                                                                              {'loss': '0.8325', 'grad_norm': '0.5741', 'learning_rate': '0.0001924', 'ppl': '2.299', 'memory/max_active (GiB)': '3.73', 'memory/max_allocated (GiB)': '3.73', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '48.83', 'tokens/total': 1245828, 'tokens/trainable': 664529, 'epoch': '0.2142'}
 21%|█████████████████████████████████████████████▉                                                                                                                                                                         | 108/505 [20:17<52:08,  7.88s/it] 22%|██████████████████████████████████████████████▍                                                                                                                                                                        | 109/505 [20:25<52:08,  7.90s/it]                                                                                                                                                                                                                                                              {'loss': '0.8486', 'grad_norm': '0.6339', 'learning_rate': '0.0001921', 'ppl': '2.336', 'memory/max_active (GiB)': '4', 'memory/max_allocated (GiB)': '4', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '267.3', 'tokens/total': 1257282, 'tokens/trainable': 671074, 'epoch': '0.2162'}
 22%|██████████████████████████████████████████████▍                                                                                                                                                                        | 109/505 [20:25<52:08,  7.90s/it][2026-06-13 17:01:47,299] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

 22%|██████████████████████████████████████████████▊                                                                                                                                                                        | 110/505 [20:32<50:16,  7.64s/it]                                                                                                                                                                                                                                                              {'loss': '0.7751', 'grad_norm': '0.6781', 'learning_rate': '0.0001918', 'ppl': '2.171', 'memory/max_active (GiB)': '3.64', 'memory/max_allocated (GiB)': '3.64', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '105.9', 'tokens/total': 1267353, 'tokens/trainable': 675410, 'epoch': '0.2181'}
 22%|██████████████████████████████████████████████▊                                                                                                                                                                        | 110/505 [20:32<50:16,  7.64s/it] 22%|███████████████████████████████████████████████▎                                                                                                                                                                       | 111/505 [20:41<52:33,  8.00s/it]                                                                                                                                                                                                                                                              {'loss': '0.7736', 'grad_norm': '0.7491', 'learning_rate': '0.0001915', 'ppl': '2.167', 'memory/max_active (GiB)': '3.9', 'memory/max_allocated (GiB)': '3.9', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '127.5', 'tokens/total': 1280169, 'tokens/trainable': 682698, 'epoch': '0.2201'}
 22%|███████████████████████████████████████████████▎                                                                                                                                                                       | 111/505 [20:41<52:33,  8.00s/it] 22%|███████████████████████████████████████████████▋                                                                                                                                                                       | 112/505 [20:48<51:50,  7.92s/it]                                                                                                                                                                                                                                                              {'loss': '0.7962', 'grad_norm': '0.5987', 'learning_rate': '0.0001913', 'ppl': '2.217', 'memory/max_active (GiB)': '3.78', 'memory/max_allocated (GiB)': '3.78', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '34.63', 'tokens/total': 1291403, 'tokens/trainable': 687743, 'epoch': '0.2221'}
 22%|███████████████████████████████████████████████▋                                                                                                                                                                       | 112/505 [20:48<51:50,  7.92s/it] 22%|████████████████████████████████████████████████                                                                                                                                                                       | 113/505 [20:57<53:53,  8.25s/it]                                                                                                                                                                                                                                                              {'loss': '0.7093', 'grad_norm': '0.5681', 'learning_rate': '0.000191', 'ppl': '2.033', 'memory/max_active (GiB)': '3.98', 'memory/max_allocated (GiB)': '3.98', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '78.02', 'tokens/total': 1304526, 'tokens/trainable': 695141, 'epoch': '0.2241'}
 22%|████████████████████████████████████████████████                                                                                                                                                                       | 113/505 [20:57<53:53,  8.25s/it] 23%|████████████████████████████████████████████████▌                                                                                                                                                                      | 114/505 [21:05<51:37,  7.92s/it]                                                                                                                                                                                                                                                              {'loss': '0.763', 'grad_norm': '0.6083', 'learning_rate': '0.0001907', 'ppl': '2.145', 'memory/max_active (GiB)': '3.68', 'memory/max_allocated (GiB)': '3.68', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '95.09', 'tokens/total': 1314900, 'tokens/trainable': 700414, 'epoch': '0.2261'}
 23%|████████████████████████████████████████████████▌                                                                                                                                                                      | 114/505 [21:05<51:37,  7.92s/it] 23%|████████████████████████████████████████████████▉                                                                                                                                                                      | 115/505 [21:11<47:30,  7.31s/it]                                                                                                                                                                                                                                                              {'loss': '0.8141', 'grad_norm': '0.8009', 'learning_rate': '0.0001904', 'ppl': '2.257', 'memory/max_active (GiB)': '3.59', 'memory/max_allocated (GiB)': '3.59', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '85.92', 'tokens/total': 1323435, 'tokens/trainable': 704382, 'epoch': '0.2281'}
 23%|████████████████████████████████████████████████▉                                                                                                                                                                      | 115/505 [21:11<47:30,  7.31s/it] 23%|█████████████████████████████████████████████████▍                                                                                                                                                                     | 116/505 [21:19<50:02,  7.72s/it]                                                                                                                                                                                                                                                              {'loss': '0.8545', 'grad_norm': '0.6119', 'learning_rate': '0.0001901', 'ppl': '2.35', 'memory/max_active (GiB)': '4.04', 'memory/max_allocated (GiB)': '4.04', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '66.11', 'tokens/total': 1335945, 'tokens/trainable': 711712, 'epoch': '0.23'}
 23%|█████████████████████████████████████████████████▍                                                                                                                                                                     | 116/505 [21:19<50:02,  7.72s/it] 23%|█████████████████████████████████████████████████▊                                                                                                                                                                     | 117/505 [21:26<47:37,  7.36s/it]                                                                                                                                                                                                                                                              {'loss': '0.665', 'grad_norm': '0.7372', 'learning_rate': '0.0001898', 'ppl': '1.944', 'memory/max_active (GiB)': '3.8', 'memory/max_allocated (GiB)': '3.8', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '44.19', 'tokens/total': 1345319, 'tokens/trainable': 716378, 'epoch': '0.232'}
 23%|█████████████████████████████████████████████████▊                                                                                                                                                                     | 117/505 [21:26<47:37,  7.36s/it] 23%|██████████████████████████████████████████████████▏                                                                                                                                                                    | 118/505 [21:34<48:33,  7.53s/it]                                                                                                                                                                                                                                                              {'loss': '0.8053', 'grad_norm': '0.6532', 'learning_rate': '0.0001895', 'ppl': '2.237', 'memory/max_active (GiB)': '3.84', 'memory/max_allocated (GiB)': '3.84', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '169.7', 'tokens/total': 1356687, 'tokens/trainable': 722622, 'epoch': '0.234'}
 23%|██████████████████████████████████████████████████▏                                                                                                                                                                    | 118/505 [21:34<48:33,  7.53s/it] 24%|██████████████████████████████████████████████████▋                                                                                                                                                                    | 119/505 [21:42<49:21,  7.67s/it]                                                                                                                                                                                                                                                              {'loss': '0.7363', 'grad_norm': '0.5626', 'learning_rate': '0.0001892', 'ppl': '2.088', 'memory/max_active (GiB)': '3.84', 'memory/max_allocated (GiB)': '3.84', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '63.59', 'tokens/total': 1368268, 'tokens/trainable': 728895, 'epoch': '0.236'}
 24%|██████████████████████████████████████████████████▋                                                                                                                                                                    | 119/505 [21:42<49:21,  7.67s/it][2026-06-13 17:03:02,336] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

 24%|███████████████████████████████████████████████████                                                                                                                                                                    | 120/505 [21:54<57:30,  8.96s/it]                                                                                                                                                                                                                                                              {'loss': '0.7004', 'grad_norm': '0.5348', 'learning_rate': '0.0001889', 'ppl': '2.015', 'memory/max_active (GiB)': '4.25', 'memory/max_allocated (GiB)': '4.25', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '109', 'tokens/total': 1385297, 'tokens/trainable': 739158, 'epoch': '0.238'}
 24%|███████████████████████████████████████████████████                                                                                                                                                                    | 120/505 [21:54<57:30,  8.96s/it][2026-06-13 17:03:14,173] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

 24%|███████████████████████████████████████████████████▌                                                                                                                                                                   | 121/505 [22:02<56:07,  8.77s/it]                                                                                                                                                                                                                                                              {'loss': '0.7182', 'grad_norm': '0.7178', 'learning_rate': '0.0001885', 'ppl': '2.051', 'memory/max_active (GiB)': '3.82', 'memory/max_allocated (GiB)': '3.82', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '8.659', 'tokens/total': 1397136, 'tokens/trainable': 746031, 'epoch': '0.24'}
 24%|███████████████████████████████████████████████████▌                                                                                                                                                                   | 121/505 [22:02<56:07,  8.77s/it] 24%|███████████████████████████████████████████████████▉                                                                                                                                                                   | 122/505 [22:10<54:51,  8.59s/it]                                                                                                                                                                                                                                                              {'loss': '0.851', 'grad_norm': '0.658', 'learning_rate': '0.0001882', 'ppl': '2.342', 'memory/max_active (GiB)': '3.94', 'memory/max_allocated (GiB)': '3.94', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '176.3', 'tokens/total': 1408966, 'tokens/trainable': 752717, 'epoch': '0.2419'}
 24%|███████████████████████████████████████████████████▉                                                                                                                                                                   | 122/505 [22:10<54:51,  8.59s/it][2026-06-13 17:03:36,725] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

 24%|████████████████████████████████████████████████████▎                                                                                                                                                                  | 123/505 [22:18<54:11,  8.51s/it]                                                                                                                                                                                                                                                              {'loss': '0.8264', 'grad_norm': '0.6716', 'learning_rate': '0.0001879', 'ppl': '2.285', 'memory/max_active (GiB)': '4.19', 'memory/max_allocated (GiB)': '4.19', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '105.2', 'tokens/total': 1420891, 'tokens/trainable': 758995, 'epoch': '0.2439'}
 24%|████████████████████████████████████████████████████▎                                                                                                                                                                  | 123/505 [22:18<54:11,  8.51s/it] 25%|████████████████████████████████████████████████████▊                                                                                                                                                                  | 124/505 [22:29<57:08,  9.00s/it]                                                                                                                                                                                                                                                              {'loss': '0.8359', 'grad_norm': '0.5778', 'learning_rate': '0.0001876', 'ppl': '2.307', 'memory/max_active (GiB)': '3.98', 'memory/max_allocated (GiB)': '3.98', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '138', 'tokens/total': 1435435, 'tokens/trainable': 767980, 'epoch': '0.2459'}
 25%|████████████████████████████████████████████████████▊                                                                                                                                                                  | 124/505 [22:29<57:08,  9.00s/it] 25%|█████████████████████████████████████████████████████▏                                                                                                                                                                 | 125/505 [22:35<51:09,  8.08s/it]                                                                                                                                                                                                                                                              {'loss': '0.6378', 'grad_norm': '0.7116', 'learning_rate': '0.0001872', 'ppl': '1.892', 'memory/max_active (GiB)': '3.55', 'memory/max_allocated (GiB)': '3.55', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '104', 'tokens/total': 1443979, 'tokens/trainable': 771877, 'epoch': '0.2479'}
 25%|█████████████████████████████████████████████████████▏                                                                                                                                                                 | 125/505 [22:35<51:09,  8.08s/it] 25%|█████████████████████████████████████████████████████▋                                                                                                                                                                 | 126/505 [22:40<46:56,  7.43s/it]                                                                                                                                                                                                                                                              {'loss': '0.7961', 'grad_norm': '0.7266', 'learning_rate': '0.0001869', 'ppl': '2.217', 'memory/max_active (GiB)': '3.64', 'memory/max_allocated (GiB)': '3.64', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '54.37', 'tokens/total': 1452453, 'tokens/trainable': 775696, 'epoch': '0.2499'}
 25%|█████████████████████████████████████████████████████▋                                                                                                                                                                 | 126/505 [22:40<46:56,  7.43s/it] 25%|██████████████████████████████████████████████████████                                                                                                                                                                 | 127/505 [22:48<46:13,  7.34s/it]                                                                                                                                                                                                                                                              {'loss': '0.7854', 'grad_norm': '0.5643', 'learning_rate': '0.0001865', 'ppl': '2.193', 'memory/max_active (GiB)': '3.66', 'memory/max_allocated (GiB)': '3.66', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '38.19', 'tokens/total': 1462885, 'tokens/trainable': 780909, 'epoch': '0.2519'}
 25%|██████████████████████████████████████████████████████                                                                                                                                                                 | 127/505 [22:48<46:13,  7.34s/it][2026-06-13 17:04:14,457] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

 25%|██████████████████████████████████████████████████████▍                                                                                                                                                                | 128/505 [22:58<51:23,  8.18s/it]                                                                                                                                                                                                                                                              {'loss': '0.916', 'grad_norm': '0.5525', 'learning_rate': '0.0001862', 'ppl': '2.499', 'memory/max_active (GiB)': '4', 'memory/max_allocated (GiB)': '4', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '106.2', 'tokens/total': 1477470, 'tokens/trainable': 789459, 'epoch': '0.2538'}
 25%|██████████████████████████████████████████████████████▍                                                                                                                                                                | 128/505 [22:58<51:23,  8.18s/it] 26%|██████████████████████████████████████████████████████▉                                                                                                                                                                | 129/505 [23:07<52:38,  8.40s/it]                                                                                                                                                                                                                                                              {'loss': '0.7651', 'grad_norm': '0.5915', 'learning_rate': '0.0001858', 'ppl': '2.149', 'memory/max_active (GiB)': '4.02', 'memory/max_allocated (GiB)': '4.02', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '256.4', 'tokens/total': 1490262, 'tokens/trainable': 796815, 'epoch': '0.2558'}
 26%|██████████████████████████████████████████████████████▉                                                                                                                                                                | 129/505 [23:07<52:38,  8.40s/it] 26%|███████████████████████████████████████████████████████▎                                                                                                                                                               | 130/505 [23:13<47:51,  7.66s/it]                                                                                                                                                                                                                                                              {'loss': '0.8697', 'grad_norm': '0.8074', 'learning_rate': '0.0001855', 'ppl': '2.386', 'memory/max_active (GiB)': '3.67', 'memory/max_allocated (GiB)': '3.67', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '50.64', 'tokens/total': 1498812, 'tokens/trainable': 800664, 'epoch': '0.2578'}
 26%|███████████████████████████████████████████████████████▎                                                                                                                                                               | 130/505 [23:13<47:51,  7.66s/it] 26%|███████████████████████████████████████████████████████▊                                                                                                                                                               | 131/505 [23:20<47:40,  7.65s/it]                                                                                                                                                                                                                                                              {'loss': '0.7225', 'grad_norm': '0.5593', 'learning_rate': '0.0001851', 'ppl': '2.059', 'memory/max_active (GiB)': '3.71', 'memory/max_allocated (GiB)': '3.71', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '169.7', 'tokens/total': 1509924, 'tokens/trainable': 806599, 'epoch': '0.2598'}
 26%|███████████████████████████████████████████████████████▊                                                                                                                                                               | 131/505 [23:20<47:40,  7.65s/it] 26%|████████████████████████████████████████████████████████▏                                                                                                                                                              | 132/505 [23:30<51:03,  8.21s/it]                                                                                                                                                                                                                                                              {'loss': '0.7022', 'grad_norm': '0.5381', 'learning_rate': '0.0001848', 'ppl': '2.018', 'memory/max_active (GiB)': '4.01', 'memory/max_allocated (GiB)': '4.01', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '235.2', 'tokens/total': 1523728, 'tokens/trainable': 814641, 'epoch': '0.2618'}
 26%|████████████████████████████████████████████████████████▏                                                                                                                                                              | 132/505 [23:30<51:03,  8.21s/it] 26%|████████████████████████████████████████████████████████▌                                                                                                                                                              | 133/505 [23:40<54:14,  8.75s/it]                                                                                                                                                                                                                                                              {'loss': '0.7571', 'grad_norm': '0.6672', 'learning_rate': '0.0001844', 'ppl': '2.132', 'memory/max_active (GiB)': '4.07', 'memory/max_allocated (GiB)': '4.07', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '28.72', 'tokens/total': 1538062, 'tokens/trainable': 822854, 'epoch': '0.2638'}
 26%|████████████████████████████████████████████████████████▌                                                                                                                                                              | 133/505 [23:40<54:14,  8.75s/it] 27%|█████████████████████████████████████████████████████████                                                                                                                                                              | 134/505 [23:51<58:47,  9.51s/it]                                                                                                                                                                                                                                                              {'loss': '0.8787', 'grad_norm': '0.6818', 'learning_rate': '0.000184', 'ppl': '2.408', 'memory/max_active (GiB)': '4.28', 'memory/max_allocated (GiB)': '4.28', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '55.21', 'tokens/total': 1554327, 'tokens/trainable': 832644, 'epoch': '0.2657'}
 27%|█████████████████████████████████████████████████████████                                                                                                                                                              | 134/505 [23:51<58:47,  9.51s/it] 27%|█████████████████████████████████████████████████████████▍                                                                                                                                                             | 135/505 [23:59<56:26,  9.15s/it]                                                                                                                                                                                                                                                              {'loss': '0.7134', 'grad_norm': '0.7053', 'learning_rate': '0.0001836', 'ppl': '2.041', 'memory/max_active (GiB)': '3.85', 'memory/max_allocated (GiB)': '3.85', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '116.2', 'tokens/total': 1566319, 'tokens/trainable': 839198, 'epoch': '0.2677'}
 27%|█████████████████████████████████████████████████████████▍                                                                                                                                                             | 135/505 [23:59<56:26,  9.15s/it] 27%|█████████████████████████████████████████████████████████▉                                                                                                                                                             | 136/505 [24:07<53:47,  8.75s/it]                                                                                                                                                                                                                                                              {'loss': '0.7897', 'grad_norm': '0.7959', 'learning_rate': '0.0001833', 'ppl': '2.203', 'memory/max_active (GiB)': '3.96', 'memory/max_allocated (GiB)': '3.96', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '8.719', 'tokens/total': 1577585, 'tokens/trainable': 844799, 'epoch': '0.2697'}
 27%|█████████████████████████████████████████████████████████▉                                                                                                                                                             | 136/505 [24:07<53:47,  8.75s/it] 27%|██████████████████████████████████████████████████████████▎                                                                                                                                                            | 137/505 [24:15<52:29,  8.56s/it]                                                                                                                                                                                                                                                              {'loss': '0.7433', 'grad_norm': '0.6396', 'learning_rate': '0.0001829', 'ppl': '2.103', 'memory/max_active (GiB)': '3.78', 'memory/max_allocated (GiB)': '3.78', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '14.42', 'tokens/total': 1589221, 'tokens/trainable': 851195, 'epoch': '0.2717'}
 27%|██████████████████████████████████████████████████████████▎                                                                                                                                                            | 137/505 [24:15<52:29,  8.56s/it] 27%|██████████████████████████████████████████████████████████▊                                                                                                                                                            | 138/505 [24:23<51:33,  8.43s/it]                                                                                                                                                                                                                                                              {'loss': '0.9323', 'grad_norm': '0.6288', 'learning_rate': '0.0001825', 'ppl': '2.54', 'memory/max_active (GiB)': '4.03', 'memory/max_allocated (GiB)': '4.03', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '58.22', 'tokens/total': 1600998, 'tokens/trainable': 857861, 'epoch': '0.2737'}
 27%|██████████████████████████████████████████████████████████▊                                                                                                                                                            | 138/505 [24:23<51:33,  8.43s/it] 28%|███████████████████████████████████████████████████████████▏                                                                                                                                                           | 139/505 [24:30<48:41,  7.98s/it]                                                                                                                                                                                                                                                              {'loss': '0.8722', 'grad_norm': '0.6804', 'learning_rate': '0.0001821', 'ppl': '2.392', 'memory/max_active (GiB)': '3.85', 'memory/max_allocated (GiB)': '3.85', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '64.79', 'tokens/total': 1611079, 'tokens/trainable': 862830, 'epoch': '0.2757'}
 28%|███████████████████████████████████████████████████████████▏                                                                                                                                                           | 139/505 [24:30<48:41,  7.98s/it] 28%|███████████████████████████████████████████████████████████▌                                                                                                                                                           | 140/505 [24:37<46:47,  7.69s/it]                                                                                                                                                                                                                                                              {'loss': '0.7396', 'grad_norm': '0.6361', 'learning_rate': '0.0001817', 'ppl': '2.095', 'memory/max_active (GiB)': '3.75', 'memory/max_allocated (GiB)': '3.75', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '175.8', 'tokens/total': 1621229, 'tokens/trainable': 867824, 'epoch': '0.2776'}
 28%|███████████████████████████████████████████████████████████▌                                                                                                                                                           | 140/505 [24:37<46:47,  7.69s/it] 28%|████████████████████████████████████████████████████████████                                                                                                                                                           | 141/505 [24:45<46:04,  7.59s/it]                                                                                                                                                                                                                                                              {'loss': '0.8336', 'grad_norm': '0.76', 'learning_rate': '0.0001813', 'ppl': '2.302', 'memory/max_active (GiB)': '3.78', 'memory/max_allocated (GiB)': '3.78', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '18.88', 'tokens/total': 1631774, 'tokens/trainable': 873027, 'epoch': '0.2796'}
 28%|████████████████████████████████████████████████████████████                                                                                                                                                           | 141/505 [24:45<46:04,  7.59s/it] 28%|████████████████████████████████████████████████████████████▍                                                                                                                                                          | 142/505 [24:52<44:32,  7.36s/it]                                                                                                                                                                                                                                                              {'loss': '0.9027', 'grad_norm': '0.7419', 'learning_rate': '0.0001809', 'ppl': '2.466', 'memory/max_active (GiB)': '3.71', 'memory/max_allocated (GiB)': '3.71', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '85.16', 'tokens/total': 1641632, 'tokens/trainable': 877553, 'epoch': '0.2816'}
 28%|████████████████████████████████████████████████████████████▍                                                                                                                                                          | 142/505 [24:52<44:32,  7.36s/it] 28%|████████████████████████████████████████████████████████████▉                                                                                                                                                          | 143/505 [24:59<44:27,  7.37s/it]                                                                                                                                                                                                                                                              {'loss': '0.8845', 'grad_norm': '0.7232', 'learning_rate': '0.0001805', 'ppl': '2.422', 'memory/max_active (GiB)': '3.71', 'memory/max_allocated (GiB)': '3.71', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '22.62', 'tokens/total': 1652281, 'tokens/trainable': 883164, 'epoch': '0.2836'}
 28%|████████████████████████████████████████████████████████████▉                                                                                                                                                          | 143/505 [24:59<44:27,  7.37s/it][2026-06-13 17:06:23,957] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

 29%|█████████████████████████████████████████████████████████████▎                                                                                                                                                         | 144/505 [25:07<44:48,  7.45s/it]                                                                                                                                                                                                                                                              {'loss': '0.688', 'grad_norm': '0.5638', 'learning_rate': '0.0001801', 'ppl': '1.99', 'memory/max_active (GiB)': '3.74', 'memory/max_allocated (GiB)': '3.74', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '141.7', 'tokens/total': 1663161, 'tokens/trainable': 889119, 'epoch': '0.2856'}
 29%|█████████████████████████████████████████████████████████████▎                                                                                                                                                         | 144/505 [25:07<44:48,  7.45s/it] 29%|█████████████████████████████████████████████████████████████▋                                                                                                                                                         | 145/505 [25:15<46:30,  7.75s/it]                                                                                                                                                                                                                                                              {'loss': '0.7924', 'grad_norm': '0.5906', 'learning_rate': '0.0001797', 'ppl': '2.209', 'memory/max_active (GiB)': '3.96', 'memory/max_allocated (GiB)': '3.96', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '180.6', 'tokens/total': 1675395, 'tokens/trainable': 895811, 'epoch': '0.2876'}
 29%|█████████████████████████████████████████████████████████████▋                                                                                                                                                         | 145/505 [25:15<46:30,  7.75s/it] 29%|██████████████████████████████████████████████████████████████▏                                                                                                                                                        | 146/505 [25:22<44:44,  7.48s/it]                                                                                                                                                                                                                                                              {'loss': '0.8298', 'grad_norm': '0.728', 'learning_rate': '0.0001792', 'ppl': '2.293', 'memory/max_active (GiB)': '3.69', 'memory/max_allocated (GiB)': '3.69', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '89.12', 'tokens/total': 1685295, 'tokens/trainable': 900447, 'epoch': '0.2895'}
 29%|██████████████████████████████████████████████████████████████▏                                                                                                                                                        | 146/505 [25:22<44:44,  7.48s/it] 29%|██████████████████████████████████████████████████████████████▌                                                                                                                                                        | 147/505 [25:34<52:48,  8.85s/it]                                                                                                                                                                                                                                                              {'loss': '0.7075', 'grad_norm': '0.509', 'learning_rate': '0.0001788', 'ppl': '2.029', 'memory/max_active (GiB)': '4.22', 'memory/max_allocated (GiB)': '4.22', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '131.6', 'tokens/total': 1702743, 'tokens/trainable': 911182, 'epoch': '0.2915'}
 29%|██████████████████████████████████████████████████████████████▌                                                                                                                                                        | 147/505 [25:34<52:48,  8.85s/it] 29%|███████████████████████████████████████████████████████████████                                                                                                                                                        | 148/505 [25:41<49:13,  8.27s/it]                                                                                                                                                                                                                                                              {'loss': '0.9788', 'grad_norm': '0.7268', 'learning_rate': '0.0001784', 'ppl': '2.661', 'memory/max_active (GiB)': '3.71', 'memory/max_allocated (GiB)': '3.71', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '156.2', 'tokens/total': 1712731, 'tokens/trainable': 916115, 'epoch': '0.2935'}
 29%|███████████████████████████████████████████████████████████████                                                                                                                                                        | 148/505 [25:41<49:13,  8.27s/it] 30%|███████████████████████████████████████████████████████████████▍                                                                                                                                                       | 149/505 [25:50<51:35,  8.70s/it]                                                                                                                                                                                                                                                              {'loss': '0.9444', 'grad_norm': '0.6717', 'learning_rate': '0.000178', 'ppl': '2.571', 'memory/max_active (GiB)': '4.04', 'memory/max_allocated (GiB)': '4.04', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '277.6', 'tokens/total': 1726830, 'tokens/trainable': 924316, 'epoch': '0.2955'}
 30%|███████████████████████████████████████████████████████████████▍                                                                                                                                                       | 149/505 [25:50<51:35,  8.70s/it] 30%|███████████████████████████████████████████████████████████████▊                                                                                                                                                       | 150/505 [25:59<50:27,  8.53s/it]                                                                                                                                                                                                                                                              {'loss': '0.7674', 'grad_norm': '0.642', 'learning_rate': '0.0001775', 'ppl': '2.154', 'memory/max_active (GiB)': '3.87', 'memory/max_allocated (GiB)': '3.87', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '207.6', 'tokens/total': 1738584, 'tokens/trainable': 930826, 'epoch': '0.2975'}
 30%|███████████████████████████████████████████████████████████████▊                                                                                                                                                       | 150/505 [25:59<50:27,  8.53s/it] 30%|████████████████████████████████████████████████████████████████▎                                                                                                                                                      | 151/505 [26:07<49:21,  8.37s/it]                                                                                                                                                                                                                                                              {'loss': '0.7941', 'grad_norm': '0.7977', 'learning_rate': '0.0001771', 'ppl': '2.213', 'memory/max_active (GiB)': '3.99', 'memory/max_allocated (GiB)': '3.99', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '58.45', 'tokens/total': 1750188, 'tokens/trainable': 936828, 'epoch': '0.2995'}
 30%|████████████████████████████████████████████████████████████████▎                                                                                                                                                      | 151/505 [26:07<49:21,  8.37s/it] 30%|████████████████████████████████████████████████████████████████▋                                                                                                                                                      | 152/505 [26:13<45:09,  7.68s/it]                                                                                                                                                                                                                                                              {'loss': '0.6763', 'grad_norm': '0.7788', 'learning_rate': '0.0001767', 'ppl': '1.966', 'memory/max_active (GiB)': '3.71', 'memory/max_allocated (GiB)': '3.71', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '25.73', 'tokens/total': 1758981, 'tokens/trainable': 940930, 'epoch': '0.3014'}
 30%|████████████████████████████████████████████████████████████████▋                                                                                                                                                      | 152/505 [26:13<45:09,  7.68s/it] 30%|█████████████████████████████████████████████████████████████████▏                                                                                                                                                     | 153/505 [26:20<43:34,  7.43s/it]                                                                                                                                                                                                                                                              {'loss': '0.7591', 'grad_norm': '0.7326', 'learning_rate': '0.0001762', 'ppl': '2.136', 'memory/max_active (GiB)': '3.7', 'memory/max_allocated (GiB)': '3.7', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '76.07', 'tokens/total': 1768907, 'tokens/trainable': 945968, 'epoch': '0.3034'}
 30%|█████████████████████████████████████████████████████████████████▏                                                                                                                                                     | 153/505 [26:20<43:34,  7.43s/it] 30%|█████████████████████████████████████████████████████████████████▌                                                                                                                                                     | 154/505 [26:27<43:43,  7.48s/it]                                                                                                                                                                                                                                                              {'loss': '0.6958', 'grad_norm': '0.6373', 'learning_rate': '0.0001758', 'ppl': '2.005', 'memory/max_active (GiB)': '3.86', 'memory/max_allocated (GiB)': '3.86', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '14.63', 'tokens/total': 1779973, 'tokens/trainable': 951929, 'epoch': '0.3054'}
 30%|█████████████████████████████████████████████████████████████████▌                                                                                                                                                     | 154/505 [26:27<43:43,  7.48s/it][2026-06-13 17:07:50,211] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

 31%|█████████████████████████████████████████████████████████████████▉                                                                                                                                                     | 155/505 [26:37<47:07,  8.08s/it]                                                                                                                                                                                                                                                              {'loss': '0.7704', 'grad_norm': '0.6824', 'learning_rate': '0.0001753', 'ppl': '2.161', 'memory/max_active (GiB)': '4.18', 'memory/max_allocated (GiB)': '4.18', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '61.55', 'tokens/total': 1793576, 'tokens/trainable': 959612, 'epoch': '0.3074'}
 31%|█████████████████████████████████████████████████████████████████▉                                                                                                                                                     | 155/505 [26:37<47:07,  8.08s/it] 31%|██████████████████████████████████████████████████████████████████▍                                                                                                                                                    | 156/505 [26:44<45:49,  7.88s/it]                                                                                                                                                                                                                                                              {'loss': '0.8173', 'grad_norm': '0.6899', 'learning_rate': '0.0001749', 'ppl': '2.264', 'memory/max_active (GiB)': '3.95', 'memory/max_allocated (GiB)': '3.95', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '39.95', 'tokens/total': 1804344, 'tokens/trainable': 965232, 'epoch': '0.3094'}
 31%|██████████████████████████████████████████████████████████████████▍                                                                                                                                                    | 156/505 [26:44<45:49,  7.88s/it] 31%|██████████████████████████████████████████████████████████████████▊                                                                                                                                                    | 157/505 [26:51<44:29,  7.67s/it]                                                                                                                                                                                                                                                              {'loss': '0.8784', 'grad_norm': '0.8051', 'learning_rate': '0.0001744', 'ppl': '2.407', 'memory/max_active (GiB)': '4.02', 'memory/max_allocated (GiB)': '4.02', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '22.12', 'tokens/total': 1814730, 'tokens/trainable': 970560, 'epoch': '0.3114'}
 31%|██████████████████████████████████████████████████████████████████▊                                                                                                                                                    | 157/505 [26:51<44:29,  7.67s/it] 31%|███████████████████████████████████████████████████████████████████▎                                                                                                                                                   | 158/505 [26:59<45:00,  7.78s/it]                                                                                                                                                                                                                                                              {'loss': '0.8088', 'grad_norm': '0.6462', 'learning_rate': '0.0001739', 'ppl': '2.245', 'memory/max_active (GiB)': '3.95', 'memory/max_allocated (GiB)': '3.95', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '67.66', 'tokens/total': 1826376, 'tokens/trainable': 977002, 'epoch': '0.3133'}
 31%|███████████████████████████████████████████████████████████████████▎                                                                                                                                                   | 158/505 [26:59<45:00,  7.78s/it] 31%|███████████████████████████████████████████████████████████████████▋                                                                                                                                                   | 159/505 [27:06<43:11,  7.49s/it]                                                                                                                                                                                                                                                              {'loss': '0.8078', 'grad_norm': '0.845', 'learning_rate': '0.0001735', 'ppl': '2.243', 'memory/max_active (GiB)': '3.69', 'memory/max_allocated (GiB)': '3.69', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '67.13', 'tokens/total': 1836158, 'tokens/trainable': 981853, 'epoch': '0.3153'}
 31%|███████████████████████████████████████████████████████████████████▋                                                                                                                                                   | 159/505 [27:06<43:11,  7.49s/it] 32%|████████████████████████████████████████████████████████████████████                                                                                                                                                   | 160/505 [27:19<52:11,  9.08s/it]                                                                                                                                                                                                                                                              {'loss': '0.7331', 'grad_norm': '0.44', 'learning_rate': '0.000173', 'ppl': '2.082', 'memory/max_active (GiB)': '4.23', 'memory/max_allocated (GiB)': '4.23', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '130.1', 'tokens/total': 1854635, 'tokens/trainable': 993636, 'epoch': '0.3173'}
 32%|████████████████████████████████████████████████████████████████████                                                                                                                                                   | 160/505 [27:19<52:11,  9.08s/it] 32%|████████████████████████████████████████████████████████████████████▌                                                                                                                                                  | 161/505 [27:26<49:01,  8.55s/it]                                                                                                                                                                                                                                                              {'loss': '0.8562', 'grad_norm': '0.8912', 'learning_rate': '0.0001725', 'ppl': '2.354', 'memory/max_active (GiB)': '3.9', 'memory/max_allocated (GiB)': '3.9', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '37.68', 'tokens/total': 1865094, 'tokens/trainable': 999289, 'epoch': '0.3193'}
 32%|████████████████████████████████████████████████████████████████████▌                                                                                                                                                  | 161/505 [27:26<49:01,  8.55s/it] 32%|████████████████████████████████████████████████████████████████████▉                                                                                                                                                  | 162/505 [27:34<48:27,  8.48s/it]                                                                                                                                                                                                                                                              {'loss': '0.8107', 'grad_norm': '0.7002', 'learning_rate': '0.000172', 'ppl': '2.249', 'memory/max_active (GiB)': '4.2', 'memory/max_allocated (GiB)': '4.2', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '312.8', 'tokens/total': 1877237, 'tokens/trainable': 1005423, 'epoch': '0.3213'}
 32%|████████████████████████████████████████████████████████████████████▉                                                                                                                                                  | 162/505 [27:34<48:27,  8.48s/it] 32%|█████████████████████████████████████████████████████████████████████▍                                                                                                                                                 | 163/505 [27:42<46:47,  8.21s/it]                                                                                                                                                                                                                                                              {'loss': '0.7728', 'grad_norm': '0.8293', 'learning_rate': '0.0001716', 'ppl': '2.166', 'memory/max_active (GiB)': '4.03', 'memory/max_allocated (GiB)': '4.03', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '63.58', 'tokens/total': 1888138, 'tokens/trainable': 1010893, 'epoch': '0.3233'}
 32%|█████████████████████████████████████████████████████████████████████▍                                                                                                                                                 | 163/505 [27:42<46:47,  8.21s/it] 32%|█████████████████████████████████████████████████████████████████████▊                                                                                                                                                 | 164/505 [27:50<46:33,  8.19s/it]                                                                                                                                                                                                                                                              {'loss': '0.7879', 'grad_norm': '0.6295', 'learning_rate': '0.0001711', 'ppl': '2.199', 'memory/max_active (GiB)': '4', 'memory/max_allocated (GiB)': '4', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '56.81', 'tokens/total': 1899987, 'tokens/trainable': 1017429, 'epoch': '0.3252'}
 32%|█████████████████████████████████████████████████████████████████████▊                                                                                                                                                 | 164/505 [27:50<46:33,  8.19s/it] 33%|██████████████████████████████████████████████████████████████████████▏                                                                                                                                                | 165/505 [27:56<42:48,  7.56s/it]                                                                                                                                                                                                                                                              {'loss': '0.8311', 'grad_norm': '0.8371', 'learning_rate': '0.0001706', 'ppl': '2.296', 'memory/max_active (GiB)': '3.67', 'memory/max_allocated (GiB)': '3.67', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '169', 'tokens/total': 1908773, 'tokens/trainable': 1021432, 'epoch': '0.3272'}
 33%|██████████████████████████████████████████████████████████████████████▏                                                                                                                                                | 165/505 [27:56<42:48,  7.56s/it] 33%|██████████████████████████████████████████████████████████████████████▋                                                                                                                                                | 166/505 [28:03<40:52,  7.23s/it]                                                                                                                                                                                                                                                              {'loss': '0.7279', 'grad_norm': '0.7396', 'learning_rate': '0.0001701', 'ppl': '2.071', 'memory/max_active (GiB)': '3.84', 'memory/max_allocated (GiB)': '3.84', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '44.58', 'tokens/total': 1918045, 'tokens/trainable': 1026095, 'epoch': '0.3292'}
 33%|██████████████████████████████████████████████████████████████████████▋                                                                                                                                                | 166/505 [28:03<40:52,  7.23s/it] 33%|███████████████████████████████████████████████████████████████████████                                                                                                                                                | 167/505 [28:12<44:20,  7.87s/it]                                                                                                                                                                                                                                                              {'loss': '0.7278', 'grad_norm': '0.6043', 'learning_rate': '0.0001696', 'ppl': '2.071', 'memory/max_active (GiB)': '4.19', 'memory/max_allocated (GiB)': '4.19', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '297.7', 'tokens/total': 1931578, 'tokens/trainable': 1033671, 'epoch': '0.3312'}
 33%|███████████████████████████████████████████████████████████████████████                                                                                                                                                | 167/505 [28:12<44:20,  7.87s/it] 33%|███████████████████████████████████████████████████████████████████████▌                                                                                                                                               | 168/505 [28:19<42:56,  7.65s/it]                                                                                                                                                                                                                                                              {'loss': '0.7869', 'grad_norm': '0.7186', 'learning_rate': '0.0001691', 'ppl': '2.197', 'memory/max_active (GiB)': '3.91', 'memory/max_allocated (GiB)': '3.91', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '250.5', 'tokens/total': 1941988, 'tokens/trainable': 1038679, 'epoch': '0.3332'}
 33%|███████████████████████████████████████████████████████████████████████▌                                                                                                                                               | 168/505 [28:19<42:56,  7.65s/it] 33%|███████████████████████████████████████████████████████████████████████▉                                                                                                                                               | 169/505 [28:26<41:54,  7.48s/it]                                                                                                                                                                                                                                                              {'loss': '0.6745', 'grad_norm': '0.6511', 'learning_rate': '0.0001686', 'ppl': '1.963', 'memory/max_active (GiB)': '3.73', 'memory/max_allocated (GiB)': '3.73', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '98.59', 'tokens/total': 1951746, 'tokens/trainable': 1043012, 'epoch': '0.3352'}
 33%|███████████████████████████████████████████████████████████████████████▉                                                                                                                                               | 169/505 [28:26<41:54,  7.48s/it][2026-06-13 17:09:50,542] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

 34%|████████████████████████████████████████████████████████████████████████▍                                                                                                                                              | 170/505 [28:33<40:11,  7.20s/it]                                                                                                                                                                                                                                                              {'loss': '0.8532', 'grad_norm': '0.7904', 'learning_rate': '0.0001681', 'ppl': '2.347', 'memory/max_active (GiB)': '3.72', 'memory/max_allocated (GiB)': '3.72', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '62.28', 'tokens/total': 1961098, 'tokens/trainable': 1047526, 'epoch': '0.3371'}
 34%|████████████████████████████████████████████████████████████████████████▍                                                                                                                                              | 170/505 [28:33<40:11,  7.20s/it] 34%|████████████████████████████████████████████████████████████████████████▊                                                                                                                                              | 171/505 [28:42<42:48,  7.69s/it]                                                                                                                                                                                                                                                              {'loss': '0.7444', 'grad_norm': '0.6539', 'learning_rate': '0.0001676', 'ppl': '2.105', 'memory/max_active (GiB)': '3.9', 'memory/max_allocated (GiB)': '3.9', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '17.88', 'tokens/total': 1973870, 'tokens/trainable': 1054961, 'epoch': '0.3391'}
 34%|████████████████████████████████████████████████████████████████████████▊                                                                                                                                              | 171/505 [28:42<42:48,  7.69s/it] 34%|█████████████████████████████████████████████████████████████████████████▏                                                                                                                                             | 172/505 [28:51<45:00,  8.11s/it]                                                                                                                                                                                                                                                              {'loss': '0.8169', 'grad_norm': '0.5769', 'learning_rate': '0.0001671', 'ppl': '2.263', 'memory/max_active (GiB)': '3.86', 'memory/max_allocated (GiB)': '3.86', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '108.8', 'tokens/total': 1986993, 'tokens/trainable': 1062226, 'epoch': '0.3411'}
 34%|█████████████████████████████████████████████████████████████████████████▏                                                                                                                                             | 172/505 [28:51<45:00,  8.11s/it] 34%|█████████████████████████████████████████████████████████████████████████▋                                                                                                                                             | 173/505 [28:57<41:13,  7.45s/it]                                                                                                                                                                                                                                                              {'loss': '0.787', 'grad_norm': '0.7837', 'learning_rate': '0.0001666', 'ppl': '2.197', 'memory/max_active (GiB)': '3.58', 'memory/max_allocated (GiB)': '3.58', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '79.15', 'tokens/total': 1995499, 'tokens/trainable': 1065734, 'epoch': '0.3431'}
 34%|█████████████████████████████████████████████████████████████████████████▋                                                                                                                                             | 173/505 [28:57<41:13,  7.45s/it] 34%|██████████████████████████████████████████████████████████████████████████                                                                                                                                             | 174/505 [29:03<39:46,  7.21s/it]                                                                                                                                                                                                                                                              {'loss': '0.736', 'grad_norm': '0.7003', 'learning_rate': '0.0001661', 'ppl': '2.088', 'memory/max_active (GiB)': '3.97', 'memory/max_allocated (GiB)': '3.97', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '74', 'tokens/total': 2005047, 'tokens/trainable': 1070639, 'epoch': '0.3451'}
 34%|██████████████████████████████████████████████████████████████████████████                                                                                                                                             | 174/505 [29:03<39:46,  7.21s/it] 35%|██████████████████████████████████████████████████████████████████████████▌                                                                                                                                            | 175/505 [29:11<41:02,  7.46s/it]                                                                                                                                                                                                                                                              {'loss': '0.7859', 'grad_norm': '0.6489', 'learning_rate': '0.0001655', 'ppl': '2.194', 'memory/max_active (GiB)': '3.82', 'memory/max_allocated (GiB)': '3.82', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '151.7', 'tokens/total': 2016477, 'tokens/trainable': 1077081, 'epoch': '0.3471'}
 35%|██████████████████████████████████████████████████████████████████████████▌                                                                                                                                            | 175/505 [29:11<41:02,  7.46s/it] 35%|██████████████████████████████████████████████████████████████████████████▉                                                                                                                                            | 176/505 [29:18<39:02,  7.12s/it]                                                                                                                                                                                                                                                              {'loss': '0.7634', 'grad_norm': '0.8022', 'learning_rate': '0.000165', 'ppl': '2.146', 'memory/max_active (GiB)': '3.72', 'memory/max_allocated (GiB)': '3.72', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '29.91', 'tokens/total': 2025627, 'tokens/trainable': 1081227, 'epoch': '0.349'}
 35%|██████████████████████████████████████████████████████████████████████████▉                                                                                                                                            | 176/505 [29:18<39:02,  7.12s/it] 35%|███████████████████████████████████████████████████████████████████████████▎                                                                                                                                           | 177/505 [29:26<40:04,  7.33s/it]                                                                                                                                                                                                                                                              {'loss': '0.6472', 'grad_norm': '0.6725', 'learning_rate': '0.0001645', 'ppl': '1.91', 'memory/max_active (GiB)': '3.73', 'memory/max_allocated (GiB)': '3.73', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '95.17', 'tokens/total': 2036933, 'tokens/trainable': 1087537, 'epoch': '0.351'}
 35%|███████████████████████████████████████████████████████████████████████████▎                                                                                                                                           | 177/505 [29:26<40:04,  7.33s/it] 35%|███████████████████████████████████████████████████████████████████████████▊                                                                                                                                           | 178/505 [29:35<43:49,  8.04s/it]                                                                                                                                                                                                                                                              {'loss': '0.7364', 'grad_norm': '0.674', 'learning_rate': '0.000164', 'ppl': '2.088', 'memory/max_active (GiB)': '3.91', 'memory/max_allocated (GiB)': '3.91', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '187.5', 'tokens/total': 2050800, 'tokens/trainable': 1095507, 'epoch': '0.353'}
 35%|███████████████████████████████████████████████████████████████████████████▊                                                                                                                                           | 178/505 [29:35<43:49,  8.04s/it] 35%|████████████████████████████████████████████████████████████████████████████▏                                                                                                                                          | 179/505 [29:40<39:05,  7.19s/it]                                                                                                                                                                                                                                                              {'loss': '0.8214', 'grad_norm': '0.908', 'learning_rate': '0.0001634', 'ppl': '2.274', 'memory/max_active (GiB)': '3.56', 'memory/max_allocated (GiB)': '3.56', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '87.76', 'tokens/total': 2058273, 'tokens/trainable': 1098513, 'epoch': '0.355'}
 35%|████████████████████████████████████████████████████████████████████████████▏                                                                                                                                          | 179/505 [29:40<39:05,  7.19s/it] 36%|████████████████████████████████████████████████████████████████████████████▋                                                                                                                                          | 180/505 [29:49<40:30,  7.48s/it]                                                                                                                                                                                                                                                              {'loss': '0.7356', 'grad_norm': '0.6062', 'learning_rate': '0.0001629', 'ppl': '2.087', 'memory/max_active (GiB)': '3.77', 'memory/max_allocated (GiB)': '3.77', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '32.94', 'tokens/total': 2070135, 'tokens/trainable': 1105244, 'epoch': '0.357'}
 36%|████████████████████████████████████████████████████████████████████████████▋                                                                                                                                          | 180/505 [29:49<40:30,  7.48s/it] 36%|█████████████████████████████████████████████████████████████████████████████                                                                                                                                          | 181/505 [29:56<40:58,  7.59s/it]                                                                                                                                                                                                                                                              {'loss': '0.7819', 'grad_norm': '0.7539', 'learning_rate': '0.0001623', 'ppl': '2.186', 'memory/max_active (GiB)': '3.98', 'memory/max_allocated (GiB)': '3.98', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '39.52', 'tokens/total': 2081445, 'tokens/trainable': 1111593, 'epoch': '0.3589'}
 36%|█████████████████████████████████████████████████████████████████████████████                                                                                                                                          | 181/505 [29:56<40:58,  7.59s/it] 36%|█████████████████████████████████████████████████████████████████████████████▍                                                                                                                                         | 182/505 [30:04<41:27,  7.70s/it]                                                                                                                                                                                                                                                              {'loss': '0.7657', 'grad_norm': '0.7265', 'learning_rate': '0.0001618', 'ppl': '2.15', 'memory/max_active (GiB)': '4.01', 'memory/max_allocated (GiB)': '4.01', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '73.07', 'tokens/total': 2093087, 'tokens/trainable': 1118106, 'epoch': '0.3609'}
 36%|█████████████████████████████████████████████████████████████████████████████▍                                                                                                                                         | 182/505 [30:04<41:27,  7.70s/it] 36%|█████████████████████████████████████████████████████████████████████████████▉                                                                                                                                         | 183/505 [30:13<42:03,  7.84s/it]                                                                                                                                                                                                                                                              {'loss': '0.8536', 'grad_norm': '0.7502', 'learning_rate': '0.0001613', 'ppl': '2.348', 'memory/max_active (GiB)': '3.96', 'memory/max_allocated (GiB)': '3.96', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '263.6', 'tokens/total': 2104858, 'tokens/trainable': 1125173, 'epoch': '0.3629'}
 36%|█████████████████████████████████████████████████████████████████████████████▉                                                                                                                                         | 183/505 [30:13<42:03,  7.84s/it] 36%|██████████████████████████████████████████████████████████████████████████████▎                                                                                                                                        | 184/505 [30:21<43:09,  8.07s/it]                                                                                                                                                                                                                                                              {'loss': '0.7193', 'grad_norm': '0.6279', 'learning_rate': '0.0001607', 'ppl': '2.053', 'memory/max_active (GiB)': '3.99', 'memory/max_allocated (GiB)': '3.99', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '31.39', 'tokens/total': 2117407, 'tokens/trainable': 1132366, 'epoch': '0.3649'}
 36%|██████████████████████████████████████████████████████████████████████████████▎                                                                                                                                        | 184/505 [30:21<43:09,  8.07s/it] 37%|██████████████████████████████████████████████████████████████████████████████▊                                                                                                                                        | 185/505 [30:28<41:04,  7.70s/it]                                                                                                                                                                                                                                                              {'loss': '0.7647', 'grad_norm': '1.013', 'learning_rate': '0.0001602', 'ppl': '2.148', 'memory/max_active (GiB)': '3.96', 'memory/max_allocated (GiB)': '3.96', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '38.29', 'tokens/total': 2127319, 'tokens/trainable': 1137002, 'epoch': '0.3669'}
 37%|██████████████████████████████████████████████████████████████████████████████▊                                                                                                                                        | 185/505 [30:28<41:04,  7.70s/it] 37%|███████████████████████████████████████████████████████████████████████████████▏                                                                                                                                       | 186/505 [30:38<44:22,  8.35s/it]                                                                                                                                                                                                                                                              {'loss': '0.8591', 'grad_norm': '0.5862', 'learning_rate': '0.0001596', 'ppl': '2.361', 'memory/max_active (GiB)': '4.1', 'memory/max_allocated (GiB)': '4.1', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '110', 'tokens/total': 2141637, 'tokens/trainable': 1144307, 'epoch': '0.3689'}
 37%|███████████████████████████████████████████████████████████████████████████████▏                                                                                                                                       | 186/505 [30:38<44:22,  8.35s/it][2026-06-13 17:12:04,067] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

 37%|███████████████████████████████████████████████████████████████████████████████▌                                                                                                                                       | 187/505 [30:49<48:37,  9.17s/it]                                                                                                                                                                                                                                                              {'loss': '0.7911', 'grad_norm': '0.606', 'learning_rate': '0.0001591', 'ppl': '2.206', 'memory/max_active (GiB)': '4.12', 'memory/max_allocated (GiB)': '4.12', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '112.1', 'tokens/total': 2157694, 'tokens/trainable': 1153983, 'epoch': '0.3708'}
 37%|███████████████████████████████████████████████████████████████████████████████▌                                                                                                                                       | 187/505 [30:49<48:37,  9.17s/it] 37%|████████████████████████████████████████████████████████████████████████████████                                                                                                                                       | 188/505 [30:58<47:54,  9.07s/it]                                                                                                                                                                                                                                                              {'loss': '0.7401', 'grad_norm': '0.5891', 'learning_rate': '0.0001585', 'ppl': '2.096', 'memory/max_active (GiB)': '4.1', 'memory/max_allocated (GiB)': '4.1', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '277.6', 'tokens/total': 2170374, 'tokens/trainable': 1161605, 'epoch': '0.3728'}
 37%|████████████████████████████████████████████████████████████████████████████████                                                                                                                                       | 188/505 [30:58<47:54,  9.07s/it] 37%|████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                      | 189/505 [31:08<49:25,  9.39s/it]                                                                                                                                                                                                                                                              {'loss': '0.7875', 'grad_norm': '0.5282', 'learning_rate': '0.0001579', 'ppl': '2.198', 'memory/max_active (GiB)': '4.02', 'memory/max_allocated (GiB)': '4.02', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '70.92', 'tokens/total': 2185005, 'tokens/trainable': 1169994, 'epoch': '0.3748'}
 37%|████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                      | 189/505 [31:08<49:25,  9.39s/it] 38%|████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                      | 190/505 [31:16<46:47,  8.91s/it]                                                                                                                                                                                                                                                              {'loss': '0.7885', 'grad_norm': '0.74', 'learning_rate': '0.0001574', 'ppl': '2.2', 'memory/max_active (GiB)': '3.78', 'memory/max_allocated (GiB)': '3.78', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '130', 'tokens/total': 2196313, 'tokens/trainable': 1176046, 'epoch': '0.3768'}
 38%|████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                      | 190/505 [31:16<46:47,  8.91s/it] 38%|█████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                     | 191/505 [31:25<47:17,  9.04s/it]                                                                                                                                                                                                                                                              {'loss': '0.755', 'grad_norm': '0.6347', 'learning_rate': '0.0001568', 'ppl': '2.128', 'memory/max_active (GiB)': '4.1', 'memory/max_allocated (GiB)': '4.1', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '277', 'tokens/total': 2209824, 'tokens/trainable': 1183451, 'epoch': '0.3788'}
 38%|█████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                     | 191/505 [31:25<47:17,  9.04s/it] 38%|█████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                     | 192/505 [31:35<47:51,  9.17s/it]                                                                                                                                                                                                                                                              {'loss': '0.8176', 'grad_norm': '0.5931', 'learning_rate': '0.0001562', 'ppl': '2.265', 'memory/max_active (GiB)': '4.21', 'memory/max_allocated (GiB)': '4.21', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '247.6', 'tokens/total': 2223741, 'tokens/trainable': 1190783, 'epoch': '0.3808'}
 38%|█████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                     | 192/505 [31:35<47:51,  9.17s/it] 38%|██████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                    | 193/505 [31:43<46:06,  8.87s/it]                                                                                                                                                                                                                                                              {'loss': '0.7476', 'grad_norm': '0.7064', 'learning_rate': '0.0001557', 'ppl': '2.112', 'memory/max_active (GiB)': '3.84', 'memory/max_allocated (GiB)': '3.84', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '36.84', 'tokens/total': 2235449, 'tokens/trainable': 1196713, 'epoch': '0.3827'}
 38%|██████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                    | 193/505 [31:43<46:06,  8.87s/it] 38%|██████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                    | 194/505 [31:51<45:22,  8.76s/it]                                                                                                                                                                                                                                                              {'loss': '0.7571', 'grad_norm': '0.5131', 'learning_rate': '0.0001551', 'ppl': '2.132', 'memory/max_active (GiB)': '3.87', 'memory/max_allocated (GiB)': '3.87', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '208.6', 'tokens/total': 2247772, 'tokens/trainable': 1203671, 'epoch': '0.3847'}
 38%|██████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                    | 194/505 [31:51<45:22,  8.76s/it] 39%|███████████████████████████████████████████████████████████████████████████████████                                                                                                                                    | 195/505 [32:00<45:36,  8.83s/it]                                                                                                                                                                                                                                                              {'loss': '0.6939', 'grad_norm': '0.6682', 'learning_rate': '0.0001545', 'ppl': '2.001', 'memory/max_active (GiB)': '4.03', 'memory/max_allocated (GiB)': '4.03', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '254.9', 'tokens/total': 2260703, 'tokens/trainable': 1210905, 'epoch': '0.3867'}
 39%|███████████████████████████████████████████████████████████████████████████████████                                                                                                                                    | 195/505 [32:00<45:36,  8.83s/it] 39%|███████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                   | 196/505 [32:10<46:37,  9.05s/it]                                                                                                                                                                                                                                                              {'loss': '0.6163', 'grad_norm': '0.5397', 'learning_rate': '0.0001539', 'ppl': '1.852', 'memory/max_active (GiB)': '4.19', 'memory/max_allocated (GiB)': '4.19', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '124.7', 'tokens/total': 2274603, 'tokens/trainable': 1218774, 'epoch': '0.3887'}
 39%|███████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                   | 196/505 [32:10<46:37,  9.05s/it] 39%|███████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                   | 197/505 [32:17<44:13,  8.62s/it]                                                                                                                                                                                                                                                              {'loss': '0.7221', 'grad_norm': '0.5736', 'learning_rate': '0.0001533', 'ppl': '2.059', 'memory/max_active (GiB)': '3.67', 'memory/max_allocated (GiB)': '3.67', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '154.1', 'tokens/total': 2285540, 'tokens/trainable': 1224616, 'epoch': '0.3907'}
 39%|███████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                   | 197/505 [32:17<44:13,  8.62s/it] 39%|████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                  | 198/505 [32:25<43:07,  8.43s/it]                                                                                                                                                                                                                                                              {'loss': '0.7961', 'grad_norm': '0.7154', 'learning_rate': '0.0001528', 'ppl': '2.217', 'memory/max_active (GiB)': '4.08', 'memory/max_allocated (GiB)': '4.08', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '97.74', 'tokens/total': 2297071, 'tokens/trainable': 1230659, 'epoch': '0.3927'}
 39%|████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                  | 198/505 [32:25<43:07,  8.43s/it] 39%|████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                  | 199/505 [32:33<41:49,  8.20s/it]                                                                                                                                                                                                                                                              {'loss': '0.8384', 'grad_norm': '0.6253', 'learning_rate': '0.0001522', 'ppl': '2.313', 'memory/max_active (GiB)': '3.77', 'memory/max_allocated (GiB)': '3.77', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '67.43', 'tokens/total': 2308371, 'tokens/trainable': 1236639, 'epoch': '0.3946'}
 39%|████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                  | 199/505 [32:33<41:49,  8.20s/it] 40%|█████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                 | 200/505 [32:39<38:16,  7.53s/it]                                                                                                                                                                                                                                                              {'loss': '0.6096', 'grad_norm': '0.7786', 'learning_rate': '0.0001516', 'ppl': '1.84', 'memory/max_active (GiB)': '3.63', 'memory/max_allocated (GiB)': '3.63', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '15.76', 'tokens/total': 2316990, 'tokens/trainable': 1240447, 'epoch': '0.3966'}
 40%|█████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                 | 200/505 [32:39<38:16,  7.53s/it] 40%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                 | 201/505 [32:47<38:25,  7.58s/it]                                                                                                                                                                                                                                                              {'loss': '0.8065', 'grad_norm': '0.6262', 'learning_rate': '0.000151', 'ppl': '2.24', 'memory/max_active (GiB)': '3.87', 'memory/max_allocated (GiB)': '3.87', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '69.55', 'tokens/total': 2328169, 'tokens/trainable': 1246473, 'epoch': '0.3986'}
 40%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                 | 201/505 [32:47<38:25,  7.58s/it] 40%|██████████████████████████████████████████████████████████████████████████████████████                                                                                                                                 | 202/505 [32:55<39:06,  7.74s/it]                                                                                                                                                                                                                                                              {'loss': '0.7145', 'grad_norm': '0.612', 'learning_rate': '0.0001504', 'ppl': '2.043', 'memory/max_active (GiB)': '3.98', 'memory/max_allocated (GiB)': '3.98', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '62.94', 'tokens/total': 2339978, 'tokens/trainable': 1252549, 'epoch': '0.4006'}
 40%|██████████████████████████████████████████████████████████████████████████████████████                                                                                                                                 | 202/505 [32:55<39:06,  7.74s/it] 40%|██████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                | 203/505 [33:00<35:27,  7.04s/it]                                                                                                                                                                                                                                                              {'loss': '0.7812', 'grad_norm': '0.7388', 'learning_rate': '0.0001498', 'ppl': '2.184', 'memory/max_active (GiB)': '3.54', 'memory/max_allocated (GiB)': '3.54', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '30.32', 'tokens/total': 2347763, 'tokens/trainable': 1255555, 'epoch': '0.4026'}
 40%|██████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                | 203/505 [33:00<35:27,  7.04s/it] 40%|██████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                | 204/505 [33:11<40:31,  8.08s/it]                                                                                                                                                                                                                                                              {'loss': '0.7972', 'grad_norm': '0.5027', 'learning_rate': '0.0001492', 'ppl': '2.219', 'memory/max_active (GiB)': '3.97', 'memory/max_allocated (GiB)': '3.97', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '59.28', 'tokens/total': 2362882, 'tokens/trainable': 1264587, 'epoch': '0.4046'}
 40%|██████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                | 204/505 [33:11<40:31,  8.08s/it] 41%|███████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                               | 205/505 [33:20<41:52,  8.38s/it]                                                                                                                                                                                                                                                              {'loss': '0.7526', 'grad_norm': '0.5519', 'learning_rate': '0.0001486', 'ppl': '2.123', 'memory/max_active (GiB)': '4.04', 'memory/max_allocated (GiB)': '4.04', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '41.47', 'tokens/total': 2375923, 'tokens/trainable': 1272240, 'epoch': '0.4065'}
 41%|███████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                               | 205/505 [33:20<41:52,  8.38s/it] 41%|███████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                               | 206/505 [33:27<40:01,  8.03s/it]                                                                                                                                                                                                                                                              {'loss': '0.7722', 'grad_norm': '0.595', 'learning_rate': '0.000148', 'ppl': '2.165', 'memory/max_active (GiB)': '3.67', 'memory/max_allocated (GiB)': '3.67', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '47.3', 'tokens/total': 2386558, 'tokens/trainable': 1277679, 'epoch': '0.4085'}
 41%|███████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                               | 206/505 [33:27<40:01,  8.03s/it] 41%|████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                              | 207/505 [33:34<38:57,  7.84s/it]                                                                                                                                                                                                                                                              {'loss': '0.8153', 'grad_norm': '0.8339', 'learning_rate': '0.0001474', 'ppl': '2.26', 'memory/max_active (GiB)': '3.94', 'memory/max_allocated (GiB)': '3.94', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '48.37', 'tokens/total': 2397196, 'tokens/trainable': 1282744, 'epoch': '0.4105'}
 41%|████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                              | 207/505 [33:34<38:57,  7.84s/it] 41%|████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                              | 208/505 [33:42<39:03,  7.89s/it]                                                                                                                                                                                                                                                              {'loss': '0.7112', 'grad_norm': '0.6473', 'learning_rate': '0.0001468', 'ppl': '2.036', 'memory/max_active (GiB)': '3.78', 'memory/max_allocated (GiB)': '3.78', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '87.51', 'tokens/total': 2408713, 'tokens/trainable': 1289166, 'epoch': '0.4125'}
 41%|████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                              | 208/505 [33:42<39:03,  7.89s/it][2026-06-13 17:15:08,452] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

 41%|████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                              | 209/505 [33:54<44:54,  9.10s/it]                                                                                                                                                                                                                                                              {'loss': '0.7607', 'grad_norm': '0.491', 'learning_rate': '0.0001462', 'ppl': '2.14', 'memory/max_active (GiB)': '4.26', 'memory/max_allocated (GiB)': '4.26', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '229.7', 'tokens/total': 2425801, 'tokens/trainable': 1300160, 'epoch': '0.4145'}
 41%|████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                              | 209/505 [33:54<44:54,  9.10s/it] 42%|█████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                             | 210/505 [34:01<41:05,  8.36s/it]                                                                                                                                                                                                                                                              {'loss': '0.7804', 'grad_norm': '0.6828', 'learning_rate': '0.0001456', 'ppl': '2.182', 'memory/max_active (GiB)': '3.71', 'memory/max_allocated (GiB)': '3.71', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '60.74', 'tokens/total': 2435434, 'tokens/trainable': 1304881, 'epoch': '0.4165'}
 42%|█████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                             | 210/505 [34:01<41:05,  8.36s/it] 42%|█████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                             | 211/505 [34:09<40:39,  8.30s/it]                                                                                                                                                                                                                                                              {'loss': '0.8198', 'grad_norm': '0.6852', 'learning_rate': '0.0001449', 'ppl': '2.27', 'memory/max_active (GiB)': '3.96', 'memory/max_allocated (GiB)': '3.96', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '22.56', 'tokens/total': 2447125, 'tokens/trainable': 1311052, 'epoch': '0.4184'}
 42%|█████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                             | 211/505 [34:09<40:39,  8.30s/it] 42%|██████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                            | 212/505 [34:18<41:29,  8.50s/it]                                                                                                                                                                                                                                                              {'loss': '0.7153', 'grad_norm': '0.7018', 'learning_rate': '0.0001443', 'ppl': '2.045', 'memory/max_active (GiB)': '4.02', 'memory/max_allocated (GiB)': '4.02', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '13.73', 'tokens/total': 2460028, 'tokens/trainable': 1318503, 'epoch': '0.4204'}
 42%|██████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                            | 212/505 [34:18<41:29,  8.50s/it] 42%|██████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                            | 213/505 [34:25<39:29,  8.12s/it]                                                                                                                                                                                                                                                              {'loss': '0.7315', 'grad_norm': '0.6778', 'learning_rate': '0.0001437', 'ppl': '2.078', 'memory/max_active (GiB)': '3.79', 'memory/max_allocated (GiB)': '3.79', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '100.5', 'tokens/total': 2470450, 'tokens/trainable': 1324124, 'epoch': '0.4224'}
 42%|██████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                            | 213/505 [34:25<39:29,  8.12s/it] 42%|███████████████████████████████████████████████████████████████████████████████████████████                                                                                                                            | 214/505 [34:33<38:43,  7.98s/it]                                                                                                                                                                                                                                                              {'loss': '0.8151', 'grad_norm': '0.6128', 'learning_rate': '0.0001431', 'ppl': '2.259', 'memory/max_active (GiB)': '3.8', 'memory/max_allocated (GiB)': '3.8', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '98.23', 'tokens/total': 2481478, 'tokens/trainable': 1330171, 'epoch': '0.4244'}
 42%|███████████████████████████████████████████████████████████████████████████████████████████                                                                                                                            | 214/505 [34:33<38:43,  7.98s/it] 43%|███████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                           | 215/505 [34:41<38:00,  7.87s/it]                                                                                                                                                                                                                                                              {'loss': '0.7758', 'grad_norm': '0.711', 'learning_rate': '0.0001425', 'ppl': '2.172', 'memory/max_active (GiB)': '3.84', 'memory/max_allocated (GiB)': '3.84', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '50.49', 'tokens/total': 2492483, 'tokens/trainable': 1335689, 'epoch': '0.4264'}
 43%|███████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                           | 215/505 [34:41<38:00,  7.87s/it] 43%|███████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                           | 216/505 [34:49<38:42,  8.04s/it]                                                                                                                                                                                                                                                              {'loss': '0.693', 'grad_norm': '0.5818', 'learning_rate': '0.0001418', 'ppl': '2', 'memory/max_active (GiB)': '3.78', 'memory/max_allocated (GiB)': '3.78', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '75.54', 'tokens/total': 2504614, 'tokens/trainable': 1342571, 'epoch': '0.4284'}
 43%|███████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                           | 216/505 [34:49<38:42,  8.04s/it] 43%|████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                          | 217/505 [34:59<41:07,  8.57s/it]                                                                                                                                                                                                                                                              {'loss': '0.6625', 'grad_norm': '0.5779', 'learning_rate': '0.0001412', 'ppl': '1.94', 'memory/max_active (GiB)': '3.89', 'memory/max_allocated (GiB)': '3.89', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '163.1', 'tokens/total': 2518748, 'tokens/trainable': 1351182, 'epoch': '0.4303'}
 43%|████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                          | 217/505 [34:59<41:07,  8.57s/it][2026-06-13 17:16:21,577] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

 43%|████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                          | 218/505 [35:06<38:16,  8.00s/it]                                                                                                                                                                                                                                                              {'loss': '0.7077', 'grad_norm': '0.7485', 'learning_rate': '0.0001406', 'ppl': '2.029', 'memory/max_active (GiB)': '3.56', 'memory/max_allocated (GiB)': '3.56', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '88.2', 'tokens/total': 2527224, 'tokens/trainable': 1354666, 'epoch': '0.4323'}
 43%|████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                          | 218/505 [35:06<38:16,  8.00s/it] 43%|█████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                         | 219/505 [35:12<35:58,  7.55s/it]                                                                                                                                                                                                                                                              {'loss': '0.6958', 'grad_norm': '0.8665', 'learning_rate': '0.0001399', 'ppl': '2.005', 'memory/max_active (GiB)': '3.7', 'memory/max_allocated (GiB)': '3.7', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '141.4', 'tokens/total': 2536509, 'tokens/trainable': 1358933, 'epoch': '0.4343'}
 43%|█████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                         | 219/505 [35:12<35:58,  7.55s/it] 44%|█████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                         | 220/505 [35:21<38:26,  8.09s/it]                                                                                                                                                                                                                                                              {'loss': '0.834', 'grad_norm': '0.7537', 'learning_rate': '0.0001393', 'ppl': '2.303', 'memory/max_active (GiB)': '4.26', 'memory/max_allocated (GiB)': '4.26', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '300.6', 'tokens/total': 2549894, 'tokens/trainable': 1366402, 'epoch': '0.4363'}
 44%|█████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                         | 220/505 [35:21<38:26,  8.09s/it] 44%|██████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                         | 221/505 [35:28<36:50,  7.78s/it]                                                                                                                                                                                                                                                              {'loss': '0.8722', 'grad_norm': '0.9327', 'learning_rate': '0.0001387', 'ppl': '2.392', 'memory/max_active (GiB)': '4.04', 'memory/max_allocated (GiB)': '4.04', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '320.1', 'tokens/total': 2560158, 'tokens/trainable': 1371314, 'epoch': '0.4383'}
 44%|██████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                         | 221/505 [35:28<36:50,  7.78s/it] 44%|██████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                        | 222/505 [35:36<36:32,  7.75s/it]                                                                                                                                                                                                                                                              {'loss': '0.753', 'grad_norm': '0.6261', 'learning_rate': '0.000138', 'ppl': '2.123', 'memory/max_active (GiB)': '3.99', 'memory/max_allocated (GiB)': '3.99', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '60.07', 'tokens/total': 2571319, 'tokens/trainable': 1376979, 'epoch': '0.4403'}
 44%|██████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                        | 222/505 [35:36<36:32,  7.75s/it] 44%|██████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                        | 223/505 [35:43<35:25,  7.54s/it]                                                                                                                                                                                                                                                              {'loss': '0.902', 'grad_norm': '0.6572', 'learning_rate': '0.0001374', 'ppl': '2.465', 'memory/max_active (GiB)': '3.72', 'memory/max_allocated (GiB)': '3.72', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '74.11', 'tokens/total': 2581549, 'tokens/trainable': 1382297, 'epoch': '0.4422'}
 44%|██████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                        | 223/505 [35:43<35:25,  7.54s/it] 44%|███████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                       | 224/505 [35:51<35:50,  7.65s/it]                                                                                                                                                                                                                                                              {'loss': '0.7844', 'grad_norm': '0.6246', 'learning_rate': '0.0001367', 'ppl': '2.191', 'memory/max_active (GiB)': '3.84', 'memory/max_allocated (GiB)': '3.84', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '67.47', 'tokens/total': 2593043, 'tokens/trainable': 1388426, 'epoch': '0.4442'}
 44%|███████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                       | 224/505 [35:51<35:50,  7.65s/it] 45%|███████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                       | 225/505 [35:58<35:21,  7.58s/it]                                                                                                                                                                                                                                                              {'loss': '0.7643', 'grad_norm': '0.7317', 'learning_rate': '0.0001361', 'ppl': '2.147', 'memory/max_active (GiB)': '3.77', 'memory/max_allocated (GiB)': '3.77', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '19.2', 'tokens/total': 2603665, 'tokens/trainable': 1393728, 'epoch': '0.4462'}
 45%|███████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                       | 225/505 [35:58<35:21,  7.58s/it] 45%|████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                      | 226/505 [36:09<40:00,  8.60s/it]                                                                                                                                                                                                                                                              {'loss': '0.7371', 'grad_norm': '0.5489', 'learning_rate': '0.0001355', 'ppl': '2.09', 'memory/max_active (GiB)': '4.24', 'memory/max_allocated (GiB)': '4.24', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '56.2', 'tokens/total': 2619653, 'tokens/trainable': 1403386, 'epoch': '0.4482'}
 45%|████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                      | 226/505 [36:09<40:00,  8.60s/it] 45%|████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                      | 227/505 [36:17<38:09,  8.24s/it]                                                                                                                                                                                                                                                              {'loss': '0.6848', 'grad_norm': '0.705', 'learning_rate': '0.0001348', 'ppl': '1.983', 'memory/max_active (GiB)': '3.85', 'memory/max_allocated (GiB)': '3.85', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '14.51', 'tokens/total': 2630362, 'tokens/trainable': 1408695, 'epoch': '0.4502'}
 45%|████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                      | 227/505 [36:17<38:09,  8.24s/it] 45%|█████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                      | 228/505 [36:26<38:58,  8.44s/it]                                                                                                                                                                                                                                                              {'loss': '0.8258', 'grad_norm': '0.7267', 'learning_rate': '0.0001342', 'ppl': '2.284', 'memory/max_active (GiB)': '4.03', 'memory/max_allocated (GiB)': '4.03', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '140.7', 'tokens/total': 2643133, 'tokens/trainable': 1415881, 'epoch': '0.4522'}
 45%|█████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                      | 228/505 [36:26<38:58,  8.44s/it] 45%|█████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                     | 229/505 [36:34<38:02,  8.27s/it]                                                                                                                                                                                                                                                              {'loss': '0.7365', 'grad_norm': '0.6201', 'learning_rate': '0.0001335', 'ppl': '2.089', 'memory/max_active (GiB)': '3.79', 'memory/max_allocated (GiB)': '3.79', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '47.01', 'tokens/total': 2654504, 'tokens/trainable': 1421843, 'epoch': '0.4541'}
 45%|█████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                     | 229/505 [36:34<38:02,  8.27s/it] 46%|█████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                     | 230/505 [36:40<35:51,  7.82s/it]                                                                                                                                                                                                                                                              {'loss': '0.5672', 'grad_norm': '0.9076', 'learning_rate': '0.0001329', 'ppl': '1.763', 'memory/max_active (GiB)': '3.7', 'memory/max_allocated (GiB)': '3.7', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '98.22', 'tokens/total': 2664287, 'tokens/trainable': 1426216, 'epoch': '0.4561'}
 46%|█████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                     | 230/505 [36:40<35:51,  7.82s/it] 46%|██████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                    | 231/505 [36:48<34:56,  7.65s/it]                                                                                                                                                                                                                                                              {'loss': '0.6877', 'grad_norm': '0.7074', 'learning_rate': '0.0001322', 'ppl': '1.989', 'memory/max_active (GiB)': '3.85', 'memory/max_allocated (GiB)': '3.85', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '47.91', 'tokens/total': 2674702, 'tokens/trainable': 1431352, 'epoch': '0.4581'}
 46%|██████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                    | 231/505 [36:48<34:56,  7.65s/it] 46%|██████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                    | 232/505 [36:56<36:16,  7.97s/it]                                                                                                                                                                                                                                                              {'loss': '0.7082', 'grad_norm': '0.6143', 'learning_rate': '0.0001316', 'ppl': '2.03', 'memory/max_active (GiB)': '3.99', 'memory/max_allocated (GiB)': '3.99', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '20.52', 'tokens/total': 2687380, 'tokens/trainable': 1438124, 'epoch': '0.4601'}
 46%|██████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                    | 232/505 [36:56<36:16,  7.97s/it] 46%|███████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                   | 233/505 [37:03<33:54,  7.48s/it]                                                                                                                                                                                                                                                              {'loss': '0.7898', 'grad_norm': '0.8773', 'learning_rate': '0.0001309', 'ppl': '2.203', 'memory/max_active (GiB)': '3.63', 'memory/max_allocated (GiB)': '3.63', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '74.5', 'tokens/total': 2696565, 'tokens/trainable': 1442750, 'epoch': '0.4621'}
 46%|███████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                   | 233/505 [37:03<33:54,  7.48s/it] 46%|███████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                   | 234/505 [37:11<34:40,  7.68s/it]                                                                                                                                                                                                                                                              {'loss': '0.5911', 'grad_norm': '0.6274', 'learning_rate': '0.0001302', 'ppl': '1.806', 'memory/max_active (GiB)': '3.78', 'memory/max_allocated (GiB)': '3.78', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '64.52', 'tokens/total': 2707774, 'tokens/trainable': 1448583, 'epoch': '0.4641'}
 46%|███████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                   | 234/505 [37:11<34:40,  7.68s/it] 47%|████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                   | 235/505 [37:19<34:55,  7.76s/it]                                                                                                                                                                                                                                                              {'loss': '0.8202', 'grad_norm': '1.069', 'learning_rate': '0.0001296', 'ppl': '2.271', 'memory/max_active (GiB)': '4.03', 'memory/max_allocated (GiB)': '4.03', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '57.91', 'tokens/total': 2719311, 'tokens/trainable': 1454715, 'epoch': '0.466'}
 47%|████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                   | 235/505 [37:19<34:55,  7.76s/it] 47%|████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                  | 236/505 [37:27<35:09,  7.84s/it]                                                                                                                                                                                                                                                              {'loss': '0.8082', 'grad_norm': '0.6285', 'learning_rate': '0.0001289', 'ppl': '2.244', 'memory/max_active (GiB)': '3.71', 'memory/max_allocated (GiB)': '3.71', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '163', 'tokens/total': 2731027, 'tokens/trainable': 1460926, 'epoch': '0.468'}
 47%|████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                  | 236/505 [37:27<35:09,  7.84s/it] 47%|████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                  | 237/505 [37:36<37:05,  8.30s/it]                                                                                                                                                                                                                                                              {'loss': '0.7088', 'grad_norm': '0.5351', 'learning_rate': '0.0001283', 'ppl': '2.031', 'memory/max_active (GiB)': '4.1', 'memory/max_allocated (GiB)': '4.1', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '75.6', 'tokens/total': 2744543, 'tokens/trainable': 1468649, 'epoch': '0.47'}
 47%|████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                  | 237/505 [37:36<37:05,  8.30s/it] 47%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                 | 238/505 [37:45<37:00,  8.32s/it]                                                                                                                                                                                                                                                              {'loss': '0.8218', 'grad_norm': '0.7264', 'learning_rate': '0.0001276', 'ppl': '2.275', 'memory/max_active (GiB)': '3.99', 'memory/max_allocated (GiB)': '3.99', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '69.49', 'tokens/total': 2756562, 'tokens/trainable': 1475444, 'epoch': '0.472'}
 47%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                 | 238/505 [37:45<37:00,  8.32s/it] 47%|█████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                 | 239/505 [37:51<34:36,  7.81s/it]                                                                                                                                                                                                                                                              {'loss': '0.82', 'grad_norm': '0.7215', 'learning_rate': '0.0001269', 'ppl': '2.271', 'memory/max_active (GiB)': '3.64', 'memory/max_allocated (GiB)': '3.64', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '33.99', 'tokens/total': 2766105, 'tokens/trainable': 1479835, 'epoch': '0.474'}
 47%|█████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                 | 239/505 [37:51<34:36,  7.81s/it] 48%|██████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                | 240/505 [37:58<32:49,  7.43s/it]                                                                                                                                                                                                                                                              {'loss': '0.8011', 'grad_norm': '0.7225', 'learning_rate': '0.0001263', 'ppl': '2.228', 'memory/max_active (GiB)': '3.61', 'memory/max_allocated (GiB)': '3.61', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '54.65', 'tokens/total': 2775532, 'tokens/trainable': 1484527, 'epoch': '0.476'}
 48%|██████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                | 240/505 [37:58<32:49,  7.43s/it] 48%|██████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                | 241/505 [38:05<32:40,  7.43s/it]                                                                                                                                                                                                                                                              {'loss': '0.7255', 'grad_norm': '0.6591', 'learning_rate': '0.0001256', 'ppl': '2.066', 'memory/max_active (GiB)': '3.88', 'memory/max_allocated (GiB)': '3.88', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '72.03', 'tokens/total': 2786332, 'tokens/trainable': 1489873, 'epoch': '0.4779'}
 48%|██████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                | 241/505 [38:05<32:40,  7.43s/it] 48%|███████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                | 242/505 [38:12<31:36,  7.21s/it]                                                                                                                                                                                                                                                              {'loss': '0.707', 'grad_norm': '0.8125', 'learning_rate': '0.0001249', 'ppl': '2.028', 'memory/max_active (GiB)': '3.97', 'memory/max_allocated (GiB)': '3.97', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '47.1', 'tokens/total': 2796006, 'tokens/trainable': 1494226, 'epoch': '0.4799'}
 48%|███████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                | 242/505 [38:12<31:36,  7.21s/it] 48%|███████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                               | 243/505 [38:22<35:02,  8.03s/it]                                                                                                                                                                                                                                                              {'loss': '0.6279', 'grad_norm': '0.738', 'learning_rate': '0.0001243', 'ppl': '1.874', 'memory/max_active (GiB)': '4.2', 'memory/max_allocated (GiB)': '4.2', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '104.4', 'tokens/total': 2810380, 'tokens/trainable': 1502651, 'epoch': '0.4819'}
 48%|███████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                               | 243/505 [38:22<35:02,  8.03s/it] 48%|███████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                               | 244/505 [38:31<37:02,  8.52s/it]                                                                                                                                                                                                                                                              {'loss': '0.8764', 'grad_norm': '0.6134', 'learning_rate': '0.0001236', 'ppl': '2.402', 'memory/max_active (GiB)': '4.1', 'memory/max_allocated (GiB)': '4.1', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '79.29', 'tokens/total': 2824480, 'tokens/trainable': 1510194, 'epoch': '0.4839'}
 48%|███████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                               | 244/505 [38:31<37:02,  8.52s/it] 49%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                              | 245/505 [38:39<36:04,  8.33s/it]                                                                                                                                                                                                                                                              {'loss': '0.6724', 'grad_norm': '0.5965', 'learning_rate': '0.0001229', 'ppl': '1.959', 'memory/max_active (GiB)': '3.81', 'memory/max_allocated (GiB)': '3.81', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '51.65', 'tokens/total': 2835871, 'tokens/trainable': 1516428, 'epoch': '0.4859'}
 49%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                              | 245/505 [38:39<36:04,  8.33s/it] 49%|████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                              | 246/505 [38:46<34:12,  7.92s/it]                                                                                                                                                                                                                                                              {'loss': '0.7344', 'grad_norm': '0.6266', 'learning_rate': '0.0001223', 'ppl': '2.084', 'memory/max_active (GiB)': '3.84', 'memory/max_allocated (GiB)': '3.84', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '85.73', 'tokens/total': 2846065, 'tokens/trainable': 1521679, 'epoch': '0.4879'}
 49%|████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                              | 246/505 [38:46<34:12,  7.92s/it] 49%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                             | 247/505 [38:54<33:09,  7.71s/it]                                                                                                                                                                                                                                                              {'loss': '0.7985', 'grad_norm': '0.885', 'learning_rate': '0.0001216', 'ppl': '2.222', 'memory/max_active (GiB)': '4.02', 'memory/max_allocated (GiB)': '4.02', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '135.3', 'tokens/total': 2856489, 'tokens/trainable': 1527101, 'epoch': '0.4898'}
 49%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                             | 247/505 [38:54<33:09,  7.71s/it] 49%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                             | 248/505 [39:02<33:50,  7.90s/it]                                                                                                                                                                                                                                                              {'loss': '0.6913', 'grad_norm': '0.649', 'learning_rate': '0.0001209', 'ppl': '1.996', 'memory/max_active (GiB)': '3.95', 'memory/max_allocated (GiB)': '3.95', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '102.5', 'tokens/total': 2868546, 'tokens/trainable': 1533367, 'epoch': '0.4918'}
 49%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                             | 248/505 [39:02<33:50,  7.90s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                             | 249/505 [39:10<34:22,  8.06s/it]                                                                                                                                                                                                                                                              {'loss': '0.7191', 'grad_norm': '0.6297', 'learning_rate': '0.0001202', 'ppl': '2.053', 'memory/max_active (GiB)': '3.8', 'memory/max_allocated (GiB)': '3.8', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '111.9', 'tokens/total': 2880641, 'tokens/trainable': 1539699, 'epoch': '0.4938'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                             | 249/505 [39:10<34:22,  8.06s/it] 50%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                            | 250/505 [39:18<33:56,  7.98s/it]                                                                                                                                                                                                                                                              {'loss': '0.6866', 'grad_norm': '0.6395', 'learning_rate': '0.0001196', 'ppl': '1.987', 'memory/max_active (GiB)': '3.78', 'memory/max_allocated (GiB)': '3.78', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '53.24', 'tokens/total': 2891742, 'tokens/trainable': 1545468, 'epoch': '0.4958'}
 50%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                            | 250/505 [39:18<33:56,  7.98s/it] 50%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                            | 251/505 [39:26<33:56,  8.02s/it]                                                                                                                                                                                                                                                              {'loss': '0.7545', 'grad_norm': '0.6194', 'learning_rate': '0.0001189', 'ppl': '2.126', 'memory/max_active (GiB)': '3.78', 'memory/max_allocated (GiB)': '3.78', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '48.68', 'tokens/total': 2903589, 'tokens/trainable': 1551901, 'epoch': '0.4978'}
 50%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                            | 251/505 [39:26<33:56,  8.02s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                           | 252/505 [39:35<35:03,  8.32s/it]                                                                                                                                                                                                                                                              {'loss': '0.859', 'grad_norm': '0.6329', 'learning_rate': '0.0001182', 'ppl': '2.361', 'memory/max_active (GiB)': '4.18', 'memory/max_allocated (GiB)': '4.18', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '41.73', 'tokens/total': 2916578, 'tokens/trainable': 1559128, 'epoch': '0.4998'}
 50%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                           | 252/505 [39:35<35:03,  8.32s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                           | 253/505 [39:43<34:05,  8.12s/it]                                                                                                                                                                                                                                                              {'loss': '0.8696', 'grad_norm': '0.6943', 'learning_rate': '0.0001175', 'ppl': '2.386', 'memory/max_active (GiB)': '3.84', 'memory/max_allocated (GiB)': '3.84', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '50.27', 'tokens/total': 2927578, 'tokens/trainable': 1564879, 'epoch': '0.5017'}
 50%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                           | 253/505 [39:43<34:05,  8.12s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                          | 254/505 [39:52<35:35,  8.51s/it]                                                                                                                                                                                                                                                              {'loss': '0.7535', 'grad_norm': '0.6231', 'learning_rate': '0.0001168', 'ppl': '2.124', 'memory/max_active (GiB)': '4.08', 'memory/max_allocated (GiB)': '4.08', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '34.61', 'tokens/total': 2941423, 'tokens/trainable': 1572946, 'epoch': '0.5037'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                          | 254/505 [39:52<35:35,  8.51s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                          | 255/505 [40:01<35:09,  8.44s/it]                                                                                                                                                                                                                                                              {'loss': '0.6649', 'grad_norm': '0.5512', 'learning_rate': '0.0001162', 'ppl': '1.944', 'memory/max_active (GiB)': '3.77', 'memory/max_allocated (GiB)': '3.77', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '32.52', 'tokens/total': 2953531, 'tokens/trainable': 1579733, 'epoch': '0.5057'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                          | 255/505 [40:01<35:09,  8.44s/it] 51%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                          | 256/505 [40:09<34:53,  8.41s/it]                                                                                                                                                                                                                                                              {'loss': '0.7668', 'grad_norm': '0.6058', 'learning_rate': '0.0001155', 'ppl': '2.153', 'memory/max_active (GiB)': '3.71', 'memory/max_allocated (GiB)': '3.71', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '113.6', 'tokens/total': 2965571, 'tokens/trainable': 1586605, 'epoch': '0.5077'}
 51%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                          | 256/505 [40:09<34:53,  8.41s/it][2026-06-13 17:21:34,439] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

 51%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                         | 257/505 [40:18<35:41,  8.63s/it]                                                                                                                                                                                                                                                              {'loss': '0.6668', 'grad_norm': '0.7301', 'learning_rate': '0.0001148', 'ppl': '1.948', 'memory/max_active (GiB)': '4.05', 'memory/max_allocated (GiB)': '4.05', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '84.26', 'tokens/total': 2978754, 'tokens/trainable': 1593860, 'epoch': '0.5097'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                         | 257/505 [40:18<35:41,  8.63s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                         | 258/505 [40:26<34:52,  8.47s/it]                                                                                                                                                                                                                                                              {'loss': '0.7168', 'grad_norm': '0.7387', 'learning_rate': '0.0001141', 'ppl': '2.048', 'memory/max_active (GiB)': '4.03', 'memory/max_allocated (GiB)': '4.03', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '52.78', 'tokens/total': 2990439, 'tokens/trainable': 1599988, 'epoch': '0.5117'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                         | 258/505 [40:26<34:52,  8.47s/it][2026-06-13 17:21:48,054] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

 51%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                        | 259/505 [40:33<32:20,  7.89s/it]                                                                                                                                                                                                                                                              {'loss': '0.7178', 'grad_norm': '0.8168', 'learning_rate': '0.0001134', 'ppl': '2.05', 'memory/max_active (GiB)': '3.68', 'memory/max_allocated (GiB)': '3.68', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '181.3', 'tokens/total': 2999754, 'tokens/trainable': 1604229, 'epoch': '0.5136'}
 51%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                        | 259/505 [40:33<32:20,  7.89s/it] 51%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                        | 260/505 [40:42<34:10,  8.37s/it]                                                                                                                                                                                                                                                              {'loss': '0.7447', 'grad_norm': '0.5313', 'learning_rate': '0.0001127', 'ppl': '2.106', 'memory/max_active (GiB)': '3.87', 'memory/max_allocated (GiB)': '3.87', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '30.58', 'tokens/total': 3013494, 'tokens/trainable': 1612460, 'epoch': '0.5156'}
 51%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                        | 260/505 [40:42<34:10,  8.37s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                        | 261/505 [40:48<31:22,  7.72s/it]                                                                                                                                                                                                                                                              {'loss': '0.6144', 'grad_norm': '0.8376', 'learning_rate': '0.0001121', 'ppl': '1.849', 'memory/max_active (GiB)': '3.77', 'memory/max_allocated (GiB)': '3.77', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '43.6', 'tokens/total': 3022497, 'tokens/trainable': 1616583, 'epoch': '0.5176'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                        | 261/505 [40:48<31:22,  7.72s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                       | 262/505 [40:56<31:44,  7.84s/it]                                                                                                                                                                                                                                                              {'loss': '0.7183', 'grad_norm': '0.7406', 'learning_rate': '0.0001114', 'ppl': '2.051', 'memory/max_active (GiB)': '4.03', 'memory/max_allocated (GiB)': '4.03', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '75.74', 'tokens/total': 3034326, 'tokens/trainable': 1622934, 'epoch': '0.5196'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                       | 262/505 [40:56<31:44,  7.84s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                       | 263/505 [41:06<33:18,  8.26s/it]                                                                                                                                                                                                                                                              {'loss': '0.6973', 'grad_norm': '0.7245', 'learning_rate': '0.0001107', 'ppl': '2.008', 'memory/max_active (GiB)': '4.22', 'memory/max_allocated (GiB)': '4.22', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '101.5', 'tokens/total': 3047498, 'tokens/trainable': 1630132, 'epoch': '0.5216'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                       | 263/505 [41:06<33:18,  8.26s/it] 52%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                      | 264/505 [41:17<36:46,  9.15s/it]                                                                                                                                                                                                                                                              {'loss': '0.6943', 'grad_norm': '0.4919', 'learning_rate': '0.00011', 'ppl': '2.002', 'memory/max_active (GiB)': '4', 'memory/max_allocated (GiB)': '4', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '81.22', 'tokens/total': 3063828, 'tokens/trainable': 1639528, 'epoch': '0.5235'}
 52%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                      | 264/505 [41:17<36:46,  9.15s/it] 52%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                      | 265/505 [41:26<35:58,  8.99s/it]                                                                                                                                                                                                                                                              {'loss': '0.7603', 'grad_norm': '0.7289', 'learning_rate': '0.0001093', 'ppl': '2.139', 'memory/max_active (GiB)': '3.78', 'memory/max_allocated (GiB)': '3.78', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '153.9', 'tokens/total': 3076170, 'tokens/trainable': 1645808, 'epoch': '0.5255'}
 52%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                      | 265/505 [41:26<35:58,  8.99s/it] 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                     | 266/505 [41:33<34:29,  8.66s/it]                                                                                                                                                                                                                                                              {'loss': '0.6348', 'grad_norm': '0.6447', 'learning_rate': '0.0001086', 'ppl': '1.887', 'memory/max_active (GiB)': '3.91', 'memory/max_allocated (GiB)': '3.91', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '44.93', 'tokens/total': 3087537, 'tokens/trainable': 1651693, 'epoch': '0.5275'}
 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                     | 266/505 [41:33<34:29,  8.66s/it] 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                     | 267/505 [41:43<34:49,  8.78s/it]                                                                                                                                                                                                                                                              {'loss': '0.7755', 'grad_norm': '0.6005', 'learning_rate': '0.0001079', 'ppl': '2.172', 'memory/max_active (GiB)': '3.85', 'memory/max_allocated (GiB)': '3.85', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '97.91', 'tokens/total': 3100706, 'tokens/trainable': 1659493, 'epoch': '0.5295'}
 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                     | 267/505 [41:43<34:49,  8.78s/it] 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                     | 268/505 [41:50<33:02,  8.36s/it]                                                                                                                                                                                                                                                              {'loss': '0.7433', 'grad_norm': '0.6392', 'learning_rate': '0.0001072', 'ppl': '2.103', 'memory/max_active (GiB)': '3.8', 'memory/max_allocated (GiB)': '3.8', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '90', 'tokens/total': 3111556, 'tokens/trainable': 1665256, 'epoch': '0.5315'}
 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                     | 268/505 [41:50<33:02,  8.36s/it] 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                    | 269/505 [41:56<30:27,  7.74s/it]                                                                                                                                                                                                                                                              {'loss': '0.7588', 'grad_norm': '0.7969', 'learning_rate': '0.0001066', 'ppl': '2.136', 'memory/max_active (GiB)': '3.7', 'memory/max_allocated (GiB)': '3.7', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '92.33', 'tokens/total': 3120638, 'tokens/trainable': 1669242, 'epoch': '0.5335'}
 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                    | 269/505 [41:56<30:27,  7.74s/it] 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                    | 270/505 [42:04<30:10,  7.70s/it]                                                                                                                                                                                                                                                              {'loss': '0.6831', 'grad_norm': '0.6161', 'learning_rate': '0.0001059', 'ppl': '1.98', 'memory/max_active (GiB)': '3.71', 'memory/max_allocated (GiB)': '3.71', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '132.8', 'tokens/total': 3131581, 'tokens/trainable': 1674387, 'epoch': '0.5354'}
 53%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                    | 270/505 [42:04<30:10,  7.70s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                   | 271/505 [42:12<30:15,  7.76s/it]                                                                                                                                                                                                                                                              {'loss': '0.7206', 'grad_norm': '0.8469', 'learning_rate': '0.0001052', 'ppl': '2.056', 'memory/max_active (GiB)': '4.1', 'memory/max_allocated (GiB)': '4.1', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '56.77', 'tokens/total': 3142794, 'tokens/trainable': 1680617, 'epoch': '0.5374'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                   | 271/505 [42:12<30:15,  7.76s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                   | 272/505 [42:19<29:36,  7.63s/it]                                                                                                                                                                                                                                                              {'loss': '0.7834', 'grad_norm': '0.6357', 'learning_rate': '0.0001045', 'ppl': '2.189', 'memory/max_active (GiB)': '3.77', 'memory/max_allocated (GiB)': '3.77', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '206.4', 'tokens/total': 3153488, 'tokens/trainable': 1686537, 'epoch': '0.5394'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                   | 272/505 [42:19<29:36,  7.63s/it] 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                  | 273/505 [42:28<30:32,  7.90s/it]                                                                                                                                                                                                                                                              {'loss': '0.8283', 'grad_norm': '0.6142', 'learning_rate': '0.0001038', 'ppl': '2.289', 'memory/max_active (GiB)': '3.82', 'memory/max_allocated (GiB)': '3.82', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '84.3', 'tokens/total': 3165670, 'tokens/trainable': 1693690, 'epoch': '0.5414'}
 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                  | 273/505 [42:28<30:32,  7.90s/it] 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                  | 274/505 [42:37<31:52,  8.28s/it]                                                                                                                                                                                                                                                              {'loss': '0.7681', 'grad_norm': '0.7029', 'learning_rate': '0.0001031', 'ppl': '2.156', 'memory/max_active (GiB)': '3.98', 'memory/max_allocated (GiB)': '3.98', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '27.36', 'tokens/total': 3179058, 'tokens/trainable': 1701137, 'epoch': '0.5434'}
 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                  | 274/505 [42:37<31:52,  8.28s/it] 54%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                  | 275/505 [42:47<33:40,  8.79s/it]                                                                                                                                                                                                                                                              {'loss': '0.7757', 'grad_norm': '0.7196', 'learning_rate': '0.0001024', 'ppl': '2.172', 'memory/max_active (GiB)': '4.15', 'memory/max_allocated (GiB)': '4.15', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '35.83', 'tokens/total': 3193312, 'tokens/trainable': 1709665, 'epoch': '0.5454'}
 54%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                  | 275/505 [42:47<33:40,  8.79s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                 | 276/505 [42:54<31:37,  8.28s/it]                                                                                                                                                                                                                                                              {'loss': '0.7029', 'grad_norm': '0.7012', 'learning_rate': '0.0001017', 'ppl': '2.02', 'memory/max_active (GiB)': '3.89', 'memory/max_allocated (GiB)': '3.89', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '25.73', 'tokens/total': 3203574, 'tokens/trainable': 1714659, 'epoch': '0.5473'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                 | 276/505 [42:54<31:37,  8.28s/it] 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                 | 277/505 [43:02<31:37,  8.32s/it]                                                                                                                                                                                                                                                              {'loss': '0.7951', 'grad_norm': '0.642', 'learning_rate': '0.000101', 'ppl': '2.215', 'memory/max_active (GiB)': '3.98', 'memory/max_allocated (GiB)': '3.98', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '77.35', 'tokens/total': 3215887, 'tokens/trainable': 1721600, 'epoch': '0.5493'}
 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                 | 277/505 [43:02<31:37,  8.32s/it] 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                | 278/505 [43:10<30:47,  8.14s/it]                                                                                                                                                                                                                                                              {'loss': '0.8203', 'grad_norm': '0.8695', 'learning_rate': '0.0001003', 'ppl': '2.271', 'memory/max_active (GiB)': '4.02', 'memory/max_allocated (GiB)': '4.02', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '181.5', 'tokens/total': 3226946, 'tokens/trainable': 1726994, 'epoch': '0.5513'}
 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                | 278/505 [43:10<30:47,  8.14s/it] 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                | 279/505 [43:18<30:24,  8.07s/it]                                                                                                                                                                                                                                                              {'loss': '0.843', 'grad_norm': '0.6806', 'learning_rate': '9.965e-05', 'ppl': '2.323', 'memory/max_active (GiB)': '3.71', 'memory/max_allocated (GiB)': '3.71', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '72.32', 'tokens/total': 3238537, 'tokens/trainable': 1732552, 'epoch': '0.5533'}
 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                | 279/505 [43:18<30:24,  8.07s/it] 55%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                               | 280/505 [43:27<31:39,  8.44s/it]                                                                                                                                                                                                                                                              {'loss': '0.6043', 'grad_norm': '0.4881', 'learning_rate': '9.896e-05', 'ppl': '1.83', 'memory/max_active (GiB)': '3.89', 'memory/max_allocated (GiB)': '3.89', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '170.7', 'tokens/total': 3251964, 'tokens/trainable': 1739962, 'epoch': '0.5553'}
 55%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                               | 280/505 [43:27<31:39,  8.44s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                               | 281/505 [43:34<29:58,  8.03s/it]                                                                                                                                                                                                                                                              {'loss': '0.7321', 'grad_norm': '0.6504', 'learning_rate': '9.827e-05', 'ppl': '2.08', 'memory/max_active (GiB)': '3.7', 'memory/max_allocated (GiB)': '3.7', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '62.11', 'tokens/total': 3262340, 'tokens/trainable': 1744957, 'epoch': '0.5573'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                               | 281/505 [43:34<29:58,  8.03s/it] 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                               | 282/505 [43:42<29:11,  7.85s/it]                                                                                                                                                                                                                                                              {'loss': '0.6978', 'grad_norm': '0.7918', 'learning_rate': '9.758e-05', 'ppl': '2.009', 'memory/max_active (GiB)': '3.84', 'memory/max_allocated (GiB)': '3.84', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '84.81', 'tokens/total': 3273139, 'tokens/trainable': 1750469, 'epoch': '0.5592'}
 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                               | 282/505 [43:42<29:11,  7.85s/it] 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                              | 283/505 [43:53<33:08,  8.96s/it]                                                                                                                                                                                                                                                              {'loss': '0.8064', 'grad_norm': '0.5784', 'learning_rate': '9.689e-05', 'ppl': '2.24', 'memory/max_active (GiB)': '4.05', 'memory/max_allocated (GiB)': '4.05', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '136.1', 'tokens/total': 3289823, 'tokens/trainable': 1760365, 'epoch': '0.5612'}
 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                              | 283/505 [43:53<33:08,  8.96s/it] 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                              | 284/505 [44:01<31:33,  8.57s/it]                                                                                                                                                                                                                                                              {'loss': '0.8035', 'grad_norm': '0.7216', 'learning_rate': '9.62e-05', 'ppl': '2.233', 'memory/max_active (GiB)': '3.95', 'memory/max_allocated (GiB)': '3.95', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '205.7', 'tokens/total': 3300876, 'tokens/trainable': 1766296, 'epoch': '0.5632'}
 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                              | 284/505 [44:01<31:33,  8.57s/it] 56%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                             | 285/505 [44:10<32:21,  8.83s/it]                                                                                                                                                                                                                                                              {'loss': '0.7787', 'grad_norm': '0.7762', 'learning_rate': '9.551e-05', 'ppl': '2.179', 'memory/max_active (GiB)': '3.95', 'memory/max_allocated (GiB)': '3.95', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '125.4', 'tokens/total': 3314443, 'tokens/trainable': 1774494, 'epoch': '0.5652'}
 56%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                             | 285/505 [44:10<32:21,  8.83s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                             | 286/505 [44:17<29:30,  8.08s/it]                                                                                                                                                                                                                                                              {'loss': '0.7882', 'grad_norm': '0.8136', 'learning_rate': '9.482e-05', 'ppl': '2.199', 'memory/max_active (GiB)': '3.78', 'memory/max_allocated (GiB)': '3.78', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '25.04', 'tokens/total': 3323615, 'tokens/trainable': 1778635, 'epoch': '0.5672'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                             | 286/505 [44:17<29:30,  8.08s/it] 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                            | 287/505 [44:27<31:41,  8.72s/it]                                                                                                                                                                                                                                                              {'loss': '0.7918', 'grad_norm': '0.5646', 'learning_rate': '9.413e-05', 'ppl': '2.207', 'memory/max_active (GiB)': '4.23', 'memory/max_allocated (GiB)': '4.23', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '51.51', 'tokens/total': 3338394, 'tokens/trainable': 1787224, 'epoch': '0.5692'}
 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                            | 287/505 [44:27<31:41,  8.72s/it] 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                            | 288/505 [44:36<31:42,  8.77s/it]                                                                                                                                                                                                                                                              {'loss': '0.8983', 'grad_norm': '0.6781', 'learning_rate': '9.345e-05', 'ppl': '2.455', 'memory/max_active (GiB)': '4.23', 'memory/max_allocated (GiB)': '4.23', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '55.6', 'tokens/total': 3351163, 'tokens/trainable': 1794276, 'epoch': '0.5711'}
 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                            | 288/505 [44:36<31:42,  8.77s/it] 57%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                            | 289/505 [44:43<29:49,  8.28s/it]                                                                                                                                                                                                                                                              {'loss': '0.7876', 'grad_norm': '0.6433', 'learning_rate': '9.276e-05', 'ppl': '2.198', 'memory/max_active (GiB)': '3.8', 'memory/max_allocated (GiB)': '3.8', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '57.85', 'tokens/total': 3361554, 'tokens/trainable': 1799623, 'epoch': '0.5731'}
 57%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                            | 289/505 [44:43<29:49,  8.28s/it] 57%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                           | 290/505 [44:50<28:56,  8.08s/it]                                                                                                                                                                                                                                                              {'loss': '0.7376', 'grad_norm': '0.7533', 'learning_rate': '9.207e-05', 'ppl': '2.091', 'memory/max_active (GiB)': '4.02', 'memory/max_allocated (GiB)': '4.02', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '107.4', 'tokens/total': 3372601, 'tokens/trainable': 1804850, 'epoch': '0.5751'}
 57%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                           | 290/505 [44:50<28:56,  8.08s/it] 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                           | 291/505 [44:57<26:50,  7.53s/it]                                                                                                                                                                                                                                                              {'loss': '0.6295', 'grad_norm': '0.7832', 'learning_rate': '9.138e-05', 'ppl': '1.877', 'memory/max_active (GiB)': '3.83', 'memory/max_allocated (GiB)': '3.83', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '34.65', 'tokens/total': 3381610, 'tokens/trainable': 1808689, 'epoch': '0.5771'}
 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                           | 291/505 [44:57<26:50,  7.53s/it] 58%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                          | 292/505 [45:04<26:24,  7.44s/it]                                                                                                                                                                                                                                                              {'loss': '0.6734', 'grad_norm': '0.6111', 'learning_rate': '9.069e-05', 'ppl': '1.961', 'memory/max_active (GiB)': '3.63', 'memory/max_allocated (GiB)': '3.63', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '134.3', 'tokens/total': 3392067, 'tokens/trainable': 1814029, 'epoch': '0.5791'}
 58%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                          | 292/505 [45:04<26:24,  7.44s/it] 58%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                          | 293/505 [45:12<26:30,  7.50s/it]                                                                                                                                                                                                                                                              {'loss': '0.8135', 'grad_norm': '0.6849', 'learning_rate': '9.001e-05', 'ppl': '2.256', 'memory/max_active (GiB)': '3.94', 'memory/max_allocated (GiB)': '3.94', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '56.19', 'tokens/total': 3403094, 'tokens/trainable': 1819979, 'epoch': '0.5811'}
 58%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                          | 293/505 [45:12<26:30,  7.50s/it] 58%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                         | 294/505 [45:19<25:45,  7.32s/it]                                                                                                                                                                                                                                                              {'loss': '0.7521', 'grad_norm': '0.7254', 'learning_rate': '8.932e-05', 'ppl': '2.121', 'memory/max_active (GiB)': '3.85', 'memory/max_allocated (GiB)': '3.85', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '72.54', 'tokens/total': 3413175, 'tokens/trainable': 1824766, 'epoch': '0.583'}
 58%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                         | 294/505 [45:19<25:45,  7.32s/it] 58%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                         | 295/505 [45:27<26:38,  7.61s/it]                                                                                                                                                                                                                                                              {'loss': '0.7352', 'grad_norm': '0.6528', 'learning_rate': '8.863e-05', 'ppl': '2.086', 'memory/max_active (GiB)': '3.83', 'memory/max_allocated (GiB)': '3.83', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '31.15', 'tokens/total': 3425098, 'tokens/trainable': 1831149, 'epoch': '0.585'}
 58%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                         | 295/505 [45:27<26:38,  7.61s/it] 59%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                         | 296/505 [45:39<31:38,  9.08s/it]                                                                                                                                                                                                                                                              {'loss': '0.772', 'grad_norm': '0.4734', 'learning_rate': '8.795e-05', 'ppl': '2.164', 'memory/max_active (GiB)': '4.03', 'memory/max_allocated (GiB)': '4.03', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '59.72', 'tokens/total': 3443153, 'tokens/trainable': 1842148, 'epoch': '0.587'}
 59%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                         | 296/505 [45:39<31:38,  9.08s/it] 59%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                        | 297/505 [45:47<30:15,  8.73s/it]                                                                                                                                                                                                                                                              {'loss': '0.7906', 'grad_norm': '0.7155', 'learning_rate': '8.726e-05', 'ppl': '2.205', 'memory/max_active (GiB)': '3.85', 'memory/max_allocated (GiB)': '3.85', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '56.9', 'tokens/total': 3454644, 'tokens/trainable': 1848200, 'epoch': '0.589'}
 59%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                        | 297/505 [45:47<30:15,  8.73s/it] 59%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                        | 298/505 [45:54<28:16,  8.20s/it]                                                                                                                                                                                                                                                              {'loss': '0.7287', 'grad_norm': '0.691', 'learning_rate': '8.658e-05', 'ppl': '2.072', 'memory/max_active (GiB)': '3.72', 'memory/max_allocated (GiB)': '3.72', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '53.26', 'tokens/total': 3464581, 'tokens/trainable': 1853460, 'epoch': '0.591'}
 59%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                        | 298/505 [45:54<28:16,  8.20s/it] 59%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                       | 299/505 [46:01<27:05,  7.89s/it]                                                                                                                                                                                                                                                              {'loss': '0.7438', 'grad_norm': '0.8411', 'learning_rate': '8.589e-05', 'ppl': '2.104', 'memory/max_active (GiB)': '3.73', 'memory/max_allocated (GiB)': '3.73', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '159.6', 'tokens/total': 3474767, 'tokens/trainable': 1858441, 'epoch': '0.593'}
 59%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                       | 299/505 [46:01<27:05,  7.89s/it] 59%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                       | 300/505 [46:11<28:44,  8.41s/it]                                                                                                                                                                                                                                                              {'loss': '0.7966', 'grad_norm': '0.5732', 'learning_rate': '8.521e-05', 'ppl': '2.218', 'memory/max_active (GiB)': '3.97', 'memory/max_allocated (GiB)': '3.97', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '80.25', 'tokens/total': 3488588, 'tokens/trainable': 1866673, 'epoch': '0.5949'}
 59%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                       | 300/505 [46:11<28:44,  8.41s/it] 60%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                      | 301/505 [46:18<26:58,  7.93s/it]                                                                                                                                                                                                                                                              {'loss': '0.6479', 'grad_norm': '0.7988', 'learning_rate': '8.453e-05', 'ppl': '1.911', 'memory/max_active (GiB)': '3.94', 'memory/max_allocated (GiB)': '3.94', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '300.6', 'tokens/total': 3498488, 'tokens/trainable': 1871632, 'epoch': '0.5969'}
 60%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                      | 301/505 [46:18<26:58,  7.93s/it] 60%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                      | 302/505 [46:25<25:52,  7.65s/it]                                                                                                                                                                                                                                                              {'loss': '0.6623', 'grad_norm': '0.6572', 'learning_rate': '8.385e-05', 'ppl': '1.939', 'memory/max_active (GiB)': '3.71', 'memory/max_allocated (GiB)': '3.71', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '160.3', 'tokens/total': 3508616, 'tokens/trainable': 1876895, 'epoch': '0.5989'}
 60%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                      | 302/505 [46:25<25:52,  7.65s/it] 60%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                      | 303/505 [46:32<25:23,  7.54s/it]                                                                                                                                                                                                                                                              {'loss': '0.7638', 'grad_norm': '0.6701', 'learning_rate': '8.316e-05', 'ppl': '2.147', 'memory/max_active (GiB)': '3.65', 'memory/max_allocated (GiB)': '3.65', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '56.3', 'tokens/total': 3518734, 'tokens/trainable': 1881959, 'epoch': '0.6009'}
 60%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                      | 303/505 [46:32<25:23,  7.54s/it] 60%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                     | 304/505 [46:42<27:37,  8.25s/it]                                                                                                                                                                                                                                                              {'loss': '0.6747', 'grad_norm': '0.6194', 'learning_rate': '8.248e-05', 'ppl': '1.963', 'memory/max_active (GiB)': '3.98', 'memory/max_allocated (GiB)': '3.98', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '117.5', 'tokens/total': 3533019, 'tokens/trainable': 1890609, 'epoch': '0.6029'}
 60%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                     | 304/505 [46:42<27:37,  8.25s/it] 60%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                     | 305/505 [46:51<28:26,  8.53s/it]                                                                                                                                                                                                                                                              {'loss': '0.7527', 'grad_norm': '0.5745', 'learning_rate': '8.18e-05', 'ppl': '2.123', 'memory/max_active (GiB)': '3.98', 'memory/max_allocated (GiB)': '3.98', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '58.69', 'tokens/total': 3546421, 'tokens/trainable': 1898271, 'epoch': '0.6049'}
 60%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                     | 305/505 [46:51<28:26,  8.53s/it] 61%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                    | 306/505 [46:58<26:09,  7.89s/it]                                                                                                                                                                                                                                                              {'loss': '0.7463', 'grad_norm': '0.8992', 'learning_rate': '8.113e-05', 'ppl': '2.109', 'memory/max_active (GiB)': '3.71', 'memory/max_allocated (GiB)': '3.71', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '119.2', 'tokens/total': 3555660, 'tokens/trainable': 1902223, 'epoch': '0.6068'}
 61%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                    | 306/505 [46:58<26:09,  7.89s/it] 61%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                    | 307/505 [47:04<25:04,  7.60s/it]                                                                                                                                                                                                                                                              {'loss': '0.6918', 'grad_norm': '0.8595', 'learning_rate': '8.045e-05', 'ppl': '1.997', 'memory/max_active (GiB)': '3.7', 'memory/max_allocated (GiB)': '3.7', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '72.85', 'tokens/total': 3565707, 'tokens/trainable': 1907237, 'epoch': '0.6088'}
 61%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                    | 307/505 [47:04<25:04,  7.60s/it] 61%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                   | 308/505 [47:12<24:38,  7.51s/it]                                                                                                                                                                                                                                                              {'loss': '0.6715', 'grad_norm': '0.5846', 'learning_rate': '7.977e-05', 'ppl': '1.957', 'memory/max_active (GiB)': '3.94', 'memory/max_allocated (GiB)': '3.94', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '81.31', 'tokens/total': 3576281, 'tokens/trainable': 1912933, 'epoch': '0.6108'}
 61%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                   | 308/505 [47:12<24:38,  7.51s/it] 61%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                   | 309/505 [47:20<25:25,  7.79s/it]                                                                                                                                                                                                                                                              {'loss': '0.7167', 'grad_norm': '0.6741', 'learning_rate': '7.91e-05', 'ppl': '2.048', 'memory/max_active (GiB)': '4.19', 'memory/max_allocated (GiB)': '4.19', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '75.39', 'tokens/total': 3588475, 'tokens/trainable': 1919154, 'epoch': '0.6128'}
 61%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                   | 309/505 [47:20<25:25,  7.79s/it] 61%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                   | 310/505 [47:30<26:55,  8.29s/it]                                                                                                                                                                                                                                                              {'loss': '0.6319', 'grad_norm': '0.6413', 'learning_rate': '7.842e-05', 'ppl': '1.881', 'memory/max_active (GiB)': '4.2', 'memory/max_allocated (GiB)': '4.2', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '33.12', 'tokens/total': 3602214, 'tokens/trainable': 1927084, 'epoch': '0.6148'}
 61%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                   | 310/505 [47:30<26:55,  8.29s/it] 62%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                  | 311/505 [47:38<27:12,  8.42s/it]                                                                                                                                                                                                                                                              {'loss': '0.8165', 'grad_norm': '0.6904', 'learning_rate': '7.775e-05', 'ppl': '2.262', 'memory/max_active (GiB)': '4', 'memory/max_allocated (GiB)': '4', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '23.63', 'tokens/total': 3614845, 'tokens/trainable': 1934174, 'epoch': '0.6168'}
 62%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                  | 311/505 [47:38<27:12,  8.42s/it][2026-06-13 17:28:59,706] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

 62%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                  | 312/505 [47:47<27:22,  8.51s/it]                                                                                                                                                                                                                                                              {'loss': '0.8041', 'grad_norm': '0.6233', 'learning_rate': '7.708e-05', 'ppl': '2.235', 'memory/max_active (GiB)': '3.96', 'memory/max_allocated (GiB)': '3.96', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '156', 'tokens/total': 3627464, 'tokens/trainable': 1941342, 'epoch': '0.6187'}
 62%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                  | 312/505 [47:47<27:22,  8.51s/it] 62%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                 | 313/505 [47:55<26:40,  8.33s/it]                                                                                                                                                                                                                                                              {'loss': '0.7481', 'grad_norm': '0.6048', 'learning_rate': '7.64e-05', 'ppl': '2.113', 'memory/max_active (GiB)': '3.85', 'memory/max_allocated (GiB)': '3.85', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '140.6', 'tokens/total': 3638980, 'tokens/trainable': 1947559, 'epoch': '0.6207'}
 62%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                 | 313/505 [47:55<26:40,  8.33s/it] 62%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                 | 314/505 [48:01<23:54,  7.51s/it]                                                                                                                                                                                                                                                              {'loss': '0.8223', 'grad_norm': '0.9699', 'learning_rate': '7.573e-05', 'ppl': '2.276', 'memory/max_active (GiB)': '3.7', 'memory/max_allocated (GiB)': '3.7', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '56.26', 'tokens/total': 3647094, 'tokens/trainable': 1950847, 'epoch': '0.6227'}
 62%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                 | 314/505 [48:01<23:54,  7.51s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                 | 315/505 [48:07<22:40,  7.16s/it]                                                                                                                                                                                                                                                              {'loss': '0.7231', 'grad_norm': '0.6639', 'learning_rate': '7.506e-05', 'ppl': '2.061', 'memory/max_active (GiB)': '3.65', 'memory/max_allocated (GiB)': '3.65', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '88.21', 'tokens/total': 3656404, 'tokens/trainable': 1955080, 'epoch': '0.6247'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                 | 315/505 [48:07<22:40,  7.16s/it] 63%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                | 316/505 [48:18<26:05,  8.28s/it]                                                                                                                                                                                                                                                              {'loss': '0.7494', 'grad_norm': '0.4918', 'learning_rate': '7.44e-05', 'ppl': '2.116', 'memory/max_active (GiB)': '4.08', 'memory/max_allocated (GiB)': '4.08', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '226.9', 'tokens/total': 3672306, 'tokens/trainable': 1964948, 'epoch': '0.6267'}
 63%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                | 316/505 [48:18<26:05,  8.28s/it] 63%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                | 317/505 [48:26<25:42,  8.20s/it]                                                                                                                                                                                                                                                              {'loss': '0.7622', 'grad_norm': '0.6122', 'learning_rate': '7.373e-05', 'ppl': '2.143', 'memory/max_active (GiB)': '3.93', 'memory/max_allocated (GiB)': '3.93', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '153.2', 'tokens/total': 3683908, 'tokens/trainable': 1971413, 'epoch': '0.6287'}
 63%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                | 317/505 [48:26<25:42,  8.20s/it] 63%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                               | 318/505 [48:33<24:11,  7.76s/it]                                                                                                                                                                                                                                                              {'loss': '0.668', 'grad_norm': '0.7693', 'learning_rate': '7.306e-05', 'ppl': '1.95', 'memory/max_active (GiB)': '3.83', 'memory/max_allocated (GiB)': '3.83', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '225.5', 'tokens/total': 3693639, 'tokens/trainable': 1975965, 'epoch': '0.6306'}
 63%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                               | 318/505 [48:33<24:11,  7.76s/it] 63%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                               | 319/505 [48:40<24:06,  7.78s/it]                                                                                                                                                                                                                                                              {'loss': '0.7301', 'grad_norm': '0.641', 'learning_rate': '7.24e-05', 'ppl': '2.075', 'memory/max_active (GiB)': '3.83', 'memory/max_allocated (GiB)': '3.83', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '68.36', 'tokens/total': 3704958, 'tokens/trainable': 1982120, 'epoch': '0.6326'}
 63%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                               | 319/505 [48:40<24:06,  7.78s/it] 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                              | 320/505 [48:47<22:37,  7.34s/it]                                                                                                                                                                                                                                                              {'loss': '0.6241', 'grad_norm': '0.681', 'learning_rate': '7.174e-05', 'ppl': '1.867', 'memory/max_active (GiB)': '3.68', 'memory/max_allocated (GiB)': '3.68', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '67.09', 'tokens/total': 3714072, 'tokens/trainable': 1986415, 'epoch': '0.6346'}
 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                              | 320/505 [48:47<22:37,  7.34s/it] 64%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                              | 321/505 [48:56<23:54,  7.79s/it]                                                                                                                                                                                                                                                              {'loss': '0.7548', 'grad_norm': '0.6003', 'learning_rate': '7.107e-05', 'ppl': '2.127', 'memory/max_active (GiB)': '4.05', 'memory/max_allocated (GiB)': '4.05', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '278.3', 'tokens/total': 3726798, 'tokens/trainable': 1993796, 'epoch': '0.6366'}
 64%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                              | 321/505 [48:56<23:54,  7.79s/it] 64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                              | 322/505 [49:02<22:18,  7.31s/it]                                                                                                                                                                                                                                                              {'loss': '0.7514', 'grad_norm': '0.763', 'learning_rate': '7.041e-05', 'ppl': '2.12', 'memory/max_active (GiB)': '3.65', 'memory/max_allocated (GiB)': '3.65', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '23.73', 'tokens/total': 3735775, 'tokens/trainable': 1998000, 'epoch': '0.6386'}
 64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                              | 322/505 [49:02<22:18,  7.31s/it] 64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                             | 323/505 [49:09<21:46,  7.18s/it]                                                                                                                                                                                                                                                              {'loss': '0.7897', 'grad_norm': '0.666', 'learning_rate': '6.976e-05', 'ppl': '2.203', 'memory/max_active (GiB)': '3.79', 'memory/max_allocated (GiB)': '3.79', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '204.2', 'tokens/total': 3745770, 'tokens/trainable': 2003197, 'epoch': '0.6406'}
 64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                             | 323/505 [49:09<21:46,  7.18s/it] 64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                             | 324/505 [49:18<23:45,  7.88s/it]                                                                                                                                                                                                                                                              {'loss': '0.6654', 'grad_norm': '0.5911', 'learning_rate': '6.91e-05', 'ppl': '1.945', 'memory/max_active (GiB)': '4.23', 'memory/max_allocated (GiB)': '4.23', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '73.31', 'tokens/total': 3759516, 'tokens/trainable': 2011130, 'epoch': '0.6425'}
 64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                             | 324/505 [49:18<23:45,  7.88s/it] 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                            | 325/505 [49:25<22:41,  7.56s/it]                                                                                                                                                                                                                                                              {'loss': '0.6602', 'grad_norm': '0.714', 'learning_rate': '6.844e-05', 'ppl': '1.935', 'memory/max_active (GiB)': '3.72', 'memory/max_allocated (GiB)': '3.72', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '46.7', 'tokens/total': 3769350, 'tokens/trainable': 2015490, 'epoch': '0.6445'}
 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                            | 325/505 [49:25<22:41,  7.56s/it] 65%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                            | 326/505 [49:33<23:22,  7.83s/it]                                                                                                                                                                                                                                                              {'loss': '0.6666', 'grad_norm': '0.5697', 'learning_rate': '6.779e-05', 'ppl': '1.948', 'memory/max_active (GiB)': '3.86', 'memory/max_allocated (GiB)': '3.86', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '35.66', 'tokens/total': 3781519, 'tokens/trainable': 2022167, 'epoch': '0.6465'}
 65%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                            | 326/505 [49:33<23:22,  7.83s/it] 65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                           | 327/505 [49:42<23:40,  7.98s/it]                                                                                                                                                                                                                                                              {'loss': '0.7571', 'grad_norm': '0.5547', 'learning_rate': '6.714e-05', 'ppl': '2.132', 'memory/max_active (GiB)': '3.71', 'memory/max_allocated (GiB)': '3.71', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '164.2', 'tokens/total': 3793568, 'tokens/trainable': 2028794, 'epoch': '0.6485'}
 65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                           | 327/505 [49:42<23:40,  7.98s/it] 65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                           | 328/505 [49:49<22:43,  7.70s/it]                                                                                                                                                                                                                                                              {'loss': '0.7745', 'grad_norm': '0.776', 'learning_rate': '6.648e-05', 'ppl': '2.169', 'memory/max_active (GiB)': '3.85', 'memory/max_allocated (GiB)': '3.85', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '242.1', 'tokens/total': 3803941, 'tokens/trainable': 2034054, 'epoch': '0.6505'}
 65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                           | 328/505 [49:49<22:43,  7.70s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                           | 329/505 [50:00<25:39,  8.75s/it]                                                                                                                                                                                                                                                              {'loss': '0.7297', 'grad_norm': '0.5503', 'learning_rate': '6.583e-05', 'ppl': '2.075', 'memory/max_active (GiB)': '4.01', 'memory/max_allocated (GiB)': '4.01', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '173.9', 'tokens/total': 3820185, 'tokens/trainable': 2044231, 'epoch': '0.6525'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                           | 329/505 [50:00<25:39,  8.75s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                          | 330/505 [50:07<23:50,  8.17s/it]                                                                                                                                                                                                                                                              {'loss': '0.7825', 'grad_norm': '0.7301', 'learning_rate': '6.519e-05', 'ppl': '2.187', 'memory/max_active (GiB)': '3.7', 'memory/max_allocated (GiB)': '3.7', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '80.04', 'tokens/total': 3830198, 'tokens/trainable': 2048988, 'epoch': '0.6544'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                          | 330/505 [50:07<23:50,  8.17s/it] 66%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                          | 331/505 [50:17<25:21,  8.74s/it]                                                                                                                                                                                                                                                              {'loss': '0.6196', 'grad_norm': '0.6718', 'learning_rate': '6.454e-05', 'ppl': '1.858', 'memory/max_active (GiB)': '4.02', 'memory/max_allocated (GiB)': '4.02', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '112.7', 'tokens/total': 3844682, 'tokens/trainable': 2057537, 'epoch': '0.6564'}
 66%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                          | 331/505 [50:17<25:21,  8.74s/it] 66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                         | 332/505 [50:24<23:35,  8.18s/it]                                                                                                                                                                                                                                                              {'loss': '0.8693', 'grad_norm': '0.6947', 'learning_rate': '6.389e-05', 'ppl': '2.385', 'memory/max_active (GiB)': '3.69', 'memory/max_allocated (GiB)': '3.69', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '62.14', 'tokens/total': 3854654, 'tokens/trainable': 2062403, 'epoch': '0.6584'}
 66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                         | 332/505 [50:24<23:35,  8.18s/it] 66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                         | 333/505 [50:31<23:02,  8.04s/it]                                                                                                                                                                                                                                                              {'loss': '0.6957', 'grad_norm': '0.5614', 'learning_rate': '6.325e-05', 'ppl': '2.005', 'memory/max_active (GiB)': '3.63', 'memory/max_allocated (GiB)': '3.63', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '140.6', 'tokens/total': 3865658, 'tokens/trainable': 2068502, 'epoch': '0.6604'}
 66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                         | 333/505 [50:31<23:02,  8.04s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                        | 334/505 [50:38<21:19,  7.48s/it]                                                                                                                                                                                                                                                              {'loss': '0.7889', 'grad_norm': '0.7243', 'learning_rate': '6.261e-05', 'ppl': '2.201', 'memory/max_active (GiB)': '3.63', 'memory/max_allocated (GiB)': '3.63', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '73.64', 'tokens/total': 3874706, 'tokens/trainable': 2072739, 'epoch': '0.6624'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                        | 334/505 [50:38<21:19,  7.48s/it][2026-06-13 17:32:03,571] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                        | 335/505 [50:52<27:11,  9.60s/it]                                                                                                                                                                                                                                                              {'loss': '0.6973', 'grad_norm': '0.4377', 'learning_rate': '6.197e-05', 'ppl': '2.008', 'memory/max_active (GiB)': '4.02', 'memory/max_allocated (GiB)': '4.02', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '76.85', 'tokens/total': 3894485, 'tokens/trainable': 2085283, 'epoch': '0.6644'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                        | 335/505 [50:52<27:11,  9.60s/it] 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                        | 336/505 [51:00<25:46,  9.15s/it]                                                                                                                                                                                                                                                              {'loss': '0.6991', 'grad_norm': '0.6017', 'learning_rate': '6.133e-05', 'ppl': '2.012', 'memory/max_active (GiB)': '3.83', 'memory/max_allocated (GiB)': '3.83', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '71.81', 'tokens/total': 3906227, 'tokens/trainable': 2091572, 'epoch': '0.6663'}
 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                        | 336/505 [51:00<25:46,  9.15s/it] 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                       | 337/505 [51:07<23:57,  8.55s/it]                                                                                                                                                                                                                                                              {'loss': '0.8241', 'grad_norm': '0.7239', 'learning_rate': '6.07e-05', 'ppl': '2.28', 'memory/max_active (GiB)': '3.87', 'memory/max_allocated (GiB)': '3.87', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '73.27', 'tokens/total': 3916511, 'tokens/trainable': 2096413, 'epoch': '0.6683'}
 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                       | 337/505 [51:07<23:57,  8.55s/it] 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                       | 338/505 [51:14<21:59,  7.90s/it]                                                                                                                                                                                                                                                              {'loss': '0.9351', 'grad_norm': '0.793', 'learning_rate': '6.006e-05', 'ppl': '2.548', 'memory/max_active (GiB)': '3.71', 'memory/max_allocated (GiB)': '3.71', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '95.43', 'tokens/total': 3925611, 'tokens/trainable': 2100930, 'epoch': '0.6703'}
 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                       | 338/505 [51:14<21:59,  7.90s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                      | 339/505 [51:23<23:12,  8.39s/it]                                                                                                                                                                                                                                                              {'loss': '0.6995', 'grad_norm': '0.6339', 'learning_rate': '5.943e-05', 'ppl': '2.013', 'memory/max_active (GiB)': '4.05', 'memory/max_allocated (GiB)': '4.05', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '115.8', 'tokens/total': 3939353, 'tokens/trainable': 2109148, 'epoch': '0.6723'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                      | 339/505 [51:23<23:12,  8.39s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                      | 340/505 [51:30<21:23,  7.78s/it]                                                                                                                                                                                                                                                              {'loss': '0.7332', 'grad_norm': '0.9024', 'learning_rate': '5.88e-05', 'ppl': '2.082', 'memory/max_active (GiB)': '3.7', 'memory/max_allocated (GiB)': '3.7', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '67.79', 'tokens/total': 3948672, 'tokens/trainable': 2113681, 'epoch': '0.6743'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                      | 340/505 [51:30<21:23,  7.78s/it] 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                     | 341/505 [51:38<21:34,  7.89s/it]                                                                                                                                                                                                                                                              {'loss': '0.6858', 'grad_norm': '0.8266', 'learning_rate': '5.817e-05', 'ppl': '1.985', 'memory/max_active (GiB)': '4.24', 'memory/max_allocated (GiB)': '4.24', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '48.21', 'tokens/total': 3960358, 'tokens/trainable': 2120219, 'epoch': '0.6763'}
 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                     | 341/505 [51:38<21:34,  7.89s/it] 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                     | 342/505 [51:47<22:09,  8.16s/it]                                                                                                                                                                                                                                                              {'loss': '0.6838', 'grad_norm': '0.6113', 'learning_rate': '5.755e-05', 'ppl': '1.981', 'memory/max_active (GiB)': '4.09', 'memory/max_allocated (GiB)': '4.09', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '284.6', 'tokens/total': 3973031, 'tokens/trainable': 2127286, 'epoch': '0.6782'}
 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                     | 342/505 [51:47<22:09,  8.16s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                     | 343/505 [51:54<21:26,  7.94s/it]                                                                                                                                                                                                                                                              {'loss': '0.7178', 'grad_norm': '0.7048', 'learning_rate': '5.692e-05', 'ppl': '2.05', 'memory/max_active (GiB)': '3.78', 'memory/max_allocated (GiB)': '3.78', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '46.76', 'tokens/total': 3983714, 'tokens/trainable': 2132868, 'epoch': '0.6802'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                     | 343/505 [51:54<21:26,  7.94s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                    | 344/505 [52:01<20:14,  7.55s/it]                                                                                                                                                                                                                                                              {'loss': '0.7576', 'grad_norm': '0.7635', 'learning_rate': '5.63e-05', 'ppl': '2.133', 'memory/max_active (GiB)': '3.66', 'memory/max_allocated (GiB)': '3.66', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '77.69', 'tokens/total': 3993216, 'tokens/trainable': 2137210, 'epoch': '0.6822'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                    | 344/505 [52:01<20:14,  7.55s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                    | 345/505 [52:09<20:23,  7.65s/it]                                                                                                                                                                                                                                                              {'loss': '0.7888', 'grad_norm': '0.6876', 'learning_rate': '5.568e-05', 'ppl': '2.201', 'memory/max_active (GiB)': '3.79', 'memory/max_allocated (GiB)': '3.79', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '171.3', 'tokens/total': 4004533, 'tokens/trainable': 2143145, 'epoch': '0.6842'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                    | 345/505 [52:09<20:23,  7.65s/it][2026-06-13 17:33:30,478] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                   | 346/505 [52:16<19:45,  7.46s/it]                                                                                                                                                                                                                                                              {'loss': '0.7351', 'grad_norm': '0.7542', 'learning_rate': '5.506e-05', 'ppl': '2.086', 'memory/max_active (GiB)': '4.01', 'memory/max_allocated (GiB)': '4.01', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '34.65', 'tokens/total': 4014668, 'tokens/trainable': 2148105, 'epoch': '0.6862'}
 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                   | 346/505 [52:16<19:45,  7.46s/it] 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                   | 347/505 [52:23<19:27,  7.39s/it]                                                                                                                                                                                                                                                              {'loss': '0.7417', 'grad_norm': '0.6792', 'learning_rate': '5.445e-05', 'ppl': '2.1', 'memory/max_active (GiB)': '3.82', 'memory/max_allocated (GiB)': '3.82', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '35.3', 'tokens/total': 4025052, 'tokens/trainable': 2153708, 'epoch': '0.6882'}
 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                   | 347/505 [52:23<19:27,  7.39s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                  | 348/505 [52:32<20:39,  7.90s/it]                                                                                                                                                                                                                                                              {'loss': '0.7956', 'grad_norm': '0.7909', 'learning_rate': '5.383e-05', 'ppl': '2.216', 'memory/max_active (GiB)': '3.99', 'memory/max_allocated (GiB)': '3.99', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '47.91', 'tokens/total': 4038220, 'tokens/trainable': 2161120, 'epoch': '0.6901'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                  | 348/505 [52:32<20:39,  7.90s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                  | 349/505 [52:41<21:16,  8.18s/it]                                                                                                                                                                                                                                                              {'loss': '0.793', 'grad_norm': '0.7035', 'learning_rate': '5.322e-05', 'ppl': '2.21', 'memory/max_active (GiB)': '4.01', 'memory/max_allocated (GiB)': '4.01', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '114.9', 'tokens/total': 4051050, 'tokens/trainable': 2168393, 'epoch': '0.6921'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                  | 349/505 [52:41<21:16,  8.18s/it] 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                  | 350/505 [52:50<21:49,  8.45s/it]                                                                                                                                                                                                                                                              {'loss': '0.6428', 'grad_norm': '0.5212', 'learning_rate': '5.261e-05', 'ppl': '1.902', 'memory/max_active (GiB)': '4.04', 'memory/max_allocated (GiB)': '4.04', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '100.5', 'tokens/total': 4064108, 'tokens/trainable': 2175861, 'epoch': '0.6941'}
 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                  | 350/505 [52:50<21:49,  8.45s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 351/505 [53:00<23:09,  9.02s/it]                                                                                                                                                                                                                                                              {'loss': '0.6374', 'grad_norm': '0.5157', 'learning_rate': '5.201e-05', 'ppl': '1.892', 'memory/max_active (GiB)': '4.05', 'memory/max_allocated (GiB)': '4.05', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '165.7', 'tokens/total': 4079076, 'tokens/trainable': 2184944, 'epoch': '0.6961'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 351/505 [53:00<23:09,  9.02s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                 | 352/505 [53:06<20:53,  8.19s/it]                                                                                                                                                                                                                                                              {'loss': '0.8495', 'grad_norm': '0.8765', 'learning_rate': '5.14e-05', 'ppl': '2.339', 'memory/max_active (GiB)': '3.7', 'memory/max_allocated (GiB)': '3.7', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '102.3', 'tokens/total': 4088193, 'tokens/trainable': 2189002, 'epoch': '0.6981'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                 | 352/505 [53:06<20:53,  8.19s/it] 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                | 353/505 [53:15<21:16,  8.40s/it]                                                                                                                                                                                                                                                              {'loss': '0.6794', 'grad_norm': '0.7039', 'learning_rate': '5.08e-05', 'ppl': '1.973', 'memory/max_active (GiB)': '4.23', 'memory/max_allocated (GiB)': '4.23', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '305.3', 'tokens/total': 4100969, 'tokens/trainable': 2196369, 'epoch': '0.7'}
 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                | 353/505 [53:15<21:16,  8.40s/it] 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                | 354/505 [53:23<20:55,  8.31s/it]                                                                                                                                                                                                                                                              {'loss': '0.6087', 'grad_norm': '0.8518', 'learning_rate': '5.02e-05', 'ppl': '1.838', 'memory/max_active (GiB)': '4.04', 'memory/max_allocated (GiB)': '4.04', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '76.09', 'tokens/total': 4112753, 'tokens/trainable': 2202627, 'epoch': '0.702'}
 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                | 354/505 [53:23<20:55,  8.31s/it] 70%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                               | 355/505 [53:31<20:03,  8.02s/it]                                                                                                                                                                                                                                                              {'loss': '0.6831', 'grad_norm': '0.645', 'learning_rate': '4.96e-05', 'ppl': '1.98', 'memory/max_active (GiB)': '3.65', 'memory/max_allocated (GiB)': '3.65', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '65.34', 'tokens/total': 4123348, 'tokens/trainable': 2207853, 'epoch': '0.704'}
 70%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                               | 355/505 [53:31<20:03,  8.02s/it] 70%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                               | 356/505 [53:37<18:37,  7.50s/it]                                                                                                                                                                                                                                                              {'loss': '0.6672', 'grad_norm': '0.7391', 'learning_rate': '4.901e-05', 'ppl': '1.949', 'memory/max_active (GiB)': '3.66', 'memory/max_allocated (GiB)': '3.66', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '72.31', 'tokens/total': 4132346, 'tokens/trainable': 2212181, 'epoch': '0.706'}
 70%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                               | 356/505 [53:37<18:37,  7.50s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                               | 357/505 [53:44<18:03,  7.32s/it]                                                                                                                                                                                                                                                              {'loss': '0.681', 'grad_norm': '0.9078', 'learning_rate': '4.841e-05', 'ppl': '1.976', 'memory/max_active (GiB)': '4.09', 'memory/max_allocated (GiB)': '4.09', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '30', 'tokens/total': 4142340, 'tokens/trainable': 2217350, 'epoch': '0.708'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                               | 357/505 [53:44<18:03,  7.32s/it] 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                              | 358/505 [53:53<19:08,  7.81s/it]                                                                                                                                                                                                                                                              {'loss': '0.7227', 'grad_norm': '0.6627', 'learning_rate': '4.782e-05', 'ppl': '2.06', 'memory/max_active (GiB)': '3.96', 'memory/max_allocated (GiB)': '3.96', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '17.52', 'tokens/total': 4155153, 'tokens/trainable': 2224575, 'epoch': '0.71'}
 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                              | 358/505 [53:53<19:08,  7.81s/it] 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                              | 359/505 [53:59<17:55,  7.37s/it]                                                                                                                                                                                                                                                              {'loss': '0.6878', 'grad_norm': '0.9808', 'learning_rate': '4.724e-05', 'ppl': '1.989', 'memory/max_active (GiB)': '3.63', 'memory/max_allocated (GiB)': '3.63', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '112.4', 'tokens/total': 4164271, 'tokens/trainable': 2228777, 'epoch': '0.7119'}
 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                              | 359/505 [53:59<17:55,  7.37s/it] 71%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                             | 360/505 [54:07<17:47,  7.36s/it]                                                                                                                                                                                                                                                              {'loss': '0.7324', 'grad_norm': '0.6822', 'learning_rate': '4.665e-05', 'ppl': '2.08', 'memory/max_active (GiB)': '3.75', 'memory/max_allocated (GiB)': '3.75', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '103.1', 'tokens/total': 4174890, 'tokens/trainable': 2234298, 'epoch': '0.7139'}
 71%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                             | 360/505 [54:07<17:47,  7.36s/it] 71%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                             | 361/505 [54:15<18:25,  7.68s/it]                                                                                                                                                                                                                                                              {'loss': '0.7559', 'grad_norm': '0.5627', 'learning_rate': '4.607e-05', 'ppl': '2.13', 'memory/max_active (GiB)': '3.68', 'memory/max_allocated (GiB)': '3.68', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '74.63', 'tokens/total': 4187065, 'tokens/trainable': 2240638, 'epoch': '0.7159'}
 71%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                             | 361/505 [54:15<18:25,  7.68s/it] 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                             | 362/505 [54:23<18:14,  7.65s/it]                                                                                                                                                                                                                                                              {'loss': '0.6341', 'grad_norm': '0.6593', 'learning_rate': '4.549e-05', 'ppl': '1.885', 'memory/max_active (GiB)': '4.05', 'memory/max_allocated (GiB)': '4.05', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '298.5', 'tokens/total': 4198009, 'tokens/trainable': 2246397, 'epoch': '0.7179'}
 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                             | 362/505 [54:23<18:14,  7.65s/it] 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                            | 363/505 [54:32<19:07,  8.08s/it]                                                                                                                                                                                                                                                              {'loss': '0.5299', 'grad_norm': '0.7641', 'learning_rate': '4.491e-05', 'ppl': '1.699', 'memory/max_active (GiB)': '3.94', 'memory/max_allocated (GiB)': '3.94', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '89.46', 'tokens/total': 4211051, 'tokens/trainable': 2253597, 'epoch': '0.7199'}
 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                            | 363/505 [54:32<19:07,  8.08s/it] 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                            | 364/505 [54:39<18:18,  7.79s/it]                                                                                                                                                                                                                                                              {'loss': '0.6734', 'grad_norm': '0.7919', 'learning_rate': '4.434e-05', 'ppl': '1.961', 'memory/max_active (GiB)': '3.82', 'memory/max_allocated (GiB)': '3.82', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '71.76', 'tokens/total': 4221488, 'tokens/trainable': 2258946, 'epoch': '0.7219'}
 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                            | 364/505 [54:39<18:18,  7.79s/it] 72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                           | 365/505 [54:45<17:09,  7.36s/it]                                                                                                                                                                                                                                                              {'loss': '0.6427', 'grad_norm': '0.7723', 'learning_rate': '4.376e-05', 'ppl': '1.902', 'memory/max_active (GiB)': '3.71', 'memory/max_allocated (GiB)': '3.71', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '73.99', 'tokens/total': 4230514, 'tokens/trainable': 2263282, 'epoch': '0.7238'}
 72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                           | 365/505 [54:45<17:09,  7.36s/it] 72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                           | 366/505 [54:54<17:47,  7.68s/it]                                                                                                                                                                                                                                                              {'loss': '0.7343', 'grad_norm': '0.8434', 'learning_rate': '4.319e-05', 'ppl': '2.084', 'memory/max_active (GiB)': '4.28', 'memory/max_allocated (GiB)': '4.28', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '24.66', 'tokens/total': 4242673, 'tokens/trainable': 2269500, 'epoch': '0.7258'}
 72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                           | 366/505 [54:54<17:47,  7.68s/it] 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                          | 367/505 [55:01<17:18,  7.53s/it]                                                                                                                                                                                                                                                              {'loss': '0.6688', 'grad_norm': '0.8539', 'learning_rate': '4.263e-05', 'ppl': '1.952', 'memory/max_active (GiB)': '3.72', 'memory/max_allocated (GiB)': '3.72', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '143.4', 'tokens/total': 4252983, 'tokens/trainable': 2274801, 'epoch': '0.7278'}
 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                          | 367/505 [55:01<17:18,  7.53s/it] 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                          | 368/505 [55:10<18:09,  7.96s/it]                                                                                                                                                                                                                                                              {'loss': '0.6581', 'grad_norm': '0.5609', 'learning_rate': '4.206e-05', 'ppl': '1.931', 'memory/max_active (GiB)': '3.97', 'memory/max_allocated (GiB)': '3.97', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '111.1', 'tokens/total': 4265866, 'tokens/trainable': 2282008, 'epoch': '0.7298'}
 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                          | 368/505 [55:10<18:09,  7.96s/it] 73%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                          | 369/505 [55:19<18:37,  8.22s/it]                                                                                                                                                                                                                                                              {'loss': '0.7448', 'grad_norm': '0.6515', 'learning_rate': '4.15e-05', 'ppl': '2.106', 'memory/max_active (GiB)': '3.94', 'memory/max_allocated (GiB)': '3.94', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '46.94', 'tokens/total': 4278702, 'tokens/trainable': 2289131, 'epoch': '0.7318'}
 73%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                          | 369/505 [55:19<18:37,  8.22s/it] 73%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                         | 370/505 [55:27<18:44,  8.33s/it]                                                                                                                                                                                                                                                              {'loss': '0.7002', 'grad_norm': '0.6056', 'learning_rate': '4.094e-05', 'ppl': '2.014', 'memory/max_active (GiB)': '3.98', 'memory/max_allocated (GiB)': '3.98', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '133.8', 'tokens/total': 4291238, 'tokens/trainable': 2296330, 'epoch': '0.7338'}
 73%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                         | 370/505 [55:27<18:44,  8.33s/it] 73%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                         | 371/505 [55:33<17:01,  7.62s/it]                                                                                                                                                                                                                                                              {'loss': '0.7513', 'grad_norm': '0.8751', 'learning_rate': '4.039e-05', 'ppl': '2.12', 'memory/max_active (GiB)': '3.63', 'memory/max_allocated (GiB)': '3.63', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '82.14', 'tokens/total': 4299942, 'tokens/trainable': 2300128, 'epoch': '0.7357'}
 73%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                         | 371/505 [55:33<17:01,  7.62s/it] 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                        | 372/505 [55:41<16:45,  7.56s/it]                                                                                                                                                                                                                                                              {'loss': '0.7712', 'grad_norm': '0.741', 'learning_rate': '3.983e-05', 'ppl': '2.162', 'memory/max_active (GiB)': '3.72', 'memory/max_allocated (GiB)': '3.72', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '150.1', 'tokens/total': 4310623, 'tokens/trainable': 2305324, 'epoch': '0.7377'}
 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                        | 372/505 [55:41<16:45,  7.56s/it] 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                        | 373/505 [55:47<16:06,  7.32s/it]                                                                                                                                                                                                                                                              {'loss': '0.6956', 'grad_norm': '0.6709', 'learning_rate': '3.928e-05', 'ppl': '2.005', 'memory/max_active (GiB)': '3.57', 'memory/max_allocated (GiB)': '3.57', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '88', 'tokens/total': 4320011, 'tokens/trainable': 2309730, 'epoch': '0.7397'}
 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                        | 373/505 [55:47<16:06,  7.32s/it][2026-06-13 17:37:10,201] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

 74%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                       | 374/505 [55:55<16:00,  7.33s/it]                                                                                                                                                                                                                                                              {'loss': '0.7464', 'grad_norm': '0.9395', 'learning_rate': '3.874e-05', 'ppl': '2.109', 'memory/max_active (GiB)': '4.04', 'memory/max_allocated (GiB)': '4.04', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '108.6', 'tokens/total': 4330671, 'tokens/trainable': 2315250, 'epoch': '0.7417'}
 74%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                       | 374/505 [55:55<16:00,  7.33s/it] 74%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                       | 375/505 [56:03<16:44,  7.73s/it]                                                                                                                                                                                                                                                              {'loss': '0.6883', 'grad_norm': '0.6136', 'learning_rate': '3.819e-05', 'ppl': '1.99', 'memory/max_active (GiB)': '3.97', 'memory/max_allocated (GiB)': '3.97', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '187.3', 'tokens/total': 4343079, 'tokens/trainable': 2322242, 'epoch': '0.7437'}
 74%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                       | 375/505 [56:03<16:44,  7.73s/it] 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                       | 376/505 [56:11<16:38,  7.74s/it]                                                                                                                                                                                                                                                              {'loss': '0.7385', 'grad_norm': '0.7225', 'learning_rate': '3.765e-05', 'ppl': '2.093', 'memory/max_active (GiB)': '3.9', 'memory/max_allocated (GiB)': '3.9', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '39.73', 'tokens/total': 4354344, 'tokens/trainable': 2328057, 'epoch': '0.7457'}
 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                       | 376/505 [56:11<16:38,  7.74s/it] 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                      | 377/505 [56:19<16:28,  7.72s/it]                                                                                                                                                                                                                                                              {'loss': '0.6161', 'grad_norm': '0.6475', 'learning_rate': '3.711e-05', 'ppl': '1.852', 'memory/max_active (GiB)': '3.99', 'memory/max_allocated (GiB)': '3.99', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '45.18', 'tokens/total': 4365655, 'tokens/trainable': 2333791, 'epoch': '0.7476'}
 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                      | 377/505 [56:19<16:28,  7.72s/it] 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                      | 378/505 [56:26<15:59,  7.56s/it]                                                                                                                                                                                                                                                              {'loss': '0.6571', 'grad_norm': '0.6418', 'learning_rate': '3.658e-05', 'ppl': '1.929', 'memory/max_active (GiB)': '3.69', 'memory/max_allocated (GiB)': '3.69', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '62.74', 'tokens/total': 4375905, 'tokens/trainable': 2339298, 'epoch': '0.7496'}
 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                      | 378/505 [56:26<15:59,  7.56s/it] 75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                     | 379/505 [56:34<16:24,  7.82s/it]                                                                                                                                                                                                                                                              {'loss': '0.6346', 'grad_norm': '0.5622', 'learning_rate': '3.605e-05', 'ppl': '1.886', 'memory/max_active (GiB)': '3.74', 'memory/max_allocated (GiB)': '3.74', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '62.73', 'tokens/total': 4387958, 'tokens/trainable': 2346032, 'epoch': '0.7516'}
 75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                     | 379/505 [56:34<16:24,  7.82s/it] 75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                     | 380/505 [56:40<15:07,  7.26s/it]                                                                                                                                                                                                                                                              {'loss': '0.7647', 'grad_norm': '0.7847', 'learning_rate': '3.552e-05', 'ppl': '2.148', 'memory/max_active (GiB)': '3.7', 'memory/max_allocated (GiB)': '3.7', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '48.83', 'tokens/total': 4396514, 'tokens/trainable': 2350457, 'epoch': '0.7536'}
 75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                     | 380/505 [56:40<15:07,  7.26s/it] 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                    | 381/505 [56:47<14:41,  7.11s/it]                                                                                                                                                                                                                                                              {'loss': '0.7099', 'grad_norm': '0.7886', 'learning_rate': '3.499e-05', 'ppl': '2.034', 'memory/max_active (GiB)': '3.72', 'memory/max_allocated (GiB)': '3.72', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '142', 'tokens/total': 4406331, 'tokens/trainable': 2355467, 'epoch': '0.7556'}
 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                    | 381/505 [56:47<14:41,  7.11s/it] 76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                    | 382/505 [56:54<14:41,  7.17s/it]                                                                                                                                                                                                                                                              {'loss': '0.7295', 'grad_norm': '0.8332', 'learning_rate': '3.447e-05', 'ppl': '2.074', 'memory/max_active (GiB)': '4.02', 'memory/max_allocated (GiB)': '4.02', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '106.9', 'tokens/total': 4416894, 'tokens/trainable': 2360566, 'epoch': '0.7576'}
 76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                    | 382/505 [56:54<14:41,  7.17s/it] 76%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                    | 383/505 [57:03<15:34,  7.66s/it]                                                                                                                                                                                                                                                              {'loss': '0.7313', 'grad_norm': '0.6769', 'learning_rate': '3.395e-05', 'ppl': '2.078', 'memory/max_active (GiB)': '4.04', 'memory/max_allocated (GiB)': '4.04', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '33.61', 'tokens/total': 4429733, 'tokens/trainable': 2367651, 'epoch': '0.7595'}
 76%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                    | 383/505 [57:03<15:34,  7.66s/it] 76%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                   | 384/505 [57:10<15:01,  7.45s/it]                                                                                                                                                                                                                                                              {'loss': '0.6948', 'grad_norm': '0.6601', 'learning_rate': '3.343e-05', 'ppl': '2.003', 'memory/max_active (GiB)': '3.64', 'memory/max_allocated (GiB)': '3.64', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '83.01', 'tokens/total': 4439962, 'tokens/trainable': 2372434, 'epoch': '0.7615'}
 76%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                   | 384/505 [57:10<15:01,  7.45s/it] 76%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                   | 385/505 [57:20<16:19,  8.16s/it]                                                                                                                                                                                                                                                              {'loss': '0.8311', 'grad_norm': '0.6713', 'learning_rate': '3.292e-05', 'ppl': '2.296', 'memory/max_active (GiB)': '4.24', 'memory/max_allocated (GiB)': '4.24', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '31.36', 'tokens/total': 4454175, 'tokens/trainable': 2380669, 'epoch': '0.7635'}
 76%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                   | 385/505 [57:20<16:19,  8.16s/it] 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                  | 386/505 [57:29<16:30,  8.32s/it]                                                                                                                                                                                                                                                              {'loss': '0.8722', 'grad_norm': '0.833', 'learning_rate': '3.241e-05', 'ppl': '2.392', 'memory/max_active (GiB)': '3.96', 'memory/max_allocated (GiB)': '3.96', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '58.37', 'tokens/total': 4466606, 'tokens/trainable': 2387211, 'epoch': '0.7655'}
 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                  | 386/505 [57:29<16:30,  8.32s/it] 77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                  | 387/505 [57:36<15:40,  7.97s/it]                                                                                                                                                                                                                                                              {'loss': '0.6656', 'grad_norm': '0.6579', 'learning_rate': '3.19e-05', 'ppl': '1.946', 'memory/max_active (GiB)': '3.66', 'memory/max_allocated (GiB)': '3.66', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '95.2', 'tokens/total': 4477026, 'tokens/trainable': 2392239, 'epoch': '0.7675'}
 77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                  | 387/505 [57:36<15:40,  7.97s/it][2026-06-13 17:38:58,171] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

 77%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                 | 388/505 [57:44<15:32,  7.97s/it]                                                                                                                                                                                                                                                              {'loss': '0.7822', 'grad_norm': '0.7131', 'learning_rate': '3.139e-05', 'ppl': '2.186', 'memory/max_active (GiB)': '3.92', 'memory/max_allocated (GiB)': '3.92', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '87.31', 'tokens/total': 4488501, 'tokens/trainable': 2398208, 'epoch': '0.7695'}
 77%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                 | 388/505 [57:44<15:32,  7.97s/it] 77%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                 | 389/505 [57:51<14:45,  7.63s/it]                                                                                                                                                                                                                                                              {'loss': '0.712', 'grad_norm': '0.8212', 'learning_rate': '3.089e-05', 'ppl': '2.038', 'memory/max_active (GiB)': '3.62', 'memory/max_allocated (GiB)': '3.62', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '19.58', 'tokens/total': 4498352, 'tokens/trainable': 2402841, 'epoch': '0.7714'}
 77%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                 | 389/505 [57:51<14:45,  7.63s/it] 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                 | 390/505 [57:59<14:52,  7.76s/it]                                                                                                                                                                                                                                                              {'loss': '0.7921', 'grad_norm': '0.6234', 'learning_rate': '3.04e-05', 'ppl': '2.208', 'memory/max_active (GiB)': '3.67', 'memory/max_allocated (GiB)': '3.67', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '150.8', 'tokens/total': 4510037, 'tokens/trainable': 2409256, 'epoch': '0.7734'}
 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                 | 390/505 [57:59<14:52,  7.76s/it] 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                | 391/505 [58:07<15:05,  7.94s/it]                                                                                                                                                                                                                                                              {'loss': '0.7014', 'grad_norm': '0.5802', 'learning_rate': '2.99e-05', 'ppl': '2.017', 'memory/max_active (GiB)': '3.94', 'memory/max_allocated (GiB)': '3.94', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '224.8', 'tokens/total': 4522224, 'tokens/trainable': 2416441, 'epoch': '0.7754'}
 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                | 391/505 [58:07<15:05,  7.94s/it] 78%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                | 392/505 [58:16<15:24,  8.18s/it]                                                                                                                                                                                                                                                              {'loss': '0.7343', 'grad_norm': '0.8924', 'learning_rate': '2.941e-05', 'ppl': '2.084', 'memory/max_active (GiB)': '3.98', 'memory/max_allocated (GiB)': '3.98', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '26.55', 'tokens/total': 4534816, 'tokens/trainable': 2423261, 'epoch': '0.7774'}
 78%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                | 392/505 [58:16<15:24,  8.18s/it] 78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                               | 393/505 [58:21<13:40,  7.32s/it]                                                                                                                                                                                                                                                              {'loss': '0.6555', 'grad_norm': '0.8845', 'learning_rate': '2.892e-05', 'ppl': '1.926', 'memory/max_active (GiB)': '3.5', 'memory/max_allocated (GiB)': '3.5', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '112.2', 'tokens/total': 4542534, 'tokens/trainable': 2426118, 'epoch': '0.7794'}
 78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                               | 393/505 [58:21<13:40,  7.32s/it] 78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                               | 394/505 [58:29<13:48,  7.47s/it]                                                                                                                                                                                                                                                              {'loss': '0.7035', 'grad_norm': '0.6413', 'learning_rate': '2.844e-05', 'ppl': '2.021', 'memory/max_active (GiB)': '3.78', 'memory/max_allocated (GiB)': '3.78', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '41.42', 'tokens/total': 4553852, 'tokens/trainable': 2431581, 'epoch': '0.7814'}
 78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                               | 394/505 [58:29<13:48,  7.47s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                              | 395/505 [58:37<13:49,  7.54s/it]                                                                                                                                                                                                                                                              {'loss': '0.7127', 'grad_norm': '0.695', 'learning_rate': '2.796e-05', 'ppl': '2.039', 'memory/max_active (GiB)': '3.79', 'memory/max_allocated (GiB)': '3.79', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '50.24', 'tokens/total': 4565044, 'tokens/trainable': 2437491, 'epoch': '0.7833'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                              | 395/505 [58:37<13:49,  7.54s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                              | 396/505 [58:45<14:24,  7.93s/it]                                                                                                                                                                                                                                                              {'loss': '0.6705', 'grad_norm': '0.6591', 'learning_rate': '2.748e-05', 'ppl': '1.955', 'memory/max_active (GiB)': '4', 'memory/max_allocated (GiB)': '4', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '70.87', 'tokens/total': 4577837, 'tokens/trainable': 2444164, 'epoch': '0.7853'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                              | 396/505 [58:45<14:24,  7.93s/it] 79%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                              | 397/505 [58:52<13:27,  7.48s/it]                                                                                                                                                                                                                                                              {'loss': '0.6929', 'grad_norm': '0.7231', 'learning_rate': '2.701e-05', 'ppl': '2', 'memory/max_active (GiB)': '3.68', 'memory/max_allocated (GiB)': '3.68', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '66.84', 'tokens/total': 4587069, 'tokens/trainable': 2448733, 'epoch': '0.7873'}
 79%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                              | 397/505 [58:52<13:27,  7.48s/it] 79%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                             | 398/505 [58:58<12:43,  7.14s/it]                                                                                                                                                                                                                                                              {'loss': '0.7093', 'grad_norm': '0.7251', 'learning_rate': '2.654e-05', 'ppl': '2.032', 'memory/max_active (GiB)': '3.63', 'memory/max_allocated (GiB)': '3.63', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '59.56', 'tokens/total': 4596248, 'tokens/trainable': 2453194, 'epoch': '0.7893'}
 79%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                             | 398/505 [58:58<12:43,  7.14s/it] 79%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                             | 399/505 [59:09<14:41,  8.32s/it]                                                                                                                                                                                                                                                              {'loss': '0.6631', 'grad_norm': '0.607', 'learning_rate': '2.607e-05', 'ppl': '1.941', 'memory/max_active (GiB)': '4.2', 'memory/max_allocated (GiB)': '4.2', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '66.99', 'tokens/total': 4612275, 'tokens/trainable': 2463067, 'epoch': '0.7913'}
 79%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                             | 399/505 [59:09<14:41,  8.32s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                            | 400/505 [59:21<16:06,  9.21s/it]                                                                                                                                                                                                                                                              {'loss': '0.6678', 'grad_norm': '0.5242', 'learning_rate': '2.561e-05', 'ppl': '1.95', 'memory/max_active (GiB)': '4.2', 'memory/max_allocated (GiB)': '4.2', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '47.92', 'tokens/total': 4628598, 'tokens/trainable': 2473048, 'epoch': '0.7933'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                            | 400/505 [59:21<16:06,  9.21s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                            | 401/505 [59:28<14:56,  8.62s/it]                                                                                                                                                                                                                                                              {'loss': '0.7568', 'grad_norm': '0.8715', 'learning_rate': '2.515e-05', 'ppl': '2.131', 'memory/max_active (GiB)': '4.04', 'memory/max_allocated (GiB)': '4.04', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '87.6', 'tokens/total': 4639063, 'tokens/trainable': 2478092, 'epoch': '0.7952'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                            | 401/505 [59:28<14:56,  8.62s/it] 80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                           | 402/505 [59:36<14:36,  8.51s/it]                                                                                                                                                                                                                                                              {'loss': '0.6606', 'grad_norm': '0.758', 'learning_rate': '2.469e-05', 'ppl': '1.936', 'memory/max_active (GiB)': '3.99', 'memory/max_allocated (GiB)': '3.99', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '72.66', 'tokens/total': 4651124, 'tokens/trainable': 2485081, 'epoch': '0.7972'}
 80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                           | 402/505 [59:36<14:36,  8.51s/it] 80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                           | 403/505 [59:48<16:10,  9.52s/it]                                                                                                                                                                                                                                                              {'loss': '0.6995', 'grad_norm': '0.4304', 'learning_rate': '2.424e-05', 'ppl': '2.013', 'memory/max_active (GiB)': '4', 'memory/max_allocated (GiB)': '4', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '74.65', 'tokens/total': 4668248, 'tokens/trainable': 2495723, 'epoch': '0.7992'}
 80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                           | 403/505 [59:48<16:10,  9.52s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                           | 404/505 [59:57<15:32,  9.23s/it]                                                                                                                                                                                                                                                              {'loss': '0.7305', 'grad_norm': '0.5975', 'learning_rate': '2.379e-05', 'ppl': '2.076', 'memory/max_active (GiB)': '3.88', 'memory/max_allocated (GiB)': '3.88', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '59.63', 'tokens/total': 4680615, 'tokens/trainable': 2502760, 'epoch': '0.8012'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                           | 404/505 [59:57<15:32,  9.23s/it] 80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                          | 405/505 [1:00:03<14:12,  8.53s/it]                                                                                                                                                                                                                                                              {'loss': '0.8369', 'grad_norm': '0.9334', 'learning_rate': '2.335e-05', 'ppl': '2.309', 'memory/max_active (GiB)': '3.64', 'memory/max_allocated (GiB)': '3.64', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '143', 'tokens/total': 4690586, 'tokens/trainable': 2507301, 'epoch': '0.8032'}
 80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                          | 405/505 [1:00:03<14:12,  8.53s/it] 80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                         | 406/505 [1:00:10<13:04,  7.92s/it]                                                                                                                                                                                                                                                              {'loss': '0.7477', 'grad_norm': '0.8936', 'learning_rate': '2.29e-05', 'ppl': '2.112', 'memory/max_active (GiB)': '3.83', 'memory/max_allocated (GiB)': '3.83', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '34.26', 'tokens/total': 4699889, 'tokens/trainable': 2511746, 'epoch': '0.8052'}
 80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                         | 406/505 [1:00:10<13:04,  7.92s/it] 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                         | 407/505 [1:00:18<13:09,  8.05s/it]                                                                                                                                                                                                                                                              {'loss': '0.7576', 'grad_norm': '0.7151', 'learning_rate': '2.247e-05', 'ppl': '2.133', 'memory/max_active (GiB)': '3.78', 'memory/max_allocated (GiB)': '3.78', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '99.16', 'tokens/total': 4711867, 'tokens/trainable': 2518731, 'epoch': '0.8071'}
 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                         | 407/505 [1:00:18<13:09,  8.05s/it] 81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                         | 408/505 [1:00:25<12:18,  7.61s/it]                                                                                                                                                                                                                                                              {'loss': '0.7864', 'grad_norm': '0.7771', 'learning_rate': '2.203e-05', 'ppl': '2.195', 'memory/max_active (GiB)': '3.62', 'memory/max_allocated (GiB)': '3.62', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '36.5', 'tokens/total': 4721340, 'tokens/trainable': 2523298, 'epoch': '0.8091'}
 81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                         | 408/505 [1:00:25<12:18,  7.61s/it] 81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                        | 409/505 [1:00:32<11:47,  7.37s/it]                                                                                                                                                                                                                                                              {'loss': '0.7731', 'grad_norm': '0.7198', 'learning_rate': '2.16e-05', 'ppl': '2.166', 'memory/max_active (GiB)': '3.71', 'memory/max_allocated (GiB)': '3.71', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '72.49', 'tokens/total': 4731188, 'tokens/trainable': 2528484, 'epoch': '0.8111'}
 81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                        | 409/505 [1:00:32<11:47,  7.37s/it] 81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                        | 410/505 [1:00:39<11:35,  7.32s/it]                                                                                                                                                                                                                                                              {'loss': '0.7604', 'grad_norm': '0.6296', 'learning_rate': '2.118e-05', 'ppl': '2.139', 'memory/max_active (GiB)': '3.65', 'memory/max_allocated (GiB)': '3.65', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '167.5', 'tokens/total': 4741581, 'tokens/trainable': 2534118, 'epoch': '0.8131'}
 81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                        | 410/505 [1:00:39<11:35,  7.32s/it] 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                       | 411/505 [1:00:46<11:13,  7.16s/it]                                                                                                                                                                                                                                                              {'loss': '0.8026', 'grad_norm': '1.034', 'learning_rate': '2.075e-05', 'ppl': '2.231', 'memory/max_active (GiB)': '3.99', 'memory/max_allocated (GiB)': '3.99', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '50.75', 'tokens/total': 4751447, 'tokens/trainable': 2538872, 'epoch': '0.8151'}
 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                       | 411/505 [1:00:46<11:13,  7.16s/it] 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                       | 412/505 [1:00:51<10:10,  6.57s/it]                                                                                                                                                                                                                                                              {'loss': '0.7192', 'grad_norm': '0.9212', 'learning_rate': '2.033e-05', 'ppl': '2.053', 'memory/max_active (GiB)': '3.54', 'memory/max_allocated (GiB)': '3.54', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '132.1', 'tokens/total': 4758858, 'tokens/trainable': 2541729, 'epoch': '0.8171'}
 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                       | 412/505 [1:00:51<10:10,  6.57s/it] 82%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                      | 413/505 [1:01:00<11:19,  7.39s/it]                                                                                                                                                                                                                                                              {'loss': '0.7344', 'grad_norm': '0.5596', 'learning_rate': '1.992e-05', 'ppl': '2.084', 'memory/max_active (GiB)': '4.11', 'memory/max_allocated (GiB)': '4.11', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '90.65', 'tokens/total': 4772414, 'tokens/trainable': 2549782, 'epoch': '0.819'}
 82%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                      | 413/505 [1:01:00<11:19,  7.39s/it][2026-06-13 17:42:27,409] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

 82%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                      | 414/505 [1:01:09<11:53,  7.85s/it]                                                                                                                                                                                                                                                              {'loss': '0.711', 'grad_norm': '0.6131', 'learning_rate': '1.951e-05', 'ppl': '2.036', 'memory/max_active (GiB)': '4.02', 'memory/max_allocated (GiB)': '4.02', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '224.6', 'tokens/total': 4785183, 'tokens/trainable': 2556811, 'epoch': '0.821'}
 82%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                      | 414/505 [1:01:09<11:53,  7.85s/it] 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                      | 415/505 [1:01:18<12:18,  8.20s/it]                                                                                                                                                                                                                                                              {'loss': '0.7802', 'grad_norm': '0.5919', 'learning_rate': '1.91e-05', 'ppl': '2.182', 'memory/max_active (GiB)': '4.01', 'memory/max_allocated (GiB)': '4.01', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '131.2', 'tokens/total': 4798094, 'tokens/trainable': 2564252, 'epoch': '0.823'}
 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                      | 415/505 [1:01:18<12:18,  8.20s/it][2026-06-13 17:42:43,953] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                     | 416/505 [1:01:25<11:31,  7.77s/it]                                                                                                                                                                                                                                                              {'loss': '0.7901', 'grad_norm': '0.9811', 'learning_rate': '1.869e-05', 'ppl': '2.204', 'memory/max_active (GiB)': '3.67', 'memory/max_allocated (GiB)': '3.67', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '129.7', 'tokens/total': 4807819, 'tokens/trainable': 2568843, 'epoch': '0.825'}
 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                     | 416/505 [1:01:25<11:31,  7.77s/it] 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 417/505 [1:01:33<11:45,  8.02s/it]                                                                                                                                                                                                                                                              {'loss': '0.6904', 'grad_norm': '0.632', 'learning_rate': '1.829e-05', 'ppl': '1.995', 'memory/max_active (GiB)': '3.92', 'memory/max_allocated (GiB)': '3.92', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '140.7', 'tokens/total': 4820206, 'tokens/trainable': 2575682, 'epoch': '0.827'}
 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 417/505 [1:01:33<11:45,  8.02s/it] 83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                    | 418/505 [1:01:43<12:09,  8.38s/it]                                                                                                                                                                                                                                                              {'loss': '0.5458', 'grad_norm': '0.6434', 'learning_rate': '1.79e-05', 'ppl': '1.726', 'memory/max_active (GiB)': '3.9', 'memory/max_allocated (GiB)': '3.9', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '15.7', 'tokens/total': 4833624, 'tokens/trainable': 2583709, 'epoch': '0.829'}
 83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                    | 418/505 [1:01:43<12:09,  8.38s/it] 83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                    | 419/505 [1:01:50<11:42,  8.17s/it]                                                                                                                                                                                                                                                              {'loss': '0.8113', 'grad_norm': '0.6756', 'learning_rate': '1.751e-05', 'ppl': '2.251', 'memory/max_active (GiB)': '3.78', 'memory/max_allocated (GiB)': '3.78', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '61.5', 'tokens/total': 4844715, 'tokens/trainable': 2589329, 'epoch': '0.8309'}
 83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                    | 419/505 [1:01:50<11:42,  8.17s/it] 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                   | 420/505 [1:01:57<10:51,  7.66s/it]                                                                                                                                                                                                                                                              {'loss': '0.9016', 'grad_norm': '0.7979', 'learning_rate': '1.712e-05', 'ppl': '2.464', 'memory/max_active (GiB)': '3.77', 'memory/max_allocated (GiB)': '3.77', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '40.59', 'tokens/total': 4854139, 'tokens/trainable': 2593719, 'epoch': '0.8329'}
 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                   | 420/505 [1:01:57<10:51,  7.66s/it] 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                   | 421/505 [1:02:05<10:56,  7.81s/it]                                                                                                                                                                                                                                                              {'loss': '0.6559', 'grad_norm': '0.7851', 'learning_rate': '1.673e-05', 'ppl': '1.927', 'memory/max_active (GiB)': '4.1', 'memory/max_allocated (GiB)': '4.1', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '26.73', 'tokens/total': 4865889, 'tokens/trainable': 2599368, 'epoch': '0.8349'}
 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                   | 421/505 [1:02:05<10:56,  7.81s/it] 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                   | 422/505 [1:02:12<10:23,  7.51s/it]                                                                                                                                                                                                                                                              {'loss': '0.7423', 'grad_norm': '0.7235', 'learning_rate': '1.635e-05', 'ppl': '2.101', 'memory/max_active (GiB)': '3.63', 'memory/max_allocated (GiB)': '3.63', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '31.57', 'tokens/total': 4875771, 'tokens/trainable': 2604205, 'epoch': '0.8369'}
 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                   | 422/505 [1:02:12<10:23,  7.51s/it] 84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                  | 423/505 [1:02:22<11:14,  8.23s/it]                                                                                                                                                                                                                                                              {'loss': '0.6918', 'grad_norm': '0.7613', 'learning_rate': '1.598e-05', 'ppl': '1.997', 'memory/max_active (GiB)': '4.27', 'memory/max_allocated (GiB)': '4.27', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '94.68', 'tokens/total': 4890067, 'tokens/trainable': 2612234, 'epoch': '0.8389'}
 84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                  | 423/505 [1:02:22<11:14,  8.23s/it] 84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 424/505 [1:02:30<11:01,  8.16s/it]                                                                                                                                                                                                                                                              {'loss': '0.6509', 'grad_norm': '0.6549', 'learning_rate': '1.56e-05', 'ppl': '1.917', 'memory/max_active (GiB)': '3.97', 'memory/max_allocated (GiB)': '3.97', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '61.17', 'tokens/total': 4901592, 'tokens/trainable': 2618461, 'epoch': '0.8409'}
 84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 424/505 [1:02:30<11:01,  8.16s/it] 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                 | 425/505 [1:02:41<11:57,  8.97s/it]                                                                                                                                                                                                                                                              {'loss': '0.7751', 'grad_norm': '0.5589', 'learning_rate': '1.524e-05', 'ppl': '2.171', 'memory/max_active (GiB)': '4.24', 'memory/max_allocated (GiB)': '4.24', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '221.9', 'tokens/total': 4917314, 'tokens/trainable': 2626994, 'epoch': '0.8428'}
 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                 | 425/505 [1:02:41<11:57,  8.97s/it] 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                 | 426/505 [1:02:49<11:34,  8.79s/it]                                                                                                                                                                                                                                                              {'loss': '0.6544', 'grad_norm': '0.8352', 'learning_rate': '1.487e-05', 'ppl': '1.924', 'memory/max_active (GiB)': '3.97', 'memory/max_allocated (GiB)': '3.97', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '141.8', 'tokens/total': 4929397, 'tokens/trainable': 2633687, 'epoch': '0.8448'}
 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                 | 426/505 [1:02:49<11:34,  8.79s/it] 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                 | 427/505 [1:02:58<11:23,  8.76s/it]                                                                                                                                                                                                                                                              {'loss': '0.8237', 'grad_norm': '0.6637', 'learning_rate': '1.451e-05', 'ppl': '2.279', 'memory/max_active (GiB)': '4.22', 'memory/max_allocated (GiB)': '4.22', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '126.5', 'tokens/total': 4941971, 'tokens/trainable': 2641107, 'epoch': '0.8468'}
 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                 | 427/505 [1:02:58<11:23,  8.76s/it] 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                | 428/505 [1:03:05<10:37,  8.28s/it]                                                                                                                                                                                                                                                              {'loss': '0.6985', 'grad_norm': '0.6793', 'learning_rate': '1.416e-05', 'ppl': '2.011', 'memory/max_active (GiB)': '3.85', 'memory/max_allocated (GiB)': '3.85', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '79.4', 'tokens/total': 4952361, 'tokens/trainable': 2646479, 'epoch': '0.8488'}
 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                | 428/505 [1:03:05<10:37,  8.28s/it] 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                | 429/505 [1:03:12<10:02,  7.93s/it]                                                                                                                                                                                                                                                              {'loss': '0.6385', 'grad_norm': '0.7565', 'learning_rate': '1.38e-05', 'ppl': '1.894', 'memory/max_active (GiB)': '3.65', 'memory/max_allocated (GiB)': '3.65', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '56.68', 'tokens/total': 4962603, 'tokens/trainable': 2651621, 'epoch': '0.8508'}
 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                | 429/505 [1:03:12<10:02,  7.93s/it] 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                               | 430/505 [1:03:18<09:02,  7.24s/it]                                                                                                                                                                                                                                                              {'loss': '0.7216', 'grad_norm': '0.8706', 'learning_rate': '1.346e-05', 'ppl': '2.058', 'memory/max_active (GiB)': '3.53', 'memory/max_allocated (GiB)': '3.53', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '95.48', 'tokens/total': 4970778, 'tokens/trainable': 2654946, 'epoch': '0.8528'}
 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                               | 430/505 [1:03:18<09:02,  7.24s/it] 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                               | 431/505 [1:03:24<08:44,  7.08s/it]                                                                                                                                                                                                                                                              {'loss': '0.6871', 'grad_norm': '0.7441', 'learning_rate': '1.311e-05', 'ppl': '1.988', 'memory/max_active (GiB)': '3.63', 'memory/max_allocated (GiB)': '3.63', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '53.56', 'tokens/total': 4980424, 'tokens/trainable': 2659856, 'epoch': '0.8547'}
 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                               | 431/505 [1:03:24<08:44,  7.08s/it] 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                              | 432/505 [1:03:31<08:23,  6.89s/it]                                                                                                                                                                                                                                                              {'loss': '0.7045', 'grad_norm': '0.8993', 'learning_rate': '1.277e-05', 'ppl': '2.023', 'memory/max_active (GiB)': '3.78', 'memory/max_allocated (GiB)': '3.78', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '87.02', 'tokens/total': 4989539, 'tokens/trainable': 2663983, 'epoch': '0.8567'}
 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                              | 432/505 [1:03:31<08:23,  6.89s/it] 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                              | 433/505 [1:03:40<09:01,  7.52s/it]                                                                                                                                                                                                                                                              {'loss': '0.7304', 'grad_norm': '0.5847', 'learning_rate': '1.244e-05', 'ppl': '2.076', 'memory/max_active (GiB)': '3.85', 'memory/max_allocated (GiB)': '3.85', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '52.84', 'tokens/total': 5002439, 'tokens/trainable': 2671302, 'epoch': '0.8587'}
 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                              | 433/505 [1:03:40<09:01,  7.52s/it] 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                              | 434/505 [1:03:47<08:51,  7.49s/it]                                                                                                                                                                                                                                                              {'loss': '0.7235', 'grad_norm': '0.8462', 'learning_rate': '1.21e-05', 'ppl': '2.062', 'memory/max_active (GiB)': '3.7', 'memory/max_allocated (GiB)': '3.7', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '11.32', 'tokens/total': 5013175, 'tokens/trainable': 2676767, 'epoch': '0.8607'}
 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                              | 434/505 [1:03:47<08:51,  7.49s/it] 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                             | 435/505 [1:03:53<08:16,  7.09s/it]                                                                                                                                                                                                                                                              {'loss': '0.7457', 'grad_norm': '0.7875', 'learning_rate': '1.178e-05', 'ppl': '2.108', 'memory/max_active (GiB)': '3.58', 'memory/max_allocated (GiB)': '3.58', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '118.2', 'tokens/total': 5022000, 'tokens/trainable': 2680838, 'epoch': '0.8627'}
 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                             | 435/505 [1:03:53<08:16,  7.09s/it] 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                             | 436/505 [1:04:01<08:31,  7.42s/it]                                                                                                                                                                                                                                                              {'loss': '0.5946', 'grad_norm': '0.6674', 'learning_rate': '1.145e-05', 'ppl': '1.812', 'memory/max_active (GiB)': '3.97', 'memory/max_allocated (GiB)': '3.97', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '260', 'tokens/total': 5033850, 'tokens/trainable': 2687049, 'epoch': '0.8647'}
 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                             | 436/505 [1:04:01<08:31,  7.42s/it] 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                            | 437/505 [1:04:09<08:26,  7.45s/it]                                                                                                                                                                                                                                                              {'loss': '0.7115', 'grad_norm': '0.6868', 'learning_rate': '1.114e-05', 'ppl': '2.037', 'memory/max_active (GiB)': '3.8', 'memory/max_allocated (GiB)': '3.8', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '151', 'tokens/total': 5044688, 'tokens/trainable': 2692468, 'epoch': '0.8666'}
 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                            | 437/505 [1:04:09<08:26,  7.45s/it] 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                            | 438/505 [1:04:17<08:21,  7.49s/it]                                                                                                                                                                                                                                                              {'loss': '0.7352', 'grad_norm': '0.857', 'learning_rate': '1.082e-05', 'ppl': '2.086', 'memory/max_active (GiB)': '3.78', 'memory/max_allocated (GiB)': '3.78', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '58.26', 'tokens/total': 5055623, 'tokens/trainable': 2697782, 'epoch': '0.8686'}
 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                            | 438/505 [1:04:17<08:21,  7.49s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                           | 439/505 [1:04:25<08:31,  7.75s/it]                                                                                                                                                                                                                                                              {'loss': '0.672', 'grad_norm': '0.633', 'learning_rate': '1.051e-05', 'ppl': '1.958', 'memory/max_active (GiB)': '3.95', 'memory/max_allocated (GiB)': '3.95', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '40.75', 'tokens/total': 5067773, 'tokens/trainable': 2704422, 'epoch': '0.8706'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                           | 439/505 [1:04:25<08:31,  7.75s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 440/505 [1:04:32<08:01,  7.42s/it]                                                                                                                                                                                                                                                              {'loss': '0.6259', 'grad_norm': '0.8406', 'learning_rate': '1.02e-05', 'ppl': '1.87', 'memory/max_active (GiB)': '3.87', 'memory/max_allocated (GiB)': '3.87', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '17.49', 'tokens/total': 5077259, 'tokens/trainable': 2709004, 'epoch': '0.8726'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 440/505 [1:04:32<08:01,  7.42s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                           | 441/505 [1:04:44<09:29,  8.90s/it]                                                                                                                                                                                                                                                              {'loss': '0.7694', 'grad_norm': '0.948', 'learning_rate': '9.903e-06', 'ppl': '2.158', 'memory/max_active (GiB)': '4.26', 'memory/max_allocated (GiB)': '4.26', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '11.5', 'tokens/total': 5095003, 'tokens/trainable': 2719957, 'epoch': '0.8746'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                           | 441/505 [1:04:44<09:29,  8.90s/it] 88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                          | 442/505 [1:04:51<08:48,  8.39s/it]                                                                                                                                                                                                                                                              {'loss': '0.6154', 'grad_norm': '0.7343', 'learning_rate': '9.606e-06', 'ppl': '1.85', 'memory/max_active (GiB)': '4.01', 'memory/max_allocated (GiB)': '4.01', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '67.43', 'tokens/total': 5105505, 'tokens/trainable': 2725393, 'epoch': '0.8765'}
 88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                          | 442/505 [1:04:51<08:48,  8.39s/it] 88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                          | 443/505 [1:04:59<08:31,  8.25s/it]                                                                                                                                                                                                                                                              {'loss': '0.723', 'grad_norm': '0.5948', 'learning_rate': '9.313e-06', 'ppl': '2.061', 'memory/max_active (GiB)': '3.77', 'memory/max_allocated (GiB)': '3.77', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '157.5', 'tokens/total': 5116964, 'tokens/trainable': 2731765, 'epoch': '0.8785'}
 88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                          | 443/505 [1:04:59<08:31,  8.25s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 444/505 [1:05:06<07:55,  7.80s/it]                                                                                                                                                                                                                                                              {'loss': '0.7152', 'grad_norm': '0.9878', 'learning_rate': '9.024e-06', 'ppl': '2.045', 'memory/max_active (GiB)': '3.62', 'memory/max_allocated (GiB)': '3.62', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '93.38', 'tokens/total': 5126778, 'tokens/trainable': 2736279, 'epoch': '0.8805'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 444/505 [1:05:06<07:55,  7.80s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                         | 445/505 [1:05:11<07:09,  7.17s/it]                                                                                                                                                                                                                                                              {'loss': '0.8431', 'grad_norm': '0.8805', 'learning_rate': '8.739e-06', 'ppl': '2.324', 'memory/max_active (GiB)': '3.58', 'memory/max_allocated (GiB)': '3.58', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '69.67', 'tokens/total': 5134954, 'tokens/trainable': 2739989, 'epoch': '0.8825'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                         | 445/505 [1:05:11<07:09,  7.17s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                         | 446/505 [1:05:19<07:13,  7.35s/it]                                                                                                                                                                                                                                                              {'loss': '0.6293', 'grad_norm': '0.8116', 'learning_rate': '8.459e-06', 'ppl': '1.876', 'memory/max_active (GiB)': '3.87', 'memory/max_allocated (GiB)': '3.87', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '55.56', 'tokens/total': 5146087, 'tokens/trainable': 2745452, 'epoch': '0.8845'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                         | 446/505 [1:05:19<07:13,  7.35s/it] 89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                        | 447/505 [1:05:26<06:59,  7.23s/it]                                                                                                                                                                                                                                                              {'loss': '0.7516', 'grad_norm': '0.7041', 'learning_rate': '8.183e-06', 'ppl': '2.12', 'memory/max_active (GiB)': '3.64', 'memory/max_allocated (GiB)': '3.64', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '66.01', 'tokens/total': 5156276, 'tokens/trainable': 2750767, 'epoch': '0.8865'}
 89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                        | 447/505 [1:05:26<06:59,  7.23s/it] 89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                        | 448/505 [1:05:34<06:58,  7.34s/it]                                                                                                                                                                                                                                                              {'loss': '0.6623', 'grad_norm': '0.8719', 'learning_rate': '7.912e-06', 'ppl': '1.939', 'memory/max_active (GiB)': '3.98', 'memory/max_allocated (GiB)': '3.98', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '64.3', 'tokens/total': 5166859, 'tokens/trainable': 2755939, 'epoch': '0.8884'}
 89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                        | 448/505 [1:05:34<06:58,  7.34s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                       | 449/505 [1:05:41<06:41,  7.17s/it]                                                                                                                                                                                                                                                              {'loss': '0.7205', 'grad_norm': '0.9425', 'learning_rate': '7.645e-06', 'ppl': '2.056', 'memory/max_active (GiB)': '3.71', 'memory/max_allocated (GiB)': '3.71', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '67.3', 'tokens/total': 5176443, 'tokens/trainable': 2760532, 'epoch': '0.8904'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                       | 449/505 [1:05:41<06:41,  7.17s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 450/505 [1:05:47<06:21,  6.93s/it]                                                                                                                                                                                                                                                              {'loss': '0.788', 'grad_norm': '0.9615', 'learning_rate': '7.383e-06', 'ppl': '2.199', 'memory/max_active (GiB)': '3.65', 'memory/max_allocated (GiB)': '3.65', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '97.78', 'tokens/total': 5185585, 'tokens/trainable': 2764953, 'epoch': '0.8924'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 450/505 [1:05:47<06:21,  6.93s/it] 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 451/505 [1:05:55<06:26,  7.16s/it]                                                                                                                                                                                                                                                              {'loss': '0.7231', 'grad_norm': '0.8205', 'learning_rate': '7.124e-06', 'ppl': '2.061', 'memory/max_active (GiB)': '3.85', 'memory/max_allocated (GiB)': '3.85', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '88.26', 'tokens/total': 5196665, 'tokens/trainable': 2770680, 'epoch': '0.8944'}
 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 451/505 [1:05:55<06:26,  7.16s/it] 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                      | 452/505 [1:06:02<06:26,  7.29s/it]                                                                                                                                                                                                                                                              {'loss': '0.7212', 'grad_norm': '0.6191', 'learning_rate': '6.871e-06', 'ppl': '2.057', 'memory/max_active (GiB)': '3.86', 'memory/max_allocated (GiB)': '3.86', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '57.61', 'tokens/total': 5207703, 'tokens/trainable': 2776505, 'epoch': '0.8964'}
 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                      | 452/505 [1:06:02<06:26,  7.29s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                      | 453/505 [1:06:08<05:52,  6.79s/it]                                                                                                                                                                                                                                                              {'loss': '0.847', 'grad_norm': '0.8146', 'learning_rate': '6.621e-06', 'ppl': '2.333', 'memory/max_active (GiB)': '3.67', 'memory/max_allocated (GiB)': '3.67', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '52.12', 'tokens/total': 5215752, 'tokens/trainable': 2779910, 'epoch': '0.8984'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                      | 453/505 [1:06:08<05:52,  6.79s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                     | 454/505 [1:06:16<06:10,  7.27s/it]                                                                                                                                                                                                                                                              {'loss': '0.6276', 'grad_norm': '0.6043', 'learning_rate': '6.377e-06', 'ppl': '1.873', 'memory/max_active (GiB)': '3.9', 'memory/max_allocated (GiB)': '3.9', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '89.17', 'tokens/total': 5227837, 'tokens/trainable': 2786705, 'epoch': '0.9003'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                     | 454/505 [1:06:16<06:10,  7.27s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                     | 455/505 [1:06:22<05:43,  6.87s/it]                                                                                                                                                                                                                                                              {'loss': '0.764', 'grad_norm': '0.8232', 'learning_rate': '6.136e-06', 'ppl': '2.147', 'memory/max_active (GiB)': '3.6', 'memory/max_allocated (GiB)': '3.6', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '81.53', 'tokens/total': 5236309, 'tokens/trainable': 2790502, 'epoch': '0.9023'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                     | 455/505 [1:06:22<05:43,  6.87s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                    | 456/505 [1:06:30<05:53,  7.21s/it]                                                                                                                                                                                                                                                              {'loss': '0.8844', 'grad_norm': '0.9291', 'learning_rate': '5.9e-06', 'ppl': '2.421', 'memory/max_active (GiB)': '3.74', 'memory/max_allocated (GiB)': '3.74', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '70.37', 'tokens/total': 5247837, 'tokens/trainable': 2796272, 'epoch': '0.9043'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                    | 456/505 [1:06:30<05:53,  7.21s/it] 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                    | 457/505 [1:06:37<05:38,  7.06s/it]                                                                                                                                                                                                                                                              {'loss': '0.7437', 'grad_norm': '0.7827', 'learning_rate': '5.669e-06', 'ppl': '2.104', 'memory/max_active (GiB)': '3.65', 'memory/max_allocated (GiB)': '3.65', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '63.87', 'tokens/total': 5257474, 'tokens/trainable': 2800434, 'epoch': '0.9063'}
 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                    | 457/505 [1:06:37<05:38,  7.06s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                   | 458/505 [1:06:45<05:50,  7.46s/it]                                                                                                                                                                                                                                                              {'loss': '0.8267', 'grad_norm': '0.7905', 'learning_rate': '5.442e-06', 'ppl': '2.286', 'memory/max_active (GiB)': '4.18', 'memory/max_allocated (GiB)': '4.18', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '54.56', 'tokens/total': 5269449, 'tokens/trainable': 2807104, 'epoch': '0.9083'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                   | 458/505 [1:06:45<05:50,  7.46s/it] 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                   | 459/505 [1:06:52<05:39,  7.39s/it]                                                                                                                                                                                                                                                              {'loss': '0.6798', 'grad_norm': '0.7336', 'learning_rate': '5.219e-06', 'ppl': '1.974', 'memory/max_active (GiB)': '3.87', 'memory/max_allocated (GiB)': '3.87', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '112.9', 'tokens/total': 5279712, 'tokens/trainable': 2812035, 'epoch': '0.9103'}
 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                   | 459/505 [1:06:52<05:39,  7.39s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                   | 460/505 [1:07:00<05:34,  7.43s/it]                                                                                                                                                                                                                                                              {'loss': '0.7368', 'grad_norm': '0.8045', 'learning_rate': '5.002e-06', 'ppl': '2.089', 'memory/max_active (GiB)': '3.81', 'memory/max_allocated (GiB)': '3.81', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '62.03', 'tokens/total': 5290335, 'tokens/trainable': 2817764, 'epoch': '0.9122'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                   | 460/505 [1:07:00<05:34,  7.43s/it][2026-06-13 17:48:26,615] [WARNING] [py.warnings._showwarnmsg:112] [PID:590] /workspace/axolotl-venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1044: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                  | 461/505 [1:07:09<05:41,  7.76s/it]                                                                                                                                                                                                                                                              {'loss': '0.7279', 'grad_norm': '0.6526', 'learning_rate': '4.788e-06', 'ppl': '2.071', 'memory/max_active (GiB)': '4.2', 'memory/max_allocated (GiB)': '4.2', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '304.4', 'tokens/total': 5302585, 'tokens/trainable': 2824679, 'epoch': '0.9142'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                  | 461/505 [1:07:09<05:41,  7.76s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                  | 462/505 [1:07:16<05:23,  7.53s/it]                                                                                                                                                                                                                                                              {'loss': '0.7426', 'grad_norm': '0.9673', 'learning_rate': '4.579e-06', 'ppl': '2.101', 'memory/max_active (GiB)': '4.01', 'memory/max_allocated (GiB)': '4.01', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '78.46', 'tokens/total': 5312392, 'tokens/trainable': 2829415, 'epoch': '0.9162'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                  | 462/505 [1:07:16<05:23,  7.53s/it] 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                 | 463/505 [1:07:24<05:27,  7.81s/it]                                                                                                                                                                                                                                                              {'loss': '0.6802', 'grad_norm': '0.5574', 'learning_rate': '4.375e-06', 'ppl': '1.974', 'memory/max_active (GiB)': '4.02', 'memory/max_allocated (GiB)': '4.02', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '78.26', 'tokens/total': 5324563, 'tokens/trainable': 2836375, 'epoch': '0.9182'}
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                 | 463/505 [1:07:24<05:27,  7.81s/it] 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                 | 464/505 [1:07:33<05:36,  8.20s/it]                                                                                                                                                                                                                                                              {'loss': '0.7677', 'grad_norm': '0.5754', 'learning_rate': '4.175e-06', 'ppl': '2.155', 'memory/max_active (GiB)': '3.75', 'memory/max_allocated (GiB)': '3.75', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '63.37', 'tokens/total': 5337416, 'tokens/trainable': 2843762, 'epoch': '0.9202'}
 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                 | 464/505 [1:07:33<05:36,  8.20s/it] 92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                | 465/505 [1:07:40<05:13,  7.83s/it]                                                                                                                                                                                                                                                              {'loss': '0.6569', 'grad_norm': '0.9345', 'learning_rate': '3.98e-06', 'ppl': '1.929', 'memory/max_active (GiB)': '3.76', 'memory/max_allocated (GiB)': '3.76', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '97.21', 'tokens/total': 5347337, 'tokens/trainable': 2848432, 'epoch': '0.9222'}
 92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                | 465/505 [1:07:40<05:13,  7.83s/it] 92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                | 466/505 [1:07:48<05:11,  7.98s/it]                                                                                                                                                                                                                                                              {'loss': '0.6955', 'grad_norm': '0.7313', 'learning_rate': '3.79e-06', 'ppl': '2.005', 'memory/max_active (GiB)': '3.82', 'memory/max_allocated (GiB)': '3.82', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '151.5', 'tokens/total': 5359022, 'tokens/trainable': 2854476, 'epoch': '0.9241'}
 92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                | 466/505 [1:07:48<05:11,  7.98s/it] 92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                | 467/505 [1:07:57<05:14,  8.28s/it]                                                                                                                                                                                                                                                              {'loss': '0.6026', 'grad_norm': '0.557', 'learning_rate': '3.604e-06', 'ppl': '1.827', 'memory/max_active (GiB)': '3.92', 'memory/max_allocated (GiB)': '3.92', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '144.8', 'tokens/total': 5371789, 'tokens/trainable': 2861855, 'epoch': '0.9261'}
 92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                | 467/505 [1:07:57<05:14,  8.28s/it] 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍               | 468/505 [1:08:06<05:14,  8.49s/it]                                                                                                                                                                                                                                                              {'loss': '0.7475', 'grad_norm': '0.6966', 'learning_rate': '3.422e-06', 'ppl': '2.112', 'memory/max_active (GiB)': '4.25', 'memory/max_allocated (GiB)': '4.25', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '293.7', 'tokens/total': 5384622, 'tokens/trainable': 2868918, 'epoch': '0.9281'}
 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍               | 468/505 [1:08:06<05:14,  8.49s/it] 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊               | 469/505 [1:08:14<04:54,  8.18s/it]                                                                                                                                                                                                                                                              {'loss': '0.6938', 'grad_norm': '0.6782', 'learning_rate': '3.246e-06', 'ppl': '2.001', 'memory/max_active (GiB)': '3.94', 'memory/max_allocated (GiB)': '3.94', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '45.05', 'tokens/total': 5395499, 'tokens/trainable': 2874435, 'epoch': '0.9301'}
 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊               | 469/505 [1:08:14<04:54,  8.18s/it] 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏              | 470/505 [1:08:21<04:32,  7.77s/it]                                                                                                                                                                                                                                                              {'loss': '0.6975', 'grad_norm': '0.7135', 'learning_rate': '3.073e-06', 'ppl': '2.009', 'memory/max_active (GiB)': '3.72', 'memory/max_allocated (GiB)': '3.72', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '175.5', 'tokens/total': 5405396, 'tokens/trainable': 2879385, 'epoch': '0.9321'}
 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏              | 470/505 [1:08:21<04:32,  7.77s/it] 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋              | 471/505 [1:08:30<04:35,  8.11s/it]                                                                                                                                                                                                                                                              {'loss': '0.5886', 'grad_norm': '0.656', 'learning_rate': '2.906e-06', 'ppl': '1.801', 'memory/max_active (GiB)': '3.95', 'memory/max_allocated (GiB)': '3.95', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '20.8', 'tokens/total': 5418391, 'tokens/trainable': 2886724, 'epoch': '0.9341'}
 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋              | 471/505 [1:08:30<04:35,  8.11s/it] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████              | 472/505 [1:08:39<04:43,  8.59s/it]                                                                                                                                                                                                                                                              {'loss': '0.6591', 'grad_norm': '0.5807', 'learning_rate': '2.743e-06', 'ppl': '1.933', 'memory/max_active (GiB)': '3.96', 'memory/max_allocated (GiB)': '3.96', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '55.82', 'tokens/total': 5432375, 'tokens/trainable': 2894847, 'epoch': '0.936'}
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████              | 472/505 [1:08:39<04:43,  8.59s/it] 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌             | 473/505 [1:08:46<04:21,  8.16s/it]                                                                                                                                                                                                                                                              {'loss': '0.7149', 'grad_norm': '0.667', 'learning_rate': '2.585e-06', 'ppl': '2.044', 'memory/max_active (GiB)': '3.67', 'memory/max_allocated (GiB)': '3.67', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '131.5', 'tokens/total': 5442783, 'tokens/trainable': 2900306, 'epoch': '0.938'}
 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌             | 473/505 [1:08:46<04:21,  8.16s/it] 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉             | 474/505 [1:08:56<04:24,  8.52s/it]                                                                                                                                                                                                                                                              {'loss': '0.6303', 'grad_norm': '0.6263', 'learning_rate': '2.431e-06', 'ppl': '1.878', 'memory/max_active (GiB)': '4.22', 'memory/max_allocated (GiB)': '4.22', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '133', 'tokens/total': 5456146, 'tokens/trainable': 2908298, 'epoch': '0.94'}
 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉             | 474/505 [1:08:56<04:24,  8.52s/it] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎            | 475/505 [1:09:02<03:59,  7.97s/it]                                                                                                                                                                                                                                                              {'loss': '0.7421', 'grad_norm': '0.7507', 'learning_rate': '2.282e-06', 'ppl': '2.1', 'memory/max_active (GiB)': '3.65', 'memory/max_allocated (GiB)': '3.65', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '117.8', 'tokens/total': 5465901, 'tokens/trainable': 2912779, 'epoch': '0.942'}
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎            | 475/505 [1:09:02<03:59,  7.97s/it] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊            | 476/505 [1:09:09<03:38,  7.55s/it]                                                                                                                                                                                                                                                              {'loss': '0.6523', 'grad_norm': '0.7015', 'learning_rate': '2.138e-06', 'ppl': '1.92', 'memory/max_active (GiB)': '3.74', 'memory/max_allocated (GiB)': '3.74', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '102.5', 'tokens/total': 5475221, 'tokens/trainable': 2917696, 'epoch': '0.944'}
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊            | 476/505 [1:09:09<03:38,  7.55s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏           | 477/505 [1:09:16<03:29,  7.48s/it]                                                                                                                                                                                                                                                              {'loss': '0.8358', 'grad_norm': '0.705', 'learning_rate': '1.998e-06', 'ppl': '2.307', 'memory/max_active (GiB)': '3.7', 'memory/max_allocated (GiB)': '3.7', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '79.47', 'tokens/total': 5485883, 'tokens/trainable': 2923107, 'epoch': '0.946'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏           | 477/505 [1:09:16<03:29,  7.48s/it] 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 478/505 [1:09:25<03:31,  7.83s/it]                                                                                                                                                                                                                                                              {'loss': '0.6761', 'grad_norm': '0.6675', 'learning_rate': '1.863e-06', 'ppl': '1.966', 'memory/max_active (GiB)': '4.08', 'memory/max_allocated (GiB)': '4.08', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '135.1', 'tokens/total': 5498464, 'tokens/trainable': 2930297, 'epoch': '0.9479'}
 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 478/505 [1:09:25<03:31,  7.83s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████           | 479/505 [1:09:35<03:38,  8.42s/it]                                                                                                                                                                                                                                                              {'loss': '0.7667', 'grad_norm': '0.687', 'learning_rate': '1.733e-06', 'ppl': '2.153', 'memory/max_active (GiB)': '3.93', 'memory/max_allocated (GiB)': '3.93', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '154.4', 'tokens/total': 5512585, 'tokens/trainable': 2938735, 'epoch': '0.9499'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████           | 479/505 [1:09:35<03:38,  8.42s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍          | 480/505 [1:09:42<03:20,  8.02s/it]                                                                                                                                                                                                                                                              {'loss': '0.69', 'grad_norm': '0.6466', 'learning_rate': '1.607e-06', 'ppl': '1.994', 'memory/max_active (GiB)': '3.7', 'memory/max_allocated (GiB)': '3.7', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '79.76', 'tokens/total': 5522827, 'tokens/trainable': 2944127, 'epoch': '0.9519'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍          | 480/505 [1:09:42<03:20,  8.02s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉          | 481/505 [1:09:51<03:18,  8.27s/it]                                                                                                                                                                                                                                                              {'loss': '0.7002', 'grad_norm': '0.6959', 'learning_rate': '1.486e-06', 'ppl': '2.014', 'memory/max_active (GiB)': '4.2', 'memory/max_allocated (GiB)': '4.2', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '79.89', 'tokens/total': 5535712, 'tokens/trainable': 2950633, 'epoch': '0.9539'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉          | 481/505 [1:09:51<03:18,  8.27s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎         | 482/505 [1:09:59<03:07,  8.16s/it]                                                                                                                                                                                                                                                              {'loss': '0.6318', 'grad_norm': '0.5689', 'learning_rate': '1.37e-06', 'ppl': '1.881', 'memory/max_active (GiB)': '3.94', 'memory/max_allocated (GiB)': '3.94', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '97.26', 'tokens/total': 5547203, 'tokens/trainable': 2956702, 'epoch': '0.9559'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎         | 482/505 [1:09:59<03:07,  8.16s/it] 96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋         | 483/505 [1:10:06<02:52,  7.86s/it]                                                                                                                                                                                                                                                              {'loss': '0.6147', 'grad_norm': '0.7472', 'learning_rate': '1.258e-06', 'ppl': '1.849', 'memory/max_active (GiB)': '3.74', 'memory/max_allocated (GiB)': '3.74', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '18.86', 'tokens/total': 5557593, 'tokens/trainable': 2961452, 'epoch': '0.9579'}
 96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋         | 483/505 [1:10:06<02:52,  7.86s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏        | 484/505 [1:10:12<02:32,  7.26s/it]                                                                                                                                                                                                                                                              {'loss': '0.827', 'grad_norm': '1.09', 'learning_rate': '1.151e-06', 'ppl': '2.286', 'memory/max_active (GiB)': '3.71', 'memory/max_allocated (GiB)': '3.71', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '157.7', 'tokens/total': 5566063, 'tokens/trainable': 2964793, 'epoch': '0.9598'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏        | 484/505 [1:10:12<02:32,  7.26s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌        | 485/505 [1:10:20<02:34,  7.71s/it]                                                                                                                                                                                                                                                              {'loss': '0.6978', 'grad_norm': '0.5963', 'learning_rate': '1.049e-06', 'ppl': '2.009', 'memory/max_active (GiB)': '3.8', 'memory/max_allocated (GiB)': '3.8', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '34.78', 'tokens/total': 5578616, 'tokens/trainable': 2971504, 'epoch': '0.9618'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌        | 485/505 [1:10:20<02:34,  7.71s/it] 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉        | 486/505 [1:10:28<02:24,  7.61s/it]                                                                                                                                                                                                                                                              {'loss': '0.7316', 'grad_norm': '0.7846', 'learning_rate': '9.52e-07', 'ppl': '2.078', 'memory/max_active (GiB)': '3.88', 'memory/max_allocated (GiB)': '3.88', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '199.8', 'tokens/total': 5589261, 'tokens/trainable': 2976976, 'epoch': '0.9638'}
 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉        | 486/505 [1:10:28<02:24,  7.61s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍       | 487/505 [1:10:37<02:23,  7.96s/it]                                                                                                                                                                                                                                                              {'loss': '0.7759', 'grad_norm': '0.6971', 'learning_rate': '8.593e-07', 'ppl': '2.172', 'memory/max_active (GiB)': '4.01', 'memory/max_allocated (GiB)': '4.01', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '89.48', 'tokens/total': 5602019, 'tokens/trainable': 2983955, 'epoch': '0.9658'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍       | 487/505 [1:10:37<02:23,  7.96s/it] 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊       | 488/505 [1:10:48<02:34,  9.09s/it]                                                                                                                                                                                                                                                              {'loss': '0.6188', 'grad_norm': '0.5338', 'learning_rate': '7.713e-07', 'ppl': '1.857', 'memory/max_active (GiB)': '4.09', 'memory/max_allocated (GiB)': '4.09', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '210.5', 'tokens/total': 5619058, 'tokens/trainable': 2994373, 'epoch': '0.9678'}
 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊       | 488/505 [1:10:48<02:34,  9.09s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎      | 489/505 [1:10:56<02:19,  8.70s/it]                                                                                                                                                                                                                                                              {'loss': '0.666', 'grad_norm': '0.8523', 'learning_rate': '6.881e-07', 'ppl': '1.947', 'memory/max_active (GiB)': '4.18', 'memory/max_allocated (GiB)': '4.18', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '20.34', 'tokens/total': 5630071, 'tokens/trainable': 3000261, 'epoch': '0.9698'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎      | 489/505 [1:10:56<02:19,  8.70s/it] 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋      | 490/505 [1:11:05<02:12,  8.83s/it]                                                                                                                                                                                                                                                              {'loss': '0.6801', 'grad_norm': '0.5775', 'learning_rate': '6.096e-07', 'ppl': '1.974', 'memory/max_active (GiB)': '3.87', 'memory/max_allocated (GiB)': '3.87', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '142.7', 'tokens/total': 5643276, 'tokens/trainable': 3007356, 'epoch': '0.9717'}
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋      | 490/505 [1:11:05<02:12,  8.83s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████      | 491/505 [1:11:12<01:53,  8.12s/it]                                                                                                                                                                                                                                                              {'loss': '0.7303', 'grad_norm': '0.7803', 'learning_rate': '5.358e-07', 'ppl': '2.076', 'memory/max_active (GiB)': '3.61', 'memory/max_allocated (GiB)': '3.61', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '60.92', 'tokens/total': 5652653, 'tokens/trainable': 3011671, 'epoch': '0.9737'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████      | 491/505 [1:11:12<01:53,  8.12s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌     | 492/505 [1:11:21<01:49,  8.39s/it]                                                                                                                                                                                                                                                              {'loss': '0.7077', 'grad_norm': '0.607', 'learning_rate': '4.668e-07', 'ppl': '2.029', 'memory/max_active (GiB)': '4.25', 'memory/max_allocated (GiB)': '4.25', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '49.22', 'tokens/total': 5665705, 'tokens/trainable': 3018879, 'epoch': '0.9757'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌     | 492/505 [1:11:21<01:49,  8.39s/it] 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉     | 493/505 [1:11:29<01:41,  8.42s/it]                                                                                                                                                                                                                                                              {'loss': '0.8096', 'grad_norm': '0.8187', 'learning_rate': '4.026e-07', 'ppl': '2.247', 'memory/max_active (GiB)': '4.05', 'memory/max_allocated (GiB)': '4.05', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '15.65', 'tokens/total': 5678119, 'tokens/trainable': 3025341, 'epoch': '0.9777'}
 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉     | 493/505 [1:11:29<01:41,  8.42s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎    | 494/505 [1:11:37<01:29,  8.10s/it]                                                                                                                                                                                                                                                              {'loss': '0.7399', 'grad_norm': '0.7173', 'learning_rate': '3.431e-07', 'ppl': '2.096', 'memory/max_active (GiB)': '3.84', 'memory/max_allocated (GiB)': '3.84', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '80.3', 'tokens/total': 5688900, 'tokens/trainable': 3030899, 'epoch': '0.9797'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎    | 494/505 [1:11:37<01:29,  8.10s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊    | 495/505 [1:11:45<01:23,  8.33s/it]                                                                                                                                                                                                                                                              {'loss': '0.6938', 'grad_norm': '0.5828', 'learning_rate': '2.883e-07', 'ppl': '2.001', 'memory/max_active (GiB)': '3.96', 'memory/max_allocated (GiB)': '3.96', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '200.6', 'tokens/total': 5701867, 'tokens/trainable': 3037873, 'epoch': '0.9817'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊    | 495/505 [1:11:45<01:23,  8.33s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏   | 496/505 [1:11:57<01:23,  9.33s/it]                                                                                                                                                                                                                                                              {'loss': '0.7479', 'grad_norm': '0.6324', 'learning_rate': '2.383e-07', 'ppl': '2.113', 'memory/max_active (GiB)': '4.25', 'memory/max_allocated (GiB)': '4.25', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '16.29', 'tokens/total': 5718710, 'tokens/trainable': 3047692, 'epoch': '0.9836'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏   | 496/505 [1:11:57<01:23,  9.33s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋   | 497/505 [1:12:06<01:14,  9.28s/it]                                                                                                                                                                                                                                                              {'loss': '0.7594', 'grad_norm': '0.549', 'learning_rate': '1.93e-07', 'ppl': '2.137', 'memory/max_active (GiB)': '3.89', 'memory/max_allocated (GiB)': '3.89', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '124.4', 'tokens/total': 5732021, 'tokens/trainable': 3055315, 'epoch': '0.9856'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋   | 497/505 [1:12:06<01:14,  9.28s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████   | 498/505 [1:12:14<01:02,  8.95s/it]                                                                                                                                                                                                                                                              {'loss': '0.7089', 'grad_norm': '0.6946', 'learning_rate': '1.525e-07', 'ppl': '2.032', 'memory/max_active (GiB)': '3.82', 'memory/max_allocated (GiB)': '3.82', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '34.63', 'tokens/total': 5743864, 'tokens/trainable': 3061232, 'epoch': '0.9876'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████   | 498/505 [1:12:14<01:02,  8.95s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍  | 499/505 [1:12:22<00:51,  8.50s/it]                                                                                                                                                                                                                                                              {'loss': '0.7958', 'grad_norm': '0.8232', 'learning_rate': '1.168e-07', 'ppl': '2.216', 'memory/max_active (GiB)': '3.93', 'memory/max_allocated (GiB)': '3.93', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '66.6', 'tokens/total': 5754627, 'tokens/trainable': 3066700, 'epoch': '0.9896'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍  | 499/505 [1:12:22<00:51,  8.50s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉  | 500/505 [1:12:29<00:39,  7.95s/it]                                                                                                                                                                                                                                                              {'loss': '0.7002', 'grad_norm': '0.8315', 'learning_rate': '8.58e-08', 'ppl': '2.014', 'memory/max_active (GiB)': '3.7', 'memory/max_allocated (GiB)': '3.7', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '73.75', 'tokens/total': 5764229, 'tokens/trainable': 3071583, 'epoch': '0.9916'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉  | 500/505 [1:12:29<00:39,  7.95s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 501/505 [1:12:38<00:33,  8.33s/it]                                                                                                                                                                                                                                                              {'loss': '0.7039', 'grad_norm': '0.6064', 'learning_rate': '5.959e-08', 'ppl': '2.022', 'memory/max_active (GiB)': '4.1', 'memory/max_allocated (GiB)': '4.1', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '150.3', 'tokens/total': 5777609, 'tokens/trainable': 3078878, 'epoch': '0.9936'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 501/505 [1:12:38<00:33,  8.33s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 502/505 [1:12:44<00:23,  7.84s/it]                                                                                                                                                                                                                                                              {'loss': '0.6835', 'grad_norm': '0.7579', 'learning_rate': '3.814e-08', 'ppl': '1.981', 'memory/max_active (GiB)': '3.78', 'memory/max_allocated (GiB)': '3.78', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '195.9', 'tokens/total': 5787283, 'tokens/trainable': 3083449, 'epoch': '0.9955'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 502/505 [1:12:44<00:23,  7.84s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 503/505 [1:12:55<00:17,  8.53s/it]                                                                                                                                                                                                                                                              {'loss': '0.7659', 'grad_norm': '0.5726', 'learning_rate': '2.145e-08', 'ppl': '2.151', 'memory/max_active (GiB)': '4.03', 'memory/max_allocated (GiB)': '4.03', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '83.72', 'tokens/total': 5801913, 'tokens/trainable': 3091880, 'epoch': '0.9975'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 503/505 [1:12:55<00:17,  8.53s/it]100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌| 504/505 [1:13:02<00:08,  8.19s/it]                                                                                                                                                                                                                                                              {'loss': '0.679', 'grad_norm': '0.6633', 'learning_rate': '9.535e-09', 'ppl': '1.972', 'memory/max_active (GiB)': '3.77', 'memory/max_allocated (GiB)': '3.77', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '56.06', 'tokens/total': 5812594, 'tokens/trainable': 3097648, 'epoch': '0.9995'}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌| 504/505 [1:13:02<00:08,  8.19s/it]100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 505/505 [1:13:05<00:00,  6.52s/it]                                                                                                                                                                                                                                                              {'loss': '0.7712', 'grad_norm': '1.12', 'learning_rate': '2.384e-09', 'ppl': '2.162', 'memory/max_active (GiB)': '3.81', 'memory/max_allocated (GiB)': '3.81', 'memory/device_reserved (GiB)': '4.52', 'tokens/train_per_sec_per_gpu': '242.5', 'tokens/total': 5816278, 'tokens/trainable': 3099784, 'epoch': '1'}
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 505/505 [1:13:05<00:00,  6.52s/it][2026-06-13 17:54:24,567] [INFO] [axolotl.core.trainers.base._save:818] [PID:590] Saving model checkpoint to ./outputs/Jacob-2-4B/checkpoint-505
                                                                                                                                                                                                                                                              {'train_runtime': '4386', 'train_samples_per_second': '0.921', 'train_steps_per_second': '0.115', 'train_loss': '0.7722', 'memory/max_active (GiB)': '3.16', 'memory/max_allocated (GiB)': '3.16', 'memory/device_reserved (GiB)': '4.52', 'epoch': '1', 'tokens/train_per_sec_per_gpu': '0'}
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 505/505 [1:13:06<00:00,  6.52s/it]100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 505/505 [1:13:06<00:00,  8.69s/it]
[2026-06-13 17:54:47,355] [INFO] [axolotl.train.save_trained_model:267] [PID:590] Training completed! Saving trained model to ./outputs/Jacob-2-4B.
[2026-06-13 17:54:47,615] [INFO] [axolotl.train.save_trained_model:388] [PID:590] Model successfully saved to ./outputs/Jacob-2-4B
[2026-06-13 17:54:47,746] [INFO] [axolotl.core.trainers.base._save:818] [PID:590] Saving model checkpoint to ./outputs/Jacob-2-4B
Processing Files (0 / 0)      : |                                                                                                                                                                                                |  0.00B /  0.00B            
New Data Upload               : |                                                                                                                                                                                                |  0.00B /  0.00B            [A

  ...ob-2-4B/training_args.bin: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13.5kB / 13.5kB            [A[A


  ...Jacob-2-4B/tokenizer.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20.0MB / 20.0MB            [A[A[A


  ...adapter_model.safetensors: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 42.5MB / 42.5MB            [A[A[A[A

  ...ob-2-4B/training_args.bin: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13.5kB / 13.5kB            [A[A


  ...Jacob-2-4B/tokenizer.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20.0MB / 20.0MB            [A[A[A


  ...adapter_model.safetensors: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 42.5MB / 42.5MB            [A[A[A[AProcessing Files (3 / 3)      : 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 62.5MB / 62.5MB,   ???B/s  

  ...ob-2-4B/training_args.bin: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13.5kB / 13.5kB            [A[A


  ...Jacob-2-4B/tokenizer.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20.0MB / 20.0MB            [A[A[A


  ...adapter_model.safetensors: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 42.5MB / 42.5MB            [A[A[A[A

  ...ob-2-4B/training_args.bin: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13.5kB / 13.5kB            [A[A


  ...Jacob-2-4B/tokenizer.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20.0MB / 20.0MB            [A[A[A


  ...adapter_model.safetensors: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 42.5MB / 42.5MB            [A[A[A[A

  ...ob-2-4B/training_args.bin: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13.5kB / 13.5kB            [A[A


  ...Jacob-2-4B/tokenizer.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20.0MB / 20.0MB            [A[A[A


  ...adapter_model.safetensors: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 42.5MB / 42.5MB            [A[A[A[AProcessing Files (3 / 3)      : 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 62.5MB / 62.5MB,  0.00B/s  
New Data Upload               : |                                                                                                                                                                                                |  0.00B /  0.00B,  0.00B/s  
  ...ob-2-4B/training_args.bin: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13.5kB / 13.5kB            
  ...Jacob-2-4B/tokenizer.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20.0MB / 20.0MB            
  ...adapter_model.safetensors: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 42.5MB / 42.5MB