[2026-06-23 12:02:57,317] [DEBUG] [axolotl.utils.config.resolve_dtype:74] [PID:2077] bf16 support detected, enabling for this configuration.
config.json:   0%|                                                                                                                                                                                                                                                                                                                                  | 0.00/661 [00:00<?, ?B/s]config.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 661/661 [00:00<00:00, 2.69MB/s]
[2026-06-23 12:02:58,129] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:2077] baseline 0.000GB ()
[2026-06-23 12:02:58,130] [INFO] [axolotl.cli.config.load_cfg:333] [PID:2077] config:
{
  "activation_offloading": false,
  "adapter": "lora",
  "attn_implementation": "flash_attention_2",
  "attn_needs_dtype_cast": true,
  "attn_supports_packing": true,
  "attn_uses_flash_lib": true,
  "axolotl_config_path": "ft.yaml",
  "base_model": "Qwen/Qwen2.5-Coder-3B",
  "base_model_config": "Qwen/Qwen2.5-Coder-3B",
  "batch_size": 32,
  "bf16": true,
  "capabilities": {
    "bf16": true,
    "compute_capability": "sm_80",
    "fp8": false,
    "n_gpu": 1,
    "n_node": 1,
    "tf32": true
  },
  "context_parallel_size": 1,
  "dataloader_num_workers": 1,
  "dataloader_pin_memory": true,
  "dataloader_prefetch_factor": 256,
  "dataset_num_proc": 12,
  "dataset_prepared_path": "./out/prepared_full",
  "datasets": [
    {
      "field": "content",
      "message_property_mappings": {
        "content": "content",
        "role": "role"
      },
      "path": "dbaysal/all-contentx3",
      "trust_remote_code": false,
      "type": "completion"
    }
  ],
  "ddp": false,
  "device": "cuda:0",
  "dion_rank_fraction": 1.0,
  "dion_rank_multiple_of": 1,
  "eaft_alpha": 1.0,
  "eaft_k": 20,
  "env_capabilities": {
    "torch_version": "2.11.0"
  },
  "eval_batch_size": 8,
  "eval_causal_lm_metrics": [
    "sacrebleu",
    "comet",
    "ter",
    "chrf"
  ],
  "eval_max_new_tokens": 128,
  "eval_table_size": 0,
  "experimental_skip_move_to_device": true,
  "fp16": false,
  "generate_samples": false,
  "generation_do_sample": true,
  "generation_max_new_tokens": 50,
  "generation_prompt_ratio": 0.5,
  "generation_temperature": 0.7,
  "gradient_accumulation_steps": 4,
  "gradient_checkpointing": true,
  "gradient_checkpointing_kwargs": {
    "use_reentrant": true
  },
  "include_tkps": true,
  "layer_offloading": false,
  "learning_rate": 0.0002,
  "lisa_layers_attribute": "model.layers",
  "load_best_model_at_end": false,
  "load_in_4bit": false,
  "load_in_8bit": false,
  "local_rank": 0,
  "logging_steps": 1,
  "lora_alpha": 128,
  "lora_dropout": 0.05,
  "lora_embedding_kernel": true,
  "lora_mlp_kernel": true,
  "lora_o_kernel": true,
  "lora_qkv_kernel": true,
  "lora_r": 64,
  "lora_target_linear": true,
  "loraplus_lr_embedding": 1e-06,
  "lr_scheduler": "cosine",
  "mean_resizing_embeddings": false,
  "merge_method": "memory_efficient",
  "micro_batch_size": 8,
  "model_config_type": "qwen2",
  "num_epochs": 5.0,
  "num_generation_samples": 3,
  "optimizer": "adamw_torch",
  "otel_metrics_host": "localhost",
  "otel_metrics_port": 8000,
  "output_dir": "./out/learned",
  "pad_to_sequence_len": true,
  "pretrain_multipack_attn": true,
  "profiler_steps_start": 0,
  "qgalore_cos_threshold": 0.4,
  "qgalore_gamma_proj": 2,
  "qgalore_proj_bits": 4,
  "qgalore_proj_group_size": 256,
  "qgalore_proj_quant": true,
  "qgalore_proj_type": "std",
  "qgalore_queue_size": 5,
  "qgalore_rank": 256,
  "qgalore_scale": 0.25,
  "qgalore_update_proj_gap": 200,
  "qlora_sharded_model_loading": false,
  "quantize_moe_experts": false,
  "ray_num_workers": 1,
  "relora_prune_method": "magnitude",
  "resources_per_worker": {
    "GPU": 1
  },
  "sample_packing": false,
  "sample_packing_bin_size": 200,
  "sample_packing_group_size": 100000,
  "save_only_model": false,
  "save_safetensors": true,
  "seed": 42,
  "sequence_len": 2048,
  "shuffle_before_merging_datasets": false,
  "shuffle_merged_datasets": true,
  "skip_prepare_dataset": false,
  "streaming_multipack_buffer_size": 10000,
  "strict": false,
  "tensor_parallel_size": 1,
  "tf32": false,
  "tiled_mlp_use_original_mlp": true,
  "tokenizer_config": "Qwen/Qwen2.5-Coder-3B",
  "tokenizer_save_jinja_files": true,
  "torch_dtype": "torch.bfloat16",
  "train_on_inputs": false,
  "trl": {
    "async_prefetch": false,
    "log_completions": false,
    "mask_truncated_completions": false,
    "ref_model_mixup_alpha": 0.9,
    "ref_model_sync_steps": 64,
    "replay_buffer_size": 0,
    "replay_recompute_logps": true,
    "reroll_max_groups": 1,
    "reroll_start_fraction": 1.0,
    "reward_num_workers": 1,
    "scale_rewards": true,
    "skip_zero_advantage_batches": true,
    "sync_ref_model": false,
    "use_data_producer": false,
    "use_vllm": false,
    "vllm_lora_sync": false,
    "vllm_server_host": "0.0.0.0",
    "vllm_server_port": 8000
  },
  "use_otel_metrics": false,
  "use_ray": false,
  "val_set_size": 0.0,
  "vllm": {
    "device": "auto",
    "dtype": "auto",
    "gpu_memory_utilization": 0.9,
    "host": "0.0.0.0",
    "port": 8000
  },
  "warmup_ratio": 0.03,
  "weight_decay": 0.0,
  "world_size": 1
}
tokenizer_config.json:   0%|                                                                                                                                                                                                                                                                                                                      | 0.00/7.23k [00:00<?, ?B/s]tokenizer_config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7.23k/7.23k [00:00<00:00, 4.24MB/s]
vocab.json:   0%|                                                                                                                                                                                                                                                                                                                                 | 0.00/2.78M [00:00<?, ?B/s]vocab.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2.78M/2.78M [00:00<00:00, 13.8MB/s]vocab.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2.78M/2.78M [00:00<00:00, 13.7MB/s]
merges.txt:   0%|                                                                                                                                                                                                                                                                                                                                 | 0.00/1.67M [00:00<?, ?B/s]merges.txt: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.67M/1.67M [00:00<00:00, 14.9MB/s]merges.txt: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.67M/1.67M [00:00<00:00, 14.8MB/s]
tokenizer.json:   0%|                                                                                                                                                                                                                                                                                                                             | 0.00/7.03M [00:00<?, ?B/s]tokenizer.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7.03M/7.03M [00:00<00:00, 24.1MB/s]tokenizer.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7.03M/7.03M [00:00<00:00, 23.8MB/s]
[2026-06-23 12:03:04,496] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:311] [PID:2077] EOS: 151643 / <|endoftext|>
[2026-06-23 12:03:04,497] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:312] [PID:2077] BOS: None / None
[2026-06-23 12:03:04,497] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:313] [PID:2077] PAD: 151643 / <|endoftext|>
[2026-06-23 12:03:04,497] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:314] [PID:2077] UNK: None / None
[2026-06-23 12:03:04,497] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:482] [PID:2077] Unable to find prepared dataset in out/prepared_full/629a2e5ec728ba197df2909eccb74717
[2026-06-23 12:03:04,498] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:2077] Loading raw datasets...
[2026-06-23 12:03:04,498] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:2077] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
Downloading (incomplete total...): 0.00B [00:00, ?B/s]
Fetching 0 files: 0it [00:00, ?it/s][AFetching 0 files: 0it [00:00, ?it/s]
Download complete: : 0.00B [00:00, ?B/s]              
README.md:   0%|                                                                                                                                                                                                                                                                                                                                    | 0.00/602 [00:00<?, ?B/s][AREADME.md: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 602/602 [00:00<00:00, 3.24MB/s]
Download complete: : 0.00B [00:01, ?B/s]
data/train-00000-of-00001.parquet:   0%|                                                                                                                                                                                                                                                                                                           | 0.00/620k [00:00<?, ?B/s]data/train-00000-of-00001.parquet:   0%|                                                                                                                                                                                                                                                                                                           | 0.00/620k [00:00<?, ?B/s]data/train-00000-of-00001.parquet:   0%|                                                                                                                                                                                                                                                                                                           | 0.00/620k [00:00<?, ?B/s]data/train-00000-of-00001.parquet:   0%|                                                                                                                                                                                                                                                                                                           | 0.00/620k [00:00<?, ?B/s]data/train-00000-of-00001.parquet:   0%|                                                                                                                                                                                                                                                                                                           | 0.00/620k [00:00<?, ?B/s]data/train-00000-of-00001.parquet:   0%|                                                                                                                                                                                                                                                                                                           | 0.00/620k [00:01<?, ?B/s]data/train-00000-of-00001.parquet:   0%|                                                                                                                                                                                                                                                                                                           | 0.00/620k [00:01<?, ?B/s]data/train-00000-of-00001.parquet:   0%|                                                                                                                                                                                                                                                                                                           | 0.00/620k [00:01<?, ?B/s]data/train-00000-of-00001.parquet:   0%|                                                                                                                                                                                                                                                                                                           | 0.00/620k [00:01<?, ?B/s]data/train-00000-of-00001.parquet:   0%|                                                                                                                                                                                                                                                                                                           | 0.00/620k [00:01<?, ?B/s]data/train-00000-of-00001.parquet:   0%|                                                                                                                                                                                                                                                                                                           | 0.00/620k [00:02<?, ?B/s]data/train-00000-of-00001.parquet:   0%|                                                                                                                                                                                                                                                                                                           | 0.00/620k [00:02<?, ?B/s]data/train-00000-of-00001.parquet:   0%|                                                                                                                                                                                                                                                                                                           | 0.00/620k [00:02<?, ?B/s]data/train-00000-of-00001.parquet:   0%|                                                                                                                                                                                                                                                                                                           | 0.00/620k [00:02<?, ?B/s]data/train-00000-of-00001.parquet:   0%|                                                                                                                                                                                                                                                                                                           | 0.00/620k [00:02<?, ?B/s]data/train-00000-of-00001.parquet:   0%|                                                                                                                                                                                                                                                                                                           | 0.00/620k [00:03<?, ?B/s]data/train-00000-of-00001.parquet:   0%|                                                                                                                                                                                                                                                                                                           | 0.00/620k [00:03<?, ?B/s]data/train-00000-of-00001.parquet:   0%|                                                                                                                                                                                                                                                                                                           | 0.00/620k [00:03<?, ?B/s]data/train-00000-of-00001.parquet:   0%|                                                                                                                                                                                                                                                                                                           | 0.00/620k [00:03<?, ?B/s]data/train-00000-of-00001.parquet: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 620k/620k [00:03<00:00, 3.10MB/s]data/train-00000-of-00001.parquet: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 620k/620k [00:03<00:00, 162kB/s]
Generating train split:   0%|                                                                                                                                                                                                                                                                                                                 | 0/1800 [00:00<?, ? examples/s]Generating train split: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1800/1800 [00:00<00:00, 103679.68 examples/s]
[2026-06-23 12:03:13,218] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:2077] Loading dataset: dbaysal/all-contentx3 with base_type: completion and prompt_style: None
Tokenizing Prompts (num_proc=12):   0%|                                                                                                                                                                                                                                                                                                       | 0/1800 [00:00<?, ? examples/s]Tokenizing Prompts (num_proc=12):   8%|███████████████████████▊                                                                                                                                                                                                                                                                     | 150/1800 [00:02<00:29, 55.54 examples/s]Tokenizing Prompts (num_proc=12):  17%|███████████████████████████████████████████████▌                                                                                                                                                                                                                                             | 300/1800 [00:04<00:19, 75.93 examples/s]Tokenizing Prompts (num_proc=12):  25%|███████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                     | 450/1800 [00:05<00:15, 86.25 examples/s]Tokenizing Prompts (num_proc=12):  33%|███████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                              | 600/1800 [00:07<00:13, 89.67 examples/s]Tokenizing Prompts (num_proc=12):  42%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                      | 750/1800 [00:08<00:11, 88.53 examples/s]Tokenizing Prompts (num_proc=12):  50%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                              | 900/1800 [00:10<00:09, 92.98 examples/s]Tokenizing Prompts (num_proc=12):  58%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                      | 1050/1800 [00:11<00:07, 96.23 examples/s]Tokenizing Prompts (num_proc=12):  67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                              | 1200/1800 [00:13<00:06, 95.02 examples/s]Tokenizing Prompts (num_proc=12):  75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                       | 1350/1800 [00:15<00:04, 92.09 examples/s]Tokenizing Prompts (num_proc=12):  83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                               | 1500/1800 [00:16<00:03, 94.49 examples/s]Tokenizing Prompts (num_proc=12):  92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                       | 1650/1800 [00:18<00:01, 97.44 examples/s]Tokenizing Prompts (num_proc=12): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1800/1800 [00:19<00:00, 97.43 examples/s]Tokenizing Prompts (num_proc=12): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1800/1800 [00:19<00:00, 90.91 examples/s]
[2026-06-23 12:03:34,638] [INFO] [axolotl.utils.data.utils._log_dataset_stats:212] [PID:2077] min_input_len: 33
[2026-06-23 12:03:34,638] [INFO] [axolotl.utils.data.utils._log_dataset_stats:213] [PID:2077] max_input_len: 1302
Dropping Invalid Sequences (<None or >2048) (num_proc=12):   0%|                                                                                                                                                                                                                                                                              | 0/1800 [00:00<?, ? examples/s]Dropping Invalid Sequences (<None or >2048) (num_proc=12):   8%|█████████████████████▌                                                                                                                                                                                                                                             | 150/1800 [00:00<00:02, 717.96 examples/s]Dropping Invalid Sequences (<None or >2048) (num_proc=12): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1800/1800 [00:00<00:00, 5144.28 examples/s]
Saving the dataset (0/7 shards):   0%|                                                                                                                                                                                                                                                                                                        | 0/1800 [00:00<?, ? examples/s]Saving the dataset (0/7 shards):  14%|████████████████████████████████████████▉                                                                                                                                                                                                                                                     | 258/1800 [00:13<01:23, 18.46 examples/s]Saving the dataset (1/7 shards):  14%|████████████████████████████████████████▉                                                                                                                                                                                                                                                     | 258/1800 [00:13<01:23, 18.46 examples/s]Saving the dataset (2/7 shards):  29%|█████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                            | 515/1800 [00:13<01:09, 18.46 examples/s]Saving the dataset (3/7 shards):  43%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                   | 772/1800 [00:13<00:55, 18.46 examples/s]Saving the dataset (4/7 shards):  57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                          | 1029/1800 [00:13<00:41, 18.46 examples/s]Saving the dataset (5/7 shards):  71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                 | 1286/1800 [00:13<00:27, 18.46 examples/s]Saving the dataset (6/7 shards):  86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                        | 1543/1800 [00:14<00:13, 18.46 examples/s]Saving the dataset (7/7 shards): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1800/1800 [00:14<00:00, 18.46 examples/s]Saving the dataset (7/7 shards): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1800/1800 [00:15<00:00, 118.96 examples/s]
[2026-06-23 12:03:50,287] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:420] [PID:2077] total_num_tokens: 506_796
[2026-06-23 12:03:50,302] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:438] [PID:2077] `total_supervised_tokens: 506_796`
[2026-06-23 12:03:50,302] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:521] [PID:2077] total_num_steps: 282
[2026-06-23 12:03:50,303] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:2077] Maximum number of steps set at 282
[2026-06-23 12:03:50,359] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:2077] loading tokenizer... Qwen/Qwen2.5-Coder-3B
[2026-06-23 12:03:52,290] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:311] [PID:2077] EOS: 151643 / <|endoftext|>
[2026-06-23 12:03:52,290] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:312] [PID:2077] BOS: None / None
[2026-06-23 12:03:52,290] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:313] [PID:2077] PAD: 151643 / <|endoftext|>
[2026-06-23 12:03:52,290] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:314] [PID:2077] UNK: None / None
[2026-06-23 12:03:52,290] [DEBUG] [axolotl.train.setup_model_and_tokenizer:81] [PID:2077] Loading model
[2026-06-23 12:03:52,531] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:75] [PID:2077] Patched OptimState8bit for torch.compile compatibility
[2026-06-23 12:03:52,532] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:122] [PID:2077] Patched OptimState4bit for torch.compile compatibility
[2026-06-23 12:03:52,532] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:154] [PID:2077] Patched OptimStateFp8 for torch.compile compatibility
[2026-06-23 12:03:52,540] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:94] [PID:2077] Patched Trainer.evaluation_loop with nanmean loss calculation
[2026-06-23 12:03:52,541] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:148] [PID:2077] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2026-06-23 12:03:52,543] [WARNING] [axolotl.loaders.patch_manager._apply_self_attention_lora_patch:662] [PID:2077] Cannot patch self-attention - requires no dropout
model.safetensors.index.json:   0%|                                                                                                                                                                                                                                                                                                               | 0.00/35.6k [00:00<?, ?B/s]model.safetensors.index.json: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 35.6k/35.6k [00:00<00:00, 59.9MB/s]
Downloading (incomplete total...): 0.00B [00:00, ?B/s]
Fetching 2 files:   0%|                                                                                                                                                                                                                                                                                                                                 | 0/2 [00:00<?, ?it/s][ADownloading (incomplete total...):   0%|                                                                                                                                                                                                                                                                                                          | 0.00/1.21G [00:00<?, ?B/s]Downloading (incomplete total...):   0%|                                                                                                                                                                                                                                                                                                          | 0.00/6.17G [00:00<?, ?B/s]Downloading (incomplete total...):   0%|                                                                                                                                                                                                                                                                                                          | 0.00/6.17G [00:00<?, ?B/s]Downloading (incomplete total...):   0%|                                                                                                                                                                                                                                                                                                          | 0.00/6.17G [00:00<?, ?B/s]Downloading (incomplete total...):   0%|                                                                                                                                                                                                                                                                                                          | 0.00/6.17G [00:01<?, ?B/s]Downloading (incomplete total...):   0%|                                                                                                                                                                                                                                                                                                          | 0.00/6.17G [00:01<?, ?B/s]Downloading (incomplete total...):   0%|                                                                                                                                                                                                                                                                                                          | 0.00/6.17G [00:01<?, ?B/s]Downloading (incomplete total...):   0%|                                                                                                                                                                                                                                                                                                          | 0.00/6.17G [00:01<?, ?B/s]Downloading (incomplete total...):   0%|                                                                                                                                                                                                                                                                                                          | 0.00/6.17G [00:01<?, ?B/s]Downloading (incomplete total...):   0%|                                                                                                                                                                                                                                                                                                          | 0.00/6.17G [00:02<?, ?B/s]Downloading (incomplete total...):   0%|                                                                                                                                                                                                                                                                                                          | 0.00/6.17G [00:02<?, ?B/s]Downloading (incomplete total...):   0%|                                                                                                                                                                                                                                                                                                          | 0.00/6.17G [00:02<?, ?B/s]Downloading (incomplete total...):   5%|███████████████▊                                                                                                                                                                                                                                                                                  | 335M/6.17G [00:02<00:03, 1.66GB/s]Downloading (incomplete total...):  20%|████████████████████████████████████████████████████████▊                                                                                                                                                                                                                                        | 1.21G/6.17G [00:03<00:03, 1.50GB/s]Downloading (incomplete total...):  24%|█████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                            | 1.48G/6.17G [00:04<00:06, 673MB/s]Downloading (incomplete total...):  31%|████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                         | 1.89G/6.17G [00:04<00:04, 881MB/s]Downloading (incomplete total...):  36%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                         | 2.22G/6.17G [00:05<00:06, 581MB/s]Downloading (incomplete total...):  38%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                   | 2.35G/6.17G [00:05<00:07, 537MB/s]Downloading (incomplete total...):  57%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                             | 3.50G/6.17G [00:08<00:06, 424MB/s]Downloading (incomplete total...):  82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                     | 5.04G/6.17G [00:13<00:02, 391MB/s]Downloading (incomplete total...): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.17G/6.17G [00:16<00:00, 368MB/s]
Fetching 2 files:  50%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                            | 1/2 [00:16<00:16, 16.72s/it][AFetching 2 files: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:16<00:00,  8.36s/it]
Download complete: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.17G/6.17G [00:16<00:00, 368MB/s]
Downloading (incomplete total...): 0.00B [00:00, ?B/s][A

Fetching 7 files:   0%|                                                                                                                                                                                                                                                                                                                                 | 0/7 [00:00<?, ?it/s][A[A
Downloading (incomplete total...):   0%|                                                                                                                                                                                                                                                                                                            | 0.00/234 [00:00<?, ?B/s][A
Downloading (incomplete total...): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 234/234 [00:00<00:00, 882B/s][ADownload complete: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.17G/6.17G [00:17<00:00, 348MB/s]

Downloading (incomplete total...):   2%|██████▋                                                                                                                                                                                                                                                                                              | 234/10.2k [00:00<00:11, 882B/s][A
Downloading (incomplete total...):  39%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                | 10.2k/25.8k [00:00<00:17, 882B/s][A
Downloading (incomplete total...):  43%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                    | 11.2k/25.8k [00:00<00:16, 882B/s][A

Fetching 7 files:  14%|████████████████████████████████████████████▋                                                                                                                                                                                                                                                                            | 1/7 [00:00<00:01,  3.65it/s][A[A
Downloading (incomplete total...):  99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 25.8k/26.0k [00:00<00:00, 882B/s][A
Downloading (incomplete total...):  30%|███████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                           | 26.0k/86.3k [00:00<01:08, 882B/s][A
Downloading (incomplete total...):   0%|                                                                                                                                                                                                                                                                                               | 86.3k/1.04G [00:00<326:43:23, 882B/s][A
Downloading (incomplete total...):   0%|                                                                                                                                                                                                                                                                                                | 86.3k/1.04G [00:00<2:04:25, 139kB/s][A
Downloading (incomplete total...):   3%|████████▉                                                                                                                                                                                                                                                                                        | 32.0M/1.04G [00:02<01:03, 15.7MB/s][A
Downloading (incomplete total...):  21%|██████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                                    | 222M/1.04G [00:02<00:06, 131MB/s][A
Downloading (incomplete total...):  29%|█████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                            | 305M/1.04G [00:04<00:09, 73.7MB/s][A
Downloading (incomplete total...):  64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                        | 666M/1.04G [00:04<00:01, 218MB/s][A
Downloading (incomplete total...):  74%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                           | 769M/1.04G [00:04<00:01, 250MB/s][A
Downloading (incomplete total...):  95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 983M/1.04G [00:05<00:00, 358MB/s][A

Fetching 7 files:  29%|█████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                               | 2/7 [00:05<00:16,  3.30s/it][A[AFetching 7 files: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:05<00:00,  1.23it/s]

Download complete: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.04G/1.04G [00:05<00:00, 358MB/s][ALoading weights:   0%|                                                                                                                                                                                                                                                                                                                                | 0/434 [00:00<?, ?it/s]Loading weights: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 434/434 [00:00<00:00, 5597.68it/s]
generation_config.json:   0%|                                                                                                                                                                                                                                                                                                                       | 0.00/139 [00:00<?, ?B/s]generation_config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 139/139 [00:00<00:00, 773kB/s]
[2026-06-23 12:04:19,318] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:433] [PID:2077] Converting modules to torch.bfloat16
[2026-06-23 12:04:19,766] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:2077] Memory usage after model load 0.000GB ()
[2026-06-23 12:04:19,822] [INFO] [axolotl.loaders.adapter._build_peft_lora_config:170] [PID:2077] found linear modules: ['down_proj', 'gate_proj', 'k_proj', 'o_proj', 'q_proj', 'up_proj', 'v_proj']
Download complete: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.04G/1.04G [00:09<00:00, 109MB/s]
trainable params: 119,734,272 || all params: 3,205,672,960 || trainable%: 3.7351
[2026-06-23 12:04:21,621] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:2077] after adapters 0.000GB ()
[2026-06-23 12:04:23,806] [INFO] [axolotl.monkeypatch.lora_kernels.apply_lora_kernel_patches:418] [PID:2077] LoRA kernels: dropout=0.05 enabled
[2026-06-23 12:04:28,312] [INFO] [axolotl.train.save_initial_configs:450] [PID:2077] Pre-saving adapter config to ./out/learned...
[2026-06-23 12:04:28,312] [INFO] [axolotl.train.save_initial_configs:454] [PID:2077] Pre-saving tokenizer to ./out/learned...
[2026-06-23 12:04:28,418] [INFO] [axolotl.train.save_initial_configs:459] [PID:2077] Pre-saving model config to ./out/learned...
[2026-06-23 12:04:28,422] [INFO] [axolotl.train.execute_training:226] [PID:2077] Starting trainer...
  0%|                                                                                                                                                                                                                                                                                                                                                 | 0/282 [00:00<?, ?it/s]  0%|█▏                                                                                                                                                                                                                                                                                                                                     | 1/282 [00:16<1:16:03, 16.24s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '1.42', 'grad_norm': '0.7371', 'learning_rate': '0', 'ppl': '4.139', 'memory/max_active (GiB)': '36.86', 'memory/max_allocated (GiB)': '36.86', 'memory/device_reserved (GiB)': '37.84', 'tokens/train_per_sec_per_gpu': '199.9', 'tokens/total': 65536, 'tokens/trainable': 9479, 'epoch': '0.01778'}
  0%|█▏                                                                                                                                                                                                                                                                                                                                     | 1/282 [00:16<1:16:03, 16.24s/it]  1%|██▎                                                                                                                                                                                                                                                                                                                                    | 2/282 [00:27<1:00:48, 13.03s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '1.44', 'grad_norm': '0.9725', 'learning_rate': '2.5e-05', 'ppl': '4.22', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.31', 'tokens/train_per_sec_per_gpu': '262.6', 'tokens/total': 131072, 'tokens/trainable': 18124, 'epoch': '0.03556'}
  1%|██▎                                                                                                                                                                                                                                                                                                                                    | 2/282 [00:27<1:00:48, 13.03s/it]  1%|███▌                                                                                                                                                                                                                                                                                                                                     | 3/282 [00:37<55:49, 12.00s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '1.345', 'grad_norm': '0.6199', 'learning_rate': '5e-05', 'ppl': '3.838', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.31', 'tokens/train_per_sec_per_gpu': '181.9', 'tokens/total': 196608, 'tokens/trainable': 28229, 'epoch': '0.05333'}
  1%|███▌                                                                                                                                                                                                                                                                                                                                     | 3/282 [00:37<55:49, 12.00s/it]  1%|████▋                                                                                                                                                                                                                                                                                                                                    | 4/282 [00:48<53:22, 11.52s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '1.314', 'grad_norm': '0.7848', 'learning_rate': '7.5e-05', 'ppl': '3.721', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.31', 'tokens/train_per_sec_per_gpu': '88.97', 'tokens/total': 262144, 'tokens/trainable': 35580, 'epoch': '0.07111'}
  1%|████▋                                                                                                                                                                                                                                                                                                                                    | 4/282 [00:48<53:22, 11.52s/it]  2%|█████▊                                                                                                                                                                                                                                                                                                                                   | 5/282 [00:59<51:58, 11.26s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '1.237', 'grad_norm': '0.5375', 'learning_rate': '0.0001', 'ppl': '3.446', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.31', 'tokens/train_per_sec_per_gpu': '171.1', 'tokens/total': 327680, 'tokens/trainable': 46084, 'epoch': '0.08889'}
  2%|█████▊                                                                                                                                                                                                                                                                                                                                   | 5/282 [00:59<51:58, 11.26s/it]  2%|███████                                                                                                                                                                                                                                                                                                                                  | 6/282 [01:10<51:05, 11.11s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '1.202', 'grad_norm': '0.5982', 'learning_rate': '0.000125', 'ppl': '3.326', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.37', 'tokens/train_per_sec_per_gpu': '161.1', 'tokens/total': 393216, 'tokens/trainable': 57712, 'epoch': '0.1067'}
  2%|███████                                                                                                                                                                                                                                                                                                                                  | 6/282 [01:10<51:05, 11.11s/it]  2%|████████▏                                                                                                                                                                                                                                                                                                                                | 7/282 [01:20<50:25, 11.00s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '1.091', 'grad_norm': '0.7115', 'learning_rate': '0.00015', 'ppl': '2.976', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.37', 'tokens/train_per_sec_per_gpu': '273.2', 'tokens/total': 458752, 'tokens/trainable': 66537, 'epoch': '0.1244'}
  2%|████████▏                                                                                                                                                                                                                                                                                                                                | 7/282 [01:20<50:25, 11.00s/it]  3%|█████████▎                                                                                                                                                                                                                                                                                                                               | 8/282 [01:31<49:56, 10.94s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '1.085', 'grad_norm': '0.4292', 'learning_rate': '0.000175', 'ppl': '2.96', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.37', 'tokens/train_per_sec_per_gpu': '233.1', 'tokens/total': 524288, 'tokens/trainable': 74850, 'epoch': '0.1422'}
  3%|█████████▎                                                                                                                                                                                                                                                                                                                               | 8/282 [01:31<49:56, 10.94s/it]  3%|██████████▌                                                                                                                                                                                                                                                                                                                              | 9/282 [01:42<49:34, 10.89s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '1.056', 'grad_norm': '0.4463', 'learning_rate': '0.0002', 'ppl': '2.875', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.37', 'tokens/train_per_sec_per_gpu': '351.5', 'tokens/total': 589824, 'tokens/trainable': 86999, 'epoch': '0.16'}
  3%|██████████▌                                                                                                                                                                                                                                                                                                                              | 9/282 [01:42<49:34, 10.89s/it]  4%|███████████▋                                                                                                                                                                                                                                                                                                                            | 10/282 [01:53<49:14, 10.86s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.9747', 'grad_norm': '0.4668', 'learning_rate': '0.0002', 'ppl': '2.65', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.37', 'tokens/train_per_sec_per_gpu': '253.7', 'tokens/total': 655360, 'tokens/trainable': 94732, 'epoch': '0.1778'}
  4%|███████████▋                                                                                                                                                                                                                                                                                                                            | 10/282 [01:53<49:14, 10.86s/it]  4%|████████████▊                                                                                                                                                                                                                                                                                                                           | 11/282 [02:04<48:59, 10.85s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.8451', 'grad_norm': '0.3904', 'learning_rate': '0.0002', 'ppl': '2.328', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.37', 'tokens/train_per_sec_per_gpu': '303.6', 'tokens/total': 720896, 'tokens/trainable': 104896, 'epoch': '0.1956'}
  4%|████████████▊                                                                                                                                                                                                                                                                                                                           | 11/282 [02:04<48:59, 10.85s/it]  4%|█████████████▉                                                                                                                                                                                                                                                                                                                          | 12/282 [02:14<48:44, 10.83s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.9493', 'grad_norm': '0.4925', 'learning_rate': '0.0001999', 'ppl': '2.584', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.37', 'tokens/train_per_sec_per_gpu': '163.7', 'tokens/total': 786432, 'tokens/trainable': 114016, 'epoch': '0.2133'}
  4%|█████████████▉                                                                                                                                                                                                                                                                                                                          | 12/282 [02:14<48:44, 10.83s/it]  5%|███████████████                                                                                                                                                                                                                                                                                                                         | 13/282 [02:25<48:30, 10.82s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.9384', 'grad_norm': '0.3978', 'learning_rate': '0.0001999', 'ppl': '2.556', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.37', 'tokens/train_per_sec_per_gpu': '138.1', 'tokens/total': 851968, 'tokens/trainable': 122210, 'epoch': '0.2311'}
  5%|███████████████                                                                                                                                                                                                                                                                                                                         | 13/282 [02:25<48:30, 10.82s/it]  5%|████████████████▎                                                                                                                                                                                                                                                                                                                       | 14/282 [02:36<48:17, 10.81s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.9422', 'grad_norm': '0.4624', 'learning_rate': '0.0001998', 'ppl': '2.566', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.37', 'tokens/train_per_sec_per_gpu': '245.5', 'tokens/total': 917504, 'tokens/trainable': 130965, 'epoch': '0.2489'}
  5%|████████████████▎                                                                                                                                                                                                                                                                                                                       | 14/282 [02:36<48:17, 10.81s/it]  5%|█████████████████▍                                                                                                                                                                                                                                                                                                                      | 15/282 [02:47<48:05, 10.81s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.9065', 'grad_norm': '0.3961', 'learning_rate': '0.0001998', 'ppl': '2.476', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.37', 'tokens/train_per_sec_per_gpu': '135.4', 'tokens/total': 983040, 'tokens/trainable': 139606, 'epoch': '0.2667'}
  5%|█████████████████▍                                                                                                                                                                                                                                                                                                                      | 15/282 [02:47<48:05, 10.81s/it]  6%|██████████████████▌                                                                                                                                                                                                                                                                                                                     | 16/282 [02:58<47:53, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.9192', 'grad_norm': '0.4016', 'learning_rate': '0.0001997', 'ppl': '2.507', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.37', 'tokens/train_per_sec_per_gpu': '179.1', 'tokens/total': 1048576, 'tokens/trainable': 149708, 'epoch': '0.2844'}
  6%|██████████████████▌                                                                                                                                                                                                                                                                                                                     | 16/282 [02:58<47:53, 10.80s/it]  6%|███████████████████▊                                                                                                                                                                                                                                                                                                                    | 17/282 [03:08<47:40, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.8277', 'grad_norm': '0.3831', 'learning_rate': '0.0001996', 'ppl': '2.288', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.37', 'tokens/train_per_sec_per_gpu': '174.8', 'tokens/total': 1114112, 'tokens/trainable': 156788, 'epoch': '0.3022'}
  6%|███████████████████▊                                                                                                                                                                                                                                                                                                                    | 17/282 [03:08<47:40, 10.79s/it]  6%|████████████████████▉                                                                                                                                                                                                                                                                                                                   | 18/282 [03:19<47:29, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.8358', 'grad_norm': '0.3577', 'learning_rate': '0.0001995', 'ppl': '2.307', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.37', 'tokens/train_per_sec_per_gpu': '200.9', 'tokens/total': 1179648, 'tokens/trainable': 165432, 'epoch': '0.32'}
  6%|████████████████████▉                                                                                                                                                                                                                                                                                                                   | 18/282 [03:19<47:29, 10.79s/it]  7%|██████████████████████                                                                                                                                                                                                                                                                                                                  | 19/282 [03:30<47:18, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.7292', 'grad_norm': '0.399', 'learning_rate': '0.0001993', 'ppl': '2.073', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.37', 'tokens/train_per_sec_per_gpu': '133.8', 'tokens/total': 1245184, 'tokens/trainable': 173968, 'epoch': '0.3378'}
  7%|██████████████████████                                                                                                                                                                                                                                                                                                                  | 19/282 [03:30<47:18, 10.79s/it]  7%|███████████████████████▎                                                                                                                                                                                                                                                                                                                | 20/282 [03:41<47:07, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.8019', 'grad_norm': '0.3612', 'learning_rate': '0.0001992', 'ppl': '2.23', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.37', 'tokens/train_per_sec_per_gpu': '226.9', 'tokens/total': 1310720, 'tokens/trainable': 183937, 'epoch': '0.3556'}
  7%|███████████████████████▎                                                                                                                                                                                                                                                                                                                | 20/282 [03:41<47:07, 10.79s/it]  7%|████████████████████████▍                                                                                                                                                                                                                                                                                                               | 21/282 [03:52<46:57, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.8269', 'grad_norm': '0.3396', 'learning_rate': '0.0001991', 'ppl': '2.286', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.37', 'tokens/train_per_sec_per_gpu': '165.7', 'tokens/total': 1376256, 'tokens/trainable': 192041, 'epoch': '0.3733'}
  7%|████████████████████████▍                                                                                                                                                                                                                                                                                                               | 21/282 [03:52<46:57, 10.79s/it]  8%|█████████████████████████▌                                                                                                                                                                                                                                                                                                              | 22/282 [04:02<46:46, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.7763', 'grad_norm': '0.408', 'learning_rate': '0.0001989', 'ppl': '2.173', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.37', 'tokens/train_per_sec_per_gpu': '199.4', 'tokens/total': 1441792, 'tokens/trainable': 199376, 'epoch': '0.3911'}
  8%|█████████████████████████▌                                                                                                                                                                                                                                                                                                              | 22/282 [04:02<46:46, 10.79s/it]  8%|██████████████████████████▊                                                                                                                                                                                                                                                                                                             | 23/282 [04:13<46:34, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.8297', 'grad_norm': '0.4304', 'learning_rate': '0.0001987', 'ppl': '2.293', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.37', 'tokens/train_per_sec_per_gpu': '110.4', 'tokens/total': 1507328, 'tokens/trainable': 208257, 'epoch': '0.4089'}
  8%|██████████████████████████▊                                                                                                                                                                                                                                                                                                             | 23/282 [04:13<46:34, 10.79s/it]  9%|███████████████████████████▉                                                                                                                                                                                                                                                                                                            | 24/282 [04:24<46:24, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.7301', 'grad_norm': '0.3443', 'learning_rate': '0.0001985', 'ppl': '2.075', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.37', 'tokens/train_per_sec_per_gpu': '243.6', 'tokens/total': 1572864, 'tokens/trainable': 217078, 'epoch': '0.4267'}
  9%|███████████████████████████▉                                                                                                                                                                                                                                                                                                            | 24/282 [04:24<46:24, 10.79s/it]  9%|█████████████████████████████                                                                                                                                                                                                                                                                                                           | 25/282 [04:35<46:12, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.7435', 'grad_norm': '0.3773', 'learning_rate': '0.0001983', 'ppl': '2.103', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.37', 'tokens/train_per_sec_per_gpu': '218.5', 'tokens/total': 1638400, 'tokens/trainable': 225289, 'epoch': '0.4444'}
  9%|█████████████████████████████                                                                                                                                                                                                                                                                                                           | 25/282 [04:35<46:12, 10.79s/it]  9%|██████████████████████████████▏                                                                                                                                                                                                                                                                                                         | 26/282 [04:46<46:02, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.7609', 'grad_norm': '0.3491', 'learning_rate': '0.0001981', 'ppl': '2.14', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.37', 'tokens/train_per_sec_per_gpu': '265.3', 'tokens/total': 1703936, 'tokens/trainable': 236042, 'epoch': '0.4622'}
  9%|██████████████████████████████▏                                                                                                                                                                                                                                                                                                         | 26/282 [04:46<46:02, 10.79s/it] 10%|███████████████████████████████▍                                                                                                                                                                                                                                                                                                        | 27/282 [04:56<45:51, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.692', 'grad_norm': '0.3742', 'learning_rate': '0.0001979', 'ppl': '1.998', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.37', 'tokens/train_per_sec_per_gpu': '181.9', 'tokens/total': 1769472, 'tokens/trainable': 245354, 'epoch': '0.48'}
 10%|███████████████████████████████▍                                                                                                                                                                                                                                                                                                        | 27/282 [04:56<45:51, 10.79s/it] 10%|████████████████████████████████▌                                                                                                                                                                                                                                                                                                       | 28/282 [05:07<45:40, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.7157', 'grad_norm': '0.5769', 'learning_rate': '0.0001976', 'ppl': '2.046', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.37', 'tokens/train_per_sec_per_gpu': '197.6', 'tokens/total': 1835008, 'tokens/trainable': 254439, 'epoch': '0.4978'}
 10%|████████████████████████████████▌                                                                                                                                                                                                                                                                                                       | 28/282 [05:07<45:40, 10.79s/it] 10%|█████████████████████████████████▋                                                                                                                                                                                                                                                                                                      | 29/282 [05:18<45:29, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.7411', 'grad_norm': '0.4154', 'learning_rate': '0.0001974', 'ppl': '2.098', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.38', 'tokens/train_per_sec_per_gpu': '192.4', 'tokens/total': 1900544, 'tokens/trainable': 263019, 'epoch': '0.5156'}
 10%|█████████████████████████████████▋                                                                                                                                                                                                                                                                                                      | 29/282 [05:18<45:29, 10.79s/it] 11%|██████████████████████████████████▉                                                                                                                                                                                                                                                                                                     | 30/282 [05:29<45:18, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.8077', 'grad_norm': '0.4476', 'learning_rate': '0.0001971', 'ppl': '2.243', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.38', 'tokens/train_per_sec_per_gpu': '230.1', 'tokens/total': 1966080, 'tokens/trainable': 270184, 'epoch': '0.5333'}
 11%|██████████████████████████████████▉                                                                                                                                                                                                                                                                                                     | 30/282 [05:29<45:18, 10.79s/it] 11%|████████████████████████████████████                                                                                                                                                                                                                                                                                                    | 31/282 [05:39<45:08, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.7156', 'grad_norm': '0.4493', 'learning_rate': '0.0001968', 'ppl': '2.045', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.38', 'tokens/train_per_sec_per_gpu': '261.8', 'tokens/total': 2031616, 'tokens/trainable': 278282, 'epoch': '0.5511'}
 11%|████████████████████████████████████                                                                                                                                                                                                                                                                                                    | 31/282 [05:39<45:08, 10.79s/it] 11%|█████████████████████████████████████▏                                                                                                                                                                                                                                                                                                  | 32/282 [05:50<44:58, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.5916', 'grad_norm': '0.4421', 'learning_rate': '0.0001965', 'ppl': '1.807', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.38', 'tokens/train_per_sec_per_gpu': '146.6', 'tokens/total': 2097152, 'tokens/trainable': 288922, 'epoch': '0.5689'}
 11%|█████████████████████████████████████▏                                                                                                                                                                                                                                                                                                  | 32/282 [05:50<44:58, 10.79s/it] 12%|██████████████████████████████████████▍                                                                                                                                                                                                                                                                                                 | 33/282 [06:01<44:47, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.713', 'grad_norm': '0.4049', 'learning_rate': '0.0001962', 'ppl': '2.04', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.38', 'tokens/train_per_sec_per_gpu': '173.2', 'tokens/total': 2162688, 'tokens/trainable': 296398, 'epoch': '0.5867'}
 12%|██████████████████████████████████████▍                                                                                                                                                                                                                                                                                                 | 33/282 [06:01<44:47, 10.79s/it] 12%|███████████████████████████████████████▌                                                                                                                                                                                                                                                                                                | 34/282 [06:12<44:35, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.6986', 'grad_norm': '0.3848', 'learning_rate': '0.0001959', 'ppl': '2.011', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.38', 'tokens/train_per_sec_per_gpu': '131.9', 'tokens/total': 2228224, 'tokens/trainable': 304219, 'epoch': '0.6044'}
 12%|███████████████████████████████████████▌                                                                                                                                                                                                                                                                                                | 34/282 [06:12<44:35, 10.79s/it] 12%|████████████████████████████████████████▋                                                                                                                                                                                                                                                                                               | 35/282 [06:23<44:24, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.6594', 'grad_norm': '0.3984', 'learning_rate': '0.0001956', 'ppl': '1.934', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.38', 'tokens/train_per_sec_per_gpu': '250.2', 'tokens/total': 2293760, 'tokens/trainable': 312562, 'epoch': '0.6222'}
 12%|████████████████████████████████████████▋                                                                                                                                                                                                                                                                                               | 35/282 [06:23<44:24, 10.79s/it] 13%|█████████████████████████████████████████▊                                                                                                                                                                                                                                                                                              | 36/282 [06:33<44:15, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.6835', 'grad_norm': '0.3784', 'learning_rate': '0.0001952', 'ppl': '1.981', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.38', 'tokens/train_per_sec_per_gpu': '218.5', 'tokens/total': 2359296, 'tokens/trainable': 322188, 'epoch': '0.64'}
 13%|█████████████████████████████████████████▊                                                                                                                                                                                                                                                                                              | 36/282 [06:33<44:15, 10.79s/it] 13%|███████████████████████████████████████████                                                                                                                                                                                                                                                                                             | 37/282 [06:44<44:04, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.683', 'grad_norm': '0.413', 'learning_rate': '0.0001949', 'ppl': '1.98', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.38', 'tokens/train_per_sec_per_gpu': '231.4', 'tokens/total': 2424832, 'tokens/trainable': 330786, 'epoch': '0.6578'}
 13%|███████████████████████████████████████████                                                                                                                                                                                                                                                                                             | 37/282 [06:44<44:04, 10.79s/it] 13%|████████████████████████████████████████████▏                                                                                                                                                                                                                                                                                           | 38/282 [06:55<43:54, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.6736', 'grad_norm': '0.3726', 'learning_rate': '0.0001945', 'ppl': '1.961', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.38', 'tokens/train_per_sec_per_gpu': '222.6', 'tokens/total': 2490368, 'tokens/trainable': 342308, 'epoch': '0.6756'}
 13%|████████████████████████████████████████████▏                                                                                                                                                                                                                                                                                           | 38/282 [06:55<43:54, 10.80s/it] 14%|█████████████████████████████████████████████▎                                                                                                                                                                                                                                                                                          | 39/282 [07:06<43:43, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.5924', 'grad_norm': '0.362', 'learning_rate': '0.0001941', 'ppl': '1.808', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.38', 'tokens/train_per_sec_per_gpu': '144.8', 'tokens/total': 2555904, 'tokens/trainable': 350585, 'epoch': '0.6933'}
 14%|█████████████████████████████████████████████▎                                                                                                                                                                                                                                                                                          | 39/282 [07:06<43:43, 10.79s/it] 14%|██████████████████████████████████████████████▌                                                                                                                                                                                                                                                                                         | 40/282 [07:17<43:32, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.6718', 'grad_norm': '0.3995', 'learning_rate': '0.0001937', 'ppl': '1.958', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.38', 'tokens/train_per_sec_per_gpu': '170.2', 'tokens/total': 2621440, 'tokens/trainable': 359728, 'epoch': '0.7111'}
 14%|██████████████████████████████████████████████▌                                                                                                                                                                                                                                                                                         | 40/282 [07:17<43:32, 10.79s/it] 15%|███████████████████████████████████████████████▋                                                                                                                                                                                                                                                                                        | 41/282 [07:27<43:21, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.7395', 'grad_norm': '0.5009', 'learning_rate': '0.0001933', 'ppl': '2.095', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.38', 'tokens/train_per_sec_per_gpu': '105.5', 'tokens/total': 2686976, 'tokens/trainable': 366017, 'epoch': '0.7289'}
 15%|███████████████████████████████████████████████▋                                                                                                                                                                                                                                                                                        | 41/282 [07:27<43:21, 10.80s/it] 15%|████████████████████████████████████████████████▊                                                                                                                                                                                                                                                                                       | 42/282 [07:38<43:10, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.5718', 'grad_norm': '0.4266', 'learning_rate': '0.0001929', 'ppl': '1.771', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.38', 'tokens/train_per_sec_per_gpu': '144.1', 'tokens/total': 2752512, 'tokens/trainable': 375083, 'epoch': '0.7467'}
 15%|████████████████████████████████████████████████▊                                                                                                                                                                                                                                                                                       | 42/282 [07:38<43:10, 10.79s/it] 15%|██████████████████████████████████████████████████                                                                                                                                                                                                                                                                                      | 43/282 [07:49<43:00, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.6304', 'grad_norm': '0.3731', 'learning_rate': '0.0001925', 'ppl': '1.878', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.38', 'tokens/train_per_sec_per_gpu': '250.1', 'tokens/total': 2818048, 'tokens/trainable': 384879, 'epoch': '0.7644'}
 15%|██████████████████████████████████████████████████                                                                                                                                                                                                                                                                                      | 43/282 [07:49<43:00, 10.80s/it] 16%|███████████████████████████████████████████████████▏                                                                                                                                                                                                                                                                                    | 44/282 [08:00<42:49, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.6228', 'grad_norm': '0.3691', 'learning_rate': '0.0001921', 'ppl': '1.864', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.38', 'tokens/train_per_sec_per_gpu': '300.9', 'tokens/total': 2883584, 'tokens/trainable': 394266, 'epoch': '0.7822'}
 16%|███████████████████████████████████████████████████▏                                                                                                                                                                                                                                                                                    | 44/282 [08:00<42:49, 10.80s/it] 16%|████████████████████████████████████████████████████▎                                                                                                                                                                                                                                                                                   | 45/282 [08:11<42:37, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.6747', 'grad_norm': '0.4154', 'learning_rate': '0.0001916', 'ppl': '1.963', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.38', 'tokens/train_per_sec_per_gpu': '201.1', 'tokens/total': 2949120, 'tokens/trainable': 402492, 'epoch': '0.8'}
 16%|████████████████████████████████████████████████████▎                                                                                                                                                                                                                                                                                   | 45/282 [08:11<42:37, 10.79s/it] 16%|█████████████████████████████████████████████████████▌                                                                                                                                                                                                                                                                                  | 46/282 [08:21<42:26, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.5739', 'grad_norm': '0.3963', 'learning_rate': '0.0001911', 'ppl': '1.775', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.38', 'tokens/train_per_sec_per_gpu': '288', 'tokens/total': 3014656, 'tokens/trainable': 411758, 'epoch': '0.8178'}
 16%|█████████████████████████████████████████████████████▌                                                                                                                                                                                                                                                                                  | 46/282 [08:21<42:26, 10.79s/it] 17%|██████████████████████████████████████████████████████▋                                                                                                                                                                                                                                                                                 | 47/282 [08:32<42:15, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.5996', 'grad_norm': '0.4424', 'learning_rate': '0.0001907', 'ppl': '1.821', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.38', 'tokens/train_per_sec_per_gpu': '166.9', 'tokens/total': 3080192, 'tokens/trainable': 418988, 'epoch': '0.8356'}
 17%|██████████████████████████████████████████████████████▋                                                                                                                                                                                                                                                                                 | 47/282 [08:32<42:15, 10.79s/it] 17%|███████████████████████████████████████████████████████▊                                                                                                                                                                                                                                                                                | 48/282 [08:43<42:04, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.5556', 'grad_norm': '0.4682', 'learning_rate': '0.0001902', 'ppl': '1.743', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.38', 'tokens/train_per_sec_per_gpu': '338.3', 'tokens/total': 3145728, 'tokens/trainable': 426983, 'epoch': '0.8533'}
 17%|███████████████████████████████████████████████████████▊                                                                                                                                                                                                                                                                                | 48/282 [08:43<42:04, 10.79s/it] 17%|████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                                                               | 49/282 [08:54<41:53, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.6205', 'grad_norm': '0.5343', 'learning_rate': '0.0001897', 'ppl': '1.86', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.38', 'tokens/train_per_sec_per_gpu': '156.4', 'tokens/total': 3211264, 'tokens/trainable': 435620, 'epoch': '0.8711'}
 17%|████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                                                               | 49/282 [08:54<41:53, 10.79s/it] 18%|██████████████████████████████████████████████████████████▏                                                                                                                                                                                                                                                                             | 50/282 [09:05<41:44, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.5215', 'grad_norm': '1.571', 'learning_rate': '0.0001892', 'ppl': '1.684', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.38', 'tokens/train_per_sec_per_gpu': '324.5', 'tokens/total': 3276800, 'tokens/trainable': 446287, 'epoch': '0.8889'}
 18%|██████████████████████████████████████████████████████████▏                                                                                                                                                                                                                                                                             | 50/282 [09:05<41:44, 10.79s/it] 18%|███████████████████████████████████████████████████████████▎                                                                                                                                                                                                                                                                            | 51/282 [09:15<41:33, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.5073', 'grad_norm': '0.4135', 'learning_rate': '0.0001886', 'ppl': '1.661', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.38', 'tokens/train_per_sec_per_gpu': '327.5', 'tokens/total': 3342336, 'tokens/trainable': 456138, 'epoch': '0.9067'}
 18%|███████████████████████████████████████████████████████████▎                                                                                                                                                                                                                                                                            | 51/282 [09:15<41:33, 10.79s/it] 18%|████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                                                           | 52/282 [09:26<41:22, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.5532', 'grad_norm': '0.5819', 'learning_rate': '0.0001881', 'ppl': '1.739', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.38', 'tokens/train_per_sec_per_gpu': '296.7', 'tokens/total': 3407872, 'tokens/trainable': 465829, 'epoch': '0.9244'}
 18%|████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                                                           | 52/282 [09:26<41:22, 10.79s/it] 19%|█████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                                                          | 53/282 [09:37<41:11, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.4092', 'grad_norm': '0.3692', 'learning_rate': '0.0001875', 'ppl': '1.506', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.38', 'tokens/train_per_sec_per_gpu': '302.7', 'tokens/total': 3473408, 'tokens/trainable': 475701, 'epoch': '0.9422'}
 19%|█████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                                                          | 53/282 [09:37<41:11, 10.79s/it] 19%|██████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                                                                         | 54/282 [09:48<41:00, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.5487', 'grad_norm': '0.481', 'learning_rate': '0.000187', 'ppl': '1.731', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.38', 'tokens/train_per_sec_per_gpu': '140.3', 'tokens/total': 3538944, 'tokens/trainable': 484259, 'epoch': '0.96'}
 19%|██████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                                                                         | 54/282 [09:48<41:00, 10.79s/it] 20%|███████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                                                        | 55/282 [09:59<40:49, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.5397', 'grad_norm': '0.6582', 'learning_rate': '0.0001864', 'ppl': '1.716', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.38', 'tokens/train_per_sec_per_gpu': '138.1', 'tokens/total': 3604480, 'tokens/trainable': 492739, 'epoch': '0.9778'}
 20%|███████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                                                        | 55/282 [09:59<40:49, 10.79s/it] 20%|█████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                                                                                      | 56/282 [10:09<40:39, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.5022', 'grad_norm': '0.4311', 'learning_rate': '0.0001858', 'ppl': '1.652', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.38', 'tokens/train_per_sec_per_gpu': '150.6', 'tokens/total': 3670016, 'tokens/trainable': 503052, 'epoch': '0.9956'}
 20%|█████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                                                                                      | 56/282 [10:09<40:39, 10.79s/it] 20%|██████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                                                                     | 57/282 [10:12<31:26,  8.39s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.4058', 'grad_norm': '0.6347', 'learning_rate': '0.0001852', 'ppl': '1.5', 'memory/max_active (GiB)': '37.3', 'memory/max_allocated (GiB)': '37.3', 'memory/device_reserved (GiB)': '38.38', 'tokens/train_per_sec_per_gpu': '1379', 'tokens/total': 3686400, 'tokens/trainable': 506796, 'epoch': '1'}
 20%|██████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                                                                     | 57/282 [10:12<31:26,  8.39s/it][2026-06-23 12:14:41,431] [INFO] [axolotl.core.trainers.base._save:828] [PID:2077] Saving model checkpoint to ./out/learned/checkpoint-57
 21%|███████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                                                    | 58/282 [10:27<39:11, 10.50s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.402', 'grad_norm': '0.441', 'learning_rate': '0.0001846', 'ppl': '1.495', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.38', 'tokens/train_per_sec_per_gpu': '192.2', 'tokens/total': 3751936, 'tokens/trainable': 514703, 'epoch': '1.018'}
 21%|███████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                                                    | 58/282 [10:27<39:11, 10.50s/it] 21%|████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                                                                                   | 59/282 [10:38<39:21, 10.59s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.428', 'grad_norm': '0.3898', 'learning_rate': '0.000184', 'ppl': '1.534', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.41', 'tokens/train_per_sec_per_gpu': '172.6', 'tokens/total': 3817472, 'tokens/trainable': 524013, 'epoch': '1.036'}
 21%|████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                                                                                   | 59/282 [10:38<39:21, 10.59s/it] 21%|█████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                                                                  | 60/282 [10:49<39:24, 10.65s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.4127', 'grad_norm': '0.6502', 'learning_rate': '0.0001834', 'ppl': '1.511', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.42', 'tokens/train_per_sec_per_gpu': '256.6', 'tokens/total': 3883008, 'tokens/trainable': 533529, 'epoch': '1.053'}
 21%|█████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                                                                  | 60/282 [10:49<39:24, 10.65s/it] 22%|██████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                                                 | 61/282 [11:00<39:23, 10.70s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.3877', 'grad_norm': '1.029', 'learning_rate': '0.0001827', 'ppl': '1.474', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.42', 'tokens/train_per_sec_per_gpu': '350.4', 'tokens/total': 3948544, 'tokens/trainable': 544034, 'epoch': '1.071'}
 22%|██████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                                                 | 61/282 [11:00<39:23, 10.70s/it] 22%|████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                                                                | 62/282 [11:11<39:19, 10.73s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.3805', 'grad_norm': '0.4456', 'learning_rate': '0.0001821', 'ppl': '1.463', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '188', 'tokens/total': 4014080, 'tokens/trainable': 550814, 'epoch': '1.089'}
 22%|████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                                                                | 62/282 [11:11<39:19, 10.73s/it] 22%|█████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                                                              | 63/282 [11:21<39:12, 10.74s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.4165', 'grad_norm': '0.4913', 'learning_rate': '0.0001814', 'ppl': '1.517', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '148.5', 'tokens/total': 4079616, 'tokens/trainable': 558144, 'epoch': '1.107'}
 22%|█████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                                                              | 63/282 [11:21<39:12, 10.74s/it] 23%|██████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                                             | 64/282 [11:32<39:05, 10.76s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.3582', 'grad_norm': '0.4214', 'learning_rate': '0.0001808', 'ppl': '1.431', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '225.2', 'tokens/total': 4145152, 'tokens/trainable': 567491, 'epoch': '1.124'}
 23%|██████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                                             | 64/282 [11:32<39:05, 10.76s/it] 23%|███████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                                                                            | 65/282 [11:43<38:57, 10.77s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.4296', 'grad_norm': '1.775', 'learning_rate': '0.0001801', 'ppl': '1.537', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '209.7', 'tokens/total': 4210688, 'tokens/trainable': 577992, 'epoch': '1.142'}
 23%|███████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                                                                            | 65/282 [11:43<38:57, 10.77s/it] 23%|████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                                                           | 66/282 [11:54<38:47, 10.78s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.4283', 'grad_norm': '0.5105', 'learning_rate': '0.0001794', 'ppl': '1.535', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '153.9', 'tokens/total': 4276224, 'tokens/trainable': 586589, 'epoch': '1.16'}
 23%|████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                                                           | 66/282 [11:54<38:47, 10.78s/it] 24%|█████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                                          | 67/282 [12:05<38:37, 10.78s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.3446', 'grad_norm': '0.4216', 'learning_rate': '0.0001787', 'ppl': '1.411', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '195.6', 'tokens/total': 4341760, 'tokens/trainable': 594156, 'epoch': '1.178'}
 24%|█████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                                          | 67/282 [12:05<38:37, 10.78s/it] 24%|███████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                                                         | 68/282 [12:15<38:28, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.321', 'grad_norm': '0.4286', 'learning_rate': '0.000178', 'ppl': '1.378', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '357.4', 'tokens/total': 4407296, 'tokens/trainable': 605254, 'epoch': '1.196'}
 24%|███████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                                                         | 68/282 [12:15<38:28, 10.79s/it] 24%|████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                                                       | 69/282 [12:26<38:19, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.3316', 'grad_norm': '0.4897', 'learning_rate': '0.0001773', 'ppl': '1.393', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '210.7', 'tokens/total': 4472832, 'tokens/trainable': 615245, 'epoch': '1.213'}
 24%|████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                                                       | 69/282 [12:26<38:19, 10.79s/it] 25%|█████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                                      | 70/282 [12:37<38:08, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.4164', 'grad_norm': '0.5754', 'learning_rate': '0.0001765', 'ppl': '1.516', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '335.8', 'tokens/total': 4538368, 'tokens/trainable': 624026, 'epoch': '1.231'}
 25%|█████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                                      | 70/282 [12:37<38:08, 10.80s/it] 25%|██████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                                                                     | 71/282 [12:48<37:57, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.3632', 'grad_norm': '0.4438', 'learning_rate': '0.0001758', 'ppl': '1.438', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '175.7', 'tokens/total': 4603904, 'tokens/trainable': 632737, 'epoch': '1.249'}
 25%|██████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                                                                     | 71/282 [12:48<37:57, 10.79s/it] 26%|███████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                                    | 72/282 [12:59<37:47, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.3162', 'grad_norm': '0.3642', 'learning_rate': '0.000175', 'ppl': '1.372', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '147.7', 'tokens/total': 4669440, 'tokens/trainable': 642832, 'epoch': '1.267'}
 26%|███████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                                    | 72/282 [12:59<37:47, 10.80s/it] 26%|████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                                   | 73/282 [13:09<37:36, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.3416', 'grad_norm': '0.3922', 'learning_rate': '0.0001743', 'ppl': '1.407', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '241.6', 'tokens/total': 4734976, 'tokens/trainable': 654526, 'epoch': '1.284'}
 26%|████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                                   | 73/282 [13:09<37:36, 10.80s/it] 26%|██████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                                                  | 74/282 [13:20<37:25, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.2999', 'grad_norm': '0.4886', 'learning_rate': '0.0001735', 'ppl': '1.35', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '308.7', 'tokens/total': 4800512, 'tokens/trainable': 663833, 'epoch': '1.302'}
 26%|██████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                                                  | 74/282 [13:20<37:25, 10.79s/it] 27%|███████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                                                                | 75/282 [13:31<37:13, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.3363', 'grad_norm': '0.5305', 'learning_rate': '0.0001727', 'ppl': '1.4', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '180.7', 'tokens/total': 4866048, 'tokens/trainable': 669822, 'epoch': '1.32'}
 27%|███████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                                                                | 75/282 [13:31<37:13, 10.79s/it] 27%|████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                               | 76/282 [13:42<37:03, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.3892', 'grad_norm': '0.5095', 'learning_rate': '0.0001719', 'ppl': '1.476', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '324.4', 'tokens/total': 4931584, 'tokens/trainable': 678469, 'epoch': '1.338'}
 27%|████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                               | 76/282 [13:42<37:03, 10.79s/it] 27%|█████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                                                              | 77/282 [13:53<36:52, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.3566', 'grad_norm': '0.581', 'learning_rate': '0.0001711', 'ppl': '1.429', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '241.8', 'tokens/total': 4997120, 'tokens/trainable': 686691, 'epoch': '1.356'}
 27%|█████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                                                              | 77/282 [13:53<36:52, 10.79s/it] 28%|██████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                             | 78/282 [14:03<36:42, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.3465', 'grad_norm': '0.4772', 'learning_rate': '0.0001703', 'ppl': '1.414', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '217.5', 'tokens/total': 5062656, 'tokens/trainable': 695039, 'epoch': '1.373'}
 28%|██████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                             | 78/282 [14:03<36:42, 10.80s/it] 28%|███████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                            | 79/282 [14:14<36:31, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.3678', 'grad_norm': '0.4533', 'learning_rate': '0.0001695', 'ppl': '1.445', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '220.6', 'tokens/total': 5128192, 'tokens/trainable': 703747, 'epoch': '1.391'}
 28%|███████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                                            | 79/282 [14:14<36:31, 10.80s/it] 28%|█████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                                           | 80/282 [14:25<36:20, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.3747', 'grad_norm': '0.4183', 'learning_rate': '0.0001687', 'ppl': '1.455', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '183.4', 'tokens/total': 5193728, 'tokens/trainable': 712262, 'epoch': '1.409'}
 28%|█████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                                           | 80/282 [14:25<36:20, 10.80s/it] 29%|██████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                                                         | 81/282 [14:36<36:09, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.3242', 'grad_norm': '0.4431', 'learning_rate': '0.0001678', 'ppl': '1.383', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '220.2', 'tokens/total': 5259264, 'tokens/trainable': 720100, 'epoch': '1.427'}
 29%|██████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                                                         | 81/282 [14:36<36:09, 10.80s/it] 29%|███████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                        | 82/282 [14:47<35:58, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.3399', 'grad_norm': '0.4055', 'learning_rate': '0.000167', 'ppl': '1.405', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '178.9', 'tokens/total': 5324800, 'tokens/trainable': 728533, 'epoch': '1.444'}
 29%|███████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                                        | 82/282 [14:47<35:58, 10.79s/it] 29%|████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                                                       | 83/282 [14:57<35:48, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.3413', 'grad_norm': '0.4431', 'learning_rate': '0.0001661', 'ppl': '1.407', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '180.2', 'tokens/total': 5390336, 'tokens/trainable': 739551, 'epoch': '1.462'}
 29%|████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                                                       | 83/282 [14:57<35:48, 10.80s/it] 30%|█████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                      | 84/282 [15:08<35:37, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.2997', 'grad_norm': '0.4298', 'learning_rate': '0.0001652', 'ppl': '1.35', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '238.9', 'tokens/total': 5455872, 'tokens/trainable': 748505, 'epoch': '1.48'}
 30%|█████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                                      | 84/282 [15:08<35:37, 10.80s/it] 30%|██████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                                     | 85/282 [15:19<35:26, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.2974', 'grad_norm': '0.4838', 'learning_rate': '0.0001644', 'ppl': '1.346', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '282.9', 'tokens/total': 5521408, 'tokens/trainable': 758709, 'epoch': '1.498'}
 30%|██████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                                     | 85/282 [15:19<35:26, 10.80s/it] 30%|████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                                    | 86/282 [15:30<35:15, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.3022', 'grad_norm': '0.4824', 'learning_rate': '0.0001635', 'ppl': '1.353', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '119.3', 'tokens/total': 5586944, 'tokens/trainable': 767351, 'epoch': '1.516'}
 30%|████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                                    | 86/282 [15:30<35:15, 10.79s/it] 31%|█████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                                                  | 87/282 [15:41<35:05, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.3074', 'grad_norm': '0.5292', 'learning_rate': '0.0001626', 'ppl': '1.36', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '265.3', 'tokens/total': 5652480, 'tokens/trainable': 775649, 'epoch': '1.533'}
 31%|█████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                                                  | 87/282 [15:41<35:05, 10.80s/it] 31%|██████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                                 | 88/282 [15:51<34:53, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.2396', 'grad_norm': '0.4157', 'learning_rate': '0.0001617', 'ppl': '1.271', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '207', 'tokens/total': 5718016, 'tokens/trainable': 782799, 'epoch': '1.551'}
 31%|██████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                                 | 88/282 [15:51<34:53, 10.79s/it] 32%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                                                | 89/282 [16:02<34:43, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.2734', 'grad_norm': '0.4625', 'learning_rate': '0.0001608', 'ppl': '1.314', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '278.7', 'tokens/total': 5783552, 'tokens/trainable': 792206, 'epoch': '1.569'}
 32%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                                                | 89/282 [16:02<34:43, 10.79s/it] 32%|████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                               | 90/282 [16:13<34:32, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.25', 'grad_norm': '0.3943', 'learning_rate': '0.0001599', 'ppl': '1.284', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '406.8', 'tokens/total': 5849088, 'tokens/trainable': 803473, 'epoch': '1.587'}
 32%|████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                               | 90/282 [16:13<34:32, 10.80s/it] 32%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                              | 91/282 [16:24<34:21, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.2501', 'grad_norm': '0.44', 'learning_rate': '0.000159', 'ppl': '1.284', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '120.5', 'tokens/total': 5914624, 'tokens/trainable': 812149, 'epoch': '1.604'}
 32%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                              | 91/282 [16:24<34:21, 10.79s/it] 33%|███████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                             | 92/282 [16:35<34:10, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.24', 'grad_norm': '0.3757', 'learning_rate': '0.000158', 'ppl': '1.271', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '212.9', 'tokens/total': 5980160, 'tokens/trainable': 821133, 'epoch': '1.622'}
 33%|███████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                             | 92/282 [16:35<34:10, 10.79s/it] 33%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                                           | 93/282 [16:45<34:00, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.2251', 'grad_norm': '0.3665', 'learning_rate': '0.0001571', 'ppl': '1.252', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '118.7', 'tokens/total': 6045696, 'tokens/trainable': 830407, 'epoch': '1.64'}
 33%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                                           | 93/282 [16:45<34:00, 10.79s/it] 33%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                          | 94/282 [16:56<33:49, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.2874', 'grad_norm': '0.4065', 'learning_rate': '0.0001562', 'ppl': '1.333', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '149.7', 'tokens/total': 6111232, 'tokens/trainable': 838926, 'epoch': '1.658'}
 33%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                          | 94/282 [16:56<33:49, 10.79s/it] 34%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                         | 95/282 [17:07<33:40, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.2395', 'grad_norm': '0.431', 'learning_rate': '0.0001552', 'ppl': '1.271', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '217', 'tokens/total': 6176768, 'tokens/trainable': 848990, 'epoch': '1.676'}
 34%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                                         | 95/282 [17:07<33:40, 10.80s/it] 34%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                        | 96/282 [17:18<33:28, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.205', 'grad_norm': '0.4521', 'learning_rate': '0.0001542', 'ppl': '1.228', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '123.4', 'tokens/total': 6242304, 'tokens/trainable': 856818, 'epoch': '1.693'}
 34%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                                        | 96/282 [17:18<33:28, 10.80s/it] 34%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                       | 97/282 [17:29<33:16, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.3126', 'grad_norm': '0.5698', 'learning_rate': '0.0001533', 'ppl': '1.367', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '211.9', 'tokens/total': 6307840, 'tokens/trainable': 864550, 'epoch': '1.711'}
 34%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                                       | 97/282 [17:29<33:16, 10.79s/it] 35%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                      | 98/282 [17:39<33:06, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.2802', 'grad_norm': '0.5712', 'learning_rate': '0.0001523', 'ppl': '1.323', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '127.3', 'tokens/total': 6373376, 'tokens/trainable': 872869, 'epoch': '1.729'}
 35%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                      | 98/282 [17:39<33:06, 10.79s/it] 35%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                                    | 99/282 [17:50<32:55, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.2663', 'grad_norm': '0.5703', 'learning_rate': '0.0001513', 'ppl': '1.305', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '128.7', 'tokens/total': 6438912, 'tokens/trainable': 881895, 'epoch': '1.747'}
 35%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                                    | 99/282 [17:50<32:55, 10.79s/it] 35%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                   | 100/282 [18:01<32:44, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.2464', 'grad_norm': '0.3975', 'learning_rate': '0.0001503', 'ppl': '1.279', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '273.8', 'tokens/total': 6504448, 'tokens/trainable': 891773, 'epoch': '1.764'}
 35%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                                   | 100/282 [18:01<32:44, 10.80s/it] 36%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                  | 101/282 [18:12<32:34, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.2755', 'grad_norm': '0.4582', 'learning_rate': '0.0001493', 'ppl': '1.317', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '193.9', 'tokens/total': 6569984, 'tokens/trainable': 900953, 'epoch': '1.782'}
 36%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                                  | 101/282 [18:12<32:34, 10.80s/it] 36%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                | 102/282 [18:23<32:22, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.2806', 'grad_norm': '0.5262', 'learning_rate': '0.0001483', 'ppl': '1.324', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '198.8', 'tokens/total': 6635520, 'tokens/trainable': 909369, 'epoch': '1.8'}
 36%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                                | 102/282 [18:23<32:22, 10.79s/it] 37%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                               | 103/282 [18:33<32:12, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.2408', 'grad_norm': '0.3756', 'learning_rate': '0.0001473', 'ppl': '1.272', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '216', 'tokens/total': 6701056, 'tokens/trainable': 919530, 'epoch': '1.818'}
 37%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                               | 103/282 [18:33<32:12, 10.79s/it] 37%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                              | 104/282 [18:44<32:01, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.2316', 'grad_norm': '0.4064', 'learning_rate': '0.0001463', 'ppl': '1.261', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '225.5', 'tokens/total': 6766592, 'tokens/trainable': 928257, 'epoch': '1.836'}
 37%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                              | 104/282 [18:44<32:01, 10.80s/it] 37%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                             | 105/282 [18:55<31:50, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.2198', 'grad_norm': '0.3972', 'learning_rate': '0.0001453', 'ppl': '1.246', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '106.9', 'tokens/total': 6832128, 'tokens/trainable': 937586, 'epoch': '1.853'}
 37%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                             | 105/282 [18:55<31:50, 10.80s/it] 38%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                            | 106/282 [19:06<31:39, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.243', 'grad_norm': '0.4221', 'learning_rate': '0.0001443', 'ppl': '1.275', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '177.2', 'tokens/total': 6897664, 'tokens/trainable': 946502, 'epoch': '1.871'}
 38%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                            | 106/282 [19:06<31:39, 10.80s/it] 38%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                           | 107/282 [19:16<31:29, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.2432', 'grad_norm': '0.4412', 'learning_rate': '0.0001432', 'ppl': '1.275', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '228.2', 'tokens/total': 6963200, 'tokens/trainable': 953881, 'epoch': '1.889'}
 38%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                           | 107/282 [19:16<31:29, 10.79s/it] 38%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                         | 108/282 [19:27<31:18, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.2189', 'grad_norm': '0.4177', 'learning_rate': '0.0001422', 'ppl': '1.245', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '258.4', 'tokens/total': 7028736, 'tokens/trainable': 963313, 'epoch': '1.907'}
 38%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                         | 108/282 [19:27<31:18, 10.79s/it] 39%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                        | 109/282 [19:38<31:07, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.2277', 'grad_norm': '0.4547', 'learning_rate': '0.0001412', 'ppl': '1.256', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '309.7', 'tokens/total': 7094272, 'tokens/trainable': 971098, 'epoch': '1.924'}
 39%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                                        | 109/282 [19:38<31:07, 10.79s/it] 39%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                       | 110/282 [19:49<30:56, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.2539', 'grad_norm': '0.4283', 'learning_rate': '0.0001401', 'ppl': '1.289', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '279.1', 'tokens/total': 7159808, 'tokens/trainable': 980867, 'epoch': '1.942'}
 39%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                       | 110/282 [19:49<30:56, 10.79s/it] 39%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                      | 111/282 [20:00<30:46, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.2816', 'grad_norm': '0.5904', 'learning_rate': '0.0001391', 'ppl': '1.325', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '411.8', 'tokens/total': 7225344, 'tokens/trainable': 991499, 'epoch': '1.96'}
 39%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                                      | 111/282 [20:00<30:46, 10.80s/it] 40%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                     | 112/282 [20:10<30:34, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.2308', 'grad_norm': '0.474', 'learning_rate': '0.000138', 'ppl': '1.26', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '179.7', 'tokens/total': 7290880, 'tokens/trainable': 1001033, 'epoch': '1.978'}
 40%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                     | 112/282 [20:10<30:34, 10.79s/it] 40%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                    | 113/282 [20:21<30:24, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.2183', 'grad_norm': '0.388', 'learning_rate': '0.0001369', 'ppl': '1.244', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '122.1', 'tokens/total': 7356416, 'tokens/trainable': 1011339, 'epoch': '1.996'}
 40%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                                    | 113/282 [20:21<30:24, 10.79s/it] 40%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                  | 114/282 [20:24<23:28,  8.39s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1701', 'grad_norm': '0.6718', 'learning_rate': '0.0001359', 'ppl': '1.185', 'memory/max_active (GiB)': '37.3', 'memory/max_allocated (GiB)': '37.3', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '828.5', 'tokens/total': 7372800, 'tokens/trainable': 1013592, 'epoch': '2'}
 40%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                                  | 114/282 [20:24<23:28,  8.39s/it][2026-06-23 12:24:53,374] [INFO] [axolotl.core.trainers.base._save:828] [PID:2077] Saving model checkpoint to ./out/learned/checkpoint-114
 41%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                 | 115/282 [20:39<28:57, 10.40s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.211', 'grad_norm': '0.3541', 'learning_rate': '0.0001348', 'ppl': '1.235', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '259.4', 'tokens/total': 7438336, 'tokens/trainable': 1021495, 'epoch': '2.018'}
 41%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                                 | 115/282 [20:39<28:57, 10.40s/it] 41%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                | 116/282 [20:50<29:06, 10.52s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1848', 'grad_norm': '0.3741', 'learning_rate': '0.0001337', 'ppl': '1.203', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.51', 'tokens/train_per_sec_per_gpu': '181.9', 'tokens/total': 7503872, 'tokens/trainable': 1032544, 'epoch': '2.036'}
 41%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                | 116/282 [20:50<29:06, 10.52s/it] 41%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                               | 117/282 [21:01<29:10, 10.61s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1806', 'grad_norm': '0.4137', 'learning_rate': '0.0001326', 'ppl': '1.198', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.51', 'tokens/train_per_sec_per_gpu': '334.7', 'tokens/total': 7569408, 'tokens/trainable': 1043614, 'epoch': '2.053'}
 41%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                               | 117/282 [21:01<29:10, 10.61s/it] 42%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                              | 118/282 [21:12<29:08, 10.66s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1678', 'grad_norm': '0.4339', 'learning_rate': '0.0001316', 'ppl': '1.183', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.51', 'tokens/train_per_sec_per_gpu': '177', 'tokens/total': 7634944, 'tokens/trainable': 1051870, 'epoch': '2.071'}
 42%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                              | 118/282 [21:12<29:08, 10.66s/it] 42%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                             | 119/282 [21:22<29:04, 10.70s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1977', 'grad_norm': '0.6372', 'learning_rate': '0.0001305', 'ppl': '1.219', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.51', 'tokens/train_per_sec_per_gpu': '136', 'tokens/total': 7700480, 'tokens/trainable': 1058233, 'epoch': '2.089'}
 42%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                             | 119/282 [21:22<29:04, 10.70s/it] 43%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                           | 120/282 [21:33<28:58, 10.73s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.184', 'grad_norm': '0.4493', 'learning_rate': '0.0001294', 'ppl': '1.202', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.51', 'tokens/train_per_sec_per_gpu': '243.7', 'tokens/total': 7766016, 'tokens/trainable': 1067950, 'epoch': '2.107'}
 43%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                                           | 120/282 [21:33<28:58, 10.73s/it] 43%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                          | 121/282 [21:44<28:51, 10.75s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.2083', 'grad_norm': '0.3846', 'learning_rate': '0.0001283', 'ppl': '1.232', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.51', 'tokens/train_per_sec_per_gpu': '273.1', 'tokens/total': 7831552, 'tokens/trainable': 1076823, 'epoch': '2.124'}
 43%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                          | 121/282 [21:44<28:51, 10.75s/it] 43%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                         | 122/282 [21:55<28:43, 10.77s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1876', 'grad_norm': '0.3895', 'learning_rate': '0.0001272', 'ppl': '1.206', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.51', 'tokens/train_per_sec_per_gpu': '374.1', 'tokens/total': 7897088, 'tokens/trainable': 1088695, 'epoch': '2.142'}
 43%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                         | 122/282 [21:55<28:43, 10.77s/it] 44%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                        | 123/282 [22:06<28:33, 10.77s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1934', 'grad_norm': '0.3638', 'learning_rate': '0.0001261', 'ppl': '1.213', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.51', 'tokens/train_per_sec_per_gpu': '156.5', 'tokens/total': 7962624, 'tokens/trainable': 1095630, 'epoch': '2.16'}
 44%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                        | 123/282 [22:06<28:33, 10.77s/it] 44%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                       | 124/282 [22:16<28:22, 10.78s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.2228', 'grad_norm': '0.4402', 'learning_rate': '0.000125', 'ppl': '1.25', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.51', 'tokens/train_per_sec_per_gpu': '127.2', 'tokens/total': 8028160, 'tokens/trainable': 1101538, 'epoch': '2.178'}
 44%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                                       | 124/282 [22:16<28:22, 10.78s/it] 44%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                      | 125/282 [22:27<28:13, 10.78s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1528', 'grad_norm': '0.4048', 'learning_rate': '0.0001238', 'ppl': '1.165', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.51', 'tokens/train_per_sec_per_gpu': '172.9', 'tokens/total': 8093696, 'tokens/trainable': 1110536, 'epoch': '2.196'}
 44%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                      | 125/282 [22:27<28:13, 10.78s/it] 45%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                     | 126/282 [22:38<28:03, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1586', 'grad_norm': '1.486', 'learning_rate': '0.0001227', 'ppl': '1.172', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.51', 'tokens/train_per_sec_per_gpu': '126.1', 'tokens/total': 8159232, 'tokens/trainable': 1118550, 'epoch': '2.213'}
 45%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                                     | 126/282 [22:38<28:03, 10.79s/it] 45%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                   | 127/282 [22:49<27:53, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1546', 'grad_norm': '0.3887', 'learning_rate': '0.0001216', 'ppl': '1.167', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.51', 'tokens/train_per_sec_per_gpu': '214', 'tokens/total': 8224768, 'tokens/trainable': 1126335, 'epoch': '2.231'}
 45%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                   | 127/282 [22:49<27:53, 10.79s/it] 45%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                  | 128/282 [23:00<27:42, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1455', 'grad_norm': '0.4043', 'learning_rate': '0.0001205', 'ppl': '1.157', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.51', 'tokens/train_per_sec_per_gpu': '138.6', 'tokens/total': 8290304, 'tokens/trainable': 1134703, 'epoch': '2.249'}
 45%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                                  | 128/282 [23:00<27:42, 10.80s/it] 46%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                 | 129/282 [23:10<27:31, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1642', 'grad_norm': '0.4258', 'learning_rate': '0.0001194', 'ppl': '1.178', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.51', 'tokens/train_per_sec_per_gpu': '288', 'tokens/total': 8355840, 'tokens/trainable': 1145309, 'epoch': '2.267'}
 46%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                 | 129/282 [23:10<27:31, 10.80s/it] 46%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                | 130/282 [23:21<27:21, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1497', 'grad_norm': '0.4268', 'learning_rate': '0.0001182', 'ppl': '1.162', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.51', 'tokens/train_per_sec_per_gpu': '217.2', 'tokens/total': 8421376, 'tokens/trainable': 1154354, 'epoch': '2.284'}
 46%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                                | 130/282 [23:21<27:21, 10.80s/it] 46%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                               | 131/282 [23:32<27:10, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.2093', 'grad_norm': '0.4835', 'learning_rate': '0.0001171', 'ppl': '1.233', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.51', 'tokens/train_per_sec_per_gpu': '183.3', 'tokens/total': 8486912, 'tokens/trainable': 1162036, 'epoch': '2.302'}
 46%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                               | 131/282 [23:32<27:10, 10.79s/it] 47%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                              | 132/282 [23:43<26:59, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1787', 'grad_norm': '0.5305', 'learning_rate': '0.000116', 'ppl': '1.196', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.51', 'tokens/train_per_sec_per_gpu': '291.8', 'tokens/total': 8552448, 'tokens/trainable': 1171873, 'epoch': '2.32'}
 47%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                              | 132/282 [23:43<26:59, 10.80s/it] 47%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                            | 133/282 [23:53<26:48, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1426', 'grad_norm': '0.3314', 'learning_rate': '0.0001149', 'ppl': '1.153', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.51', 'tokens/train_per_sec_per_gpu': '299.5', 'tokens/total': 8617984, 'tokens/trainable': 1181052, 'epoch': '2.338'}
 47%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                            | 133/282 [23:53<26:48, 10.80s/it] 48%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                           | 134/282 [24:04<26:38, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1371', 'grad_norm': '0.2998', 'learning_rate': '0.0001137', 'ppl': '1.147', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.51', 'tokens/train_per_sec_per_gpu': '183.4', 'tokens/total': 8683520, 'tokens/trainable': 1191168, 'epoch': '2.356'}
 48%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                           | 134/282 [24:04<26:38, 10.80s/it] 48%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                          | 135/282 [24:15<26:27, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1676', 'grad_norm': '0.3774', 'learning_rate': '0.0001126', 'ppl': '1.182', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.51', 'tokens/train_per_sec_per_gpu': '117.5', 'tokens/total': 8749056, 'tokens/trainable': 1197982, 'epoch': '2.373'}
 48%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                          | 135/282 [24:15<26:27, 10.80s/it] 48%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                         | 136/282 [24:26<26:16, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1554', 'grad_norm': '0.3881', 'learning_rate': '0.0001114', 'ppl': '1.168', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '328.6', 'tokens/total': 8814592, 'tokens/trainable': 1208711, 'epoch': '2.391'}
 48%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                         | 136/282 [24:26<26:16, 10.80s/it] 49%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                        | 137/282 [24:37<26:05, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1573', 'grad_norm': '0.4031', 'learning_rate': '0.0001103', 'ppl': '1.17', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '273.6', 'tokens/total': 8880128, 'tokens/trainable': 1217894, 'epoch': '2.409'}
 49%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                        | 137/282 [24:37<26:05, 10.80s/it] 49%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                       | 138/282 [24:47<25:55, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1379', 'grad_norm': '0.3218', 'learning_rate': '0.0001092', 'ppl': '1.148', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '180.5', 'tokens/total': 8945664, 'tokens/trainable': 1226123, 'epoch': '2.427'}
 49%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                                       | 138/282 [24:48<25:55, 10.80s/it] 49%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                     | 139/282 [24:58<25:44, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1442', 'grad_norm': '0.3592', 'learning_rate': '0.000108', 'ppl': '1.155', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '248.9', 'tokens/total': 9011200, 'tokens/trainable': 1236562, 'epoch': '2.444'}
 49%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                     | 139/282 [24:58<25:44, 10.80s/it] 50%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                    | 140/282 [25:09<25:33, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1569', 'grad_norm': '0.4724', 'learning_rate': '0.0001069', 'ppl': '1.17', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '319.5', 'tokens/total': 9076736, 'tokens/trainable': 1245313, 'epoch': '2.462'}
 50%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                    | 140/282 [25:09<25:33, 10.80s/it] 50%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                   | 141/282 [25:20<25:22, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1585', 'grad_norm': '0.404', 'learning_rate': '0.0001057', 'ppl': '1.172', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '152.1', 'tokens/total': 9142272, 'tokens/trainable': 1253591, 'epoch': '2.48'}
 50%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                   | 141/282 [25:20<25:22, 10.80s/it] 50%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                  | 142/282 [25:31<25:11, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1419', 'grad_norm': '0.4383', 'learning_rate': '0.0001046', 'ppl': '1.152', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '121', 'tokens/total': 9207808, 'tokens/trainable': 1261224, 'epoch': '2.498'}
 50%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                                  | 142/282 [25:31<25:11, 10.80s/it] 51%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                 | 143/282 [25:41<25:01, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1324', 'grad_norm': '0.3427', 'learning_rate': '0.0001034', 'ppl': '1.142', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '312.5', 'tokens/total': 9273344, 'tokens/trainable': 1272926, 'epoch': '2.516'}
 51%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                                 | 143/282 [25:41<25:01, 10.80s/it] 51%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                | 144/282 [25:52<24:49, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1442', 'grad_norm': '0.6019', 'learning_rate': '0.0001023', 'ppl': '1.155', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '108.3', 'tokens/total': 9338880, 'tokens/trainable': 1281583, 'epoch': '2.533'}
 51%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                | 144/282 [25:52<24:49, 10.79s/it] 51%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                              | 145/282 [26:03<24:37, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1236', 'grad_norm': '0.3662', 'learning_rate': '0.0001011', 'ppl': '1.132', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '282.4', 'tokens/total': 9404416, 'tokens/trainable': 1291375, 'epoch': '2.551'}
 51%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                              | 145/282 [26:03<24:37, 10.79s/it] 52%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                             | 146/282 [26:14<24:25, 10.78s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1358', 'grad_norm': '0.4775', 'learning_rate': '0.0001', 'ppl': '1.145', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '246.5', 'tokens/total': 9469952, 'tokens/trainable': 1299404, 'epoch': '2.569'}
 52%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                             | 146/282 [26:14<24:25, 10.78s/it] 52%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                            | 147/282 [26:25<24:15, 10.78s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1279', 'grad_norm': '0.6042', 'learning_rate': '9.885e-05', 'ppl': '1.136', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '194.6', 'tokens/total': 9535488, 'tokens/trainable': 1308365, 'epoch': '2.587'}
 52%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                            | 147/282 [26:25<24:15, 10.78s/it] 52%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                           | 148/282 [26:35<24:05, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1648', 'grad_norm': '0.4459', 'learning_rate': '9.771e-05', 'ppl': '1.179', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '129.9', 'tokens/total': 9601024, 'tokens/trainable': 1315181, 'epoch': '2.604'}
 52%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                           | 148/282 [26:35<24:05, 10.79s/it] 53%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                          | 149/282 [26:46<23:54, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1777', 'grad_norm': '0.417', 'learning_rate': '9.656e-05', 'ppl': '1.194', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '125', 'tokens/total': 9666560, 'tokens/trainable': 1321935, 'epoch': '2.622'}
 53%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                          | 149/282 [26:46<23:54, 10.79s/it] 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                         | 150/282 [26:57<23:43, 10.78s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1326', 'grad_norm': '0.3656', 'learning_rate': '9.542e-05', 'ppl': '1.142', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '211.7', 'tokens/total': 9732096, 'tokens/trainable': 1329792, 'epoch': '2.64'}
 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                         | 150/282 [26:57<23:43, 10.78s/it] 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                        | 151/282 [27:08<23:33, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1204', 'grad_norm': '0.3507', 'learning_rate': '9.427e-05', 'ppl': '1.128', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '249.2', 'tokens/total': 9797632, 'tokens/trainable': 1339244, 'epoch': '2.658'}
 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                        | 151/282 [27:08<23:33, 10.79s/it] 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                      | 152/282 [27:19<23:22, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1289', 'grad_norm': '0.3929', 'learning_rate': '9.313e-05', 'ppl': '1.138', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '173.6', 'tokens/total': 9863168, 'tokens/trainable': 1347601, 'epoch': '2.676'}
 54%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                      | 152/282 [27:19<23:22, 10.79s/it] 54%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                     | 153/282 [27:29<23:11, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1131', 'grad_norm': '0.4127', 'learning_rate': '9.198e-05', 'ppl': '1.12', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '184.5', 'tokens/total': 9928704, 'tokens/trainable': 1356137, 'epoch': '2.693'}
 54%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                     | 153/282 [27:29<23:11, 10.79s/it] 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                    | 154/282 [27:40<23:01, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1094', 'grad_norm': '0.3323', 'learning_rate': '9.084e-05', 'ppl': '1.116', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '192.7', 'tokens/total': 9994240, 'tokens/trainable': 1366678, 'epoch': '2.711'}
 55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                    | 154/282 [27:40<23:01, 10.79s/it] 55%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                   | 155/282 [27:51<22:50, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.101', 'grad_norm': '0.4039', 'learning_rate': '8.97e-05', 'ppl': '1.106', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '306.8', 'tokens/total': 10059776, 'tokens/trainable': 1375906, 'epoch': '2.729'}
 55%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                   | 155/282 [27:51<22:50, 10.79s/it] 55%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                  | 156/282 [28:02<22:40, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1127', 'grad_norm': '0.394', 'learning_rate': '8.856e-05', 'ppl': '1.119', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '312.1', 'tokens/total': 10125312, 'tokens/trainable': 1386437, 'epoch': '2.747'}
 55%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                  | 156/282 [28:02<22:40, 10.80s/it] 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                 | 157/282 [28:13<22:29, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1271', 'grad_norm': '0.4079', 'learning_rate': '8.742e-05', 'ppl': '1.136', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '169.2', 'tokens/total': 10190848, 'tokens/trainable': 1396425, 'epoch': '2.764'}
 56%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                 | 157/282 [28:13<22:29, 10.80s/it] 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                               | 158/282 [28:23<22:18, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1374', 'grad_norm': '0.4557', 'learning_rate': '8.628e-05', 'ppl': '1.147', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '281.4', 'tokens/total': 10256384, 'tokens/trainable': 1406363, 'epoch': '2.782'}
 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                               | 158/282 [28:23<22:18, 10.80s/it] 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                              | 159/282 [28:34<22:07, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1377', 'grad_norm': '0.4628', 'learning_rate': '8.515e-05', 'ppl': '1.148', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '209.2', 'tokens/total': 10321920, 'tokens/trainable': 1413900, 'epoch': '2.8'}
 56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                              | 159/282 [28:34<22:07, 10.79s/it] 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                             | 160/282 [28:45<21:56, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1035', 'grad_norm': '0.3202', 'learning_rate': '8.402e-05', 'ppl': '1.109', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '248.7', 'tokens/total': 10387456, 'tokens/trainable': 1423615, 'epoch': '2.818'}
 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                             | 160/282 [28:45<21:56, 10.79s/it] 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                            | 161/282 [28:56<21:46, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.09881', 'grad_norm': '0.326', 'learning_rate': '8.289e-05', 'ppl': '1.104', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '187.5', 'tokens/total': 10452992, 'tokens/trainable': 1432248, 'epoch': '2.836'}
 57%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                            | 161/282 [28:56<21:46, 10.79s/it] 57%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                           | 162/282 [29:06<21:35, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1023', 'grad_norm': '0.3132', 'learning_rate': '8.176e-05', 'ppl': '1.108', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '204.9', 'tokens/total': 10518528, 'tokens/trainable': 1441186, 'epoch': '2.853'}
 57%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                           | 162/282 [29:06<21:35, 10.79s/it] 58%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                          | 163/282 [29:17<21:24, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1471', 'grad_norm': '0.4479', 'learning_rate': '8.063e-05', 'ppl': '1.158', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '90.1', 'tokens/total': 10584064, 'tokens/trainable': 1448387, 'epoch': '2.871'}
 58%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                          | 163/282 [29:17<21:24, 10.79s/it] 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                        | 164/282 [29:28<21:13, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1241', 'grad_norm': '0.4009', 'learning_rate': '7.951e-05', 'ppl': '1.132', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '254.2', 'tokens/total': 10649600, 'tokens/trainable': 1458308, 'epoch': '2.889'}
 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                        | 164/282 [29:28<21:13, 10.79s/it] 59%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                       | 165/282 [29:39<21:02, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1351', 'grad_norm': '0.443', 'learning_rate': '7.839e-05', 'ppl': '1.145', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '139.3', 'tokens/total': 10715136, 'tokens/trainable': 1465834, 'epoch': '2.907'}
 59%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                       | 165/282 [29:39<21:02, 10.79s/it] 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                      | 166/282 [29:50<20:52, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.129', 'grad_norm': '0.4343', 'learning_rate': '7.727e-05', 'ppl': '1.138', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '275.3', 'tokens/total': 10780672, 'tokens/trainable': 1475859, 'epoch': '2.924'}
 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                      | 166/282 [29:50<20:52, 10.79s/it] 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                     | 167/282 [30:00<20:41, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.09724', 'grad_norm': '0.3055', 'learning_rate': '7.615e-05', 'ppl': '1.102', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '321.7', 'tokens/total': 10846208, 'tokens/trainable': 1485708, 'epoch': '2.942'}
 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                     | 167/282 [30:00<20:41, 10.80s/it] 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                    | 168/282 [30:11<20:31, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.09621', 'grad_norm': '0.3305', 'learning_rate': '7.504e-05', 'ppl': '1.101', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '363.8', 'tokens/total': 10911744, 'tokens/trainable': 1498354, 'epoch': '2.96'}
 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                    | 168/282 [30:11<20:31, 10.80s/it] 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                   | 169/282 [30:22<20:20, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1007', 'grad_norm': '0.371', 'learning_rate': '7.393e-05', 'ppl': '1.106', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '181.6', 'tokens/total': 10977280, 'tokens/trainable': 1506109, 'epoch': '2.978'}
 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                   | 169/282 [30:22<20:20, 10.80s/it] 60%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                 | 170/282 [30:33<20:09, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.07308', 'grad_norm': '0.2846', 'learning_rate': '7.283e-05', 'ppl': '1.076', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '161.9', 'tokens/total': 11042816, 'tokens/trainable': 1517924, 'epoch': '2.996'}
 60%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                 | 170/282 [30:33<20:09, 10.80s/it] 61%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                | 171/282 [30:36<15:31,  8.39s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1036', 'grad_norm': '0.7371', 'learning_rate': '7.173e-05', 'ppl': '1.109', 'memory/max_active (GiB)': '37.3', 'memory/max_allocated (GiB)': '37.3', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '906.5', 'tokens/total': 11059200, 'tokens/trainable': 1520388, 'epoch': '3'}
 61%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                | 171/282 [30:36<15:31,  8.39s/it][2026-06-23 12:35:04,975] [INFO] [axolotl.core.trainers.base._save:828] [PID:2077] Saving model checkpoint to ./out/learned/checkpoint-171
 61%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                               | 172/282 [30:51<19:06, 10.42s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.1027', 'grad_norm': '0.4368', 'learning_rate': '7.063e-05', 'ppl': '1.108', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.52', 'tokens/train_per_sec_per_gpu': '181.8', 'tokens/total': 11124736, 'tokens/trainable': 1527357, 'epoch': '3.018'}
 61%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                               | 172/282 [30:51<19:06, 10.42s/it] 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                              | 173/282 [31:02<19:08, 10.53s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.05671', 'grad_norm': '0.2307', 'learning_rate': '6.953e-05', 'ppl': '1.058', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.39', 'tokens/train_per_sec_per_gpu': '188.7', 'tokens/total': 11190272, 'tokens/trainable': 1538421, 'epoch': '3.036'}
 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                              | 173/282 [31:02<19:08, 10.53s/it] 62%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                             | 174/282 [31:12<19:04, 10.60s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.07824', 'grad_norm': '0.4021', 'learning_rate': '6.844e-05', 'ppl': '1.081', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.42', 'tokens/train_per_sec_per_gpu': '209.9', 'tokens/total': 11255808, 'tokens/trainable': 1545892, 'epoch': '3.053'}
 62%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                             | 174/282 [31:12<19:04, 10.60s/it] 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                            | 175/282 [31:23<19:00, 10.66s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.0769', 'grad_norm': '0.3209', 'learning_rate': '6.736e-05', 'ppl': '1.08', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.42', 'tokens/train_per_sec_per_gpu': '350.1', 'tokens/total': 11321344, 'tokens/trainable': 1557217, 'epoch': '3.071'}
 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                            | 175/282 [31:23<19:00, 10.66s/it] 62%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                           | 176/282 [31:34<18:54, 10.70s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.09488', 'grad_norm': '0.3722', 'learning_rate': '6.628e-05', 'ppl': '1.1', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '213.4', 'tokens/total': 11386880, 'tokens/trainable': 1566413, 'epoch': '3.089'}
 62%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                           | 176/282 [31:34<18:54, 10.70s/it] 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                         | 177/282 [31:45<18:46, 10.73s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.07044', 'grad_norm': '0.3205', 'learning_rate': '6.52e-05', 'ppl': '1.073', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '230.1', 'tokens/total': 11452416, 'tokens/trainable': 1575767, 'epoch': '3.107'}
 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                         | 177/282 [31:45<18:46, 10.73s/it] 63%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                        | 178/282 [31:56<18:38, 10.75s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.08268', 'grad_norm': '0.4017', 'learning_rate': '6.413e-05', 'ppl': '1.086', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '242.2', 'tokens/total': 11517952, 'tokens/trainable': 1584110, 'epoch': '3.124'}
 63%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                        | 178/282 [31:56<18:38, 10.75s/it] 63%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                       | 179/282 [32:06<18:28, 10.76s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.09781', 'grad_norm': '0.4186', 'learning_rate': '6.306e-05', 'ppl': '1.103', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '100.1', 'tokens/total': 11583488, 'tokens/trainable': 1590048, 'epoch': '3.142'}
 63%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                       | 179/282 [32:06<18:28, 10.76s/it] 64%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                      | 180/282 [32:17<18:18, 10.77s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.07678', 'grad_norm': '0.3336', 'learning_rate': '6.2e-05', 'ppl': '1.08', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '220.3', 'tokens/total': 11649024, 'tokens/trainable': 1599811, 'epoch': '3.16'}
 64%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                      | 180/282 [32:17<18:18, 10.77s/it] 64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                     | 181/282 [32:28<18:08, 10.78s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.07804', 'grad_norm': '0.321', 'learning_rate': '6.094e-05', 'ppl': '1.081', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '260.6', 'tokens/total': 11714560, 'tokens/trainable': 1607572, 'epoch': '3.178'}
 64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                     | 181/282 [32:28<18:08, 10.78s/it] 65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                    | 182/282 [32:39<17:59, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.07476', 'grad_norm': '0.3177', 'learning_rate': '5.989e-05', 'ppl': '1.078', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '259.5', 'tokens/total': 11780096, 'tokens/trainable': 1617842, 'epoch': '3.196'}
 65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                    | 182/282 [32:39<17:59, 10.79s/it] 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                  | 183/282 [32:50<17:48, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.07072', 'grad_norm': '0.2951', 'learning_rate': '5.884e-05', 'ppl': '1.073', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '173.1', 'tokens/total': 11845632, 'tokens/trainable': 1626233, 'epoch': '3.213'}
 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                  | 183/282 [32:50<17:48, 10.79s/it] 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                 | 184/282 [33:00<17:37, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.07653', 'grad_norm': '0.3529', 'learning_rate': '5.78e-05', 'ppl': '1.08', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '267.1', 'tokens/total': 11911168, 'tokens/trainable': 1634557, 'epoch': '3.231'}
 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                 | 184/282 [33:00<17:37, 10.80s/it] 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                | 185/282 [33:11<17:26, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.06434', 'grad_norm': '0.3024', 'learning_rate': '5.676e-05', 'ppl': '1.066', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '148.7', 'tokens/total': 11976704, 'tokens/trainable': 1642969, 'epoch': '3.249'}
 66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                | 185/282 [33:11<17:26, 10.79s/it] 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                               | 186/282 [33:22<17:16, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.0662', 'grad_norm': '0.312', 'learning_rate': '5.573e-05', 'ppl': '1.068', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '174', 'tokens/total': 12042240, 'tokens/trainable': 1651185, 'epoch': '3.267'}
 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                               | 186/282 [33:22<17:16, 10.79s/it] 66%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                              | 187/282 [33:33<17:05, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.07623', 'grad_norm': '0.3948', 'learning_rate': '5.47e-05', 'ppl': '1.079', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '190.5', 'tokens/total': 12107776, 'tokens/trainable': 1659039, 'epoch': '3.284'}
 66%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                              | 187/282 [33:33<17:05, 10.79s/it] 67%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                             | 188/282 [33:44<16:54, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.0725', 'grad_norm': '0.4377', 'learning_rate': '5.368e-05', 'ppl': '1.075', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '110.7', 'tokens/total': 12173312, 'tokens/trainable': 1668539, 'epoch': '3.302'}
 67%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                             | 188/282 [33:44<16:54, 10.79s/it] 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                           | 189/282 [33:54<16:43, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.08527', 'grad_norm': '0.4415', 'learning_rate': '5.267e-05', 'ppl': '1.089', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '112.2', 'tokens/total': 12238848, 'tokens/trainable': 1676935, 'epoch': '3.32'}
 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                           | 189/282 [33:54<16:43, 10.79s/it] 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                          | 190/282 [34:05<16:33, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.08252', 'grad_norm': '0.387', 'learning_rate': '5.166e-05', 'ppl': '1.086', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '218.7', 'tokens/total': 12304384, 'tokens/trainable': 1685426, 'epoch': '3.338'}
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                          | 190/282 [34:05<16:33, 10.80s/it] 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                         | 191/282 [34:16<16:22, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.07319', 'grad_norm': '0.316', 'learning_rate': '5.066e-05', 'ppl': '1.076', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '178', 'tokens/total': 12369920, 'tokens/trainable': 1693571, 'epoch': '3.356'}
 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                         | 191/282 [34:16<16:22, 10.80s/it] 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                        | 192/282 [34:27<16:11, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.06126', 'grad_norm': '0.2803', 'learning_rate': '4.967e-05', 'ppl': '1.063', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '171.2', 'tokens/total': 12435456, 'tokens/trainable': 1701473, 'epoch': '3.373'}
 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                        | 192/282 [34:27<16:11, 10.80s/it] 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                       | 193/282 [34:37<16:00, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.06925', 'grad_norm': '0.3516', 'learning_rate': '4.868e-05', 'ppl': '1.072', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '253.1', 'tokens/total': 12500992, 'tokens/trainable': 1711202, 'epoch': '3.391'}
 68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                       | 193/282 [34:37<16:00, 10.80s/it] 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                      | 194/282 [34:48<15:50, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.06001', 'grad_norm': '0.2671', 'learning_rate': '4.77e-05', 'ppl': '1.062', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '186', 'tokens/total': 12566528, 'tokens/trainable': 1723987, 'epoch': '3.409'}
 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                      | 194/282 [34:48<15:50, 10.80s/it] 69%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                     | 195/282 [34:59<15:39, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.06486', 'grad_norm': '0.3027', 'learning_rate': '4.673e-05', 'ppl': '1.067', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '211.1', 'tokens/total': 12632064, 'tokens/trainable': 1734111, 'epoch': '3.427'}
 69%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                     | 195/282 [34:59<15:39, 10.79s/it] 70%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                   | 196/282 [35:10<15:27, 10.78s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.08202', 'grad_norm': '0.3509', 'learning_rate': '4.576e-05', 'ppl': '1.085', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '147.9', 'tokens/total': 12697600, 'tokens/trainable': 1741170, 'epoch': '3.444'}
 70%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                   | 196/282 [35:10<15:27, 10.78s/it] 70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                  | 197/282 [35:21<15:16, 10.78s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.05726', 'grad_norm': '0.2738', 'learning_rate': '4.48e-05', 'ppl': '1.059', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '304.8', 'tokens/total': 12763136, 'tokens/trainable': 1750862, 'epoch': '3.462'}
 70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                  | 197/282 [35:21<15:16, 10.78s/it] 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                 | 198/282 [35:31<15:05, 10.78s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.05742', 'grad_norm': '0.2888', 'learning_rate': '4.385e-05', 'ppl': '1.059', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '346.4', 'tokens/total': 12828672, 'tokens/trainable': 1760327, 'epoch': '3.48'}
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                 | 198/282 [35:31<15:05, 10.78s/it] 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                | 199/282 [35:42<14:55, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.05594', 'grad_norm': '0.2664', 'learning_rate': '4.29e-05', 'ppl': '1.058', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '201.7', 'tokens/total': 12894208, 'tokens/trainable': 1770137, 'epoch': '3.498'}
 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                | 199/282 [35:42<14:55, 10.79s/it] 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                               | 200/282 [35:53<14:44, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.06768', 'grad_norm': '0.3056', 'learning_rate': '4.197e-05', 'ppl': '1.07', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '154.8', 'tokens/total': 12959744, 'tokens/trainable': 1780649, 'epoch': '3.516'}
 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                               | 200/282 [35:53<14:44, 10.79s/it] 71%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                              | 201/282 [36:04<14:34, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.06715', 'grad_norm': '0.2904', 'learning_rate': '4.104e-05', 'ppl': '1.069', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '187.2', 'tokens/total': 13025280, 'tokens/trainable': 1789358, 'epoch': '3.533'}
 71%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                              | 201/282 [36:04<14:34, 10.79s/it] 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                            | 202/282 [36:15<14:23, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.05936', 'grad_norm': '0.2846', 'learning_rate': '4.011e-05', 'ppl': '1.061', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '248.8', 'tokens/total': 13090816, 'tokens/trainable': 1798425, 'epoch': '3.551'}
 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                            | 202/282 [36:15<14:23, 10.79s/it] 72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                           | 203/282 [36:25<14:12, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.05311', 'grad_norm': '0.2539', 'learning_rate': '3.92e-05', 'ppl': '1.055', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '198', 'tokens/total': 13156352, 'tokens/trainable': 1808536, 'epoch': '3.569'}
 72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                           | 203/282 [36:25<14:12, 10.79s/it] 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                          | 204/282 [36:36<14:01, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.06636', 'grad_norm': '0.3478', 'learning_rate': '3.829e-05', 'ppl': '1.069', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '174', 'tokens/total': 13221888, 'tokens/trainable': 1816863, 'epoch': '3.587'}
 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                          | 204/282 [36:36<14:01, 10.79s/it] 73%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                         | 205/282 [36:47<13:50, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.05827', 'grad_norm': '0.2537', 'learning_rate': '3.74e-05', 'ppl': '1.06', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '199.9', 'tokens/total': 13287424, 'tokens/trainable': 1825314, 'epoch': '3.604'}
 73%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                         | 205/282 [36:47<13:50, 10.79s/it] 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                        | 206/282 [36:58<13:40, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.03992', 'grad_norm': '0.1849', 'learning_rate': '3.651e-05', 'ppl': '1.041', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '210.7', 'tokens/total': 13352960, 'tokens/trainable': 1837005, 'epoch': '3.622'}
 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                        | 206/282 [36:58<13:40, 10.79s/it] 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                       | 207/282 [37:09<13:29, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.05873', 'grad_norm': '0.3123', 'learning_rate': '3.562e-05', 'ppl': '1.06', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '136.4', 'tokens/total': 13418496, 'tokens/trainable': 1847494, 'epoch': '3.64'}
 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                       | 207/282 [37:09<13:29, 10.80s/it] 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                     | 208/282 [37:19<13:19, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.05666', 'grad_norm': '0.2927', 'learning_rate': '3.475e-05', 'ppl': '1.058', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '240.4', 'tokens/total': 13484032, 'tokens/trainable': 1858183, 'epoch': '3.658'}
 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                     | 208/282 [37:19<13:19, 10.80s/it] 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                    | 209/282 [37:30<13:08, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.05975', 'grad_norm': '0.297', 'learning_rate': '3.389e-05', 'ppl': '1.062', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '287.7', 'tokens/total': 13549568, 'tokens/trainable': 1867369, 'epoch': '3.676'}
 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                    | 209/282 [37:30<13:08, 10.80s/it] 74%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                   | 210/282 [37:41<12:57, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.06414', 'grad_norm': '0.302', 'learning_rate': '3.303e-05', 'ppl': '1.066', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '181.6', 'tokens/total': 13615104, 'tokens/trainable': 1876089, 'epoch': '3.693'}
 74%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                   | 210/282 [37:41<12:57, 10.80s/it] 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                  | 211/282 [37:52<12:46, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.06114', 'grad_norm': '0.3029', 'learning_rate': '3.218e-05', 'ppl': '1.063', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '187', 'tokens/total': 13680640, 'tokens/trainable': 1884923, 'epoch': '3.711'}
 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                  | 211/282 [37:52<12:46, 10.80s/it] 75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                 | 212/282 [38:03<12:36, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.04302', 'grad_norm': '0.2201', 'learning_rate': '3.134e-05', 'ppl': '1.044', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '224.2', 'tokens/total': 13746176, 'tokens/trainable': 1895843, 'epoch': '3.729'}
 75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                 | 212/282 [38:03<12:36, 10.80s/it] 76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                | 213/282 [38:13<12:25, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.05042', 'grad_norm': '0.2483', 'learning_rate': '3.052e-05', 'ppl': '1.052', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '183.4', 'tokens/total': 13811712, 'tokens/trainable': 1906133, 'epoch': '3.747'}
 76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                | 213/282 [38:13<12:25, 10.80s/it] 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                              | 214/282 [38:24<12:14, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.06126', 'grad_norm': '0.2604', 'learning_rate': '2.97e-05', 'ppl': '1.063', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '222.9', 'tokens/total': 13877248, 'tokens/trainable': 1915359, 'epoch': '3.764'}
 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                              | 214/282 [38:24<12:14, 10.80s/it] 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                             | 215/282 [38:35<12:03, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.07048', 'grad_norm': '0.2984', 'learning_rate': '2.889e-05', 'ppl': '1.073', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '176.5', 'tokens/total': 13942784, 'tokens/trainable': 1922000, 'epoch': '3.782'}
 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                             | 215/282 [38:35<12:03, 10.80s/it] 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                            | 216/282 [38:46<11:52, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.05804', 'grad_norm': '0.335', 'learning_rate': '2.808e-05', 'ppl': '1.06', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '309.7', 'tokens/total': 14008320, 'tokens/trainable': 1931085, 'epoch': '3.8'}
 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                            | 216/282 [38:46<11:52, 10.80s/it] 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                           | 217/282 [38:57<11:41, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.05711', 'grad_norm': '0.2767', 'learning_rate': '2.729e-05', 'ppl': '1.059', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '191', 'tokens/total': 14073856, 'tokens/trainable': 1940982, 'epoch': '3.818'}
 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                           | 217/282 [38:57<11:41, 10.80s/it] 77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                          | 218/282 [39:07<11:31, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.05261', 'grad_norm': '0.2528', 'learning_rate': '2.651e-05', 'ppl': '1.054', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '189.2', 'tokens/total': 14139392, 'tokens/trainable': 1949304, 'epoch': '3.836'}
 77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                          | 218/282 [39:07<11:31, 10.80s/it] 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                         | 219/282 [39:18<11:20, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.06048', 'grad_norm': '0.2983', 'learning_rate': '2.574e-05', 'ppl': '1.062', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '270.1', 'tokens/total': 14204928, 'tokens/trainable': 1958132, 'epoch': '3.853'}
 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                         | 219/282 [39:18<11:20, 10.80s/it] 78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                        | 220/282 [39:29<11:09, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.06569', 'grad_norm': '0.3062', 'learning_rate': '2.497e-05', 'ppl': '1.068', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '165.8', 'tokens/total': 14270464, 'tokens/trainable': 1966271, 'epoch': '3.871'}
 78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                        | 220/282 [39:29<11:09, 10.80s/it] 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                      | 221/282 [39:40<11:03, 10.88s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.04388', 'grad_norm': '0.1911', 'learning_rate': '2.422e-05', 'ppl': '1.045', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '216', 'tokens/total': 14336000, 'tokens/trainable': 1976870, 'epoch': '3.889'}
 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                      | 221/282 [39:40<11:03, 10.88s/it] 79%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                     | 222/282 [39:51<10:51, 10.85s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.05706', 'grad_norm': '0.2354', 'learning_rate': '2.348e-05', 'ppl': '1.059', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.37', 'tokens/train_per_sec_per_gpu': '259.4', 'tokens/total': 14401536, 'tokens/trainable': 1984124, 'epoch': '3.907'}
 79%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                     | 222/282 [39:51<10:51, 10.85s/it] 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                    | 223/282 [40:02<10:39, 10.84s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.06251', 'grad_norm': '0.2697', 'learning_rate': '2.274e-05', 'ppl': '1.065', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.45', 'tokens/train_per_sec_per_gpu': '120.3', 'tokens/total': 14467072, 'tokens/trainable': 1993317, 'epoch': '3.924'}
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                    | 223/282 [40:02<10:39, 10.84s/it] 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                   | 224/282 [40:12<10:27, 10.83s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.06289', 'grad_norm': '0.3342', 'learning_rate': '2.202e-05', 'ppl': '1.065', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.45', 'tokens/train_per_sec_per_gpu': '246', 'tokens/total': 14532608, 'tokens/trainable': 2001589, 'epoch': '3.942'}
 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                   | 224/282 [40:12<10:27, 10.83s/it] 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                  | 225/282 [40:23<10:16, 10.82s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.04806', 'grad_norm': '0.2052', 'learning_rate': '2.131e-05', 'ppl': '1.049', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.45', 'tokens/train_per_sec_per_gpu': '240.9', 'tokens/total': 14598144, 'tokens/trainable': 2010934, 'epoch': '3.96'}
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                  | 225/282 [40:23<10:16, 10.82s/it] 80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                 | 226/282 [40:34<10:05, 10.81s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.05836', 'grad_norm': '0.2765', 'learning_rate': '2.061e-05', 'ppl': '1.06', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.45', 'tokens/train_per_sec_per_gpu': '161.9', 'tokens/total': 14663680, 'tokens/trainable': 2018851, 'epoch': '3.978'}
 80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                 | 226/282 [40:34<10:05, 10.81s/it] 80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                               | 227/282 [40:45<09:54, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.0625', 'grad_norm': '0.2687', 'learning_rate': '1.991e-05', 'ppl': '1.065', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.45', 'tokens/train_per_sec_per_gpu': '116.8', 'tokens/total': 14729216, 'tokens/trainable': 2025610, 'epoch': '3.996'}
 80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                               | 227/282 [40:45<09:54, 10.80s/it] 81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                              | 228/282 [40:48<07:33,  8.39s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.06698', 'grad_norm': '0.5448', 'learning_rate': '1.923e-05', 'ppl': '1.069', 'memory/max_active (GiB)': '37.3', 'memory/max_allocated (GiB)': '37.3', 'memory/device_reserved (GiB)': '38.45', 'tokens/train_per_sec_per_gpu': '580.1', 'tokens/total': 14745600, 'tokens/trainable': 2027184, 'epoch': '4'}
 81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                              | 228/282 [40:48<07:33,  8.39s/it][2026-06-23 12:45:16,910] [INFO] [axolotl.core.trainers.base._save:828] [PID:2077] Saving model checkpoint to ./out/learned/checkpoint-228
 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                             | 229/282 [41:03<09:12, 10.42s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.04661', 'grad_norm': '0.211', 'learning_rate': '1.856e-05', 'ppl': '1.048', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.45', 'tokens/train_per_sec_per_gpu': '110.4', 'tokens/total': 14811136, 'tokens/trainable': 2035426, 'epoch': '4.018'}
 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                             | 229/282 [41:03<09:12, 10.42s/it] 82%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                            | 230/282 [41:14<09:07, 10.54s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.03875', 'grad_norm': '0.1713', 'learning_rate': '1.79e-05', 'ppl': '1.04', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '237.6', 'tokens/total': 14876672, 'tokens/trainable': 2044178, 'epoch': '4.036'}
 82%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                            | 230/282 [41:14<09:07, 10.54s/it] 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                           | 231/282 [41:24<09:01, 10.61s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.05557', 'grad_norm': '0.2262', 'learning_rate': '1.725e-05', 'ppl': '1.057', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '140.4', 'tokens/total': 14942208, 'tokens/trainable': 2050814, 'epoch': '4.053'}
 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                           | 231/282 [41:24<09:01, 10.61s/it] 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                          | 232/282 [41:35<08:53, 10.67s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.06252', 'grad_norm': '0.379', 'learning_rate': '1.661e-05', 'ppl': '1.065', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '267', 'tokens/total': 15007744, 'tokens/trainable': 2058101, 'epoch': '4.071'}
 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                          | 232/282 [41:35<08:53, 10.67s/it] 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                        | 233/282 [41:46<08:44, 10.71s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.04235', 'grad_norm': '0.2006', 'learning_rate': '1.599e-05', 'ppl': '1.043', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '166.7', 'tokens/total': 15073280, 'tokens/trainable': 2067388, 'epoch': '4.089'}
 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                        | 233/282 [41:46<08:44, 10.71s/it] 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                       | 234/282 [41:57<08:35, 10.73s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.04386', 'grad_norm': '0.1995', 'learning_rate': '1.537e-05', 'ppl': '1.045', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.47', 'tokens/train_per_sec_per_gpu': '221', 'tokens/total': 15138816, 'tokens/trainable': 2077508, 'epoch': '4.107'}
 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                       | 234/282 [41:57<08:35, 10.73s/it] 83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                      | 235/282 [42:07<08:24, 10.74s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.04764', 'grad_norm': '0.2085', 'learning_rate': '1.477e-05', 'ppl': '1.049', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '177', 'tokens/total': 15204352, 'tokens/trainable': 2084719, 'epoch': '4.124'}
 83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                      | 235/282 [42:07<08:24, 10.74s/it] 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                     | 236/282 [42:18<08:14, 10.75s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.03865', 'grad_norm': '0.1679', 'learning_rate': '1.417e-05', 'ppl': '1.039', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '210', 'tokens/total': 15269888, 'tokens/trainable': 2094768, 'epoch': '4.142'}
 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                     | 236/282 [42:18<08:14, 10.75s/it] 84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                    | 237/282 [42:29<08:04, 10.76s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.03926', 'grad_norm': '0.1771', 'learning_rate': '1.359e-05', 'ppl': '1.04', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '248.9', 'tokens/total': 15335424, 'tokens/trainable': 2104723, 'epoch': '4.16'}
 84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                    | 237/282 [42:29<08:04, 10.76s/it] 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                   | 238/282 [42:40<07:53, 10.76s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.04079', 'grad_norm': '0.2033', 'learning_rate': '1.302e-05', 'ppl': '1.042', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '186.3', 'tokens/total': 15400960, 'tokens/trainable': 2113477, 'epoch': '4.178'}
 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                   | 238/282 [42:40<07:53, 10.76s/it] 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                 | 239/282 [42:51<07:43, 10.77s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.05116', 'grad_norm': '0.2589', 'learning_rate': '1.246e-05', 'ppl': '1.052', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '230', 'tokens/total': 15466496, 'tokens/trainable': 2122036, 'epoch': '4.196'}
 85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                 | 239/282 [42:51<07:43, 10.77s/it] 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                | 240/282 [43:01<07:32, 10.77s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.06248', 'grad_norm': '0.2999', 'learning_rate': '1.191e-05', 'ppl': '1.064', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '202.1', 'tokens/total': 15532032, 'tokens/trainable': 2128746, 'epoch': '4.213'}
 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                | 240/282 [43:01<07:32, 10.77s/it] 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                               | 241/282 [43:12<07:21, 10.77s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.04288', 'grad_norm': '0.2031', 'learning_rate': '1.137e-05', 'ppl': '1.044', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '318.6', 'tokens/total': 15597568, 'tokens/trainable': 2139108, 'epoch': '4.231'}
 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                               | 241/282 [43:12<07:21, 10.77s/it] 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                              | 242/282 [43:23<07:11, 10.78s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.04319', 'grad_norm': '0.1751', 'learning_rate': '1.085e-05', 'ppl': '1.044', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '224.4', 'tokens/total': 15663104, 'tokens/trainable': 2147883, 'epoch': '4.249'}
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                              | 242/282 [43:23<07:11, 10.78s/it] 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                             | 243/282 [43:34<07:00, 10.78s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.04717', 'grad_norm': '0.2024', 'learning_rate': '1.033e-05', 'ppl': '1.048', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '173.3', 'tokens/total': 15728640, 'tokens/trainable': 2155171, 'epoch': '4.267'}
 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                             | 243/282 [43:34<07:00, 10.78s/it] 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                            | 244/282 [43:44<06:49, 10.78s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.04443', 'grad_norm': '0.208', 'learning_rate': '9.832e-06', 'ppl': '1.045', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '190.1', 'tokens/total': 15794176, 'tokens/trainable': 2163723, 'epoch': '4.284'}
 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                            | 244/282 [43:44<06:49, 10.78s/it] 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                           | 245/282 [43:55<06:39, 10.78s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.04296', 'grad_norm': '0.1936', 'learning_rate': '9.342e-06', 'ppl': '1.044', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '265.1', 'tokens/total': 15859712, 'tokens/trainable': 2172948, 'epoch': '4.302'}
 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                           | 245/282 [43:55<06:39, 10.78s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                         | 246/282 [44:06<06:28, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.05108', 'grad_norm': '0.2488', 'learning_rate': '8.864e-06', 'ppl': '1.052', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '284.6', 'tokens/total': 15925248, 'tokens/trainable': 2181206, 'epoch': '4.32'}
 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                         | 246/282 [44:06<06:28, 10.79s/it] 88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                        | 247/282 [44:17<06:17, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.04187', 'grad_norm': '0.1868', 'learning_rate': '8.398e-06', 'ppl': '1.043', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '236.3', 'tokens/total': 15990784, 'tokens/trainable': 2189850, 'epoch': '4.338'}
 88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                        | 247/282 [44:17<06:17, 10.79s/it] 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                       | 248/282 [44:28<06:07, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.02949', 'grad_norm': '0.1317', 'learning_rate': '7.945e-06', 'ppl': '1.03', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '236.3', 'tokens/total': 16056320, 'tokens/trainable': 2203324, 'epoch': '4.356'}
 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                       | 248/282 [44:28<06:07, 10.80s/it] 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                      | 249/282 [44:38<05:56, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.05169', 'grad_norm': '0.2551', 'learning_rate': '7.503e-06', 'ppl': '1.053', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '243.5', 'tokens/total': 16121856, 'tokens/trainable': 2211245, 'epoch': '4.373'}
 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                      | 249/282 [44:38<05:56, 10.80s/it] 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 250/282 [44:49<05:45, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.03748', 'grad_norm': '0.1648', 'learning_rate': '7.073e-06', 'ppl': '1.038', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '234.8', 'tokens/total': 16187392, 'tokens/trainable': 2221247, 'epoch': '4.391'}
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 250/282 [44:49<05:45, 10.80s/it] 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                    | 251/282 [45:00<05:34, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.04545', 'grad_norm': '0.224', 'learning_rate': '6.656e-06', 'ppl': '1.046', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '205.1', 'tokens/total': 16252928, 'tokens/trainable': 2230038, 'epoch': '4.409'}
 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                    | 251/282 [45:00<05:34, 10.80s/it] 89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                  | 252/282 [45:11<05:24, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.03317', 'grad_norm': '0.1616', 'learning_rate': '6.251e-06', 'ppl': '1.034', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '366.7', 'tokens/total': 16318464, 'tokens/trainable': 2242881, 'epoch': '4.427'}
 89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                  | 252/282 [45:11<05:24, 10.80s/it] 90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                 | 253/282 [45:22<05:13, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.04402', 'grad_norm': '0.1877', 'learning_rate': '5.858e-06', 'ppl': '1.045', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '137.7', 'tokens/total': 16384000, 'tokens/trainable': 2250467, 'epoch': '4.444'}
 90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                 | 253/282 [45:22<05:13, 10.80s/it] 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                | 254/282 [45:32<05:02, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.03938', 'grad_norm': '0.1851', 'learning_rate': '5.477e-06', 'ppl': '1.04', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '200.4', 'tokens/total': 16449536, 'tokens/trainable': 2259773, 'epoch': '4.462'}
 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                | 254/282 [45:32<05:02, 10.80s/it] 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 255/282 [45:43<04:51, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.03304', 'grad_norm': '0.1269', 'learning_rate': '5.109e-06', 'ppl': '1.034', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '317.5', 'tokens/total': 16515072, 'tokens/trainable': 2270892, 'epoch': '4.48'}
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 255/282 [45:43<04:51, 10.80s/it] 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                              | 256/282 [45:54<04:40, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.04827', 'grad_norm': '0.206', 'learning_rate': '4.754e-06', 'ppl': '1.049', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '214.2', 'tokens/total': 16580608, 'tokens/trainable': 2277970, 'epoch': '4.498'}
 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                              | 256/282 [45:54<04:40, 10.80s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                             | 257/282 [46:05<04:30, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.03764', 'grad_norm': '0.1982', 'learning_rate': '4.411e-06', 'ppl': '1.038', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '164', 'tokens/total': 16646144, 'tokens/trainable': 2288917, 'epoch': '4.516'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                             | 257/282 [46:05<04:30, 10.80s/it] 91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                           | 258/282 [46:16<04:19, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.03888', 'grad_norm': '0.1777', 'learning_rate': '4.08e-06', 'ppl': '1.04', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '241.5', 'tokens/total': 16711680, 'tokens/trainable': 2298433, 'epoch': '4.533'}
 91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                           | 258/282 [46:16<04:19, 10.80s/it] 92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                          | 259/282 [46:26<04:08, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.03776', 'grad_norm': '0.1815', 'learning_rate': '3.762e-06', 'ppl': '1.038', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '216.4', 'tokens/total': 16777216, 'tokens/trainable': 2308589, 'epoch': '4.551'}
 92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                          | 259/282 [46:26<04:08, 10.80s/it] 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                         | 260/282 [46:37<03:57, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.05449', 'grad_norm': '0.2388', 'learning_rate': '3.457e-06', 'ppl': '1.056', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '176.4', 'tokens/total': 16842752, 'tokens/trainable': 2315207, 'epoch': '4.569'}
 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                         | 260/282 [46:37<03:57, 10.80s/it] 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                        | 261/282 [46:48<03:46, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.04094', 'grad_norm': '0.1973', 'learning_rate': '3.165e-06', 'ppl': '1.042', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '136.8', 'tokens/total': 16908288, 'tokens/trainable': 2323409, 'epoch': '4.587'}
 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                        | 261/282 [46:48<03:46, 10.79s/it] 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 262/282 [46:59<03:35, 10.78s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.04596', 'grad_norm': '0.2063', 'learning_rate': '2.885e-06', 'ppl': '1.047', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '313.3', 'tokens/total': 16973824, 'tokens/trainable': 2331941, 'epoch': '4.604'}
 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 262/282 [46:59<03:35, 10.78s/it] 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                      | 263/282 [47:10<03:24, 10.78s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.03874', 'grad_norm': '0.18', 'learning_rate': '2.618e-06', 'ppl': '1.04', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '286.4', 'tokens/total': 17039360, 'tokens/trainable': 2342281, 'epoch': '4.622'}
 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                      | 263/282 [47:10<03:24, 10.78s/it] 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                    | 264/282 [47:20<03:14, 10.78s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.04488', 'grad_norm': '0.2169', 'learning_rate': '2.364e-06', 'ppl': '1.046', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '115.9', 'tokens/total': 17104896, 'tokens/trainable': 2350592, 'epoch': '4.64'}
 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                    | 264/282 [47:20<03:14, 10.78s/it] 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                   | 265/282 [47:31<03:03, 10.78s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.04711', 'grad_norm': '0.2048', 'learning_rate': '2.122e-06', 'ppl': '1.048', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '128.6', 'tokens/total': 17170432, 'tokens/trainable': 2358465, 'epoch': '4.658'}
 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                   | 265/282 [47:31<03:03, 10.78s/it] 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                  | 266/282 [47:42<02:52, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.03553', 'grad_norm': '0.1769', 'learning_rate': '1.894e-06', 'ppl': '1.036', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '273.4', 'tokens/total': 17235968, 'tokens/trainable': 2368569, 'epoch': '4.676'}
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                  | 266/282 [47:42<02:52, 10.79s/it] 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                 | 267/282 [47:53<02:41, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.04072', 'grad_norm': '0.176', 'learning_rate': '1.678e-06', 'ppl': '1.042', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '290.2', 'tokens/total': 17301504, 'tokens/trainable': 2378115, 'epoch': '4.693'}
 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                 | 267/282 [47:53<02:41, 10.79s/it] 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                | 268/282 [48:04<02:31, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.04474', 'grad_norm': '0.1941', 'learning_rate': '1.475e-06', 'ppl': '1.046', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '313.8', 'tokens/total': 17367040, 'tokens/trainable': 2387451, 'epoch': '4.711'}
 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                | 268/282 [48:04<02:31, 10.79s/it] 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉               | 269/282 [48:14<02:20, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.03644', 'grad_norm': '0.1669', 'learning_rate': '1.286e-06', 'ppl': '1.037', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '234.3', 'tokens/total': 17432576, 'tokens/trainable': 2396676, 'epoch': '4.729'}
 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉               | 269/282 [48:14<02:20, 10.79s/it] 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████              | 270/282 [48:25<02:09, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.03655', 'grad_norm': '0.1794', 'learning_rate': '1.109e-06', 'ppl': '1.037', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '201.6', 'tokens/total': 17498112, 'tokens/trainable': 2407048, 'epoch': '4.747'}
 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████              | 270/282 [48:25<02:09, 10.79s/it] 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏            | 271/282 [48:36<01:58, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.05113', 'grad_norm': '0.2205', 'learning_rate': '9.45e-07', 'ppl': '1.052', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '190.4', 'tokens/total': 17563648, 'tokens/trainable': 2414700, 'epoch': '4.764'}
 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏            | 271/282 [48:36<01:58, 10.79s/it] 96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍           | 272/282 [48:47<01:47, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.03662', 'grad_norm': '0.1556', 'learning_rate': '7.943e-07', 'ppl': '1.037', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '232.8', 'tokens/total': 17629184, 'tokens/trainable': 2424757, 'epoch': '4.782'}
 96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍           | 272/282 [48:47<01:47, 10.80s/it] 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌          | 273/282 [48:58<01:37, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.04215', 'grad_norm': '0.2128', 'learning_rate': '6.566e-07', 'ppl': '1.043', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '247.8', 'tokens/total': 17694720, 'tokens/trainable': 2435090, 'epoch': '4.8'}
 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌          | 273/282 [48:58<01:37, 10.80s/it] 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋         | 274/282 [49:08<01:26, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.04676', 'grad_norm': '0.2036', 'learning_rate': '5.319e-07', 'ppl': '1.048', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '133.9', 'tokens/total': 17760256, 'tokens/trainable': 2442869, 'epoch': '4.818'}
 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋         | 274/282 [49:08<01:26, 10.80s/it] 98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉        | 275/282 [49:19<01:15, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.05282', 'grad_norm': '0.2487', 'learning_rate': '4.204e-07', 'ppl': '1.054', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '119.4', 'tokens/total': 17825792, 'tokens/trainable': 2450956, 'epoch': '4.836'}
 98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉        | 275/282 [49:19<01:15, 10.80s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████       | 276/282 [49:30<01:04, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.04254', 'grad_norm': '0.2007', 'learning_rate': '3.219e-07', 'ppl': '1.043', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '319.4', 'tokens/total': 17891328, 'tokens/trainable': 2459468, 'epoch': '4.853'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████       | 276/282 [49:30<01:04, 10.80s/it] 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 277/282 [49:41<00:53, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.0449', 'grad_norm': '0.2558', 'learning_rate': '2.365e-07', 'ppl': '1.046', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '260.4', 'tokens/total': 17956864, 'tokens/trainable': 2468149, 'epoch': '4.871'}
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 277/282 [49:41<00:53, 10.79s/it] 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎    | 278/282 [49:51<00:43, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.03449', 'grad_norm': '0.1585', 'learning_rate': '1.643e-07', 'ppl': '1.035', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '267.4', 'tokens/total': 18022400, 'tokens/trainable': 2478488, 'epoch': '4.889'}
 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎    | 278/282 [49:51<00:43, 10.79s/it] 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌   | 279/282 [50:02<00:32, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.04515', 'grad_norm': '0.1817', 'learning_rate': '1.052e-07', 'ppl': '1.046', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '196.8', 'tokens/total': 18087936, 'tokens/trainable': 2486671, 'epoch': '4.907'}
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌   | 279/282 [50:02<00:32, 10.79s/it] 99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋  | 280/282 [50:13<00:21, 10.79s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.04704', 'grad_norm': '0.2391', 'learning_rate': '5.915e-08', 'ppl': '1.048', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '274', 'tokens/total': 18153472, 'tokens/trainable': 2495170, 'epoch': '4.924'}
 99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋  | 280/282 [50:13<00:21, 10.79s/it]100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 281/282 [50:24<00:10, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.04679', 'grad_norm': '0.2206', 'learning_rate': '2.629e-08', 'ppl': '1.048', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '145.4', 'tokens/total': 18219008, 'tokens/trainable': 2502375, 'epoch': '4.942'}
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 281/282 [50:24<00:10, 10.80s/it]100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 282/282 [50:35<00:00, 10.80s/it]                                                                                                                                                                                                                                                                                                                                                                              {'loss': '0.03765', 'grad_norm': '0.186', 'learning_rate': '6.573e-09', 'ppl': '1.038', 'memory/max_active (GiB)': '37.75', 'memory/max_allocated (GiB)': '37.75', 'memory/device_reserved (GiB)': '38.48', 'tokens/train_per_sec_per_gpu': '250.4', 'tokens/total': 18284544, 'tokens/trainable': 2511935, 'epoch': '4.96'}
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 282/282 [50:35<00:00, 10.80s/it][2026-06-23 12:55:04,046] [INFO] [axolotl.core.trainers.base._save:828] [PID:2077] Saving model checkpoint to ./out/learned/checkpoint-282
                                                                                                                                                                                                                                                                                                                                                                              {'train_runtime': '3038', 'train_samples_per_second': '2.97', 'train_steps_per_second': '0.093', 'train_loss': '0.2715', 'memory/max_active (GiB)': '7.11', 'memory/max_allocated (GiB)': '7.11', 'memory/device_reserved (GiB)': '38.48', 'epoch': '4.96', 'tokens/train_per_sec_per_gpu': '0'}
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 282/282 [50:38<00:00, 10.80s/it]100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 282/282 [50:38<00:00, 10.77s/it]
[2026-06-23 12:55:07,332] [INFO] [axolotl.train.save_trained_model:267] [PID:2077] Training completed! Saving trained model to ./out/learned.
[2026-06-23 12:55:11,571] [INFO] [axolotl.train.save_trained_model:388] [PID:2077] Model successfully saved to ./out/learned